fuzzy_match 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +12 -2
- data/History.txt +13 -0
- data/README.markdown +10 -6
- data/benchmark/before-with-free.txt +21 -21
- data/benchmark/before.txt +21 -21
- data/benchmark/memory.rb +6 -6
- data/examples/bts_aircraft/{blockings.csv → groupings.csv} +0 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +6 -6
- data/fuzzy_match.gemspec +1 -10
- data/lib/fuzzy_match.rb +41 -33
- data/lib/fuzzy_match/result.rb +1 -1
- data/lib/fuzzy_match/rule.rb +14 -0
- data/lib/fuzzy_match/rule/grouping.rb +32 -0
- data/lib/fuzzy_match/rule/identity.rb +19 -0
- data/lib/fuzzy_match/rule/normalizer.rb +20 -0
- data/lib/fuzzy_match/rule/stop_word.rb +11 -0
- data/lib/fuzzy_match/version.rb +1 -1
- data/test/helper.rb +3 -1
- data/test/test_fuzzy_match.rb +188 -124
- data/test/test_fuzzy_match_convoluted.rb.disabled +12 -12
- data/test/{test_blocking.rb → test_grouping.rb} +6 -6
- data/test/test_identity.rb +8 -8
- data/test/test_normalizer.rb +2 -2
- data/test/test_wrapper.rb +1 -1
- metadata +15 -101
- data/lib/fuzzy_match/blocking.rb +0 -36
- data/lib/fuzzy_match/identity.rb +0 -23
- data/lib/fuzzy_match/normalizer.rb +0 -28
- data/lib/fuzzy_match/stop_word.rb +0 -19
data/lib/fuzzy_match/result.rb
CHANGED
data/lib/fuzzy_match/rule/grouping.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
class FuzzyMatch
|
2
|
+
class Rule
|
3
|
+
# "Record linkage typically involves two main steps: grouping and scoring..."
|
4
|
+
# http://en.wikipedia.org/wiki/Record_linkage
|
5
|
+
#
|
6
|
+
# Groupings effectively divide up the haystack into groups that match a pattern
|
7
|
+
#
|
8
|
+
# A grouping (formerly known as a blocking) comes into effect when a str matches.
|
9
|
+
# Then the needle must also match the grouping's regexp.
|
10
|
+
class Grouping < Rule
|
11
|
+
def match?(str)
|
12
|
+
!!(regexp.match(str))
|
13
|
+
end
|
14
|
+
|
15
|
+
# If a grouping "joins" two strings, that means they both fit into it.
|
16
|
+
#
|
17
|
+
# Returns false if they certainly don't fit this grouping.
|
18
|
+
# Returns nil if the grouping doesn't apply, i.e. str2 doesn't fit the grouping.
|
19
|
+
def join?(str1, str2)
|
20
|
+
if str2_match_data = regexp.match(str2)
|
21
|
+
if str1_match_data = regexp.match(str1)
|
22
|
+
str2_match_data.captures.join.downcase == str1_match_data.captures.join.downcase
|
23
|
+
else
|
24
|
+
false
|
25
|
+
end
|
26
|
+
else
|
27
|
+
nil
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/fuzzy_match/rule/identity.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
class FuzzyMatch
|
2
|
+
class Rule
|
3
|
+
# Identities take effect when needle and haystack both match a regexp
|
4
|
+
# Then the captured part of the regexp has to match exactly
|
5
|
+
class Identity < Rule
|
6
|
+
# Two strings are "identical" if they both match this identity and the captures are equal.
|
7
|
+
#
|
8
|
+
# Only returns true/false if both strings match the regexp.
|
9
|
+
# Otherwise returns nil.
|
10
|
+
def identical?(str1, str2)
|
11
|
+
if str1_match_data = regexp.match(str1) and match_data = regexp.match(str2)
|
12
|
+
str1_match_data.captures.join.downcase == match_data.captures.join.downcase
|
13
|
+
else
|
14
|
+
nil
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/fuzzy_match/rule/normalizer.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
class FuzzyMatch
|
2
|
+
class Rule
|
3
|
+
# A normalizer just strips a string down to its core
|
4
|
+
class Normalizer < Rule
|
5
|
+
# A normalizer applies when its regexp matches and captures a new (shorter) string
|
6
|
+
def apply?(str)
|
7
|
+
!!(regexp.match(str))
|
8
|
+
end
|
9
|
+
|
10
|
+
# The result of applying a normalizer is just all the captures put together.
|
11
|
+
def apply(str)
|
12
|
+
if match_data = regexp.match(str)
|
13
|
+
match_data.captures.join
|
14
|
+
else
|
15
|
+
str
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -3,7 +3,9 @@ require 'bundler'
|
|
3
3
|
Bundler.setup
|
4
4
|
require 'minitest/spec'
|
5
5
|
require 'minitest/autorun'
|
6
|
-
require '
|
6
|
+
require 'minitest/reporters'
|
7
|
+
MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
8
|
+
MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
7
9
|
|
8
10
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
9
11
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -2,98 +2,126 @@
|
|
2
2
|
require 'helper'
|
3
3
|
|
4
4
|
class TestFuzzyMatch < MiniTest::Spec
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
d = FuzzyMatch.new [ 'X' ]
|
11
|
-
d.find('X').must_equal 'X'
|
12
|
-
d.find('A').must_be_nil
|
13
|
-
end
|
5
|
+
describe '#find' do
|
6
|
+
it %{identifies the best match based on string similarity} do
|
7
|
+
d = FuzzyMatch.new %w{ RATZ CATZ }
|
8
|
+
d.find('RITZ').must_equal 'RATZ'
|
9
|
+
d.find('RíTZ').must_equal 'RATZ'
|
14
10
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
11
|
+
d = FuzzyMatch.new [ 'X' ]
|
12
|
+
d.find('X').must_equal 'X'
|
13
|
+
d.find('A').must_be_nil
|
14
|
+
end
|
15
|
+
|
16
|
+
it %{not return any result if the maximum score is zero} do
|
17
|
+
FuzzyMatch.new(['a']).find('b').must_be_nil
|
18
|
+
end
|
21
19
|
end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
20
|
+
|
21
|
+
describe '#find_all' do
|
22
|
+
it %{return all records in sorted order} do
|
23
|
+
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
|
24
|
+
d.find_all('X').must_equal ['X', 'X22' ]
|
25
|
+
d.find_all('A').must_equal []
|
26
|
+
end
|
28
27
|
end
|
29
28
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
29
|
+
describe '#explain' do
|
30
|
+
before do
|
31
|
+
require 'stringio'
|
32
|
+
@capture = StringIO.new
|
33
|
+
@old_stdout = $stdout
|
34
|
+
$stdout = @capture
|
35
|
+
end
|
36
|
+
after do
|
37
|
+
$stdout = @old_stdout
|
38
|
+
end
|
39
|
+
|
40
|
+
it %{print a basic explanation to stdout} do
|
41
|
+
d = FuzzyMatch.new %w{ RATZ CATZ }
|
42
|
+
d.explain('RITZ')
|
43
|
+
@capture.rewind
|
44
|
+
@capture.read.must_include 'CATZ'
|
45
|
+
end
|
46
|
+
|
47
|
+
it %{explains match failures} do
|
48
|
+
FuzzyMatch.new(['aaa']).explain('bbb')
|
49
|
+
@capture.rewind
|
50
|
+
@capture.read.must_match %r{No winner assigned.*aaa.*bbb}
|
51
|
+
end
|
39
52
|
end
|
40
53
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
47
|
-
d.find('baz').must_be_nil
|
48
|
-
end
|
54
|
+
describe "normalizers" do
|
55
|
+
it %{sometimes gets false results without them} do
|
56
|
+
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
|
57
|
+
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
60
|
+
it %{can be used to improve results} do
|
61
|
+
normalizers = [
|
62
|
+
%r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
|
63
|
+
]
|
64
|
+
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
|
65
|
+
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
|
66
|
+
end
|
55
67
|
end
|
56
68
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
69
|
+
describe "identities" do
|
70
|
+
it %{sometimes gets false results without them} do
|
71
|
+
# false positive without identity
|
72
|
+
d = FuzzyMatch.new %w{ foo bar }
|
73
|
+
d.find('baz').must_equal 'bar'
|
74
|
+
end
|
62
75
|
|
63
|
-
|
64
|
-
|
65
|
-
|
76
|
+
it %{can be used to improve results} do
|
77
|
+
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
78
|
+
d.find('baz').must_be_nil
|
79
|
+
end
|
66
80
|
end
|
67
81
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
82
|
+
describe 'groupings' do
|
83
|
+
it %{sometimes gets false results without them} do
|
84
|
+
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ]
|
85
|
+
d.find('Barack Bush').must_equal 'Barack Obama' # luke i am your father
|
86
|
+
d.find('George Obama').must_equal 'George Bush' # nooooooooooooooooooo
|
87
|
+
end
|
88
|
+
|
89
|
+
it %{can be used to improve results} do
|
90
|
+
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
|
91
|
+
d.find('Barack Bush').must_equal 'George Bush'
|
92
|
+
d.find('George Obama').must_equal 'Barack Obama'
|
93
|
+
end
|
72
94
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
95
|
+
|
96
|
+
describe "the :must_match_grouping option" do
|
97
|
+
it %{optionally only attempt matches with records that fit into a grouping} do
|
98
|
+
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ], :must_match_grouping => true
|
99
|
+
d.find('George Clinton').must_be_nil
|
100
|
+
|
101
|
+
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
|
102
|
+
d.find('George Clinton', :must_match_grouping => true).must_be_nil
|
103
|
+
end
|
78
104
|
end
|
105
|
+
|
106
|
+
describe "the :first_grouping_decides option" do
|
107
|
+
it %{optionally force the first grouping to decide} do
|
108
|
+
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ]
|
109
|
+
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
|
79
110
|
|
80
|
-
|
81
|
-
|
82
|
-
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
|
83
|
-
|
84
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
85
|
-
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
|
111
|
+
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
|
112
|
+
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
|
86
113
|
|
87
|
-
|
88
|
-
|
89
|
-
|
114
|
+
# first_grouping_decides refers to the needle
|
115
|
+
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
|
116
|
+
d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
|
90
117
|
|
91
|
-
|
92
|
-
|
118
|
+
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_grouping_decides => true
|
119
|
+
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
93
120
|
|
94
|
-
|
95
|
-
|
96
|
-
|
121
|
+
# or equivalently with an identity
|
122
|
+
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true, :identities => [ /boeing (7|E)/i ]
|
123
|
+
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
124
|
+
end
|
97
125
|
end
|
98
126
|
|
99
127
|
describe "the :read option" do
|
@@ -135,70 +163,106 @@ class TestFuzzyMatch < MiniTest::Spec
|
|
135
163
|
by_first.find('b').must_equal ba
|
136
164
|
by_last.find('a').must_equal ba
|
137
165
|
end
|
138
|
-
|
139
|
-
it %{treat the deprecrated :haystack_reader option as an alias} do
|
140
|
-
ab = ['a', 'b']
|
141
|
-
ba = ['b', 'a']
|
142
|
-
haystack = [ab, ba]
|
143
|
-
by_first = FuzzyMatch.new haystack, :haystack_reader => 0
|
144
|
-
by_first.find('a').must_equal ab
|
145
|
-
by_first.find('b').must_equal ba
|
146
|
-
end
|
147
166
|
end
|
167
|
+
|
168
|
+
describe 'the :must_match_at_least_one_word option' do
|
169
|
+
it %{optionally require that the matching record share at least one word with the needle} do
|
170
|
+
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
171
|
+
d.find('RITZ').must_be_nil
|
172
|
+
|
173
|
+
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
174
|
+
d.find("Foo's").must_equal "Foo's Bar"
|
175
|
+
d.find("'s").must_be_nil
|
176
|
+
d.find("Foo").must_be_nil
|
177
|
+
|
178
|
+
d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
|
179
|
+
d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
|
180
|
+
end
|
148
181
|
|
149
|
-
|
150
|
-
|
151
|
-
|
182
|
+
it %{use STOP WORDS} do
|
183
|
+
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
|
184
|
+
d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
|
152
185
|
|
153
|
-
|
154
|
-
|
155
|
-
d.find('RITZ').must_be_nil
|
186
|
+
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
187
|
+
d.find('A HTL').must_equal 'B HTL'
|
156
188
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
d.find("Foo").must_be_nil
|
189
|
+
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
190
|
+
d.find('A HTL').must_equal 'A HOTEL'
|
191
|
+
end
|
161
192
|
|
162
|
-
|
163
|
-
|
193
|
+
it %{not be fooled by substrings (but rather compare whole words to whole words)} do
|
194
|
+
d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
|
195
|
+
d.find('DOLCE LA HULPE BXL FI').must_be_nil
|
196
|
+
end
|
197
|
+
|
198
|
+
it %{not be case-sensitive when checking for sharing of words} do
|
199
|
+
d = FuzzyMatch.new [ 'A', 'B' ]
|
200
|
+
d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
describe "the :gather_last_result option" do
|
205
|
+
it %{not gather metadata about the last result by default} do
|
206
|
+
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
207
|
+
d.find('MISSAM')
|
208
|
+
lambda do
|
209
|
+
d.last_result
|
210
|
+
end.must_raise ::RuntimeError, /gather_last_result/
|
211
|
+
end
|
212
|
+
|
213
|
+
it %{optionally gather metadata about the last result} do
|
214
|
+
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
215
|
+
d.find 'MISSAM', :gather_last_result => true
|
216
|
+
d.last_result.score.must_equal 0.6
|
217
|
+
d.last_result.winner.must_equal 'NISSAN'
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
describe 'quirks' do
|
222
|
+
it %{should not return false negatives because of one-letter similarities} do
|
223
|
+
# dices coefficient doesn't think these two are similar at all because it looks at pairs
|
224
|
+
FuzzyMatch.score_class.new('X foo', 'X bar').dices_coefficient_similar.must_equal 0
|
225
|
+
# so we must compensate for that somewhere
|
226
|
+
d = FuzzyMatch.new ['X foo', 'randomness']
|
227
|
+
d.find('X bar').must_equal 'X foo'
|
228
|
+
# without making false positives
|
229
|
+
d.find('Y bar').must_be_nil
|
230
|
+
end
|
164
231
|
end
|
165
232
|
|
166
|
-
|
167
|
-
|
168
|
-
|
233
|
+
describe 'deprecations' do
|
234
|
+
it %{takes :must_match_blocking as :must_match_grouping} do
|
235
|
+
d = FuzzyMatch.new [], :must_match_blocking => :a
|
236
|
+
d.default_options[:must_match_grouping].must_equal :a
|
237
|
+
end
|
169
238
|
|
170
|
-
|
171
|
-
|
239
|
+
it %{takes :first_blocking_decides as :first_grouping_decides} do
|
240
|
+
d = FuzzyMatch.new [], :first_blocking_decides => :b
|
241
|
+
d.default_options[:first_grouping_decides].must_equal :b
|
242
|
+
end
|
172
243
|
|
173
|
-
|
174
|
-
|
175
|
-
|
244
|
+
it %{takes :haystack_reader as :read} do
|
245
|
+
d = FuzzyMatch.new [], :haystack_reader => :c
|
246
|
+
d.read.must_equal :c
|
247
|
+
end
|
176
248
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
begin
|
181
|
-
old_stdout = $stdout
|
182
|
-
$stdout = capture
|
183
|
-
d = FuzzyMatch.new %w{ RATZ CATZ }
|
184
|
-
d.explain('RITZ')
|
185
|
-
ensure
|
186
|
-
$stdout = old_stdout
|
249
|
+
it %{takes :blockings as :groupings} do
|
250
|
+
d = FuzzyMatch.new [], :blockings => [ /X/, /Y/ ]
|
251
|
+
d.groupings.must_equal [ FuzzyMatch::Rule::Grouping.new(/X/), FuzzyMatch::Rule::Grouping.new(/Y/) ]
|
187
252
|
end
|
188
|
-
capture.rewind
|
189
|
-
capture.read.must_include 'CATZ'
|
190
|
-
end
|
191
253
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
254
|
+
it %{takes :tighteners as :normalizers} do
|
255
|
+
d = FuzzyMatch.new [], :tighteners => [ /X/, /Y/ ]
|
256
|
+
d.normalizers.must_equal [ FuzzyMatch::Rule::Normalizer.new(/X/), FuzzyMatch::Rule::Normalizer.new(/Y/) ]
|
257
|
+
end
|
196
258
|
|
197
|
-
|
198
|
-
|
199
|
-
|
259
|
+
it %{receives #free method, but doesn't do anything} do
|
260
|
+
d = FuzzyMatch.new %w{ A B }
|
261
|
+
d.free
|
262
|
+
d.find('A').wont_be_nil
|
263
|
+
end
|
200
264
|
end
|
201
|
-
|
265
|
+
|
202
266
|
it %{defaults to a pure-ruby engine, but also has amatch} do
|
203
267
|
if defined?($testing_amatch) and $testing_amatch
|
204
268
|
FuzzyMatch.engine.must_equal :amatch
|