fuzzy_match 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rspec +2 -0
- data/CHANGELOG +14 -0
- data/Gemfile +8 -0
- data/README.markdown +58 -38
- data/Rakefile +0 -9
- data/bin/fuzzy_match +106 -0
- data/fuzzy_match.gemspec +4 -4
- data/groupings-screenshot.png +0 -0
- data/highlevel.graffle +0 -0
- data/highlevel.png +0 -0
- data/lib/fuzzy_match/record.rb +58 -0
- data/lib/fuzzy_match/result.rb +11 -8
- data/lib/fuzzy_match/rule/grouping.rb +70 -12
- data/lib/fuzzy_match/rule/identity.rb +3 -3
- data/lib/fuzzy_match/rule.rb +1 -1
- data/lib/fuzzy_match/score/amatch.rb +0 -4
- data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
- data/lib/fuzzy_match/score.rb +4 -0
- data/lib/fuzzy_match/similarity.rb +10 -32
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match.rb +78 -94
- data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
- data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
- data/spec/foo.rb +9 -0
- data/spec/fuzzy_match_spec.rb +354 -0
- data/spec/grouping_spec.rb +60 -0
- data/spec/identity_spec.rb +29 -0
- data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
- data/spec/spec_helper.rb +21 -0
- metadata +56 -50
- data/bin/fuzzy_match_checker +0 -71
- data/examples/bts_aircraft/5-2-A.htm +0 -10305
- data/examples/bts_aircraft/5-2-B.htm +0 -9576
- data/examples/bts_aircraft/5-2-D.htm +0 -7094
- data/examples/bts_aircraft/5-2-E.htm +0 -2349
- data/examples/bts_aircraft/5-2-G.htm +0 -2922
- data/examples/bts_aircraft/groupings.csv +0 -1
- data/examples/bts_aircraft/identities.csv +0 -1
- data/examples/bts_aircraft/negatives.csv +0 -1
- data/examples/bts_aircraft/normalizers.csv +0 -1
- data/examples/bts_aircraft/number_260.csv +0 -334
- data/examples/bts_aircraft/positives.csv +0 -1
- data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
- data/examples/first_name_matching.rb +0 -15
- data/examples/icao-bts.xls +0 -0
- data/lib/fuzzy_match/rule/normalizer.rb +0 -20
- data/lib/fuzzy_match/rule/stop_word.rb +0 -11
- data/lib/fuzzy_match/wrapper.rb +0 -73
- data/test/helper.rb +0 -12
- data/test/test_fuzzy_match.rb +0 -304
- data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
- data/test/test_grouping.rb +0 -28
- data/test/test_identity.rb +0 -34
- data/test/test_normalizer.rb +0 -10
data/test/test_fuzzy_match.rb
DELETED
@@ -1,304 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
|
4
|
-
describe FuzzyMatch do
|
5
|
-
describe '#find' do
|
6
|
-
it %{identifies the best match based on string similarity} do
|
7
|
-
d = FuzzyMatch.new %w{ RATZ CATZ }
|
8
|
-
d.find('RITZ').must_equal 'RATZ'
|
9
|
-
d.find('RíTZ').must_equal 'RATZ'
|
10
|
-
|
11
|
-
d = FuzzyMatch.new [ 'X' ]
|
12
|
-
d.find('X').must_equal 'X'
|
13
|
-
d.find('A').must_be_nil
|
14
|
-
end
|
15
|
-
|
16
|
-
it %{not return any result if the maximum score is zero} do
|
17
|
-
FuzzyMatch.new(['a']).find('b').must_be_nil
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
describe '#find_all' do
|
22
|
-
it %{return all records in sorted order} do
|
23
|
-
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
|
24
|
-
d.find_all('X').must_equal ['X', 'X22' ]
|
25
|
-
d.find_all('A').must_equal []
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
describe '#find_best' do
|
30
|
-
it %{returns one or more records with the best score} do
|
31
|
-
d = FuzzyMatch.new [ 'X', 'X', 'X22', 'Y', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
|
32
|
-
d.find_best('X').must_equal ['X', 'X' ]
|
33
|
-
d.find_best('A').must_equal []
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
describe '#find_all_with_score' do
|
38
|
-
it %{return records with 2 scores} do
|
39
|
-
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
|
40
|
-
d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
|
41
|
-
d.find_all_with_score('A').must_equal []
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
describe '#explain' do
|
46
|
-
before do
|
47
|
-
require 'stringio'
|
48
|
-
@capture = StringIO.new
|
49
|
-
@old_stdout = $stdout
|
50
|
-
$stdout = @capture
|
51
|
-
end
|
52
|
-
after do
|
53
|
-
$stdout = @old_stdout
|
54
|
-
end
|
55
|
-
|
56
|
-
it %{print a basic explanation to stdout} do
|
57
|
-
d = FuzzyMatch.new %w{ RATZ CATZ }
|
58
|
-
d.explain('RITZ')
|
59
|
-
@capture.rewind
|
60
|
-
@capture.read.must_include 'CATZ'
|
61
|
-
end
|
62
|
-
|
63
|
-
it %{explains match failures} do
|
64
|
-
FuzzyMatch.new(['aaa']).explain('bbb')
|
65
|
-
@capture.rewind
|
66
|
-
@capture.read.must_match %r{No winner assigned.*aaa.*bbb}
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
describe "normalizers" do
|
71
|
-
it %{sometimes gets false results without them} do
|
72
|
-
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
|
73
|
-
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
|
74
|
-
end
|
75
|
-
|
76
|
-
it %{can be used to improve results} do
|
77
|
-
normalizers = [
|
78
|
-
%r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
|
79
|
-
]
|
80
|
-
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
|
81
|
-
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
describe "identities" do
|
86
|
-
it %{sometimes gets false results without them} do
|
87
|
-
# false positive without identity
|
88
|
-
d = FuzzyMatch.new %w{ foo bar }
|
89
|
-
d.find('baz').must_equal 'bar'
|
90
|
-
end
|
91
|
-
|
92
|
-
it %{can be used to improve results} do
|
93
|
-
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
94
|
-
d.find('baz').must_be_nil
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
describe 'groupings' do
|
99
|
-
it %{sometimes gets false results without them} do
|
100
|
-
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ]
|
101
|
-
d.find('Barack Bush').must_equal 'Barack Obama' # luke i am your father
|
102
|
-
d.find('George Obama').must_equal 'George Bush' # nooooooooooooooooooo
|
103
|
-
end
|
104
|
-
|
105
|
-
it %{can be used to improve results} do
|
106
|
-
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
|
107
|
-
d.find('Barack Bush').must_equal 'George Bush'
|
108
|
-
d.find('George Obama').must_equal 'Barack Obama'
|
109
|
-
end
|
110
|
-
end
|
111
|
-
|
112
|
-
describe "the :must_match_grouping option" do
|
113
|
-
it %{optionally only attempt matches with records that fit into a grouping} do
|
114
|
-
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ], :must_match_grouping => true
|
115
|
-
d.find('George Clinton').must_be_nil
|
116
|
-
|
117
|
-
d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
|
118
|
-
d.find('George Clinton', :must_match_grouping => true).must_be_nil
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
describe "the :first_grouping_decides option" do
|
123
|
-
it %{optionally force the first grouping to decide} do
|
124
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ]
|
125
|
-
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
|
126
|
-
|
127
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
|
128
|
-
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
|
129
|
-
|
130
|
-
# first_grouping_decides refers to the needle
|
131
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
|
132
|
-
d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
|
133
|
-
|
134
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_grouping_decides => true
|
135
|
-
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
136
|
-
|
137
|
-
# or equivalently with an identity
|
138
|
-
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true, :identities => [ /boeing (7|E)/i ]
|
139
|
-
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
140
|
-
end
|
141
|
-
end
|
142
|
-
|
143
|
-
describe "the :read option" do
|
144
|
-
it %{interpret a Numeric as an array index} do
|
145
|
-
ab = ['a', 'b']
|
146
|
-
ba = ['b', 'a']
|
147
|
-
haystack = [ab, ba]
|
148
|
-
by_first = FuzzyMatch.new haystack, :read => 0
|
149
|
-
by_last = FuzzyMatch.new haystack, :read => 1
|
150
|
-
by_first.find('a').must_equal ab
|
151
|
-
by_last.find('b').must_equal ab
|
152
|
-
by_first.find('b').must_equal ba
|
153
|
-
by_last.find('a').must_equal ba
|
154
|
-
end
|
155
|
-
|
156
|
-
it %{interpret a Symbol, etc. as hash key} do
|
157
|
-
ab = { :one => 'a', :two => 'b' }
|
158
|
-
ba = { :one => 'b', :two => 'a' }
|
159
|
-
haystack = [ab, ba]
|
160
|
-
by_first = FuzzyMatch.new haystack, :read => :one
|
161
|
-
by_last = FuzzyMatch.new haystack, :read => :two
|
162
|
-
by_first.find('a').must_equal ab
|
163
|
-
by_last.find('b').must_equal ab
|
164
|
-
by_first.find('b').must_equal ba
|
165
|
-
by_last.find('a').must_equal ba
|
166
|
-
end
|
167
|
-
|
168
|
-
MyStruct = Struct.new(:one, :two)
|
169
|
-
it %{interpret a Symbol as a method id (if the object responds to it)} do
|
170
|
-
ab = MyStruct.new('a', 'b')
|
171
|
-
ba = MyStruct.new('b', 'a')
|
172
|
-
haystack = [ab, ba]
|
173
|
-
by_first = FuzzyMatch.new haystack, :read => :one
|
174
|
-
by_last = FuzzyMatch.new haystack, :read => :two
|
175
|
-
by_first.read.must_equal :one
|
176
|
-
by_last.read.must_equal :two
|
177
|
-
by_first.find('a').must_equal ab
|
178
|
-
by_last.find('b').must_equal ab
|
179
|
-
by_first.find('b').must_equal ba
|
180
|
-
by_last.find('a').must_equal ba
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
describe 'the :must_match_at_least_one_word option' do
|
185
|
-
it %{optionally require that the matching record share at least one word with the needle} do
|
186
|
-
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
187
|
-
d.find('RITZ').must_be_nil
|
188
|
-
|
189
|
-
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
190
|
-
d.find("Foo's").must_equal "Foo's Bar"
|
191
|
-
d.find("'s").must_be_nil
|
192
|
-
d.find("Foo").must_be_nil
|
193
|
-
|
194
|
-
d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
|
195
|
-
d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
|
196
|
-
end
|
197
|
-
|
198
|
-
it %{use STOP WORDS} do
|
199
|
-
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
|
200
|
-
d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
|
201
|
-
|
202
|
-
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
203
|
-
d.find('A HTL').must_equal 'B HTL'
|
204
|
-
|
205
|
-
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
206
|
-
d.find('A HTL').must_equal 'A HOTEL'
|
207
|
-
end
|
208
|
-
|
209
|
-
it %{not be fooled by substrings (but rather compare whole words to whole words)} do
|
210
|
-
d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
|
211
|
-
d.find('DOLCE LA HULPE BXL FI').must_be_nil
|
212
|
-
end
|
213
|
-
|
214
|
-
it %{not be case-sensitive when checking for sharing of words} do
|
215
|
-
d = FuzzyMatch.new [ 'A', 'B' ]
|
216
|
-
d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
describe "the :gather_last_result option" do
|
221
|
-
it %{not gather metadata about the last result by default} do
|
222
|
-
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
223
|
-
d.find('MISSAM')
|
224
|
-
lambda do
|
225
|
-
d.last_result
|
226
|
-
end.must_raise ::RuntimeError, /gather_last_result/
|
227
|
-
end
|
228
|
-
|
229
|
-
it %{optionally gather metadata about the last result} do
|
230
|
-
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
231
|
-
d.find 'MISSAM', :gather_last_result => true
|
232
|
-
d.last_result.score.must_equal 0.6
|
233
|
-
d.last_result.winner.must_equal 'NISSAN'
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
describe 'quirks' do
|
238
|
-
it %{should not return false negatives because of one-letter similarities} do
|
239
|
-
# dices coefficient doesn't think these two are similar at all because it looks at pairs
|
240
|
-
FuzzyMatch.score_class.new('X foo', 'X bar').dices_coefficient_similar.must_equal 0
|
241
|
-
# so we must compensate for that somewhere
|
242
|
-
d = FuzzyMatch.new ['X foo', 'randomness']
|
243
|
-
d.find('X bar').must_equal 'X foo'
|
244
|
-
# without making false positives
|
245
|
-
d.find('Y bar').must_be_nil
|
246
|
-
end
|
247
|
-
|
248
|
-
it %{finds possible matches even when pair distance fails} do
|
249
|
-
d = FuzzyMatch.new ['XX', '2 A']
|
250
|
-
d.find('2A').must_equal '2 A'
|
251
|
-
d = FuzzyMatch.new ['XX', '2A']
|
252
|
-
d.find('2 A').must_equal '2A'
|
253
|
-
end
|
254
|
-
|
255
|
-
it %{weird blow ups} do
|
256
|
-
d = FuzzyMatch.new ['XX', '2 A']
|
257
|
-
d.find('A').must_equal '2 A'
|
258
|
-
d = FuzzyMatch.new ['XX', 'A']
|
259
|
-
d.find('2 A').must_equal 'A'
|
260
|
-
end
|
261
|
-
|
262
|
-
end
|
263
|
-
|
264
|
-
describe 'deprecations' do
|
265
|
-
it %{takes :must_match_blocking as :must_match_grouping} do
|
266
|
-
d = FuzzyMatch.new [], :must_match_blocking => :a
|
267
|
-
d.default_options[:must_match_grouping].must_equal :a
|
268
|
-
end
|
269
|
-
|
270
|
-
it %{takes :first_blocking_decides as :first_grouping_decides} do
|
271
|
-
d = FuzzyMatch.new [], :first_blocking_decides => :b
|
272
|
-
d.default_options[:first_grouping_decides].must_equal :b
|
273
|
-
end
|
274
|
-
|
275
|
-
it %{takes :haystack_reader as :read} do
|
276
|
-
d = FuzzyMatch.new [], :haystack_reader => :c
|
277
|
-
d.read.must_equal :c
|
278
|
-
end
|
279
|
-
|
280
|
-
it %{takes :blockings as :groupings} do
|
281
|
-
d = FuzzyMatch.new [], :blockings => [ /X/, /Y/ ]
|
282
|
-
d.groupings.must_equal [ FuzzyMatch::Rule::Grouping.new(/X/), FuzzyMatch::Rule::Grouping.new(/Y/) ]
|
283
|
-
end
|
284
|
-
|
285
|
-
it %{takes :tighteners as :normalizers} do
|
286
|
-
d = FuzzyMatch.new [], :tighteners => [ /X/, /Y/ ]
|
287
|
-
d.normalizers.must_equal [ FuzzyMatch::Rule::Normalizer.new(/X/), FuzzyMatch::Rule::Normalizer.new(/Y/) ]
|
288
|
-
end
|
289
|
-
|
290
|
-
it %{receives #free method, but doesn't do anything} do
|
291
|
-
d = FuzzyMatch.new %w{ A B }
|
292
|
-
d.free
|
293
|
-
d.find('A').wont_be_nil
|
294
|
-
end
|
295
|
-
end
|
296
|
-
|
297
|
-
it %{defaults to a pure-ruby engine, but also has amatch} do
|
298
|
-
if defined?($testing_amatch) and $testing_amatch
|
299
|
-
FuzzyMatch.engine.must_equal :amatch
|
300
|
-
else
|
301
|
-
FuzzyMatch.engine.must_equal :pure_ruby
|
302
|
-
end
|
303
|
-
end
|
304
|
-
end
|
@@ -1,268 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
require 'shoulda'
|
4
|
-
|
5
|
-
$log = false
|
6
|
-
|
7
|
-
class TestFuzzyMatchConvoluted < MiniTest::Spec
|
8
|
-
def setup
|
9
|
-
clear_ltd
|
10
|
-
|
11
|
-
# dh 8 400
|
12
|
-
@a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
|
13
|
-
@a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
|
14
|
-
# dh 88
|
15
|
-
@b_needle = ['ABCDEFG DH88 HIJKLMNOP']
|
16
|
-
# dh 89
|
17
|
-
@c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
|
18
|
-
# dh 8 200
|
19
|
-
@d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
|
20
|
-
@d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
|
21
|
-
@d_lookalike = ['ABCD DHC8200 Dash 8']
|
22
|
-
|
23
|
-
@t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
|
24
|
-
|
25
|
-
@r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
|
26
|
-
|
27
|
-
@needle = [
|
28
|
-
@a_needle,
|
29
|
-
@b_needle,
|
30
|
-
['DE HAVILLAND DH89 Dragon Rapide'],
|
31
|
-
['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
|
32
|
-
@d_needle,
|
33
|
-
['DE HAVILLAND CANADA DHC8300 Dash 8'],
|
34
|
-
['DE HAVILLAND DH90 Dragonfly']
|
35
|
-
]
|
36
|
-
@haystack = [
|
37
|
-
@a_haystack,
|
38
|
-
@c_haystack,
|
39
|
-
@d_haystack,
|
40
|
-
['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
|
41
|
-
['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
|
42
|
-
]
|
43
|
-
@tightenings = []
|
44
|
-
@identities = []
|
45
|
-
@groupings = []
|
46
|
-
@positives = []
|
47
|
-
@negatives = []
|
48
|
-
end
|
49
|
-
|
50
|
-
def clear_ltd
|
51
|
-
@_ltd = nil
|
52
|
-
end
|
53
|
-
|
54
|
-
def ltd
|
55
|
-
@_ltd ||= FuzzyMatch.new @haystack,
|
56
|
-
:tightenings => @tightenings,
|
57
|
-
:identities => @identities,
|
58
|
-
:groupings => @groupings,
|
59
|
-
:positives => @positives,
|
60
|
-
:negatives => @negatives,
|
61
|
-
:grouping_only => @grouping_only,
|
62
|
-
:log => $log
|
63
|
-
end
|
64
|
-
|
65
|
-
should "optionally only pay attention to things that match groupings" do
|
66
|
-
assert_equal @a_haystack, ltd.improver.match(@a_needle)
|
67
|
-
|
68
|
-
clear_ltd
|
69
|
-
@grouping_only = true
|
70
|
-
assert_equal nil, ltd.improver.match(@a_needle)
|
71
|
-
|
72
|
-
clear_ltd
|
73
|
-
@grouping_only = true
|
74
|
-
@groupings.push ['/dash/i']
|
75
|
-
assert_equal @a_haystack, ltd.improver.match(@a_needle)
|
76
|
-
end
|
77
|
-
|
78
|
-
# the example from the readme, considerably uglier here
|
79
|
-
should "check a simple table" do
|
80
|
-
@haystack = [ 'seamus', 'andy', 'ben' ]
|
81
|
-
@positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
|
82
|
-
needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
|
83
|
-
|
84
|
-
assert_nothing_raised do
|
85
|
-
ltd.improver.check needle
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
should "treat a String as a full record if passed through" do
|
90
|
-
dash = 'DHC8-400'
|
91
|
-
b747 = 'B747200/300'
|
92
|
-
dc9 = 'DC-9-10'
|
93
|
-
haystack_records = [ dash, b747, dc9 ]
|
94
|
-
simple_ltd = FuzzyMatch.new haystack_records, :log => $log
|
95
|
-
assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
|
96
|
-
assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
|
97
|
-
assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
|
98
|
-
end
|
99
|
-
|
100
|
-
should "call it a mismatch if you hit a blank positive" do
|
101
|
-
@positives.push [@a_needle[0], '']
|
102
|
-
assert_raises(FuzzyMatch::Improver::Mismatch) do
|
103
|
-
ltd.improver.match @a_needle
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
should "call it a false positive if you hit a blank negative" do
|
108
|
-
@negatives.push [@a_needle[0], '']
|
109
|
-
assert_raises(FuzzyMatch::Improver::FalsePositive) do
|
110
|
-
ltd.improver.match @a_needle
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
should "have a false match without grouping" do
|
115
|
-
# @d_needle will be our victim
|
116
|
-
@haystack.push @d_lookalike
|
117
|
-
@tightenings.push @t_1
|
118
|
-
|
119
|
-
assert_equal @d_lookalike, ltd.improver.match(@d_needle)
|
120
|
-
end
|
121
|
-
|
122
|
-
should "do grouping if the needle matches a group" do
|
123
|
-
# @d_needle will be our victim
|
124
|
-
@haystack.push @d_lookalike
|
125
|
-
@tightenings.push @t_1
|
126
|
-
@groupings.push ['/(bombardier|de ?havilland)/i']
|
127
|
-
|
128
|
-
assert_equal @d_haystack, ltd.improver.match(@d_needle)
|
129
|
-
end
|
130
|
-
|
131
|
-
should "treat groups as exclusive" do
|
132
|
-
@haystack = [ @d_needle ]
|
133
|
-
@tightenings.push @t_1
|
134
|
-
@groupings.push ['/(bombardier|de ?havilland)/i']
|
135
|
-
|
136
|
-
assert_equal nil, ltd.improver.match(@d_lookalike)
|
137
|
-
end
|
138
|
-
|
139
|
-
should "only use identities if they stem from the same regexp" do
|
140
|
-
@identities.push @r_1
|
141
|
-
@identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
|
142
|
-
@identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
|
143
|
-
x_needle = [ 'CESSNA D-333 CITATION V']
|
144
|
-
x_haystack = [ 'CESSNA D-333' ]
|
145
|
-
@haystack.push x_haystack
|
146
|
-
|
147
|
-
assert_equal x_haystack, ltd.improver.match(x_needle)
|
148
|
-
end
|
149
|
-
|
150
|
-
should "use the best score from all of the tightenings" do
|
151
|
-
x_needle = ["BOEING 737100"]
|
152
|
-
x_haystack = ["BOEING BOEING 737-100/200"]
|
153
|
-
x_haystack_wrong = ["BOEING BOEING 737-900"]
|
154
|
-
@haystack.push x_haystack
|
155
|
-
@haystack.push x_haystack_wrong
|
156
|
-
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
157
|
-
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
158
|
-
|
159
|
-
assert_equal x_haystack, ltd.improver.match(x_needle)
|
160
|
-
end
|
161
|
-
|
162
|
-
should "compare using prefixes if tightened key is shorter than correct match" do
|
163
|
-
x_needle = ["BOEING 720"]
|
164
|
-
x_haystack = ["BOEING BOEING 720-000"]
|
165
|
-
x_haystack_wrong = ["BOEING BOEING 717-200"]
|
166
|
-
@haystack.push x_haystack
|
167
|
-
@haystack.push x_haystack_wrong
|
168
|
-
@tightenings.push @t_1
|
169
|
-
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
170
|
-
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
171
|
-
|
172
|
-
assert_equal x_haystack, ltd.improver.match(x_needle)
|
173
|
-
end
|
174
|
-
|
175
|
-
should "use the shortest original input" do
|
176
|
-
x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
|
177
|
-
x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
|
178
|
-
x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
|
179
|
-
|
180
|
-
@haystack.push x_haystack_long
|
181
|
-
@haystack.push x_haystack
|
182
|
-
@tightenings.push @t_1
|
183
|
-
|
184
|
-
assert_equal x_haystack, ltd.improver.match(x_needle)
|
185
|
-
end
|
186
|
-
|
187
|
-
should "perform lookups needle to haystack" do
|
188
|
-
assert_equal @a_haystack, ltd.improver.match(@a_needle)
|
189
|
-
end
|
190
|
-
|
191
|
-
should "succeed if there are no checks" do
|
192
|
-
assert_nothing_raised do
|
193
|
-
ltd.improver.check @needle
|
194
|
-
end
|
195
|
-
end
|
196
|
-
|
197
|
-
should "succeed if the positive checks just work" do
|
198
|
-
@positives.push [ @a_needle[0], @a_haystack[0] ]
|
199
|
-
|
200
|
-
assert_nothing_raised do
|
201
|
-
ltd.improver.check @needle
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
|
-
should "fail if positive checks don't work" do
|
206
|
-
@positives.push [ @d_needle[0], @d_haystack[0] ]
|
207
|
-
|
208
|
-
assert_raises(FuzzyMatch::Improver::Mismatch) do
|
209
|
-
ltd.improver.check @needle
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
should "succeed if proper tightening is applied" do
|
214
|
-
@positives.push [ @d_needle[0], @d_haystack[0] ]
|
215
|
-
@tightenings.push @t_1
|
216
|
-
|
217
|
-
assert_nothing_raised do
|
218
|
-
ltd.improver.check @needle
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
should "use a Google Docs spreadsheet as a source of tightenings" do
|
223
|
-
@positives.push [ @d_needle[0], @d_haystack[0] ]
|
224
|
-
@tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
|
225
|
-
|
226
|
-
# sabshere 9/30/10 this shouldn't raise anything
|
227
|
-
# but the tightenings have been changed... we should be using test-only tightenings, not production ones
|
228
|
-
# assert_nothing_raised do
|
229
|
-
assert_raises(FuzzyMatch::Improver::Mismatch) do
|
230
|
-
ltd.improver.check @needle
|
231
|
-
end
|
232
|
-
end
|
233
|
-
|
234
|
-
should "fail if negative checks don't work" do
|
235
|
-
@negatives.push [ @b_needle[0], @c_haystack[0] ]
|
236
|
-
|
237
|
-
assert_raises(FuzzyMatch::Improver::FalsePositive) do
|
238
|
-
ltd.improver.check @needle
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
should "do inline checking" do
|
243
|
-
@negatives.push [ @b_needle[0], @c_haystack[0] ]
|
244
|
-
|
245
|
-
assert_raises(FuzzyMatch::Improver::FalsePositive) do
|
246
|
-
ltd.improver.match @b_needle
|
247
|
-
end
|
248
|
-
end
|
249
|
-
|
250
|
-
should "fail if negative checks don't work, even with tightening" do
|
251
|
-
@negatives.push [ @b_needle[0], @c_haystack[0] ]
|
252
|
-
@tightenings.push @t_1
|
253
|
-
|
254
|
-
assert_raises(FuzzyMatch::Improver::FalsePositive) do
|
255
|
-
ltd.improver.check @needle
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
|
-
should "succeed if proper identity is applied" do
|
260
|
-
@negatives.push [ @b_needle[0], @c_haystack[0] ]
|
261
|
-
@positives.push [ @d_needle[0], @d_haystack[0] ]
|
262
|
-
@identities.push @r_1
|
263
|
-
|
264
|
-
assert_nothing_raised do
|
265
|
-
ltd.improver.check @needle
|
266
|
-
end
|
267
|
-
end
|
268
|
-
end
|
data/test/test_grouping.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe FuzzyMatch::Rule::Grouping do
|
4
|
-
it %{matches a single string argument} do
|
5
|
-
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
6
|
-
b.match?('2 apples').must_equal true
|
7
|
-
end
|
8
|
-
|
9
|
-
it %{embraces case insensitivity} do
|
10
|
-
b = FuzzyMatch::Rule::Grouping.new %r{apple}i
|
11
|
-
b.match?('2 Apples').must_equal true
|
12
|
-
end
|
13
|
-
|
14
|
-
it %{joins two string arguments} do
|
15
|
-
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
16
|
-
b.join?('apple', '2 apples').must_equal true
|
17
|
-
end
|
18
|
-
|
19
|
-
it %{fails to join two string arguments} do
|
20
|
-
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
21
|
-
b.join?('orange', '2 apples').must_equal false
|
22
|
-
end
|
23
|
-
|
24
|
-
it %{returns nil instead of false when it has no information} do
|
25
|
-
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
26
|
-
b.join?('orange', 'orange').must_be_nil
|
27
|
-
end
|
28
|
-
end
|
data/test/test_identity.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe FuzzyMatch::Rule::Identity do
|
4
|
-
it %{determines whether two records COULD be identical} do
|
5
|
-
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
6
|
-
i.identical?('A1', 'A 1foobar').must_equal true
|
7
|
-
end
|
8
|
-
|
9
|
-
it %{determines that two records MUST NOT be identical} do
|
10
|
-
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
11
|
-
i.identical?('A1', 'A 2foobar').must_equal false
|
12
|
-
end
|
13
|
-
|
14
|
-
it %{returns nil indicating no information} do
|
15
|
-
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
16
|
-
i.identical?('B1', 'A 2foobar').must_equal nil
|
17
|
-
end
|
18
|
-
|
19
|
-
it %{can be initialized with a regexp} do
|
20
|
-
i = FuzzyMatch::Rule::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
21
|
-
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
22
|
-
end
|
23
|
-
|
24
|
-
it %{does not automatically convert strings to regexps} do
|
25
|
-
lambda do
|
26
|
-
FuzzyMatch::Rule::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
27
|
-
end.must_raise ArgumentError, /regexp/i
|
28
|
-
end
|
29
|
-
|
30
|
-
it %{embraces case insensitivity} do
|
31
|
-
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}i
|
32
|
-
i.identical?('A1', 'a 1foobar').must_equal true
|
33
|
-
end
|
34
|
-
end
|
data/test/test_normalizer.rb
DELETED
@@ -1,10 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe FuzzyMatch::Rule::Normalizer do
|
4
|
-
it %{applies itself to a string argument} do
|
5
|
-
t = FuzzyMatch::Rule::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
6
|
-
t.apply('Ford F-350').must_equal 'Ford F350'
|
7
|
-
t.apply('Ford F150').must_equal 'Ford F150'
|
8
|
-
t.apply('Ford F 350').must_equal 'Ford F350'
|
9
|
-
end
|
10
|
-
end
|