fuzzy_match 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55):
  1. checksums.yaml +8 -8
  2. data/.rspec +2 -0
  3. data/CHANGELOG +14 -0
  4. data/Gemfile +8 -0
  5. data/README.markdown +58 -38
  6. data/Rakefile +0 -9
  7. data/bin/fuzzy_match +106 -0
  8. data/fuzzy_match.gemspec +4 -4
  9. data/groupings-screenshot.png +0 -0
  10. data/highlevel.graffle +0 -0
  11. data/highlevel.png +0 -0
  12. data/lib/fuzzy_match/record.rb +58 -0
  13. data/lib/fuzzy_match/result.rb +11 -8
  14. data/lib/fuzzy_match/rule/grouping.rb +70 -12
  15. data/lib/fuzzy_match/rule/identity.rb +3 -3
  16. data/lib/fuzzy_match/rule.rb +1 -1
  17. data/lib/fuzzy_match/score/amatch.rb +0 -4
  18. data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
  19. data/lib/fuzzy_match/score.rb +4 -0
  20. data/lib/fuzzy_match/similarity.rb +10 -32
  21. data/lib/fuzzy_match/version.rb +1 -1
  22. data/lib/fuzzy_match.rb +78 -94
  23. data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
  24. data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
  25. data/spec/foo.rb +9 -0
  26. data/spec/fuzzy_match_spec.rb +354 -0
  27. data/spec/grouping_spec.rb +60 -0
  28. data/spec/identity_spec.rb +29 -0
  29. data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
  30. data/spec/spec_helper.rb +21 -0
  31. metadata +56 -50
  32. data/bin/fuzzy_match_checker +0 -71
  33. data/examples/bts_aircraft/5-2-A.htm +0 -10305
  34. data/examples/bts_aircraft/5-2-B.htm +0 -9576
  35. data/examples/bts_aircraft/5-2-D.htm +0 -7094
  36. data/examples/bts_aircraft/5-2-E.htm +0 -2349
  37. data/examples/bts_aircraft/5-2-G.htm +0 -2922
  38. data/examples/bts_aircraft/groupings.csv +0 -1
  39. data/examples/bts_aircraft/identities.csv +0 -1
  40. data/examples/bts_aircraft/negatives.csv +0 -1
  41. data/examples/bts_aircraft/normalizers.csv +0 -1
  42. data/examples/bts_aircraft/number_260.csv +0 -334
  43. data/examples/bts_aircraft/positives.csv +0 -1
  44. data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
  45. data/examples/first_name_matching.rb +0 -15
  46. data/examples/icao-bts.xls +0 -0
  47. data/lib/fuzzy_match/rule/normalizer.rb +0 -20
  48. data/lib/fuzzy_match/rule/stop_word.rb +0 -11
  49. data/lib/fuzzy_match/wrapper.rb +0 -73
  50. data/test/helper.rb +0 -12
  51. data/test/test_fuzzy_match.rb +0 -304
  52. data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
  53. data/test/test_grouping.rb +0 -28
  54. data/test/test_identity.rb +0 -34
  55. data/test/test_normalizer.rb +0 -10
@@ -1,304 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
- require 'helper'
3
-
4
- describe FuzzyMatch do
5
- describe '#find' do
6
- it %{identifies the best match based on string similarity} do
7
- d = FuzzyMatch.new %w{ RATZ CATZ }
8
- d.find('RITZ').must_equal 'RATZ'
9
- d.find('RíTZ').must_equal 'RATZ'
10
-
11
- d = FuzzyMatch.new [ 'X' ]
12
- d.find('X').must_equal 'X'
13
- d.find('A').must_be_nil
14
- end
15
-
16
- it %{not return any result if the maximum score is zero} do
17
- FuzzyMatch.new(['a']).find('b').must_be_nil
18
- end
19
- end
20
-
21
- describe '#find_all' do
22
- it %{return all records in sorted order} do
23
- d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
24
- d.find_all('X').must_equal ['X', 'X22' ]
25
- d.find_all('A').must_equal []
26
- end
27
- end
28
-
29
- describe '#find_best' do
30
- it %{returns one or more records with the best score} do
31
- d = FuzzyMatch.new [ 'X', 'X', 'X22', 'Y', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
32
- d.find_best('X').must_equal ['X', 'X' ]
33
- d.find_best('A').must_equal []
34
- end
35
- end
36
-
37
- describe '#find_all_with_score' do
38
- it %{return records with 2 scores} do
39
- d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
40
- d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
41
- d.find_all_with_score('A').must_equal []
42
- end
43
- end
44
-
45
- describe '#explain' do
46
- before do
47
- require 'stringio'
48
- @capture = StringIO.new
49
- @old_stdout = $stdout
50
- $stdout = @capture
51
- end
52
- after do
53
- $stdout = @old_stdout
54
- end
55
-
56
- it %{print a basic explanation to stdout} do
57
- d = FuzzyMatch.new %w{ RATZ CATZ }
58
- d.explain('RITZ')
59
- @capture.rewind
60
- @capture.read.must_include 'CATZ'
61
- end
62
-
63
- it %{explains match failures} do
64
- FuzzyMatch.new(['aaa']).explain('bbb')
65
- @capture.rewind
66
- @capture.read.must_match %r{No winner assigned.*aaa.*bbb}
67
- end
68
- end
69
-
70
- describe "normalizers" do
71
- it %{sometimes gets false results without them} do
72
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
73
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
74
- end
75
-
76
- it %{can be used to improve results} do
77
- normalizers = [
78
- %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
79
- ]
80
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
81
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
82
- end
83
- end
84
-
85
- describe "identities" do
86
- it %{sometimes gets false results without them} do
87
- # false positive without identity
88
- d = FuzzyMatch.new %w{ foo bar }
89
- d.find('baz').must_equal 'bar'
90
- end
91
-
92
- it %{can be used to improve results} do
93
- d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
94
- d.find('baz').must_be_nil
95
- end
96
- end
97
-
98
- describe 'groupings' do
99
- it %{sometimes gets false results without them} do
100
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ]
101
- d.find('Barack Bush').must_equal 'Barack Obama' # luke i am your father
102
- d.find('George Obama').must_equal 'George Bush' # nooooooooooooooooooo
103
- end
104
-
105
- it %{can be used to improve results} do
106
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
107
- d.find('Barack Bush').must_equal 'George Bush'
108
- d.find('George Obama').must_equal 'Barack Obama'
109
- end
110
- end
111
-
112
- describe "the :must_match_grouping option" do
113
- it %{optionally only attempt matches with records that fit into a grouping} do
114
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ], :must_match_grouping => true
115
- d.find('George Clinton').must_be_nil
116
-
117
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
118
- d.find('George Clinton', :must_match_grouping => true).must_be_nil
119
- end
120
- end
121
-
122
- describe "the :first_grouping_decides option" do
123
- it %{optionally force the first grouping to decide} do
124
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ]
125
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
126
-
127
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
128
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
129
-
130
- # first_grouping_decides refers to the needle
131
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
132
- d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
133
-
134
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_grouping_decides => true
135
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
136
-
137
- # or equivalently with an identity
138
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true, :identities => [ /boeing (7|E)/i ]
139
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
140
- end
141
- end
142
-
143
- describe "the :read option" do
144
- it %{interpret a Numeric as an array index} do
145
- ab = ['a', 'b']
146
- ba = ['b', 'a']
147
- haystack = [ab, ba]
148
- by_first = FuzzyMatch.new haystack, :read => 0
149
- by_last = FuzzyMatch.new haystack, :read => 1
150
- by_first.find('a').must_equal ab
151
- by_last.find('b').must_equal ab
152
- by_first.find('b').must_equal ba
153
- by_last.find('a').must_equal ba
154
- end
155
-
156
- it %{interpret a Symbol, etc. as hash key} do
157
- ab = { :one => 'a', :two => 'b' }
158
- ba = { :one => 'b', :two => 'a' }
159
- haystack = [ab, ba]
160
- by_first = FuzzyMatch.new haystack, :read => :one
161
- by_last = FuzzyMatch.new haystack, :read => :two
162
- by_first.find('a').must_equal ab
163
- by_last.find('b').must_equal ab
164
- by_first.find('b').must_equal ba
165
- by_last.find('a').must_equal ba
166
- end
167
-
168
- MyStruct = Struct.new(:one, :two)
169
- it %{interpret a Symbol as a method id (if the object responds to it)} do
170
- ab = MyStruct.new('a', 'b')
171
- ba = MyStruct.new('b', 'a')
172
- haystack = [ab, ba]
173
- by_first = FuzzyMatch.new haystack, :read => :one
174
- by_last = FuzzyMatch.new haystack, :read => :two
175
- by_first.read.must_equal :one
176
- by_last.read.must_equal :two
177
- by_first.find('a').must_equal ab
178
- by_last.find('b').must_equal ab
179
- by_first.find('b').must_equal ba
180
- by_last.find('a').must_equal ba
181
- end
182
- end
183
-
184
- describe 'the :must_match_at_least_one_word option' do
185
- it %{optionally require that the matching record share at least one word with the needle} do
186
- d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
187
- d.find('RITZ').must_be_nil
188
-
189
- d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
190
- d.find("Foo's").must_equal "Foo's Bar"
191
- d.find("'s").must_be_nil
192
- d.find("Foo").must_be_nil
193
-
194
- d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
195
- d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
196
- end
197
-
198
- it %{use STOP WORDS} do
199
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
200
- d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
201
-
202
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
203
- d.find('A HTL').must_equal 'B HTL'
204
-
205
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
206
- d.find('A HTL').must_equal 'A HOTEL'
207
- end
208
-
209
- it %{not be fooled by substrings (but rather compare whole words to whole words)} do
210
- d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
211
- d.find('DOLCE LA HULPE BXL FI').must_be_nil
212
- end
213
-
214
- it %{not be case-sensitive when checking for sharing of words} do
215
- d = FuzzyMatch.new [ 'A', 'B' ]
216
- d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
217
- end
218
- end
219
-
220
- describe "the :gather_last_result option" do
221
- it %{not gather metadata about the last result by default} do
222
- d = FuzzyMatch.new %w{ NISSAN HONDA }
223
- d.find('MISSAM')
224
- lambda do
225
- d.last_result
226
- end.must_raise ::RuntimeError, /gather_last_result/
227
- end
228
-
229
- it %{optionally gather metadata about the last result} do
230
- d = FuzzyMatch.new %w{ NISSAN HONDA }
231
- d.find 'MISSAM', :gather_last_result => true
232
- d.last_result.score.must_equal 0.6
233
- d.last_result.winner.must_equal 'NISSAN'
234
- end
235
- end
236
-
237
- describe 'quirks' do
238
- it %{should not return false negatives because of one-letter similarities} do
239
- # dices coefficient doesn't think these two are similar at all because it looks at pairs
240
- FuzzyMatch.score_class.new('X foo', 'X bar').dices_coefficient_similar.must_equal 0
241
- # so we must compensate for that somewhere
242
- d = FuzzyMatch.new ['X foo', 'randomness']
243
- d.find('X bar').must_equal 'X foo'
244
- # without making false positives
245
- d.find('Y bar').must_be_nil
246
- end
247
-
248
- it %{finds possible matches even when pair distance fails} do
249
- d = FuzzyMatch.new ['XX', '2 A']
250
- d.find('2A').must_equal '2 A'
251
- d = FuzzyMatch.new ['XX', '2A']
252
- d.find('2 A').must_equal '2A'
253
- end
254
-
255
- it %{weird blow ups} do
256
- d = FuzzyMatch.new ['XX', '2 A']
257
- d.find('A').must_equal '2 A'
258
- d = FuzzyMatch.new ['XX', 'A']
259
- d.find('2 A').must_equal 'A'
260
- end
261
-
262
- end
263
-
264
- describe 'deprecations' do
265
- it %{takes :must_match_blocking as :must_match_grouping} do
266
- d = FuzzyMatch.new [], :must_match_blocking => :a
267
- d.default_options[:must_match_grouping].must_equal :a
268
- end
269
-
270
- it %{takes :first_blocking_decides as :first_grouping_decides} do
271
- d = FuzzyMatch.new [], :first_blocking_decides => :b
272
- d.default_options[:first_grouping_decides].must_equal :b
273
- end
274
-
275
- it %{takes :haystack_reader as :read} do
276
- d = FuzzyMatch.new [], :haystack_reader => :c
277
- d.read.must_equal :c
278
- end
279
-
280
- it %{takes :blockings as :groupings} do
281
- d = FuzzyMatch.new [], :blockings => [ /X/, /Y/ ]
282
- d.groupings.must_equal [ FuzzyMatch::Rule::Grouping.new(/X/), FuzzyMatch::Rule::Grouping.new(/Y/) ]
283
- end
284
-
285
- it %{takes :tighteners as :normalizers} do
286
- d = FuzzyMatch.new [], :tighteners => [ /X/, /Y/ ]
287
- d.normalizers.must_equal [ FuzzyMatch::Rule::Normalizer.new(/X/), FuzzyMatch::Rule::Normalizer.new(/Y/) ]
288
- end
289
-
290
- it %{receives #free method, but doesn't do anything} do
291
- d = FuzzyMatch.new %w{ A B }
292
- d.free
293
- d.find('A').wont_be_nil
294
- end
295
- end
296
-
297
- it %{defaults to a pure-ruby engine, but also has amatch} do
298
- if defined?($testing_amatch) and $testing_amatch
299
- FuzzyMatch.engine.must_equal :amatch
300
- else
301
- FuzzyMatch.engine.must_equal :pure_ruby
302
- end
303
- end
304
- end
@@ -1,268 +0,0 @@
1
- require 'helper'
2
-
3
- require 'shoulda'
4
-
5
- $log = false
6
-
7
- class TestFuzzyMatchConvoluted < MiniTest::Spec
8
- def setup
9
- clear_ltd
10
-
11
- # dh 8 400
12
- @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
13
- @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
14
- # dh 88
15
- @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
16
- # dh 89
17
- @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
18
- # dh 8 200
19
- @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
20
- @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
21
- @d_lookalike = ['ABCD DHC8200 Dash 8']
22
-
23
- @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
24
-
25
- @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
26
-
27
- @needle = [
28
- @a_needle,
29
- @b_needle,
30
- ['DE HAVILLAND DH89 Dragon Rapide'],
31
- ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
32
- @d_needle,
33
- ['DE HAVILLAND CANADA DHC8300 Dash 8'],
34
- ['DE HAVILLAND DH90 Dragonfly']
35
- ]
36
- @haystack = [
37
- @a_haystack,
38
- @c_haystack,
39
- @d_haystack,
40
- ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
41
- ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
42
- ]
43
- @tightenings = []
44
- @identities = []
45
- @groupings = []
46
- @positives = []
47
- @negatives = []
48
- end
49
-
50
- def clear_ltd
51
- @_ltd = nil
52
- end
53
-
54
- def ltd
55
- @_ltd ||= FuzzyMatch.new @haystack,
56
- :tightenings => @tightenings,
57
- :identities => @identities,
58
- :groupings => @groupings,
59
- :positives => @positives,
60
- :negatives => @negatives,
61
- :grouping_only => @grouping_only,
62
- :log => $log
63
- end
64
-
65
- should "optionally only pay attention to things that match groupings" do
66
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
67
-
68
- clear_ltd
69
- @grouping_only = true
70
- assert_equal nil, ltd.improver.match(@a_needle)
71
-
72
- clear_ltd
73
- @grouping_only = true
74
- @groupings.push ['/dash/i']
75
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
76
- end
77
-
78
- # the example from the readme, considerably uglier here
79
- should "check a simple table" do
80
- @haystack = [ 'seamus', 'andy', 'ben' ]
81
- @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
82
- needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
83
-
84
- assert_nothing_raised do
85
- ltd.improver.check needle
86
- end
87
- end
88
-
89
- should "treat a String as a full record if passed through" do
90
- dash = 'DHC8-400'
91
- b747 = 'B747200/300'
92
- dc9 = 'DC-9-10'
93
- haystack_records = [ dash, b747, dc9 ]
94
- simple_ltd = FuzzyMatch.new haystack_records, :log => $log
95
- assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
96
- assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
97
- assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
98
- end
99
-
100
- should "call it a mismatch if you hit a blank positive" do
101
- @positives.push [@a_needle[0], '']
102
- assert_raises(FuzzyMatch::Improver::Mismatch) do
103
- ltd.improver.match @a_needle
104
- end
105
- end
106
-
107
- should "call it a false positive if you hit a blank negative" do
108
- @negatives.push [@a_needle[0], '']
109
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
110
- ltd.improver.match @a_needle
111
- end
112
- end
113
-
114
- should "have a false match without grouping" do
115
- # @d_needle will be our victim
116
- @haystack.push @d_lookalike
117
- @tightenings.push @t_1
118
-
119
- assert_equal @d_lookalike, ltd.improver.match(@d_needle)
120
- end
121
-
122
- should "do grouping if the needle matches a group" do
123
- # @d_needle will be our victim
124
- @haystack.push @d_lookalike
125
- @tightenings.push @t_1
126
- @groupings.push ['/(bombardier|de ?havilland)/i']
127
-
128
- assert_equal @d_haystack, ltd.improver.match(@d_needle)
129
- end
130
-
131
- should "treat groups as exclusive" do
132
- @haystack = [ @d_needle ]
133
- @tightenings.push @t_1
134
- @groupings.push ['/(bombardier|de ?havilland)/i']
135
-
136
- assert_equal nil, ltd.improver.match(@d_lookalike)
137
- end
138
-
139
- should "only use identities if they stem from the same regexp" do
140
- @identities.push @r_1
141
- @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
142
- @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
143
- x_needle = [ 'CESSNA D-333 CITATION V']
144
- x_haystack = [ 'CESSNA D-333' ]
145
- @haystack.push x_haystack
146
-
147
- assert_equal x_haystack, ltd.improver.match(x_needle)
148
- end
149
-
150
- should "use the best score from all of the tightenings" do
151
- x_needle = ["BOEING 737100"]
152
- x_haystack = ["BOEING BOEING 737-100/200"]
153
- x_haystack_wrong = ["BOEING BOEING 737-900"]
154
- @haystack.push x_haystack
155
- @haystack.push x_haystack_wrong
156
- @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
157
- @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
158
-
159
- assert_equal x_haystack, ltd.improver.match(x_needle)
160
- end
161
-
162
- should "compare using prefixes if tightened key is shorter than correct match" do
163
- x_needle = ["BOEING 720"]
164
- x_haystack = ["BOEING BOEING 720-000"]
165
- x_haystack_wrong = ["BOEING BOEING 717-200"]
166
- @haystack.push x_haystack
167
- @haystack.push x_haystack_wrong
168
- @tightenings.push @t_1
169
- @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
170
- @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
171
-
172
- assert_equal x_haystack, ltd.improver.match(x_needle)
173
- end
174
-
175
- should "use the shortest original input" do
176
- x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
177
- x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
178
- x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
179
-
180
- @haystack.push x_haystack_long
181
- @haystack.push x_haystack
182
- @tightenings.push @t_1
183
-
184
- assert_equal x_haystack, ltd.improver.match(x_needle)
185
- end
186
-
187
- should "perform lookups needle to haystack" do
188
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
189
- end
190
-
191
- should "succeed if there are no checks" do
192
- assert_nothing_raised do
193
- ltd.improver.check @needle
194
- end
195
- end
196
-
197
- should "succeed if the positive checks just work" do
198
- @positives.push [ @a_needle[0], @a_haystack[0] ]
199
-
200
- assert_nothing_raised do
201
- ltd.improver.check @needle
202
- end
203
- end
204
-
205
- should "fail if positive checks don't work" do
206
- @positives.push [ @d_needle[0], @d_haystack[0] ]
207
-
208
- assert_raises(FuzzyMatch::Improver::Mismatch) do
209
- ltd.improver.check @needle
210
- end
211
- end
212
-
213
- should "succeed if proper tightening is applied" do
214
- @positives.push [ @d_needle[0], @d_haystack[0] ]
215
- @tightenings.push @t_1
216
-
217
- assert_nothing_raised do
218
- ltd.improver.check @needle
219
- end
220
- end
221
-
222
- should "use a Google Docs spreadsheet as a source of tightenings" do
223
- @positives.push [ @d_needle[0], @d_haystack[0] ]
224
- @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
225
-
226
- # sabshere 9/30/10 this shouldn't raise anything
227
- # but the tightenings have been changed... we should be using test-only tightenings, not production ones
228
- # assert_nothing_raised do
229
- assert_raises(FuzzyMatch::Improver::Mismatch) do
230
- ltd.improver.check @needle
231
- end
232
- end
233
-
234
- should "fail if negative checks don't work" do
235
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
236
-
237
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
238
- ltd.improver.check @needle
239
- end
240
- end
241
-
242
- should "do inline checking" do
243
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
244
-
245
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
246
- ltd.improver.match @b_needle
247
- end
248
- end
249
-
250
- should "fail if negative checks don't work, even with tightening" do
251
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
252
- @tightenings.push @t_1
253
-
254
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
255
- ltd.improver.check @needle
256
- end
257
- end
258
-
259
- should "succeed if proper identity is applied" do
260
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
261
- @positives.push [ @d_needle[0], @d_haystack[0] ]
262
- @identities.push @r_1
263
-
264
- assert_nothing_raised do
265
- ltd.improver.check @needle
266
- end
267
- end
268
- end
@@ -1,28 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Grouping do
4
- it %{matches a single string argument} do
5
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
6
- b.match?('2 apples').must_equal true
7
- end
8
-
9
- it %{embraces case insensitivity} do
10
- b = FuzzyMatch::Rule::Grouping.new %r{apple}i
11
- b.match?('2 Apples').must_equal true
12
- end
13
-
14
- it %{joins two string arguments} do
15
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
16
- b.join?('apple', '2 apples').must_equal true
17
- end
18
-
19
- it %{fails to join two string arguments} do
20
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
21
- b.join?('orange', '2 apples').must_equal false
22
- end
23
-
24
- it %{returns nil instead of false when it has no information} do
25
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
26
- b.join?('orange', 'orange').must_be_nil
27
- end
28
- end
@@ -1,34 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Identity do
4
- it %{determines whether two records COULD be identical} do
5
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
6
- i.identical?('A1', 'A 1foobar').must_equal true
7
- end
8
-
9
- it %{determines that two records MUST NOT be identical} do
10
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
11
- i.identical?('A1', 'A 2foobar').must_equal false
12
- end
13
-
14
- it %{returns nil indicating no information} do
15
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
16
- i.identical?('B1', 'A 2foobar').must_equal nil
17
- end
18
-
19
- it %{can be initialized with a regexp} do
20
- i = FuzzyMatch::Rule::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
21
- i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
22
- end
23
-
24
- it %{does not automatically convert strings to regexps} do
25
- lambda do
26
- FuzzyMatch::Rule::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
27
- end.must_raise ArgumentError, /regexp/i
28
- end
29
-
30
- it %{embraces case insensitivity} do
31
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}i
32
- i.identical?('A1', 'a 1foobar').must_equal true
33
- end
34
- end
@@ -1,10 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Normalizer do
4
- it %{applies itself to a string argument} do
5
- t = FuzzyMatch::Rule::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
6
- t.apply('Ford F-350').must_equal 'Ford F350'
7
- t.apply('Ford F150').must_equal 'Ford F150'
8
- t.apply('Ford F 350').must_equal 'Ford F350'
9
- end
10
- end