fuzzy_match 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +8 -8
  2. data/.rspec +2 -0
  3. data/CHANGELOG +14 -0
  4. data/Gemfile +8 -0
  5. data/README.markdown +58 -38
  6. data/Rakefile +0 -9
  7. data/bin/fuzzy_match +106 -0
  8. data/fuzzy_match.gemspec +4 -4
  9. data/groupings-screenshot.png +0 -0
  10. data/highlevel.graffle +0 -0
  11. data/highlevel.png +0 -0
  12. data/lib/fuzzy_match/record.rb +58 -0
  13. data/lib/fuzzy_match/result.rb +11 -8
  14. data/lib/fuzzy_match/rule/grouping.rb +70 -12
  15. data/lib/fuzzy_match/rule/identity.rb +3 -3
  16. data/lib/fuzzy_match/rule.rb +1 -1
  17. data/lib/fuzzy_match/score/amatch.rb +0 -4
  18. data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
  19. data/lib/fuzzy_match/score.rb +4 -0
  20. data/lib/fuzzy_match/similarity.rb +10 -32
  21. data/lib/fuzzy_match/version.rb +1 -1
  22. data/lib/fuzzy_match.rb +78 -94
  23. data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
  24. data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
  25. data/spec/foo.rb +9 -0
  26. data/spec/fuzzy_match_spec.rb +354 -0
  27. data/spec/grouping_spec.rb +60 -0
  28. data/spec/identity_spec.rb +29 -0
  29. data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
  30. data/spec/spec_helper.rb +21 -0
  31. metadata +56 -50
  32. data/bin/fuzzy_match_checker +0 -71
  33. data/examples/bts_aircraft/5-2-A.htm +0 -10305
  34. data/examples/bts_aircraft/5-2-B.htm +0 -9576
  35. data/examples/bts_aircraft/5-2-D.htm +0 -7094
  36. data/examples/bts_aircraft/5-2-E.htm +0 -2349
  37. data/examples/bts_aircraft/5-2-G.htm +0 -2922
  38. data/examples/bts_aircraft/groupings.csv +0 -1
  39. data/examples/bts_aircraft/identities.csv +0 -1
  40. data/examples/bts_aircraft/negatives.csv +0 -1
  41. data/examples/bts_aircraft/normalizers.csv +0 -1
  42. data/examples/bts_aircraft/number_260.csv +0 -334
  43. data/examples/bts_aircraft/positives.csv +0 -1
  44. data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
  45. data/examples/first_name_matching.rb +0 -15
  46. data/examples/icao-bts.xls +0 -0
  47. data/lib/fuzzy_match/rule/normalizer.rb +0 -20
  48. data/lib/fuzzy_match/rule/stop_word.rb +0 -11
  49. data/lib/fuzzy_match/wrapper.rb +0 -73
  50. data/test/helper.rb +0 -12
  51. data/test/test_fuzzy_match.rb +0 -304
  52. data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
  53. data/test/test_grouping.rb +0 -28
  54. data/test/test_identity.rb +0 -34
  55. data/test/test_normalizer.rb +0 -10
@@ -1,304 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
- require 'helper'
3
-
4
- describe FuzzyMatch do
5
- describe '#find' do
6
- it %{identifies the best match based on string similarity} do
7
- d = FuzzyMatch.new %w{ RATZ CATZ }
8
- d.find('RITZ').must_equal 'RATZ'
9
- d.find('RíTZ').must_equal 'RATZ'
10
-
11
- d = FuzzyMatch.new [ 'X' ]
12
- d.find('X').must_equal 'X'
13
- d.find('A').must_be_nil
14
- end
15
-
16
- it %{not return any result if the maximum score is zero} do
17
- FuzzyMatch.new(['a']).find('b').must_be_nil
18
- end
19
- end
20
-
21
- describe '#find_all' do
22
- it %{return all records in sorted order} do
23
- d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
24
- d.find_all('X').must_equal ['X', 'X22' ]
25
- d.find_all('A').must_equal []
26
- end
27
- end
28
-
29
- describe '#find_best' do
30
- it %{returns one or more records with the best score} do
31
- d = FuzzyMatch.new [ 'X', 'X', 'X22', 'Y', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
32
- d.find_best('X').must_equal ['X', 'X' ]
33
- d.find_best('A').must_equal []
34
- end
35
- end
36
-
37
- describe '#find_all_with_score' do
38
- it %{return records with 2 scores} do
39
- d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
40
- d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
41
- d.find_all_with_score('A').must_equal []
42
- end
43
- end
44
-
45
- describe '#explain' do
46
- before do
47
- require 'stringio'
48
- @capture = StringIO.new
49
- @old_stdout = $stdout
50
- $stdout = @capture
51
- end
52
- after do
53
- $stdout = @old_stdout
54
- end
55
-
56
- it %{print a basic explanation to stdout} do
57
- d = FuzzyMatch.new %w{ RATZ CATZ }
58
- d.explain('RITZ')
59
- @capture.rewind
60
- @capture.read.must_include 'CATZ'
61
- end
62
-
63
- it %{explains match failures} do
64
- FuzzyMatch.new(['aaa']).explain('bbb')
65
- @capture.rewind
66
- @capture.read.must_match %r{No winner assigned.*aaa.*bbb}
67
- end
68
- end
69
-
70
- describe "normalizers" do
71
- it %{sometimes gets false results without them} do
72
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
73
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
74
- end
75
-
76
- it %{can be used to improve results} do
77
- normalizers = [
78
- %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
79
- ]
80
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
81
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
82
- end
83
- end
84
-
85
- describe "identities" do
86
- it %{sometimes gets false results without them} do
87
- # false positive without identity
88
- d = FuzzyMatch.new %w{ foo bar }
89
- d.find('baz').must_equal 'bar'
90
- end
91
-
92
- it %{can be used to improve results} do
93
- d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
94
- d.find('baz').must_be_nil
95
- end
96
- end
97
-
98
- describe 'groupings' do
99
- it %{sometimes gets false results without them} do
100
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ]
101
- d.find('Barack Bush').must_equal 'Barack Obama' # luke i am your father
102
- d.find('George Obama').must_equal 'George Bush' # nooooooooooooooooooo
103
- end
104
-
105
- it %{can be used to improve results} do
106
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
107
- d.find('Barack Bush').must_equal 'George Bush'
108
- d.find('George Obama').must_equal 'Barack Obama'
109
- end
110
- end
111
-
112
- describe "the :must_match_grouping option" do
113
- it %{optionally only attempt matches with records that fit into a grouping} do
114
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ], :must_match_grouping => true
115
- d.find('George Clinton').must_be_nil
116
-
117
- d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
118
- d.find('George Clinton', :must_match_grouping => true).must_be_nil
119
- end
120
- end
121
-
122
- describe "the :first_grouping_decides option" do
123
- it %{optionally force the first grouping to decide} do
124
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ]
125
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
126
-
127
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
128
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
129
-
130
- # first_grouping_decides refers to the needle
131
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
132
- d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
133
-
134
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_grouping_decides => true
135
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
136
-
137
- # or equivalently with an identity
138
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true, :identities => [ /boeing (7|E)/i ]
139
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
140
- end
141
- end
142
-
143
- describe "the :read option" do
144
- it %{interpret a Numeric as an array index} do
145
- ab = ['a', 'b']
146
- ba = ['b', 'a']
147
- haystack = [ab, ba]
148
- by_first = FuzzyMatch.new haystack, :read => 0
149
- by_last = FuzzyMatch.new haystack, :read => 1
150
- by_first.find('a').must_equal ab
151
- by_last.find('b').must_equal ab
152
- by_first.find('b').must_equal ba
153
- by_last.find('a').must_equal ba
154
- end
155
-
156
- it %{interpret a Symbol, etc. as hash key} do
157
- ab = { :one => 'a', :two => 'b' }
158
- ba = { :one => 'b', :two => 'a' }
159
- haystack = [ab, ba]
160
- by_first = FuzzyMatch.new haystack, :read => :one
161
- by_last = FuzzyMatch.new haystack, :read => :two
162
- by_first.find('a').must_equal ab
163
- by_last.find('b').must_equal ab
164
- by_first.find('b').must_equal ba
165
- by_last.find('a').must_equal ba
166
- end
167
-
168
- MyStruct = Struct.new(:one, :two)
169
- it %{interpret a Symbol as a method id (if the object responds to it)} do
170
- ab = MyStruct.new('a', 'b')
171
- ba = MyStruct.new('b', 'a')
172
- haystack = [ab, ba]
173
- by_first = FuzzyMatch.new haystack, :read => :one
174
- by_last = FuzzyMatch.new haystack, :read => :two
175
- by_first.read.must_equal :one
176
- by_last.read.must_equal :two
177
- by_first.find('a').must_equal ab
178
- by_last.find('b').must_equal ab
179
- by_first.find('b').must_equal ba
180
- by_last.find('a').must_equal ba
181
- end
182
- end
183
-
184
- describe 'the :must_match_at_least_one_word option' do
185
- it %{optionally require that the matching record share at least one word with the needle} do
186
- d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
187
- d.find('RITZ').must_be_nil
188
-
189
- d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
190
- d.find("Foo's").must_equal "Foo's Bar"
191
- d.find("'s").must_be_nil
192
- d.find("Foo").must_be_nil
193
-
194
- d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
195
- d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
196
- end
197
-
198
- it %{use STOP WORDS} do
199
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
200
- d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
201
-
202
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
203
- d.find('A HTL').must_equal 'B HTL'
204
-
205
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
206
- d.find('A HTL').must_equal 'A HOTEL'
207
- end
208
-
209
- it %{not be fooled by substrings (but rather compare whole words to whole words)} do
210
- d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
211
- d.find('DOLCE LA HULPE BXL FI').must_be_nil
212
- end
213
-
214
- it %{not be case-sensitive when checking for sharing of words} do
215
- d = FuzzyMatch.new [ 'A', 'B' ]
216
- d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
217
- end
218
- end
219
-
220
- describe "the :gather_last_result option" do
221
- it %{not gather metadata about the last result by default} do
222
- d = FuzzyMatch.new %w{ NISSAN HONDA }
223
- d.find('MISSAM')
224
- lambda do
225
- d.last_result
226
- end.must_raise ::RuntimeError, /gather_last_result/
227
- end
228
-
229
- it %{optionally gather metadata about the last result} do
230
- d = FuzzyMatch.new %w{ NISSAN HONDA }
231
- d.find 'MISSAM', :gather_last_result => true
232
- d.last_result.score.must_equal 0.6
233
- d.last_result.winner.must_equal 'NISSAN'
234
- end
235
- end
236
-
237
- describe 'quirks' do
238
- it %{should not return false negatives because of one-letter similarities} do
239
- # dices coefficient doesn't think these two are similar at all because it looks at pairs
240
- FuzzyMatch.score_class.new('X foo', 'X bar').dices_coefficient_similar.must_equal 0
241
- # so we must compensate for that somewhere
242
- d = FuzzyMatch.new ['X foo', 'randomness']
243
- d.find('X bar').must_equal 'X foo'
244
- # without making false positives
245
- d.find('Y bar').must_be_nil
246
- end
247
-
248
- it %{finds possible matches even when pair distance fails} do
249
- d = FuzzyMatch.new ['XX', '2 A']
250
- d.find('2A').must_equal '2 A'
251
- d = FuzzyMatch.new ['XX', '2A']
252
- d.find('2 A').must_equal '2A'
253
- end
254
-
255
- it %{weird blow ups} do
256
- d = FuzzyMatch.new ['XX', '2 A']
257
- d.find('A').must_equal '2 A'
258
- d = FuzzyMatch.new ['XX', 'A']
259
- d.find('2 A').must_equal 'A'
260
- end
261
-
262
- end
263
-
264
- describe 'deprecations' do
265
- it %{takes :must_match_blocking as :must_match_grouping} do
266
- d = FuzzyMatch.new [], :must_match_blocking => :a
267
- d.default_options[:must_match_grouping].must_equal :a
268
- end
269
-
270
- it %{takes :first_blocking_decides as :first_grouping_decides} do
271
- d = FuzzyMatch.new [], :first_blocking_decides => :b
272
- d.default_options[:first_grouping_decides].must_equal :b
273
- end
274
-
275
- it %{takes :haystack_reader as :read} do
276
- d = FuzzyMatch.new [], :haystack_reader => :c
277
- d.read.must_equal :c
278
- end
279
-
280
- it %{takes :blockings as :groupings} do
281
- d = FuzzyMatch.new [], :blockings => [ /X/, /Y/ ]
282
- d.groupings.must_equal [ FuzzyMatch::Rule::Grouping.new(/X/), FuzzyMatch::Rule::Grouping.new(/Y/) ]
283
- end
284
-
285
- it %{takes :tighteners as :normalizers} do
286
- d = FuzzyMatch.new [], :tighteners => [ /X/, /Y/ ]
287
- d.normalizers.must_equal [ FuzzyMatch::Rule::Normalizer.new(/X/), FuzzyMatch::Rule::Normalizer.new(/Y/) ]
288
- end
289
-
290
- it %{receives #free method, but doesn't do anything} do
291
- d = FuzzyMatch.new %w{ A B }
292
- d.free
293
- d.find('A').wont_be_nil
294
- end
295
- end
296
-
297
- it %{defaults to a pure-ruby engine, but also has amatch} do
298
- if defined?($testing_amatch) and $testing_amatch
299
- FuzzyMatch.engine.must_equal :amatch
300
- else
301
- FuzzyMatch.engine.must_equal :pure_ruby
302
- end
303
- end
304
- end
@@ -1,268 +0,0 @@
1
- require 'helper'
2
-
3
- require 'shoulda'
4
-
5
- $log = false
6
-
7
- class TestFuzzyMatchConvoluted < MiniTest::Spec
8
- def setup
9
- clear_ltd
10
-
11
- # dh 8 400
12
- @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
13
- @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
14
- # dh 88
15
- @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
16
- # dh 89
17
- @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
18
- # dh 8 200
19
- @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
20
- @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
21
- @d_lookalike = ['ABCD DHC8200 Dash 8']
22
-
23
- @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
24
-
25
- @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
26
-
27
- @needle = [
28
- @a_needle,
29
- @b_needle,
30
- ['DE HAVILLAND DH89 Dragon Rapide'],
31
- ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
32
- @d_needle,
33
- ['DE HAVILLAND CANADA DHC8300 Dash 8'],
34
- ['DE HAVILLAND DH90 Dragonfly']
35
- ]
36
- @haystack = [
37
- @a_haystack,
38
- @c_haystack,
39
- @d_haystack,
40
- ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
41
- ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
42
- ]
43
- @tightenings = []
44
- @identities = []
45
- @groupings = []
46
- @positives = []
47
- @negatives = []
48
- end
49
-
50
- def clear_ltd
51
- @_ltd = nil
52
- end
53
-
54
- def ltd
55
- @_ltd ||= FuzzyMatch.new @haystack,
56
- :tightenings => @tightenings,
57
- :identities => @identities,
58
- :groupings => @groupings,
59
- :positives => @positives,
60
- :negatives => @negatives,
61
- :grouping_only => @grouping_only,
62
- :log => $log
63
- end
64
-
65
- should "optionally only pay attention to things that match groupings" do
66
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
67
-
68
- clear_ltd
69
- @grouping_only = true
70
- assert_equal nil, ltd.improver.match(@a_needle)
71
-
72
- clear_ltd
73
- @grouping_only = true
74
- @groupings.push ['/dash/i']
75
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
76
- end
77
-
78
- # the example from the readme, considerably uglier here
79
- should "check a simple table" do
80
- @haystack = [ 'seamus', 'andy', 'ben' ]
81
- @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
82
- needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
83
-
84
- assert_nothing_raised do
85
- ltd.improver.check needle
86
- end
87
- end
88
-
89
- should "treat a String as a full record if passed through" do
90
- dash = 'DHC8-400'
91
- b747 = 'B747200/300'
92
- dc9 = 'DC-9-10'
93
- haystack_records = [ dash, b747, dc9 ]
94
- simple_ltd = FuzzyMatch.new haystack_records, :log => $log
95
- assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
96
- assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
97
- assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
98
- end
99
-
100
- should "call it a mismatch if you hit a blank positive" do
101
- @positives.push [@a_needle[0], '']
102
- assert_raises(FuzzyMatch::Improver::Mismatch) do
103
- ltd.improver.match @a_needle
104
- end
105
- end
106
-
107
- should "call it a false positive if you hit a blank negative" do
108
- @negatives.push [@a_needle[0], '']
109
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
110
- ltd.improver.match @a_needle
111
- end
112
- end
113
-
114
- should "have a false match without grouping" do
115
- # @d_needle will be our victim
116
- @haystack.push @d_lookalike
117
- @tightenings.push @t_1
118
-
119
- assert_equal @d_lookalike, ltd.improver.match(@d_needle)
120
- end
121
-
122
- should "do grouping if the needle matches a group" do
123
- # @d_needle will be our victim
124
- @haystack.push @d_lookalike
125
- @tightenings.push @t_1
126
- @groupings.push ['/(bombardier|de ?havilland)/i']
127
-
128
- assert_equal @d_haystack, ltd.improver.match(@d_needle)
129
- end
130
-
131
- should "treat groups as exclusive" do
132
- @haystack = [ @d_needle ]
133
- @tightenings.push @t_1
134
- @groupings.push ['/(bombardier|de ?havilland)/i']
135
-
136
- assert_equal nil, ltd.improver.match(@d_lookalike)
137
- end
138
-
139
- should "only use identities if they stem from the same regexp" do
140
- @identities.push @r_1
141
- @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
142
- @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
143
- x_needle = [ 'CESSNA D-333 CITATION V']
144
- x_haystack = [ 'CESSNA D-333' ]
145
- @haystack.push x_haystack
146
-
147
- assert_equal x_haystack, ltd.improver.match(x_needle)
148
- end
149
-
150
- should "use the best score from all of the tightenings" do
151
- x_needle = ["BOEING 737100"]
152
- x_haystack = ["BOEING BOEING 737-100/200"]
153
- x_haystack_wrong = ["BOEING BOEING 737-900"]
154
- @haystack.push x_haystack
155
- @haystack.push x_haystack_wrong
156
- @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
157
- @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
158
-
159
- assert_equal x_haystack, ltd.improver.match(x_needle)
160
- end
161
-
162
- should "compare using prefixes if tightened key is shorter than correct match" do
163
- x_needle = ["BOEING 720"]
164
- x_haystack = ["BOEING BOEING 720-000"]
165
- x_haystack_wrong = ["BOEING BOEING 717-200"]
166
- @haystack.push x_haystack
167
- @haystack.push x_haystack_wrong
168
- @tightenings.push @t_1
169
- @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
170
- @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
171
-
172
- assert_equal x_haystack, ltd.improver.match(x_needle)
173
- end
174
-
175
- should "use the shortest original input" do
176
- x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
177
- x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
178
- x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
179
-
180
- @haystack.push x_haystack_long
181
- @haystack.push x_haystack
182
- @tightenings.push @t_1
183
-
184
- assert_equal x_haystack, ltd.improver.match(x_needle)
185
- end
186
-
187
- should "perform lookups needle to haystack" do
188
- assert_equal @a_haystack, ltd.improver.match(@a_needle)
189
- end
190
-
191
- should "succeed if there are no checks" do
192
- assert_nothing_raised do
193
- ltd.improver.check @needle
194
- end
195
- end
196
-
197
- should "succeed if the positive checks just work" do
198
- @positives.push [ @a_needle[0], @a_haystack[0] ]
199
-
200
- assert_nothing_raised do
201
- ltd.improver.check @needle
202
- end
203
- end
204
-
205
- should "fail if positive checks don't work" do
206
- @positives.push [ @d_needle[0], @d_haystack[0] ]
207
-
208
- assert_raises(FuzzyMatch::Improver::Mismatch) do
209
- ltd.improver.check @needle
210
- end
211
- end
212
-
213
- should "succeed if proper tightening is applied" do
214
- @positives.push [ @d_needle[0], @d_haystack[0] ]
215
- @tightenings.push @t_1
216
-
217
- assert_nothing_raised do
218
- ltd.improver.check @needle
219
- end
220
- end
221
-
222
- should "use a Google Docs spreadsheet as a source of tightenings" do
223
- @positives.push [ @d_needle[0], @d_haystack[0] ]
224
- @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
225
-
226
- # sabshere 9/30/10 this shouldn't raise anything
227
- # but the tightenings have been changed... we should be using test-only tightenings, not production ones
228
- # assert_nothing_raised do
229
- assert_raises(FuzzyMatch::Improver::Mismatch) do
230
- ltd.improver.check @needle
231
- end
232
- end
233
-
234
- should "fail if negative checks don't work" do
235
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
236
-
237
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
238
- ltd.improver.check @needle
239
- end
240
- end
241
-
242
- should "do inline checking" do
243
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
244
-
245
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
246
- ltd.improver.match @b_needle
247
- end
248
- end
249
-
250
- should "fail if negative checks don't work, even with tightening" do
251
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
252
- @tightenings.push @t_1
253
-
254
- assert_raises(FuzzyMatch::Improver::FalsePositive) do
255
- ltd.improver.check @needle
256
- end
257
- end
258
-
259
- should "succeed if proper identity is applied" do
260
- @negatives.push [ @b_needle[0], @c_haystack[0] ]
261
- @positives.push [ @d_needle[0], @d_haystack[0] ]
262
- @identities.push @r_1
263
-
264
- assert_nothing_raised do
265
- ltd.improver.check @needle
266
- end
267
- end
268
- end
@@ -1,28 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Grouping do
4
- it %{matches a single string argument} do
5
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
6
- b.match?('2 apples').must_equal true
7
- end
8
-
9
- it %{embraces case insensitivity} do
10
- b = FuzzyMatch::Rule::Grouping.new %r{apple}i
11
- b.match?('2 Apples').must_equal true
12
- end
13
-
14
- it %{joins two string arguments} do
15
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
16
- b.join?('apple', '2 apples').must_equal true
17
- end
18
-
19
- it %{fails to join two string arguments} do
20
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
21
- b.join?('orange', '2 apples').must_equal false
22
- end
23
-
24
- it %{returns nil instead of false when it has no information} do
25
- b = FuzzyMatch::Rule::Grouping.new %r{apple}
26
- b.join?('orange', 'orange').must_be_nil
27
- end
28
- end
@@ -1,34 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Identity do
4
- it %{determines whether two records COULD be identical} do
5
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
6
- i.identical?('A1', 'A 1foobar').must_equal true
7
- end
8
-
9
- it %{determines that two records MUST NOT be identical} do
10
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
11
- i.identical?('A1', 'A 2foobar').must_equal false
12
- end
13
-
14
- it %{returns nil indicating no information} do
15
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
16
- i.identical?('B1', 'A 2foobar').must_equal nil
17
- end
18
-
19
- it %{can be initialized with a regexp} do
20
- i = FuzzyMatch::Rule::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
21
- i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
22
- end
23
-
24
- it %{does not automatically convert strings to regexps} do
25
- lambda do
26
- FuzzyMatch::Rule::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
27
- end.must_raise ArgumentError, /regexp/i
28
- end
29
-
30
- it %{embraces case insensitivity} do
31
- i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}i
32
- i.identical?('A1', 'a 1foobar').must_equal true
33
- end
34
- end
@@ -1,10 +0,0 @@
1
- require 'helper'
2
-
3
- describe FuzzyMatch::Rule::Normalizer do
4
- it %{applies itself to a string argument} do
5
- t = FuzzyMatch::Rule::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
6
- t.apply('Ford F-350').must_equal 'Ford F350'
7
- t.apply('Ford F150').must_equal 'Ford F150'
8
- t.apply('Ford F 350').must_equal 'Ford F350'
9
- end
10
- end