fuzzy_match 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,7 @@ ERB
28
28
  attr_accessor :haystack
29
29
  attr_accessor :options
30
30
  attr_accessor :normalizers
31
- attr_accessor :blockings
31
+ attr_accessor :groupings
32
32
  attr_accessor :identities
33
33
  attr_accessor :stop_words
34
34
  attr_accessor :winner
@@ -0,0 +1,14 @@
1
+ class FuzzyMatch
2
+ # A rule characterized by a regexp. Abstract.
3
+ class Rule
4
+ attr_reader :regexp
5
+
6
+ def initialize(regexp_or_str)
7
+ @regexp = regexp_or_str.to_regexp
8
+ end
9
+
10
+ def ==(other)
11
+ regexp == other.regexp
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,32 @@
1
+ class FuzzyMatch
2
+ class Rule
3
+ # "Record linkage typically involves two main steps: grouping and scoring..."
4
+ # http://en.wikipedia.org/wiki/Record_linkage
5
+ #
6
+ # Groupings effectively divide up the haystack into groups that match a pattern
7
+ #
8
+ # A grouping (formerly known as a blocking) comes into effect when a str matches.
9
+ # Then the needle must also match the grouping's regexp.
10
+ class Grouping < Rule
11
+ def match?(str)
12
+ !!(regexp.match(str))
13
+ end
14
+
15
+ # If a grouping "joins" two strings, that means they both fit into it.
16
+ #
17
+ # Returns false if they certainly don't fit this grouping.
18
+ # Returns nil if the grouping doesn't apply, i.e. str2 doesn't fit the grouping.
19
+ def join?(str1, str2)
20
+ if str2_match_data = regexp.match(str2)
21
+ if str1_match_data = regexp.match(str1)
22
+ str2_match_data.captures.join.downcase == str1_match_data.captures.join.downcase
23
+ else
24
+ false
25
+ end
26
+ else
27
+ nil
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,19 @@
1
+ class FuzzyMatch
2
+ class Rule
3
+ # Identities take effect when needle and haystack both match a regexp
4
+ # Then the captured part of the regexp has to match exactly
5
+ class Identity < Rule
6
+ # Two strings are "identical" if they both match this identity and the captures are equal.
7
+ #
8
+ # Only returns true/false if both strings match the regexp.
9
+ # Otherwise returns nil.
10
+ def identical?(str1, str2)
11
+ if str1_match_data = regexp.match(str1) and match_data = regexp.match(str2)
12
+ str1_match_data.captures.join.downcase == match_data.captures.join.downcase
13
+ else
14
+ nil
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,20 @@
1
+ class FuzzyMatch
2
+ class Rule
3
+ # A normalizer just strips a string down to its core
4
+ class Normalizer < Rule
5
+ # A normalizer applies when its regexp matches and captures a new (shorter) string
6
+ def apply?(str)
7
+ !!(regexp.match(str))
8
+ end
9
+
10
+ # The result of applying a normalizer is just all the captures put together.
11
+ def apply(str)
12
+ if match_data = regexp.match(str)
13
+ match_data.captures.join
14
+ else
15
+ str
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,11 @@
1
+ class FuzzyMatch
2
+ class Rule
3
+ # A stop word is ignored
4
+ class StopWord < Rule
5
+ # Destructively remove stop words from the string
6
+ def apply!(str)
7
+ str.gsub! regexp, ''
8
+ end
9
+ end
10
+ end
11
+ end
@@ -1,3 +1,3 @@
1
1
  class FuzzyMatch
2
- VERSION = '1.3.1'
2
+ VERSION = '1.3.2'
3
3
  end
data/test/helper.rb CHANGED
@@ -3,7 +3,9 @@ require 'bundler'
3
3
  Bundler.setup
4
4
  require 'minitest/spec'
5
5
  require 'minitest/autorun'
6
- require 'stringio'
6
+ require 'minitest/reporters'
7
+ MiniTest::Unit.runner = MiniTest::SuiteRunner.new
8
+ MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
7
9
 
8
10
  $LOAD_PATH.unshift(File.dirname(__FILE__))
9
11
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
@@ -2,98 +2,126 @@
2
2
  require 'helper'
3
3
 
4
4
  class TestFuzzyMatch < MiniTest::Spec
5
- it %{identify the best match based on string similarity} do
6
- d = FuzzyMatch.new %w{ RATZ CATZ }
7
- d.find('RITZ').must_equal 'RATZ'
8
- d.find('RíTZ').must_equal 'RATZ'
9
-
10
- d = FuzzyMatch.new [ 'X' ]
11
- d.find('X').must_equal 'X'
12
- d.find('A').must_be_nil
13
- end
5
+ describe '#find' do
6
+ it %{identifies the best match based on string similarity} do
7
+ d = FuzzyMatch.new %w{ RATZ CATZ }
8
+ d.find('RITZ').must_equal 'RATZ'
9
+ d.find('RíTZ').must_equal 'RATZ'
14
10
 
15
- it %{not gather metadata about the last result by default} do
16
- d = FuzzyMatch.new %w{ NISSAN HONDA }
17
- d.find('MISSAM')
18
- lambda do
19
- d.last_result
20
- end.must_raise ::RuntimeError, /gather_last_result/
11
+ d = FuzzyMatch.new [ 'X' ]
12
+ d.find('X').must_equal 'X'
13
+ d.find('A').must_be_nil
14
+ end
15
+
16
+ it %{not return any result if the maximum score is zero} do
17
+ FuzzyMatch.new(['a']).find('b').must_be_nil
18
+ end
21
19
  end
22
-
23
- it %{optionally gather metadata about the last result} do
24
- d = FuzzyMatch.new %w{ NISSAN HONDA }
25
- d.find 'MISSAM', :gather_last_result => true
26
- d.last_result.score.must_equal 0.6
27
- d.last_result.winner.must_equal 'NISSAN'
20
+
21
+ describe '#find_all' do
22
+ it %{return all records in sorted order} do
23
+ d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
24
+ d.find_all('X').must_equal ['X', 'X22' ]
25
+ d.find_all('A').must_equal []
26
+ end
28
27
  end
29
28
 
30
- it %{use NORMALIZERS} do
31
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
32
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
33
-
34
- normalizers = [
35
- %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
36
- ]
37
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
38
- d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
29
+ describe '#explain' do
30
+ before do
31
+ require 'stringio'
32
+ @capture = StringIO.new
33
+ @old_stdout = $stdout
34
+ $stdout = @capture
35
+ end
36
+ after do
37
+ $stdout = @old_stdout
38
+ end
39
+
40
+ it %{print a basic explanation to stdout} do
41
+ d = FuzzyMatch.new %w{ RATZ CATZ }
42
+ d.explain('RITZ')
43
+ @capture.rewind
44
+ @capture.read.must_include 'CATZ'
45
+ end
46
+
47
+ it %{explains match failures} do
48
+ FuzzyMatch.new(['aaa']).explain('bbb')
49
+ @capture.rewind
50
+ @capture.read.must_match %r{No winner assigned.*aaa.*bbb}
51
+ end
39
52
  end
40
53
 
41
- it %{use IDENTITIES} do
42
- # false positive without identity
43
- d = FuzzyMatch.new %w{ foo bar }
44
- d.find('baz').must_equal 'bar'
45
-
46
- d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
47
- d.find('baz').must_be_nil
48
- end
54
+ describe "normalizers" do
55
+ it %{sometimes gets false results without them} do
56
+ d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
57
+ d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
58
+ end
49
59
 
50
- # TODO this is not very helpful
51
- it %{use BLOCKINGS} do
52
- d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
53
- d.find('X').must_equal 'X'
54
- d.find('A').must_be_nil
60
+ it %{can be used to improve results} do
61
+ normalizers = [
62
+ %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
63
+ ]
64
+ d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
65
+ d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
66
+ end
55
67
  end
56
68
 
57
- # TODO this is not very helpful
58
- it %{optionally only attempt matches with records that fit into a blocking} do
59
- d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
60
- d.find('X').must_equal 'X'
61
- d.find('A').must_be_nil
69
+ describe "identities" do
70
+ it %{sometimes gets false results without them} do
71
+ # false positive without identity
72
+ d = FuzzyMatch.new %w{ foo bar }
73
+ d.find('baz').must_equal 'bar'
74
+ end
62
75
 
63
- d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
64
- d.find('X', :must_match_blocking => true).must_equal 'X'
65
- d.find('A', :must_match_blocking => true).must_be_nil
76
+ it %{can be used to improve results} do
77
+ d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
78
+ d.find('baz').must_be_nil
79
+ end
66
80
  end
67
81
 
68
- it %{receive the deprecated FuzzyMatch#free method without complaint} do
69
- d = FuzzyMatch.new %w{ A B }
70
- d.free
71
- d.find('A').wont_be_nil
82
+ describe 'groupings' do
83
+ it %{sometimes gets false results without them} do
84
+ d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ]
85
+ d.find('Barack Bush').must_equal 'Barack Obama' # luke i am your father
86
+ d.find('George Obama').must_equal 'George Bush' # nooooooooooooooooooo
87
+ end
88
+
89
+ it %{can be used to improve results} do
90
+ d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
91
+ d.find('Barack Bush').must_equal 'George Bush'
92
+ d.find('George Obama').must_equal 'Barack Obama'
93
+ end
72
94
  end
73
-
74
- it %{return all records in sorted order} do
75
- d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
76
- d.find_all('X').must_equal ['X', 'X22' ]
77
- d.find_all('A').must_equal []
95
+
96
+ describe "the :must_match_grouping option" do
97
+ it %{optionally only attempt matches with records that fit into a grouping} do
98
+ d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ], :must_match_grouping => true
99
+ d.find('George Clinton').must_be_nil
100
+
101
+ d = FuzzyMatch.new [ 'Barack Obama', 'George Bush' ], :groupings => [ /Obama/, /Bush/ ]
102
+ d.find('George Clinton', :must_match_grouping => true).must_be_nil
103
+ end
78
104
  end
105
+
106
+ describe "the :first_grouping_decides option" do
107
+ it %{optionally force the first grouping to decide} do
108
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ]
109
+ d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
79
110
 
80
- it %{optionally force the first blocking to decide} do
81
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
82
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
83
-
84
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
85
- d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
111
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
112
+ d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
86
113
 
87
- # first_blocking_decides refers to the needle
88
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
89
- d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
114
+ # first_grouping_decides refers to the needle
115
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true
116
+ d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
90
117
 
91
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
92
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
118
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_grouping_decides => true
119
+ d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
93
120
 
94
- # or equivalently with an identity
95
- d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
96
- d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
121
+ # or equivalently with an identity
122
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :groupings => [ /(boeing \d{3})/i, /boeing/i ], :first_grouping_decides => true, :identities => [ /boeing (7|E)/i ]
123
+ d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
124
+ end
97
125
  end
98
126
 
99
127
  describe "the :read option" do
@@ -135,70 +163,106 @@ class TestFuzzyMatch < MiniTest::Spec
135
163
  by_first.find('b').must_equal ba
136
164
  by_last.find('a').must_equal ba
137
165
  end
138
-
139
- it %{treat the deprecrated :haystack_reader option as an alias} do
140
- ab = ['a', 'b']
141
- ba = ['b', 'a']
142
- haystack = [ab, ba]
143
- by_first = FuzzyMatch.new haystack, :haystack_reader => 0
144
- by_first.find('a').must_equal ab
145
- by_first.find('b').must_equal ba
146
- end
147
166
  end
167
+
168
+ describe 'the :must_match_at_least_one_word option' do
169
+ it %{optionally require that the matching record share at least one word with the needle} do
170
+ d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
171
+ d.find('RITZ').must_be_nil
172
+
173
+ d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
174
+ d.find("Foo's").must_equal "Foo's Bar"
175
+ d.find("'s").must_be_nil
176
+ d.find("Foo").must_be_nil
177
+
178
+ d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
179
+ d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
180
+ end
148
181
 
149
- it %{not return any result if the maximum score is zero} do
150
- FuzzyMatch.new(['a']).find('b').must_be_nil
151
- end
182
+ it %{use STOP WORDS} do
183
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
184
+ d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
152
185
 
153
- it %{optionally require that the matching record share at least one word with the needle} do
154
- d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
155
- d.find('RITZ').must_be_nil
186
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
187
+ d.find('A HTL').must_equal 'B HTL'
156
188
 
157
- d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
158
- d.find("Foo's").must_equal "Foo's Bar"
159
- d.find("'s").must_be_nil
160
- d.find("Foo").must_be_nil
189
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
190
+ d.find('A HTL').must_equal 'A HOTEL'
191
+ end
161
192
 
162
- d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
163
- d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
193
+ it %{not be fooled by substrings (but rather compare whole words to whole words)} do
194
+ d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
195
+ d.find('DOLCE LA HULPE BXL FI').must_be_nil
196
+ end
197
+
198
+ it %{not be case-sensitive when checking for sharing of words} do
199
+ d = FuzzyMatch.new [ 'A', 'B' ]
200
+ d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
201
+ end
202
+ end
203
+
204
+ describe "the :gather_last_result option" do
205
+ it %{not gather metadata about the last result by default} do
206
+ d = FuzzyMatch.new %w{ NISSAN HONDA }
207
+ d.find('MISSAM')
208
+ lambda do
209
+ d.last_result
210
+ end.must_raise ::RuntimeError, /gather_last_result/
211
+ end
212
+
213
+ it %{optionally gather metadata about the last result} do
214
+ d = FuzzyMatch.new %w{ NISSAN HONDA }
215
+ d.find 'MISSAM', :gather_last_result => true
216
+ d.last_result.score.must_equal 0.6
217
+ d.last_result.winner.must_equal 'NISSAN'
218
+ end
219
+ end
220
+
221
+ describe 'quirks' do
222
+ it %{should not return false negatives because of one-letter similarities} do
223
+ # dices coefficient doesn't think these two are similar at all because it looks at pairs
224
+ FuzzyMatch.score_class.new('X foo', 'X bar').dices_coefficient_similar.must_equal 0
225
+ # so we must compensate for that somewhere
226
+ d = FuzzyMatch.new ['X foo', 'randomness']
227
+ d.find('X bar').must_equal 'X foo'
228
+ # without making false positives
229
+ d.find('Y bar').must_be_nil
230
+ end
164
231
  end
165
232
 
166
- it %{use STOP WORDS} do
167
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
168
- d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
233
+ describe 'deprecations' do
234
+ it %{takes :must_match_blocking as :must_match_grouping} do
235
+ d = FuzzyMatch.new [], :must_match_blocking => :a
236
+ d.default_options[:must_match_grouping].must_equal :a
237
+ end
169
238
 
170
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
171
- d.find('A HTL').must_equal 'B HTL'
239
+ it %{takes :first_blocking_decides as :first_grouping_decides} do
240
+ d = FuzzyMatch.new [], :first_blocking_decides => :b
241
+ d.default_options[:first_grouping_decides].must_equal :b
242
+ end
172
243
 
173
- d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
174
- d.find('A HTL').must_equal 'A HOTEL'
175
- end
244
+ it %{takes :haystack_reader as :read} do
245
+ d = FuzzyMatch.new [], :haystack_reader => :c
246
+ d.read.must_equal :c
247
+ end
176
248
 
177
- it %{print a basic explanation to stdout} do
178
- require 'stringio'
179
- capture = StringIO.new
180
- begin
181
- old_stdout = $stdout
182
- $stdout = capture
183
- d = FuzzyMatch.new %w{ RATZ CATZ }
184
- d.explain('RITZ')
185
- ensure
186
- $stdout = old_stdout
249
+ it %{takes :blockings as :groupings} do
250
+ d = FuzzyMatch.new [], :blockings => [ /X/, /Y/ ]
251
+ d.groupings.must_equal [ FuzzyMatch::Rule::Grouping.new(/X/), FuzzyMatch::Rule::Grouping.new(/Y/) ]
187
252
  end
188
- capture.rewind
189
- capture.read.must_include 'CATZ'
190
- end
191
253
 
192
- it %{not be fooled by substrings (but rather compare whole words to whole words)} do
193
- d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
194
- d.find('DOLCE LA HULPE BXL FI').must_be_nil
195
- end
254
+ it %{takes :tighteners as :normalizers} do
255
+ d = FuzzyMatch.new [], :tighteners => [ /X/, /Y/ ]
256
+ d.normalizers.must_equal [ FuzzyMatch::Rule::Normalizer.new(/X/), FuzzyMatch::Rule::Normalizer.new(/Y/) ]
257
+ end
196
258
 
197
- it %{not be case-sensitive when checking for sharing of words} do
198
- d = FuzzyMatch.new [ 'A', 'B' ]
199
- d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
259
+ it %{receives #free method, but doesn't do anything} do
260
+ d = FuzzyMatch.new %w{ A B }
261
+ d.free
262
+ d.find('A').wont_be_nil
263
+ end
200
264
  end
201
-
265
+
202
266
  it %{defaults to a pure-ruby engine, but also has amatch} do
203
267
  if defined?($testing_amatch) and $testing_amatch
204
268
  FuzzyMatch.engine.must_equal :amatch