fuzzy_match 1.1.1 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -1
- data/README.markdown +124 -0
- data/Rakefile +5 -8
- data/benchmark/before-with-free.txt +25 -25
- data/benchmark/before-without-last-result.txt +31 -31
- data/benchmark/before.txt +29 -29
- data/benchmark/memory.rb +3 -4
- data/examples/bts_aircraft/{tighteners.csv → normalizers.csv} +0 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +3 -3
- data/lib/fuzzy_match/blocking.rb +1 -1
- data/lib/fuzzy_match/identity.rb +1 -1
- data/lib/fuzzy_match/{tightener.rb → normalizer.rb} +5 -5
- data/lib/fuzzy_match/result.rb +1 -1
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +3 -3
- data/lib/fuzzy_match.rb +30 -45
- data/test/test_blocking.rb +5 -0
- data/test/test_fuzzy_match.rb +40 -42
- data/test/test_identity.rb +5 -0
- data/test/{test_tightening.rb → test_normalizer.rb} +2 -2
- metadata +26 -25
- data/README.rdoc +0 -94
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -58,9 +58,9 @@ class FuzzyMatch
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def variants
|
61
|
-
@variants ||= fuzzy_match.
|
62
|
-
if
|
63
|
-
memo.push
|
61
|
+
@variants ||= fuzzy_match.normalizers.inject([ render ]) do |memo, normalizer|
|
62
|
+
if normalizer.apply? render
|
63
|
+
memo.push normalizer.apply(render)
|
64
64
|
end
|
65
65
|
memo
|
66
66
|
end.uniq
|
data/lib/fuzzy_match.rb
CHANGED
@@ -5,18 +5,21 @@ if ::ActiveSupport::VERSION::MAJOR >= 3
|
|
5
5
|
end
|
6
6
|
require 'to_regexp'
|
7
7
|
|
8
|
+
require 'fuzzy_match/normalizer'
|
9
|
+
require 'fuzzy_match/stop_word'
|
10
|
+
require 'fuzzy_match/blocking'
|
11
|
+
require 'fuzzy_match/identity'
|
12
|
+
require 'fuzzy_match/result'
|
13
|
+
require 'fuzzy_match/wrapper'
|
14
|
+
require 'fuzzy_match/similarity'
|
15
|
+
require 'fuzzy_match/score'
|
16
|
+
|
17
|
+
if defined?(::ActiveRecord)
|
18
|
+
require 'fuzzy_match/cached_result'
|
19
|
+
end
|
20
|
+
|
8
21
|
# See the README for more information.
|
9
22
|
class FuzzyMatch
|
10
|
-
autoload :Tightener, 'fuzzy_match/tightener'
|
11
|
-
autoload :StopWord, 'fuzzy_match/stop_word'
|
12
|
-
autoload :Blocking, 'fuzzy_match/blocking'
|
13
|
-
autoload :Identity, 'fuzzy_match/identity'
|
14
|
-
autoload :Result, 'fuzzy_match/result'
|
15
|
-
autoload :Wrapper, 'fuzzy_match/wrapper'
|
16
|
-
autoload :Similarity, 'fuzzy_match/similarity'
|
17
|
-
autoload :Score, 'fuzzy_match/score'
|
18
|
-
autoload :CachedResult, 'fuzzy_match/cached_result'
|
19
|
-
|
20
23
|
DEFAULT_OPTIONS = {
|
21
24
|
:first_blocking_decides => false,
|
22
25
|
:must_match_blocking => false,
|
@@ -28,33 +31,32 @@ class FuzzyMatch
|
|
28
31
|
attr_reader :haystack
|
29
32
|
attr_reader :blockings
|
30
33
|
attr_reader :identities
|
31
|
-
attr_reader :
|
34
|
+
attr_reader :normalizers
|
32
35
|
attr_reader :stop_words
|
33
36
|
attr_reader :read
|
34
37
|
attr_reader :default_options
|
35
38
|
|
36
39
|
# haystack - a bunch of records that will compete to see who best matches the needle
|
37
40
|
#
|
38
|
-
#
|
39
|
-
# *
|
40
|
-
# * identities
|
41
|
-
# * blockings
|
42
|
-
# * stop_words
|
43
|
-
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
41
|
+
# Rules (can only be specified at initialization or by using a setter)
|
42
|
+
# * :<tt>normalizers</tt> - regexps (see README)
|
43
|
+
# * :<tt>identities</tt> - regexps
|
44
|
+
# * :<tt>blockings</tt> - regexps
|
45
|
+
# * :<tt>stop_words</tt> - regexps
|
44
46
|
#
|
45
|
-
#
|
46
|
-
# *
|
47
|
-
# * must_match_blocking
|
48
|
-
# * must_match_at_least_one_word
|
49
|
-
# *
|
50
|
-
# *
|
47
|
+
# Options (can be specified at initialization or when calling #find)
|
48
|
+
# * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
|
49
|
+
# * :<tt>must_match_blocking</tt> - don't return a match unless the needle fits into one of the blockings you specified
|
50
|
+
# * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
|
51
|
+
# * :<tt>first_blocking_decides</tt> - force records into the first blocking they match, rather than choosing a blocking that will give them a higher score
|
52
|
+
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
|
51
53
|
def initialize(competitors, options_and_rules = {})
|
52
54
|
options_and_rules = options_and_rules.symbolize_keys
|
53
55
|
|
54
56
|
# rules
|
55
57
|
self.blockings = options_and_rules.delete(:blockings) || []
|
56
58
|
self.identities = options_and_rules.delete(:identities) || []
|
57
|
-
self.
|
59
|
+
self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || []
|
58
60
|
self.stop_words = options_and_rules.delete(:stop_words) || []
|
59
61
|
@read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
|
60
62
|
|
@@ -73,8 +75,8 @@ class FuzzyMatch
|
|
73
75
|
@identities = ary.map { |regexp_or_str| Identity.new regexp_or_str }
|
74
76
|
end
|
75
77
|
|
76
|
-
def
|
77
|
-
@
|
78
|
+
def normalizers=(ary)
|
79
|
+
@normalizers = ary.map { |regexp_or_str| Normalizer.new regexp_or_str }
|
78
80
|
end
|
79
81
|
|
80
82
|
def stop_words=(ary)
|
@@ -95,8 +97,6 @@ class FuzzyMatch
|
|
95
97
|
end
|
96
98
|
|
97
99
|
def find(needle, options = {})
|
98
|
-
raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?
|
99
|
-
|
100
100
|
options = options.symbolize_keys.reverse_merge default_options
|
101
101
|
|
102
102
|
gather_last_result = options[:gather_last_result]
|
@@ -106,7 +106,6 @@ class FuzzyMatch
|
|
106
106
|
must_match_at_least_one_word = options[:must_match_at_least_one_word]
|
107
107
|
|
108
108
|
if gather_last_result
|
109
|
-
free_last_result
|
110
109
|
@last_result = Result.new
|
111
110
|
last_result.read = read
|
112
111
|
last_result.haystack = haystack
|
@@ -118,7 +117,7 @@ EOS
|
|
118
117
|
end
|
119
118
|
|
120
119
|
if gather_last_result
|
121
|
-
last_result.
|
120
|
+
last_result.normalizers = normalizers
|
122
121
|
last_result.identities = identities
|
123
122
|
last_result.blockings = blockings
|
124
123
|
last_result.stop_words = stop_words
|
@@ -263,21 +262,7 @@ EOS
|
|
263
262
|
last_result.explain
|
264
263
|
end
|
265
264
|
|
266
|
-
|
267
|
-
@freed == true
|
268
|
-
end
|
269
|
-
|
265
|
+
# DEPRECATED - doesn't do anything
|
270
266
|
def free
|
271
|
-
free_last_result
|
272
|
-
@haystack.try :clear
|
273
|
-
@haystack = nil
|
274
|
-
ensure
|
275
|
-
@freed = true
|
276
|
-
end
|
277
|
-
|
278
|
-
private
|
279
|
-
|
280
|
-
def free_last_result
|
281
|
-
@last_result = nil
|
282
267
|
end
|
283
268
|
end
|
data/test/test_blocking.rb
CHANGED
@@ -20,4 +20,9 @@ class TestBlocking < Test::Unit::TestCase
|
|
20
20
|
b = FuzzyMatch::Blocking.new %r{apple}
|
21
21
|
assert_equal nil, b.join?('orange', 'orange')
|
22
22
|
end
|
23
|
+
|
24
|
+
def test_004_accepts_case_insensitivity
|
25
|
+
b = FuzzyMatch::Blocking.new %r{apple}i
|
26
|
+
assert_equal true, b.match?('2 Apples')
|
27
|
+
end
|
23
28
|
end
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -6,12 +6,12 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
6
6
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
7
7
|
assert_equal 'RATZ', d.find('RITZ')
|
8
8
|
assert_equal 'RATZ', d.find('RíTZ')
|
9
|
-
|
9
|
+
|
10
10
|
d = FuzzyMatch.new [ 'X' ]
|
11
11
|
assert_equal 'X', d.find('X')
|
12
12
|
assert_equal nil, d.find('A')
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def test_002_dont_gather_last_result_by_default
|
16
16
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
17
17
|
d.find('MISSAM')
|
@@ -19,88 +19,86 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
19
19
|
d.last_result
|
20
20
|
end
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
def test_003_last_result
|
24
24
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
25
25
|
d.find 'MISSAM', :gather_last_result => true
|
26
26
|
assert_equal 0.6, d.last_result.score
|
27
27
|
assert_equal 'NISSAN', d.last_result.winner
|
28
28
|
end
|
29
|
-
|
30
|
-
def
|
29
|
+
|
30
|
+
def test_005_correct_with_normalizer
|
31
31
|
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
|
32
|
-
assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
|
33
|
-
|
34
|
-
|
35
|
-
def test_005_correct_with_tightener
|
36
|
-
tighteners = [
|
32
|
+
assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900') # false positive without normalizer
|
33
|
+
|
34
|
+
normalizers = [
|
37
35
|
%r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
|
38
36
|
]
|
39
|
-
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :
|
37
|
+
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
|
40
38
|
assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
def test_008_false_positive_without_identity
|
44
42
|
d = FuzzyMatch.new %w{ foo bar }
|
45
43
|
assert_equal 'bar', d.find('baz')
|
46
44
|
end
|
47
|
-
|
45
|
+
|
48
46
|
def test_008_identify_false_positive
|
49
47
|
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
50
48
|
assert_equal nil, d.find('baz')
|
51
49
|
end
|
52
|
-
|
50
|
+
|
53
51
|
# TODO this is not very helpful
|
54
52
|
def test_009_blocking
|
55
53
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
56
54
|
assert_equal 'X', d.find('X')
|
57
55
|
assert_equal nil, d.find('A')
|
58
56
|
end
|
59
|
-
|
57
|
+
|
60
58
|
# TODO this is not very helpful
|
61
59
|
def test_0095_must_match_blocking
|
62
60
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
63
61
|
assert_equal 'X', d.find('X')
|
64
62
|
assert_equal nil, d.find('A')
|
65
|
-
|
63
|
+
|
66
64
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
67
65
|
assert_equal 'X', d.find('X', :must_match_blocking => true)
|
68
66
|
assert_equal nil, d.find('A', :must_match_blocking => true)
|
69
67
|
end
|
70
|
-
|
71
|
-
def
|
72
|
-
d = FuzzyMatch.new %w{
|
73
|
-
|
74
|
-
|
75
|
-
d.find
|
68
|
+
|
69
|
+
def test_011_free_does_nothing
|
70
|
+
d = FuzzyMatch.new %w{ A B }
|
71
|
+
assert_nothing_raised do
|
72
|
+
d.free
|
73
|
+
d.find 'A'
|
76
74
|
end
|
77
75
|
end
|
78
|
-
|
76
|
+
|
79
77
|
def test_012_find_all
|
80
78
|
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
81
79
|
assert_equal ['X', 'X22' ], d.find_all('X')
|
82
80
|
assert_equal [], d.find_all('A')
|
83
81
|
end
|
84
|
-
|
82
|
+
|
85
83
|
def test_013_first_blocking_decides
|
86
84
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
|
87
85
|
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
|
88
|
-
|
86
|
+
|
89
87
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
90
88
|
assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
|
91
|
-
|
89
|
+
|
92
90
|
# first_blocking_decides refers to the needle
|
93
91
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
94
92
|
assert_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"], d.find_all('Boeing ER6')
|
95
|
-
|
93
|
+
|
96
94
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
97
95
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
98
|
-
|
96
|
+
|
99
97
|
# or equivalently with an identity
|
100
98
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
101
99
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
102
100
|
end
|
103
|
-
|
101
|
+
|
104
102
|
MyStruct = Struct.new(:one, :two)
|
105
103
|
def test_014_symbol_read_sends_method
|
106
104
|
ab = MyStruct.new('a', 'b')
|
@@ -115,7 +113,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
115
113
|
assert_equal ba, by_first.find('b')
|
116
114
|
assert_equal ba, by_last.find('a')
|
117
115
|
end
|
118
|
-
|
116
|
+
|
119
117
|
def test_015_symbol_read_reads_array
|
120
118
|
ab = ['a', 'b']
|
121
119
|
ba = ['b', 'a']
|
@@ -127,7 +125,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
127
125
|
assert_equal ba, by_first.find('b')
|
128
126
|
assert_equal ba, by_last.find('a')
|
129
127
|
end
|
130
|
-
|
128
|
+
|
131
129
|
def test_016_symbol_read_reads_hash
|
132
130
|
ab = { :one => 'a', :two => 'b' }
|
133
131
|
ba = { :one => 'b', :two => 'a' }
|
@@ -139,7 +137,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
139
137
|
assert_equal ba, by_first.find('b')
|
140
138
|
assert_equal ba, by_last.find('a')
|
141
139
|
end
|
142
|
-
|
140
|
+
|
143
141
|
def test_017_understands_haystack_reader_option
|
144
142
|
ab = ['a', 'b']
|
145
143
|
ba = ['b', 'a']
|
@@ -148,31 +146,31 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
148
146
|
assert_equal ab, by_first.find('a')
|
149
147
|
assert_equal ba, by_first.find('b')
|
150
148
|
end
|
151
|
-
|
149
|
+
|
152
150
|
def test_018_no_result_if_best_score_is_zero
|
153
151
|
assert_equal nil, FuzzyMatch.new(['a']).find('b')
|
154
152
|
end
|
155
|
-
|
153
|
+
|
156
154
|
def test_019_must_match_at_least_one_word
|
157
155
|
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
158
156
|
assert_equal nil, d.find('RITZ')
|
159
|
-
|
157
|
+
|
160
158
|
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
161
159
|
assert_equal nil, d.find("Jacob's")
|
162
160
|
assert_equal "Foo's Bar", d.find("Foo's")
|
163
161
|
end
|
164
|
-
|
162
|
+
|
165
163
|
def test_020_stop_words
|
166
164
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
|
167
165
|
assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)
|
168
|
-
|
166
|
+
|
169
167
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
170
168
|
assert_equal 'B HTL', d.find('A HTL')
|
171
|
-
|
169
|
+
|
172
170
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
173
171
|
assert_equal 'A HOTEL', d.find('A HTL')
|
174
172
|
end
|
175
|
-
|
173
|
+
|
176
174
|
def test_021_explain_prints_to_stdout
|
177
175
|
require 'stringio'
|
178
176
|
capture = StringIO.new
|
@@ -187,15 +185,15 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
187
185
|
capture.rewind
|
188
186
|
assert capture.read.include?('CATZ')
|
189
187
|
end
|
190
|
-
|
188
|
+
|
191
189
|
def test_022_compare_words_with_words
|
192
190
|
d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
|
193
191
|
assert_equal nil, d.find('DOLCE LA HULPE BXL FI')
|
194
192
|
end
|
195
|
-
|
193
|
+
|
196
194
|
def test_023_must_match_at_least_one_word_is_case_insensitive
|
197
195
|
d = FuzzyMatch.new [ 'A', 'B' ]
|
198
196
|
assert_equal 'A', d.find('a', :must_match_at_least_one_word => true)
|
199
197
|
end
|
200
|
-
|
198
|
+
|
201
199
|
end
|
data/test/test_identity.rb
CHANGED
@@ -30,4 +30,9 @@ class TestIdentity < Test::Unit::TestCase
|
|
30
30
|
i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
31
31
|
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
32
32
|
end
|
33
|
+
|
34
|
+
def test_007_accepts_case_insensitivity
|
35
|
+
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}i
|
36
|
+
assert_equal true, i.identical?('A1', 'a 1foobar')
|
37
|
+
end
|
33
38
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class
|
3
|
+
class TestNormalizer < Test::Unit::TestCase
|
4
4
|
def test_001_apply
|
5
|
-
t = FuzzyMatch::
|
5
|
+
t = FuzzyMatch::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
6
6
|
assert_equal 'Ford F350', t.apply('Ford F-350')
|
7
7
|
assert_equal 'Ford F150', t.apply('Ford F150')
|
8
8
|
assert_equal 'Ford F350', t.apply('Ford F 350')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-18 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2177380220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2177380220
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2177379700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2177379700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2177379100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2177379100
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2177378440 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2177378440
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2177377600 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2177377600
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2177377020 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2177377020
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2177376420 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2177376420
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2177375240 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2177375240
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2177374500 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2177374500
|
113
113
|
description: Find a needle in a haystack using string similarity and (optionally)
|
114
114
|
regexp rules. Replaces loose_tight_dictionary.
|
115
115
|
email:
|
@@ -122,7 +122,7 @@ files:
|
|
122
122
|
- .gitignore
|
123
123
|
- Gemfile
|
124
124
|
- LICENSE
|
125
|
-
- README.
|
125
|
+
- README.markdown
|
126
126
|
- Rakefile
|
127
127
|
- THANKS-WILLIAM-JAMES.rb
|
128
128
|
- benchmark/before-with-free.txt
|
@@ -137,10 +137,10 @@ files:
|
|
137
137
|
- examples/bts_aircraft/blockings.csv
|
138
138
|
- examples/bts_aircraft/identities.csv
|
139
139
|
- examples/bts_aircraft/negatives.csv
|
140
|
+
- examples/bts_aircraft/normalizers.csv
|
140
141
|
- examples/bts_aircraft/number_260.csv
|
141
142
|
- examples/bts_aircraft/positives.csv
|
142
143
|
- examples/bts_aircraft/test_bts_aircraft.rb
|
143
|
-
- examples/bts_aircraft/tighteners.csv
|
144
144
|
- examples/first_name_matching.rb
|
145
145
|
- examples/icao-bts.xls
|
146
146
|
- fuzzy_match.gemspec
|
@@ -148,11 +148,11 @@ files:
|
|
148
148
|
- lib/fuzzy_match/blocking.rb
|
149
149
|
- lib/fuzzy_match/cached_result.rb
|
150
150
|
- lib/fuzzy_match/identity.rb
|
151
|
+
- lib/fuzzy_match/normalizer.rb
|
151
152
|
- lib/fuzzy_match/result.rb
|
152
153
|
- lib/fuzzy_match/score.rb
|
153
154
|
- lib/fuzzy_match/similarity.rb
|
154
155
|
- lib/fuzzy_match/stop_word.rb
|
155
|
-
- lib/fuzzy_match/tightener.rb
|
156
156
|
- lib/fuzzy_match/version.rb
|
157
157
|
- lib/fuzzy_match/wrapper.rb
|
158
158
|
- test/helper.rb
|
@@ -161,7 +161,7 @@ files:
|
|
161
161
|
- test/test_fuzzy_match.rb
|
162
162
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
163
163
|
- test/test_identity.rb
|
164
|
-
- test/
|
164
|
+
- test/test_normalizer.rb
|
165
165
|
homepage: https://github.com/seamusabshere/fuzzy_match
|
166
166
|
licenses: []
|
167
167
|
post_install_message:
|
@@ -194,4 +194,5 @@ test_files:
|
|
194
194
|
- test/test_fuzzy_match.rb
|
195
195
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
196
196
|
- test/test_identity.rb
|
197
|
-
- test/
|
197
|
+
- test/test_normalizer.rb
|
198
|
+
has_rdoc:
|
data/README.rdoc
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
= fuzzy_match
|
2
|
-
|
3
|
-
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
4
|
-
|
5
|
-
Replaces {loose_tight_dictionary}[https://github.com/seamusabshere/loose_tight_dictionary] because that was a confusing name.
|
6
|
-
|
7
|
-
== Quickstart
|
8
|
-
|
9
|
-
>> require 'fuzzy_match'
|
10
|
-
=> true
|
11
|
-
>> FuzzyMatch.new(%w{seamus andy ben}).find('Shamus')
|
12
|
-
=> "seamus"
|
13
|
-
|
14
|
-
== String similarity matching
|
15
|
-
|
16
|
-
Uses {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coefficient] algorithm (aka Pair Distance).
|
17
|
-
|
18
|
-
If that judges two strings to be equally similar to a third string, then Levenshtein distance is used. For example, pair distance considers "RATZ" and "CATZ" to be equally similar to "RITZ" so we invoke Levenshtein.
|
19
|
-
|
20
|
-
>> require 'amatch'
|
21
|
-
=> true
|
22
|
-
>> 'RITZ'.pair_distance_similar 'RATZ'
|
23
|
-
=> 0.3333333333333333
|
24
|
-
>> 'RITZ'.pair_distance_similar 'CATZ' # <-- pair distance can't tell the difference, so we fall back to levenshtein...
|
25
|
-
=> 0.3333333333333333
|
26
|
-
>> 'RITZ'.levenshtein_similar 'RATZ'
|
27
|
-
=> 0.75
|
28
|
-
>> 'RITZ'.levenshtein_similar 'CATZ' # <-- which properly shows that RATZ should win
|
29
|
-
=> 0.5
|
30
|
-
|
31
|
-
== Production use
|
32
|
-
|
33
|
-
Over 2 years in {Brighter Planet's environmental impact API}[http://impact.brighterplanet.com] and {reference data service}[http://data.brighterplanet.com].
|
34
|
-
|
35
|
-
== Haystacks and how to read them
|
36
|
-
|
37
|
-
The (admittedly imperfect) metaphor is "look for a needle in a haystack"
|
38
|
-
|
39
|
-
* needle - the search term
|
40
|
-
* haystack - the records you are searching (<b>your result will be an object from here</b>)
|
41
|
-
|
42
|
-
So, what if your needle is a string like <tt>youruguay</tt> and your haystack is full of <tt>Country</tt> objects like <tt><Country name:"Uruguay"></tt>?
|
43
|
-
|
44
|
-
>> FuzzyMatch.new(countries, :read => :name).find('youruguay')
|
45
|
-
=> <Country name:"Uruguay">
|
46
|
-
|
47
|
-
== Regular expressions
|
48
|
-
|
49
|
-
You can improve the default matchings with regular expressions.
|
50
|
-
|
51
|
-
* Emphasize important words using <b>blockings</b> and <b>tighteners</b>
|
52
|
-
* Filter out stop words with <b>tighteners</b>
|
53
|
-
* Prevent impossible matches with <b>blockings</b> and <b>identities</b>
|
54
|
-
* Ignore words with <b>stop words</b>
|
55
|
-
|
56
|
-
=== Blockings
|
57
|
-
|
58
|
-
Setting a blocking of <tt>/Airbus/</tt> ensures that strings containing "Airbus" will only be scored against other strings containing "Airbus". A better blocking in this case would probably be <tt>/airbus/i</tt>.
|
59
|
-
|
60
|
-
=== Tighteners
|
61
|
-
|
62
|
-
Adding a tightener like <tt>/(boeing).*(7\d\d)/i</tt> will cause "BOEING COMPANY 747" and "boeing747" to be scored as if they were "BOEING 747" and "boeing 747", respectively. See also "Case sensitivity" below.
|
63
|
-
|
64
|
-
=== Identities
|
65
|
-
|
66
|
-
Adding an identity like <tt>/(F)\-?(\d50)/</tt> ensures that "Ford F-150" and "Ford F-250" never match.
|
67
|
-
|
68
|
-
=== Stop words
|
69
|
-
|
70
|
-
Adding a stop word like <tt>THE</tt> ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT"
|
71
|
-
|
72
|
-
== Case sensitivity
|
73
|
-
|
74
|
-
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions. Your regexps may still be case-sensitive, though.
|
75
|
-
|
76
|
-
== Examples
|
77
|
-
|
78
|
-
Check out the tests.
|
79
|
-
|
80
|
-
== Speed (and who to thank for the algorithms)
|
81
|
-
|
82
|
-
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks {Flori}[https://github.com/flori]!
|
83
|
-
|
84
|
-
Otherwise, pure ruby versions of the string similarity algorithms derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] and {the text gem}[https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb] are used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao] and {threedaymonk}[https://github.com/threedaymonk]!
|
85
|
-
|
86
|
-
== Authors
|
87
|
-
|
88
|
-
* Seamus Abshere <seamus@abshere.net>
|
89
|
-
* Ian Hough <ijhough@gmail.com>
|
90
|
-
* Andy Rossmeissl <andy@rossmeissl.net>
|
91
|
-
|
92
|
-
== Copyright
|
93
|
-
|
94
|
-
Copyright 2011 Brighter Planet, Inc.
|