fuzzy_match 1.1.1 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -1
- data/README.markdown +124 -0
- data/Rakefile +5 -8
- data/benchmark/before-with-free.txt +25 -25
- data/benchmark/before-without-last-result.txt +31 -31
- data/benchmark/before.txt +29 -29
- data/benchmark/memory.rb +3 -4
- data/examples/bts_aircraft/{tighteners.csv → normalizers.csv} +0 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +3 -3
- data/lib/fuzzy_match/blocking.rb +1 -1
- data/lib/fuzzy_match/identity.rb +1 -1
- data/lib/fuzzy_match/{tightener.rb → normalizer.rb} +5 -5
- data/lib/fuzzy_match/result.rb +1 -1
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +3 -3
- data/lib/fuzzy_match.rb +30 -45
- data/test/test_blocking.rb +5 -0
- data/test/test_fuzzy_match.rb +40 -42
- data/test/test_identity.rb +5 -0
- data/test/{test_tightening.rb → test_normalizer.rb} +2 -2
- metadata +26 -25
- data/README.rdoc +0 -94
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -58,9 +58,9 @@ class FuzzyMatch
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def variants
|
61
|
-
@variants ||= fuzzy_match.
|
62
|
-
if
|
63
|
-
memo.push
|
61
|
+
@variants ||= fuzzy_match.normalizers.inject([ render ]) do |memo, normalizer|
|
62
|
+
if normalizer.apply? render
|
63
|
+
memo.push normalizer.apply(render)
|
64
64
|
end
|
65
65
|
memo
|
66
66
|
end.uniq
|
data/lib/fuzzy_match.rb
CHANGED
@@ -5,18 +5,21 @@ if ::ActiveSupport::VERSION::MAJOR >= 3
|
|
5
5
|
end
|
6
6
|
require 'to_regexp'
|
7
7
|
|
8
|
+
require 'fuzzy_match/normalizer'
|
9
|
+
require 'fuzzy_match/stop_word'
|
10
|
+
require 'fuzzy_match/blocking'
|
11
|
+
require 'fuzzy_match/identity'
|
12
|
+
require 'fuzzy_match/result'
|
13
|
+
require 'fuzzy_match/wrapper'
|
14
|
+
require 'fuzzy_match/similarity'
|
15
|
+
require 'fuzzy_match/score'
|
16
|
+
|
17
|
+
if defined?(::ActiveRecord)
|
18
|
+
require 'fuzzy_match/cached_result'
|
19
|
+
end
|
20
|
+
|
8
21
|
# See the README for more information.
|
9
22
|
class FuzzyMatch
|
10
|
-
autoload :Tightener, 'fuzzy_match/tightener'
|
11
|
-
autoload :StopWord, 'fuzzy_match/stop_word'
|
12
|
-
autoload :Blocking, 'fuzzy_match/blocking'
|
13
|
-
autoload :Identity, 'fuzzy_match/identity'
|
14
|
-
autoload :Result, 'fuzzy_match/result'
|
15
|
-
autoload :Wrapper, 'fuzzy_match/wrapper'
|
16
|
-
autoload :Similarity, 'fuzzy_match/similarity'
|
17
|
-
autoload :Score, 'fuzzy_match/score'
|
18
|
-
autoload :CachedResult, 'fuzzy_match/cached_result'
|
19
|
-
|
20
23
|
DEFAULT_OPTIONS = {
|
21
24
|
:first_blocking_decides => false,
|
22
25
|
:must_match_blocking => false,
|
@@ -28,33 +31,32 @@ class FuzzyMatch
|
|
28
31
|
attr_reader :haystack
|
29
32
|
attr_reader :blockings
|
30
33
|
attr_reader :identities
|
31
|
-
attr_reader :
|
34
|
+
attr_reader :normalizers
|
32
35
|
attr_reader :stop_words
|
33
36
|
attr_reader :read
|
34
37
|
attr_reader :default_options
|
35
38
|
|
36
39
|
# haystack - a bunch of records that will compete to see who best matches the needle
|
37
40
|
#
|
38
|
-
#
|
39
|
-
# *
|
40
|
-
# * identities
|
41
|
-
# * blockings
|
42
|
-
# * stop_words
|
43
|
-
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
41
|
+
# Rules (can only be specified at initialization or by using a setter)
|
42
|
+
# * :<tt>normalizers</tt> - regexps (see README)
|
43
|
+
# * :<tt>identities</tt> - regexps
|
44
|
+
# * :<tt>blockings</tt> - regexps
|
45
|
+
# * :<tt>stop_words</tt> - regexps
|
44
46
|
#
|
45
|
-
#
|
46
|
-
# *
|
47
|
-
# * must_match_blocking
|
48
|
-
# * must_match_at_least_one_word
|
49
|
-
# *
|
50
|
-
# *
|
47
|
+
# Options (can be specified at initialization or when calling #find)
|
48
|
+
# * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
|
49
|
+
# * :<tt>must_match_blocking</tt> - don't return a match unless the needle fits into one of the blockings you specified
|
50
|
+
# * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
|
51
|
+
# * :<tt>first_blocking_decides</tt> - force records into the first blocking they match, rather than choosing a blocking that will give them a higher score
|
52
|
+
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
|
51
53
|
def initialize(competitors, options_and_rules = {})
|
52
54
|
options_and_rules = options_and_rules.symbolize_keys
|
53
55
|
|
54
56
|
# rules
|
55
57
|
self.blockings = options_and_rules.delete(:blockings) || []
|
56
58
|
self.identities = options_and_rules.delete(:identities) || []
|
57
|
-
self.
|
59
|
+
self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || []
|
58
60
|
self.stop_words = options_and_rules.delete(:stop_words) || []
|
59
61
|
@read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
|
60
62
|
|
@@ -73,8 +75,8 @@ class FuzzyMatch
|
|
73
75
|
@identities = ary.map { |regexp_or_str| Identity.new regexp_or_str }
|
74
76
|
end
|
75
77
|
|
76
|
-
def
|
77
|
-
@
|
78
|
+
def normalizers=(ary)
|
79
|
+
@normalizers = ary.map { |regexp_or_str| Normalizer.new regexp_or_str }
|
78
80
|
end
|
79
81
|
|
80
82
|
def stop_words=(ary)
|
@@ -95,8 +97,6 @@ class FuzzyMatch
|
|
95
97
|
end
|
96
98
|
|
97
99
|
def find(needle, options = {})
|
98
|
-
raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?
|
99
|
-
|
100
100
|
options = options.symbolize_keys.reverse_merge default_options
|
101
101
|
|
102
102
|
gather_last_result = options[:gather_last_result]
|
@@ -106,7 +106,6 @@ class FuzzyMatch
|
|
106
106
|
must_match_at_least_one_word = options[:must_match_at_least_one_word]
|
107
107
|
|
108
108
|
if gather_last_result
|
109
|
-
free_last_result
|
110
109
|
@last_result = Result.new
|
111
110
|
last_result.read = read
|
112
111
|
last_result.haystack = haystack
|
@@ -118,7 +117,7 @@ EOS
|
|
118
117
|
end
|
119
118
|
|
120
119
|
if gather_last_result
|
121
|
-
last_result.
|
120
|
+
last_result.normalizers = normalizers
|
122
121
|
last_result.identities = identities
|
123
122
|
last_result.blockings = blockings
|
124
123
|
last_result.stop_words = stop_words
|
@@ -263,21 +262,7 @@ EOS
|
|
263
262
|
last_result.explain
|
264
263
|
end
|
265
264
|
|
266
|
-
|
267
|
-
@freed == true
|
268
|
-
end
|
269
|
-
|
265
|
+
# DEPRECATED - doesn't do anything
|
270
266
|
def free
|
271
|
-
free_last_result
|
272
|
-
@haystack.try :clear
|
273
|
-
@haystack = nil
|
274
|
-
ensure
|
275
|
-
@freed = true
|
276
|
-
end
|
277
|
-
|
278
|
-
private
|
279
|
-
|
280
|
-
def free_last_result
|
281
|
-
@last_result = nil
|
282
267
|
end
|
283
268
|
end
|
data/test/test_blocking.rb
CHANGED
@@ -20,4 +20,9 @@ class TestBlocking < Test::Unit::TestCase
|
|
20
20
|
b = FuzzyMatch::Blocking.new %r{apple}
|
21
21
|
assert_equal nil, b.join?('orange', 'orange')
|
22
22
|
end
|
23
|
+
|
24
|
+
def test_004_accepts_case_insensitivity
|
25
|
+
b = FuzzyMatch::Blocking.new %r{apple}i
|
26
|
+
assert_equal true, b.match?('2 Apples')
|
27
|
+
end
|
23
28
|
end
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -6,12 +6,12 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
6
6
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
7
7
|
assert_equal 'RATZ', d.find('RITZ')
|
8
8
|
assert_equal 'RATZ', d.find('RíTZ')
|
9
|
-
|
9
|
+
|
10
10
|
d = FuzzyMatch.new [ 'X' ]
|
11
11
|
assert_equal 'X', d.find('X')
|
12
12
|
assert_equal nil, d.find('A')
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def test_002_dont_gather_last_result_by_default
|
16
16
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
17
17
|
d.find('MISSAM')
|
@@ -19,88 +19,86 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
19
19
|
d.last_result
|
20
20
|
end
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
def test_003_last_result
|
24
24
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
25
25
|
d.find 'MISSAM', :gather_last_result => true
|
26
26
|
assert_equal 0.6, d.last_result.score
|
27
27
|
assert_equal 'NISSAN', d.last_result.winner
|
28
28
|
end
|
29
|
-
|
30
|
-
def
|
29
|
+
|
30
|
+
def test_005_correct_with_normalizer
|
31
31
|
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
|
32
|
-
assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
|
33
|
-
|
34
|
-
|
35
|
-
def test_005_correct_with_tightener
|
36
|
-
tighteners = [
|
32
|
+
assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900') # false positive without normalizer
|
33
|
+
|
34
|
+
normalizers = [
|
37
35
|
%r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
|
38
36
|
]
|
39
|
-
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :
|
37
|
+
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
|
40
38
|
assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
def test_008_false_positive_without_identity
|
44
42
|
d = FuzzyMatch.new %w{ foo bar }
|
45
43
|
assert_equal 'bar', d.find('baz')
|
46
44
|
end
|
47
|
-
|
45
|
+
|
48
46
|
def test_008_identify_false_positive
|
49
47
|
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
50
48
|
assert_equal nil, d.find('baz')
|
51
49
|
end
|
52
|
-
|
50
|
+
|
53
51
|
# TODO this is not very helpful
|
54
52
|
def test_009_blocking
|
55
53
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
56
54
|
assert_equal 'X', d.find('X')
|
57
55
|
assert_equal nil, d.find('A')
|
58
56
|
end
|
59
|
-
|
57
|
+
|
60
58
|
# TODO this is not very helpful
|
61
59
|
def test_0095_must_match_blocking
|
62
60
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
63
61
|
assert_equal 'X', d.find('X')
|
64
62
|
assert_equal nil, d.find('A')
|
65
|
-
|
63
|
+
|
66
64
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
67
65
|
assert_equal 'X', d.find('X', :must_match_blocking => true)
|
68
66
|
assert_equal nil, d.find('A', :must_match_blocking => true)
|
69
67
|
end
|
70
|
-
|
71
|
-
def
|
72
|
-
d = FuzzyMatch.new %w{
|
73
|
-
|
74
|
-
|
75
|
-
d.find
|
68
|
+
|
69
|
+
def test_011_free_does_nothing
|
70
|
+
d = FuzzyMatch.new %w{ A B }
|
71
|
+
assert_nothing_raised do
|
72
|
+
d.free
|
73
|
+
d.find 'A'
|
76
74
|
end
|
77
75
|
end
|
78
|
-
|
76
|
+
|
79
77
|
def test_012_find_all
|
80
78
|
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
81
79
|
assert_equal ['X', 'X22' ], d.find_all('X')
|
82
80
|
assert_equal [], d.find_all('A')
|
83
81
|
end
|
84
|
-
|
82
|
+
|
85
83
|
def test_013_first_blocking_decides
|
86
84
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
|
87
85
|
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
|
88
|
-
|
86
|
+
|
89
87
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
90
88
|
assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
|
91
|
-
|
89
|
+
|
92
90
|
# first_blocking_decides refers to the needle
|
93
91
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
94
92
|
assert_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"], d.find_all('Boeing ER6')
|
95
|
-
|
93
|
+
|
96
94
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
97
95
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
98
|
-
|
96
|
+
|
99
97
|
# or equivalently with an identity
|
100
98
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
101
99
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
102
100
|
end
|
103
|
-
|
101
|
+
|
104
102
|
MyStruct = Struct.new(:one, :two)
|
105
103
|
def test_014_symbol_read_sends_method
|
106
104
|
ab = MyStruct.new('a', 'b')
|
@@ -115,7 +113,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
115
113
|
assert_equal ba, by_first.find('b')
|
116
114
|
assert_equal ba, by_last.find('a')
|
117
115
|
end
|
118
|
-
|
116
|
+
|
119
117
|
def test_015_symbol_read_reads_array
|
120
118
|
ab = ['a', 'b']
|
121
119
|
ba = ['b', 'a']
|
@@ -127,7 +125,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
127
125
|
assert_equal ba, by_first.find('b')
|
128
126
|
assert_equal ba, by_last.find('a')
|
129
127
|
end
|
130
|
-
|
128
|
+
|
131
129
|
def test_016_symbol_read_reads_hash
|
132
130
|
ab = { :one => 'a', :two => 'b' }
|
133
131
|
ba = { :one => 'b', :two => 'a' }
|
@@ -139,7 +137,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
139
137
|
assert_equal ba, by_first.find('b')
|
140
138
|
assert_equal ba, by_last.find('a')
|
141
139
|
end
|
142
|
-
|
140
|
+
|
143
141
|
def test_017_understands_haystack_reader_option
|
144
142
|
ab = ['a', 'b']
|
145
143
|
ba = ['b', 'a']
|
@@ -148,31 +146,31 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
148
146
|
assert_equal ab, by_first.find('a')
|
149
147
|
assert_equal ba, by_first.find('b')
|
150
148
|
end
|
151
|
-
|
149
|
+
|
152
150
|
def test_018_no_result_if_best_score_is_zero
|
153
151
|
assert_equal nil, FuzzyMatch.new(['a']).find('b')
|
154
152
|
end
|
155
|
-
|
153
|
+
|
156
154
|
def test_019_must_match_at_least_one_word
|
157
155
|
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
158
156
|
assert_equal nil, d.find('RITZ')
|
159
|
-
|
157
|
+
|
160
158
|
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
161
159
|
assert_equal nil, d.find("Jacob's")
|
162
160
|
assert_equal "Foo's Bar", d.find("Foo's")
|
163
161
|
end
|
164
|
-
|
162
|
+
|
165
163
|
def test_020_stop_words
|
166
164
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
|
167
165
|
assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)
|
168
|
-
|
166
|
+
|
169
167
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
170
168
|
assert_equal 'B HTL', d.find('A HTL')
|
171
|
-
|
169
|
+
|
172
170
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
173
171
|
assert_equal 'A HOTEL', d.find('A HTL')
|
174
172
|
end
|
175
|
-
|
173
|
+
|
176
174
|
def test_021_explain_prints_to_stdout
|
177
175
|
require 'stringio'
|
178
176
|
capture = StringIO.new
|
@@ -187,15 +185,15 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
187
185
|
capture.rewind
|
188
186
|
assert capture.read.include?('CATZ')
|
189
187
|
end
|
190
|
-
|
188
|
+
|
191
189
|
def test_022_compare_words_with_words
|
192
190
|
d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
|
193
191
|
assert_equal nil, d.find('DOLCE LA HULPE BXL FI')
|
194
192
|
end
|
195
|
-
|
193
|
+
|
196
194
|
def test_023_must_match_at_least_one_word_is_case_insensitive
|
197
195
|
d = FuzzyMatch.new [ 'A', 'B' ]
|
198
196
|
assert_equal 'A', d.find('a', :must_match_at_least_one_word => true)
|
199
197
|
end
|
200
|
-
|
198
|
+
|
201
199
|
end
|
data/test/test_identity.rb
CHANGED
@@ -30,4 +30,9 @@ class TestIdentity < Test::Unit::TestCase
|
|
30
30
|
i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
31
31
|
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
32
32
|
end
|
33
|
+
|
34
|
+
def test_007_accepts_case_insensitivity
|
35
|
+
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}i
|
36
|
+
assert_equal true, i.identical?('A1', 'a 1foobar')
|
37
|
+
end
|
33
38
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class
|
3
|
+
class TestNormalizer < Test::Unit::TestCase
|
4
4
|
def test_001_apply
|
5
|
-
t = FuzzyMatch::
|
5
|
+
t = FuzzyMatch::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
6
6
|
assert_equal 'Ford F350', t.apply('Ford F-350')
|
7
7
|
assert_equal 'Ford F150', t.apply('Ford F150')
|
8
8
|
assert_equal 'Ford F350', t.apply('Ford F 350')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-18 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2177380220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2177380220
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2177379700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2177379700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2177379100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2177379100
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2177378440 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2177378440
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2177377600 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2177377600
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2177377020 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2177377020
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2177376420 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2177376420
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2177375240 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2177375240
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2177374500 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2177374500
|
113
113
|
description: Find a needle in a haystack using string similarity and (optionally)
|
114
114
|
regexp rules. Replaces loose_tight_dictionary.
|
115
115
|
email:
|
@@ -122,7 +122,7 @@ files:
|
|
122
122
|
- .gitignore
|
123
123
|
- Gemfile
|
124
124
|
- LICENSE
|
125
|
-
- README.
|
125
|
+
- README.markdown
|
126
126
|
- Rakefile
|
127
127
|
- THANKS-WILLIAM-JAMES.rb
|
128
128
|
- benchmark/before-with-free.txt
|
@@ -137,10 +137,10 @@ files:
|
|
137
137
|
- examples/bts_aircraft/blockings.csv
|
138
138
|
- examples/bts_aircraft/identities.csv
|
139
139
|
- examples/bts_aircraft/negatives.csv
|
140
|
+
- examples/bts_aircraft/normalizers.csv
|
140
141
|
- examples/bts_aircraft/number_260.csv
|
141
142
|
- examples/bts_aircraft/positives.csv
|
142
143
|
- examples/bts_aircraft/test_bts_aircraft.rb
|
143
|
-
- examples/bts_aircraft/tighteners.csv
|
144
144
|
- examples/first_name_matching.rb
|
145
145
|
- examples/icao-bts.xls
|
146
146
|
- fuzzy_match.gemspec
|
@@ -148,11 +148,11 @@ files:
|
|
148
148
|
- lib/fuzzy_match/blocking.rb
|
149
149
|
- lib/fuzzy_match/cached_result.rb
|
150
150
|
- lib/fuzzy_match/identity.rb
|
151
|
+
- lib/fuzzy_match/normalizer.rb
|
151
152
|
- lib/fuzzy_match/result.rb
|
152
153
|
- lib/fuzzy_match/score.rb
|
153
154
|
- lib/fuzzy_match/similarity.rb
|
154
155
|
- lib/fuzzy_match/stop_word.rb
|
155
|
-
- lib/fuzzy_match/tightener.rb
|
156
156
|
- lib/fuzzy_match/version.rb
|
157
157
|
- lib/fuzzy_match/wrapper.rb
|
158
158
|
- test/helper.rb
|
@@ -161,7 +161,7 @@ files:
|
|
161
161
|
- test/test_fuzzy_match.rb
|
162
162
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
163
163
|
- test/test_identity.rb
|
164
|
-
- test/
|
164
|
+
- test/test_normalizer.rb
|
165
165
|
homepage: https://github.com/seamusabshere/fuzzy_match
|
166
166
|
licenses: []
|
167
167
|
post_install_message:
|
@@ -194,4 +194,5 @@ test_files:
|
|
194
194
|
- test/test_fuzzy_match.rb
|
195
195
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
196
196
|
- test/test_identity.rb
|
197
|
-
- test/
|
197
|
+
- test/test_normalizer.rb
|
198
|
+
has_rdoc:
|
data/README.rdoc
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
= fuzzy_match
|
2
|
-
|
3
|
-
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
4
|
-
|
5
|
-
Replaces {loose_tight_dictionary}[https://github.com/seamusabshere/loose_tight_dictionary] because that was a confusing name.
|
6
|
-
|
7
|
-
== Quickstart
|
8
|
-
|
9
|
-
>> require 'fuzzy_match'
|
10
|
-
=> true
|
11
|
-
>> FuzzyMatch.new(%w{seamus andy ben}).find('Shamus')
|
12
|
-
=> "seamus"
|
13
|
-
|
14
|
-
== String similarity matching
|
15
|
-
|
16
|
-
Uses {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coefficient] algorithm (aka Pair Distance).
|
17
|
-
|
18
|
-
If that judges two strings to be equally similar to a third string, then Levenshtein distance is used. For example, pair distance considers "RATZ" and "CATZ" to be equally similar to "RITZ" so we invoke Levenshtein.
|
19
|
-
|
20
|
-
>> require 'amatch'
|
21
|
-
=> true
|
22
|
-
>> 'RITZ'.pair_distance_similar 'RATZ'
|
23
|
-
=> 0.3333333333333333
|
24
|
-
>> 'RITZ'.pair_distance_similar 'CATZ' # <-- pair distance can't tell the difference, so we fall back to levenshtein...
|
25
|
-
=> 0.3333333333333333
|
26
|
-
>> 'RITZ'.levenshtein_similar 'RATZ'
|
27
|
-
=> 0.75
|
28
|
-
>> 'RITZ'.levenshtein_similar 'CATZ' # <-- which properly shows that RATZ should win
|
29
|
-
=> 0.5
|
30
|
-
|
31
|
-
== Production use
|
32
|
-
|
33
|
-
Over 2 years in {Brighter Planet's environmental impact API}[http://impact.brighterplanet.com] and {reference data service}[http://data.brighterplanet.com].
|
34
|
-
|
35
|
-
== Haystacks and how to read them
|
36
|
-
|
37
|
-
The (admittedly imperfect) metaphor is "look for a needle in a haystack"
|
38
|
-
|
39
|
-
* needle - the search term
|
40
|
-
* haystack - the records you are searching (<b>your result will be an object from here</b>)
|
41
|
-
|
42
|
-
So, what if your needle is a string like <tt>youruguay</tt> and your haystack is full of <tt>Country</tt> objects like <tt><Country name:"Uruguay"></tt>?
|
43
|
-
|
44
|
-
>> FuzzyMatch.new(countries, :read => :name).find('youruguay')
|
45
|
-
=> <Country name:"Uruguay">
|
46
|
-
|
47
|
-
== Regular expressions
|
48
|
-
|
49
|
-
You can improve the default matchings with regular expressions.
|
50
|
-
|
51
|
-
* Emphasize important words using <b>blockings</b> and <b>tighteners</b>
|
52
|
-
* Filter out stop words with <b>tighteners</b>
|
53
|
-
* Prevent impossible matches with <b>blockings</b> and <b>identities</b>
|
54
|
-
* Ignore words with <b>stop words</b>
|
55
|
-
|
56
|
-
=== Blockings
|
57
|
-
|
58
|
-
Setting a blocking of <tt>/Airbus/</tt> ensures that strings containing "Airbus" will only be scored against other strings containing "Airbus". A better blocking in this case would probably be <tt>/airbus/i</tt>.
|
59
|
-
|
60
|
-
=== Tighteners
|
61
|
-
|
62
|
-
Adding a tightener like <tt>/(boeing).*(7\d\d)/i</tt> will cause "BOEING COMPANY 747" and "boeing747" to be scored as if they were "BOEING 747" and "boeing 747", respectively. See also "Case sensitivity" below.
|
63
|
-
|
64
|
-
=== Identities
|
65
|
-
|
66
|
-
Adding an identity like <tt>/(F)\-?(\d50)/</tt> ensures that "Ford F-150" and "Ford F-250" never match.
|
67
|
-
|
68
|
-
=== Stop words
|
69
|
-
|
70
|
-
Adding a stop word like <tt>THE</tt> ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT"
|
71
|
-
|
72
|
-
== Case sensitivity
|
73
|
-
|
74
|
-
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions. Your regexps may still be case-sensitive, though.
|
75
|
-
|
76
|
-
== Examples
|
77
|
-
|
78
|
-
Check out the tests.
|
79
|
-
|
80
|
-
== Speed (and who to thank for the algorithms)
|
81
|
-
|
82
|
-
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks {Flori}[https://github.com/flori]!
|
83
|
-
|
84
|
-
Otherwise, pure ruby versions of the string similarity algorithms derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] and {the text gem}[https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb] are used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao] and {threedaymonk}[https://github.com/threedaymonk]!
|
85
|
-
|
86
|
-
== Authors
|
87
|
-
|
88
|
-
* Seamus Abshere <seamus@abshere.net>
|
89
|
-
* Ian Hough <ijhough@gmail.com>
|
90
|
-
* Andy Rossmeissl <andy@rossmeissl.net>
|
91
|
-
|
92
|
-
== Copyright
|
93
|
-
|
94
|
-
Copyright 2011 Brighter Planet, Inc.
|