fuzzy_match 1.0.5 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -1
- data/lib/fuzzy_match/cached_result.rb +1 -1
- data/lib/fuzzy_match/result.rb +31 -7
- data/lib/fuzzy_match/score.rb +9 -9
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +11 -9
- data/lib/fuzzy_match.rb +147 -116
- data/test/test_cache.rb +2 -2
- data/test/test_fuzzy_match.rb +12 -7
- metadata +20 -20
data/README.rdoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
4
4
|
|
5
|
-
Replaces [
|
5
|
+
Replaces {loose_tight_dictionary}[https://github.com/seamusabshere/loose_tight_dictionary] because that was a confusing name.
|
6
6
|
|
7
7
|
== Quickstart
|
8
8
|
|
@@ -24,7 +24,7 @@ class FuzzyMatch
|
|
24
24
|
# required options:
|
25
25
|
# :primary_key - what to call on this class
|
26
26
|
# :foreign_key - what to call on the other class
|
27
|
-
def
|
27
|
+
def cache_fuzzy_match_with(other_active_record_class, options)
|
28
28
|
other = other_active_record_class.to_s.singularize.camelcase
|
29
29
|
me = name
|
30
30
|
if me < other
|
data/lib/fuzzy_match/result.rb
CHANGED
@@ -1,17 +1,41 @@
|
|
1
|
+
require 'erb'
|
2
|
+
|
1
3
|
class FuzzyMatch
|
2
4
|
class Result #:nodoc: all
|
5
|
+
EXPLANATION = <<-ERB
|
6
|
+
You looked for <%= needle.render.inspect %>
|
7
|
+
|
8
|
+
<% if winner %>It was matched with "<%= winner %>"<% else %>No match was found<% end %>
|
9
|
+
|
10
|
+
# THE HAYSTACK
|
11
|
+
|
12
|
+
The haystack reader was <%= read.inspect %>.
|
13
|
+
|
14
|
+
The haystack contained <%= haystack.length %> records like <%= haystack[0, 3].map(&:render).map(&:inspect).join(', ') %>
|
15
|
+
|
16
|
+
# HOW IT WAS MATCHED
|
17
|
+
<% timeline.each_with_index do |event, index| %>
|
18
|
+
(<%= index+1 %>) <%= event %>
|
19
|
+
<% end %>
|
20
|
+
ERB
|
21
|
+
|
22
|
+
def timeline
|
23
|
+
@timeline ||= []
|
24
|
+
end
|
25
|
+
|
3
26
|
attr_accessor :needle
|
27
|
+
attr_accessor :read
|
28
|
+
attr_accessor :haystack
|
29
|
+
attr_accessor :options
|
4
30
|
attr_accessor :tighteners
|
5
31
|
attr_accessor :blockings
|
6
32
|
attr_accessor :identities
|
7
33
|
attr_accessor :stop_words
|
8
|
-
attr_accessor :
|
9
|
-
attr_accessor :joint
|
10
|
-
attr_accessor :disjoint
|
11
|
-
attr_accessor :possibly_identical
|
12
|
-
attr_accessor :certainly_different
|
13
|
-
attr_accessor :similarities
|
14
|
-
attr_accessor :record
|
34
|
+
attr_accessor :winner
|
15
35
|
attr_accessor :score
|
36
|
+
|
37
|
+
def explain
|
38
|
+
$stdout.puts ::ERB.new(EXPLANATION, 0, '%<').result(binding)
|
39
|
+
end
|
16
40
|
end
|
17
41
|
end
|
data/lib/fuzzy_match/score.rb
CHANGED
@@ -14,13 +14,13 @@ class FuzzyMatch
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inspect
|
17
|
-
%{#<Score: dices_coefficient=#{
|
17
|
+
%{#<Score: dices_coefficient=#{dices_coefficient_similar} levenshtein=#{levenshtein_similar}>}
|
18
18
|
end
|
19
19
|
|
20
20
|
def <=>(other)
|
21
|
-
by_dices_coefficient = (
|
21
|
+
by_dices_coefficient = (dices_coefficient_similar <=> other.dices_coefficient_similar)
|
22
22
|
if by_dices_coefficient == 0
|
23
|
-
|
23
|
+
levenshtein_similar <=> other.levenshtein_similar
|
24
24
|
else
|
25
25
|
by_dices_coefficient
|
26
26
|
end
|
@@ -32,11 +32,11 @@ class FuzzyMatch
|
|
32
32
|
|
33
33
|
if defined?(::Amatch)
|
34
34
|
|
35
|
-
def
|
35
|
+
def dices_coefficient_similar
|
36
36
|
str1.pair_distance_similar str2
|
37
37
|
end
|
38
38
|
|
39
|
-
def
|
39
|
+
def levenshtein_similar
|
40
40
|
str1.levenshtein_similar str2
|
41
41
|
end
|
42
42
|
|
@@ -44,7 +44,7 @@ class FuzzyMatch
|
|
44
44
|
|
45
45
|
SPACE = ' '
|
46
46
|
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
47
|
-
def
|
47
|
+
def dices_coefficient_similar
|
48
48
|
if str1 == str2
|
49
49
|
return 1.0
|
50
50
|
elsif str1.length == 1 and str2.length == 1
|
@@ -77,7 +77,7 @@ class FuzzyMatch
|
|
77
77
|
# extracted/adapted from the text gem version 1.0.2
|
78
78
|
# normalization added for utf-8 strings
|
79
79
|
# lib/text/levenshtein.rb
|
80
|
-
def
|
80
|
+
def levenshtein_similar
|
81
81
|
if utf8?
|
82
82
|
unpack_rule = 'U*'
|
83
83
|
else
|
@@ -118,8 +118,8 @@ class FuzzyMatch
|
|
118
118
|
end
|
119
119
|
|
120
120
|
extend ::ActiveSupport::Memoizable
|
121
|
-
memoize :
|
122
|
-
memoize :
|
121
|
+
memoize :dices_coefficient_similar
|
122
|
+
memoize :levenshtein_similar
|
123
123
|
memoize :utf8?
|
124
124
|
end
|
125
125
|
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -3,20 +3,25 @@ class FuzzyMatch
|
|
3
3
|
class Wrapper #:nodoc: all
|
4
4
|
attr_reader :fuzzy_match
|
5
5
|
attr_reader :record
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :literal
|
7
|
+
attr_reader :rendered
|
7
8
|
|
8
|
-
def initialize(fuzzy_match, record,
|
9
|
+
def initialize(fuzzy_match, record, literal = false)
|
9
10
|
@fuzzy_match = fuzzy_match
|
10
11
|
@record = record
|
11
|
-
@
|
12
|
+
@literal = literal
|
12
13
|
end
|
13
14
|
|
14
15
|
def inspect
|
15
16
|
"#<Wrapper render=#{render} variants=#{variants.length}>"
|
16
17
|
end
|
18
|
+
|
19
|
+
def read
|
20
|
+
fuzzy_match.read unless literal
|
21
|
+
end
|
17
22
|
|
18
23
|
def render
|
19
|
-
return @render if rendered
|
24
|
+
return @render if rendered
|
20
25
|
str = case read
|
21
26
|
when ::Proc
|
22
27
|
read.call record
|
@@ -42,7 +47,8 @@ class FuzzyMatch
|
|
42
47
|
|
43
48
|
alias :to_str :render
|
44
49
|
|
45
|
-
|
50
|
+
# "Foo's Bar" should be treated as [ "Foo's", "Bar" ], so we don't use traditional regexp word boundaries (\b)
|
51
|
+
WORD_BOUNDARY = %r{\s+}
|
46
52
|
def words
|
47
53
|
@words ||= render.split(WORD_BOUNDARY)
|
48
54
|
end
|
@@ -59,9 +65,5 @@ class FuzzyMatch
|
|
59
65
|
memo
|
60
66
|
end.uniq
|
61
67
|
end
|
62
|
-
|
63
|
-
def rendered?
|
64
|
-
@rendered == true
|
65
|
-
end
|
66
68
|
end
|
67
69
|
end
|
data/lib/fuzzy_match.rb
CHANGED
@@ -17,33 +17,72 @@ class FuzzyMatch
|
|
17
17
|
autoload :Score, 'fuzzy_match/score'
|
18
18
|
autoload :CachedResult, 'fuzzy_match/cached_result'
|
19
19
|
|
20
|
+
DEFAULT_OPTIONS = {
|
21
|
+
:first_blocking_decides => false,
|
22
|
+
:must_match_blocking => false,
|
23
|
+
:must_match_at_least_one_word => false,
|
24
|
+
:gather_last_result => false,
|
25
|
+
:find_all => false
|
26
|
+
}
|
27
|
+
|
20
28
|
attr_reader :haystack
|
21
29
|
attr_reader :blockings
|
22
30
|
attr_reader :identities
|
23
31
|
attr_reader :tighteners
|
24
32
|
attr_reader :stop_words
|
25
|
-
attr_reader :
|
26
|
-
attr_reader :
|
27
|
-
attr_reader :default_must_match_at_least_one_word
|
33
|
+
attr_reader :read
|
34
|
+
attr_reader :default_options
|
28
35
|
|
29
|
-
# haystack - a bunch of records
|
30
|
-
#
|
36
|
+
# haystack - a bunch of records that will compete to see who best matches the needle
|
37
|
+
#
|
38
|
+
# rules (can only be specified at initialization or by using a setter)
|
31
39
|
# * tighteners: regexps (see readme)
|
32
40
|
# * identities: regexps
|
33
41
|
# * blockings: regexps
|
34
42
|
# * stop_words: regexps
|
35
43
|
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
44
|
+
#
|
45
|
+
# options (can be specified at initialization or when calling #find)
|
46
|
+
# * first_blocking_decides
|
47
|
+
# * must_match_blocking
|
48
|
+
# * must_match_at_least_one_word
|
49
|
+
# * gather_last_result
|
50
|
+
# * find_all
|
51
|
+
def initialize(competitors, options_and_rules = {})
|
52
|
+
options_and_rules = options_and_rules.symbolize_keys
|
53
|
+
|
54
|
+
# rules
|
55
|
+
self.blockings = options_and_rules.delete(:blockings) || []
|
56
|
+
self.identities = options_and_rules.delete(:identities) || []
|
57
|
+
self.tighteners = options_and_rules.delete(:tighteners) || []
|
58
|
+
self.stop_words = options_and_rules.delete(:stop_words) || []
|
59
|
+
@read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
|
60
|
+
|
61
|
+
# options
|
62
|
+
@default_options = options_and_rules.reverse_merge(DEFAULT_OPTIONS).freeze
|
63
|
+
|
64
|
+
# do this last
|
65
|
+
self.haystack = competitors
|
66
|
+
end
|
67
|
+
|
68
|
+
def blockings=(ary)
|
69
|
+
@blockings = ary.map { |regexp_or_str| Blocking.new regexp_or_str }
|
70
|
+
end
|
71
|
+
|
72
|
+
def identities=(ary)
|
73
|
+
@identities = ary.map { |regexp_or_str| Identity.new regexp_or_str }
|
74
|
+
end
|
75
|
+
|
76
|
+
def tighteners=(ary)
|
77
|
+
@tighteners = ary.map { |regexp_or_str| Tightener.new regexp_or_str }
|
78
|
+
end
|
79
|
+
|
80
|
+
def stop_words=(ary)
|
81
|
+
@stop_words = ary.map { |regexp_or_str| StopWord.new regexp_or_str }
|
82
|
+
end
|
83
|
+
|
84
|
+
def haystack=(ary)
|
85
|
+
@haystack = ary.map { |competitor| Wrapper.new self, competitor }
|
47
86
|
end
|
48
87
|
|
49
88
|
def last_result
|
@@ -58,16 +97,24 @@ class FuzzyMatch
|
|
58
97
|
def find(needle, options = {})
|
59
98
|
raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?
|
60
99
|
|
61
|
-
options = options.symbolize_keys
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
100
|
+
options = options.symbolize_keys.reverse_merge default_options
|
101
|
+
|
102
|
+
gather_last_result = options[:gather_last_result]
|
103
|
+
is_find_all = options[:find_all]
|
104
|
+
first_blocking_decides = options[:first_blocking_decides]
|
105
|
+
must_match_blocking = options[:must_match_blocking]
|
106
|
+
must_match_at_least_one_word = options[:must_match_at_least_one_word]
|
67
107
|
|
68
108
|
if gather_last_result
|
69
109
|
free_last_result
|
70
110
|
@last_result = Result.new
|
111
|
+
last_result.read = read
|
112
|
+
last_result.haystack = haystack
|
113
|
+
last_result.options = options
|
114
|
+
last_result.timeline << <<-EOS
|
115
|
+
Options were set, either by you or by falling back to defaults.
|
116
|
+
\tOptions: #{options.inspect}
|
117
|
+
EOS
|
71
118
|
end
|
72
119
|
|
73
120
|
if gather_last_result
|
@@ -77,13 +124,24 @@ class FuzzyMatch
|
|
77
124
|
last_result.stop_words = stop_words
|
78
125
|
end
|
79
126
|
|
80
|
-
needle = Wrapper.new self, needle
|
127
|
+
needle = Wrapper.new self, needle, true
|
81
128
|
|
82
129
|
if gather_last_result
|
83
130
|
last_result.needle = needle
|
131
|
+
last_result.timeline << <<-EOS
|
132
|
+
The needle's #{needle.variants.length} variants were enumerated.
|
133
|
+
\tVariants: #{needle.variants.map(&:inspect).join(', ')}
|
134
|
+
EOS
|
84
135
|
end
|
85
136
|
|
86
137
|
if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
|
138
|
+
if gather_last_result
|
139
|
+
last_result.timeline << <<-EOS
|
140
|
+
The needle didn't match any of the #{blockings.length} blockings, which was a requirement.
|
141
|
+
\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')}
|
142
|
+
EOS
|
143
|
+
end
|
144
|
+
|
87
145
|
if is_find_all
|
88
146
|
return []
|
89
147
|
else
|
@@ -91,83 +149,109 @@ class FuzzyMatch
|
|
91
149
|
end
|
92
150
|
end
|
93
151
|
|
94
|
-
|
95
|
-
haystack.select do |straw|
|
152
|
+
if must_match_at_least_one_word
|
153
|
+
passed_word_requirement = haystack.select do |straw|
|
96
154
|
(needle.words & straw.words).any?
|
97
155
|
end
|
156
|
+
if gather_last_result
|
157
|
+
last_result.timeline << <<-EOS
|
158
|
+
Since :must_match_at_least_one_word => true, the competition was reduced to records sharing at least one word with the needle.
|
159
|
+
\tNeedle words: #{needle.words.map(&:inspect).join(', ')}
|
160
|
+
\tPassed (first 3): #{passed_word_requirement[0,3].map(&:render).map(&:inspect).join(', ')}
|
161
|
+
\tFailed (first 3): #{(haystack-passed_word_requirement)[0,3].map(&:render).map(&:inspect).join(', ')}
|
162
|
+
EOS
|
163
|
+
end
|
98
164
|
else
|
99
|
-
haystack
|
100
|
-
end
|
101
|
-
|
102
|
-
if gather_last_result
|
103
|
-
last_result.candidates = candidates
|
165
|
+
passed_word_requirement = haystack
|
104
166
|
end
|
105
167
|
|
106
|
-
|
107
|
-
|
168
|
+
if blockings.any?
|
169
|
+
joint = passed_word_requirement.select do |straw|
|
108
170
|
if first_blocking_decides
|
109
171
|
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
110
172
|
else
|
111
173
|
blockings.any? { |blocking| blocking.join? needle, straw }
|
112
174
|
end
|
113
175
|
end
|
176
|
+
if gather_last_result
|
177
|
+
last_result.timeline << <<-EOS
|
178
|
+
Since there were blockings, the competition was reduced to records in the same block as the needle.
|
179
|
+
\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')}
|
180
|
+
\tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')}
|
181
|
+
\tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')}
|
182
|
+
EOS
|
183
|
+
end
|
114
184
|
else
|
115
|
-
|
185
|
+
joint = passed_word_requirement.dup
|
116
186
|
end
|
117
187
|
|
118
188
|
if joint.none?
|
119
189
|
if must_match_blocking
|
190
|
+
if gather_last_result
|
191
|
+
last_result.timeline << <<-EOS
|
192
|
+
Since :must_match_blocking => true and none of the competition was in the same block as the needle, the search stopped.
|
193
|
+
EOS
|
194
|
+
end
|
120
195
|
if is_find_all
|
121
196
|
return []
|
122
197
|
else
|
123
198
|
return nil
|
124
199
|
end
|
125
200
|
else
|
126
|
-
|
127
|
-
joint = disjoint
|
128
|
-
disjoint = []
|
201
|
+
joint = passed_word_requirement.dup
|
129
202
|
end
|
130
203
|
end
|
131
|
-
|
132
|
-
if
|
133
|
-
|
134
|
-
last_result.disjoint = disjoint
|
135
|
-
end
|
136
|
-
|
137
|
-
possibly_identical, certainly_different = if identities.any?
|
138
|
-
joint.partition do |straw|
|
204
|
+
|
205
|
+
if identities.any?
|
206
|
+
possibly_identical = joint.select do |straw|
|
139
207
|
identities.all? do |identity|
|
140
208
|
answer = identity.identical? needle, straw
|
141
209
|
answer.nil? or answer == true
|
142
210
|
end
|
143
211
|
end
|
212
|
+
if gather_last_result
|
213
|
+
last_result.timeline << <<-EOS
|
214
|
+
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
|
215
|
+
\tIdentities (first 3): #{identities[0,3].map(&:inspect).join(', ')}
|
216
|
+
\tPassed (first 3): #{possibly_identical[0,3].map(&:render).map(&:inspect).join(', ')}
|
217
|
+
\tFailed (first 3): #{(joint-possibly_identical)[0,3].map(&:render).map(&:inspect).join(', ')}
|
218
|
+
EOS
|
219
|
+
end
|
144
220
|
else
|
145
|
-
|
221
|
+
possibly_identical = joint.dup
|
146
222
|
end
|
147
|
-
|
223
|
+
|
224
|
+
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse
|
225
|
+
|
148
226
|
if gather_last_result
|
149
|
-
|
150
|
-
|
227
|
+
last_result.timeline << <<-EOS
|
228
|
+
The competition was sorted in order of similarity to the needle.
|
229
|
+
\tSimilar (first 3): #{(similarities)[0,3].map(&:wrapper2).map(&:render).map(&:inspect).join(', ')}
|
230
|
+
EOS
|
151
231
|
end
|
152
232
|
|
153
233
|
if is_find_all
|
154
|
-
return
|
155
|
-
end
|
156
|
-
|
157
|
-
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
|
158
|
-
|
159
|
-
if gather_last_result
|
160
|
-
last_result.similarities = similarities
|
234
|
+
return similarities.map { |similarity| similarity.wrapper2.record }
|
161
235
|
end
|
162
236
|
|
163
|
-
|
164
|
-
|
237
|
+
winner = nil
|
238
|
+
|
239
|
+
if best_similarity = similarities.first and best_similarity.best_score.dices_coefficient_similar > 0
|
240
|
+
winner = best_similarity.wrapper2.record
|
165
241
|
if gather_last_result
|
166
|
-
last_result.
|
167
|
-
last_result.score = best_similarity.best_score.
|
242
|
+
last_result.winner = winner
|
243
|
+
last_result.score = best_similarity.best_score.dices_coefficient_similar
|
244
|
+
last_result.timeline << <<-EOS
|
245
|
+
A winner was determined because the similarity score #{best_similarity.best_score.dices_coefficient_similar} is greater than zero.
|
246
|
+
EOS
|
168
247
|
end
|
169
|
-
|
248
|
+
elsif gather_last_result
|
249
|
+
last_result.timeline << <<-EOS
|
250
|
+
No winner assigned because similarity score was zero.
|
251
|
+
EOS
|
170
252
|
end
|
253
|
+
|
254
|
+
winner
|
171
255
|
end
|
172
256
|
|
173
257
|
# Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
|
@@ -175,63 +259,10 @@ class FuzzyMatch
|
|
175
259
|
# d = FuzzyMatch.new ['737', '747', '757' ]
|
176
260
|
# d.explain 'boeing 737-100'
|
177
261
|
def explain(needle, options = {})
|
178
|
-
|
179
|
-
|
180
|
-
log "# Match #{needle.inspect} => #{record.inspect}"
|
181
|
-
log "#" * 150
|
182
|
-
log
|
183
|
-
log "Needle"
|
184
|
-
log "-" * 150
|
185
|
-
log last_result.needle.render
|
186
|
-
log
|
187
|
-
log "Stop words"
|
188
|
-
log last_result.stop_words.blank? ? '(none)' : last_result.stop_words.map { |stop_word| stop_word.inspect }.join("\n")
|
189
|
-
log
|
190
|
-
log "Candidates"
|
191
|
-
log "-" * 150
|
192
|
-
log last_result.candidates.map { |record| record.render }.join("\n")
|
193
|
-
log
|
194
|
-
log "Tighteners"
|
195
|
-
log "-" * 150
|
196
|
-
log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n")
|
197
|
-
log
|
198
|
-
log "Blockings"
|
199
|
-
log "-" * 150
|
200
|
-
log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n")
|
201
|
-
log
|
202
|
-
log "Identities"
|
203
|
-
log "-" * 150
|
204
|
-
log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
|
205
|
-
log
|
206
|
-
log "Joint"
|
207
|
-
log "-" * 150
|
208
|
-
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n")
|
209
|
-
log
|
210
|
-
log "Disjoint"
|
211
|
-
log "-" * 150
|
212
|
-
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n")
|
213
|
-
log
|
214
|
-
log "Possibly identical"
|
215
|
-
log "-" * 150
|
216
|
-
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n")
|
217
|
-
log
|
218
|
-
log "Certainly different"
|
219
|
-
log "-" * 150
|
220
|
-
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n")
|
221
|
-
log
|
222
|
-
log "Similarities"
|
223
|
-
log "-" * 150
|
224
|
-
log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n")
|
225
|
-
log
|
226
|
-
log "Match"
|
227
|
-
log "-" * 150
|
228
|
-
log record.inspect
|
229
|
-
end
|
230
|
-
|
231
|
-
def log(str = '') #:nodoc:
|
232
|
-
$stderr.puts str
|
262
|
+
find needle, options.merge(:gather_last_result => true)
|
263
|
+
last_result.explain
|
233
264
|
end
|
234
|
-
|
265
|
+
|
235
266
|
def freed?
|
236
267
|
@freed == true
|
237
268
|
end
|
data/test/test_cache.rb
CHANGED
@@ -26,7 +26,7 @@ require 'fuzzy_match/cached_result'
|
|
26
26
|
class Aircraft < ActiveRecord::Base
|
27
27
|
set_primary_key :icao_code
|
28
28
|
|
29
|
-
|
29
|
+
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
30
30
|
|
31
31
|
def aircraft_description
|
32
32
|
[manufacturer_name, model_name].compact.join(' ')
|
@@ -53,7 +53,7 @@ end
|
|
53
53
|
class FlightSegment < ActiveRecord::Base
|
54
54
|
set_primary_key :row_hash
|
55
55
|
|
56
|
-
|
56
|
+
cache_fuzzy_match_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
57
57
|
|
58
58
|
extend CohortScope
|
59
59
|
self.minimum_cohort_size = 1
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -24,7 +24,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
24
24
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
25
25
|
d.find 'MISSAM', :gather_last_result => true
|
26
26
|
assert_equal 0.6, d.last_result.score
|
27
|
-
assert_equal 'NISSAN', d.last_result.
|
27
|
+
assert_equal 'NISSAN', d.last_result.winner
|
28
28
|
end
|
29
29
|
|
30
30
|
def test_004_false_positive_without_tightener
|
@@ -91,7 +91,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
91
91
|
|
92
92
|
# first_blocking_decides refers to the needle
|
93
93
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
94
|
-
assert_equal [
|
94
|
+
assert_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"], d.find_all('Boeing ER6')
|
95
95
|
|
96
96
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
97
97
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
@@ -108,6 +108,8 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
108
108
|
haystack = [ab, ba]
|
109
109
|
by_first = FuzzyMatch.new haystack, :read => :one
|
110
110
|
by_last = FuzzyMatch.new haystack, :read => :two
|
111
|
+
assert_equal :one, by_first.read
|
112
|
+
assert_equal :two, by_last.read
|
111
113
|
assert_equal ab, by_first.find('a')
|
112
114
|
assert_equal ab, by_last.find('b')
|
113
115
|
assert_equal ba, by_first.find('b')
|
@@ -154,6 +156,10 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
154
156
|
def test_019_must_match_at_least_one_word
|
155
157
|
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
156
158
|
assert_equal nil, d.find('RITZ')
|
159
|
+
|
160
|
+
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
161
|
+
assert_equal nil, d.find("Jacob's")
|
162
|
+
assert_equal "Foo's Bar", d.find("Foo's")
|
157
163
|
end
|
158
164
|
|
159
165
|
def test_020_stop_words
|
@@ -167,20 +173,19 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
167
173
|
assert_equal 'A HOTEL', d.find('A HTL')
|
168
174
|
end
|
169
175
|
|
170
|
-
def
|
176
|
+
def test_021_explain_prints_to_stdout
|
171
177
|
require 'stringio'
|
172
178
|
capture = StringIO.new
|
173
179
|
begin
|
174
|
-
|
175
|
-
$
|
180
|
+
old_stdout = $stdout
|
181
|
+
$stdout = capture
|
176
182
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
177
183
|
d.explain('RITZ')
|
178
184
|
ensure
|
179
|
-
$
|
185
|
+
$stdout = old_stdout
|
180
186
|
end
|
181
187
|
capture.rewind
|
182
188
|
assert capture.read.include?('CATZ')
|
183
|
-
capture.close
|
184
189
|
end
|
185
190
|
|
186
191
|
def test_022_compare_words_with_words
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-16 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153333800 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153333800
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153333320 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153333320
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153332560 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153332560
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2153331800 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2153331800
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2153325800 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2153325800
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2153325220 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2153325220
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2153322620 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2153322620
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2153322100 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2153322100
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2153321580 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2153321580
|
113
113
|
description: Find a needle in a haystack using string similarity and (optionally)
|
114
114
|
regexp rules. Replaces loose_tight_dictionary.
|
115
115
|
email:
|