fuzzy_match 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/lib/fuzzy_match/cached_result.rb +1 -1
- data/lib/fuzzy_match/result.rb +31 -7
- data/lib/fuzzy_match/score.rb +9 -9
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +11 -9
- data/lib/fuzzy_match.rb +147 -116
- data/test/test_cache.rb +2 -2
- data/test/test_fuzzy_match.rb +12 -7
- metadata +20 -20
data/README.rdoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
4
4
|
|
5
|
-
Replaces [
|
5
|
+
Replaces {loose_tight_dictionary}[https://github.com/seamusabshere/loose_tight_dictionary] because that was a confusing name.
|
6
6
|
|
7
7
|
== Quickstart
|
8
8
|
|
@@ -24,7 +24,7 @@ class FuzzyMatch
|
|
24
24
|
# required options:
|
25
25
|
# :primary_key - what to call on this class
|
26
26
|
# :foreign_key - what to call on the other class
|
27
|
-
def
|
27
|
+
def cache_fuzzy_match_with(other_active_record_class, options)
|
28
28
|
other = other_active_record_class.to_s.singularize.camelcase
|
29
29
|
me = name
|
30
30
|
if me < other
|
data/lib/fuzzy_match/result.rb
CHANGED
@@ -1,17 +1,41 @@
|
|
1
|
+
require 'erb'
|
2
|
+
|
1
3
|
class FuzzyMatch
|
2
4
|
class Result #:nodoc: all
|
5
|
+
EXPLANATION = <<-ERB
|
6
|
+
You looked for <%= needle.render.inspect %>
|
7
|
+
|
8
|
+
<% if winner %>It was matched with "<%= winner %>"<% else %>No match was found<% end %>
|
9
|
+
|
10
|
+
# THE HAYSTACK
|
11
|
+
|
12
|
+
The haystack reader was <%= read.inspect %>.
|
13
|
+
|
14
|
+
The haystack contained <%= haystack.length %> records like <%= haystack[0, 3].map(&:render).map(&:inspect).join(', ') %>
|
15
|
+
|
16
|
+
# HOW IT WAS MATCHED
|
17
|
+
<% timeline.each_with_index do |event, index| %>
|
18
|
+
(<%= index+1 %>) <%= event %>
|
19
|
+
<% end %>
|
20
|
+
ERB
|
21
|
+
|
22
|
+
def timeline
|
23
|
+
@timeline ||= []
|
24
|
+
end
|
25
|
+
|
3
26
|
attr_accessor :needle
|
27
|
+
attr_accessor :read
|
28
|
+
attr_accessor :haystack
|
29
|
+
attr_accessor :options
|
4
30
|
attr_accessor :tighteners
|
5
31
|
attr_accessor :blockings
|
6
32
|
attr_accessor :identities
|
7
33
|
attr_accessor :stop_words
|
8
|
-
attr_accessor :
|
9
|
-
attr_accessor :joint
|
10
|
-
attr_accessor :disjoint
|
11
|
-
attr_accessor :possibly_identical
|
12
|
-
attr_accessor :certainly_different
|
13
|
-
attr_accessor :similarities
|
14
|
-
attr_accessor :record
|
34
|
+
attr_accessor :winner
|
15
35
|
attr_accessor :score
|
36
|
+
|
37
|
+
def explain
|
38
|
+
$stdout.puts ::ERB.new(EXPLANATION, 0, '%<').result(binding)
|
39
|
+
end
|
16
40
|
end
|
17
41
|
end
|
data/lib/fuzzy_match/score.rb
CHANGED
@@ -14,13 +14,13 @@ class FuzzyMatch
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inspect
|
17
|
-
%{#<Score: dices_coefficient=#{
|
17
|
+
%{#<Score: dices_coefficient=#{dices_coefficient_similar} levenshtein=#{levenshtein_similar}>}
|
18
18
|
end
|
19
19
|
|
20
20
|
def <=>(other)
|
21
|
-
by_dices_coefficient = (
|
21
|
+
by_dices_coefficient = (dices_coefficient_similar <=> other.dices_coefficient_similar)
|
22
22
|
if by_dices_coefficient == 0
|
23
|
-
|
23
|
+
levenshtein_similar <=> other.levenshtein_similar
|
24
24
|
else
|
25
25
|
by_dices_coefficient
|
26
26
|
end
|
@@ -32,11 +32,11 @@ class FuzzyMatch
|
|
32
32
|
|
33
33
|
if defined?(::Amatch)
|
34
34
|
|
35
|
-
def
|
35
|
+
def dices_coefficient_similar
|
36
36
|
str1.pair_distance_similar str2
|
37
37
|
end
|
38
38
|
|
39
|
-
def
|
39
|
+
def levenshtein_similar
|
40
40
|
str1.levenshtein_similar str2
|
41
41
|
end
|
42
42
|
|
@@ -44,7 +44,7 @@ class FuzzyMatch
|
|
44
44
|
|
45
45
|
SPACE = ' '
|
46
46
|
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
47
|
-
def
|
47
|
+
def dices_coefficient_similar
|
48
48
|
if str1 == str2
|
49
49
|
return 1.0
|
50
50
|
elsif str1.length == 1 and str2.length == 1
|
@@ -77,7 +77,7 @@ class FuzzyMatch
|
|
77
77
|
# extracted/adapted from the text gem version 1.0.2
|
78
78
|
# normalization added for utf-8 strings
|
79
79
|
# lib/text/levenshtein.rb
|
80
|
-
def
|
80
|
+
def levenshtein_similar
|
81
81
|
if utf8?
|
82
82
|
unpack_rule = 'U*'
|
83
83
|
else
|
@@ -118,8 +118,8 @@ class FuzzyMatch
|
|
118
118
|
end
|
119
119
|
|
120
120
|
extend ::ActiveSupport::Memoizable
|
121
|
-
memoize :
|
122
|
-
memoize :
|
121
|
+
memoize :dices_coefficient_similar
|
122
|
+
memoize :levenshtein_similar
|
123
123
|
memoize :utf8?
|
124
124
|
end
|
125
125
|
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -3,20 +3,25 @@ class FuzzyMatch
|
|
3
3
|
class Wrapper #:nodoc: all
|
4
4
|
attr_reader :fuzzy_match
|
5
5
|
attr_reader :record
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :literal
|
7
|
+
attr_reader :rendered
|
7
8
|
|
8
|
-
def initialize(fuzzy_match, record,
|
9
|
+
def initialize(fuzzy_match, record, literal = false)
|
9
10
|
@fuzzy_match = fuzzy_match
|
10
11
|
@record = record
|
11
|
-
@
|
12
|
+
@literal = literal
|
12
13
|
end
|
13
14
|
|
14
15
|
def inspect
|
15
16
|
"#<Wrapper render=#{render} variants=#{variants.length}>"
|
16
17
|
end
|
18
|
+
|
19
|
+
def read
|
20
|
+
fuzzy_match.read unless literal
|
21
|
+
end
|
17
22
|
|
18
23
|
def render
|
19
|
-
return @render if rendered
|
24
|
+
return @render if rendered
|
20
25
|
str = case read
|
21
26
|
when ::Proc
|
22
27
|
read.call record
|
@@ -42,7 +47,8 @@ class FuzzyMatch
|
|
42
47
|
|
43
48
|
alias :to_str :render
|
44
49
|
|
45
|
-
|
50
|
+
# "Foo's Bar" should be treated as [ "Foo's", "Bar" ], so we don't use traditional regexp word boundaries (\b)
|
51
|
+
WORD_BOUNDARY = %r{\s+}
|
46
52
|
def words
|
47
53
|
@words ||= render.split(WORD_BOUNDARY)
|
48
54
|
end
|
@@ -59,9 +65,5 @@ class FuzzyMatch
|
|
59
65
|
memo
|
60
66
|
end.uniq
|
61
67
|
end
|
62
|
-
|
63
|
-
def rendered?
|
64
|
-
@rendered == true
|
65
|
-
end
|
66
68
|
end
|
67
69
|
end
|
data/lib/fuzzy_match.rb
CHANGED
@@ -17,33 +17,72 @@ class FuzzyMatch
|
|
17
17
|
autoload :Score, 'fuzzy_match/score'
|
18
18
|
autoload :CachedResult, 'fuzzy_match/cached_result'
|
19
19
|
|
20
|
+
DEFAULT_OPTIONS = {
|
21
|
+
:first_blocking_decides => false,
|
22
|
+
:must_match_blocking => false,
|
23
|
+
:must_match_at_least_one_word => false,
|
24
|
+
:gather_last_result => false,
|
25
|
+
:find_all => false
|
26
|
+
}
|
27
|
+
|
20
28
|
attr_reader :haystack
|
21
29
|
attr_reader :blockings
|
22
30
|
attr_reader :identities
|
23
31
|
attr_reader :tighteners
|
24
32
|
attr_reader :stop_words
|
25
|
-
attr_reader :
|
26
|
-
attr_reader :
|
27
|
-
attr_reader :default_must_match_at_least_one_word
|
33
|
+
attr_reader :read
|
34
|
+
attr_reader :default_options
|
28
35
|
|
29
|
-
# haystack - a bunch of records
|
30
|
-
#
|
36
|
+
# haystack - a bunch of records that will compete to see who best matches the needle
|
37
|
+
#
|
38
|
+
# rules (can only be specified at initialization or by using a setter)
|
31
39
|
# * tighteners: regexps (see readme)
|
32
40
|
# * identities: regexps
|
33
41
|
# * blockings: regexps
|
34
42
|
# * stop_words: regexps
|
35
43
|
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
44
|
+
#
|
45
|
+
# options (can be specified at initialization or when calling #find)
|
46
|
+
# * first_blocking_decides
|
47
|
+
# * must_match_blocking
|
48
|
+
# * must_match_at_least_one_word
|
49
|
+
# * gather_last_result
|
50
|
+
# * find_all
|
51
|
+
def initialize(competitors, options_and_rules = {})
|
52
|
+
options_and_rules = options_and_rules.symbolize_keys
|
53
|
+
|
54
|
+
# rules
|
55
|
+
self.blockings = options_and_rules.delete(:blockings) || []
|
56
|
+
self.identities = options_and_rules.delete(:identities) || []
|
57
|
+
self.tighteners = options_and_rules.delete(:tighteners) || []
|
58
|
+
self.stop_words = options_and_rules.delete(:stop_words) || []
|
59
|
+
@read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
|
60
|
+
|
61
|
+
# options
|
62
|
+
@default_options = options_and_rules.reverse_merge(DEFAULT_OPTIONS).freeze
|
63
|
+
|
64
|
+
# do this last
|
65
|
+
self.haystack = competitors
|
66
|
+
end
|
67
|
+
|
68
|
+
def blockings=(ary)
|
69
|
+
@blockings = ary.map { |regexp_or_str| Blocking.new regexp_or_str }
|
70
|
+
end
|
71
|
+
|
72
|
+
def identities=(ary)
|
73
|
+
@identities = ary.map { |regexp_or_str| Identity.new regexp_or_str }
|
74
|
+
end
|
75
|
+
|
76
|
+
def tighteners=(ary)
|
77
|
+
@tighteners = ary.map { |regexp_or_str| Tightener.new regexp_or_str }
|
78
|
+
end
|
79
|
+
|
80
|
+
def stop_words=(ary)
|
81
|
+
@stop_words = ary.map { |regexp_or_str| StopWord.new regexp_or_str }
|
82
|
+
end
|
83
|
+
|
84
|
+
def haystack=(ary)
|
85
|
+
@haystack = ary.map { |competitor| Wrapper.new self, competitor }
|
47
86
|
end
|
48
87
|
|
49
88
|
def last_result
|
@@ -58,16 +97,24 @@ class FuzzyMatch
|
|
58
97
|
def find(needle, options = {})
|
59
98
|
raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?
|
60
99
|
|
61
|
-
options = options.symbolize_keys
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
100
|
+
options = options.symbolize_keys.reverse_merge default_options
|
101
|
+
|
102
|
+
gather_last_result = options[:gather_last_result]
|
103
|
+
is_find_all = options[:find_all]
|
104
|
+
first_blocking_decides = options[:first_blocking_decides]
|
105
|
+
must_match_blocking = options[:must_match_blocking]
|
106
|
+
must_match_at_least_one_word = options[:must_match_at_least_one_word]
|
67
107
|
|
68
108
|
if gather_last_result
|
69
109
|
free_last_result
|
70
110
|
@last_result = Result.new
|
111
|
+
last_result.read = read
|
112
|
+
last_result.haystack = haystack
|
113
|
+
last_result.options = options
|
114
|
+
last_result.timeline << <<-EOS
|
115
|
+
Options were set, either by you or by falling back to defaults.
|
116
|
+
\tOptions: #{options.inspect}
|
117
|
+
EOS
|
71
118
|
end
|
72
119
|
|
73
120
|
if gather_last_result
|
@@ -77,13 +124,24 @@ class FuzzyMatch
|
|
77
124
|
last_result.stop_words = stop_words
|
78
125
|
end
|
79
126
|
|
80
|
-
needle = Wrapper.new self, needle
|
127
|
+
needle = Wrapper.new self, needle, true
|
81
128
|
|
82
129
|
if gather_last_result
|
83
130
|
last_result.needle = needle
|
131
|
+
last_result.timeline << <<-EOS
|
132
|
+
The needle's #{needle.variants.length} variants were enumerated.
|
133
|
+
\tVariants: #{needle.variants.map(&:inspect).join(', ')}
|
134
|
+
EOS
|
84
135
|
end
|
85
136
|
|
86
137
|
if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
|
138
|
+
if gather_last_result
|
139
|
+
last_result.timeline << <<-EOS
|
140
|
+
The needle didn't match any of the #{blockings.length} blocking, which was a requirement.
|
141
|
+
\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')}
|
142
|
+
EOS
|
143
|
+
end
|
144
|
+
|
87
145
|
if is_find_all
|
88
146
|
return []
|
89
147
|
else
|
@@ -91,83 +149,109 @@ class FuzzyMatch
|
|
91
149
|
end
|
92
150
|
end
|
93
151
|
|
94
|
-
|
95
|
-
haystack.select do |straw|
|
152
|
+
if must_match_at_least_one_word
|
153
|
+
passed_word_requirement = haystack.select do |straw|
|
96
154
|
(needle.words & straw.words).any?
|
97
155
|
end
|
156
|
+
if gather_last_result
|
157
|
+
last_result.timeline << <<-EOS
|
158
|
+
Since :must_match_at_least_one_word => true, the competition was reduced to records sharing at least one word with the needle.
|
159
|
+
\tNeedle words: #{needle.words.map(&:inspect).join(', ')}
|
160
|
+
\tPassed (first 3): #{passed_word_requirement[0,3].map(&:render).map(&:inspect).join(', ')}
|
161
|
+
\tFailed (first 3): #{(haystack-passed_word_requirement)[0,3].map(&:render).map(&:inspect).join(', ')}
|
162
|
+
EOS
|
163
|
+
end
|
98
164
|
else
|
99
|
-
haystack
|
100
|
-
end
|
101
|
-
|
102
|
-
if gather_last_result
|
103
|
-
last_result.candidates = candidates
|
165
|
+
passed_word_requirement = haystack
|
104
166
|
end
|
105
167
|
|
106
|
-
|
107
|
-
|
168
|
+
if blockings.any?
|
169
|
+
joint = passed_word_requirement.select do |straw|
|
108
170
|
if first_blocking_decides
|
109
171
|
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
110
172
|
else
|
111
173
|
blockings.any? { |blocking| blocking.join? needle, straw }
|
112
174
|
end
|
113
175
|
end
|
176
|
+
if gather_last_result
|
177
|
+
last_result.timeline << <<-EOS
|
178
|
+
Since there were blockings, the competition was reduced to records in the same block as the needle.
|
179
|
+
\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')}
|
180
|
+
\tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')}
|
181
|
+
\tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')}
|
182
|
+
EOS
|
183
|
+
end
|
114
184
|
else
|
115
|
-
|
185
|
+
joint = passed_word_requirement.dup
|
116
186
|
end
|
117
187
|
|
118
188
|
if joint.none?
|
119
189
|
if must_match_blocking
|
190
|
+
if gather_last_result
|
191
|
+
last_result.timeline << <<-EOS
|
192
|
+
Since :must_match_at_least_one_word => true and none of the competition was in the same block as the needle, the search stopped.
|
193
|
+
EOS
|
194
|
+
end
|
120
195
|
if is_find_all
|
121
196
|
return []
|
122
197
|
else
|
123
198
|
return nil
|
124
199
|
end
|
125
200
|
else
|
126
|
-
|
127
|
-
joint = disjoint
|
128
|
-
disjoint = []
|
201
|
+
joint = passed_word_requirement.dup
|
129
202
|
end
|
130
203
|
end
|
131
|
-
|
132
|
-
if
|
133
|
-
|
134
|
-
last_result.disjoint = disjoint
|
135
|
-
end
|
136
|
-
|
137
|
-
possibly_identical, certainly_different = if identities.any?
|
138
|
-
joint.partition do |straw|
|
204
|
+
|
205
|
+
if identities.any?
|
206
|
+
possibly_identical = joint.select do |straw|
|
139
207
|
identities.all? do |identity|
|
140
208
|
answer = identity.identical? needle, straw
|
141
209
|
answer.nil? or answer == true
|
142
210
|
end
|
143
211
|
end
|
212
|
+
if gather_last_result
|
213
|
+
last_result.timeline << <<-EOS
|
214
|
+
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
|
215
|
+
\Identities (first 3): #{identities[0,3].map(&:inspect).join(', ')}
|
216
|
+
\tPassed (first 3): #{possibly_identical[0,3].map(&:render).map(&:inspect).join(', ')}
|
217
|
+
\tFailed (first 3): #{(joint-possibly_identical)[0,3].map(&:render).map(&:inspect).join(', ')}
|
218
|
+
EOS
|
219
|
+
end
|
144
220
|
else
|
145
|
-
|
221
|
+
possibly_identical = joint.dup
|
146
222
|
end
|
147
|
-
|
223
|
+
|
224
|
+
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse
|
225
|
+
|
148
226
|
if gather_last_result
|
149
|
-
|
150
|
-
|
227
|
+
last_result.timeline << <<-EOS
|
228
|
+
The competition was sorted in order of similarity to the needle.
|
229
|
+
\tSimilar (first 3): #{(similarities)[0,3].map(&:wrapper2).map(&:render).map(&:inspect).join(', ')}
|
230
|
+
EOS
|
151
231
|
end
|
152
232
|
|
153
233
|
if is_find_all
|
154
|
-
return
|
155
|
-
end
|
156
|
-
|
157
|
-
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
|
158
|
-
|
159
|
-
if gather_last_result
|
160
|
-
last_result.similarities = similarities
|
234
|
+
return similarities.map { |similarity| similarity.wrapper2.record }
|
161
235
|
end
|
162
236
|
|
163
|
-
|
164
|
-
|
237
|
+
winner = nil
|
238
|
+
|
239
|
+
if best_similarity = similarities.first and best_similarity.best_score.dices_coefficient_similar > 0
|
240
|
+
winner = best_similarity.wrapper2.record
|
165
241
|
if gather_last_result
|
166
|
-
last_result.
|
167
|
-
last_result.score = best_similarity.best_score.
|
242
|
+
last_result.winner = winner
|
243
|
+
last_result.score = best_similarity.best_score.dices_coefficient_similar
|
244
|
+
last_result.timeline << <<-EOS
|
245
|
+
A winner was determined because the similarity score #{best_similarity.best_score.dices_coefficient_similar} is greater than zero.
|
246
|
+
EOS
|
168
247
|
end
|
169
|
-
|
248
|
+
elsif gather_last_result
|
249
|
+
last_result.timeline << <<-EOS
|
250
|
+
No winner assigned because similarity score was zero.
|
251
|
+
EOS
|
170
252
|
end
|
253
|
+
|
254
|
+
winner
|
171
255
|
end
|
172
256
|
|
173
257
|
# Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
|
@@ -175,63 +259,10 @@ class FuzzyMatch
|
|
175
259
|
# d = FuzzyMatch.new ['737', '747', '757' ]
|
176
260
|
# d.explain 'boeing 737-100'
|
177
261
|
def explain(needle, options = {})
|
178
|
-
|
179
|
-
|
180
|
-
log "# Match #{needle.inspect} => #{record.inspect}"
|
181
|
-
log "#" * 150
|
182
|
-
log
|
183
|
-
log "Needle"
|
184
|
-
log "-" * 150
|
185
|
-
log last_result.needle.render
|
186
|
-
log
|
187
|
-
log "Stop words"
|
188
|
-
log last_result.stop_words.blank? ? '(none)' : last_result.stop_words.map { |stop_word| stop_word.inspect }.join("\n")
|
189
|
-
log
|
190
|
-
log "Candidates"
|
191
|
-
log "-" * 150
|
192
|
-
log last_result.candidates.map { |record| record.render }.join("\n")
|
193
|
-
log
|
194
|
-
log "Tighteners"
|
195
|
-
log "-" * 150
|
196
|
-
log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n")
|
197
|
-
log
|
198
|
-
log "Blockings"
|
199
|
-
log "-" * 150
|
200
|
-
log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n")
|
201
|
-
log
|
202
|
-
log "Identities"
|
203
|
-
log "-" * 150
|
204
|
-
log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
|
205
|
-
log
|
206
|
-
log "Joint"
|
207
|
-
log "-" * 150
|
208
|
-
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n")
|
209
|
-
log
|
210
|
-
log "Disjoint"
|
211
|
-
log "-" * 150
|
212
|
-
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n")
|
213
|
-
log
|
214
|
-
log "Possibly identical"
|
215
|
-
log "-" * 150
|
216
|
-
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n")
|
217
|
-
log
|
218
|
-
log "Certainly different"
|
219
|
-
log "-" * 150
|
220
|
-
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n")
|
221
|
-
log
|
222
|
-
log "Similarities"
|
223
|
-
log "-" * 150
|
224
|
-
log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n")
|
225
|
-
log
|
226
|
-
log "Match"
|
227
|
-
log "-" * 150
|
228
|
-
log record.inspect
|
229
|
-
end
|
230
|
-
|
231
|
-
def log(str = '') #:nodoc:
|
232
|
-
$stderr.puts str
|
262
|
+
find needle, options.merge(:gather_last_result => true)
|
263
|
+
last_result.explain
|
233
264
|
end
|
234
|
-
|
265
|
+
|
235
266
|
def freed?
|
236
267
|
@freed == true
|
237
268
|
end
|
data/test/test_cache.rb
CHANGED
@@ -26,7 +26,7 @@ require 'fuzzy_match/cached_result'
|
|
26
26
|
class Aircraft < ActiveRecord::Base
|
27
27
|
set_primary_key :icao_code
|
28
28
|
|
29
|
-
|
29
|
+
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
30
30
|
|
31
31
|
def aircraft_description
|
32
32
|
[manufacturer_name, model_name].compact.join(' ')
|
@@ -53,7 +53,7 @@ end
|
|
53
53
|
class FlightSegment < ActiveRecord::Base
|
54
54
|
set_primary_key :row_hash
|
55
55
|
|
56
|
-
|
56
|
+
cache_fuzzy_match_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
57
57
|
|
58
58
|
extend CohortScope
|
59
59
|
self.minimum_cohort_size = 1
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -24,7 +24,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
24
24
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
25
25
|
d.find 'MISSAM', :gather_last_result => true
|
26
26
|
assert_equal 0.6, d.last_result.score
|
27
|
-
assert_equal 'NISSAN', d.last_result.
|
27
|
+
assert_equal 'NISSAN', d.last_result.winner
|
28
28
|
end
|
29
29
|
|
30
30
|
def test_004_false_positive_without_tightener
|
@@ -91,7 +91,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
91
91
|
|
92
92
|
# first_blocking_decides refers to the needle
|
93
93
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
94
|
-
assert_equal [
|
94
|
+
assert_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"], d.find_all('Boeing ER6')
|
95
95
|
|
96
96
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
97
97
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
@@ -108,6 +108,8 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
108
108
|
haystack = [ab, ba]
|
109
109
|
by_first = FuzzyMatch.new haystack, :read => :one
|
110
110
|
by_last = FuzzyMatch.new haystack, :read => :two
|
111
|
+
assert_equal :one, by_first.read
|
112
|
+
assert_equal :two, by_last.read
|
111
113
|
assert_equal ab, by_first.find('a')
|
112
114
|
assert_equal ab, by_last.find('b')
|
113
115
|
assert_equal ba, by_first.find('b')
|
@@ -154,6 +156,10 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
154
156
|
def test_019_must_match_at_least_one_word
|
155
157
|
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
156
158
|
assert_equal nil, d.find('RITZ')
|
159
|
+
|
160
|
+
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
161
|
+
assert_equal nil, d.find("Jacob's")
|
162
|
+
assert_equal "Foo's Bar", d.find("Foo's")
|
157
163
|
end
|
158
164
|
|
159
165
|
def test_020_stop_words
|
@@ -167,20 +173,19 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
167
173
|
assert_equal 'A HOTEL', d.find('A HTL')
|
168
174
|
end
|
169
175
|
|
170
|
-
def
|
176
|
+
def test_021_explain_prints_to_stdout
|
171
177
|
require 'stringio'
|
172
178
|
capture = StringIO.new
|
173
179
|
begin
|
174
|
-
|
175
|
-
$
|
180
|
+
old_stdout = $stdout
|
181
|
+
$stdout = capture
|
176
182
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
177
183
|
d.explain('RITZ')
|
178
184
|
ensure
|
179
|
-
$
|
185
|
+
$stdout = old_stdout
|
180
186
|
end
|
181
187
|
capture.rewind
|
182
188
|
assert capture.read.include?('CATZ')
|
183
|
-
capture.close
|
184
189
|
end
|
185
190
|
|
186
191
|
def test_022_compare_words_with_words
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-16 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153333800 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153333800
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153333320 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153333320
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153332560 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153332560
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2153331800 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2153331800
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2153325800 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2153325800
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2153325220 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2153325220
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2153322620 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2153322620
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2153322100 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2153322100
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2153321580 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2153321580
|
113
113
|
description: Find a needle in a haystack using string similarity and (optionally)
|
114
114
|
regexp rules. Replaces loose_tight_dictionary.
|
115
115
|
email:
|