fuzzy_match 1.4.0 → 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +11 -0
- data/lib/fuzzy_match.rb +33 -6
- data/lib/fuzzy_match/score.rb +3 -2
- data/lib/fuzzy_match/similarity.rb +6 -0
- data/lib/fuzzy_match/version.rb +1 -1
- data/test/helper.rb +2 -2
- data/test/test_fuzzy_match.rb +23 -0
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
1.4.1 / 2013-01-17
|
2
|
+
|
3
|
+
* Bug fixes
|
4
|
+
|
5
|
+
* Don't die when you're comparing a string of length 1 and another string of length less than 3 (thanks @ihough!)
|
6
|
+
|
7
|
+
* Enhancements
|
8
|
+
|
9
|
+
* '2A' is allowed to match '2 A'... funky stuff with pair distance and short strings
|
10
|
+
* FuzzyMatch#find_all_with_score returns a sorted array of records with their scores - thanks @brycesenz! (https://github.com/seamusabshere/fuzzy_match/issues/3)
|
11
|
+
|
1
12
|
1.4.0 / 2012-09-07
|
2
13
|
|
3
14
|
* Breaking changes
|
data/lib/fuzzy_match.rb
CHANGED
@@ -40,7 +40,9 @@ class FuzzyMatch
|
|
40
40
|
:must_match_grouping => false,
|
41
41
|
:must_match_at_least_one_word => false,
|
42
42
|
:gather_last_result => false,
|
43
|
-
:find_all => false
|
43
|
+
:find_all => false,
|
44
|
+
:find_all_with_score => false,
|
45
|
+
:threshold => 0
|
44
46
|
}
|
45
47
|
|
46
48
|
self.engine = DEFAULT_ENGINE
|
@@ -50,7 +52,7 @@ class FuzzyMatch
|
|
50
52
|
attr_reader :identities
|
51
53
|
attr_reader :normalizers
|
52
54
|
attr_reader :stop_words
|
53
|
-
|
55
|
+
attr_accessor :read
|
54
56
|
attr_reader :default_options
|
55
57
|
|
56
58
|
# haystack - a bunch of records that will compete to see who best matches the needle
|
@@ -60,13 +62,14 @@ class FuzzyMatch
|
|
60
62
|
# * :<tt>identities</tt> - regexps
|
61
63
|
# * :<tt>groupings</tt> - regexps
|
62
64
|
# * :<tt>stop_words</tt> - regexps
|
65
|
+
# * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
|
63
66
|
#
|
64
67
|
# Options (can be specified at initialization or when calling #find)
|
65
|
-
# * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
|
66
68
|
# * :<tt>must_match_grouping</tt> - don't return a match unless the needle fits into one of the groupings you specified
|
67
69
|
# * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
|
68
70
|
# * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
|
69
71
|
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
|
72
|
+
# * :<tt>threshold</tt> - set a score threshold below which results are not returned (not generally recommended - please test the results of setting a threshold thoroughly - one set of results and their scores probably won't be enough to determine the appropriate number). Only checked against the Pair Distance score and ignored when one string or the other is of length 1.
|
70
73
|
def initialize(competitors, options_and_rules = {})
|
71
74
|
options_and_rules = options_and_rules.dup
|
72
75
|
|
@@ -118,12 +121,19 @@ class FuzzyMatch
|
|
118
121
|
options = options.merge(:find_all => true)
|
119
122
|
find needle, options
|
120
123
|
end
|
124
|
+
|
125
|
+
def find_all_with_score(needle, options = {})
|
126
|
+
options = options.merge(:find_all_with_score => true)
|
127
|
+
find needle, options
|
128
|
+
end
|
121
129
|
|
122
130
|
def find(needle, options = {})
|
123
131
|
options = default_options.merge options
|
124
132
|
|
133
|
+
threshold = options[:threshold]
|
125
134
|
gather_last_result = options[:gather_last_result]
|
126
|
-
|
135
|
+
is_find_all_with_score = options[:find_all_with_score]
|
136
|
+
is_find_all = options[:find_all] || is_find_all_with_score
|
127
137
|
first_grouping_decides = options[:first_grouping_decides]
|
128
138
|
must_match_grouping = options[:must_match_grouping]
|
129
139
|
must_match_at_least_one_word = options[:must_match_at_least_one_word]
|
@@ -254,14 +264,31 @@ The competition was sorted in order of similarity to the needle.
|
|
254
264
|
EOS
|
255
265
|
end
|
256
266
|
|
267
|
+
if is_find_all_with_score
|
268
|
+
memo = []
|
269
|
+
similarities.each do |similarity|
|
270
|
+
if similarity.satisfy?(needle, threshold)
|
271
|
+
bs = similarity.best_score
|
272
|
+
memo << [similarity.wrapper2.record, bs.dices_coefficient_similar, bs.levenshtein_similar]
|
273
|
+
end
|
274
|
+
end
|
275
|
+
return memo
|
276
|
+
end
|
277
|
+
|
257
278
|
if is_find_all
|
258
|
-
|
279
|
+
memo = []
|
280
|
+
similarities.each do |similarity|
|
281
|
+
if similarity.satisfy?(needle, threshold)
|
282
|
+
memo << similarity.wrapper2.record
|
283
|
+
end
|
284
|
+
end
|
285
|
+
return memo
|
259
286
|
end
|
260
287
|
|
261
288
|
best_similarity = similarities.first
|
262
289
|
winner = nil
|
263
290
|
|
264
|
-
if best_similarity and
|
291
|
+
if best_similarity and best_similarity.satisfy?(needle, threshold)
|
265
292
|
winner = best_similarity.wrapper2.record
|
266
293
|
if gather_last_result
|
267
294
|
last_result.winner = winner
|
data/lib/fuzzy_match/score.rb
CHANGED
@@ -12,8 +12,9 @@ class FuzzyMatch
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def <=>(other)
|
15
|
-
|
16
|
-
|
15
|
+
a = dices_coefficient_similar
|
16
|
+
b = other.dices_coefficient_similar
|
17
|
+
if a.nan? or b.nan? or (by_dices_coefficient = (a <=> b)) == 0
|
17
18
|
levenshtein_similar <=> other.levenshtein_similar
|
18
19
|
else
|
19
20
|
by_dices_coefficient
|
@@ -21,6 +21,12 @@ class FuzzyMatch
|
|
21
21
|
@best_score ||= FuzzyMatch.score_class.new(best_wrapper1_variant, best_wrapper2_variant)
|
22
22
|
end
|
23
23
|
|
24
|
+
def satisfy?(needle, threshold)
|
25
|
+
best_score.dices_coefficient_similar > threshold or
|
26
|
+
((wrapper2.render.length < 3 or needle.render.length < 3) and best_score.levenshtein_similar > 0) or
|
27
|
+
(needle.words & wrapper2.words).any?
|
28
|
+
end
|
29
|
+
|
24
30
|
def inspect
|
25
31
|
%{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
26
32
|
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/test/helper.rb
CHANGED
data/test/test_fuzzy_match.rb
CHANGED
@@ -26,6 +26,14 @@ describe FuzzyMatch do
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
+
describe '#find_all_with_score' do
|
30
|
+
it %{return records with 2 scores} do
|
31
|
+
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
|
32
|
+
d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
|
33
|
+
d.find_all_with_score('A').must_equal []
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
29
37
|
describe '#explain' do
|
30
38
|
before do
|
31
39
|
require 'stringio'
|
@@ -228,6 +236,21 @@ describe FuzzyMatch do
|
|
228
236
|
# without making false positives
|
229
237
|
d.find('Y bar').must_be_nil
|
230
238
|
end
|
239
|
+
|
240
|
+
it %{finds possible matches even when pair distance fails} do
|
241
|
+
d = FuzzyMatch.new ['XX', '2 A']
|
242
|
+
d.find('2A').must_equal '2 A'
|
243
|
+
d = FuzzyMatch.new ['XX', '2A']
|
244
|
+
d.find('2 A').must_equal '2A'
|
245
|
+
end
|
246
|
+
|
247
|
+
it %{weird blow ups} do
|
248
|
+
d = FuzzyMatch.new ['XX', '2 A']
|
249
|
+
d.find('A').must_equal '2 A'
|
250
|
+
d = FuzzyMatch.new ['XX', 'A']
|
251
|
+
d.find('2 A').must_equal 'A'
|
252
|
+
end
|
253
|
+
|
231
254
|
end
|
232
255
|
|
233
256
|
describe 'deprecations' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.0
|
4
|
+
version: 1.4.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: to_regexp
|