fuzzy_match 1.4.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
1
+ 1.4.1 / 2013-01-17
2
+
3
+ * Bug fixes
4
+
5
+ * Don't die when you're comparing a string of length 1 and another string of length less than 3 (thanks @ihough!)
6
+
7
+ * Enhancements
8
+
9
+ * '2A' is allowed to match '2 A'... funky stuff with pair distance and short strings
10
+ * FuzzyMatch#find_all_with_score returns a sorted array of records with their scores - thanks @brycesenz! (https://github.com/seamusabshere/fuzzy_match/issues/3)
11
+
1
12
  1.4.0 / 2012-09-07
2
13
 
3
14
  * Breaking changes
@@ -40,7 +40,9 @@ class FuzzyMatch
40
40
  :must_match_grouping => false,
41
41
  :must_match_at_least_one_word => false,
42
42
  :gather_last_result => false,
43
- :find_all => false
43
+ :find_all => false,
44
+ :find_all_with_score => false,
45
+ :threshold => 0
44
46
  }
45
47
 
46
48
  self.engine = DEFAULT_ENGINE
@@ -50,7 +52,7 @@ class FuzzyMatch
50
52
  attr_reader :identities
51
53
  attr_reader :normalizers
52
54
  attr_reader :stop_words
53
- attr_reader :read
55
+ attr_accessor :read
54
56
  attr_reader :default_options
55
57
 
56
58
  # haystack - a bunch of records that will compete to see who best matches the needle
@@ -60,13 +62,14 @@ class FuzzyMatch
60
62
  # * :<tt>identities</tt> - regexps
61
63
  # * :<tt>groupings</tt> - regexps
62
64
  # * :<tt>stop_words</tt> - regexps
65
+ # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
63
66
  #
64
67
  # Options (can be specified at initialization or when calling #find)
65
- # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
66
68
  # * :<tt>must_match_grouping</tt> - don't return a match unless the needle fits into one of the groupings you specified
67
69
  # * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
68
70
  # * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
69
71
  # * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
72
+ # * :<tt>threshold</tt> - minimum score that a record must exceed in order to be returned (not generally recommended - please test the results of setting a threshold thoroughly - one set of results and their scores probably won't be enough to determine the appropriate number). Only checked against the Pair Distance score; the threshold is bypassed when either string has length less than 3 and the Levenshtein score is above 0, or when the needle and the record share at least one word.
70
73
  def initialize(competitors, options_and_rules = {})
71
74
  options_and_rules = options_and_rules.dup
72
75
 
@@ -118,12 +121,19 @@ class FuzzyMatch
118
121
  options = options.merge(:find_all => true)
119
122
  find needle, options
120
123
  end
124
+
125
+ def find_all_with_score(needle, options = {})
126
+ options = options.merge(:find_all_with_score => true)
127
+ find needle, options
128
+ end
121
129
 
122
130
  def find(needle, options = {})
123
131
  options = default_options.merge options
124
132
 
133
+ threshold = options[:threshold]
125
134
  gather_last_result = options[:gather_last_result]
126
- is_find_all = options[:find_all]
135
+ is_find_all_with_score = options[:find_all_with_score]
136
+ is_find_all = options[:find_all] || is_find_all_with_score
127
137
  first_grouping_decides = options[:first_grouping_decides]
128
138
  must_match_grouping = options[:must_match_grouping]
129
139
  must_match_at_least_one_word = options[:must_match_at_least_one_word]
@@ -254,14 +264,31 @@ The competition was sorted in order of similarity to the needle.
254
264
  EOS
255
265
  end
256
266
 
267
+ if is_find_all_with_score
268
+ memo = []
269
+ similarities.each do |similarity|
270
+ if similarity.satisfy?(needle, threshold)
271
+ bs = similarity.best_score
272
+ memo << [similarity.wrapper2.record, bs.dices_coefficient_similar, bs.levenshtein_similar]
273
+ end
274
+ end
275
+ return memo
276
+ end
277
+
257
278
  if is_find_all
258
- return similarities.map { |similarity| similarity.wrapper2.record }
279
+ memo = []
280
+ similarities.each do |similarity|
281
+ if similarity.satisfy?(needle, threshold)
282
+ memo << similarity.wrapper2.record
283
+ end
284
+ end
285
+ return memo
259
286
  end
260
287
 
261
288
  best_similarity = similarities.first
262
289
  winner = nil
263
290
 
264
- if best_similarity and (best_similarity.best_score.dices_coefficient_similar > 0 or (needle.words & best_similarity.wrapper2.words).any?)
291
+ if best_similarity and best_similarity.satisfy?(needle, threshold)
265
292
  winner = best_similarity.wrapper2.record
266
293
  if gather_last_result
267
294
  last_result.winner = winner
@@ -12,8 +12,9 @@ class FuzzyMatch
12
12
  end
13
13
 
14
14
  def <=>(other)
15
- by_dices_coefficient = (dices_coefficient_similar <=> other.dices_coefficient_similar)
16
- if by_dices_coefficient == 0
15
+ a = dices_coefficient_similar
16
+ b = other.dices_coefficient_similar
17
+ if a.nan? or b.nan? or (by_dices_coefficient = (a <=> b)) == 0
17
18
  levenshtein_similar <=> other.levenshtein_similar
18
19
  else
19
20
  by_dices_coefficient
@@ -21,6 +21,12 @@ class FuzzyMatch
21
21
  @best_score ||= FuzzyMatch.score_class.new(best_wrapper1_variant, best_wrapper2_variant)
22
22
  end
23
23
 
24
+ def satisfy?(needle, threshold)
25
+ best_score.dices_coefficient_similar > threshold or
26
+ ((wrapper2.render.length < 3 or needle.render.length < 3) and best_score.levenshtein_similar > 0) or
27
+ (needle.words & wrapper2.words).any?
28
+ end
29
+
24
30
  def inspect
25
31
  %{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
26
32
  end
@@ -1,3 +1,3 @@
1
1
  class FuzzyMatch
2
- VERSION = '1.4.0'
2
+ VERSION = '1.4.1'
3
3
  end
@@ -1,6 +1,6 @@
1
1
  require 'rubygems'
2
- require 'bundler'
3
- Bundler.setup
2
+ require 'bundler/setup'
3
+
4
4
  require 'minitest/spec'
5
5
  require 'minitest/autorun'
6
6
 
@@ -26,6 +26,14 @@ describe FuzzyMatch do
26
26
  end
27
27
  end
28
28
 
29
+ describe '#find_all_with_score' do
30
+ it %{return records with 2 scores} do
31
+ d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
32
+ d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
33
+ d.find_all_with_score('A').must_equal []
34
+ end
35
+ end
36
+
29
37
  describe '#explain' do
30
38
  before do
31
39
  require 'stringio'
@@ -228,6 +236,21 @@ describe FuzzyMatch do
228
236
  # without making false positives
229
237
  d.find('Y bar').must_be_nil
230
238
  end
239
+
240
+ it %{finds possible matches even when pair distance fails} do
241
+ d = FuzzyMatch.new ['XX', '2 A']
242
+ d.find('2A').must_equal '2 A'
243
+ d = FuzzyMatch.new ['XX', '2A']
244
+ d.find('2 A').must_equal '2A'
245
+ end
246
+
247
+ it %{weird blow ups} do
248
+ d = FuzzyMatch.new ['XX', '2 A']
249
+ d.find('A').must_equal '2 A'
250
+ d = FuzzyMatch.new ['XX', 'A']
251
+ d.find('2 A').must_equal 'A'
252
+ end
253
+
231
254
  end
232
255
 
233
256
  describe 'deprecations' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fuzzy_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-07 00:00:00.000000000 Z
12
+ date: 2013-01-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: to_regexp