RubyGems - fuzzy_match - Versions diffs - 1.4.0 → 1.4.1 - Mend

fuzzy_match 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/CHANGELOG +11 -0
data/lib/fuzzy_match.rb +33 -6
data/lib/fuzzy_match/score.rb +3 -2
data/lib/fuzzy_match/similarity.rb +6 -0
data/lib/fuzzy_match/version.rb +1 -1
data/test/helper.rb +2 -2
data/test/test_fuzzy_match.rb +23 -0
metadata +2 -2

data/CHANGELOG CHANGED

@@ -1,3 +1,14 @@
+1.4.1 / 2013-01-17
+* Bug fixes
+  * Don't die when you're comparing a string of length 1 and another string of length less than three (thanks @ihough !)
+* Enhancements
+  * '2A' is allowed to match '2 A'... funky stuff with pair distance and short strings
+  * FuzzyMatch#find_all_with_score returns a sorted array of records with their scores - thanks @brycesenz! (https://github.com/seamusabshere/fuzzy_match/issues/3)
 1.4.0 / 2012-09-07
 * Breaking changes

data/lib/fuzzy_match.rb CHANGED

@@ -40,7 +40,9 @@ class FuzzyMatch
     :must_match_grouping => false,
     :must_match_at_least_one_word => false,
     :gather_last_result => false,
-    :find_all => false
+    :find_all => false,
+    :find_all_with_score => false,
+    :threshold => 0
   }
   self.engine = DEFAULT_ENGINE
@@ -50,7 +52,7 @@ class FuzzyMatch
   attr_reader :identities
   attr_reader :normalizers
   attr_reader :stop_words
-  attr_reader :read
+  attr_accessor :read
   attr_reader :default_options
   # haystack - a bunch of records that will compete to see who best matches the needle
@@ -60,13 +62,14 @@ class FuzzyMatch
   # * :<tt>identities</tt> - regexps
   # * :<tt>groupings</tt> - regexps
   # * :<tt>stop_words</tt> - regexps
+  # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
   #
   # Options (can be specified at initialization or when calling #find)
-  # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
   # * :<tt>must_match_grouping</tt> - don't return a match unless the needle fits into one of the groupings you specified
   # * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
   # * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
   # * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
+  # * :<tt>threshold</tt> - set a score threshold below which not to return results (not generally recommended - please test the results of setting a threshold thoroughly - one set of results and their scores probably won't be enough to determine the appropriate number). Only checked against the Pair Distance score and ignored when one string or the other is of length 1.
   def initialize(competitors, options_and_rules = {})
     options_and_rules = options_and_rules.dup
@@ -118,12 +121,19 @@ class FuzzyMatch
     options = options.merge(:find_all => true)
     find needle, options
   end
+  def find_all_with_score(needle, options = {})
+    options = options.merge(:find_all_with_score => true)
+    find needle, options
+  end
   def find(needle, options = {})
     options = default_options.merge options
+    threshold = options[:threshold]
     gather_last_result = options[:gather_last_result]
-    is_find_all = options[:find_all]
+    is_find_all_with_score = options[:find_all_with_score]
+    is_find_all = options[:find_all] || is_find_all_with_score
     first_grouping_decides = options[:first_grouping_decides]
     must_match_grouping = options[:must_match_grouping]
     must_match_at_least_one_word = options[:must_match_at_least_one_word]
@@ -254,14 +264,31 @@ The competition was sorted in order of similarity to the needle.
 EOS
     end
+    if is_find_all_with_score
+      memo = []
+      similarities.each do |similarity|
+        if similarity.satisfy?(needle, threshold)
+          bs = similarity.best_score
+          memo << [similarity.wrapper2.record, bs.dices_coefficient_similar, bs.levenshtein_similar]
+        end
+      end
+      return memo
+    end
     if is_find_all
-      return similarities.map { |similarity| similarity.wrapper2.record }
+      memo = []
+      similarities.each do |similarity|
+        if similarity.satisfy?(needle, threshold)
+          memo << similarity.wrapper2.record
+        end
+      end
+      return memo
     end
     best_similarity = similarities.first
     winner = nil
-    if best_similarity and (best_similarity.best_score.dices_coefficient_similar > 0 or (needle.words & best_similarity.wrapper2.words).any?)
+    if best_similarity and best_similarity.satisfy?(needle, threshold)
       winner = best_similarity.wrapper2.record
       if gather_last_result
         last_result.winner = winner

data/lib/fuzzy_match/score.rb CHANGED

@@ -12,8 +12,9 @@ class FuzzyMatch
     end
     def <=>(other)
-      by_dices_coefficient = (dices_coefficient_similar <=> other.dices_coefficient_similar)
-      if by_dices_coefficient == 0
+      a = dices_coefficient_similar
+      b = other.dices_coefficient_similar
+      if a.nan? or b.nan? or (by_dices_coefficient = (a <=> b)) == 0
         levenshtein_similar <=> other.levenshtein_similar
       else
         by_dices_coefficient

data/lib/fuzzy_match/similarity.rb CHANGED

@@ -21,6 +21,12 @@ class FuzzyMatch
       @best_score ||= FuzzyMatch.score_class.new(best_wrapper1_variant, best_wrapper2_variant)
     end
+    def satisfy?(needle, threshold)
+      best_score.dices_coefficient_similar > threshold or
+        ((wrapper2.render.length < 3 or needle.render.length < 3) and best_score.levenshtein_similar > 0) or
+        (needle.words & wrapper2.words).any?
+    end
     def inspect
       %{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
     end

data/lib/fuzzy_match/version.rb CHANGED

@@ -1,3 +1,3 @@
 class FuzzyMatch
-  VERSION = '1.4.0'
+  VERSION = '1.4.1'
 end

data/test/helper.rb CHANGED

@@ -1,6 +1,6 @@
 require 'rubygems'
-require 'bundler'
-Bundler.setup
+require 'bundler/setup'
 require 'minitest/spec'
 require 'minitest/autorun'

data/test/test_fuzzy_match.rb CHANGED

@@ -26,6 +26,14 @@ describe FuzzyMatch do
     end
   end
+  describe '#find_all_with_score' do
+    it %{return records with 2 scores} do
+      d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :groupings => [ /X/, /Y/ ], :must_match_grouping => true
+      d.find_all_with_score('X').must_equal [ ['X', 1, 1], ['X22', 0, 0.33333333333333337] ]
+      d.find_all_with_score('A').must_equal []
+    end
+  end
   describe '#explain' do
     before do
       require 'stringio'
@@ -228,6 +236,21 @@ describe FuzzyMatch do
       # without making false positives
       d.find('Y bar').must_be_nil
     end
+    it %{finds possible matches even when pair distance fails} do
+      d = FuzzyMatch.new ['XX', '2 A']
+      d.find('2A').must_equal '2 A'
+      d = FuzzyMatch.new ['XX', '2A']
+      d.find('2 A').must_equal '2A'
+    end
+    it %{weird blow ups} do
+      d = FuzzyMatch.new ['XX', '2 A']
+      d.find('A').must_equal '2 A'
+      d = FuzzyMatch.new ['XX', 'A']
+      d.find('2 A').must_equal 'A'
+    end
   end
   describe 'deprecations' do

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fuzzy_match
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.4.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-09-07 00:00:00.000000000 Z
+date: 2013-01-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: to_regexp