RubyGems - redis-autosuggest - Versions diffs - 0.2.0 → 0.3.0 - Mend

redis-autosuggest 0.2.0 → 0.3.0

Files changed (10) hide show

data/README.md +7 -0
data/lib/redis-autosuggest.rb +3 -0
data/lib/redis/autosuggest.rb +26 -5
data/lib/redis/autosuggest/config.rb +36 -8
data/lib/redis/autosuggest/file.rb +4 -1
data/lib/redis/autosuggest/fuzzy.rb +96 -0
data/lib/redis/autosuggest/version.rb +1 -1
data/redis-autosuggest.gemspec +2 -0
data/test/autosuggest_test.rb +14 -0
metadata +35 -2

data/README.md CHANGED Viewed

@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, "Northern Exposure", 3)
 # Increment an item's score
 Redis::Autosuggest.increment("North By Northwest", 1)
 ```
+Fuzzy matching:
+```ruby
+Redis::Autosuggest.fuzzy_match = true
+Redis::Autosuggest.add("North By Northwest")
+Redis::Autosuggest.suggest("nort byenorthwest")
+# => ["north by northwest"]
+```
 ## Rails support

data/lib/redis-autosuggest.rb CHANGED Viewed

@@ -1,9 +1,12 @@
 require 'redis'
 require 'redis-namespace'
+require 'levenshtein'
+require 'text'
 require 'redis/autosuggest'
 require 'redis/autosuggest/config'
 require 'redis/autosuggest/file'
 require 'redis/autosuggest/version'
+require 'redis/autosuggest/fuzzy'
 if defined?(Rails)
   require 'redis/autosuggest/rails/sources'

data/lib/redis/autosuggest.rb CHANGED Viewed

@@ -8,7 +8,11 @@ class Redis
       def add(*items)
         all_new_items = true
         items.each do |item|
-          item = item.downcase
+          if item.size > @max_str_size
+            all_new_items = false
+            next
+          end
+          item = normalize(item)
           item_exists?(item) ? all_new_items = false : add_item(item)
         end
         all_new_items
@@ -20,6 +24,10 @@ class Redis
       def add_with_score(*fields)
         all_new_items = true
         fields.each_slice(2) do |f|
+          if f[0].size > @max_str_size
+            all_new_items = false
+            next
+          end
           f[0] = normalize(f[0])
           item_exists?(f[0]) ? all_new_items = false : add_item(*f)
         end
@@ -36,6 +44,7 @@ class Redis
         @db.hdel(@itemids, item)
         remove_substrings(item, id)
         @redis.zrem(@leaderboard, id) if @use_leaderboard
+        remove_fuzzy(item) if @fuzzy_match
         return true
       end
@@ -50,8 +59,15 @@ class Redis
       # Suggest items from the database that most closely match the queried string.
       # Returns an array of suggestion items (an empty array if nothing found).
+      # Fuzzy matching will only occur when both of these conditions are met:
+      #   - Redis::Autosuggest.fuzzy_match == true
+      #   - The simple suggestion method (matching substrings) yields no results
       def suggest(str, results=@max_results)
-        suggestion_ids = @substrings.zrevrange(normalize(str), 0, results - 1)
+        str = normalize(str)
+        suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
+        if suggestion_ids.empty? && @fuzzy_match
+          return suggest_fuzzy(str, results)
+        end
         suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
       end
@@ -77,6 +93,10 @@ class Redis
         return @db.hmget(@itemids, normalize(item)).first
       end
+      def get_item(id)
+        return @db.hmget(@items, id).first
+      end
       private
       def normalize(item)
@@ -89,6 +109,7 @@ class Redis
         @db.hset(@itemids, item, id)
         add_substrings(item, score, id)
         @db.zadd(@leaderboard, score, id) if @use_leaderboard
+        add_fuzzy(item) if @fuzzy_match
       end
       # Yield each substring of a complete string
@@ -106,19 +127,19 @@ class Redis
           end
         end
       end
       # Add the id of an item to a substring
       def add_substring(sub, score, id)
         @substrings.zadd(sub, score, id)
       end
       # Add the id of an item to a substring only when the number of items that
       # substring stores is less than the config value of "max_per_substring".
       # If the substring set is already full, check to see if the item with the
       # lowest score in the substring set has a lower score than the item being added.
       # If yes, remove that item and add this item to the substring set.
       def add_substring_limit(sub, score, id)
-        count = @substrings.zcount(sub, "-inf", "inf")
+        count = @substrings.zcount(sub, "-inf", "+inf")
         if count < @max_per_substring
           add_substring(sub, score, id)
         else

data/lib/redis/autosuggest/config.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class Redis
     # Default Redis server at localhost:6379
     @redis = Redis.new
     # Main Redis namespace for this module
     @namespace = "suggest"
@@ -19,7 +19,7 @@ class Redis
     # have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
     # 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
     # Each sorted set would hold the id of the word 'ruby'
-    @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
+    @substrings =  Redis::Namespace.new("#{@namespace}:su", :redis => @redis)
     # max number of ids to store per substring.
     @max_per_substring = Float::INFINITY
@@ -27,6 +27,9 @@ class Redis
     # max number of results to return for an autosuggest query
     @max_results = 5
+    # max string size for an item
+    @max_str_size = Float::INFINITY
     # Key to a sorted set holding the ids of all items in the autosuggest database, sorted
     # by their score
     @leaderboard = "lead"
@@ -41,16 +44,41 @@ class Redis
     # Stores the number of items the db has for each rails source
     @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
+    # Fuzzy matching
+    @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
+    # Whether or not to use fuzzy matching for autocompletions
+    @fuzzy_match = false
+    # The size of n-grams stored (fuzzy matching)
+    @ngram_size = 3
+    # Maximum number of items to be indexed per n-gram (fuzzy matching)
+    @ngram_item_limit = 200
     class << self
-      attr_reader :redis
-      attr_accessor :namespace, :db, :items, :substrings, :max_per_substring, :max_results,
-        :leaderboard, :use_leaderboard, :rails_sources, :rails_source_sizes
+      attr_reader :redis, :namespace
+      attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
+        :max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
+        :rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
       def redis=(redis)
         @redis = redis
-        @db = Redis::Namespace.new(@namespace, :redis => redis)
-        @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => redis)
-        @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => redis)
+        set_namespaces()
+      end
+      def namespace=(namespace)
+        @namespace = namespace
+        set_namespaces()
+      end
+      private
+      def set_namespaces
+        @db = Redis::Namespace.new(@namespace, :redis => @redis)
+        @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
+        @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
+        @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
       end
     end
   end

data/lib/redis/autosuggest/file.rb CHANGED Viewed

@@ -6,7 +6,10 @@ class Redis
       # Add items to the autosuggest database from a file.
       # Each line should be a string representing an item
       def add_from_file(file)
-        add(*(File.open(file, "r").map { |l| l.strip }))
+        File.open(file, "r").each do |l|
+          puts "Adding #{l}"
+          add(l.strip)
+        end
       end
       # Add items and their scores to the autosuggest database from a file.

data/lib/redis/autosuggest/fuzzy.rb ADDED Viewed

@@ -0,0 +1,96 @@
+class Redis
+  module Autosuggest
+    class << self
+      # Add an item's n-grams to the redis db. The n-grams will be used
+      # as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
+      # is set to true.
+      def add_fuzzy(item)
+        yield_ngrams(item) do |ngram|
+          if @ngrams.scard(ngram).to_i <= @ngram_item_limit
+            @ngrams.sadd(ngram, "#{item}:#{compute_soundex_code(item)}")
+          end
+        end
+      end
+      # Remove an item's n-grams from the Redis db
+      def remove_fuzzy(item)
+        yield_ngrams(item) do |ngram|
+          @ngrams.srem(ngram, "#{item}:#{compute_soundex_code(item)}")
+        end
+      end
+      # Compute the soundex code of a string (only works for single words
+      # so we have to merge multi-word strings)
+      def compute_soundex_code(str)
+        return Text::Soundex.soundex(alphabet_only(str))
+      end
+      # Build a candidate pool for all suitable fuzzy matches for a string
+      # by taking the union of all items in the Redis db that share an n-gram
+      # with the string. Use levenshtein distance, soundex code similarity,
+      # and the number of matching 2-grams to compute a score for each candidate.
+      # Then return the highest-scoring candidates.
+      def suggest_fuzzy(str, results=@max_results)
+        str_mul = alphabet_only(str).size
+        str_soundex_code = compute_soundex_code(str)
+        str_2grams = ngram_list(str, 2)
+        candidates = []
+        @ngrams.sunion(*ngram_list(str)).each do |candidate|
+          candidate = candidate.split(":")
+          candidate_str = candidate[0]
+          candidate_soundex_code = candidate[1]
+          candidate_score = 1.0
+          # Levenshtein distance
+          lev_dist = Levenshtein.distance(str, candidate_str)
+          candidate_score *= Math.exp([str_mul - lev_dist, 1].max)
+          # Soundex
+          if str_soundex_code == candidate_soundex_code
+            candidate_score *= str_mul
+          elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
+            candidate_score *= (str_mul / 2).ceil
+          end
+          # Compute n-grams of size 2 shared between the two strings
+          same_2grams = str_2grams & ngram_list(candidate_str, 2)
+          candidate_score *= Math.exp(same_2grams.size)
+          candidates << [candidate_str, candidate_score] if candidate_score > 1
+        end
+        # Sort results by score and return the highest scoring candidates
+        candidates = candidates.sort { |a, b| b[1] <=> a[1] }
+        # puts candidates.take(10).map { |tuple| "#{tuple[0]} => #{tuple[1]}" }
+        return candidates.take(results).map { |a| a[0] }
+      end
+      # Yield the n-grams of a specified size for a string one at a time
+      def yield_ngrams(str, ngram_size=@ngram_size)
+        ngram_list = ngram_list(str, ngram_size)
+        ngram_list.each { |ngram| yield ngram }
+      end
+      # Returns a list containing all of the n-grams of a specified size
+      # of a string.  The list is ordered by the position of the n-gram
+      # in the string (duplicates included).
+      def ngram_list(str, ngram_size=@ngram_size)
+        str = alphabet_only(str).split("")
+        ngram_list = []
+        (0..str.size - ngram_size).each do |i|
+          ngram = ""
+          (0...ngram_size).each { |j| ngram << str[i + j] }
+          ngram_list << ngram
+        end
+        ngram_list
+      end
+      # Remove all characters not in the range 'a-z' from a string
+      def alphabet_only(str)
+        return str.gsub(/[^abcdefghijklmnopqrstuvwxyz]/, '')
+      end
+    end
+  end
+end

data/lib/redis/autosuggest/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 class Redis
   module Autosuggest
-    VERSION = "0.2.0"
+    VERSION = "0.3.0"
   end
 end

data/redis-autosuggest.gemspec CHANGED Viewed

@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
   gem.add_dependency("redis", "~> 3.0.2")
   gem.add_dependency("redis-namespace", "~> 1.2.1")
+  gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
+  gem.add_dependency("text", "~> 1.2.1")
   gem.add_development_dependency("minitest", "~> 4.3.3")
 end

data/test/autosuggest_test.rb CHANGED Viewed

@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
   end
   def test_adding_with_substring_limit
+    saved_limit = Redis::Autosuggest.max_per_substring
     Redis::Autosuggest.max_per_substring = 1
     Redis::Autosuggest.add_with_score(@str1, 1)
     Redis::Autosuggest.add_with_score("Test", 5)
     item_id = Redis::Autosuggest.get_id("Test")
     assert_equal [item_id], @subs.zrevrange("test", 0, -1)
+    Redis::Autosuggest.max_per_substring = saved_limit
+  end
+  def test_suggesting_items_fuzzy
+    Redis::Autosuggest.fuzzy_match = true
+    str = "north by northwest"
+    Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
+    assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
+    assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
+    Redis::Autosuggest.fuzzy_match = false
   end
   MiniTest::Unit.after_tests { self.unused_db.flushdb }

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: redis-autosuggest
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-18 00:00:00.000000000 Z
+date: 2013-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.2.1
+- !ruby/object:Gem::Dependency
+  name: levenshtein-ffi
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.3
+- !ruby/object:Gem::Dependency
+  name: text
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.1
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
@@ -76,6 +108,7 @@ files:
 - lib/redis/autosuggest.rb
 - lib/redis/autosuggest/config.rb
 - lib/redis/autosuggest/file.rb
+- lib/redis/autosuggest/fuzzy.rb
 - lib/redis/autosuggest/rails/railtie.rb
 - lib/redis/autosuggest/rails/rake_tasks.rb
 - lib/redis/autosuggest/rails/sources.rb