RubyGems - redis-autosuggest - Versions diffs - 0.2.0 → 0.3.0 - Mend

redis-autosuggest 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/README.md +7 -0
data/lib/redis-autosuggest.rb +3 -0
data/lib/redis/autosuggest.rb +26 -5
data/lib/redis/autosuggest/config.rb +36 -8
data/lib/redis/autosuggest/file.rb +4 -1
data/lib/redis/autosuggest/fuzzy.rb +96 -0
data/lib/redis/autosuggest/version.rb +1 -1
data/redis-autosuggest.gemspec +2 -0
data/test/autosuggest_test.rb +14 -0
metadata +35 -2

data/README.md CHANGED Viewed

@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, "Northern Exposure", 3)
 # Increment an item's score
 Redis::Autosuggest.increment("North By Northwest", 1)
 ```
+Fuzzy matching:
+```ruby
+Redis::Autosuggest.fuzzy_match = true
+Redis::Autosuggest.add("North By Northwest")
+Redis::Autosuggest.suggest("nort byenorthwest")
+# => ["north by northwest"]
+```
 ## Rails support

data/lib/redis-autosuggest.rb CHANGED Viewed

@@ -1,9 +1,12 @@
 require 'redis'
 require 'redis-namespace'
+require 'levenshtein'
+require 'text'
 require 'redis/autosuggest'
 require 'redis/autosuggest/config'
 require 'redis/autosuggest/file'
 require 'redis/autosuggest/version'
+require 'redis/autosuggest/fuzzy'
 if defined?(Rails)
   require 'redis/autosuggest/rails/sources'

data/lib/redis/autosuggest.rb CHANGED Viewed

@@ -8,7 +8,11 @@ class Redis
       def add(*items)
         all_new_items = true
         items.each do |item|
-          item = item.downcase
+          if item.size > @max_str_size
+            all_new_items = false
+            next
+          end
+          item = normalize(item)
           item_exists?(item) ? all_new_items = false : add_item(item)
         end
         all_new_items
@@ -20,6 +24,10 @@ class Redis
       def add_with_score(*fields)
         all_new_items = true
         fields.each_slice(2) do |f|
+          if f[0].size > @max_str_size
+            all_new_items = false
+            next
+          end
           f[0] = normalize(f[0])
           item_exists?(f[0]) ? all_new_items = false : add_item(*f)
         end
@@ -36,6 +44,7 @@ class Redis
         @db.hdel(@itemids, item)
         remove_substrings(item, id)
         @redis.zrem(@leaderboard, id) if @use_leaderboard
+        remove_fuzzy(item) if @fuzzy_match
         return true
       end
@@ -50,8 +59,15 @@ class Redis
       # Suggest items from the database that most closely match the queried string.
       # Returns an array of suggestion items (an empty array if nothing found).
+      # Fuzzy matching will only occur when both of these conditions are met:
+      #   - Redis::Autosuggest.fuzzy_match == true
+      #   - The simple suggestion method (matching substrings) yields no results
       def suggest(str, results=@max_results)
-        suggestion_ids = @substrings.zrevrange(normalize(str), 0, results - 1)
+        str = normalize(str)
+        suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
+        if suggestion_ids.empty? && @fuzzy_match
+          return suggest_fuzzy(str, results)
+        end
         suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
       end
@@ -77,6 +93,10 @@ class Redis
         return @db.hmget(@itemids, normalize(item)).first
       end
+      def get_item(id)
+        return @db.hmget(@items, id).first
+      end
       private
       def normalize(item)
@@ -89,6 +109,7 @@ class Redis
         @db.hset(@itemids, item, id)
         add_substrings(item, score, id)
         @db.zadd(@leaderboard, score, id) if @use_leaderboard
+        add_fuzzy(item) if @fuzzy_match
       end
       # Yield each substring of a complete string
@@ -106,19 +127,19 @@ class Redis
           end
         end
       end
       # Add the id of an item to a substring
       def add_substring(sub, score, id)
         @substrings.zadd(sub, score, id)
       end
       # Add the id of an item to a substring only when the number of items that
       # substring stores is less than the config value of "max_per_substring".
       # If the substring set is already full, check to see if the item with the
       # lowest score in the substring set has a lower score than the item being added.
       # If yes, remove that item and add this item to the substring set.
       def add_substring_limit(sub, score, id)
-        count = @substrings.zcount(sub, "-inf", "inf")
+        count = @substrings.zcount(sub, "-inf", "+inf")
         if count < @max_per_substring
           add_substring(sub, score, id)
         else

data/lib/redis/autosuggest/config.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class Redis
     # Default Redis server at localhost:6379
     @redis = Redis.new
     # Main Redis namespace for this module
     @namespace = "suggest"
@@ -19,7 +19,7 @@ class Redis
     # have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
     # 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
     # Each sorted set would store the id of the word 'ruby'
-    @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
+    @substrings =  Redis::Namespace.new("#{@namespace}:su", :redis => @redis)
     # max number of ids to store per substring.
     @max_per_substring = Float::INFINITY
@@ -27,6 +27,9 @@ class Redis
     # max number of results to return for an autosuggest query
     @max_results = 5
+    # max string size for an item
+    @max_str_size = Float::INFINITY
     # Key to a sorted set holding the ids of all items in the autosuggest database sorted
     # by their score
     @leaderboard = "lead"
@@ -41,16 +44,41 @@ class Redis
     # Stores the number of items the db has for each rails source
     @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
+    # Fuzzy matching
+    @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
+    # Whether or not to use fuzzy matching for autocompletions
+    @fuzzy_match = false
+    # The size of n-grams stored (fuzzy matching)
+    @ngram_size = 3
+    # Maximum number of items to be indexed per n-gram (fuzzy matching)
+    @ngram_item_limit = 200
     class << self
-      attr_reader :redis
-      attr_accessor :namespace, :db, :items, :substrings, :max_per_substring, :max_results,
-        :leaderboard, :use_leaderboard, :rails_sources, :rails_source_sizes
+      attr_reader :redis, :namespace
+      attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
+        :max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
+        :rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
       def redis=(redis)
         @redis = redis
-        @db = Redis::Namespace.new(@namespace, :redis => redis)
-        @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => redis)
-        @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => redis)
+        set_namespaces()
+      end
+      def namespace=(namespace)
+        @namespace = namespace
+        set_namespaces()
+      end
+      private
+      def set_namespaces
+        @db = Redis::Namespace.new(@namespace, :redis => @redis)
+        @substrings =  Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
+        @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
+        @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
       end
     end
   end

data/lib/redis/autosuggest/file.rb CHANGED Viewed

@@ -6,7 +6,10 @@ class Redis
       # Add items to the autosuggest database from a file.
       # Each line should be a string representing the item
       def add_from_file(file)
-        add(*(File.open(file, "r").map { |l| l.strip }))
+        File.open(file, "r").each do |l|
+          puts "Adding #{l}"
+          add(l.strip)
+        end
       end
       # Add items and their scores to the autosuggest database from a file.

data/lib/redis/autosuggest/fuzzy.rb ADDED Viewed

@@ -0,0 +1,96 @@
+class Redis
+  module Autosuggest
+    class << self
+      # Add an item's n-grams to the redis db. The n-grams will be used
+      # as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
+      # is set to true.
+      def add_fuzzy(item)
+        yield_ngrams(item) do |ngram|
+          if @ngrams.scard(ngram).to_i <= @ngram_item_limit
+            @ngrams.sadd(ngram, "#{item}:#{compute_soundex_code(item)}")
+          end
+        end
+      end
+      # Remove an item's n-grams from the Redis db
+      def remove_fuzzy(item)
+        yield_ngrams(item) do |ngram|
+          @ngrams.srem(ngram, "#{item}:#{compute_soundex_code(item)}")
+        end
+      end
+      # Compute the soundex code of a string (only works for single words
+      # so we have to merge multi-word strings)
+      def compute_soundex_code(str)
+        return Text::Soundex.soundex(alphabet_only(str))
+      end
+      # Build a candidate pool for all suitable fuzzy matches for a string
+      # by taking the union of all items in the Redis db that share an n-gram
+      # with the string. Use levenshtein distance, soundex code similarity,
+      # and the number of matching 2-grams to compute a score for each candidate.
+      # Then return the highest-scoring candidates.
+      def suggest_fuzzy(str, results=@max_results)
+        str_mul = alphabet_only(str).size
+        str_soundex_code = compute_soundex_code(str)
+        str_2grams = ngram_list(str, 2)
+        candidates = []
+        @ngrams.sunion(*ngram_list(str)).each do |candidate|
+          candidate = candidate.split(":")
+          candidate_str = candidate[0]
+          candidate_soundex_code = candidate[1]
+          candidate_score = 1.0
+          # Levenshtein distance
+          lev_dist = Levenshtein.distance(str, candidate_str)
+          candidate_score *= Math.exp([str_mul - lev_dist, 1].max)
+          # Soundex
+          if str_soundex_code == candidate_soundex_code
+            candidate_score *= str_mul
+          elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
+            candidate_score *= (str_mul / 2).ceil
+          end
+          # Compute n-grams of size 2 shared between the two strings
+          same_2grams = str_2grams & ngram_list(candidate_str, 2)
+          candidate_score *= Math.exp(same_2grams.size)
+          candidates << [candidate_str, candidate_score] if candidate_score > 1
+        end
+        # Sort results by score and return the highest scoring candidates
+        candidates = candidates.sort { |a, b| b[1] <=> a[1] }
+        # puts candidates.take(10).map { |tuple| "#{tuple[0]} => #{tuple[1]}" }
+        return candidates.take(results).map { |a| a[0] }
+      end
+      # Yield the n-grams of a specified size for a string one at a time
+      def yield_ngrams(str, ngram_size=@ngram_size)
+        ngram_list = ngram_list(str, ngram_size)
+        ngram_list.each { |ngram| yield ngram }
+      end
+      # Returns a list containing all of the n-grams of a specified size
+      # of a string.  The list is ordered by the position of the n-gram
+      # in the string (duplicates included).
+      def ngram_list(str, ngram_size=@ngram_size)
+        str = alphabet_only(str).split("")
+        ngram_list = []
+        (0..str.size - ngram_size).each do |i|
+          ngram = ""
+          (0...ngram_size).each { |j| ngram << str[i + j] }
+          ngram_list << ngram
+        end
+        ngram_list
+      end
+      # Remove all characters not in the range 'a-z' from a string
+      def alphabet_only(str)
+        return str.gsub(/[^abcdefghijklmnopqrstuvwxyz]/, '')
+      end
+    end
+  end
+end

data/lib/redis/autosuggest/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 class Redis
   module Autosuggest
-    VERSION = "0.2.0"
+    VERSION = "0.3.0"
   end
 end

data/redis-autosuggest.gemspec CHANGED Viewed

@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
   gem.add_dependency("redis", "~> 3.0.2")
   gem.add_dependency("redis-namespace", "~> 1.2.1")
+  gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
+  gem.add_dependency("text", "~> 1.2.1")
   gem.add_development_dependency("minitest", "~> 4.3.3")
 end

data/test/autosuggest_test.rb CHANGED Viewed

@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
   end
   def test_adding_with_substring_limit
+    saved_limit = Redis::Autosuggest.max_per_substring
     Redis::Autosuggest.max_per_substring = 1
     Redis::Autosuggest.add_with_score(@str1, 1)
     Redis::Autosuggest.add_with_score("Test", 5)
     item_id = Redis::Autosuggest.get_id("Test")
     assert_equal [item_id], @subs.zrevrange("test", 0, -1)
+    Redis::Autosuggest.max_per_substring = saved_limit
+  end
+  def test_suggesting_items_fuzzy
+    Redis::Autosuggest.fuzzy_match = true
+    str = "north by northwest"
+    Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
+    assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
+    assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
+    assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
+    Redis::Autosuggest.fuzzy_match = false
   end
   MiniTest::Unit.after_tests { self.unused_db.flushdb }

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: redis-autosuggest
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-18 00:00:00.000000000 Z
+date: 2013-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.2.1
+- !ruby/object:Gem::Dependency
+  name: levenshtein-ffi
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.3
+- !ruby/object:Gem::Dependency
+  name: text
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.1
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
@@ -76,6 +108,7 @@ files:
 - lib/redis/autosuggest.rb
 - lib/redis/autosuggest/config.rb
 - lib/redis/autosuggest/file.rb
+- lib/redis/autosuggest/fuzzy.rb
 - lib/redis/autosuggest/rails/railtie.rb
 - lib/redis/autosuggest/rails/rake_tasks.rb
 - lib/redis/autosuggest/rails/sources.rb