redis-autosuggest 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, Northern Exposure, 3)
36
36
  # Increment an item's score
37
37
  Redis::Autosuggest.increment("North By Northwest", 1)
38
38
  ```
39
+ Fuzzy matching:
40
+ ```ruby
41
+ Redis::Autosuggest.fuzzy_match = true
42
+ Redis::Autosuggest.add("North By Northwest")
43
+ Redis::Autosuggest.suggest("nort byenorthwest")
44
+ # => ["north by northwest"]
45
+ ```
39
46
 
40
47
  ## Rails support
41
48
 
@@ -1,9 +1,12 @@
1
1
  require 'redis'
2
2
  require 'redis-namespace'
3
+ require 'levenshtein'
4
+ require 'text'
3
5
  require 'redis/autosuggest'
4
6
  require 'redis/autosuggest/config'
5
7
  require 'redis/autosuggest/file'
6
8
  require 'redis/autosuggest/version'
9
+ require 'redis/autosuggest/fuzzy'
7
10
 
8
11
  if defined?(Rails)
9
12
  require 'redis/autosuggest/rails/sources'
@@ -8,7 +8,11 @@ class Redis
8
8
  def add(*items)
9
9
  all_new_items = true
10
10
  items.each do |item|
11
- item = item.downcase
11
+ if item.size > @max_str_size
12
+ all_new_items = false
13
+ next
14
+ end
15
+ item = normalize(item)
12
16
  item_exists?(item) ? all_new_items = false : add_item(item)
13
17
  end
14
18
  all_new_items
@@ -20,6 +24,10 @@ class Redis
20
24
  def add_with_score(*fields)
21
25
  all_new_items = true
22
26
  fields.each_slice(2) do |f|
27
+ if f[0].size > @max_str_size
28
+ all_new_items = false
29
+ next
30
+ end
23
31
  f[0] = normalize(f[0])
24
32
  item_exists?(f[0]) ? all_new_items = false : add_item(*f)
25
33
  end
@@ -36,6 +44,7 @@ class Redis
36
44
  @db.hdel(@itemids, item)
37
45
  remove_substrings(item, id)
38
46
  @redis.zrem(@leaderboard, id) if @use_leaderboard
47
+ remove_fuzzy(item) if @fuzzy_match
39
48
  return true
40
49
  end
41
50
 
@@ -50,8 +59,15 @@ class Redis
50
59
 
51
60
  # Suggest items from the database that most closely match the queried string.
52
61
  # Returns an array of suggestion items (an empty array if nothing found).
62
+ # Fuzzy matching will only occur when both of these conditions are met:
63
+ # - Redis::Autosuggest.fuzzy_match == true
64
+ # - The simple suggestion method (matching substrings) yields no results
53
65
  def suggest(str, results=@max_results)
54
- suggestion_ids = @substrings.zrevrange(normalize(str), 0, results - 1)
66
+ str = normalize(str)
67
+ suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
68
+ if suggestion_ids.empty? && @fuzzy_match
69
+ return suggest_fuzzy(str, results)
70
+ end
55
71
  suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
56
72
  end
57
73
 
@@ -77,6 +93,10 @@ class Redis
77
93
  return @db.hmget(@itemids, normalize(item)).first
78
94
  end
79
95
 
96
+ def get_item(id)
97
+ return @db.hmget(@items, id).first
98
+ end
99
+
80
100
  private
81
101
 
82
102
  def normalize(item)
@@ -89,6 +109,7 @@ class Redis
89
109
  @db.hset(@itemids, item, id)
90
110
  add_substrings(item, score, id)
91
111
  @db.zadd(@leaderboard, score, id) if @use_leaderboard
112
+ add_fuzzy(item) if @fuzzy_match
92
113
  end
93
114
 
94
115
  # Yield each substring of a complete string
@@ -106,19 +127,19 @@ class Redis
106
127
  end
107
128
  end
108
129
  end
109
-
130
+
110
131
  # Add the id of an item to a substring
111
132
  def add_substring(sub, score, id)
112
133
  @substrings.zadd(sub, score, id)
113
134
  end
114
-
135
+
115
136
  # Add the id of an item to a substring only when the number of items that
116
137
  # substring stores is less than the config value of "max_per_substring".
117
138
  # If the substring set is already full, check to see if the item with the
118
139
  # lowest score in the substring set has a lower score than the item being added.
119
140
  # If yes, remove that item and add this item to the substring set.
120
141
  def add_substring_limit(sub, score, id)
121
- count = @substrings.zcount(sub, "-inf", "inf")
142
+ count = @substrings.zcount(sub, "-inf", "+inf")
122
143
  if count < @max_per_substring
123
144
  add_substring(sub, score, id)
124
145
  else
@@ -3,7 +3,7 @@ class Redis
3
3
 
4
4
  # Default Redis server at localhost:6379
5
5
  @redis = Redis.new
6
-
6
+
7
7
  # Main Redis namespace for this module
8
8
  @namespace = "suggest"
9
9
 
@@ -19,7 +19,7 @@ class Redis
19
19
  # have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
20
20
  # 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
21
21
  # Each sorted set would hold the id of the word 'ruby'
22
- @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
22
+ @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
23
23
 
24
24
  # max number of ids to store per substring.
25
25
  @max_per_substring = Float::INFINITY
@@ -27,6 +27,9 @@ class Redis
27
27
  # max number of results to return for an autosuggest query
28
28
  @max_results = 5
29
29
 
30
+ # max string size for an item
31
+ @max_str_size = Float::INFINITY
32
+
30
33
  # Key to a sorted set holding all id of items in the autosuggest database sorted
31
34
  # by their score
32
35
  @leaderboard = "lead"
@@ -41,16 +44,41 @@ class Redis
41
44
  # Stores the number of items the db has for each rails source
42
45
  @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
43
46
 
47
+ # Fuzzy matching
48
+ @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
49
+
50
+ # Whether or not to use fuzzy matching for autocompletions
51
+ @fuzzy_match = false
52
+
53
+ # The size of n-grams stored (fuzzy matching)
54
+ @ngram_size = 3
55
+
56
+ # Maximum number of items to be indexed per n-gram (fuzzy matching)
57
+ @ngram_item_limit = 200
58
+
44
59
  class << self
45
- attr_reader :redis
46
- attr_accessor :namespace, :db, :items, :substrings, :max_per_substring, :max_results,
47
- :leaderboard, :use_leaderboard, :rails_sources, :rails_source_sizes
60
+ attr_reader :redis, :namespace
61
+ attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
62
+ :max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
63
+ :rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
48
64
 
49
65
  def redis=(redis)
50
66
  @redis = redis
51
- @db = Redis::Namespace.new(@namespace, :redis => redis)
52
- @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => redis)
53
- @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => redis)
67
+ set_namespaces()
68
+ end
69
+
70
+ def namespace=(namespace)
71
+ @namespace = namespace
72
+ set_namespaces()
73
+ end
74
+
75
+ private
76
+
77
+ def set_namespaces
78
+ @db = Redis::Namespace.new(@namespace, :redis => @redis)
79
+ @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
80
+ @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
81
+ @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
54
82
  end
55
83
  end
56
84
  end
@@ -6,7 +6,10 @@ class Redis
6
6
  # Add items to the autosuggest database from a file.
7
7
  # Each line should be a string representing the item
8
8
  def add_from_file(file)
9
- add(*(File.open(file, "r").map { |l| l.strip }))
9
+ File.open(file, "r").each do |l|
10
+ puts "Adding #{l}"
11
+ add(l.strip)
12
+ end
10
13
  end
11
14
 
12
15
  # Add items and their scores to the autosuggest database from a file.
@@ -0,0 +1,96 @@
1
+ class Redis
2
+ module Autosuggest
3
+
4
+ class << self
5
+
6
+ # Add an item's n-grams to the redis db. The n-grams will be used
7
+ # as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
8
+ # is set to true.
9
+ def add_fuzzy(item)
10
+ yield_ngrams(item) do |ngram|
11
+ if @ngrams.scard(ngram).to_i <= @ngram_item_limit
12
+ @ngrams.sadd(ngram, "#{item}:#{compute_soundex_code(item)}")
13
+ end
14
+ end
15
+ end
16
+
17
+ # Remove an item's n-grams from the Redis db
18
+ def remove_fuzzy(item)
19
+ yield_ngrams(item) do |ngram|
20
+ @ngrams.srem(ngram, "#{item}:#{compute_soundex_code(item)}")
21
+ end
22
+ end
23
+
24
+ # Compute the soundex code of a string (only works for single words
25
+ # so we have to merge multi-word strings)
26
+ def compute_soundex_code(str)
27
+ return Text::Soundex.soundex(alphabet_only(str))
28
+ end
29
+
30
+ # Build a candidate pool for all suitable fuzzy matches for a string
31
+ # by taking the union of all items in the Redis db that share an n-gram
32
+ # with the string. Use levenshtein distance, soundex code similarity,
33
+ # and the number of matching 2-grams to compute a score for each candidate.
34
+ # Then return the highest-scoring candidates.
35
+ def suggest_fuzzy(str, results=@max_results)
36
+ str_mul = alphabet_only(str).size
37
+ str_soundex_code = compute_soundex_code(str)
38
+ str_2grams = ngram_list(str, 2)
39
+ candidates = []
40
+
41
+ @ngrams.sunion(*ngram_list(str)).each do |candidate|
42
+ candidate = candidate.split(":")
43
+ candidate_str = candidate[0]
44
+ candidate_soundex_code = candidate[1]
45
+ candidate_score = 1.0
46
+
47
+ # Levenshtein distance
48
+ lev_dist = Levenshtein.distance(str, candidate_str)
49
+ candidate_score *= Math.exp([str_mul - lev_dist, 1].max)
50
+
51
+ # Soundex
52
+ if str_soundex_code == candidate_soundex_code
53
+ candidate_score *= str_mul
54
+ elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
55
+ candidate_score *= (str_mul / 2).ceil
56
+ end
57
+
58
+ # Compute n-grams of size 2 shared between the two strings
59
+ same_2grams = str_2grams & ngram_list(candidate_str, 2)
60
+ candidate_score *= Math.exp(same_2grams.size)
61
+
62
+ candidates << [candidate_str, candidate_score] if candidate_score > 1
63
+ end
64
+ # Sort results by score and return the highest scoring candidates
65
+ candidates = candidates.sort { |a, b| b[1] <=> a[1] }
66
+ # puts candidates.take(10).map { |tuple| "#{tuple[0]} => #{tuple[1]}" }
67
+ return candidates.take(results).map { |a| a[0] }
68
+ end
69
+
70
+ # Yield the n-grams of a specified size for a string one at a time
71
+ def yield_ngrams(str, ngram_size=@ngram_size)
72
+ ngram_list = ngram_list(str, ngram_size)
73
+ ngram_list.each { |ngram| yield ngram }
74
+ end
75
+
76
+ # Returns a list containing all of the n-grams of a specified size
77
+ # of a string. The list is ordered by the position of the n-gram
78
+ # in the string (duplicates included).
79
+ def ngram_list(str, ngram_size=@ngram_size)
80
+ str = alphabet_only(str).split("")
81
+ ngram_list = []
82
+ (0..str.size - ngram_size).each do |i|
83
+ ngram = ""
84
+ (0...ngram_size).each { |j| ngram << str[i + j] }
85
+ ngram_list << ngram
86
+ end
87
+ ngram_list
88
+ end
89
+
90
+ # Remove all characters not in the range 'a-z' from a string
91
+ def alphabet_only(str)
92
+ return str.gsub(/[^abcdefghijklmnopqrstuvwxyz]/, '')
93
+ end
94
+ end
95
+ end
96
+ end
@@ -1,5 +1,5 @@
1
1
  class Redis
2
2
  module Autosuggest
3
- VERSION = "0.2.0"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
20
20
 
21
21
  gem.add_dependency("redis", "~> 3.0.2")
22
22
  gem.add_dependency("redis-namespace", "~> 1.2.1")
23
+ gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
24
+ gem.add_dependency("text", "~> 1.2.1")
23
25
 
24
26
  gem.add_development_dependency("minitest", "~> 4.3.3")
25
27
  end
@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
106
106
  end
107
107
 
108
108
  def test_adding_with_substring_limit
109
+ saved_limit = Redis::Autosuggest.max_per_substring
109
110
  Redis::Autosuggest.max_per_substring = 1
110
111
  Redis::Autosuggest.add_with_score(@str1, 1)
111
112
  Redis::Autosuggest.add_with_score("Test", 5)
112
113
  item_id = Redis::Autosuggest.get_id("Test")
113
114
  assert_equal [item_id], @subs.zrevrange("test", 0, -1)
115
+ Redis::Autosuggest.max_per_substring = saved_limit
116
+ end
117
+
118
+ def test_suggesting_items_fuzzy
119
+ Redis::Autosuggest.fuzzy_match = true
120
+ str = "north by northwest"
121
+ Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
122
+ assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
123
+ assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
124
+ assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
125
+ assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
126
+ assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
127
+ Redis::Autosuggest.fuzzy_match = false
114
128
  end
115
129
 
116
130
  MiniTest::Unit.after_tests { self.unused_db.flushdb }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redis-autosuggest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-18 00:00:00.000000000 Z
12
+ date: 2013-01-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -43,6 +43,38 @@ dependencies:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
45
  version: 1.2.1
46
+ - !ruby/object:Gem::Dependency
47
+ name: levenshtein-ffi
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 1.0.3
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.3
62
+ - !ruby/object:Gem::Dependency
63
+ name: text
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 1.2.1
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 1.2.1
46
78
  - !ruby/object:Gem::Dependency
47
79
  name: minitest
48
80
  requirement: !ruby/object:Gem::Requirement
@@ -76,6 +108,7 @@ files:
76
108
  - lib/redis/autosuggest.rb
77
109
  - lib/redis/autosuggest/config.rb
78
110
  - lib/redis/autosuggest/file.rb
111
+ - lib/redis/autosuggest/fuzzy.rb
79
112
  - lib/redis/autosuggest/rails/railtie.rb
80
113
  - lib/redis/autosuggest/rails/rake_tasks.rb
81
114
  - lib/redis/autosuggest/rails/sources.rb