redis-autosuggest 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, Northern Exposure, 3)
36
36
  # Increment an item's score
37
37
  Redis::Autosuggest.increment("North By Northwest", 1)
38
38
  ```
39
+ Fuzzy matching:
40
+ ```ruby
41
+ Redis::Autosuggest.fuzzy_match = true
42
+ Redis::Autosuggest.add("North By Northwest")
43
+ Redis::Autosuggest.suggest("nort byenorthwest")
44
+ # => ["north by northwest"]
45
+ ```
39
46
 
40
47
  ## Rails support
41
48
 
@@ -1,9 +1,12 @@
1
1
  require 'redis'
2
2
  require 'redis-namespace'
3
+ require 'levenshtein'
4
+ require 'text'
3
5
  require 'redis/autosuggest'
4
6
  require 'redis/autosuggest/config'
5
7
  require 'redis/autosuggest/file'
6
8
  require 'redis/autosuggest/version'
9
+ require 'redis/autosuggest/fuzzy'
7
10
 
8
11
  if defined?(Rails)
9
12
  require 'redis/autosuggest/rails/sources'
@@ -8,7 +8,11 @@ class Redis
8
8
  def add(*items)
9
9
  all_new_items = true
10
10
  items.each do |item|
11
- item = item.downcase
11
+ if item.size > @max_str_size
12
+ all_new_items = false
13
+ next
14
+ end
15
+ item = normalize(item)
12
16
  item_exists?(item) ? all_new_items = false : add_item(item)
13
17
  end
14
18
  all_new_items
@@ -20,6 +24,10 @@ class Redis
20
24
  def add_with_score(*fields)
21
25
  all_new_items = true
22
26
  fields.each_slice(2) do |f|
27
+ if f[0].size > @max_str_size
28
+ all_new_items = false
29
+ next
30
+ end
23
31
  f[0] = normalize(f[0])
24
32
  item_exists?(f[0]) ? all_new_items = false : add_item(*f)
25
33
  end
@@ -36,6 +44,7 @@ class Redis
36
44
  @db.hdel(@itemids, item)
37
45
  remove_substrings(item, id)
38
46
  @redis.zrem(@leaderboard, id) if @use_leaderboard
47
+ remove_fuzzy(item) if @fuzzy_match
39
48
  return true
40
49
  end
41
50
 
@@ -50,8 +59,15 @@ class Redis
50
59
 
51
60
  # Suggest items from the database that most closely match the queried string.
52
61
  # Returns an array of suggestion items (an empty array if nothing found).
62
+ # Fuzzy matching will only occur when both of these conditions are met:
63
+ # - Redis::Autosuggest.fuzzy_match == true
64
+ # - The simple suggestion method (matching substrings) yields no results
53
65
  def suggest(str, results=@max_results)
54
- suggestion_ids = @substrings.zrevrange(normalize(str), 0, results - 1)
66
+ str = normalize(str)
67
+ suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
68
+ if suggestion_ids.empty? && @fuzzy_match
69
+ return suggest_fuzzy(str, results)
70
+ end
55
71
  suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
56
72
  end
57
73
 
@@ -77,6 +93,10 @@ class Redis
77
93
  return @db.hmget(@itemids, normalize(item)).first
78
94
  end
79
95
 
96
+ def get_item(id)
97
+ return @db.hmget(@items, id).first
98
+ end
99
+
80
100
  private
81
101
 
82
102
  def normalize(item)
@@ -89,6 +109,7 @@ class Redis
89
109
  @db.hset(@itemids, item, id)
90
110
  add_substrings(item, score, id)
91
111
  @db.zadd(@leaderboard, score, id) if @use_leaderboard
112
+ add_fuzzy(item) if @fuzzy_match
92
113
  end
93
114
 
94
115
  # Yield each substring of a complete string
@@ -106,19 +127,19 @@ class Redis
106
127
  end
107
128
  end
108
129
  end
109
-
130
+
110
131
  # Add the id of an item to a substring
111
132
  def add_substring(sub, score, id)
112
133
  @substrings.zadd(sub, score, id)
113
134
  end
114
-
135
+
115
136
  # Add the id of an item to a substring only when the number of items that
116
137
  # substring stores is less than the config value of "max_per_substring".
117
138
  # If the substring set is already full, check to see if the item with the
118
139
  # lowest score in the substring set has a lower score than the item being added.
119
140
  # If yes, remove that item and add this item to the substring set.
120
141
  def add_substring_limit(sub, score, id)
121
- count = @substrings.zcount(sub, "-inf", "inf")
142
+ count = @substrings.zcount(sub, "-inf", "+inf")
122
143
  if count < @max_per_substring
123
144
  add_substring(sub, score, id)
124
145
  else
@@ -3,7 +3,7 @@ class Redis
3
3
 
4
4
  # Default Redis server at localhost:6379
5
5
  @redis = Redis.new
6
-
6
+
7
7
  # Main Redis namespace for this module
8
8
  @namespace = "suggest"
9
9
 
@@ -19,7 +19,7 @@ class Redis
19
19
  # have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
20
20
  # 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
21
21
  # Each sorted set would hold the id of the word 'ruby'
22
- @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
22
+ @substrings = Redis::Namespace.new("#{@namespace}:su", :redis => @redis)
23
23
 
24
24
  # max number of ids to store per substring.
25
25
  @max_per_substring = Float::INFINITY
@@ -27,6 +27,9 @@ class Redis
27
27
  # max number of results to return for an autosuggest query
28
28
  @max_results = 5
29
29
 
30
+ # max string size for an item
31
+ @max_str_size = Float::INFINITY
32
+
30
33
  # Key to a sorted set holding the ids of all items in the autosuggest database sorted
31
34
  # by their score
32
35
  @leaderboard = "lead"
@@ -41,16 +44,41 @@ class Redis
41
44
  # Stores the number of items the db has for each rails source
42
45
  @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
43
46
 
47
+ # Fuzzy matching
48
+ @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
49
+
50
+ # Whether or not to use fuzzy matching for autocompletions
51
+ @fuzzy_match = false
52
+
53
+ # The size of n-grams stored (fuzzy matching)
54
+ @ngram_size = 3
55
+
56
+ # Maximum number of items to be indexed per n-gram (fuzzy matching)
57
+ @ngram_item_limit = 200
58
+
44
59
  class << self
45
- attr_reader :redis
46
- attr_accessor :namespace, :db, :items, :substrings, :max_per_substring, :max_results,
47
- :leaderboard, :use_leaderboard, :rails_sources, :rails_source_sizes
60
+ attr_reader :redis, :namespace
61
+ attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
62
+ :max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
63
+ :rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
48
64
 
49
65
  def redis=(redis)
50
66
  @redis = redis
51
- @db = Redis::Namespace.new(@namespace, :redis => redis)
52
- @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => redis)
53
- @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => redis)
67
+ set_namespaces()
68
+ end
69
+
70
+ def namespace=(namespace)
71
+ @namespace = namespace
72
+ set_namespaces()
73
+ end
74
+
75
+ private
76
+
77
+ def set_namespaces
78
+ @db = Redis::Namespace.new(@namespace, :redis => @redis)
79
+ @substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
80
+ @rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
81
+ @ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
54
82
  end
55
83
  end
56
84
  end
@@ -6,7 +6,10 @@ class Redis
6
6
  # Add items to the autosuggest database from a file.
7
7
  # Each line should be a string representing the item
8
8
  def add_from_file(file)
9
- add(*(File.open(file, "r").map { |l| l.strip }))
9
+ File.open(file, "r").each do |l|
10
+ puts "Adding #{l}"
11
+ add(l.strip)
12
+ end
10
13
  end
11
14
 
12
15
  # Add items and their scores to the autosuggest database from a file.
@@ -0,0 +1,96 @@
1
+ class Redis
2
+ module Autosuggest
3
+
4
+ class << self
5
+
6
+ # Add an item's n-grams to the redis db. The n-grams will be used
7
+ # as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
8
+ # is set to true.
9
+ def add_fuzzy(item)
10
+ yield_ngrams(item) do |ngram|
11
+ if @ngrams.scard(ngram).to_i <= @ngram_item_limit
12
+ @ngrams.sadd(ngram, "#{item}:#{compute_soundex_code(item)}")
13
+ end
14
+ end
15
+ end
16
+
17
+ # Remove an item's n-grams from the Redis db
18
+ def remove_fuzzy(item)
19
+ yield_ngrams(item) do |ngram|
20
+ @ngrams.srem(ngram, "#{item}:#{compute_soundex_code(item)}")
21
+ end
22
+ end
23
+
24
+ # Compute the soundex code of a string (only works for single words
25
+ # so we have to merge multi-word strings)
26
+ def compute_soundex_code(str)
27
+ return Text::Soundex.soundex(alphabet_only(str))
28
+ end
29
+
30
+ # Build a candidate pool for all suitable fuzzy matches for a string
31
+ # by taking the union of all items in the Redis db that share an n-gram
32
+ # with the string. Use levenshtein distance, soundex code similarity,
33
+ # and the number of matching 2-grams to compute a score for each candidate.
34
+ # Then return the highest-scoring candidates.
35
+ def suggest_fuzzy(str, results=@max_results)
36
+ str_mul = alphabet_only(str).size
37
+ str_soundex_code = compute_soundex_code(str)
38
+ str_2grams = ngram_list(str, 2)
39
+ candidates = []
40
+
41
+ @ngrams.sunion(*ngram_list(str)).each do |candidate|
42
+ candidate = candidate.split(":")
43
+ candidate_str = candidate[0]
44
+ candidate_soundex_code = candidate[1]
45
+ candidate_score = 1.0
46
+
47
+ # Levenshtein distance
48
+ lev_dist = Levenshtein.distance(str, candidate_str)
49
+ candidate_score *= Math.exp([str_mul - lev_dist, 1].max)
50
+
51
+ # Soundex
52
+ if str_soundex_code == candidate_soundex_code
53
+ candidate_score *= str_mul
54
+ elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
55
+ candidate_score *= (str_mul / 2).ceil
56
+ end
57
+
58
+ # Compute n-grams of size 2 shared between the two strings
59
+ same_2grams = str_2grams & ngram_list(candidate_str, 2)
60
+ candidate_score *= Math.exp(same_2grams.size)
61
+
62
+ candidates << [candidate_str, candidate_score] if candidate_score > 1
63
+ end
64
+ # Sort results by score and return the highest scoring candidates
65
+ candidates = candidates.sort { |a, b| b[1] <=> a[1] }
66
+ # puts candidates.take(10).map { |tuple| "#{tuple[0]} => #{tuple[1]}" }
67
+ return candidates.take(results).map { |a| a[0] }
68
+ end
69
+
70
+ # Yield the n-grams of a specified size for a string one at a time
71
+ def yield_ngrams(str, ngram_size=@ngram_size)
72
+ ngram_list = ngram_list(str, ngram_size)
73
+ ngram_list.each { |ngram| yield ngram }
74
+ end
75
+
76
+ # Returns a list containing all of the n-grams of a specified size
77
+ # of a string. The list is ordered by the position of the n-gram
78
+ # in the string (duplicates included).
79
+ def ngram_list(str, ngram_size=@ngram_size)
80
+ str = alphabet_only(str).split("")
81
+ ngram_list = []
82
+ (0..str.size - ngram_size).each do |i|
83
+ ngram = ""
84
+ (0...ngram_size).each { |j| ngram << str[i + j] }
85
+ ngram_list << ngram
86
+ end
87
+ ngram_list
88
+ end
89
+
90
+ # Remove all characters not in the range 'a-z' from a string
91
+ def alphabet_only(str)
92
+ return str.gsub(/[^abcdefghijklmnopqrstuvwxyz]/, '')
93
+ end
94
+ end
95
+ end
96
+ end
@@ -1,5 +1,5 @@
1
1
  class Redis
2
2
  module Autosuggest
3
- VERSION = "0.2.0"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
20
20
 
21
21
  gem.add_dependency("redis", "~> 3.0.2")
22
22
  gem.add_dependency("redis-namespace", "~> 1.2.1")
23
+ gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
24
+ gem.add_dependency("text", "~> 1.2.1")
23
25
 
24
26
  gem.add_development_dependency("minitest", "~> 4.3.3")
25
27
  end
@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
106
106
  end
107
107
 
108
108
  def test_adding_with_substring_limit
109
+ saved_limit = Redis::Autosuggest.max_per_substring
109
110
  Redis::Autosuggest.max_per_substring = 1
110
111
  Redis::Autosuggest.add_with_score(@str1, 1)
111
112
  Redis::Autosuggest.add_with_score("Test", 5)
112
113
  item_id = Redis::Autosuggest.get_id("Test")
113
114
  assert_equal [item_id], @subs.zrevrange("test", 0, -1)
115
+ Redis::Autosuggest.max_per_substring = saved_limit
116
+ end
117
+
118
+ def test_suggesting_items_fuzzy
119
+ Redis::Autosuggest.fuzzy_match = true
120
+ str = "north by northwest"
121
+ Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
122
+ assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
123
+ assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
124
+ assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
125
+ assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
126
+ assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
127
+ Redis::Autosuggest.fuzzy_match = false
114
128
  end
115
129
 
116
130
  MiniTest::Unit.after_tests { self.unused_db.flushdb }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redis-autosuggest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-18 00:00:00.000000000 Z
12
+ date: 2013-01-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -43,6 +43,38 @@ dependencies:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
45
  version: 1.2.1
46
+ - !ruby/object:Gem::Dependency
47
+ name: levenshtein-ffi
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 1.0.3
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.3
62
+ - !ruby/object:Gem::Dependency
63
+ name: text
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 1.2.1
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 1.2.1
46
78
  - !ruby/object:Gem::Dependency
47
79
  name: minitest
48
80
  requirement: !ruby/object:Gem::Requirement
@@ -76,6 +108,7 @@ files:
76
108
  - lib/redis/autosuggest.rb
77
109
  - lib/redis/autosuggest/config.rb
78
110
  - lib/redis/autosuggest/file.rb
111
+ - lib/redis/autosuggest/fuzzy.rb
79
112
  - lib/redis/autosuggest/rails/railtie.rb
80
113
  - lib/redis/autosuggest/rails/rake_tasks.rb
81
114
  - lib/redis/autosuggest/rails/sources.rb