redis-autosuggest 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +7 -0
- data/lib/redis-autosuggest.rb +3 -0
- data/lib/redis/autosuggest.rb +26 -5
- data/lib/redis/autosuggest/config.rb +36 -8
- data/lib/redis/autosuggest/file.rb +4 -1
- data/lib/redis/autosuggest/fuzzy.rb +96 -0
- data/lib/redis/autosuggest/version.rb +1 -1
- data/redis-autosuggest.gemspec +2 -0
- data/test/autosuggest_test.rb +14 -0
- metadata +35 -2
data/README.md
CHANGED
@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, "Northern Exposure", 3)
|
|
36
36
|
# Increment an item's score
|
37
37
|
Redis::Autosuggest.increment("North By Northwest", 1)
|
38
38
|
```
|
39
|
+
Fuzzy matching:
|
40
|
+
```ruby
|
41
|
+
Redis::Autosuggest.fuzzy_match = true
|
42
|
+
Redis::Autosuggest.add("North By Northwest")
|
43
|
+
Redis::Autosuggest.suggest("nort byenorthwest")
|
44
|
+
# => ["north by northwest"]
|
45
|
+
```
|
39
46
|
|
40
47
|
## Rails support
|
41
48
|
|
data/lib/redis-autosuggest.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
require 'redis'
|
2
2
|
require 'redis-namespace'
|
3
|
+
require 'levenshtein'
|
4
|
+
require 'text'
|
3
5
|
require 'redis/autosuggest'
|
4
6
|
require 'redis/autosuggest/config'
|
5
7
|
require 'redis/autosuggest/file'
|
6
8
|
require 'redis/autosuggest/version'
|
9
|
+
require 'redis/autosuggest/fuzzy'
|
7
10
|
|
8
11
|
if defined?(Rails)
|
9
12
|
require 'redis/autosuggest/rails/sources'
|
data/lib/redis/autosuggest.rb
CHANGED
@@ -8,7 +8,11 @@ class Redis
|
|
8
8
|
def add(*items)
|
9
9
|
all_new_items = true
|
10
10
|
items.each do |item|
|
11
|
-
item
|
11
|
+
if item.size > @max_str_size
|
12
|
+
all_new_items = false
|
13
|
+
next
|
14
|
+
end
|
15
|
+
item = normalize(item)
|
12
16
|
item_exists?(item) ? all_new_items = false : add_item(item)
|
13
17
|
end
|
14
18
|
all_new_items
|
@@ -20,6 +24,10 @@ class Redis
|
|
20
24
|
def add_with_score(*fields)
|
21
25
|
all_new_items = true
|
22
26
|
fields.each_slice(2) do |f|
|
27
|
+
if f[0].size > @max_str_size
|
28
|
+
all_new_items = false
|
29
|
+
next
|
30
|
+
end
|
23
31
|
f[0] = normalize(f[0])
|
24
32
|
item_exists?(f[0]) ? all_new_items = false : add_item(*f)
|
25
33
|
end
|
@@ -36,6 +44,7 @@ class Redis
|
|
36
44
|
@db.hdel(@itemids, item)
|
37
45
|
remove_substrings(item, id)
|
38
46
|
@redis.zrem(@leaderboard, id) if @use_leaderboard
|
47
|
+
remove_fuzzy(item) if @fuzzy_match
|
39
48
|
return true
|
40
49
|
end
|
41
50
|
|
@@ -50,8 +59,15 @@ class Redis
|
|
50
59
|
|
51
60
|
# Suggest items from the database that most closely match the queried string.
# Returns an array of suggestion items (an empty array if nothing found).
# Fuzzy matching will only occur when both of these conditions are met:
#   - Redis::Autosuggest.fuzzy_match == true
#   - The simple suggestion method (matching substrings) yields no results
#
# str     - the query string; it is passed through normalize before lookup
# results - maximum number of suggestions to return (defaults to @max_results)
def suggest(str, results=@max_results)
  # A non-positive limit must yield nothing. Without this guard the range end
  # below becomes -1 (or lower), which ZREVRANGE interprets as an offset from
  # the end of the sorted set, so results == 0 would return every match.
  return [] if results < 1

  str = normalize(str)
  suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
  return suggest_fuzzy(str, results) if suggestion_ids.empty? && @fuzzy_match

  suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
end
|
57
73
|
|
@@ -77,6 +93,10 @@ class Redis
|
|
77
93
|
return @db.hmget(@itemids, normalize(item)).first
|
78
94
|
end
|
79
95
|
|
96
|
+
# Look up an item by its id: reads field `id` from the @items hash via HMGET
# and returns the first (only) element of the reply.
# NOTE(review): presumably returns nil when no item has this id — confirm.
def get_item(id)
|
97
|
+
return @db.hmget(@items, id).first
|
98
|
+
end
|
99
|
+
|
80
100
|
private
|
81
101
|
|
82
102
|
def normalize(item)
|
@@ -89,6 +109,7 @@ class Redis
|
|
89
109
|
@db.hset(@itemids, item, id)
|
90
110
|
add_substrings(item, score, id)
|
91
111
|
@db.zadd(@leaderboard, score, id) if @use_leaderboard
|
112
|
+
add_fuzzy(item) if @fuzzy_match
|
92
113
|
end
|
93
114
|
|
94
115
|
# Yield each substring of a complete string
|
@@ -106,19 +127,19 @@ class Redis
|
|
106
127
|
end
|
107
128
|
end
|
108
129
|
end
|
109
|
-
|
130
|
+
|
110
131
|
# Add the id of an item to the sorted set kept for a substring.
# sub is the sorted-set key, score the rank passed to ZADD, id the member.
|
111
132
|
def add_substring(sub, score, id)
|
112
133
|
@substrings.zadd(sub, score, id)
|
113
134
|
end
|
114
|
-
|
135
|
+
|
115
136
|
# Add the id of an item to a substring only when the number of items that
|
116
137
|
# substring stores is less than the config value of "max_per_substring".
|
117
138
|
# If the substring set is already full, check to see if the item with the
|
118
139
|
# lowest score in the substring set has a lower score than the item being added.
|
119
140
|
# If yes, remove that item and add this item to the substring set.
|
120
141
|
def add_substring_limit(sub, score, id)
|
121
|
-
count = @substrings.zcount(sub, "-inf", "inf")
|
142
|
+
count = @substrings.zcount(sub, "-inf", "+inf")
|
122
143
|
if count < @max_per_substring
|
123
144
|
add_substring(sub, score, id)
|
124
145
|
else
|
@@ -3,7 +3,7 @@ class Redis
|
|
3
3
|
|
4
4
|
# Default Redis server at localhost:6379
|
5
5
|
@redis = Redis.new
|
6
|
-
|
6
|
+
|
7
7
|
# Main Redis namespace for this module
|
8
8
|
@namespace = "suggest"
|
9
9
|
|
@@ -19,7 +19,7 @@ class Redis
|
|
19
19
|
# have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
# 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
# Each sorted set would hold the id of the word 'ruby'.
# The "sub" suffix must match the one used by set_namespaces; otherwise
# assigning redis= or namespace= silently moves the substring index to a
# different keyspace than the one populated with the defaults.
@substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
|
23
23
|
|
24
24
|
# max number of ids to store per substring.
|
25
25
|
@max_per_substring = Float::INFINITY
|
@@ -27,6 +27,9 @@ class Redis
|
|
27
27
|
# max number of results to return for an autosuggest query
|
28
28
|
@max_results = 5
|
29
29
|
|
30
|
+
# max string size for an item
|
31
|
+
@max_str_size = Float::INFINITY
|
32
|
+
|
30
33
|
# Key to a sorted set holding all id of items in the autosuggest database sorted
|
31
34
|
# by their score
|
32
35
|
@leaderboard = "lead"
|
@@ -41,16 +44,41 @@ class Redis
|
|
41
44
|
# Stores the number of items the db has for each rails source
|
42
45
|
@rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
|
43
46
|
|
47
|
+
# Fuzzy matching
|
48
|
+
@ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
|
49
|
+
|
50
|
+
# Whether or not to use fuzzy matching for autocompletions
|
51
|
+
@fuzzy_match = false
|
52
|
+
|
53
|
+
# The size of n-grams stored (fuzzy matching)
|
54
|
+
@ngram_size = 3
|
55
|
+
|
56
|
+
# Maximum number of items to be indexed per n-gram (fuzzy matching)
|
57
|
+
@ngram_item_limit = 200
|
58
|
+
|
44
59
|
class << self
|
45
|
-
attr_reader :redis
|
46
|
-
attr_accessor :
|
47
|
-
:leaderboard, :use_leaderboard, :rails_sources,
|
60
|
+
attr_reader :redis, :namespace
|
61
|
+
attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
|
62
|
+
:max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
|
63
|
+
:rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
|
48
64
|
|
49
65
|
# Replace the Redis connection used by the module, then call set_namespaces
# so the namespaced handles are rebuilt on top of the new connection.
def redis=(redis)
|
50
66
|
@redis = redis
|
51
|
-
|
52
|
-
|
53
|
-
|
67
|
+
set_namespaces()
|
68
|
+
end
|
69
|
+
|
70
|
+
# Change the main namespace, then call set_namespaces so every namespaced
# handle is rebuilt with the new prefix.
def namespace=(namespace)
|
71
|
+
@namespace = namespace
|
72
|
+
set_namespaces()
|
73
|
+
end
|
74
|
+
|
75
|
+
private
|
76
|
+
|
77
|
+
# (Re)build the namespaced Redis handles from the current @namespace and
# @redis: @db (bare namespace), @substrings (":sub"), @rails_source_sizes
# (":size") and @ngrams (":ng").
def set_namespaces
|
78
|
+
@db = Redis::Namespace.new(@namespace, :redis => @redis)
|
79
|
+
@substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
|
80
|
+
@rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
|
81
|
+
@ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
|
54
82
|
end
|
55
83
|
end
|
56
84
|
end
|
@@ -6,7 +6,10 @@ class Redis
|
|
6
6
|
# Add items to the autosuggest database from a file.
# Each line should be a string representing the item.
#
# file - path of the file to read, one item per line
#
# Every line is echoed to stdout ("Adding ...") and then added with
# surrounding whitespace stripped. The file is opened in block form so the
# handle is closed once the last line has been read, or if adding an item
# raises, instead of being left open until garbage collection.
def add_from_file(file)
  File.open(file, "r") do |f|
    f.each_line do |l|
      puts "Adding #{l}"
      add(l.strip)
    end
  end
end
|
11
14
|
|
12
15
|
# Add items and their scores to the autosuggest database from a file.
|
@@ -0,0 +1,96 @@
|
|
1
|
+
class Redis
  module Autosuggest

    class << self

      # Add an item's n-grams to the redis db. The n-grams will be used
      # as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
      # is set to true.
      #
      # Each n-gram set stores members of the form "<item>:<soundex code>".
      # A set only accepts new members while it holds fewer than
      # @ngram_item_limit entries, so the limit is a true maximum.
      def add_fuzzy(item)
        # The member string does not depend on the n-gram, so build it once
        entry = "#{item}:#{compute_soundex_code(item)}"
        yield_ngrams(item) do |ngram|
          @ngrams.sadd(ngram, entry) if @ngrams.scard(ngram).to_i < @ngram_item_limit
        end
      end

      # Remove an item's n-grams from the Redis db
      def remove_fuzzy(item)
        entry = "#{item}:#{compute_soundex_code(item)}"
        yield_ngrams(item) { |ngram| @ngrams.srem(ngram, entry) }
      end

      # Compute the soundex code of a string (only works for single words
      # so we have to merge multi-word strings)
      def compute_soundex_code(str)
        Text::Soundex.soundex(alphabet_only(str))
      end

      # Build a candidate pool for all suitable fuzzy matches for a string
      # by taking the union of all items in the Redis db that share an n-gram
      # with the string. Use levenshtein distance, soundex code similarity,
      # and the number of matching 2-grams to compute a score for each candidate.
      # Then return the highest-scoring candidates.
      #
      # str     - the query string; only the characters a-z take part in the
      #           n-gram and soundex computations (see alphabet_only)
      # results - maximum number of candidates to return
      #
      # Returns an array of item strings, best match first; empty when the
      # query is too short to form a single n-gram.
      def suggest_fuzzy(str, results=@max_results)
        query_ngrams = ngram_list(str)
        # SUNION needs at least one key, and a query without n-grams cannot
        # share one with any stored item.
        return [] if query_ngrams.empty?

        str_mul = alphabet_only(str).size
        # to_s guards against a soundex implementation that returns nil
        str_soundex_code = compute_soundex_code(str).to_s
        str_2grams = ngram_list(str, 2)
        candidates = []

        @ngrams.sunion(*query_ngrams).each do |candidate|
          # Members are stored as "<item>:<soundex>"; split on the LAST colon
          # so that items which themselves contain ':' are kept intact.
          candidate_str, _, candidate_soundex_code = candidate.rpartition(":")
          candidate_score = 1.0

          # Levenshtein distance
          lev_dist = Levenshtein.distance(str, candidate_str)
          candidate_score *= Math.exp([str_mul - lev_dist, 1].max)

          # Soundex: full bonus for an identical code, half (rounded up) when
          # only the leading letter differs
          if str_soundex_code == candidate_soundex_code
            candidate_score *= str_mul
          elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
            # Float division: integer division would floor before ceil runs
            candidate_score *= (str_mul / 2.0).ceil
          end

          # Compute n-grams of size 2 shared between the two strings
          same_2grams = str_2grams & ngram_list(candidate_str, 2)
          candidate_score *= Math.exp(same_2grams.size)

          candidates << [candidate_str, candidate_score] if candidate_score > 1
        end

        # Sort results by score and return the highest scoring candidates
        candidates = candidates.sort { |a, b| b[1] <=> a[1] }
        candidates.take(results).map { |a| a[0] }
      end

      # Yield the n-grams of a specified size for a string one at a time
      def yield_ngrams(str, ngram_size=@ngram_size)
        ngram_list(str, ngram_size).each { |ngram| yield ngram }
      end

      # Returns a list containing all of the n-grams of a specified size
      # of a string. The list is ordered by the position of the n-gram
      # in the string (duplicates included). Characters outside a-z are
      # dropped first; a string with fewer than ngram_size remaining
      # characters yields an empty list.
      def ngram_list(str, ngram_size=@ngram_size)
        alphabet_only(str).chars.each_cons(ngram_size).map(&:join)
      end

      # Remove all characters not in the range 'a-z' from a string
      # (uppercase letters, digits, whitespace and punctuation are all dropped)
      def alphabet_only(str)
        str.gsub(/[^a-z]/, '')
      end
    end
  end
end
|
data/redis-autosuggest.gemspec
CHANGED
@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
|
|
20
20
|
|
21
21
|
gem.add_dependency("redis", "~> 3.0.2")
|
22
22
|
gem.add_dependency("redis-namespace", "~> 1.2.1")
|
23
|
+
gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
|
24
|
+
gem.add_dependency("text", "~> 1.2.1")
|
23
25
|
|
24
26
|
gem.add_development_dependency("minitest", "~> 4.3.3")
|
25
27
|
end
|
data/test/autosuggest_test.rb
CHANGED
@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
|
|
106
106
|
end
|
107
107
|
|
108
108
|
# With max_per_substring set to 1, adding a higher-scored item must evict the
# lower-scored one from the shared substring set.
def test_adding_with_substring_limit
  saved_limit = Redis::Autosuggest.max_per_substring
  Redis::Autosuggest.max_per_substring = 1
  Redis::Autosuggest.add_with_score(@str1, 1)
  Redis::Autosuggest.add_with_score("Test", 5)
  item_id = Redis::Autosuggest.get_id("Test")
  assert_equal [item_id], @subs.zrevrange("test", 0, -1)
ensure
  # Restore the module-wide limit even when an assertion fails, so a failure
  # here cannot change the behaviour of the tests that run afterwards.
  Redis::Autosuggest.max_per_substring = saved_limit
end
|
117
|
+
|
118
|
+
# Misspelled queries that match no substring must fall back to fuzzy
# matching and rank the intended item first.
def test_suggesting_items_fuzzy
  Redis::Autosuggest.fuzzy_match = true
  str = "north by northwest"
  Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
  assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
  assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
  assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
  assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
  assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
ensure
  # Switch fuzzy matching back off even when an assertion fails, so the
  # module-wide flag does not leak into the other tests.
  Redis::Autosuggest.fuzzy_match = false
end
|
115
129
|
|
116
130
|
MiniTest::Unit.after_tests { self.unused_db.flushdb }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: redis-autosuggest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: redis
|
@@ -43,6 +43,38 @@ dependencies:
|
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: 1.2.1
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: levenshtein-ffi
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.0.3
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.3
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: text
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.2.1
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.2.1
|
46
78
|
- !ruby/object:Gem::Dependency
|
47
79
|
name: minitest
|
48
80
|
requirement: !ruby/object:Gem::Requirement
|
@@ -76,6 +108,7 @@ files:
|
|
76
108
|
- lib/redis/autosuggest.rb
|
77
109
|
- lib/redis/autosuggest/config.rb
|
78
110
|
- lib/redis/autosuggest/file.rb
|
111
|
+
- lib/redis/autosuggest/fuzzy.rb
|
79
112
|
- lib/redis/autosuggest/rails/railtie.rb
|
80
113
|
- lib/redis/autosuggest/rails/rake_tasks.rb
|
81
114
|
- lib/redis/autosuggest/rails/sources.rb
|