redis-autosuggest 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -0
- data/lib/redis-autosuggest.rb +3 -0
- data/lib/redis/autosuggest.rb +26 -5
- data/lib/redis/autosuggest/config.rb +36 -8
- data/lib/redis/autosuggest/file.rb +4 -1
- data/lib/redis/autosuggest/fuzzy.rb +96 -0
- data/lib/redis/autosuggest/version.rb +1 -1
- data/redis-autosuggest.gemspec +2 -0
- data/test/autosuggest_test.rb +14 -0
- metadata +35 -2
data/README.md
CHANGED
@@ -36,6 +36,13 @@ Redis::Autosuggest.add_with_score("North By Northwest", 9, "Northern Exposure", 3)
|
|
36
36
|
# Increment an item's score
|
37
37
|
Redis::Autosuggest.increment("North By Northwest", 1)
|
38
38
|
```
|
39
|
+
Fuzzy matching:
|
40
|
+
```ruby
|
41
|
+
Redis::Autosuggest.fuzzy_match = true
|
42
|
+
Redis::Autosuggest.add("North By Northwest")
|
43
|
+
Redis::Autosuggest.suggest("nort byenorthwest")
|
44
|
+
# => ["north by northwest"]
|
45
|
+
```
|
39
46
|
|
40
47
|
## Rails support
|
41
48
|
|
data/lib/redis-autosuggest.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
require 'redis'
|
2
2
|
require 'redis-namespace'
|
3
|
+
require 'levenshtein'
|
4
|
+
require 'text'
|
3
5
|
require 'redis/autosuggest'
|
4
6
|
require 'redis/autosuggest/config'
|
5
7
|
require 'redis/autosuggest/file'
|
6
8
|
require 'redis/autosuggest/version'
|
9
|
+
require 'redis/autosuggest/fuzzy'
|
7
10
|
|
8
11
|
if defined?(Rails)
|
9
12
|
require 'redis/autosuggest/rails/sources'
|
data/lib/redis/autosuggest.rb
CHANGED
@@ -8,7 +8,11 @@ class Redis
|
|
8
8
|
def add(*items)
|
9
9
|
all_new_items = true
|
10
10
|
items.each do |item|
|
11
|
-
item
|
11
|
+
if item.size > @max_str_size
|
12
|
+
all_new_items = false
|
13
|
+
next
|
14
|
+
end
|
15
|
+
item = normalize(item)
|
12
16
|
item_exists?(item) ? all_new_items = false : add_item(item)
|
13
17
|
end
|
14
18
|
all_new_items
|
@@ -20,6 +24,10 @@ class Redis
|
|
20
24
|
def add_with_score(*fields)
|
21
25
|
all_new_items = true
|
22
26
|
fields.each_slice(2) do |f|
|
27
|
+
if f[0].size > @max_str_size
|
28
|
+
all_new_items = false
|
29
|
+
next
|
30
|
+
end
|
23
31
|
f[0] = normalize(f[0])
|
24
32
|
item_exists?(f[0]) ? all_new_items = false : add_item(*f)
|
25
33
|
end
|
@@ -36,6 +44,7 @@ class Redis
|
|
36
44
|
@db.hdel(@itemids, item)
|
37
45
|
remove_substrings(item, id)
|
38
46
|
@redis.zrem(@leaderboard, id) if @use_leaderboard
|
47
|
+
remove_fuzzy(item) if @fuzzy_match
|
39
48
|
return true
|
40
49
|
end
|
41
50
|
|
@@ -50,8 +59,15 @@ class Redis
|
|
50
59
|
|
51
60
|
# Suggest items from the database that most closely match the queried string.
|
52
61
|
# Returns an array of suggestion items (an empty array if nothing found).
|
62
|
+
# Fuzzy matching will only occur when both of these conditions are met:
|
63
|
+
# - Redis::Autosuggest.fuzzy_match == true
|
64
|
+
# - The simple suggestion method (matching substrings) yields no results
|
53
65
|
def suggest(str, results=@max_results)
|
54
|
-
|
66
|
+
str = normalize(str)
|
67
|
+
suggestion_ids = @substrings.zrevrange(str, 0, results - 1)
|
68
|
+
if suggestion_ids.empty? && @fuzzy_match
|
69
|
+
return suggest_fuzzy(str, results)
|
70
|
+
end
|
55
71
|
suggestion_ids.empty? ? [] : @db.hmget(@items, suggestion_ids)
|
56
72
|
end
|
57
73
|
|
@@ -77,6 +93,10 @@ class Redis
|
|
77
93
|
return @db.hmget(@itemids, normalize(item)).first
|
78
94
|
end
|
79
95
|
|
96
|
+
def get_item(id)
|
97
|
+
return @db.hmget(@items, id).first
|
98
|
+
end
|
99
|
+
|
80
100
|
private
|
81
101
|
|
82
102
|
def normalize(item)
|
@@ -89,6 +109,7 @@ class Redis
|
|
89
109
|
@db.hset(@itemids, item, id)
|
90
110
|
add_substrings(item, score, id)
|
91
111
|
@db.zadd(@leaderboard, score, id) if @use_leaderboard
|
112
|
+
add_fuzzy(item) if @fuzzy_match
|
92
113
|
end
|
93
114
|
|
94
115
|
# Yield each substring of a complete string
|
@@ -106,19 +127,19 @@ class Redis
|
|
106
127
|
end
|
107
128
|
end
|
108
129
|
end
|
109
|
-
|
130
|
+
|
110
131
|
# Add the id of an item to a substring
|
111
132
|
def add_substring(sub, score, id)
|
112
133
|
@substrings.zadd(sub, score, id)
|
113
134
|
end
|
114
|
-
|
135
|
+
|
115
136
|
# Add the id of an item to a substring only when the number of items that
|
116
137
|
# substring stores is less than the config value of "max_per_substring".
|
117
138
|
# If the substring set is already full, check to see if the item with the
|
118
139
|
# lowest score in the substring set has a lower score than the item being added.
|
119
140
|
# If yes, remove that item and add this item to the substring set.
|
120
141
|
def add_substring_limit(sub, score, id)
|
121
|
-
count = @substrings.zcount(sub, "-inf", "inf")
|
142
|
+
count = @substrings.zcount(sub, "-inf", "+inf")
|
122
143
|
if count < @max_per_substring
|
123
144
|
add_substring(sub, score, id)
|
124
145
|
else
|
@@ -3,7 +3,7 @@ class Redis
|
|
3
3
|
|
4
4
|
# Default Redis server at localhost:6379
|
5
5
|
@redis = Redis.new
|
6
|
-
|
6
|
+
|
7
7
|
# Main Redis namespace for this module
|
8
8
|
@namespace = "suggest"
|
9
9
|
|
@@ -19,7 +19,7 @@ class Redis
|
|
19
19
|
# have four sorted sets: 'autosuggest:substring:r', 'autosuggest:substring:ru',
|
20
20
|
# 'autosuggest:substring:rub', and 'autosuggest:substring:ruby'.
|
21
21
|
# Each sorted set would hold the id of the word 'ruby'
|
22
|
-
@substrings = Redis::Namespace.new("#{@namespace}:
|
22
|
+
@substrings = Redis::Namespace.new("#{@namespace}:su", :redis => @redis)
|
23
23
|
|
24
24
|
# max number of ids to store per substring.
|
25
25
|
@max_per_substring = Float::INFINITY
|
@@ -27,6 +27,9 @@ class Redis
|
|
27
27
|
# max number of results to return for an autosuggest query
|
28
28
|
@max_results = 5
|
29
29
|
|
30
|
+
# max string size for an item
|
31
|
+
@max_str_size = Float::INFINITY
|
32
|
+
|
30
33
|
# Key to a sorted set holding the ids of all items in the autosuggest database sorted
|
31
34
|
# by their score
|
32
35
|
@leaderboard = "lead"
|
@@ -41,16 +44,41 @@ class Redis
|
|
41
44
|
# Stores the number of items the db has for each rails source
|
42
45
|
@rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
|
43
46
|
|
47
|
+
# Fuzzy matching
|
48
|
+
@ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
|
49
|
+
|
50
|
+
# Whether or not to use fuzzy matching for autocompletions
|
51
|
+
@fuzzy_match = false
|
52
|
+
|
53
|
+
# The size of n-grams stored (fuzzy matching)
|
54
|
+
@ngram_size = 3
|
55
|
+
|
56
|
+
# Maximum number of items to be indexed per n-gram (fuzzy matching)
|
57
|
+
@ngram_item_limit = 200
|
58
|
+
|
44
59
|
class << self
|
45
|
-
attr_reader :redis
|
46
|
-
attr_accessor :
|
47
|
-
:leaderboard, :use_leaderboard, :rails_sources,
|
60
|
+
attr_reader :redis, :namespace
|
61
|
+
attr_accessor :db, :items, :itemids, :substrings, :max_per_substring,
|
62
|
+
:max_results, :max_str_size, :leaderboard, :use_leaderboard, :rails_sources,
|
63
|
+
:rails_source_sizes, :ngrams, :fuzzy_match, :ngram_size, :ngram_item_limit
|
48
64
|
|
49
65
|
def redis=(redis)
|
50
66
|
@redis = redis
|
51
|
-
|
52
|
-
|
53
|
-
|
67
|
+
set_namespaces()
|
68
|
+
end
|
69
|
+
|
70
|
+
def namespace=(namespace)
|
71
|
+
@namespace = namespace
|
72
|
+
set_namespaces()
|
73
|
+
end
|
74
|
+
|
75
|
+
private
|
76
|
+
|
77
|
+
def set_namespaces
|
78
|
+
@db = Redis::Namespace.new(@namespace, :redis => @redis)
|
79
|
+
@substrings = Redis::Namespace.new("#{@namespace}:sub", :redis => @redis)
|
80
|
+
@rails_source_sizes = Redis::Namespace.new("#{@namespace}:size", :redis => @redis)
|
81
|
+
@ngrams = Redis::Namespace.new("#{@namespace}:ng", :redis => @redis)
|
54
82
|
end
|
55
83
|
end
|
56
84
|
end
|
@@ -6,7 +6,10 @@ class Redis
|
|
6
6
|
# Add items to the autosuggest database from a file.
|
7
7
|
# Each line should be a string representing the item
|
8
8
|
def add_from_file(file)
|
9
|
-
|
9
|
+
File.open(file, "r").each do |l|
|
10
|
+
puts "Adding #{l}"
|
11
|
+
add(l.strip)
|
12
|
+
end
|
10
13
|
end
|
11
14
|
|
12
15
|
# Add items and their scores to the autosuggest database from a file.
|
@@ -0,0 +1,96 @@
|
|
1
|
+
class Redis
|
2
|
+
module Autosuggest
|
3
|
+
|
4
|
+
class << self
|
5
|
+
|
6
|
+
# Add an item's n-grams to the redis db. The n-grams will be used
|
7
|
+
# as candidates for autocompletions when Redis::Autosuggest.fuzzy_match
|
8
|
+
# is set to true.
|
9
|
+
def add_fuzzy(item)
|
10
|
+
yield_ngrams(item) do |ngram|
|
11
|
+
if @ngrams.scard(ngram).to_i <= @ngram_item_limit
|
12
|
+
@ngrams.sadd(ngram, "#{item}:#{compute_soundex_code(item)}")
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# Remove an item's n-grams from the Redis db
|
18
|
+
def remove_fuzzy(item)
|
19
|
+
yield_ngrams(item) do |ngram|
|
20
|
+
@ngrams.srem(ngram, "#{item}:#{compute_soundex_code(item)}")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Compute the soundex code of a string (only works for single words
|
25
|
+
# so we have to merge multi-word strings)
|
26
|
+
def compute_soundex_code(str)
|
27
|
+
return Text::Soundex.soundex(alphabet_only(str))
|
28
|
+
end
|
29
|
+
|
30
|
+
# Build a candidate pool for all suitable fuzzy matches for a string
|
31
|
+
# by taking the union of all items in the Redis db that share an n-gram
|
32
|
+
# with the string. Use levenshtein distance, soundex code similarity,
|
33
|
+
# and the number of matching 2-grams to compute a score for each candidate.
|
34
|
+
# Then return the highest-scoring candidates.
|
35
|
+
def suggest_fuzzy(str, results=@max_results)
|
36
|
+
str_mul = alphabet_only(str).size
|
37
|
+
str_soundex_code = compute_soundex_code(str)
|
38
|
+
str_2grams = ngram_list(str, 2)
|
39
|
+
candidates = []
|
40
|
+
|
41
|
+
@ngrams.sunion(*ngram_list(str)).each do |candidate|
|
42
|
+
candidate = candidate.split(":")
|
43
|
+
candidate_str = candidate[0]
|
44
|
+
candidate_soundex_code = candidate[1]
|
45
|
+
candidate_score = 1.0
|
46
|
+
|
47
|
+
# Levenshtein distance
|
48
|
+
lev_dist = Levenshtein.distance(str, candidate_str)
|
49
|
+
candidate_score *= Math.exp([str_mul - lev_dist, 1].max)
|
50
|
+
|
51
|
+
# Soundex
|
52
|
+
if str_soundex_code == candidate_soundex_code
|
53
|
+
candidate_score *= str_mul
|
54
|
+
elsif str_soundex_code[1..-1] == candidate_soundex_code[1..-1]
|
55
|
+
candidate_score *= (str_mul / 2).ceil
|
56
|
+
end
|
57
|
+
|
58
|
+
# Compute n-grams of size 2 shared between the two strings
|
59
|
+
same_2grams = str_2grams & ngram_list(candidate_str, 2)
|
60
|
+
candidate_score *= Math.exp(same_2grams.size)
|
61
|
+
|
62
|
+
candidates << [candidate_str, candidate_score] if candidate_score > 1
|
63
|
+
end
|
64
|
+
# Sort results by score and return the highest scoring candidates
|
65
|
+
candidates = candidates.sort { |a, b| b[1] <=> a[1] }
|
66
|
+
# puts candidates.take(10).map { |tuple| "#{tuple[0]} => #{tuple[1]}" }
|
67
|
+
return candidates.take(results).map { |a| a[0] }
|
68
|
+
end
|
69
|
+
|
70
|
+
# Yield the n-grams of a specified size for a string one at a time
|
71
|
+
def yield_ngrams(str, ngram_size=@ngram_size)
|
72
|
+
ngram_list = ngram_list(str, ngram_size)
|
73
|
+
ngram_list.each { |ngram| yield ngram }
|
74
|
+
end
|
75
|
+
|
76
|
+
# Returns a list containing all of the n-grams of a specified size
|
77
|
+
# of a string. The list is ordered by the position of the n-gram
|
78
|
+
# in the string (duplicates included).
|
79
|
+
def ngram_list(str, ngram_size=@ngram_size)
|
80
|
+
str = alphabet_only(str).split("")
|
81
|
+
ngram_list = []
|
82
|
+
(0..str.size - ngram_size).each do |i|
|
83
|
+
ngram = ""
|
84
|
+
(0...ngram_size).each { |j| ngram << str[i + j] }
|
85
|
+
ngram_list << ngram
|
86
|
+
end
|
87
|
+
ngram_list
|
88
|
+
end
|
89
|
+
|
90
|
+
# Remove all characters not in the range 'a-z' from a string
|
91
|
+
def alphabet_only(str)
|
92
|
+
return str.gsub(/[^abcdefghijklmnopqrstuvwxyz]/, '')
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/redis-autosuggest.gemspec
CHANGED
@@ -20,6 +20,8 @@ Gem::Specification.new do |gem|
|
|
20
20
|
|
21
21
|
gem.add_dependency("redis", "~> 3.0.2")
|
22
22
|
gem.add_dependency("redis-namespace", "~> 1.2.1")
|
23
|
+
gem.add_dependency("levenshtein-ffi", "~> 1.0.3")
|
24
|
+
gem.add_dependency("text", "~> 1.2.1")
|
23
25
|
|
24
26
|
gem.add_development_dependency("minitest", "~> 4.3.3")
|
25
27
|
end
|
data/test/autosuggest_test.rb
CHANGED
@@ -106,11 +106,25 @@ class TestAutosuggest < MiniTest::Unit::TestCase
|
|
106
106
|
end
|
107
107
|
|
108
108
|
def test_adding_with_substring_limit
|
109
|
+
saved_limit = Redis::Autosuggest.max_per_substring
|
109
110
|
Redis::Autosuggest.max_per_substring = 1
|
110
111
|
Redis::Autosuggest.add_with_score(@str1, 1)
|
111
112
|
Redis::Autosuggest.add_with_score("Test", 5)
|
112
113
|
item_id = Redis::Autosuggest.get_id("Test")
|
113
114
|
assert_equal [item_id], @subs.zrevrange("test", 0, -1)
|
115
|
+
Redis::Autosuggest.max_per_substring = saved_limit
|
116
|
+
end
|
117
|
+
|
118
|
+
def test_suggesting_items_fuzzy
|
119
|
+
Redis::Autosuggest.fuzzy_match = true
|
120
|
+
str = "north by northwest"
|
121
|
+
Redis::Autosuggest.add(str, "northern exposure", "once upon a time in the west")
|
122
|
+
assert_equal str, Redis::Autosuggest.suggest("northbynorthwest")[0]
|
123
|
+
assert_equal str, Redis::Autosuggest.suggest("morth yb nerthwest")[0]
|
124
|
+
assert_equal str, Redis::Autosuggest.suggest("northe bie")[0]
|
125
|
+
assert_equal str, Redis::Autosuggest.suggest("morthybnerthwest")[0]
|
126
|
+
assert_equal str, Redis::Autosuggest.suggest("nourth bhye nourthwhast")[0]
|
127
|
+
Redis::Autosuggest.fuzzy_match = false
|
114
128
|
end
|
115
129
|
|
116
130
|
MiniTest::Unit.after_tests { self.unused_db.flushdb }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: redis-autosuggest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: redis
|
@@ -43,6 +43,38 @@ dependencies:
|
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: 1.2.1
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: levenshtein-ffi
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.0.3
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.3
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: text
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.2.1
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.2.1
|
46
78
|
- !ruby/object:Gem::Dependency
|
47
79
|
name: minitest
|
48
80
|
requirement: !ruby/object:Gem::Requirement
|
@@ -76,6 +108,7 @@ files:
|
|
76
108
|
- lib/redis/autosuggest.rb
|
77
109
|
- lib/redis/autosuggest/config.rb
|
78
110
|
- lib/redis/autosuggest/file.rb
|
111
|
+
- lib/redis/autosuggest/fuzzy.rb
|
79
112
|
- lib/redis/autosuggest/rails/railtie.rb
|
80
113
|
- lib/redis/autosuggest/rails/rake_tasks.rb
|
81
114
|
- lib/redis/autosuggest/rails/sources.rb
|