fuzzzy 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/README.md +97 -0
- data/Rakefile +11 -0
- data/benchmark/data/cities.csv +8875 -0
- data/benchmark/fuzzzy_benchmark.rb +128 -0
- data/benchmark/test_ngram.rb +36 -0
- data/benchmark/test_soundex.rb +36 -0
- data/config.ru +11 -0
- data/dictionary/en_stopwords.yml +175 -0
- data/fuzzzy.gemspec +44 -0
- data/lib/fuzzzy.rb +129 -0
- data/lib/fuzzzy/index.rb +17 -0
- data/lib/fuzzzy/methods/indexer.rb +13 -0
- data/lib/fuzzzy/methods/method_base.rb +32 -0
- data/lib/fuzzzy/methods/ngram/base.rb +15 -0
- data/lib/fuzzzy/methods/ngram/indexer.rb +37 -0
- data/lib/fuzzzy/methods/ngram/searcher.rb +95 -0
- data/lib/fuzzzy/methods/soundex/base.rb +13 -0
- data/lib/fuzzzy/methods/soundex/indexer.rb +30 -0
- data/lib/fuzzzy/methods/soundex/searcher.rb +36 -0
- data/lib/fuzzzy/orm/mongoid/index.rb +86 -0
- data/lib/fuzzzy/redis.rb +61 -0
- data/lib/fuzzzy/server/http.rb +99 -0
- data/lib/fuzzzy/version.rb +3 -0
- data/spec/config/mongoid.yml +7 -0
- data/spec/models/city.rb +8 -0
- data/spec/models/indexed_city.rb +9 -0
- data/spec/ngram/indexer_spec.rb +142 -0
- data/spec/ngram/searcher_spec.rb +194 -0
- data/spec/orm/mongoid/index_spec.rb +165 -0
- data/spec/redis_spec.rb +54 -0
- data/spec/soundex/indexer_spec.rb +40 -0
- data/spec/soundex/searcher_spec.rb +63 -0
- data/spec/spec_helper.rb +22 -0
- metadata +325 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'pp'
|
2
|
+
module Fuzzzy
  module Server
    # JSON HTTP front-end for Fuzzzy, built on Grape. All routes are mounted
    # under the "v1" path prefix:
    #
    #   GET    /v1/info            runtime information (HTTP basic auth)
    #   GET    /v1/info/indexes    Redis memory usage and per-index counters
    #   GET    /v1/indexes         search
    #   POST   /v1/indexes/search  search (same handler as GET /v1/indexes)
    #   POST   /v1/indexes         add an entry to an index
    #   DELETE /v1/indexes         remove an entry from an index
    class HTTP < Grape::API
      # Raised by +check_context!+ when a required request parameter is absent.
      class ParamsError < StandardError
      end

      format :json
      default_format :json
      error_format :json
      version 'v1', :using => :path

      # Any exception raised while serving a request is reported to the client
      # as a JSON object carrying the exception class name and its message.
      rescue_from :all do |e|
        rack_response({:error => e.class.name, :message => e.message}.to_json)
      end

      helpers do
        # NOTE(review): +_searcher+ and +_indexer+ used below are not defined
        # in this class; presumably they come from this mixin -- confirm.
        include Index

        # Runs a search described by the request parameters.
        #
        # Requires :index_name, :index_method and :query; :distance, when
        # given, is coerced with +to_i+ before being handed to the searcher.
        #
        # Returns whatever the searcher's +search+ returns.
        # Raises ParamsError when a required parameter is missing.
        def search
          context = search_context
          check_context!(:query, context)
          context[:distance] = context[:distance].to_i if context[:distance]
          _searcher(context[:index_method]).search(context)
        end

        # Verifies that +context+ holds a non-nil value for :index_name,
        # :index_method and every extra key listed.
        #
        # keys - required keys, followed by the context Hash as the LAST
        #        argument (it is popped off the splat).
        #
        # Raises ParamsError naming the first missing key.
        def check_context!(*keys)
          context = keys.pop
          ([:index_name, :index_method] + keys).each do |key|
            raise ParamsError, "Parameter :#{key} not found" if context[key].nil?
          end
        end

        # Builds a context Hash from the request +params+, copying
        # :index_name, :index_method and the given +keys+. Parameters whose
        # value is nil or false are left out.
        def construct_context(*keys)
          context = {}
          ([:index_name, :index_method] + keys).each do |key|
            context[key] = params[key] if params[key]
          end
          context
        end

        # Context for the indexing routes (POST / DELETE on /indexes).
        def index_context
          construct_context(:id, :dictionary_string)
        end

        # Context for the search routes.
        def search_context
          construct_context(:query, :distance, :sort_by, :with_cache)
        end
      end

      namespace :info do
        # Credentials default to the historical admin/password pair but can be
        # overridden through FUZZZY_HTTP_USER / FUZZZY_HTTP_PASSWORD so that a
        # deployment does not have to run with credentials baked into the code.
        http_basic do |user, password|
          user == ENV.fetch('FUZZZY_HTTP_USER', 'admin') &&
            password == ENV.fetch('FUZZZY_HTTP_PASSWORD', 'password')
        end

        # Runtime information; the stopword list is included only when the
        # :show_stopwords parameter is present.
        get do
          info = {
            :ruby => RUBY_VERSION,
            :environment => Fuzzzy.env,
            :redis => Fuzzzy.redis.client.id,
            :root_dir => Fuzzzy.root.to_s
          }
          info[:stopwords] = Fuzzzy.stopwords if params[:show_stopwords]
          info
        end

        # Redis memory usage plus the contents of the counter hash.
        get 'indexes' do
          {
            :redis_size => Fuzzzy.redis.info['used_memory_human'],
            :indexes => Fuzzzy.redis.hgetall(Fuzzzy::Redis.counter_key)
          }
        end
      end

      resource :indexes do
        # curl /v1/indexes?index_name=city:name&index_method=ngram&query=search%20string
        get do
          search
        end

        # Same search, for clients that prefer to POST the query.
        post '/search' do
          search
        end

        # Adds an entry; requires :id and :dictionary_string in addition to
        # :index_name and :index_method.
        post do
          context = index_context
          check_context!(:id, :dictionary_string, context)
          _indexer(context[:index_method]).create_index(context)
        end

        # Removes an entry; requires :id in addition to :index_name and
        # :index_method.
        delete do
          context = index_context
          check_context!(:id, context)
          _indexer(context[:index_method]).delete_index(context)
        end
      end
    end
  end
end
|
data/spec/ngram/indexer_spec.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the n-gram indexer: n-gram generation, index creation and index
# deletion, checked against the keys actually present in Redis.
describe Fuzzzy::Ngram::Indexer do
  let(:counter_key) { Fuzzzy::Redis.counter_key }
  let(:id) { '12345' }
  let(:dictionary_string) { 'moscow' }
  let(:indexer) { Fuzzzy::Ngram::Indexer.new }
  let(:context) do
    {
      :index_name => 'city:name',
      :method => :ngram,
      :dictionary_string => dictionary_string,
      :id => id
    }
  end

  # Delete every key currently in Redis before each example.
  before do
    stale = Fuzzzy.redis.keys("*")
    Fuzzzy.redis.del(*stale) unless stale.empty?
  end

  describe '#ngrams' do
    # A two-character string comes back as a single gram.
    it do
      indexer.with_context(context) { indexer.ngrams('mo').should == ['mo'] }
    end

    # A three-character string comes back as a single gram.
    it do
      indexer.with_context(context) { indexer.ngrams('mos').should == ['mos'] }
    end

    # Without an argument the grams of the context's dictionary string
    # ('moscow') are expected.
    it do
      indexer.with_context(context) do
        indexer.ngrams.should == %w[mos osc sco cow]
      end
    end
  end

  describe '#create_index' do
    let(:dictionary_keys) { ["fuzzzy:city:name:dictionary:#{id}"] }
    let(:keys) { Fuzzzy.redis.keys }

    before { indexer.create_index(context) }

    # Four n-gram keys + one dictionary key + the counter key.
    it { keys.size.should == 6 }

    it do
      ngram_keys = %w[
        fuzzzy:city:name:ngram_i:mos:0
        fuzzzy:city:name:ngram_i:osc:1
        fuzzzy:city:name:ngram_i:sco:2
        fuzzzy:city:name:ngram_i:cow:3
      ]
      keys.should =~ ngram_keys + dictionary_keys + [counter_key]
    end

    it { Fuzzzy.redis.mget(*dictionary_keys).should == ['moscow'] }

    # Every n-gram set holds nothing but the indexed id.
    it do
      index_keys = keys - dictionary_keys - [counter_key]
      Fuzzzy.redis.sunion(*index_keys).should == [id]
    end

    it { Fuzzzy.redis.hgetall(counter_key).should == {'city:name' => '1'} }

    context 'with empty string' do
      let(:dictionary_string) { '' }

      it { keys.size.should == 0 }
    end

    context 'with nulled string' do
      let(:dictionary_string) { nil }

      it { keys.size.should == 0 }
    end

    context 'with multiple calls' do
      let(:another_id) { '11111' }
      let(:dictionary_keys) do
        [
          "fuzzzy:city:name:dictionary:#{id}",
          "fuzzzy:city:name:dictionary:#{another_id}"
        ]
      end

      # Runs after the outer hook, so 'moscow' is already indexed here.
      before do
        second_entry = context.merge(:dictionary_string => 'Mostyn', :id => another_id)
        indexer.create_index(second_entry)
      end

      it { keys.size.should == 10 }

      it do
        ngram_keys = %w[
          fuzzzy:city:name:ngram_i:mos:0
          fuzzzy:city:name:ngram_i:osc:1
          fuzzzy:city:name:ngram_i:sco:2
          fuzzzy:city:name:ngram_i:cow:3
          fuzzzy:city:name:ngram_i:ost:1
          fuzzzy:city:name:ngram_i:sty:2
          fuzzzy:city:name:ngram_i:tyn:3
        ]
        keys.should =~ ngram_keys + dictionary_keys + [counter_key]
      end

      # The second dictionary entry is expected back lower-cased.
      it do
        Fuzzzy.redis.mget(*dictionary_keys).should == [dictionary_string, 'mostyn']
      end

      it do
        index_keys = keys - dictionary_keys - [counter_key]
        Fuzzzy.redis.sunion(*index_keys).should =~ [id, another_id]
      end

      it { Fuzzzy.redis.hgetall(counter_key).should == {'city:name' => '2'} }
    end
  end

  describe '#delete_index' do
    let(:another_id) { '11111' }
    let(:dictionary_keys) { ["fuzzzy:city:name:dictionary:#{another_id}"] }
    let(:keys) { Fuzzzy.redis.keys }

    # Index two entries, then delete the first one again.
    before do
      indexer.create_index(context)
      second_entry = context.merge(:dictionary_string => 'Mostyn', :id => another_id)
      indexer.create_index(second_entry)
      indexer.delete_index(context)
    end

    it { keys.size.should == 6 }

    # Only the keys belonging to the surviving entry remain.
    it do
      ngram_keys = %w[
        fuzzzy:city:name:ngram_i:mos:0
        fuzzzy:city:name:ngram_i:ost:1
        fuzzzy:city:name:ngram_i:sty:2
        fuzzzy:city:name:ngram_i:tyn:3
      ]
      keys.should =~ ngram_keys + dictionary_keys + [counter_key]
    end

    it { Fuzzzy.redis.mget(*dictionary_keys).should == ['mostyn'] }

    it do
      index_keys = keys - dictionary_keys - [counter_key]
      Fuzzzy.redis.sunion(*index_keys).should =~ [another_id]
    end

    it { Fuzzzy.redis.hgetall(counter_key).should == {'city:name' => '1'} }
  end
end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the n-gram searcher: the positions probed for each query n-gram
# (#segment_points), the resulting Redis index keys (#index_keys) and
# end-to-end fuzzy lookups (#search).
describe Fuzzzy::Ngram::Searcher do
  let(:indexer) { Fuzzzy::Ngram::Indexer.new }
  let(:searcher) { Fuzzzy::Ngram::Searcher.new }
  let(:index_context) do
    {
      :index_name => 'city:name',
      :method => :ngram
    }
  end
  # Search context; +query_string+ is supplied by each example group.
  let(:context) { index_context.merge(:query => query_string, :distance => 1) }

  # Delete every key currently in Redis before each example.
  before do
    keys = Fuzzzy.redis.keys("*")
    Fuzzzy.redis.del(*keys) if keys.length > 0
  end

  describe '#segment_points' do
    # One row per scenario: description, :distance, :query and, for every
    # n-gram index of the query, the positions segment_points is expected to
    # yield (the trailing comment names the n-gram at that index).
    [
      ['when distance = 0', 0, 'moscow', [
        [0], # mos
        [1], # osc
        [2], # sco
        [3]  # cow
      ]],
      ['when distance = 1', 1, 'moscow', [
        [0, 1],    # mos
        [0, 1, 2], # osc
        [1, 2, 3], # sco
        [2, 3, 4]  # cow
      ]],
      ['when distance = 3', 3, 'moscow', [
        [0, 1, 2, 3],         # mos
        [0, 1, 2, 3, 4],      # osc
        [0, 1, 2, 3, 4, 5],   # sco
        [0, 1, 2, 3, 4, 5, 6] # cow
      ]],
      ['when distance = 3 and long word', 3, 'levenshtein', [
        [0, 1, 2, 3],           # lev
        [0, 1, 2, 3, 4],        # eve
        [0, 1, 2, 3, 4, 5],     # ven
        [0, 1, 2, 3, 4, 5, 6],  # ens
        [1, 2, 3, 4, 5, 6, 7],  # nsh
        [2, 3, 4, 5, 6, 7, 8],  # sht
        [3, 4, 5, 6, 7, 8, 9],  # hte
        [4, 5, 6, 7, 8, 9, 10], # tei
        [5, 6, 7, 8, 9, 10, 11] # ein
      ]]
    ].each do |description, distance, query, sample|
      context description do
        let(:result) { [] }

        # Collect everything segment_points yields for the current index.
        before do
          searcher.with_context(:distance => distance, :query => query) do
            searcher.segment_points(index) do |i|
              result << i
            end
          end
        end

        sample.each_with_index do |expected, idx|
          context "and index = #{idx}" do
            let(:index) { idx }

            specify { result.should == expected }
          end
        end
      end
    end
  end

  describe '#index_keys' do
    let(:query_string) { 'mascow' }

    # With distance 1 (from +context+) every n-gram of the query is looked up
    # at its own position and the neighbouring ones.
    specify do
      searcher.with_context(context) do
        searcher.index_keys.should =~ [
          searcher.index_key('mas', 0),
          searcher.index_key('mas', 1),
          searcher.index_key('asc', 0),
          searcher.index_key('asc', 1),
          searcher.index_key('asc', 2),
          searcher.index_key('sco', 1),
          searcher.index_key('sco', 2),
          searcher.index_key('sco', 3),
          searcher.index_key('cow', 2),
          searcher.index_key('cow', 3),
          searcher.index_key('cow', 4)
        ]
      end
    end
  end

  describe '#search' do
    let(:id) { '12345' }

    # Each context below supplies +dictionary_string+; index it under +id+
    # before searching.
    before do
      indexer.create_index(index_context.merge(
        :dictionary_string => dictionary_string,
        :id => id
      ))
    end

    context 'single word - #1' do
      let(:query_string) { 'mascow' }
      let(:dictionary_string) { 'moscow' }

      specify { searcher.search(context).should == [id] }
    end

    context 'single word - #2' do
      let(:query_string) { 'jenergija' }
      let(:dictionary_string) { 'energiya' }

      specify { searcher.search(context.merge(:distance => 2)).should == [id] }
    end

    # Previously also labelled "#2", which made the two groups
    # indistinguishable in the spec output.
    context 'single word - #3' do
      let(:query_string) { 'rhus' }
      let(:dictionary_string) { 'Aarhus' }

      specify { searcher.search(context.merge(:distance => 2)).should == [id] }
    end

    # TODO: no examples for multi-word queries yet.
    context 'many words' do
    end
  end
end
|