fuzzzy 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/README.md +97 -0
- data/Rakefile +11 -0
- data/benchmark/data/cities.csv +8875 -0
- data/benchmark/fuzzzy_benchmark.rb +128 -0
- data/benchmark/test_ngram.rb +36 -0
- data/benchmark/test_soundex.rb +36 -0
- data/config.ru +11 -0
- data/dictionary/en_stopwords.yml +175 -0
- data/fuzzzy.gemspec +44 -0
- data/lib/fuzzzy.rb +129 -0
- data/lib/fuzzzy/index.rb +17 -0
- data/lib/fuzzzy/methods/indexer.rb +13 -0
- data/lib/fuzzzy/methods/method_base.rb +32 -0
- data/lib/fuzzzy/methods/ngram/base.rb +15 -0
- data/lib/fuzzzy/methods/ngram/indexer.rb +37 -0
- data/lib/fuzzzy/methods/ngram/searcher.rb +95 -0
- data/lib/fuzzzy/methods/soundex/base.rb +13 -0
- data/lib/fuzzzy/methods/soundex/indexer.rb +30 -0
- data/lib/fuzzzy/methods/soundex/searcher.rb +36 -0
- data/lib/fuzzzy/orm/mongoid/index.rb +86 -0
- data/lib/fuzzzy/redis.rb +61 -0
- data/lib/fuzzzy/server/http.rb +99 -0
- data/lib/fuzzzy/version.rb +3 -0
- data/spec/config/mongoid.yml +7 -0
- data/spec/models/city.rb +8 -0
- data/spec/models/indexed_city.rb +9 -0
- data/spec/ngram/indexer_spec.rb +142 -0
- data/spec/ngram/searcher_spec.rb +194 -0
- data/spec/orm/mongoid/index_spec.rb +165 -0
- data/spec/redis_spec.rb +54 -0
- data/spec/soundex/indexer_spec.rb +40 -0
- data/spec/soundex/searcher_spec.rb +63 -0
- data/spec/spec_helper.rb +22 -0
- metadata +325 -0
data/lib/fuzzzy/index.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module Fuzzzy
  # Mixin that hands out one indexer and one searcher instance per matching
  # method (e.g. :ngram, :soundex), building each instance on first use and
  # reusing it afterwards.
  module Index
    # Returns the memoized indexer instance for +method+.
    def _indexer(method)
      registry = (@indexer ||= {})
      registry[method] ||= class_for(:indexer, method).new
    end

    # Returns the memoized searcher instance for +method+.
    def _searcher(method)
      registry = (@searcher ||= {})
      registry[method] ||= class_for(:searcher, method).new
    end

    # Resolves the class implementing +type+ (:indexer or :searcher) for
    # +method+ from the path "fuzzzy/<method>/<type>".
    def class_for(type, method)
      path = ['fuzzzy', method, type].join('/')
      path.classify.constantize
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Fuzzzy
  # Dictionary bookkeeping shared by the indexer classes. The including
  # class must provide +redis+, +dictionary_key+, +counter_key+ and
  # +index_name+.
  module Indexer
    # Deletes the dictionary entry stored for +id+ and decrements the
    # entry counter kept for the current index.
    def delete_dictionary(id)
      redis.del(dictionary_key(id))
      redis.hincrby(counter_key, index_name, -1)
    end

    # Stores +string+ as the dictionary entry for +id+ and increments the
    # entry counter kept for the current index.
    #
    # The +string+ argument is what gets written; previously it was ignored
    # in favour of +query_index_string+, which only matched because every
    # caller happened to pass that same value.
    def save_dictionary(id, string)
      redis.set(dictionary_key(id), string)
      redis.hincrby(counter_key, index_name, 1)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Fuzzzy
  # Common base for the matching-method classes: carries the options hash of
  # the operation in progress and normalizes strings before they are indexed
  # or searched.
  class MethodBase
    include Redis

    # Options hash of the operation currently running inside #with_context.
    attr_reader :context

    # Runs the block with a copy of +cntx+ installed as #context and returns
    # the block's value. Does nothing and returns nil when +cntx+ is nil.
    # Exceptions raised by the block propagate unchanged.
    def with_context(cntx)
      previous = @context
      return unless cntx

      @context = cntx.dup
      yield
    ensure
      # Put back whatever was installed before this call (nil at the top
      # level) so a nested call does not wipe the enclosing call's context.
      @context = previous
    end

    # Name of the index being operated on, read from the context.
    def index_name
      context[:index_name]
    end

    # Normalizes +string+ for indexing/searching: lower-cases it, passes it
    # through the optional context[:filter] callable, and drops stopwords
    # when context[:strip_stopwords] is set. Returns '' for a nil/false
    # +string+.
    def prepare_string(string)
      return '' unless string

      str = string.downcase
      filter = context[:filter]
      str = filter.call(str) if filter.respond_to?(:call)
      str = (str.split - stopwords).join(' ') if context[:strip_stopwords]
      str
    end

    # Stopword list to strip: the Array given as context[:strip_stopwords]
    # when one was supplied, otherwise the library-wide Fuzzzy.stopwords.
    def stopwords
      custom = context[:strip_stopwords]
      custom.is_a?(Array) ? custom : Fuzzzy.stopwords
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Behaviour shared by the n-gram indexer and searcher.
    class Base < MethodBase
      # Splits +string+ (default: the current query/index string) into its
      # overlapping three-character substrings. A string shorter than three
      # characters is returned as a single-element array. Results are cached
      # in the context, keyed by the string itself.
      def ngrams(string = nil)
        source = string || query_index_string
        return [source] if source.size < 3

        cached = context[source]
        return cached if cached

        context[source] = Array.new(source.length - 2) { |start| source[start, 3] }
      end

      # Key segment that identifies n-gram index entries.
      def index_type
        'ngram_i'
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Maintains the n-gram index: one Redis set per (trigram, position) pair
    # holding the ids whose dictionary string has that trigram at that
    # position.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Normalized form of context[:dictionary_string], computed once per
      # context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)builds the index entries for context[:id] from
      # context[:dictionary_string].
      #
      # Existing entries for the id are always removed first. Previously an
      # empty new string returned before the cleanup, so an id whose string
      # became empty kept its old index entries and dictionary string.
      def create_index(cntx)
        with_context(cntx) do
          delete_index
          return if query_index_string.empty?

          ngrams.each_with_index do |ngram, position|
            redis.sadd(index_key(ngram, position), context[:id])
          end

          save_dictionary(context[:id], query_index_string)
        end
      end

      # Removes every index entry and the dictionary string stored for
      # context[:id]. Does nothing when no dictionary string is stored.
      #
      # With +cntx+ the work runs inside its own context; without it the
      # context already installed by the caller is used.
      def delete_index(cntx = nil)
        purge = lambda do
          # The stored string, not the new one, says which sets hold the id.
          older_string = redis.get(dictionary_key(context[:id]))
          next unless older_string

          ngrams(older_string).each_with_index do |ngram, position|
            redis.srem(index_key(ngram, position), context[:id])
          end

          delete_dictionary(context[:id])
        end

        cntx ? with_context(cntx, &purge) : purge.call
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Searches the n-gram index: collects candidate ids from the trigram
    # sets around each query trigram position, then filters them by
    # Levenshtein distance to the query.
    class Searcher < Base
      # C source for #segment_points, compiled through the inline builder
      # below. It yields every position from max(index - distance, 0) up to
      # index + distance. The equivalent Ruby is:
      #
      #   def segment_points index
      #     right = distance + index
      #     left = index > distance ? (index - distance) : 0
      #     i = left
      #     while i <= right do
      #       yield i
      #       i += 1
      #     end
      #   end
      SEGMENT_POINTS = <<-EOC
        VALUE
        _segment_points(VALUE self, VALUE _index)
        {
          int index, distance, left, right, i;

          index = NUM2INT(_index);
          distance = NUM2INT(rb_funcall(self, rb_intern("distance"), 0));
          right = index + distance;

          if(index > distance) {
            left = index - distance;
          } else {
            left = 0;
          }

          for(i = left; i <= right; i++) {
            rb_yield(INT2NUM(i));
          }

          return Qnil;
        }
      EOC

      inline do |builder|
        builder.c_raw(SEGMENT_POINTS, :method_name => 'segment_points', :arity => 1)
      end

      # Runs a search described by +cntx+ (:query, optional :distance,
      # :sort_by, :with_cache, ...).
      #
      # Returns [] for an empty prepared query. Otherwise returns the
      # matches whose distance to the query is at most #distance: as plain
      # ids, or as {:id, :distance, :alpha} hashes when :with_cache is set.
      def search(cntx)
        with_context(cntx) do
          return [] if query_index_string.empty?

          ids = redis.sunion(*index_keys)
          return [] unless ids

          ids.each do |id|
            string = redis.get(dictionary_key(id))
            # An id can sit in an index set with no dictionary string; it
            # cannot be scored, so it is skipped instead of handing nil to
            # Levenshtein.
            next unless string

            dist = Levenshtein.distance(query_index_string, string)
            next unless dist <= distance

            result << { :id => id, :distance => dist, :alpha => string }
          end

          result.sort_by! { |item| item[sort_by] } if sort_by
          result.map! { |item| item[:id] } unless with_cache?
          result
        end
      end

      # Index keys to union: for each query trigram, its key at every
      # position yielded by #segment_points for the trigram's own position.
      def index_keys
        keys = []
        ngrams.each_with_index do |ngram, position|
          segment_points(position) do |point|
            keys << index_key(ngram, point)
          end
        end
        keys
      end

      # Maximum accepted edit distance; defaults to 0.
      def distance
        context[:distance] ||= 0
      end

      # Accumulator for matches, kept in the context.
      def result
        context[:result] ||= []
      end

      # Whether matches are returned as full hashes instead of bare ids.
      def with_cache?
        context[:with_cache] ||= false
      end

      # Key of the match hash to sort by, or nil for no sorting.
      def sort_by
        context[:sort_by]
      end

      # Normalized form of context[:query], computed once per context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Fuzzzy
  module Soundex
    # Maintains the soundex index: one Redis set per soundex code holding
    # the ids whose dictionary string maps to that code.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Normalized form of context[:dictionary_string], computed once per
      # context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)builds the index entry for context[:id] from
      # context[:dictionary_string].
      #
      # The existing entry for the id is always removed first. Previously an
      # empty new string returned before the cleanup, so an id whose string
      # became empty kept its old index entry and dictionary string.
      def create_index(cntx)
        with_context(cntx) do
          delete_index
          return if query_index_string.empty?

          redis.sadd(index_key(soundex), context[:id])
          save_dictionary(context[:id], query_index_string)
        end
      end

      # Removes the index entry and the dictionary string stored for
      # context[:id]. Does nothing when no dictionary string is stored.
      #
      # With +cntx+ the work runs inside its own context; without it the
      # context already installed by the caller is used.
      def delete_index(cntx = nil)
        purge = lambda do
          # The stored string, not the new one, says which set holds the id.
          older_string = redis.get(dictionary_key(context[:id]))
          next unless older_string

          redis.srem(index_key(soundex(older_string)), context[:id])
          delete_dictionary(context[:id])
        end

        cntx ? with_context(cntx, &purge) : purge.call
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Fuzzzy
  module Soundex
    # Searches the soundex index: every id stored under the query's soundex
    # code is a candidate, scored by Levenshtein distance to the query.
    class Searcher < Base
      # Runs a search described by +context+ (:query, optional :distance and
      # :sort_by).
      #
      # Returns nil for an empty prepared query, otherwise an array of ids,
      # sorted by :sort_by when given and limited to matches within
      # :distance when given.
      #
      # NOTE(review): the +context+ parameter shadows any +context+ reader
      # inside this method body; helpers called from here do not see the
      # parameter and presumably read the copy installed by with_context.
      def search(context)
        with_context(context) do
          return if query_index_string.empty?

          ids = redis.smembers(index_key(soundex))
          return [] unless ids

          result = ids.each_with_object([]) do |id, found|
            string = redis.get(dictionary_key(id))
            # An id can sit in the index set with no dictionary string; it
            # cannot be scored, so it is skipped instead of handing nil to
            # Levenshtein.
            next unless string

            found << {
              :id => id,
              :distance => Levenshtein.distance(query_index_string, string),
              :alpha => string
            }
          end

          result.sort_by! { |item| item[sort_by] } if sort_by
          result.reject! { |item| item[:distance] > context[:distance] } if context[:distance]
          result.map { |item| item[:id] }
        end
      end

      # Key of the match hash to sort by, or nil for no sorting.
      def sort_by
        context[:sort_by]
      end

      # Normalized form of context[:query], computed once per context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module Fuzzzy
  module Mongoid
    # Model mixin that keeps fuzzy indexes in step with document changes and
    # adds class-level fuzzy search.
    module Index
      extend ActiveSupport::Concern

      included do
        extend Fuzzzy::Index
        class_attribute :fuzzzy_indexes
        # NOTE(review): both the save and the update callbacks rebuild the
        # indexes; if update callbacks run nested inside save callbacks the
        # work happens twice per update. Confirm before removing either.
        around_save :create_fuzzzy_indexes
        around_update :create_fuzzzy_indexes
        around_destroy :delete_fuzzzy_indexes
      end

      module ClassMethods
        # Declares a fuzzy index on +field+. +options+ are merged over
        # #default_options; :index_name is always derived from the class
        # and field. The caller's +options+ hash is left untouched.
        def define_fuzzzy_index(field, options = {})
          self.fuzzzy_indexes ||= {}
          settings = default_options.merge(options)
          settings[:index_name] = index_name(field)
          self.fuzzzy_indexes[field.to_sym] = settings
        end

        # Forgets the index declaration for +field+ and returns it.
        def clear_fuzzzy_index(field)
          self.fuzzzy_indexes.delete(field.to_sym)
        end

        # True once at least one index has been declared on this class.
        def has_fuzzzy_indexes?
          !!self.fuzzzy_indexes
        end

        # Options applied to every index unless overridden.
        def default_options
          {:method => :soundex}
        end

        # Indexer instance for +method+, or nil when the class declares no
        # indexes.
        def indexer(method)
          return nil unless has_fuzzzy_indexes?
          _indexer(method)
        end

        # Searcher instance for +method+, or nil when the class declares no
        # indexes.
        def searcher(method)
          return nil unless has_fuzzzy_indexes?
          _searcher(method)
        end

        # Searches the index declared on +field+ for +query+. +context+ is
        # merged over the declared index options.
        #
        # Returns nil when the searcher returns nil; otherwise the raw
        # search result when #only_ids? holds, else scoped.find(ids).
        # Raises RuntimeError when +field+ has no declared index.
        def search_by(field, query, context = {})
          declared = fuzzzy_indexes && fuzzzy_indexes[field.to_sym]
          # Check before duplicating: nil.dup either raises or returns nil
          # depending on the Ruby version, which hid this message.
          raise "You have not fuzzy index for '#{field}' field" unless declared

          index_context = declared.dup
          index_context[:query] = query
          index_context.merge!(context)
          ids = searcher(index_context[:method]).search(index_context)

          (only_ids?(index_context) ? ids : scoped.find(ids)) if ids
        end

        # Whether a search run with +index_context+ should return the
        # searcher's result as-is: :only_ids is set and :with_cache is not.
        #
        # The context is passed in explicitly; the previous version read a
        # local variable of #search_by and raised NameError when called.
        def only_ids?(index_context = {})
          index_context[:only_ids] && !index_context[:with_cache]
        end

        # Index name for +field+: "<downcased class name>:<field>".
        def index_name(field)
          "#{self.name.downcase}:#{field}"
        end
      end

      # Around-destroy callback: removes this document's index entries, then
      # runs the wrapped operation.
      def delete_fuzzzy_indexes(&block)
        change_indexes(:delete_index, &block)
      end

      # Around-save/update callback: rebuilds index entries for changed
      # fields, then runs the wrapped operation.
      def create_fuzzzy_indexes(&block)
        change_indexes(:create_index, &block)
      end

      # Sends +command+ (:create_index or :delete_index) to the indexer of
      # every declared index. :delete_index applies to all fields;
      # other commands only to fields listed in #changed. Afterwards yields
      # to the given block, if any, and returns its value.
      #
      # The yield is what lets the around callbacks above continue to the
      # wrapped operation; without it the block they receive is dropped.
      # +condition+ is accepted for compatibility and not used.
      def change_indexes(command, condition = nil)
        (self.class.fuzzzy_indexes || {}).each do |(field, opts)|
          next unless command == :delete_index || changed.include?(field.to_s)

          change_field_index(command, field, opts.dup)
        end

        yield if block_given?
      end

      # Sends +command+ to the indexer for options[:method], passing
      # +options+ plus this document's id and the current value of +field+.
      def change_field_index(command, field, options)
        self.class.indexer(options[:method]).send(command, options.merge(
          :id => self.id,
          :dictionary_string => self.send(field)
        ))
      end
    end
  end
end
|
data/lib/fuzzzy/redis.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'inline'
|
2
|
+
module Fuzzzy
  # Redis key helpers mixed into the matching-method classes. The including
  # class must provide +context+, +index_name+ and +index_type+.
  module Redis
    # C source for #index_key, compiled through the inline builder below.
    # It takes one or two arguments; when a second one is passed it is
    # appended to the first as ":<key2.to_s>". The result is
    # "<shared_key>:<index_type>:<key>". The equivalent Ruby is roughly:
    #
    #   def index_key key, key2=nil
    #     key = "#{key}:#{key2}" if key2
    #     "#{shared_key}:#{index_type}:#{key}"
    #   end
    INDEX_KEY = <<-EOC
      VALUE
      _index_key(int argc, VALUE *argv, VALUE self)
      {
        VALUE type = rb_funcall(self, rb_intern("index_type"), 0);
        VALUE shared_key = rb_funcall(self, rb_intern("shared_key"), 0);
        char sep[2] = ":";
        VALUE key, key2, result;
        char * buf;
        unsigned long long length;

        if(rb_scan_args(argc, argv, "11", &key, &key2) == 2) {
          key = rb_str_dup(key);
          rb_str_cat(key, sep, 1);
          rb_str_concat(key, rb_funcall(key2, rb_intern("to_s"), 0));
        }

        length = RSTRING_LEN(shared_key) + RSTRING_LEN(type) + RSTRING_LEN(key) + 4;
        buf = malloc(length);
        snprintf(buf, length, "%s:%s:%s", RSTRING_PTR(shared_key), RSTRING_PTR(type), RSTRING_PTR(key));
        result = rb_str_new2(buf);
        free(buf);

        return result;
      }
    EOC

    inline do |builder|
      builder.add_compile_flags('-std=c99')
      builder.c_raw(INDEX_KEY, :method_name => 'index_key', :arity => -1)
    end

    # The Redis connection configured on the Fuzzzy module.
    def redis
      Fuzzzy.redis
    end

    # Key prefix for the current index, "fuzzzy:<index_name>", cached in the
    # context under :shared_key.
    def shared_key
      cached = context[:shared_key]
      return cached if cached

      context[:shared_key] = "fuzzzy:#{index_name}"
    end

    # Key under which the dictionary string for +id+ is stored.
    def dictionary_key(id)
      format('%s:dictionary:%s', shared_key, id)
    end

    # Key of the hash holding per-index entry counters.
    def counter_key
      Redis.counter_key
    end

    # Same key as the instance-level #counter_key, callable on the module.
    def self.counter_key
      "fuzzzy:indexes:info"
    end
  end
end
|