fuzzzy 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/README.md +97 -0
- data/Rakefile +11 -0
- data/benchmark/data/cities.csv +8875 -0
- data/benchmark/fuzzzy_benchmark.rb +128 -0
- data/benchmark/test_ngram.rb +36 -0
- data/benchmark/test_soundex.rb +36 -0
- data/config.ru +11 -0
- data/dictionary/en_stopwords.yml +175 -0
- data/fuzzzy.gemspec +44 -0
- data/lib/fuzzzy.rb +129 -0
- data/lib/fuzzzy/index.rb +17 -0
- data/lib/fuzzzy/methods/indexer.rb +13 -0
- data/lib/fuzzzy/methods/method_base.rb +32 -0
- data/lib/fuzzzy/methods/ngram/base.rb +15 -0
- data/lib/fuzzzy/methods/ngram/indexer.rb +37 -0
- data/lib/fuzzzy/methods/ngram/searcher.rb +95 -0
- data/lib/fuzzzy/methods/soundex/base.rb +13 -0
- data/lib/fuzzzy/methods/soundex/indexer.rb +30 -0
- data/lib/fuzzzy/methods/soundex/searcher.rb +36 -0
- data/lib/fuzzzy/orm/mongoid/index.rb +86 -0
- data/lib/fuzzzy/redis.rb +61 -0
- data/lib/fuzzzy/server/http.rb +99 -0
- data/lib/fuzzzy/version.rb +3 -0
- data/spec/config/mongoid.yml +7 -0
- data/spec/models/city.rb +8 -0
- data/spec/models/indexed_city.rb +9 -0
- data/spec/ngram/indexer_spec.rb +142 -0
- data/spec/ngram/searcher_spec.rb +194 -0
- data/spec/orm/mongoid/index_spec.rb +165 -0
- data/spec/redis_spec.rb +54 -0
- data/spec/soundex/indexer_spec.rb +40 -0
- data/spec/soundex/searcher_spec.rb +63 -0
- data/spec/spec_helper.rb +22 -0
- metadata +325 -0
data/lib/fuzzzy/index.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module Fuzzzy
  # Factory helpers that build, and then reuse, one indexer and one searcher
  # object per fuzzy-matching method name.
  module Index
    # Returns the indexer registered for +method+, creating it through
    # #class_for the first time it is requested.
    def _indexer(method)
      @indexer = {} unless @indexer
      @indexer[method] ||= class_for(:indexer, method).new
    end

    # Returns the searcher registered for +method+, creating it through
    # #class_for the first time it is requested.
    def _searcher(method)
      @searcher = {} unless @searcher
      @searcher[method] ||= class_for(:searcher, method).new
    end

    # Resolves the class for the given +type+ and +method+ by building the
    # path "fuzzzy/<method>/<type>" and passing it through
    # String#classify and String#constantize.
    def class_for(type, method)
      path = "fuzzzy/#{method}/#{type}"
      path.classify.constantize
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Fuzzzy
  # Dictionary bookkeeping shared by the indexers. The including object must
  # provide +redis+, +dictionary_key+, +counter_key+ and +index_name+.
  module Indexer
    # Removes the stored dictionary string for +id+ and decrements the
    # per-index counter held in the hash at +counter_key+.
    def delete_dictionary(id)
      redis.del(dictionary_key(id))
      redis.hincrby(counter_key, index_name, -1)
    end

    # Stores +string+ as the dictionary entry for +id+ and increments the
    # per-index counter held in the hash at +counter_key+.
    #
    # The +string+ argument is what gets written; it used to be ignored in
    # favour of +query_index_string+.
    def save_dictionary(id, string)
      redis.set(dictionary_key(id), string)
      redis.hincrby(counter_key, index_name, 1)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Fuzzzy
  # Common behaviour for indexers and searchers: a per-call context hash and
  # string normalisation driven by the options stored in that hash.
  class MethodBase
    include Redis

    attr_reader :context

    # Runs the given block with +context+ set to a shallow copy of +cntx+.
    # The context is always reset to nil afterwards, whether the block
    # finishes, exits early or raises. Returns nil without yielding when
    # +cntx+ is nil or false.
    def with_context(cntx)
      return nil unless cntx

      (@context = cntx.dup) && yield
    ensure
      @context = nil
    end

    # Name of the index taken from the current context.
    def index_name
      context[:index_name]
    end

    # Normalises +string+ for indexing or querying: downcases it, applies
    # the callable under context[:filter] when present, and drops stopwords
    # when context[:strip_stopwords] is set. Returns '' for a nil/false input.
    def prepare_string(string)
      return '' unless string

      prepared = string.dup.downcase

      filter = context[:filter]
      prepared = filter.call(prepared) if filter && filter.respond_to?(:call)

      if context[:strip_stopwords]
        prepared = (prepared.split - stopwords).join(' ')
      end

      prepared
    end

    # Stopword list: the Array given as context[:strip_stopwords] when one
    # was supplied, otherwise the list returned by Fuzzzy.stopwords.
    def stopwords
      custom = context[:strip_stopwords]
      custom.is_a?(Array) ? custom : Fuzzzy.stopwords
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Shared pieces of the n-gram indexer and searcher.
    class Base < MethodBase
      # Splits +string+ (or +query_index_string+ when omitted) into
      # overlapping three-character slices. Strings shorter than three
      # characters are returned as a single-element array. The computed list
      # is cached in the context hash under the string itself.
      def ngrams(string = nil)
        string ||= query_index_string
        return [string] if string.size < 3

        context[string] ||= Array.new(string.length - 2) { |offset| string[offset, 3] }
      end

      # Key segment that identifies this kind of index.
      def index_type
        'ngram_i'
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Maintains the Redis sets that map positioned n-grams to record ids.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoised in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)builds the index entries for the record described by +cntx+.
      # Nothing is written when the prepared string is empty. Any previous
      # entries for the same id are removed first.
      def create_index(cntx)
        with_context(cntx) do
          unless query_index_string.empty?
            delete_index

            ngrams.each_with_index do |ngram, position|
              redis.sadd(index_key(ngram, position), context[:id])
            end

            save_dictionary(context[:id], query_index_string)
          end
        end
      end

      # Removes the index entries derived from the dictionary string that is
      # currently stored for context[:id]. When +cntx+ is given the removal
      # runs inside its own context; otherwise the active context is used.
      def delete_index(cntx = nil)
        removal = lambda do
          previous = redis.get(dictionary_key(context[:id]))
          if previous
            ngrams(previous).each_with_index do |ngram, position|
              redis.srem(index_key(ngram, position), context[:id])
            end

            delete_dictionary(context[:id])
          end
        end

        if cntx
          with_context(cntx, &removal)
        else
          removal.call
        end
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Fuzzzy
  module Ngram
    # Looks up candidate ids through the positioned n-gram sets and filters
    # them by Levenshtein distance to the prepared query.
    class Searcher < Base
      # C source for #segment_points. It yields every position from
      # max(index - distance, 0) up to index + distance, inclusive.
      # Equivalent Ruby:
      #   def segment_points index
      #     right = distance + index
      #     left = index > distance ? (index - distance) : 0
      #     i = left
      #     while i <= right do
      #       yield i
      #       i += 1
      #     end
      #   end
      SEGMENT_POINTS = <<-EOC
        VALUE
        _segment_points(VALUE self, VALUE _index)
        {
        int index, distance, left, right, i;

        index = NUM2INT(_index);
        distance = NUM2INT(rb_funcall(self, rb_intern("distance"), 0));
        right = index + distance;

        if(index > distance) {
        left = index - distance;
        } else {
        left = 0;
        }

        for(i = left; i <= right; i++) {
        rb_yield(INT2NUM(i));
        }

        return Qnil;
        }
      EOC

      inline do |builder|
        builder.c_raw(SEGMENT_POINTS, :method_name => 'segment_points', :arity => 1)
      end

      # Runs a search described by +cntx+. Returns [] for an empty prepared
      # query or when Redis returns no id list. Otherwise returns the matches
      # whose distance does not exceed #distance, optionally sorted by the
      # #sort_by key, as ids (or as full hashes when #with_cache? is set).
      def search(cntx)
        with_context(cntx) do
          return [] if query_index_string.empty?

          ids = redis.sunion(*index_keys)
          return [] unless ids

          ids.each do |id|
            string = redis.get(dictionary_key(id))
            dist = Levenshtein.distance(query_index_string, string)
            next unless dist <= distance

            result << {
              :id => id,
              :distance => dist,
              :alpha => string
            }
          end

          result.sort_by! { |item| item[sort_by] } if sort_by
          result.map! { |item| item[:id] } unless with_cache?
          result
        end
      end

      # Every index key to union: for each query n-gram, one key per position
      # yielded by #segment_points around the n-gram's own position.
      def index_keys
        ngrams.each_with_index.each_with_object([]) do |(ngram, position), keys|
          segment_points(position) { |point| keys << index_key(ngram, point) }
        end
      end

      # Maximum accepted edit distance; defaults to 0 and is stored back into
      # the context.
      def distance
        context[:distance] ||= 0
      end

      # Accumulator for matches, kept in the context.
      def result
        context[:result] ||= []
      end

      # Whether full match hashes should be returned instead of bare ids.
      def with_cache?
        context[:with_cache] ||= false
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoised in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Fuzzzy
  module Soundex
    # Maintains the Redis sets that map a soundex code to record ids.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoised in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)builds the index entry for the record described by +cntx+.
      # Nothing is written when the prepared string is empty. Any previous
      # entry for the same id is removed first.
      def create_index(cntx)
        with_context(cntx) do
          unless query_index_string.empty?
            delete_index
            redis.sadd(index_key(soundex), context[:id])
            save_dictionary(context[:id], query_index_string)
          end
        end
      end

      # Removes the index entry derived from the dictionary string currently
      # stored for context[:id]. When +cntx+ is given the removal runs inside
      # its own context; otherwise the active context is used.
      def delete_index(cntx = nil)
        removal = lambda do
          previous = redis.get(dictionary_key(context[:id]))
          if previous
            redis.srem(index_key(soundex(previous)), context[:id])
            delete_dictionary(context[:id])
          end
        end

        if cntx
          with_context(cntx, &removal)
        else
          removal.call
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Fuzzzy
  module Soundex
    # Looks up ids that share the query's soundex code and ranks/filters them
    # by Levenshtein distance.
    class Searcher < Base
      # Runs a search described by +context+. Returns nil for an empty
      # prepared query and [] when Redis returns no member list; otherwise
      # returns the matching ids, optionally sorted by the #sort_by key and
      # limited to context[:distance].
      #
      # NOTE: inside this method the +context+ parameter shadows the
      # +context+ reader, so bare +context+ here is the hash passed in by the
      # caller, while helper methods read the copy made by with_context.
      def search(context)
        with_context(context) do
          return if query_index_string.empty?

          ids = redis.smembers(index_key(soundex))
          return [] unless ids

          matches = ids.map do |id|
            string = redis.get(dictionary_key(id))
            {
              :id => id,
              :distance => Levenshtein.distance(query_index_string, string),
              :alpha => string
            }
          end

          matches.sort_by! { |item| item[sort_by] } if sort_by
          if context[:distance]
            matches.reject! { |item| item[:distance] > context[:distance] }
          end
          matches.map { |item| item[:id] }
        end
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoised in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module Fuzzzy
  module Mongoid
    # Concern that keeps fuzzy indexes in sync with a model's lifecycle
    # callbacks and adds class-level search helpers.
    module Index
      extend ActiveSupport::Concern

      included do
        extend Fuzzzy::Index
        class_attribute :fuzzzy_indexes
        around_save :create_fuzzzy_indexes
        around_update :create_fuzzzy_indexes
        around_destroy :delete_fuzzzy_indexes
      end

      module ClassMethods
        # Registers a fuzzy index for +field+. +options+ are merged over
        # #default_options; :index_name is always derived from the class and
        # field name. The caller's +options+ hash is not modified.
        def define_fuzzzy_index(field, options = {})
          self.fuzzzy_indexes ||= {}
          self.fuzzzy_indexes[field.to_sym] =
            default_options.merge(options).merge(:index_name => index_name(field))
        end

        # Forgets the index definition for +field+. Returns nil when no
        # indexes have been defined at all.
        def clear_fuzzzy_index(field)
          fuzzzy_indexes.delete(field.to_sym) if fuzzzy_indexes
        end

        # True once at least one call to define_fuzzzy_index has been made.
        def has_fuzzzy_indexes?
          !!self.fuzzzy_indexes
        end

        # Options applied to every index unless overridden.
        def default_options
          {:method => :soundex}
        end

        # Indexer instance for +method+, or nil when no indexes are defined.
        def indexer(method)
          return nil unless has_fuzzzy_indexes?
          _indexer(method)
        end

        # Searcher instance for +method+, or nil when no indexes are defined.
        def searcher(method)
          return nil unless has_fuzzzy_indexes?
          _searcher(method)
        end

        # Searches the index defined on +field+ for +query+. +context+
        # entries override the stored index options for this call.
        # Returns the ids when #only_ids? holds for the merged options,
        # otherwise the documents loaded via scoped.find; nil when the
        # searcher returns nil.
        #
        # Raises RuntimeError when +field+ has no index. The lookup is
        # checked before being copied so that the error is reachable.
        def search_by(field, query, context = {})
          index_options = fuzzzy_indexes && fuzzzy_indexes[field.to_sym]
          raise "You have not fuzzy index for '#{field}' field" unless index_options

          index_context = index_options.dup
          index_context[:query] = query
          index_context.merge!(context)
          ids = searcher(index_context[:method]).search(index_context)

          (only_ids?(index_context) ? ids : scoped.find(ids)) if ids
        end

        # Whether a search with the given options should return bare ids.
        # The options are passed in explicitly because search_by's local
        # hash is not visible from here.
        def only_ids?(index_context = {})
          index_context[:only_ids] && !index_context[:with_cache]
        end

        # Index name of the form "<downcased class name>:<field>".
        def index_name(field)
          "#{self.name.downcase}:#{field}"
        end
      end

      # around_destroy callback: drops this record's index entries, then
      # lets the destroy proceed.
      def delete_fuzzzy_indexes(&block)
        change_indexes(:delete_index, &block)
      end

      # around_save / around_update callback: re-indexes changed fields,
      # then lets the save proceed.
      def create_fuzzzy_indexes(&block)
        change_indexes(:create_index, &block)
      end

      # Sends +command+ to the indexer of every defined index. Deletions
      # always apply; other commands apply only to fields listed in
      # +changed+. The given block (the rest of the callback chain) is
      # invoked afterwards, because an around callback that never yields
      # prevents the wrapped operation from running. +condition+ is unused.
      def change_indexes(command, condition = nil)
        (self.class.fuzzzy_indexes || {}).each do |field, opts|
          next unless command == :delete_index || self.changed.include?(field.to_s)

          change_field_index(command, field, opts.dup)
        end

        yield if block_given?
      end

      # Invokes +command+ on the indexer selected by options[:method],
      # passing the options plus this record's id and the field's value.
      def change_field_index(command, field, options)
        self.class.indexer(options[:method]).send(command, options.merge(
          :id => self.id,
          :dictionary_string => self.send(field)
        ))
      end
    end
  end
end
|
data/lib/fuzzzy/redis.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'inline'
|
2
|
+
module Fuzzzy
  # Redis connection access and key-building helpers mixed into the
  # indexers and searchers.
  module Redis
    # C source for #index_key. It joins shared_key, index_type and the key
    # with ':'; when a second argument is passed, its to_s is appended to
    # the key after another ':'. Roughly equivalent Ruby:
    #   def index_key key, key2=nil
    #     key = "#{key}:#{key2}" if key2
    #     "#{shared_key}:#{index_type}:#{key}"
    #   end
    INDEX_KEY = <<-EOC
      VALUE
      _index_key(int argc, VALUE *argv, VALUE self)
      {
      VALUE type = rb_funcall(self, rb_intern("index_type"), 0);
      VALUE shared_key = rb_funcall(self, rb_intern("shared_key"), 0);
      char sep[2] = ":";
      VALUE key, key2, result;
      char * buf;
      unsigned long long length;

      if(rb_scan_args(argc, argv, "11", &key, &key2) == 2) {
      key = rb_str_dup(key);
      rb_str_cat(key, sep, 1);
      rb_str_concat(key, rb_funcall(key2, rb_intern("to_s"), 0));
      }

      length = RSTRING_LEN(shared_key) + RSTRING_LEN(type) + RSTRING_LEN(key) + 4;
      buf = malloc(length);
      snprintf(buf, length, "%s:%s:%s", RSTRING_PTR(shared_key), RSTRING_PTR(type), RSTRING_PTR(key));
      result = rb_str_new2(buf);
      free(buf);

      return result;
      }
    EOC

    inline do |builder|
      builder.add_compile_flags('-std=c99')
      builder.c_raw(INDEX_KEY, :method_name => 'index_key', :arity => -1)
    end

    # The connection returned by Fuzzzy.redis.
    def redis
      Fuzzzy.redis
    end

    # Key prefix for the current index, memoised in the context.
    def shared_key
      context[:shared_key] ||= "fuzzzy:#{index_name}"
    end

    # Key under which the dictionary string for +id+ is stored.
    def dictionary_key(id)
      prefix = shared_key
      "#{prefix}:dictionary:#{id}"
    end

    # Key of the hash holding per-index counters.
    def counter_key
      "fuzzzy:indexes:info"
    end

    # Same key as the instance-level #counter_key, callable on the module.
    def self.counter_key
      "fuzzzy:indexes:info"
    end
  end
end
|