fuzzzy 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
module Fuzzzy
  # Lazily instantiates and caches one indexer and one searcher object per
  # fuzzy-matching method. The caches are plain hashes held in instance
  # variables of whatever object this module is mixed into.
  module Index
    # Returns the indexer for +method+, building it on first request.
    def _indexer(method)
      (@indexer ||= {})[method] ||= class_for(:indexer, method).new
    end

    # Returns the searcher for +method+, building it on first request.
    def _searcher(method)
      (@searcher ||= {})[method] ||= class_for(:searcher, method).new
    end

    # Resolves the class for a +type+ (:indexer / :searcher) and a +method+
    # by turning the path "fuzzzy/<method>/<type>" into a constant via
    # String#classify and String#constantize.
    def class_for(type, method)
      path = "fuzzzy/#{method}/#{type}"
      path.classify.constantize
    end
  end
end
@@ -0,0 +1,13 @@
1
module Fuzzzy
  # Dictionary bookkeeping shared by the concrete indexers.
  #
  # The including object must provide +redis+, +dictionary_key+,
  # +counter_key+ and +index_name+.
  module Indexer
    # Deletes the dictionary entry stored for +id+ and decrements the
    # entry counter kept for the current index in the counter hash.
    def delete_dictionary(id)
      redis.del(dictionary_key(id))
      redis.hincrby(counter_key, index_name, -1)
    end

    # Stores +string+ as the dictionary entry for +id+ and increments the
    # entry counter kept for the current index in the counter hash.
    #
    # The value written is the +string+ argument itself; previously the
    # argument was ignored in favour of +query_index_string+.
    def save_dictionary(id, string)
      redis.set(dictionary_key(id), string)
      redis.hincrby(counter_key, index_name, 1)
    end
  end
end
@@ -0,0 +1,32 @@
1
module Fuzzzy
  # Common base for indexers and searchers. Every operation runs inside a
  # "context" hash (options plus per-call scratch values) that is installed
  # by #with_context for the duration of a block.
  #
  # The context is held in an instance variable, so a single instance
  # carries at most one active context at a time.
  class MethodBase
    include Redis

    attr_reader :context

    # Installs a shallow copy of +cntx+ as the current context, yields, and
    # returns the block's value. Does nothing and returns nil when +cntx+
    # is nil or false.
    #
    # The context that was active before the call is restored afterwards
    # (also when the block raises or returns early), so nested calls no
    # longer wipe the outer context. Exceptions propagate unchanged.
    def with_context(cntx)
      return unless cntx

      previous = @context
      @context = cntx.dup
      begin
        yield
      ensure
        @context = previous
      end
    end

    # Name of the index the current context operates on.
    def index_name
      context[:index_name]
    end

    # Normalizes +string+ for indexing or querying:
    # * nil / false becomes an empty string;
    # * the text is downcased;
    # * the optional context[:filter] callable is applied;
    # * stopwords are removed when context[:strip_stopwords] is set.
    def prepare_string(string)
      return '' unless string

      str = string.downcase
      filter = context[:filter]
      str = filter.call(str) if filter.respond_to?(:call)
      str = (str.split - stopwords).join(' ') if context[:strip_stopwords]
      str
    end

    # Stopword list: context[:strip_stopwords] when it is an Array,
    # otherwise the global Fuzzzy.stopwords.
    def stopwords
      custom = context[:strip_stopwords]
      custom.is_a?(Array) ? custom : Fuzzzy.stopwords
    end
  end
end
@@ -0,0 +1,15 @@
1
module Fuzzzy
  module Ngram
    # Shared behaviour of the n-gram indexer and searcher.
    class Base < MethodBase
      # Splits +string+ (default: the prepared query/index string) into
      # overlapping three-character slices. Strings shorter than three
      # characters yield a single-element array holding the string itself.
      # Computed slices are memoized in the context under the string.
      def ngrams(string = nil)
        string ||= query_index_string
        return [string] if string.size < 3

        context[string] ||= Array.new(string.length - 2) { |offset| string[offset, 3] }
      end

      # Key segment that identifies n-gram indexes.
      def index_type
        'ngram_i'
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Fuzzzy
  module Ngram
    # Maintains the n-gram index: one set per (n-gram, position) pair
    # holding the ids of the entries that contain it.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoized in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)indexes the entry described by +cntx+ (:id, :dictionary_string
      # plus index options). Nothing is done when the prepared string is
      # empty. Any index previously stored for the id is removed first.
      def create_index(cntx)
        with_context(cntx) do
          prepared = query_index_string
          return if prepared.empty?

          delete_index

          id = context[:id]
          ngrams.each_with_index do |ngram, position|
            redis.sadd(index_key(ngram, position), id)
          end

          save_dictionary(id, prepared)
        end
      end

      # Removes the index entries recorded for context[:id], using the
      # stored dictionary string to find them. With +cntx+ the removal runs
      # in a fresh context; without it the current context is reused.
      def delete_index(cntx = nil)
        removal = lambda do
          id = context[:id]
          older_string = redis.get(dictionary_key(id))
          if older_string
            ngrams(older_string).each_with_index do |ngram, position|
              redis.srem(index_key(ngram, position), id)
            end

            delete_dictionary(id)
          end
        end

        if cntx
          with_context(cntx, &removal)
        else
          removal.call
        end
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
module Fuzzzy
  module Ngram
    # Looks up candidate ids through the n-gram index and filters them by
    # Levenshtein distance to the query.
    class Searcher < Base
      # C source for #segment_points. Equivalent Ruby:
      #
      #   def segment_points index
      #     right = distance + index
      #     left = index > distance ? (index - distance) : 0
      #     i = left
      #     while i <= right do
      #       yield i
      #       i += 1
      #     end
      #   end
      SEGMENT_POINTS = <<-EOC
        VALUE
        _segment_points(VALUE self, VALUE _index)
        {
          int index, distance, left, right, i;

          index = NUM2INT(_index);
          distance = NUM2INT(rb_funcall(self, rb_intern("distance"), 0));
          right = index + distance;

          if(index > distance) {
            left = index - distance;
          } else {
            left = 0;
          }

          for(i = left; i <= right; i++) {
            rb_yield(INT2NUM(i));
          }

          return Qnil;
        }
      EOC

      # Compiles the C source above into the instance method #segment_points.
      inline do |builder|
        builder.c_raw(SEGMENT_POINTS, :method_name => 'segment_points', :arity => 1)
      end

      # Runs a search described by +cntx+ (:query plus options such as
      # :distance, :sort_by, :with_cache). Returns [] for an empty prepared
      # query. Otherwise returns the matching ids, or the full match hashes
      # (:id, :distance, :alpha) when :with_cache is set.
      def search(cntx)
        with_context(cntx) do
          query = query_index_string
          return [] if query.empty?

          ids = redis.sunion(*index_keys)
          if ids
            ids.each do |id|
              string = redis.get(dictionary_key(id))
              dist = Levenshtein.distance(query, string)
              next unless dist <= distance

              result << { :id => id, :distance => dist, :alpha => string }
            end

            order = sort_by
            result.sort_by! { |item| item[order] } if order
            result.map! { |item| item[:id] } unless with_cache?
            result
          else
            []
          end
        end
      end

      # Index keys to union: for every n-gram of the query, one key per
      # position yielded by #segment_points for that n-gram's offset.
      def index_keys
        ngrams.each_with_index.each_with_object([]) do |(ngram, index), keys|
          segment_points(index) { |point| keys << index_key(ngram, point) }
        end
      end

      # Maximum accepted Levenshtein distance; defaults to 0 and the
      # default is written back into the context.
      def distance
        context[:distance] ||= 0
      end

      # Accumulator for matches, stored in the context.
      def result
        context[:result] ||= []
      end

      # Whether full match hashes should be returned instead of ids.
      def with_cache?
        context[:with_cache] ||= false
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoized in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Fuzzzy
  module Soundex
    # Shared behaviour of the Soundex indexer and searcher.
    class Base < MethodBase
      # Returns the Soundex code (as a String) of +string+, or of the
      # prepared query/index string when no argument is given.
      #
      # Only the no-argument form is memoized in context[:soundex]. An
      # explicitly passed string is always encoded afresh and never touches
      # the cache. Previously the first call, whatever its argument, fixed
      # the value returned by every later call in the same context, so
      # encoding an older string first made the current string resolve to
      # the older string's code.
      def soundex(string = nil)
        return Text::Soundex.soundex(string).to_s if string

        context[:soundex] ||= Text::Soundex.soundex(query_index_string).to_s
      end

      # Key segment that identifies Soundex indexes.
      def index_type
        'soundex_i'
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Fuzzzy
  module Soundex
    # Maintains the Soundex index: one set per Soundex code holding the ids
    # of the entries that produce it.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoized in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)indexes the entry described by +cntx+ (:id, :dictionary_string
      # plus index options). Nothing is done when the prepared string is
      # empty. Any index previously stored for the id is removed first.
      def create_index(cntx)
        with_context(cntx) do
          prepared = query_index_string
          return if prepared.empty?

          delete_index
          id = context[:id]
          redis.sadd(index_key(soundex), id)
          save_dictionary(id, prepared)
        end
      end

      # Removes the index entry recorded for context[:id], using the stored
      # dictionary string to locate it. With +cntx+ the removal runs in a
      # fresh context; without it the current context is reused.
      def delete_index(cntx = nil)
        removal = lambda do
          id = context[:id]
          older_string = redis.get(dictionary_key(id))
          if older_string
            redis.srem(index_key(soundex(older_string)), id)
            delete_dictionary(id)
          end
        end

        if cntx
          with_context(cntx, &removal)
        else
          removal.call
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module Fuzzzy
  module Soundex
    # Looks up ids that share the query's Soundex code and optionally
    # filters and sorts them by Levenshtein distance.
    class Searcher < Base
      # Runs a search described by +context+ (:query plus optional
      # :distance and :sort_by). Returns nil for an empty prepared query,
      # otherwise an array of matching ids.
      #
      # Note: inside this method the parameter +context+ shadows the
      # #context reader, so context[:distance] below reads the hash that
      # was passed in, not the copy installed by with_context.
      def search(context)
        with_context(context) do
          query = query_index_string
          return if query.empty?

          ids = redis.smembers(index_key(soundex))
          if ids
            matches = ids.map do |id|
              string = redis.get(dictionary_key(id))
              {
                :id => id,
                :distance => Levenshtein.distance(query, string),
                :alpha => string
              }
            end

            order = sort_by
            matches.sort_by! { |match| match[order] } if order

            limit = context[:distance]
            matches.reject! { |match| match[:distance] > limit } if limit

            matches.map { |match| match[:id] }
          else
            []
          end
        end
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoized in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
module Fuzzzy
  module Mongoid
    # Model mixin that keeps fuzzy indexes in sync with a document's
    # lifecycle and adds class-level search helpers.
    module Index
      extend ActiveSupport::Concern

      included do
        extend Fuzzzy::Index
        class_attribute :fuzzzy_indexes
        around_save :create_fuzzzy_indexes
        around_update :create_fuzzzy_indexes
        around_destroy :delete_fuzzzy_indexes
      end

      module ClassMethods
        # Declares a fuzzy index on +field+. +options+ are merged over
        # #default_options; :index_name is always derived from the model
        # and field. The caller's +options+ hash is left untouched.
        # Returns the stored settings hash.
        def define_fuzzzy_index(field, options = {})
          self.fuzzzy_indexes ||= {}
          settings = default_options.merge(options).merge(:index_name => index_name(field))
          self.fuzzzy_indexes[field.to_sym] = settings
        end

        # Forgets the index declaration for +field+. Returns the removed
        # settings, or nil when nothing was declared.
        def clear_fuzzzy_index(field)
          fuzzzy_indexes && fuzzzy_indexes.delete(field.to_sym)
        end

        # True once at least one index has been declared on this model.
        def has_fuzzzy_indexes?
          !!fuzzzy_indexes
        end

        # Options applied to every index unless overridden.
        def default_options
          { :method => :soundex }
        end

        # Indexer instance for +method+, or nil when the model declares no
        # fuzzy indexes.
        def indexer(method)
          return nil unless has_fuzzzy_indexes?

          _indexer(method)
        end

        # Searcher instance for +method+, or nil when the model declares no
        # fuzzy indexes.
        def searcher(method)
          return nil unless has_fuzzzy_indexes?

          _searcher(method)
        end

        # Searches the index declared on +field+ for +query+. +context+
        # entries override the declared options (and :query). Returns the
        # raw search result when only ids were requested, otherwise the
        # documents found through +scoped+; nil when the searcher returns
        # nil.
        #
        # Raises RuntimeError when +field+ has no index. The check now runs
        # before the settings are copied, so the intended message is raised
        # instead of an error from calling a method on nil.
        def search_by(field, query, context = {})
          settings = fuzzzy_indexes && fuzzzy_indexes[field.to_sym]
          raise "You have not fuzzy index for '#{field}' field" unless settings

          index_context = settings.merge(:query => query).merge(context)
          ids = searcher(index_context[:method]).search(index_context)

          (only_ids?(index_context) ? ids : scoped.find(ids)) if ids
        end

        # Whether +index_context+ asks for bare ids (:only_ids set and
        # :with_cache not set). The context is passed in explicitly: it is
        # a local of #search_by and was not reachable from here before.
        def only_ids?(index_context = {})
          index_context[:only_ids] && !index_context[:with_cache]
        end

        # Index name used for +field+: "<downcased model name>:<field>".
        def index_name(field)
          "#{name.downcase}:#{field}"
        end
      end

      # around_destroy callback: runs the destroy, then drops all indexes.
      def delete_fuzzzy_indexes(&block)
        change_indexes(:delete_index, &block)
      end

      # around_save / around_update callback: runs the persistence
      # operation, then re-indexes the fields that were changed.
      def create_fuzzzy_indexes(&block)
        change_indexes(:create_index, &block)
      end

      # Applies +command+ (:create_index or :delete_index) to the declared
      # indexes: every index for :delete_index, only changed fields
      # otherwise.
      #
      # The affected fields are picked before yielding, because +changed+
      # is read from the document's pre-persistence state. The given block
      # is then yielded so the wrapped callback chain actually executes; an
      # around callback that never yields prevents the operation from
      # running. Returns the block's value.
      def change_indexes(command, condition = nil)
        declared = self.class.fuzzzy_indexes || {}
        affected = declared.select do |field, _opts|
          command == :delete_index || changed.include?(field.to_s)
        end

        outcome = yield if block_given?

        affected.each do |field, opts|
          change_field_index(command, field, opts.dup)
        end

        outcome
      end

      # Sends +command+ to the indexer selected by options[:method], with
      # this document's id and the current value of +field+ merged into
      # +options+.
      def change_field_index(command, field, options)
        self.class.indexer(options[:method]).send(command, options.merge(
          :id => id,
          :dictionary_string => send(field)
        ))
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'inline'
2
module Fuzzzy
  # Redis access and key-naming helpers mixed into indexers and searchers.
  # Including objects must provide +context+, +index_name+ and +index_type+.
  module Redis
    # C source for #index_key. Equivalent Ruby:
    #
    #   def index_key key, key2=nil
    #     key = "#{key}:#{key2}" if key2
    #     "#{shared_key}:#{index_type}:#{key}"
    #   end
    INDEX_KEY = <<-EOC
      VALUE
      _index_key(int argc, VALUE *argv, VALUE self)
      {
        VALUE type = rb_funcall(self, rb_intern("index_type"), 0);
        VALUE shared_key = rb_funcall(self, rb_intern("shared_key"), 0);
        char sep[2] = ":";
        VALUE key, key2, result;
        char * buf;
        unsigned long long length;

        if(rb_scan_args(argc, argv, "11", &key, &key2) == 2) {
          key = rb_str_dup(key);
          rb_str_cat(key, sep, 1);
          rb_str_concat(key, rb_funcall(key2, rb_intern("to_s"), 0));
        }

        length = RSTRING_LEN(shared_key) + RSTRING_LEN(type) + RSTRING_LEN(key) + 4;
        buf = malloc(length);
        snprintf(buf, length, "%s:%s:%s", RSTRING_PTR(shared_key), RSTRING_PTR(type), RSTRING_PTR(key));
        result = rb_str_new2(buf);
        free(buf);

        return result;
      }
    EOC

    # Compiles the C source above into the instance method #index_key
    # (one required and one optional argument).
    inline do |builder|
      builder.add_compile_flags('-std=c99')
      builder.c_raw(INDEX_KEY, :method_name => 'index_key', :arity => -1)
    end

    # The Redis connection configured on Fuzzzy.
    def redis
      Fuzzzy.redis
    end

    # Key prefix common to everything stored for the current index,
    # memoized in the context.
    def shared_key
      context[:shared_key] ||= "fuzzzy:#{index_name}"
    end

    # Key under which the dictionary string of entry +id+ is stored.
    def dictionary_key(id)
      [shared_key, 'dictionary', id].join(':')
    end

    # Key of the hash that holds per-index entry counters.
    def counter_key
      "fuzzzy:indexes:info"
    end

    # Module-level variant of #counter_key for callers without an instance.
    def self.counter_key
      "fuzzzy:indexes:info"
    end
  end
end