fuzzzy 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
module Fuzzzy
  # Lazily instantiates and memoizes one indexer and one searcher object per
  # fuzzy-matching method name on whatever object this module is mixed into.
  module Index
    # Returns the memoized indexer for +method+, creating it on first use.
    def _indexer(method)
      (@indexer ||= {})[method] ||= class_for(:indexer, method).new
    end

    # Returns the memoized searcher for +method+, creating it on first use.
    def _searcher(method)
      (@searcher ||= {})[method] ||= class_for(:searcher, method).new
    end

    # Resolves the implementation class from the path
    # "fuzzzy/<method>/<type>" via String#classify and #constantize
    # (presumably the ActiveSupport string extensions -- confirm).
    def class_for(type, method)
      path = "fuzzzy/#{method}/#{type}"
      path.classify.constantize
    end
  end
end
@@ -0,0 +1,13 @@
1
module Fuzzzy
  # Bookkeeping shared by the concrete indexers: keeps the per-record
  # "dictionary" entry (the string that was indexed) and a per-index counter
  # stored in the hash at +counter_key+.
  #
  # The including class must provide +redis+, +dictionary_key+, +counter_key+
  # and +index_name+.
  module Indexer
    # Removes the dictionary entry for +id+ and decrements the counter of the
    # current index by one.
    def delete_dictionary(id)
      redis.del(dictionary_key(id))
      redis.hincrby(counter_key, index_name, -1)
    end

    # Stores +string+ as the dictionary entry for +id+ and increments the
    # counter of the current index by one.
    #
    # The value written is the +string+ argument itself; previously the
    # argument was ignored in favour of +query_index_string+, which only
    # worked because every visible caller passes that same value.
    def save_dictionary(id, string)
      redis.set(dictionary_key(id), string)
      redis.hincrby(counter_key, index_name, 1)
    end
  end
end
@@ -0,0 +1,32 @@
1
module Fuzzzy
  # Base class for indexers and searchers. Holds the per-call context hash
  # and the string normalisation shared by all matching methods.
  class MethodBase
    include Redis

    attr_reader :context

    # Runs the given block with a shallow copy of +cntx+ installed as the
    # current context and returns the block's value. Does nothing (and
    # returns nil) when +cntx+ is nil.
    #
    # The context that was active before the call is restored afterwards,
    # also when the block raises or returns early, so a nested call does not
    # wipe out the context of the enclosing one.
    def with_context(cntx)
      previous = @context
      return unless cntx

      @context = cntx.dup
      yield
    ensure
      @context = previous
    end

    # Name of the index as given in the context (:index_name).
    def index_name
      context[:index_name]
    end

    # Normalises +string+ for indexing/searching:
    # * nil becomes '';
    # * the string is downcased;
    # * context[:filter] is applied when it responds to #call;
    # * stopwords are removed when context[:strip_stopwords] is set.
    def prepare_string(string)
      return '' unless string

      str = string.downcase
      filter = context[:filter]
      str = filter.call(str) if filter && filter.respond_to?(:call)
      str = (str.split - stopwords).join(' ') if context[:strip_stopwords]
      str
    end

    # Stopword list: the Array given in context[:strip_stopwords] when one
    # was supplied, otherwise Fuzzzy.stopwords.
    def stopwords
      custom = context[:strip_stopwords]
      custom.is_a?(Array) ? custom : Fuzzzy.stopwords
    end
  end
end
@@ -0,0 +1,15 @@
1
module Fuzzzy
  module Ngram
    # Shared behaviour of the n-gram indexer and searcher.
    class Base < MethodBase
      # Splits +string+ (default: the prepared query/index string) into its
      # overlapping three-character substrings. Strings shorter than three
      # characters are returned as a single-element array. The result for
      # longer strings is cached in the context under the string itself.
      def ngrams(string = nil)
        string ||= query_index_string
        return [string] if string.size < 3

        context[string] ||= Array.new(string.length - 2) { |offset| string[offset, 3] }
      end

      # Segment used for this method's index keys.
      def index_type
        'ngram_i'
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Fuzzzy
  module Ngram
    # Maintains the positional n-gram index: every n-gram of a record's
    # prepared string is stored in a set keyed by the n-gram and its position.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoized in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)indexes the record described by +cntx+ (:id, :dictionary_string,
      # plus index options). Nothing is done when the prepared string is
      # empty; otherwise any previous index entries for the id are removed
      # first.
      def create_index(cntx)
        with_context(cntx) do
          prepared = query_index_string
          return if prepared.empty?

          delete_index

          record_id = context[:id]
          ngrams.each_with_index do |ngram, position|
            redis.sadd(index_key(ngram, position), record_id)
          end

          save_dictionary(record_id, prepared)
        end
      end

      # Removes the index entries for context[:id], using the string stored
      # in the dictionary to find the affected sets. Runs inside a fresh
      # context when +cntx+ is given, otherwise in the current one.
      def delete_index(cntx = nil)
        removal = lambda do
          record_id = context[:id]
          stored = redis.get(dictionary_key(record_id))
          next unless stored

          ngrams(stored).each_with_index do |ngram, position|
            redis.srem(index_key(ngram, position), record_id)
          end

          delete_dictionary(record_id)
        end

        cntx ? with_context(cntx, &removal) : removal.call
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
module Fuzzzy
  module Ngram
    # Looks up candidates through the positional n-gram index and filters
    # them by Levenshtein distance to the query.
    class Searcher < Base
      # Ruby implementation:
      # def segment_points index
      #   right = distance + index
      #   left = index > distance ? (index - distance) : 0
      #   i = left
      #   while i <= right do
      #     yield i
      #     i += 1
      #   end
      # end
      SEGMENT_POINTS = <<-EOC
        VALUE
        _segment_points(VALUE self, VALUE _index)
        {
          int index, distance, left, right, i;

          index = NUM2INT(_index);
          distance = NUM2INT(rb_funcall(self, rb_intern("distance"), 0));
          right = index + distance;

          if(index > distance) {
            left = index - distance;
          } else {
            left = 0;
          }

          for(i = left; i <= right; i++) {
            rb_yield(INT2NUM(i));
          }

          return Qnil;
        }
      EOC

      # Compiles the C source above as the instance method #segment_points.
      inline do |builder|
        builder.c_raw(SEGMENT_POINTS, :method_name => 'segment_points', :arity => 1)
      end

      # Searches with the options in +cntx+ (:query, :distance, :sort_by,
      # :with_cache, ...). Returns [] for an empty prepared query or when the
      # set union yields nothing; otherwise the matching ids, or the full
      # {:id, :distance, :alpha} hashes when :with_cache is set.
      def search(cntx)
        with_context(cntx) do
          query = query_index_string
          return [] if query.empty?

          ids = redis.sunion(*index_keys)
          return [] unless ids

          ids.each do |id|
            stored = redis.get(dictionary_key(id))
            dist = Levenshtein.distance(query, stored)
            next if dist > distance

            result << { :id => id, :distance => dist, :alpha => stored }
          end

          result.sort_by! { |item| item[sort_by] } if sort_by
          result.map! { |item| item[:id] } unless with_cache?
          result
        end
      end

      # Index keys for every n-gram of the query at every position yielded by
      # #segment_points for that n-gram's position.
      def index_keys
        ngrams.each_with_index.each_with_object([]) do |(ngram, position), keys|
          segment_points(position) { |point| keys << index_key(ngram, point) }
        end
      end

      # Maximum accepted edit distance; defaults to 0.
      def distance
        context[:distance] ||= 0
      end

      # Accumulator for matches, kept in the context.
      def result
        context[:result] ||= []
      end

      # Whether full match hashes (rather than bare ids) are returned.
      def with_cache?
        context[:with_cache] ||= false
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoized in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Fuzzzy
  module Soundex
    # Shared behaviour of the soundex indexer and searcher.
    class Base < MethodBase
      # Soundex code (as a String) of +string+, or of the prepared
      # query/index string when no argument is given.
      #
      # Only the no-argument form is memoized in context[:soundex]. An
      # explicit +string+ is always encoded on its own: caching it under the
      # same key made a later no-argument call return the code of whichever
      # string happened to be encoded first (e.g. the previously indexed
      # string during re-indexing).
      def soundex(string = nil)
        return Text::Soundex.soundex(string).to_s if string

        context[:soundex] ||= Text::Soundex.soundex(query_index_string).to_s
      end

      # Segment used for this method's index keys.
      def index_type
        'soundex_i'
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Fuzzzy
  module Soundex
    # Maintains the soundex index: a record id is stored in the set keyed by
    # the soundex code of its prepared string.
    class Indexer < Base
      include Fuzzzy::Indexer

      # Prepared form of context[:dictionary_string], memoized in the context.
      def query_index_string
        context[:prepared_dictionary_string] ||= prepare_string(context[:dictionary_string])
      end

      # (Re)indexes the record described by +cntx+. Nothing is done when the
      # prepared string is empty; otherwise any previous entry for the id is
      # removed before the new one is added.
      def create_index(cntx)
        with_context(cntx) do
          prepared = query_index_string
          return if prepared.empty?

          delete_index

          record_id = context[:id]
          redis.sadd(index_key(soundex), record_id)
          save_dictionary(record_id, prepared)
        end
      end

      # Removes the index entry for context[:id], located through the string
      # stored in the dictionary. Runs inside a fresh context when +cntx+ is
      # given, otherwise in the current one.
      def delete_index(cntx = nil)
        removal = lambda do
          record_id = context[:id]
          stored = redis.get(dictionary_key(record_id))
          next unless stored

          redis.srem(index_key(soundex(stored)), record_id)
          delete_dictionary(record_id)
        end

        cntx ? with_context(cntx, &removal) : removal.call
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module Fuzzzy
  module Soundex
    # Finds records whose indexed string has the same soundex code as the
    # query, optionally sorted and limited by Levenshtein distance.
    class Searcher < Base
      # Searches with the options in +context+ (:query, :distance, :sort_by).
      # Returns nil for an empty prepared query, [] when the set lookup
      # yields nothing, otherwise the matching ids.
      #
      # Note: inside this method +context+ is the argument itself, not the
      # copy installed by #with_context.
      def search(context)
        with_context(context) do
          return if query_index_string.empty?

          ids = redis.smembers(index_key(soundex))
          return [] unless ids

          matches = ids.map do |id|
            stored = redis.get(dictionary_key(id))
            {
              :id => id,
              :distance => Levenshtein.distance(query_index_string, stored),
              :alpha => stored
            }
          end

          matches.sort_by! { |item| item[sort_by] } if sort_by

          limit = context[:distance]
          matches.reject! { |item| item[:distance] > limit } if limit

          matches.map { |item| item[:id] }
        end
      end

      # Key of the match hash to sort by, if any.
      def sort_by
        context[:sort_by]
      end

      # Prepared form of context[:query], memoized in the context.
      def query_index_string
        context[:prepared_query] ||= prepare_string(context[:query])
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
module Fuzzzy
  module Mongoid
    # Mixin that keeps fuzzy indexes in sync with a model's fields through
    # persistence callbacks and adds class-level search helpers.
    module Index
      extend ActiveSupport::Concern

      included do
        extend Fuzzzy::Index
        class_attribute :fuzzzy_indexes
        # NOTE(review): save and update callbacks are both registered, so an
        # update presumably refreshes the index twice -- confirm whether
        # around_update is still needed.
        around_save :create_fuzzzy_indexes
        around_update :create_fuzzzy_indexes
        around_destroy :delete_fuzzzy_indexes
      end

      module ClassMethods
        # Registers a fuzzy index on +field+. +options+ are merged over
        # #default_options; :index_name is always derived from the class and
        # field. The caller's hash is left untouched.
        def define_fuzzzy_index(field, options = {})
          self.fuzzzy_indexes ||= {}
          settings = default_options.merge(options).merge(:index_name => index_name(field))
          fuzzzy_indexes[field.to_sym] = settings
        end

        # Forgets the index definition for +field+; returns the removed
        # options, or nil when nothing was defined.
        def clear_fuzzzy_index(field)
          fuzzzy_indexes && fuzzzy_indexes.delete(field.to_sym)
        end

        # True once at least one define_fuzzzy_index call has been made.
        def has_fuzzzy_indexes?
          !!fuzzzy_indexes
        end

        # Options applied to every index unless overridden.
        def default_options
          { :method => :soundex }
        end

        # Indexer instance for +method+, or nil when no index is defined.
        def indexer(method)
          return nil unless has_fuzzzy_indexes?

          _indexer(method)
        end

        # Searcher instance for +method+, or nil when no index is defined.
        def searcher(method)
          return nil unless has_fuzzzy_indexes?

          _searcher(method)
        end

        # Searches the index on +field+ for +query+. +context+ overrides the
        # index options for this call. Returns nil when the searcher returns
        # nil, the raw ids when #only_ids? holds, otherwise scoped.find(ids).
        #
        # Raises RuntimeError when no index is defined for +field+. The
        # lookup is checked before it is copied, so the error is raised even
        # when no index has been defined at all.
        def search_by(field, query, context = {})
          index_options = fuzzzy_indexes && fuzzzy_indexes[field.to_sym]
          raise "You have not fuzzy index for '#{field}' field" unless index_options

          index_context = index_options.merge(:query => query).merge(context)
          ids = searcher(index_context[:method]).search(index_context)
          return unless ids

          only_ids?(index_context) ? ids : scoped.find(ids)
        end

        # Whether a search with +index_context+ should return bare ids:
        # :only_ids is set and :with_cache is not. The context is passed in
        # explicitly because it is a local of #search_by and is not visible
        # from here.
        def only_ids?(index_context = {})
          !!(index_context[:only_ids] && !index_context[:with_cache])
        end

        # Index name: "<downcased class name>:<field>".
        def index_name(field)
          "#{name.downcase}:#{field}"
        end
      end

      # around_destroy callback: removes every index entry of the record.
      def delete_fuzzzy_indexes(&block)
        change_indexes(:delete_index, &block)
      end

      # around_save / around_update callback: refreshes the indexes of the
      # changed fields.
      def create_fuzzzy_indexes(&block)
        change_indexes(:create_index, &block)
      end

      # Applies +command+ (:create_index or :delete_index) to the record's
      # indexes. :delete_index touches every index; any other command only
      # those whose field is listed in #changed.
      #
      # When a block is given (the rest of an around callback chain) it is
      # called exactly once and its value is returned; an around callback
      # that never yields prevents the wrapped operation from running. The
      # affected fields are collected before yielding, on the assumption
      # that persisting resets dirty tracking, and the index is written
      # afterwards so a failing operation leaves the index untouched.
      #
      # +condition+ is unused and kept for signature compatibility.
      def change_indexes(command, condition = nil)
        targets = (self.class.fuzzzy_indexes || {}).select do |field, _opts|
          command == :delete_index || changed.include?(field.to_s)
        end

        outcome = block_given? ? yield : nil

        targets.each do |field, opts|
          change_field_index(command, field, opts.dup)
        end

        outcome
      end

      # Sends +command+ to the indexer for options[:method] with the index
      # options plus this record's id and the current value of +field+.
      def change_field_index(command, field, options)
        payload = options.merge(
          :id => id,
          :dictionary_string => send(field)
        )
        self.class.indexer(options[:method]).send(command, payload)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'inline'
2
module Fuzzzy
  # Redis access and key-building helpers mixed into indexers and searchers.
  # The including object must provide +context+, +index_name+ and
  # +index_type+.
  module Redis
    # Ruby implementation is:
    # def index_key key, key2=nil
    #   key = "#{key}:#{key2}" if key2
    #   "#{shared_key}:#{index_type}:#{key}"
    # end
    INDEX_KEY = <<-EOC
      VALUE
      _index_key(int argc, VALUE *argv, VALUE self)
      {
        VALUE type = rb_funcall(self, rb_intern("index_type"), 0);
        VALUE shared_key = rb_funcall(self, rb_intern("shared_key"), 0);
        char sep[2] = ":";
        VALUE key, key2, result;
        char * buf;
        unsigned long long length;

        if(rb_scan_args(argc, argv, "11", &key, &key2) == 2) {
          key = rb_str_dup(key);
          rb_str_cat(key, sep, 1);
          rb_str_concat(key, rb_funcall(key2, rb_intern("to_s"), 0));
        }

        length = RSTRING_LEN(shared_key) + RSTRING_LEN(type) + RSTRING_LEN(key) + 4;
        buf = malloc(length);
        snprintf(buf, length, "%s:%s:%s", RSTRING_PTR(shared_key), RSTRING_PTR(type), RSTRING_PTR(key));
        result = rb_str_new2(buf);
        free(buf);

        return result;
      }
    EOC

    # Compiles the C source above as the instance method #index_key
    # (one required and one optional argument).
    inline do |builder|
      builder.add_compile_flags('-std=c99')
      builder.c_raw(INDEX_KEY, :method_name => 'index_key', :arity => -1)
    end

    # The Redis connection configured on Fuzzzy.
    def redis
      Fuzzzy.redis
    end

    # Key prefix of the current index, memoized in the context.
    def shared_key
      cached = context[:shared_key]
      return cached if cached

      context[:shared_key] = "fuzzzy:#{index_name}"
    end

    # Key under which the indexed string of record +id+ is stored.
    def dictionary_key(id)
      prefix = shared_key
      "#{prefix}:dictionary:#{id}"
    end

    # Key of the hash holding the per-index counters.
    def counter_key
      "fuzzzy:indexes:info"
    end

    # Same key as the instance-level #counter_key, callable on the module.
    def self.counter_key
      "fuzzzy:indexes:info"
    end
  end
end