RubyTrie 1.0 → 1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/ChangeLog +2 -1
  2. data/README +5 -0
  3. data/ext/trie/t.rb +20 -0
  4. data/ext/trie/trie.c +76 -0
  5. metadata +5 -4
data/ChangeLog CHANGED
@@ -1 +1,2 @@
1
- 1.0 added children and each methods
1
+ 1.0 added children and each methods
2
+ 1.1 added Levenshtein search implementation - inspired by: http://stevehanov.ca/blog/index.php?id=114
data/README CHANGED
@@ -27,6 +27,11 @@ t.children("go") => [1,2,3]
27
27
  t.each("partial key") will yield to the given block all values that are matched by the partial key
28
28
  t.each("go") {|v| puts v } => prints 1, 2 & 3
29
29
 
30
+ Levenshtein search
31
+
32
+ t.levenshtein_search("go", 2) - will return [value, distance] pairs for all words in the trie whose Levenshtein distance from the search word is at most 2
33
+ ex: [["go", 0], ["goes", 2], ["gone", 2]]
34
+
30
35
  == Bugs
31
36
 
32
37
  I'm sure there are plenty!
data/ext/trie/t.rb ADDED
@@ -0,0 +1,20 @@
1
# Ad-hoc benchmark for the trie extension: loads the system word list into a
# Trie, runs one Levenshtein search, then prints memory use and both timings.
require 'trie'
require 'benchmark'

t = Trie.new
c = 0

# Time the bulk insert. File.foreach closes the file when iteration ends
# (a bare open(...).each_line leaves the handle open until GC), and chomp
# strips only the line terminator, so a final line without a trailing
# newline is not truncated.
s1 = Benchmark.measure do
  File.foreach('/usr/share/dict/words') do |line|
    w = line.chomp
    t[w] = w
    c += 1
  end
end

# %w(gol golas golaster lux xal).each {|w| t[w] = w}

# Time a single search; results are [value, distance] pairs, printed in
# ascending order of distance.
s2 = Benchmark.measure do
  t.levenshtein_search('tread', 2).sort_by {|p| p.last }.each {|p| puts "#{p.last}: #{p.first}"}
end

puts "#{t.memory/(1024*1024)}Mb, #{c} words"
puts s1
puts s2
data/ext/trie/trie.c CHANGED
@@ -26,6 +26,7 @@ static trie_node * trie_new_node_with_char(char ch);
26
26
  static trie_node * trie_new_node();
27
27
  static VALUE rb_trie_find_children(VALUE self, VALUE key);
28
28
  static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
29
30
  static void trie_collect_values(void * t, VALUE prary);
30
31
  static void trie_collect_values_with_yield(void * t);
31
32
  static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
@@ -33,6 +34,8 @@ static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*l
33
34
  static void free_trie(trie_node * trie);
34
35
  static void count_nodes_callback(void *n, VALUE accum);
35
36
  static VALUE rb_trie_count_nodes(VALUE self);
37
+ static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
38
+ // int print_arr(char c, int* arr, int len);
36
39
 
37
40
 
38
41
  // ========================
@@ -172,6 +175,8 @@ void Init_trie() {
172
175
 
173
176
  arg_count = 2;
174
177
  rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
178
+ // trie.levenshtein_search(word, max_distance)
179
+ rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
175
180
  }
176
181
 
177
182
 
@@ -268,6 +273,77 @@ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
268
273
  return rary;
269
274
  }
270
275
 
276
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
277
+ trie_node *root;
278
+ trie_node *node;
279
+ char *word_cstring;
280
+ VALUE rary = rb_ary_new();
281
+ int i=0;
282
+
283
+ Data_Get_Struct(self, trie_node, root);
284
+
285
+ word_cstring = StringValuePtr(word);
286
+
287
+ int first_line[strlen(word_cstring) + 1];
288
+ for(; i < strlen(word_cstring) + 1; i++) {
289
+ first_line[i] = i;
290
+ }
291
+ // print_arr('R', first_line, strlen(word_cstring)+1);
292
+ recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
293
+
294
+ return rary;
295
+ }
296
+
297
/*
 * Return the smallest of the first `len` entries of `numbers`.
 * numbers[0] is read unconditionally, so the caller must pass at least one
 * readable element.
 */
int minimum(int* numbers, int len) {
    int smallest = numbers[0];
    int idx = 1;

    while (idx < len) {
        int candidate = numbers[idx++];
        smallest = (candidate < smallest) ? candidate : smallest;
    }

    return smallest;
}
305
+
306
/* Return the smallest of the three integers a, b and c. */
int min3(int a, int b, int c) {
    int smaller_ab = (b < a) ? b : a;

    return (c < smaller_ab) ? c : smaller_ab;
}
312
+
313
+ static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
314
+ int cur_line[word_length + 1];
315
+ int i,j, insert_cost, replace_cost, delete_cost;
316
+ VALUE carr;
317
+
318
+ cur_line[0] = prev_line[0] + 1;
319
+
320
+ for(i=1; i < word_length + 1; i++) {
321
+ insert_cost = cur_line[i-1] + 1;
322
+ delete_cost = prev_line[i] + 1;
323
+ if (trie->character != word[i-1]) {
324
+ replace_cost = prev_line[i-1] + 1;
325
+ } else {
326
+ replace_cost = prev_line[i-1];
327
+ }
328
+ cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
329
+ }
330
+
331
+
332
+ if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
333
+ carr = rb_ary_new();
334
+ rb_ary_push(carr, trie->value);
335
+ rb_ary_push(carr, INT2FIX(cur_line[word_length]));
336
+ rb_ary_push(rary, carr);
337
+ }
338
+
339
+ if (minimum(cur_line, word_length + 1) <= max_dist) {
340
+ if (trie->first_child != NULL)
341
+ recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
342
+ if (trie->next_sibling != NULL)
343
+ recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
344
+ }
345
+ }
346
+
271
347
  static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
272
348
  while(true) {
273
349
  if (node == NULL) return NULL;
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: RubyTrie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
- - 0
9
- version: "1.0"
8
+ - 1
9
+ version: "1.1"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Matt Freels
@@ -15,7 +15,7 @@ autorequire: trie
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-30 00:00:00 +03:00
18
+ date: 2011-01-16 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -32,6 +32,7 @@ files:
32
32
  - ChangeLog
33
33
  - ext/trie/trie.c
34
34
  - ext/trie/extconf.rb
35
+ - ext/trie/t.rb
35
36
  - lib/ruby_trie.rb
36
37
  - test/rubytrie.rb
37
38
  - test/trie_test.rb