RubyTrie 1.0 → 1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/ChangeLog +2 -1
  2. data/README +5 -0
  3. data/ext/trie/t.rb +20 -0
  4. data/ext/trie/trie.c +76 -0
  5. metadata +5 -4
data/ChangeLog CHANGED
@@ -1 +1,2 @@
1
- 1.0 added children and each methods
1
+ 1.0 added children and each methods
2
+ 1.1 added Levenshtein search implementation - inspired by: http://stevehanov.ca/blog/index.php?id=114
data/README CHANGED
@@ -27,6 +27,11 @@ t.children("go") => [1,2,3]
27
27
  t.each("partial key") will yield to the given block all values that are matched by the partial key
28
28
  t.each("go") {|v| puts v } => prints 1, 2 & 3
29
29
 
30
+ Levenshtein search
31
+
32
+ t.levenshtein_search("go", 2) - will return a [value, distance] pair for every word in the trie whose Levenshtein distance from the search word is at most 2
33
+ ex: [["go", 0], ["goes", 2], ["gone", 2]]
34
+
30
35
  == Bugs
31
36
 
32
37
  I'm sure there are plenty!
data/ext/trie/t.rb ADDED
@@ -0,0 +1,20 @@
1
# Ad-hoc benchmark for Trie#levenshtein_search.
#
# Loads every line of the system dictionary into a trie (timed as s1), runs a
# single fuzzy search (timed as s2), then prints the matches, the trie's memory
# footprint and both timings.
require 'trie'
require 'benchmark'

DICTIONARY_PATH = '/usr/share/dict/words'

t = Trie.new
c = 0
s1 = Benchmark.measure do
  # File.foreach closes the file when the block finishes; chomp strips only a
  # trailing line terminator, so a final line without "\n" keeps its last letter.
  File.foreach(DICTIONARY_PATH) do |line|
    word = line.chomp
    t[word] = word
    c += 1
  end
end

# Small fixed data set, handy when the system dictionary is not available:
# %w(gol golas golaster lux xal).each {|w| t[w] = w}
s2 = Benchmark.measure do
  # Each result is a [value, distance] pair; list the closest matches first.
  t.levenshtein_search('tread', 2).sort_by {|p| p.last }.each {|p| puts "#{p.last}: #{p.first}"}
end

puts "#{t.memory/(1024*1024)}Mb, #{c} words"
puts s1
puts s2
data/ext/trie/trie.c CHANGED
@@ -26,6 +26,7 @@ static trie_node * trie_new_node_with_char(char ch);
26
26
  static trie_node * trie_new_node();
27
27
  static VALUE rb_trie_find_children(VALUE self, VALUE key);
28
28
  static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
29
30
  static void trie_collect_values(void * t, VALUE prary);
30
31
  static void trie_collect_values_with_yield(void * t);
31
32
  static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
@@ -33,6 +34,8 @@ static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*l
33
34
  static void free_trie(trie_node * trie);
34
35
  static void count_nodes_callback(void *n, VALUE accum);
35
36
  static VALUE rb_trie_count_nodes(VALUE self);
37
+ static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
38
+ // int print_arr(char c, int* arr, int len);
36
39
 
37
40
 
38
41
  // ========================
@@ -172,6 +175,8 @@ void Init_trie() {
172
175
 
173
176
  arg_count = 2;
174
177
  rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
178
+ // trie.levenshtein_search(word, max_distance)
179
+ rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
175
180
  }
176
181
 
177
182
 
@@ -268,6 +273,77 @@ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
268
273
  return rary;
269
274
  }
270
275
 
276
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
277
+ trie_node *root;
278
+ trie_node *node;
279
+ char *word_cstring;
280
+ VALUE rary = rb_ary_new();
281
+ int i=0;
282
+
283
+ Data_Get_Struct(self, trie_node, root);
284
+
285
+ word_cstring = StringValuePtr(word);
286
+
287
+ int first_line[strlen(word_cstring) + 1];
288
+ for(; i < strlen(word_cstring) + 1; i++) {
289
+ first_line[i] = i;
290
+ }
291
+ // print_arr('R', first_line, strlen(word_cstring)+1);
292
+ recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
293
+
294
+ return rary;
295
+ }
296
+
297
/*
 * Returns the smallest of the first len elements of numbers.
 *
 * numbers - array to scan; it is only read.
 * len     - number of elements to consider; must be at least 1, because
 *           numbers[0] is read unconditionally.
 *
 * static: this is a file-local helper, so it should not export a generic
 * symbol name from the extension's shared object.
 */
static int minimum(const int *numbers, int len) {
  int smallest = numbers[0];
  int i;

  for (i = 1; i < len; i++) {
    if (numbers[i] < smallest) {
      smallest = numbers[i];
    }
  }
  return smallest;
}
305
+
306
/*
 * Returns the smallest of the three values a, b and c.
 *
 * static: this is a file-local helper, so it should not export a generic
 * symbol name from the extension's shared object.
 */
static int min3(int a, int b, int c) {
  int smallest = a;

  if (b < smallest) {
    smallest = b;
  }
  if (c < smallest) {
    smallest = c;
  }
  return smallest;
}
312
+
313
+ static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
314
+ int cur_line[word_length + 1];
315
+ int i,j, insert_cost, replace_cost, delete_cost;
316
+ VALUE carr;
317
+
318
+ cur_line[0] = prev_line[0] + 1;
319
+
320
+ for(i=1; i < word_length + 1; i++) {
321
+ insert_cost = cur_line[i-1] + 1;
322
+ delete_cost = prev_line[i] + 1;
323
+ if (trie->character != word[i-1]) {
324
+ replace_cost = prev_line[i-1] + 1;
325
+ } else {
326
+ replace_cost = prev_line[i-1];
327
+ }
328
+ cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
329
+ }
330
+
331
+
332
+ if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
333
+ carr = rb_ary_new();
334
+ rb_ary_push(carr, trie->value);
335
+ rb_ary_push(carr, INT2FIX(cur_line[word_length]));
336
+ rb_ary_push(rary, carr);
337
+ }
338
+
339
+ if (minimum(cur_line, word_length + 1) <= max_dist) {
340
+ if (trie->first_child != NULL)
341
+ recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
342
+ if (trie->next_sibling != NULL)
343
+ recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
344
+ }
345
+ }
346
+
271
347
  static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
272
348
  while(true) {
273
349
  if (node == NULL) return NULL;
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: RubyTrie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
- - 0
9
- version: "1.0"
8
+ - 1
9
+ version: "1.1"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Matt Freels
@@ -15,7 +15,7 @@ autorequire: trie
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-30 00:00:00 +03:00
18
+ date: 2011-01-16 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -32,6 +32,7 @@ files:
32
32
  - ChangeLog
33
33
  - ext/trie/trie.c
34
34
  - ext/trie/extconf.rb
35
+ - ext/trie/t.rb
35
36
  - lib/ruby_trie.rb
36
37
  - test/rubytrie.rb
37
38
  - test/trie_test.rb