RubyTrie 1.0 → 1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +2 -1
- data/README +5 -0
- data/ext/trie/t.rb +20 -0
- data/ext/trie/trie.c +76 -0
- metadata +5 -4
data/ChangeLog
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
1.0 added children and each methods
|
1
|
+
1.0 added children and each methods
|
2
|
+
1.1 added a Levenshtein distance search implementation - inspired by: http://stevehanov.ca/blog/index.php?id=114
|
data/README
CHANGED
@@ -27,6 +27,11 @@ t.children("go") => [1,2,3]
|
|
27
27
|
t.each("partial key") will yield to the given block all values that are matched by the partial key
|
28
28
|
t.each("go") {|v| puts v } => prints 1, 2 & 3
|
29
29
|
|
30
|
+
Levenshtein search
|
31
|
+
|
32
|
+
t.levenshtein_search("go", 2) - will return [value, distance] pairs for all words in the trie whose Levenshtein distance from the search word is at most 2
|
33
|
+
ex: [["go", 0], ["goes", 2], ["gone", 2]]
|
34
|
+
|
30
35
|
== Bugs
|
31
36
|
|
32
37
|
I'm sure there are plenty!
|
data/ext/trie/t.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'trie'
require 'benchmark'

# Benchmark script for Trie#levenshtein_search.
#
# Loads every line of the system word list into a trie (key == value),
# runs one fuzzy search, then prints the trie's memory usage, the number
# of words loaded and both timings.

trie = Trie.new
word_count = 0

load_time = Benchmark.measure do
  # File.foreach closes the file when the block finishes; a bare
  # open(...).each_line would leave the handle open until GC.
  File.foreach('/usr/share/dict/words') do |line|
    # chomp only strips a trailing line terminator, so a final line
    # without a newline keeps its last character (chop would drop it).
    word = line.chomp
    trie[word] = word
    word_count += 1
  end
end

# Small alternative data set for quick manual checks:
# %w(gol golas golaster lux xal).each {|w| trie[w] = w}

search_time = Benchmark.measure do
  # Each result is a [value, distance] pair; print closest matches first.
  trie.levenshtein_search('tread', 2).sort_by {|pair| pair.last }.each do |pair|
    puts "#{pair.last}: #{pair.first}"
  end
end

puts "#{trie.memory/(1024*1024)}Mb, #{word_count} words"
puts load_time
puts search_time
|
data/ext/trie/trie.c
CHANGED
@@ -26,6 +26,7 @@ static trie_node * trie_new_node_with_char(char ch);
|
|
26
26
|
static trie_node * trie_new_node();
|
27
27
|
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
28
28
|
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
+
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
29
30
|
static void trie_collect_values(void * t, VALUE prary);
|
30
31
|
static void trie_collect_values_with_yield(void * t);
|
31
32
|
static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
|
@@ -33,6 +34,8 @@ static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*l
|
|
33
34
|
static void free_trie(trie_node * trie);
|
34
35
|
static void count_nodes_callback(void *n, VALUE accum);
|
35
36
|
static VALUE rb_trie_count_nodes(VALUE self);
|
37
|
+
/* Explicit int: implicit-int declarations were removed from the language in C99. */
static int recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
|
38
|
+
// int print_arr(char c, int* arr, int len);
|
36
39
|
|
37
40
|
|
38
41
|
// ========================
|
@@ -172,6 +175,8 @@ void Init_trie() {
|
|
172
175
|
|
173
176
|
arg_count = 2;
|
174
177
|
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
178
|
+
// trie.levenshtein_search(word, max_distance)
|
179
|
+
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
175
180
|
}
|
176
181
|
|
177
182
|
|
@@ -268,6 +273,77 @@ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
|
|
268
273
|
return rary;
|
269
274
|
}
|
270
275
|
|
276
|
+
/*
 * Trie#levenshtein_search(word, max_distance)
 *
 * Collects every stored value whose key is within max_distance edits of
 * word.  Returns a Ruby array of two-element arrays: [value, distance].
 *
 * word         - Ruby String (or object convertible to one); raises if it
 *                contains an embedded NUL, since keys are compared as
 *                C strings.
 * max_distance - Ruby Integer; converted with NUM2INT, which raises on
 *                non-numeric or out-of-range input.
 */
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
  trie_node *root;
  VALUE rary = rb_ary_new();

  Data_Get_Struct(self, trie_node, root);

  /* StringValueCStr guarantees a NUL-terminated buffer for strlen below. */
  char *word_cstring = StringValueCStr(word);
  int max_dist = NUM2INT(max_distance);

  /* Measure the word once instead of on every loop iteration. */
  int word_length = (int)strlen(word_cstring);

  /* Row 0 of the edit-distance table: distance from the empty prefix to
     the first i characters of word is i. */
  int first_line[word_length + 1];
  int i;
  for (i = 0; i <= word_length; i++) {
    first_line[i] = i;
  }

  /* The search starts at root->next_sibling; skip it when there is no
     such node so the recursive search never dereferences NULL. */
  if (root->next_sibling != NULL) {
    recursive_levenshtein_search(root->next_sibling, rary, first_line, max_dist, word_cstring, word_length);
  }

  return rary;
}
|
296
|
+
|
297
|
+
/*
 * Returns the smallest element of numbers[0 .. len-1].
 *
 * Precondition: len >= 1 -- numbers[0] is read unconditionally.
 * File-private helper (static) so the generic name is not exported from
 * the extension's shared object.
 */
static int minimum(const int *numbers, int len) {
  int min_value = numbers[0];
  int i;
  for (i = 1; i < len; i++) {
    if (numbers[i] < min_value) min_value = numbers[i];
  }
  return min_value;
}
|
305
|
+
|
306
|
+
/*
 * Returns the smallest of the three integers a, b and c.
 * File-private helper (static) so the generic name is not exported from
 * the extension's shared object.
 */
static int min3(int a, int b, int c) {
  int smallest = a;
  if (b < smallest) smallest = b;
  if (c < smallest) smallest = c;
  return smallest;
}
|
312
|
+
|
313
|
+
/*
 * Walks trie and all of its next_sibling nodes, descending into
 * first_child where a match is still possible, and appends a
 * [value, distance] pair to rary for every node that holds a value and
 * whose edit-distance to word is <= max_dist.
 *
 * trie        - first node of a sibling chain; NULL is accepted and
 *               yields no results.
 * rary        - Ruby array receiving the [value, distance] pairs.
 * prev_line   - edit-distance row of the parent prefix; must hold
 *               word_length + 1 entries.  It is only read.
 * max_dist    - largest distance that is still reported.
 * word        - search word as a C string.
 * word_length - number of characters of word to compare.
 *
 * Always returns 0; the result is unused.  The return type is int so the
 * definition stays compatible with the file's forward declaration.
 */
static int recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
  /* One row is enough for the whole sibling chain: it is recomputed for
     each sibling, and a child call has finished before it is reused. */
  int cur_line[word_length + 1];
  trie_node *node;
  int i, insert_cost, replace_cost, delete_cost;
  VALUE carr;

  /* Siblings are visited in a loop rather than by recursion, so every
     sibling is examined even when an earlier one was pruned.  All of
     them share the same parent row (prev_line). */
  for (node = trie; node != NULL; node = node->next_sibling) {
    cur_line[0] = prev_line[0] + 1;

    for (i = 1; i < word_length + 1; i++) {
      insert_cost = cur_line[i-1] + 1;
      delete_cost = prev_line[i] + 1;
      if (node->character != word[i-1]) {
        replace_cost = prev_line[i-1] + 1;
      } else {
        replace_cost = prev_line[i-1];
      }
      cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
    }

    /* The last cell is the distance between word and the key that ends
       at this node; report it only when the node holds a value. */
    if (cur_line[word_length] <= max_dist && node->value != Qnil) {
      carr = rb_ary_new();
      rb_ary_push(carr, node->value);
      rb_ary_push(carr, INT2FIX(cur_line[word_length]));
      rb_ary_push(rary, carr);
    }

    /* Prune only this node's subtree: descend only while some cell of
       the row is still within max_dist. */
    if (node->first_child != NULL && minimum(cur_line, word_length + 1) <= max_dist) {
      recursive_levenshtein_search(node->first_child, rary, cur_line, max_dist, word, word_length);
    }
  }

  return 0;
}
|
346
|
+
|
271
347
|
static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
|
272
348
|
while(true) {
|
273
349
|
if (node == NULL) return NULL;
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: RubyTrie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 0
|
9
|
-
version: "1.0"
|
8
|
+
- 1
|
9
|
+
version: "1.1"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Matt Freels
|
@@ -15,7 +15,7 @@ autorequire: trie
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-16 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -32,6 +32,7 @@ files:
|
|
32
32
|
- ChangeLog
|
33
33
|
- ext/trie/trie.c
|
34
34
|
- ext/trie/extconf.rb
|
35
|
+
- ext/trie/t.rb
|
35
36
|
- lib/ruby_trie.rb
|
36
37
|
- test/rubytrie.rb
|
37
38
|
- test/trie_test.rb
|