RubyTrie 1.0 → 1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +2 -1
- data/README +5 -0
- data/ext/trie/t.rb +20 -0
- data/ext/trie/trie.c +76 -0
- metadata +5 -4
data/ChangeLog
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
1.0 added children and each methods
|
1
|
+
1.0 added children and each methods
|
2
|
+
1.1 added Levenshtein search implementation - inspired by: http://stevehanov.ca/blog/index.php?id=114
|
data/README
CHANGED
@@ -27,6 +27,11 @@ t.children("go") => [1,2,3]
|
|
27
27
|
t.each("partial key") will yield to the given block all values that are matched by the partial key
|
28
28
|
t.each("go") {|v| puts v } => prints 1, 2 & 3
|
29
29
|
|
30
|
+
Levenshtein search
|
31
|
+
|
32
|
+
t.levenshtein_search("go", 2) - will return [value, distance] pairs for all the words in the trie that are within a Levenshtein distance of 2 from the search word
|
33
|
+
ex: [["go", 0], ["goes", 2], ["gone", 2]]
|
34
|
+
|
30
35
|
== Bugs
|
31
36
|
|
32
37
|
I'm sure there are plenty!
|
data/ext/trie/t.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Ad-hoc benchmark for the trie extension: loads the system word list into a
# Trie, runs one Levenshtein search, then prints memory use and both timings.
require 'trie'
require 'benchmark'

t = Trie.new
c = 0

# Time the insertion of every dictionary word (key and value are the word).
s1 = Benchmark.measure do
  # File.foreach closes the file when the block finishes; a bare open(...)
  # would leave the handle open until garbage collection.
  File.foreach('/usr/share/dict/words') do |w|
    # chomp strips only a trailing line terminator; chop would also eat the
    # last letter of a final line that has no newline.
    word = w.chomp
    t[word] = word
    c += 1
  end
end

# Small alternative data set for manual experiments:
# %w(gol golas golaster lux xal).each {|w| t[w] = w}

# Time one search and print the matches ordered by edit distance.
s2 = Benchmark.measure do
  t.levenshtein_search('tread', 2).sort_by {|p| p.last }.each {|p| puts "#{p.last}: #{p.first}"}
end

puts "#{t.memory/(1024*1024)}Mb, #{c} words"
puts s1
puts s2
|
data/ext/trie/trie.c
CHANGED
@@ -26,6 +26,7 @@ static trie_node * trie_new_node_with_char(char ch);
|
|
26
26
|
static trie_node * trie_new_node();
|
27
27
|
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
28
28
|
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
+
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
29
30
|
static void trie_collect_values(void * t, VALUE prary);
|
30
31
|
static void trie_collect_values_with_yield(void * t);
|
31
32
|
static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
|
@@ -33,6 +34,8 @@ static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*l
|
|
33
34
|
static void free_trie(trie_node * trie);
|
34
35
|
static void count_nodes_callback(void *n, VALUE accum);
|
35
36
|
static VALUE rb_trie_count_nodes(VALUE self);
|
37
|
+
static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
|
38
|
+
// int print_arr(char c, int* arr, int len);
|
36
39
|
|
37
40
|
|
38
41
|
// ========================
|
@@ -172,6 +175,8 @@ void Init_trie() {
|
|
172
175
|
|
173
176
|
arg_count = 2;
|
174
177
|
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
178
|
+
// trie.levenshtein_search(word, max_distance)
|
179
|
+
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
175
180
|
}
|
176
181
|
|
177
182
|
|
@@ -268,6 +273,77 @@ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
|
|
268
273
|
return rary;
|
269
274
|
}
|
270
275
|
|
276
|
+
/*
 * trie.levenshtein_search(word, max_distance)
 *
 * Returns a new Ruby array of [value, distance] pairs, one for each stored
 * value whose edit distance to `word` is at most `max_distance`.
 *
 * word         - Ruby String to search for; must not contain NUL bytes.
 * max_distance - Ruby Integer giving the largest accepted distance.
 *
 * Raises (through the Ruby conversion macros) when `word` is not string-like
 * or contains a NUL byte, or when `max_distance` is not convertible to int.
 */
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
    trie_node *root;
    char *word_cstring;
    int word_length;
    int max_dist;
    int i;
    VALUE rary = rb_ary_new();

    Data_Get_Struct(self, trie_node, root);

    /* NUM2INT type-checks its argument; FIX2INT would silently decode garbage
     * from anything that is not a Fixnum. Converted first because it may run
     * Ruby code, and the C string pointer below must be taken after that. */
    max_dist = NUM2INT(max_distance);

    /* StringValueCStr guarantees a NUL-terminated buffer, which strlen needs. */
    word_cstring = StringValueCStr(word);
    word_length = (int)strlen(word_cstring);

    /* First row of the edit-distance table: distance from the empty prefix
     * to each prefix of the search word. */
    int first_line[word_length + 1];
    for (i = 0; i <= word_length; i++) {
        first_line[i] = i;
    }

    /* The recursive helper dereferences its node argument, so do not hand it
     * a NULL list head. */
    if (root->next_sibling != NULL) {
        recursive_levenshtein_search(root->next_sibling, rary, first_line, max_dist, word_cstring, word_length);
    }

    return rary;
}
|
296
|
+
|
297
|
+
/*
 * Returns the smallest of the first `len` entries of `numbers`.
 * numbers[0] is read unconditionally, so the array must hold at least one
 * element.
 */
int minimum(int* numbers, int len) {
    int smallest = numbers[0];
    int idx = 1;

    while (idx < len) {
        int candidate = numbers[idx];
        if (candidate < smallest) {
            smallest = candidate;
        }
        idx++;
    }

    return smallest;
}
|
305
|
+
|
306
|
+
/* Returns the smallest of the three integers a, b and c. */
int min3(int a, int b, int c) {
    int smaller = (b < a) ? b : a;

    return (c < smaller) ? c : smaller;
}
|
312
|
+
|
313
|
+
/*
 * Walks the sibling list starting at `trie`, extending the edit-distance
 * table by one row per node, and appends a [value, distance] pair to `rary`
 * for every node that carries a value and whose distance to `word` is at most
 * `max_dist`.
 *
 * trie        - first node of a sibling list; NULL is treated as empty.
 * rary        - Ruby array that receives the [value, distance] pairs.
 * prev_line   - table row of the parent prefix, word_length + 1 entries.
 * max_dist    - largest distance that is still reported.
 * word        - search word; word_length characters are read.
 * word_length - number of characters in `word`.
 *
 * Always returns 0; the int return type only keeps the definition compatible
 * with the implicit-int prototype declared earlier in the file.
 */
static int recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
    /* One row is enough for the whole sibling list: each child recursion has
     * finished with it before the next sibling overwrites it. */
    int cur_line[word_length + 1];
    int i, insert_cost, replace_cost, delete_cost;
    VALUE carr;

    /* Siblings share the same parent row and are independent of each other,
     * so all of them are visited; only the descent into a node's children
     * may be pruned. */
    for (; trie != NULL; trie = trie->next_sibling) {
        cur_line[0] = prev_line[0] + 1;

        for (i = 1; i <= word_length; i++) {
            insert_cost = cur_line[i-1] + 1;
            delete_cost = prev_line[i] + 1;
            replace_cost = prev_line[i-1];
            if (trie->character != word[i-1]) {
                replace_cost += 1;
            }
            cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
        }

        /* The last cell is the distance between this node's prefix and the
         * whole search word. */
        if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
            carr = rb_ary_new();
            rb_ary_push(carr, trie->value);
            rb_ary_push(carr, INT2FIX(cur_line[word_length]));
            rb_ary_push(rary, carr);
        }

        /* Descend only while some cell of the row is still within range. */
        if (trie->first_child != NULL && minimum(cur_line, word_length + 1) <= max_dist) {
            recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
        }
    }

    return 0;
}
|
346
|
+
|
271
347
|
static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
|
272
348
|
while(true) {
|
273
349
|
if (node == NULL) return NULL;
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: RubyTrie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: "1.
|
8
|
+
- 1
|
9
|
+
version: "1.1"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Matt Freels
|
@@ -15,7 +15,7 @@ autorequire: trie
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-16 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -32,6 +32,7 @@ files:
|
|
32
32
|
- ChangeLog
|
33
33
|
- ext/trie/trie.c
|
34
34
|
- ext/trie/extconf.rb
|
35
|
+
- ext/trie/t.rb
|
35
36
|
- lib/ruby_trie.rb
|
36
37
|
- test/rubytrie.rb
|
37
38
|
- test/trie_test.rb
|