RubyTrie 1.1 → 2.1

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog CHANGED
@@ -1,2 +1,4 @@
1
1
  1.0 added children and each methods
2
- 1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
2
+ 1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
3
+ 2.0 completely rewrote the trie implementation to use a more optimized data structure. For /usr/share/dict/web2 this reduces memory use from about 40Mb to about 6Mb. The levenshtein search now takes a block and passes each matching value to it, but it can easily be extended to return an array. It also has a glitch where some values are, for some reason, just false... :|
4
+ 2.1 bug fixes, fixing levenshtein search
data/README CHANGED
@@ -22,6 +22,11 @@ t["goes"] = 2
22
22
  t["gone"] = 3
23
23
  t["other"] = 4
24
24
 
25
+ By default, assigning multiple values to the same key appends them to an array:
26
+ t['a'] = 1
27
+ t['a'] = 2
28
+ after which t['a'] returns [1,2]
29
+
25
30
  t.children("go") => [1,2,3]
26
31
 
27
32
  t.each("partial key") will yield to the given block all values that are matched by the partial key
@@ -29,8 +34,11 @@ t.each("go") {|v| puts v } => prints 1, 2 & 3
29
34
 
30
35
  Levenshtein search
31
36
 
32
- t.levenshtein_search("go", 2) - will return all values for the words in the trie that have 1 from the search word
33
- ex: [["go", 0], ["goes", 2], ["gone", 2]]
37
+ t.levenshtein_search("go", 2) {|value| puts value} - will print the values of all words in the trie that are within an edit distance of 2 from the search word
38
+ Eg.
39
+ 1
40
+ 2
41
+ 3
34
42
 
35
43
  == Bugs
36
44
 
@@ -0,0 +1,77 @@
1
+ /*
2
+ * levenshtein_distance.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <stdlib.h>
11
+ #include <string.h> /* for memcmp, memmove */
12
+ #include <stdio.h>
13
+
14
+ #include "otrie2.h"
15
+ #include "levenshtein_distance.h"
16
+
17
+
18
+ void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length);
19
+
20
+ void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb) {
21
+ int first_line[strlen(word) + 1];
22
+ int i=0;
23
+ for(; i < strlen(word) + 1; i++) {
24
+ first_line[i] = i;
25
+ }
26
+ recursive_levenshtein_search(trie->next_sibling, 0, cb, first_line, max_distance, word, strlen(word));
27
+ }
28
+
29
+
30
/*
 * Returns the smallest of the first len entries of numbers.
 * numbers[0] is read unconditionally, so at least one entry must exist.
 */
int minimum(int* numbers, int len) {
    int smallest = numbers[0];
    int idx = 1;

    while (idx < len) {
        smallest = (numbers[idx] < smallest) ? numbers[idx] : smallest;
        idx++;
    }
    return smallest;
}
38
+
39
/* Returns the smallest of the three integers a, b and c. */
int min3(int a, int b, int c) {
    int smaller_ab = (b < a) ? b : a;
    return (c < smaller_ab) ? c : smaller_ab;
}
45
+
46
+ void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length) {
47
+ int cur_line[word_length + 1];
48
+ int i, insert_cost, replace_cost, delete_cost;
49
+
50
+ cur_line[0] = prev_line[0] + 1;
51
+
52
+ for(i=1; i < word_length + 1; i++) {
53
+ insert_cost = cur_line[i-1] + 1;
54
+ delete_cost = prev_line[i] + 1;
55
+ if (trie->data[node_offset] != word[i-1]) {
56
+ replace_cost = prev_line[i-1] + 1;
57
+ } else {
58
+ replace_cost = prev_line[i-1];
59
+ }
60
+ cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
61
+ }
62
+
63
+
64
+ if (cur_line[word_length] <= max_dist && (strlen(trie->data) == (node_offset + 1)) && trie->value != Qnil) {
65
+ cb(trie, cur_line[word_length]);
66
+ }
67
+
68
+ if (minimum(cur_line, word_length + 1) <= max_dist) {
69
+ if (strlen(trie->data) > (node_offset + 1))
70
+ recursive_levenshtein_search(trie, node_offset + 1, cb, cur_line, max_dist, word, word_length);
71
+ if (trie->first_child != NULL)
72
+ recursive_levenshtein_search(trie->first_child, 0, cb, cur_line, max_dist, word, word_length);
73
+ if (trie->next_sibling != NULL)
74
+ recursive_levenshtein_search(trie->next_sibling, 0, cb, prev_line, max_dist, word, word_length);
75
+ }
76
+
77
+ }
@@ -0,0 +1,13 @@
1
+ /*
2
+ * levenshtein_distance.h
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include "otrie2.h"
11
+
12
+ typedef void (*levenshtein_distance_callback)(Node* node, int length);
13
+ void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb);
data/ext/trie/otrie2.c ADDED
@@ -0,0 +1,161 @@
1
+ /*
2
+ * otrie2.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <stdlib.h> /* for malloc, free */
11
+ #include <string.h> /* for memcmp, memmove */
12
+ #include <stdio.h>
13
+
14
+ #include "otrie2.h"
15
+
16
+ Node* new_node() {
17
+ Node *node = malloc(sizeof(Node));
18
+ memset(node, 0, sizeof(Node));
19
+ return node;
20
+ }
21
+
22
+ Node* new_node_string_len(const char* string, const int len) {
23
+ Node *node = new_node();
24
+ node->data = malloc(len+1);
25
+ strncpy(node->data, string, len);
26
+ node->data[len] = 0;
27
+ return node;
28
+ }
29
+
30
+ Node* new_node_string(const char* string) {
31
+ Node *node = new_node();
32
+ int len = strlen(string);
33
+ node->data = malloc(len+1);
34
+ strcpy(node->data, string);
35
+ return node;
36
+ }
37
+
38
+ void node_update_data(Node* node, const char* string, int len) {
39
+ char* new_data = malloc(len+1);
40
+ strncpy(new_data, string, len);
41
+ new_data[len] = 0;
42
+ free(node->data);
43
+ node->data = new_data;
44
+ }
45
+
46
+ void free_node(Node *node) {
47
+ if (node->first_child) free_node(node->first_child);
48
+ if (node->next_sibling) free_node(node->next_sibling);
49
+ free(node->data);
50
+ free(node);
51
+ }
52
+
53
+ void node_insert(Node* node, const char* string, const VALUE value) {
54
+ int len = strlen(string);
55
+ Pos *cur = new_pos(node, 0);
56
+ int i=0;
57
+ for (; i<len; i++) {
58
+ pos_next(cur, string + i, true);
59
+ }
60
+ cur->node->value = value;
61
+ }
62
+
63
+ Node* node_find(Node* this, const char* string) {
64
+ int len = strlen(string);
65
+ Pos *cur = new_pos(this, 0);
66
+ int i=0;
67
+ for(; i<len; i++) {
68
+ pos_next(cur, string + i, false);
69
+ if (cur->node == NULL) { return NULL; }
70
+ }
71
+ if (strlen(cur->node->data) == cur->offset + 1)
72
+ return cur->node;
73
+ return NULL;
74
+ }
75
+
76
+ Pos* new_pos(Node *node, int offset) {
77
+ Pos *pos = malloc(sizeof(Pos));
78
+ pos->node = node;
79
+ pos->offset = offset;
80
+ return pos;
81
+ }
82
+
83
+ Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert) {
84
+ Node *child = this->node->next_sibling;
85
+ if (down) child = this->node->first_child;
86
+
87
+ Node *last_child = NULL;
88
+
89
+ while(child != NULL && *child->data != *string) {
90
+ last_child = child;
91
+ child = child -> next_sibling;
92
+ }
93
+ if (child == NULL && insert) {
94
+ child = new_node_string(string);
95
+ if (!down) {
96
+ if (this->node -> next_sibling != NULL) {
97
+ last_child -> next_sibling = child;
98
+ } else {
99
+ this -> node -> next_sibling = child;
100
+ }
101
+ } else {
102
+ if (this -> node -> first_child != NULL) {
103
+ last_child -> next_sibling = child;
104
+ } else {
105
+ this->node -> first_child = child;
106
+ }
107
+ }
108
+ }
109
+ return child;
110
+ }
111
+
112
+ void pos_next(Pos *this, const char* string, bool insert) {
113
+ if (this -> node -> data == NULL) {
114
+ this->node = pos_find_or_create_child(this, string, false, insert);
115
+ this->offset = 0;
116
+ return;
117
+ }
118
+
119
+ int len = strlen(this->node->data);
120
+ if (this -> offset + 1 < len) {
121
+ if (this -> node->data[this -> offset + 1] == string[0]) {
122
+ this -> offset++;
123
+ return;
124
+ } else {
125
+ // split paths
126
+ // - new child node with old partial content
127
+ // - new child node with new content
128
+ if (insert) {
129
+ Node *splitChild = new_node_string(this->node->data + this->offset + 1);
130
+ splitChild -> value = this -> node -> value;
131
+ Node *newChild = new_node_string(string);
132
+ newChild -> value = Qnil;
133
+ node_update_data(this->node, this->node->data, this->offset + 1);
134
+ splitChild -> next_sibling = newChild;
135
+ this->node -> first_child = splitChild;
136
+ this->node -> value = Qnil;
137
+
138
+ this->node = newChild;
139
+ this->offset = 0;
140
+ } else {
141
+ this -> node = NULL;
142
+ this -> offset = 0;
143
+ }
144
+ }
145
+ } else {
146
+ // reached end of data... find a child
147
+ this->node = pos_find_or_create_child(this, string, true, insert);
148
+ this->offset = 0;
149
+ return;
150
+ }
151
+ }
152
+
153
+ void node_visit(Node* this, node_iterator func, VALUE context) {
154
+ func(this, context);
155
+ if (this->first_child != NULL) {
156
+ node_visit(this->first_child, func, context);
157
+ }
158
+ if (this->next_sibling != NULL) {
159
+ node_visit(this->next_sibling, func, context);
160
+ }
161
+ }
data/ext/trie/otrie2.h ADDED
@@ -0,0 +1,41 @@
1
+ /*
2
+ * otrie2.h
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #define bool int
12
+ #define true 1
13
+ #define false 0
14
+
15
+ #ifndef TRIE_NODE
16
+ #define TRIE_NODE
17
+
18
+ typedef struct trie_node {
19
+ char *data;
20
+ struct trie_node* first_child;
21
+ struct trie_node* next_sibling;
22
+ VALUE value;
23
+ } Node;
24
+
25
+ typedef struct pos_struct {
26
+ Node *node;
27
+ int offset;
28
+ } Pos;
29
+
30
+ typedef void (*node_iterator)(Node* node, VALUE context);
31
+
32
+ Node* new_node();
33
+ void free_node(Node*);
34
+ void node_insert(Node* node, const char* string, const VALUE value);
35
+ Node* node_find(Node* this, const char* string);
36
+
37
+ Pos* new_pos(Node *node, int offset);
38
+ Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert);
39
+ void pos_next(Pos *this, const char* string, bool);
40
+ void node_visit(Node* this, node_iterator func, VALUE context);
41
+ #endif
@@ -0,0 +1,199 @@
1
+ /*
2
+ * ruby_trie.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 Petrica Ghiurca. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <ruby.h>
11
+ #include <stdlib.h> /* for malloc, free */
12
+ #include <string.h> /* for memcmp, memmove */
13
+ #include "otrie2.h"
14
+ #include "levenshtein_distance.h"
15
+
16
+ static VALUE rb_cTrie;
17
+
18
+ static void count_nodes_callback(Node *trie, VALUE accum);
19
+ static VALUE rb_trie_count_nodes(VALUE self);
20
+ static VALUE rb_trie_allocate(VALUE klass);
21
+ static VALUE rb_trie_get_key(VALUE self, VALUE key);
22
+ static void trie_mark_value(Node*, VALUE);
23
+ static void rb_trie_mark(Node* t);
24
+ static void rb_trie_free(Node * t);
25
+ void tree_collect_values(Node *node, VALUE rary);
26
+ static VALUE rb_trie_find_children(VALUE self, VALUE key);
27
+ static void trie_collect_values_with_yield(Node * node, VALUE context);
28
+ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
+ static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value);
30
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
31
+
32
+ // extension init
33
+ void Init_trie() {
34
+ rb_cTrie = rb_define_class("Trie", rb_cObject);
35
+
36
+ rb_define_alloc_func(rb_cTrie, rb_trie_allocate);
37
+
38
+ int arg_count = 0;
39
+ //rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
40
+ rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
41
+
42
+ arg_count = 1;
43
+ rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
44
+ // rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
45
+ rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
46
+ rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
47
+
48
+ arg_count = 2;
49
+ rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
50
+ // trie.levenshtein_search(word, max_distance)
51
+ rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
52
+ }
53
+
54
+ static int total_memory;
55
+ static void count_nodes_callback(Node *trie, VALUE accum) {
56
+ int len = 0;
57
+ if (trie->data) len = strlen(trie->data);
58
+ rb_big_plus(accum, rb_uint2big(len + sizeof(Node)));
59
+ total_memory += len + sizeof(Node);
60
+ }
61
+
62
+ static VALUE rb_trie_count_nodes(VALUE self) {
63
+ Node *root;
64
+ Data_Get_Struct(self, Node, root);
65
+ VALUE accum = rb_uint2big(0);
66
+ total_memory = 0;
67
+ node_visit(root, count_nodes_callback, accum);
68
+ //return accum;
69
+ return rb_uint2big(total_memory);
70
+ }
71
+
72
+ static VALUE rb_trie_allocate(VALUE klass) {
73
+ Node * t = new_node();
74
+ return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
75
+ }
76
+
77
+ static VALUE rb_trie_get_key(VALUE self, VALUE key) {
78
+ Node * root;
79
+ Node * node;
80
+ char * key_cstring;
81
+
82
+ Check_Type(key, T_STRING);
83
+ key_cstring = StringValuePtr(key);
84
+
85
+ Data_Get_Struct(self, Node, root);
86
+
87
+ node = node_find(root, key_cstring);
88
+ if (node == NULL) return Qnil;
89
+ return node->value;
90
+ }
91
+
92
+
93
+ static void trie_mark_value(Node * t, VALUE context) {
94
+ rb_gc_mark( t->value );
95
+ }
96
+
97
+ static void rb_trie_mark(Node* t) {
98
+ node_visit(t, trie_mark_value, Qnil);
99
+ }
100
+
101
+ static void rb_trie_free(Node * t) {
102
+ free_node(t);
103
+ }
104
+
105
+ void tree_collect_values(Node *node, VALUE rary) {
106
+ if (node->value != Qnil) {
107
+ rb_ary_push(rary, node->value);
108
+ }
109
+ }
110
+
111
+ static VALUE rb_trie_find_children(VALUE self, VALUE key) {
112
+ Node * root;
113
+ Node * node;
114
+ char * key_cstring;
115
+ VALUE rary = rb_ary_new();
116
+
117
+ key_cstring = StringValuePtr(key);
118
+ Data_Get_Struct(self, Node, root);
119
+
120
+ node = node_find(root, key_cstring);
121
+
122
+ if (node != NULL && node->value != Qnil) {
123
+ rb_ary_push(rary, node->value);
124
+ }
125
+
126
+ if (node == NULL || node->first_child == NULL) return rary;
127
+
128
+ node_visit(node->first_child, tree_collect_values, rary);
129
+ return rary;
130
+ }
131
+
132
+ static void trie_collect_values_with_yield(Node * node, VALUE context) {
133
+ if (node->value != Qnil) {
134
+ rb_yield(node->value);
135
+ }
136
+ }
137
+
138
+ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
139
+ Node * root;
140
+ Node * node;
141
+ char * key_cstring;
142
+ VALUE rary = rb_ary_new();
143
+
144
+ key_cstring = StringValuePtr(key);
145
+ Data_Get_Struct(self, Node, root);
146
+
147
+ node = node_find(root, key_cstring);
148
+
149
+ if (node != NULL && node->value != Qnil) {
150
+ rb_yield(node->value);
151
+ }
152
+
153
+ if (node == NULL || node->first_child == NULL) return rary;
154
+
155
+ node_visit(node->first_child, trie_collect_values_with_yield, Qnil);
156
+ return rary;
157
+ }
158
+
159
+
160
+ static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
161
+ Node * root, *node;
162
+ char * key_cstring;
163
+
164
+ Check_Type(key, T_STRING);
165
+ key_cstring = StringValuePtr(key);
166
+
167
+ Data_Get_Struct(self, Node, root);
168
+
169
+ node = node_find(root, key_cstring);
170
+ if (node == NULL || node -> value == Qnil) {
171
+ // printf("New node for %s -> %d\n", key_cstring, value);
172
+ VALUE arr = rb_ary_new();
173
+ rb_ary_push(arr, value);
174
+ node_insert(root, key_cstring, arr);
175
+ } else {
176
+ // printf("Append value %s to %s -> %d\n", node->data, key_cstring, node->value);
177
+ rb_ary_push(node->value, value);
178
+ }
179
+
180
+ return Qnil;
181
+ }
182
+
183
+ void rb_levensthtein_cb(Node* node, int distance) {
184
+ if (node->value != Qnil)
185
+ rb_yield(node->value);
186
+ }
187
+
188
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
189
+ Node *root;
190
+ char *word_cstring;
191
+
192
+ Data_Get_Struct(self, Node, root);
193
+
194
+ word_cstring = StringValuePtr(word);
195
+
196
+ levenshtein_distance(root, word_cstring, FIX2INT(max_distance), rb_levensthtein_cb);
197
+
198
+ return Qnil;
199
+ }
data/ext/trie/t.rb CHANGED
@@ -1,20 +1,22 @@
1
1
  require 'trie'
2
+ require 'rubygems'
2
3
  require 'benchmark'
3
4
 
4
5
  t = Trie.new
5
6
  c = 0
7
+ a = 0
6
8
  s1 = (Benchmark.measure do
7
- open('/usr/share/dict/words').each_line do |w|
8
- t[w.chop] = w.chop
9
+ open('/usr/share/dict/web2').each_line do |w|
10
+ t[w.chop]= w
9
11
  c += 1
10
12
  end
11
13
  end)
12
14
 
13
15
  # %w(gol golas golaster lux xal).each {|w| t[w] = w}
14
16
  s2 = (Benchmark.measure do
15
- t.levenshtein_search('tread', 2).sort_by {|p| p.last }.each {|p| puts "#{p.last}: #{p.first}"}
17
+ t.levenshtein_search('food', 1) {|p| puts p}
16
18
  end)
17
19
 
18
- puts "#{t.memory/(1024*1024)}Mb, #{c} words"
20
+ puts "#{t.memory/(1024*1024)}Mb, #{c} words, #{a} unique"
19
21
  puts s1
20
22
  puts s2
data/test/trie_test.rb CHANGED
@@ -23,10 +23,11 @@ time do
23
23
  1.upto(max) do |i|
24
24
  t["item #{i}"].class
25
25
  end
26
-
27
- 1.upto(max) do |i|
28
- t.delete("item #{i}")
29
- end
26
+
27
+ # not implemented yet
28
+ # 1.upto(max) do |i|
29
+ # t.delete("item #{i}")
30
+ # end
30
31
  end
31
32
 
32
33
 
@@ -41,9 +42,9 @@ time do
41
42
  t["item #{i}"].class
42
43
  end
43
44
 
44
- 1.upto(max) do |i|
45
- t.delete("item #{i}")
46
- end
45
+ # 1.upto(max) do |i|
46
+ # t.delete("item #{i}")
47
+ # end
47
48
  end
48
49
 
49
50
  # puts "With a Ruby Trie..."
metadata CHANGED
@@ -1,21 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: RubyTrie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 1
5
5
  prerelease: false
6
6
  segments:
7
+ - 2
7
8
  - 1
8
- - 1
9
- version: "1.1"
9
+ version: "2.1"
10
10
  platform: ruby
11
11
  authors:
12
- - Matt Freels
13
12
  - Petrica Ghiurca
13
+ - Matt Freels
14
14
  autorequire: trie
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-16 00:00:00 +02:00
18
+ date: 2011-03-21 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -30,7 +30,11 @@ extra_rdoc_files:
30
30
  files:
31
31
  - README
32
32
  - ChangeLog
33
- - ext/trie/trie.c
33
+ - ext/trie/levenshtein_distance.h
34
+ - ext/trie/otrie2.h
35
+ - ext/trie/levenshtein_distance.c
36
+ - ext/trie/otrie2.c
37
+ - ext/trie/ruby_trie.c
34
38
  - ext/trie/extconf.rb
35
39
  - ext/trie/t.rb
36
40
  - lib/ruby_trie.rb
data/ext/trie/trie.c DELETED
@@ -1,408 +0,0 @@
1
- #include <stdlib.h> /* for malloc, free */
2
- #include <string.h> /* for memcmp, memmove */
3
- #include "ruby.h"
4
-
5
- // typdefs!
6
- typedef enum { false = 0, true} bool;
7
-
8
- typedef struct node {
9
- char character;
10
- VALUE value;
11
- struct node * first_child;
12
- struct node * next_sibling;
13
- } trie_node;
14
-
15
- static VALUE rb_cTrie;
16
-
17
- // =========================
18
- // = function declarations =
19
- // =========================
20
-
21
- //trie implementation
22
- static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes);
23
- static trie_node * trie_sibling_for_char(trie_node * node, char ch);
24
- static trie_node * trie_add_sibling_for_char(trie_node * node, char ch);
25
- static trie_node * trie_new_node_with_char(char ch);
26
- static trie_node * trie_new_node();
27
- static VALUE rb_trie_find_children(VALUE self, VALUE key);
28
- static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
- static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
30
- static void trie_collect_values(void * t, VALUE prary);
31
- static void trie_collect_values_with_yield(void * t);
32
- static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
33
- static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE));
34
- static void free_trie(trie_node * trie);
35
- static void count_nodes_callback(void *n, VALUE accum);
36
- static VALUE rb_trie_count_nodes(VALUE self);
37
- static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
38
- // int print_arr(char c, int* arr, int len);
39
-
40
-
41
- // ========================
42
- // = function definitions =
43
- // ========================
44
-
45
- // instance methods
46
- static VALUE rb_trie_get_key(VALUE self, VALUE key) {
47
- trie_node * root;
48
- trie_node * node;
49
- char * key_cstring;
50
-
51
- //Check_Type(key, T_STRING);
52
- key_cstring = StringValuePtr(key);
53
-
54
- Data_Get_Struct(self, trie_node, root);
55
-
56
- node = trie_node_for_key(root, key_cstring, false);
57
- if (node == NULL) return Qnil;
58
- return node->value;
59
- }
60
-
61
- static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
62
- trie_node * root;
63
- trie_node * node;
64
- char * key_cstring;
65
-
66
- //Check_Type(key, T_STRING);
67
- key_cstring = StringValuePtr(key);
68
-
69
- Data_Get_Struct(self, trie_node, root);
70
-
71
- node = trie_node_for_key(root, key_cstring, true);
72
- node->value = value;
73
-
74
- return Qnil;
75
- }
76
-
77
- static uint mem_count = 0;
78
-
79
- static void count_nodes_callback(void *n, VALUE accum) {
80
- trie_node *node = (trie_node*)n;
81
- // rb_big_plus(accum, rb_uint2big(sizeof(*node)));
82
- mem_count+=sizeof(*node);
83
- }
84
-
85
- static VALUE rb_trie_count_nodes(VALUE self) {
86
- trie_node *root;
87
- Data_Get_Struct(self, trie_node, root);
88
- VALUE accum = rb_uint2big(0);
89
- mem_count = 0;
90
- trie_traverse_with_context(root, accum, count_nodes_callback);
91
- return rb_uint2big(mem_count);
92
- }
93
-
94
- static VALUE rb_trie_undef_key(VALUE self, VALUE key) {
95
- trie_node * root, * node, * prev, * next;
96
- VALUE return_value;
97
- char * key_cstring;
98
- int steps;
99
- int i;
100
-
101
- //Check_Type(key, T_STRING);
102
- key_cstring = StringValuePtr(key);
103
-
104
- Data_Get_Struct(self, trie_node, root);
105
- next = root;
106
- node = NULL;
107
- prev = NULL;
108
-
109
- steps = strlen(key_cstring);
110
-
111
- for (i = 0; i < steps; i++) {
112
- if (next == NULL) return Qnil;
113
-
114
- while(next->character != key_cstring[i]) {
115
- if (next == NULL) return Qnil;
116
- next = next->next_sibling;
117
- }
118
- prev = node;
119
- node = next;
120
- next = node->first_child;
121
- }
122
-
123
- return_value = node->value;
124
- node->value = Qnil;
125
-
126
- if (node->first_child == NULL) { //node has no children. we can delete it.
127
- if (prev == NULL) {
128
- //printf("should delete root");
129
- } else if (prev->first_child == node) {
130
- prev->first_child = node->next_sibling;
131
- free(node);
132
- } else if (prev->next_sibling == node) {
133
- prev->next_sibling = node->next_sibling;
134
- free(node);
135
- }
136
- }
137
-
138
- return return_value;
139
- }
140
-
141
- // garbage collection and allocation
142
- static void trie_mark_value(void * t) {
143
- rb_gc_mark( ((trie_node *)t)->value );
144
- }
145
-
146
- static void rb_trie_mark(trie_node * t) {
147
- trie_traverse(t, trie_mark_value);
148
- }
149
-
150
- static void rb_trie_free(trie_node * t) {
151
- free_trie(t);
152
- }
153
-
154
- static VALUE rb_trie_allocate (VALUE klass) {
155
- trie_node * t = trie_new_node();
156
-
157
- return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
158
- }
159
-
160
- // extension init
161
- void Init_trie() {
162
- rb_cTrie = rb_define_class("Trie", rb_cObject);
163
-
164
- rb_define_alloc_func (rb_cTrie, rb_trie_allocate);
165
-
166
- int arg_count = 0;
167
- //rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
168
- rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
169
-
170
- arg_count = 1;
171
- rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
172
- rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
173
- rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
174
- rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
175
-
176
- arg_count = 2;
177
- rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
178
- // trie.levenshtein_search(word, max_distance)
179
- rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
180
- }
181
-
182
-
183
- // =======================
184
- // = trie implementation =
185
- // =======================
186
-
187
- static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes) {
188
- int steps, i;
189
- trie_node * next, * node;
190
-
191
- steps = strlen(key);
192
- next = root;
193
-
194
- for (i = 0; i < steps; i++) {
195
- if (next == NULL) {
196
- if (create_missing_nodes) {
197
- node->first_child = trie_new_node();
198
- next = node->first_child;
199
- }
200
- else return NULL;
201
- }
202
-
203
- node = trie_sibling_for_char(next, key[i]);
204
-
205
- if (node == NULL) {
206
- if (create_missing_nodes) {
207
- node = trie_add_sibling_for_char(next, key[i]);
208
- }
209
- else return NULL;
210
- }
211
-
212
- next = node->first_child;
213
- }
214
-
215
- return node;
216
- }
217
-
218
- static void trie_collect_values(void * t, VALUE rary) {
219
- trie_node *node = (trie_node*)t;
220
- if (node->value != Qnil) {
221
- rb_ary_push(rary, node->value);
222
- }
223
- }
224
-
225
- static void trie_collect_values_with_yield(void * t) {
226
- trie_node *node = (trie_node*)t;
227
- if (node->value != Qnil) {
228
- // rb_ary_push(rary, node->value);
229
- rb_yield(node->value);
230
- }
231
- }
232
-
233
- static VALUE rb_trie_find_children(VALUE self, VALUE key) {
234
- trie_node * root;
235
- trie_node * node;
236
- char * key_cstring;
237
- VALUE rary = rb_ary_new();
238
-
239
- key_cstring = StringValuePtr(key);
240
- Data_Get_Struct(self, trie_node, root);
241
-
242
- node = trie_node_for_key(root, key_cstring, false);
243
-
244
- if (node != NULL && node->value != Qnil) {
245
- rb_ary_push(rary, node->value);
246
- }
247
-
248
- if (node == NULL || node->first_child == NULL) return rary;
249
-
250
- trie_traverse_with_context(node->first_child, rary, trie_collect_values);
251
- return rary;
252
- }
253
-
254
-
255
- static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
256
- trie_node * root;
257
- trie_node * node;
258
- char * key_cstring;
259
- VALUE rary = rb_ary_new();
260
-
261
- key_cstring = StringValuePtr(key);
262
- Data_Get_Struct(self, trie_node, root);
263
-
264
- node = trie_node_for_key(root, key_cstring, false);
265
-
266
- if (node != NULL && node->value != Qnil) {
267
- rb_yield(node->value);
268
- }
269
-
270
- if (node == NULL || node->first_child == NULL) return rary;
271
-
272
- trie_traverse(node->first_child, trie_collect_values_with_yield);
273
- return rary;
274
- }
275
-
276
- static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
277
- trie_node *root;
278
- trie_node *node;
279
- char *word_cstring;
280
- VALUE rary = rb_ary_new();
281
- int i=0;
282
-
283
- Data_Get_Struct(self, trie_node, root);
284
-
285
- word_cstring = StringValuePtr(word);
286
-
287
- int first_line[strlen(word_cstring) + 1];
288
- for(; i < strlen(word_cstring) + 1; i++) {
289
- first_line[i] = i;
290
- }
291
- // print_arr('R', first_line, strlen(word_cstring)+1);
292
- recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
293
-
294
- return rary;
295
- }
296
-
297
- int minimum(int* numbers, int len) {
298
- int minValue = numbers[0];
299
- int i;
300
- for(i=1; i<len; i++) {
301
- if (numbers[i] < minValue) minValue = numbers[i];
302
- }
303
- return minValue;
304
- }
305
-
306
- int min3(int a, int b, int c) {
307
- int min = a;
308
- if (b < min) min = b;
309
- if (c < min) min = c;
310
- return min;
311
- }
312
-
313
- static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
314
- int cur_line[word_length + 1];
315
- int i,j, insert_cost, replace_cost, delete_cost;
316
- VALUE carr;
317
-
318
- cur_line[0] = prev_line[0] + 1;
319
-
320
- for(i=1; i < word_length + 1; i++) {
321
- insert_cost = cur_line[i-1] + 1;
322
- delete_cost = prev_line[i] + 1;
323
- if (trie->character != word[i-1]) {
324
- replace_cost = prev_line[i-1] + 1;
325
- } else {
326
- replace_cost = prev_line[i-1];
327
- }
328
- cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
329
- }
330
-
331
-
332
- if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
333
- carr = rb_ary_new();
334
- rb_ary_push(carr, trie->value);
335
- rb_ary_push(carr, INT2FIX(cur_line[word_length]));
336
- rb_ary_push(rary, carr);
337
- }
338
-
339
- if (minimum(cur_line, word_length + 1) <= max_dist) {
340
- if (trie->first_child != NULL)
341
- recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
342
- if (trie->next_sibling != NULL)
343
- recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
344
- }
345
- }
346
-
347
- static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
348
- while(true) {
349
- if (node == NULL) return NULL;
350
-
351
- if (node->character == ch) return node;
352
-
353
- node = node->next_sibling;
354
- }
355
- return node;
356
- }
357
-
358
- static trie_node * trie_add_sibling_for_char(trie_node * node, char ch) {
359
- trie_node * current_next;
360
-
361
- current_next = node->next_sibling;
362
- node->next_sibling = trie_new_node_with_char(ch);
363
- node->next_sibling->next_sibling = current_next;
364
-
365
- return node->next_sibling;
366
- }
367
-
368
- static trie_node * trie_new_node_with_char(char ch) {
369
- trie_node * trie;
370
- trie = malloc(sizeof(trie_node));
371
- trie->character = ch;
372
- trie->value = Qnil;
373
- trie->first_child = NULL;
374
- trie->next_sibling = NULL;
375
- return trie;
376
- }
377
-
378
- static trie_node * trie_new_node() {
379
- return trie_new_node_with_char('s'); //insert most common starting letter here.
380
- }
381
-
382
- static void trie_traverse(trie_node * trie, void (* lambda_func)(void *)) {
383
- if (trie->next_sibling != NULL) {
384
- trie_traverse(trie->next_sibling, lambda_func);
385
- }
386
-
387
- if (trie->first_child != NULL) {
388
- trie_traverse(trie->first_child, lambda_func);
389
- }
390
-
391
- lambda_func(trie);
392
- }
393
-
394
- static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE)) {
395
- if (trie->next_sibling != NULL) {
396
- trie_traverse_with_context(trie->next_sibling, context, lambda_func);
397
- }
398
-
399
- if (trie->first_child != NULL) {
400
- trie_traverse_with_context(trie->first_child, context, lambda_func);
401
- }
402
-
403
- lambda_func(trie, context);
404
- }
405
-
406
- static void free_trie(trie_node * trie) {
407
- trie_traverse(trie, free);
408
- }