RubyTrie 1.1 → 2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -1,2 +1,4 @@
1
1
  1.0 added children and each methods
2
- 1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
2
+ 1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
3
+ 2.0 completely rewrote the trie implementation to use a more optimized data structure. For /usr/share/dict/web2 it brings memory usage down from about 40Mb to about 6Mb. The levenshtein search now takes a block and passes in each value, but it can easily be extended to return an array. It also has a glitch where some values are, for some reason, just false... :|
4
+ 2.1 bug fixes, fixing levenshtein search
data/README CHANGED
@@ -22,6 +22,11 @@ t["goes"] = 2
22
22
  t["gone"] = 3
23
23
  t["other"] = 4
24
24
 
25
+ By default, assigning multiple values to the same key will add them to an array.
26
+ t['a'] = 1
27
+ t['a'] = 2
28
+ after which t['a'] will return [1,2]
29
+
25
30
  t.children("go") => [1,2,3]
26
31
 
27
32
  t.each("partial key") will yield to the given block all values that are matched by the partial key
@@ -29,8 +34,11 @@ t.each("go") {|v| puts v } => prints 1, 2 & 3
29
34
 
30
35
  Levenshtein search
31
36
 
32
- t.levenshtein_search("go", 2) - will return all values for the words in the trie that have 1 from the search word
33
- ex: [["go", 0], ["goes", 2], ["gone", 2]]
37
+ t.levenshtein_search("go", 2) {|value| puts value} - will print the values of all words in the trie that are within an edit distance of 2 from the search word
38
+ Eg.
39
+ 1
40
+ 2
41
+ 3
34
42
 
35
43
  == Bugs
36
44
 
@@ -0,0 +1,77 @@
1
+ /*
2
+ * levenshtein_distance.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <stdlib.h>
11
+ #include <string.h> /* for memcmp, memmove */
12
+ #include <stdio.h>
13
+
14
+ #include "otrie2.h"
15
+ #include "levenshtein_distance.h"
16
+
17
+
18
+ void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length);
19
+
20
+ void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb) {
21
+ int first_line[strlen(word) + 1];
22
+ int i=0;
23
+ for(; i < strlen(word) + 1; i++) {
24
+ first_line[i] = i;
25
+ }
26
+ recursive_levenshtein_search(trie->next_sibling, 0, cb, first_line, max_distance, word, strlen(word));
27
+ }
28
+
29
+
30
/*
 * Returns the smallest of the first `len` entries of `numbers`.
 * numbers[0] is always read, so the array must hold at least one element.
 */
int minimum(int* numbers, int len) {
  int smallest = numbers[0];
  int idx = 1;

  while (idx < len) {
    int candidate = numbers[idx];
    if (candidate < smallest) {
      smallest = candidate;
    }
    idx++;
  }

  return smallest;
}
38
+
39
/* Returns the smallest of the three integers a, b and c. */
int min3(int a, int b, int c) {
  int smaller_ab = (b < a) ? b : a;
  return (c < smaller_ab) ? c : smaller_ab;
}
45
+
46
+ void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length) {
47
+ int cur_line[word_length + 1];
48
+ int i, insert_cost, replace_cost, delete_cost;
49
+
50
+ cur_line[0] = prev_line[0] + 1;
51
+
52
+ for(i=1; i < word_length + 1; i++) {
53
+ insert_cost = cur_line[i-1] + 1;
54
+ delete_cost = prev_line[i] + 1;
55
+ if (trie->data[node_offset] != word[i-1]) {
56
+ replace_cost = prev_line[i-1] + 1;
57
+ } else {
58
+ replace_cost = prev_line[i-1];
59
+ }
60
+ cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
61
+ }
62
+
63
+
64
+ if (cur_line[word_length] <= max_dist && (strlen(trie->data) == (node_offset + 1)) && trie->value != Qnil) {
65
+ cb(trie, cur_line[word_length]);
66
+ }
67
+
68
+ if (minimum(cur_line, word_length + 1) <= max_dist) {
69
+ if (strlen(trie->data) > (node_offset + 1))
70
+ recursive_levenshtein_search(trie, node_offset + 1, cb, cur_line, max_dist, word, word_length);
71
+ if (trie->first_child != NULL)
72
+ recursive_levenshtein_search(trie->first_child, 0, cb, cur_line, max_dist, word, word_length);
73
+ if (trie->next_sibling != NULL)
74
+ recursive_levenshtein_search(trie->next_sibling, 0, cb, prev_line, max_dist, word, word_length);
75
+ }
76
+
77
+ }
@@ -0,0 +1,13 @@
1
+ /*
2
+ * levenshtein_distance.h
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include "otrie2.h"
11
+
12
+ typedef void (*levenshtein_distance_callback)(Node* node, int length);
13
+ void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb);
data/ext/trie/otrie2.c ADDED
@@ -0,0 +1,161 @@
1
+ /*
2
+ * otrie2.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <stdlib.h> /* for malloc, free */
11
+ #include <string.h> /* for memcmp, memmove */
12
+ #include <stdio.h>
13
+
14
+ #include "otrie2.h"
15
+
16
+ Node* new_node() {
17
+ Node *node = malloc(sizeof(Node));
18
+ memset(node, 0, sizeof(Node));
19
+ return node;
20
+ }
21
+
22
+ Node* new_node_string_len(const char* string, const int len) {
23
+ Node *node = new_node();
24
+ node->data = malloc(len+1);
25
+ strncpy(node->data, string, len);
26
+ node->data[len] = 0;
27
+ return node;
28
+ }
29
+
30
+ Node* new_node_string(const char* string) {
31
+ Node *node = new_node();
32
+ int len = strlen(string);
33
+ node->data = malloc(len+1);
34
+ strcpy(node->data, string);
35
+ return node;
36
+ }
37
+
38
+ void node_update_data(Node* node, const char* string, int len) {
39
+ char* new_data = malloc(len+1);
40
+ strncpy(new_data, string, len);
41
+ new_data[len] = 0;
42
+ free(node->data);
43
+ node->data = new_data;
44
+ }
45
+
46
+ void free_node(Node *node) {
47
+ if (node->first_child) free_node(node->first_child);
48
+ if (node->next_sibling) free_node(node->next_sibling);
49
+ free(node->data);
50
+ free(node);
51
+ }
52
+
53
+ void node_insert(Node* node, const char* string, const VALUE value) {
54
+ int len = strlen(string);
55
+ Pos *cur = new_pos(node, 0);
56
+ int i=0;
57
+ for (; i<len; i++) {
58
+ pos_next(cur, string + i, true);
59
+ }
60
+ cur->node->value = value;
61
+ }
62
+
63
+ Node* node_find(Node* this, const char* string) {
64
+ int len = strlen(string);
65
+ Pos *cur = new_pos(this, 0);
66
+ int i=0;
67
+ for(; i<len; i++) {
68
+ pos_next(cur, string + i, false);
69
+ if (cur->node == NULL) { return NULL; }
70
+ }
71
+ if (strlen(cur->node->data) == cur->offset + 1)
72
+ return cur->node;
73
+ return NULL;
74
+ }
75
+
76
+ Pos* new_pos(Node *node, int offset) {
77
+ Pos *pos = malloc(sizeof(Pos));
78
+ pos->node = node;
79
+ pos->offset = offset;
80
+ return pos;
81
+ }
82
+
83
+ Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert) {
84
+ Node *child = this->node->next_sibling;
85
+ if (down) child = this->node->first_child;
86
+
87
+ Node *last_child = NULL;
88
+
89
+ while(child != NULL && *child->data != *string) {
90
+ last_child = child;
91
+ child = child -> next_sibling;
92
+ }
93
+ if (child == NULL && insert) {
94
+ child = new_node_string(string);
95
+ if (!down) {
96
+ if (this->node -> next_sibling != NULL) {
97
+ last_child -> next_sibling = child;
98
+ } else {
99
+ this -> node -> next_sibling = child;
100
+ }
101
+ } else {
102
+ if (this -> node -> first_child != NULL) {
103
+ last_child -> next_sibling = child;
104
+ } else {
105
+ this->node -> first_child = child;
106
+ }
107
+ }
108
+ }
109
+ return child;
110
+ }
111
+
112
+ void pos_next(Pos *this, const char* string, bool insert) {
113
+ if (this -> node -> data == NULL) {
114
+ this->node = pos_find_or_create_child(this, string, false, insert);
115
+ this->offset = 0;
116
+ return;
117
+ }
118
+
119
+ int len = strlen(this->node->data);
120
+ if (this -> offset + 1 < len) {
121
+ if (this -> node->data[this -> offset + 1] == string[0]) {
122
+ this -> offset++;
123
+ return;
124
+ } else {
125
+ // split paths
126
+ // - new child node with old partial content
127
+ // - new child node with new content
128
+ if (insert) {
129
+ Node *splitChild = new_node_string(this->node->data + this->offset + 1);
130
+ splitChild -> value = this -> node -> value;
131
+ Node *newChild = new_node_string(string);
132
+ newChild -> value = Qnil;
133
+ node_update_data(this->node, this->node->data, this->offset + 1);
134
+ splitChild -> next_sibling = newChild;
135
+ this->node -> first_child = splitChild;
136
+ this->node -> value = Qnil;
137
+
138
+ this->node = newChild;
139
+ this->offset = 0;
140
+ } else {
141
+ this -> node = NULL;
142
+ this -> offset = 0;
143
+ }
144
+ }
145
+ } else {
146
+ // reached end of data... find a child
147
+ this->node = pos_find_or_create_child(this, string, true, insert);
148
+ this->offset = 0;
149
+ return;
150
+ }
151
+ }
152
+
153
+ void node_visit(Node* this, node_iterator func, VALUE context) {
154
+ func(this, context);
155
+ if (this->first_child != NULL) {
156
+ node_visit(this->first_child, func, context);
157
+ }
158
+ if (this->next_sibling != NULL) {
159
+ node_visit(this->next_sibling, func, context);
160
+ }
161
+ }
data/ext/trie/otrie2.h ADDED
@@ -0,0 +1,41 @@
1
+ /*
2
+ * otrie2.h
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #define bool int
12
+ #define true 1
13
+ #define false 0
14
+
15
+ #ifndef TRIE_NODE
16
+ #define TRIE_NODE
17
+
18
+ typedef struct trie_node {
19
+ char *data;
20
+ struct trie_node* first_child;
21
+ struct trie_node* next_sibling;
22
+ VALUE value;
23
+ } Node;
24
+
25
+ typedef struct pos_struct {
26
+ Node *node;
27
+ int offset;
28
+ } Pos;
29
+
30
+ typedef void (*node_iterator)(Node* node, VALUE context);
31
+
32
+ Node* new_node();
33
+ void free_node(Node*);
34
+ void node_insert(Node* node, const char* string, const VALUE value);
35
+ Node* node_find(Node* this, const char* string);
36
+
37
+ Pos* new_pos(Node *node, int offset);
38
+ Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert);
39
+ void pos_next(Pos *this, const char* string, bool);
40
+ void node_visit(Node* this, node_iterator func, VALUE context);
41
+ #endif
@@ -0,0 +1,199 @@
1
+ /*
2
+ * ruby_trie.c
3
+ * otrie
4
+ *
5
+ * Created by Petrica Ghiurca on 18.03.2011.
6
+ * Copyright 2011 Petrica Ghiurca. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include <ruby.h>
11
+ #include <stdlib.h> /* for malloc, free */
12
+ #include <string.h> /* for memcmp, memmove */
13
+ #include "otrie2.h"
14
+ #include "levenshtein_distance.h"
15
+
16
+ static VALUE rb_cTrie;
17
+
18
+ static void count_nodes_callback(Node *trie, VALUE accum);
19
+ static VALUE rb_trie_count_nodes(VALUE self);
20
+ static VALUE rb_trie_allocate(VALUE klass);
21
+ static VALUE rb_trie_get_key(VALUE self, VALUE key);
22
+ static void trie_mark_value(Node*, VALUE);
23
+ static void rb_trie_mark(Node* t);
24
+ static void rb_trie_free(Node * t);
25
+ void tree_collect_values(Node *node, VALUE rary);
26
+ static VALUE rb_trie_find_children(VALUE self, VALUE key);
27
+ static void trie_collect_values_with_yield(Node * node, VALUE context);
28
+ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
+ static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value);
30
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
31
+
32
+ // extension init
33
+ void Init_trie() {
34
+ rb_cTrie = rb_define_class("Trie", rb_cObject);
35
+
36
+ rb_define_alloc_func(rb_cTrie, rb_trie_allocate);
37
+
38
+ int arg_count = 0;
39
+ //rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
40
+ rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
41
+
42
+ arg_count = 1;
43
+ rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
44
+ // rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
45
+ rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
46
+ rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
47
+
48
+ arg_count = 2;
49
+ rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
50
+ // trie.levenshtein_search(word, max_distance)
51
+ rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
52
+ }
53
+
54
+ static int total_memory;
55
+ static void count_nodes_callback(Node *trie, VALUE accum) {
56
+ int len = 0;
57
+ if (trie->data) len = strlen(trie->data);
58
+ rb_big_plus(accum, rb_uint2big(len + sizeof(Node)));
59
+ total_memory += len + sizeof(Node);
60
+ }
61
+
62
+ static VALUE rb_trie_count_nodes(VALUE self) {
63
+ Node *root;
64
+ Data_Get_Struct(self, Node, root);
65
+ VALUE accum = rb_uint2big(0);
66
+ total_memory = 0;
67
+ node_visit(root, count_nodes_callback, accum);
68
+ //return accum;
69
+ return rb_uint2big(total_memory);
70
+ }
71
+
72
+ static VALUE rb_trie_allocate(VALUE klass) {
73
+ Node * t = new_node();
74
+ return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
75
+ }
76
+
77
+ static VALUE rb_trie_get_key(VALUE self, VALUE key) {
78
+ Node * root;
79
+ Node * node;
80
+ char * key_cstring;
81
+
82
+ Check_Type(key, T_STRING);
83
+ key_cstring = StringValuePtr(key);
84
+
85
+ Data_Get_Struct(self, Node, root);
86
+
87
+ node = node_find(root, key_cstring);
88
+ if (node == NULL) return Qnil;
89
+ return node->value;
90
+ }
91
+
92
+
93
+ static void trie_mark_value(Node * t, VALUE context) {
94
+ rb_gc_mark( t->value );
95
+ }
96
+
97
+ static void rb_trie_mark(Node* t) {
98
+ node_visit(t, trie_mark_value, Qnil);
99
+ }
100
+
101
+ static void rb_trie_free(Node * t) {
102
+ free_node(t);
103
+ }
104
+
105
+ void tree_collect_values(Node *node, VALUE rary) {
106
+ if (node->value != Qnil) {
107
+ rb_ary_push(rary, node->value);
108
+ }
109
+ }
110
+
111
+ static VALUE rb_trie_find_children(VALUE self, VALUE key) {
112
+ Node * root;
113
+ Node * node;
114
+ char * key_cstring;
115
+ VALUE rary = rb_ary_new();
116
+
117
+ key_cstring = StringValuePtr(key);
118
+ Data_Get_Struct(self, Node, root);
119
+
120
+ node = node_find(root, key_cstring);
121
+
122
+ if (node != NULL && node->value != Qnil) {
123
+ rb_ary_push(rary, node->value);
124
+ }
125
+
126
+ if (node == NULL || node->first_child == NULL) return rary;
127
+
128
+ node_visit(node->first_child, tree_collect_values, rary);
129
+ return rary;
130
+ }
131
+
132
+ static void trie_collect_values_with_yield(Node * node, VALUE context) {
133
+ if (node->value != Qnil) {
134
+ rb_yield(node->value);
135
+ }
136
+ }
137
+
138
+ static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
139
+ Node * root;
140
+ Node * node;
141
+ char * key_cstring;
142
+ VALUE rary = rb_ary_new();
143
+
144
+ key_cstring = StringValuePtr(key);
145
+ Data_Get_Struct(self, Node, root);
146
+
147
+ node = node_find(root, key_cstring);
148
+
149
+ if (node != NULL && node->value != Qnil) {
150
+ rb_yield(node->value);
151
+ }
152
+
153
+ if (node == NULL || node->first_child == NULL) return rary;
154
+
155
+ node_visit(node->first_child, trie_collect_values_with_yield, Qnil);
156
+ return rary;
157
+ }
158
+
159
+
160
+ static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
161
+ Node * root, *node;
162
+ char * key_cstring;
163
+
164
+ Check_Type(key, T_STRING);
165
+ key_cstring = StringValuePtr(key);
166
+
167
+ Data_Get_Struct(self, Node, root);
168
+
169
+ node = node_find(root, key_cstring);
170
+ if (node == NULL || node -> value == Qnil) {
171
+ // printf("New node for %s -> %d\n", key_cstring, value);
172
+ VALUE arr = rb_ary_new();
173
+ rb_ary_push(arr, value);
174
+ node_insert(root, key_cstring, arr);
175
+ } else {
176
+ // printf("Append value %s to %s -> %d\n", node->data, key_cstring, node->value);
177
+ rb_ary_push(node->value, value);
178
+ }
179
+
180
+ return Qnil;
181
+ }
182
+
183
+ void rb_levensthtein_cb(Node* node, int distance) {
184
+ if (node->value != Qnil)
185
+ rb_yield(node->value);
186
+ }
187
+
188
+ static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
189
+ Node *root;
190
+ char *word_cstring;
191
+
192
+ Data_Get_Struct(self, Node, root);
193
+
194
+ word_cstring = StringValuePtr(word);
195
+
196
+ levenshtein_distance(root, word_cstring, FIX2INT(max_distance), rb_levensthtein_cb);
197
+
198
+ return Qnil;
199
+ }
data/ext/trie/t.rb CHANGED
@@ -1,20 +1,22 @@
1
1
  require 'trie'
2
+ require 'rubygems'
2
3
  require 'benchmark'
3
4
 
4
5
  t = Trie.new
5
6
  c = 0
7
+ a = 0
6
8
  s1 = (Benchmark.measure do
7
- open('/usr/share/dict/words').each_line do |w|
8
- t[w.chop] = w.chop
9
+ open('/usr/share/dict/web2').each_line do |w|
10
+ t[w.chop]= w
9
11
  c += 1
10
12
  end
11
13
  end)
12
14
 
13
15
  # %w(gol golas golaster lux xal).each {|w| t[w] = w}
14
16
  s2 = (Benchmark.measure do
15
- t.levenshtein_search('tread', 2).sort_by {|p| p.last }.each {|p| puts "#{p.last}: #{p.first}"}
17
+ t.levenshtein_search('food', 1) {|p| puts p}
16
18
  end)
17
19
 
18
- puts "#{t.memory/(1024*1024)}Mb, #{c} words"
20
+ puts "#{t.memory/(1024*1024)}Mb, #{c} words, #{a} unique"
19
21
  puts s1
20
22
  puts s2
data/test/trie_test.rb CHANGED
@@ -23,10 +23,11 @@ time do
23
23
  1.upto(max) do |i|
24
24
  t["item #{i}"].class
25
25
  end
26
-
27
- 1.upto(max) do |i|
28
- t.delete("item #{i}")
29
- end
26
+
27
+ # not implemented yet
28
+ # 1.upto(max) do |i|
29
+ # t.delete("item #{i}")
30
+ # end
30
31
  end
31
32
 
32
33
 
@@ -41,9 +42,9 @@ time do
41
42
  t["item #{i}"].class
42
43
  end
43
44
 
44
- 1.upto(max) do |i|
45
- t.delete("item #{i}")
46
- end
45
+ # 1.upto(max) do |i|
46
+ # t.delete("item #{i}")
47
+ # end
47
48
  end
48
49
 
49
50
  # puts "With a Ruby Trie..."
metadata CHANGED
@@ -1,21 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: RubyTrie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 1
5
5
  prerelease: false
6
6
  segments:
7
+ - 2
7
8
  - 1
8
- - 1
9
- version: "1.1"
9
+ version: "2.1"
10
10
  platform: ruby
11
11
  authors:
12
- - Matt Freels
13
12
  - Petrica Ghiurca
13
+ - Matt Freels
14
14
  autorequire: trie
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-16 00:00:00 +02:00
18
+ date: 2011-03-21 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -30,7 +30,11 @@ extra_rdoc_files:
30
30
  files:
31
31
  - README
32
32
  - ChangeLog
33
- - ext/trie/trie.c
33
+ - ext/trie/levenshtein_distance.h
34
+ - ext/trie/otrie2.h
35
+ - ext/trie/levenshtein_distance.c
36
+ - ext/trie/otrie2.c
37
+ - ext/trie/ruby_trie.c
34
38
  - ext/trie/extconf.rb
35
39
  - ext/trie/t.rb
36
40
  - lib/ruby_trie.rb
data/ext/trie/trie.c DELETED
@@ -1,408 +0,0 @@
1
- #include <stdlib.h> /* for malloc, free */
2
- #include <string.h> /* for memcmp, memmove */
3
- #include "ruby.h"
4
-
5
- // typdefs!
6
- typedef enum { false = 0, true} bool;
7
-
8
- typedef struct node {
9
- char character;
10
- VALUE value;
11
- struct node * first_child;
12
- struct node * next_sibling;
13
- } trie_node;
14
-
15
- static VALUE rb_cTrie;
16
-
17
- // =========================
18
- // = function declarations =
19
- // =========================
20
-
21
- //trie implementation
22
- static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes);
23
- static trie_node * trie_sibling_for_char(trie_node * node, char ch);
24
- static trie_node * trie_add_sibling_for_char(trie_node * node, char ch);
25
- static trie_node * trie_new_node_with_char(char ch);
26
- static trie_node * trie_new_node();
27
- static VALUE rb_trie_find_children(VALUE self, VALUE key);
28
- static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
29
- static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
30
- static void trie_collect_values(void * t, VALUE prary);
31
- static void trie_collect_values_with_yield(void * t);
32
- static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
33
- static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE));
34
- static void free_trie(trie_node * trie);
35
- static void count_nodes_callback(void *n, VALUE accum);
36
- static VALUE rb_trie_count_nodes(VALUE self);
37
- static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
38
- // int print_arr(char c, int* arr, int len);
39
-
40
-
41
- // ========================
42
- // = function definitions =
43
- // ========================
44
-
45
- // instance methods
46
- static VALUE rb_trie_get_key(VALUE self, VALUE key) {
47
- trie_node * root;
48
- trie_node * node;
49
- char * key_cstring;
50
-
51
- //Check_Type(key, T_STRING);
52
- key_cstring = StringValuePtr(key);
53
-
54
- Data_Get_Struct(self, trie_node, root);
55
-
56
- node = trie_node_for_key(root, key_cstring, false);
57
- if (node == NULL) return Qnil;
58
- return node->value;
59
- }
60
-
61
- static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
62
- trie_node * root;
63
- trie_node * node;
64
- char * key_cstring;
65
-
66
- //Check_Type(key, T_STRING);
67
- key_cstring = StringValuePtr(key);
68
-
69
- Data_Get_Struct(self, trie_node, root);
70
-
71
- node = trie_node_for_key(root, key_cstring, true);
72
- node->value = value;
73
-
74
- return Qnil;
75
- }
76
-
77
- static uint mem_count = 0;
78
-
79
- static void count_nodes_callback(void *n, VALUE accum) {
80
- trie_node *node = (trie_node*)n;
81
- // rb_big_plus(accum, rb_uint2big(sizeof(*node)));
82
- mem_count+=sizeof(*node);
83
- }
84
-
85
- static VALUE rb_trie_count_nodes(VALUE self) {
86
- trie_node *root;
87
- Data_Get_Struct(self, trie_node, root);
88
- VALUE accum = rb_uint2big(0);
89
- mem_count = 0;
90
- trie_traverse_with_context(root, accum, count_nodes_callback);
91
- return rb_uint2big(mem_count);
92
- }
93
-
94
- static VALUE rb_trie_undef_key(VALUE self, VALUE key) {
95
- trie_node * root, * node, * prev, * next;
96
- VALUE return_value;
97
- char * key_cstring;
98
- int steps;
99
- int i;
100
-
101
- //Check_Type(key, T_STRING);
102
- key_cstring = StringValuePtr(key);
103
-
104
- Data_Get_Struct(self, trie_node, root);
105
- next = root;
106
- node = NULL;
107
- prev = NULL;
108
-
109
- steps = strlen(key_cstring);
110
-
111
- for (i = 0; i < steps; i++) {
112
- if (next == NULL) return Qnil;
113
-
114
- while(next->character != key_cstring[i]) {
115
- if (next == NULL) return Qnil;
116
- next = next->next_sibling;
117
- }
118
- prev = node;
119
- node = next;
120
- next = node->first_child;
121
- }
122
-
123
- return_value = node->value;
124
- node->value = Qnil;
125
-
126
- if (node->first_child == NULL) { //node has no children. we can delete it.
127
- if (prev == NULL) {
128
- //printf("should delete root");
129
- } else if (prev->first_child == node) {
130
- prev->first_child = node->next_sibling;
131
- free(node);
132
- } else if (prev->next_sibling == node) {
133
- prev->next_sibling = node->next_sibling;
134
- free(node);
135
- }
136
- }
137
-
138
- return return_value;
139
- }
140
-
141
- // garbage collection and allocation
142
- static void trie_mark_value(void * t) {
143
- rb_gc_mark( ((trie_node *)t)->value );
144
- }
145
-
146
- static void rb_trie_mark(trie_node * t) {
147
- trie_traverse(t, trie_mark_value);
148
- }
149
-
150
- static void rb_trie_free(trie_node * t) {
151
- free_trie(t);
152
- }
153
-
154
- static VALUE rb_trie_allocate (VALUE klass) {
155
- trie_node * t = trie_new_node();
156
-
157
- return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
158
- }
159
-
160
- // extension init
161
- void Init_trie() {
162
- rb_cTrie = rb_define_class("Trie", rb_cObject);
163
-
164
- rb_define_alloc_func (rb_cTrie, rb_trie_allocate);
165
-
166
- int arg_count = 0;
167
- //rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
168
- rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
169
-
170
- arg_count = 1;
171
- rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
172
- rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
173
- rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
174
- rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
175
-
176
- arg_count = 2;
177
- rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
178
- // trie.levenshtein_search(word, max_distance)
179
- rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
180
- }
181
-
182
-
183
- // =======================
184
- // = trie implementation =
185
- // =======================
186
-
187
- static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes) {
188
- int steps, i;
189
- trie_node * next, * node;
190
-
191
- steps = strlen(key);
192
- next = root;
193
-
194
- for (i = 0; i < steps; i++) {
195
- if (next == NULL) {
196
- if (create_missing_nodes) {
197
- node->first_child = trie_new_node();
198
- next = node->first_child;
199
- }
200
- else return NULL;
201
- }
202
-
203
- node = trie_sibling_for_char(next, key[i]);
204
-
205
- if (node == NULL) {
206
- if (create_missing_nodes) {
207
- node = trie_add_sibling_for_char(next, key[i]);
208
- }
209
- else return NULL;
210
- }
211
-
212
- next = node->first_child;
213
- }
214
-
215
- return node;
216
- }
217
-
218
- static void trie_collect_values(void * t, VALUE rary) {
219
- trie_node *node = (trie_node*)t;
220
- if (node->value != Qnil) {
221
- rb_ary_push(rary, node->value);
222
- }
223
- }
224
-
225
- static void trie_collect_values_with_yield(void * t) {
226
- trie_node *node = (trie_node*)t;
227
- if (node->value != Qnil) {
228
- // rb_ary_push(rary, node->value);
229
- rb_yield(node->value);
230
- }
231
- }
232
-
233
- static VALUE rb_trie_find_children(VALUE self, VALUE key) {
234
- trie_node * root;
235
- trie_node * node;
236
- char * key_cstring;
237
- VALUE rary = rb_ary_new();
238
-
239
- key_cstring = StringValuePtr(key);
240
- Data_Get_Struct(self, trie_node, root);
241
-
242
- node = trie_node_for_key(root, key_cstring, false);
243
-
244
- if (node != NULL && node->value != Qnil) {
245
- rb_ary_push(rary, node->value);
246
- }
247
-
248
- if (node == NULL || node->first_child == NULL) return rary;
249
-
250
- trie_traverse_with_context(node->first_child, rary, trie_collect_values);
251
- return rary;
252
- }
253
-
254
-
255
- static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
256
- trie_node * root;
257
- trie_node * node;
258
- char * key_cstring;
259
- VALUE rary = rb_ary_new();
260
-
261
- key_cstring = StringValuePtr(key);
262
- Data_Get_Struct(self, trie_node, root);
263
-
264
- node = trie_node_for_key(root, key_cstring, false);
265
-
266
- if (node != NULL && node->value != Qnil) {
267
- rb_yield(node->value);
268
- }
269
-
270
- if (node == NULL || node->first_child == NULL) return rary;
271
-
272
- trie_traverse(node->first_child, trie_collect_values_with_yield);
273
- return rary;
274
- }
275
-
276
- static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
277
- trie_node *root;
278
- trie_node *node;
279
- char *word_cstring;
280
- VALUE rary = rb_ary_new();
281
- int i=0;
282
-
283
- Data_Get_Struct(self, trie_node, root);
284
-
285
- word_cstring = StringValuePtr(word);
286
-
287
- int first_line[strlen(word_cstring) + 1];
288
- for(; i < strlen(word_cstring) + 1; i++) {
289
- first_line[i] = i;
290
- }
291
- // print_arr('R', first_line, strlen(word_cstring)+1);
292
- recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
293
-
294
- return rary;
295
- }
296
-
297
- int minimum(int* numbers, int len) {
298
- int minValue = numbers[0];
299
- int i;
300
- for(i=1; i<len; i++) {
301
- if (numbers[i] < minValue) minValue = numbers[i];
302
- }
303
- return minValue;
304
- }
305
-
306
- int min3(int a, int b, int c) {
307
- int min = a;
308
- if (b < min) min = b;
309
- if (c < min) min = c;
310
- return min;
311
- }
312
-
313
- static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
314
- int cur_line[word_length + 1];
315
- int i,j, insert_cost, replace_cost, delete_cost;
316
- VALUE carr;
317
-
318
- cur_line[0] = prev_line[0] + 1;
319
-
320
- for(i=1; i < word_length + 1; i++) {
321
- insert_cost = cur_line[i-1] + 1;
322
- delete_cost = prev_line[i] + 1;
323
- if (trie->character != word[i-1]) {
324
- replace_cost = prev_line[i-1] + 1;
325
- } else {
326
- replace_cost = prev_line[i-1];
327
- }
328
- cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
329
- }
330
-
331
-
332
- if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
333
- carr = rb_ary_new();
334
- rb_ary_push(carr, trie->value);
335
- rb_ary_push(carr, INT2FIX(cur_line[word_length]));
336
- rb_ary_push(rary, carr);
337
- }
338
-
339
- if (minimum(cur_line, word_length + 1) <= max_dist) {
340
- if (trie->first_child != NULL)
341
- recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
342
- if (trie->next_sibling != NULL)
343
- recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
344
- }
345
- }
346
-
347
- static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
348
- while(true) {
349
- if (node == NULL) return NULL;
350
-
351
- if (node->character == ch) return node;
352
-
353
- node = node->next_sibling;
354
- }
355
- return node;
356
- }
357
-
358
- static trie_node * trie_add_sibling_for_char(trie_node * node, char ch) {
359
- trie_node * current_next;
360
-
361
- current_next = node->next_sibling;
362
- node->next_sibling = trie_new_node_with_char(ch);
363
- node->next_sibling->next_sibling = current_next;
364
-
365
- return node->next_sibling;
366
- }
367
-
368
- static trie_node * trie_new_node_with_char(char ch) {
369
- trie_node * trie;
370
- trie = malloc(sizeof(trie_node));
371
- trie->character = ch;
372
- trie->value = Qnil;
373
- trie->first_child = NULL;
374
- trie->next_sibling = NULL;
375
- return trie;
376
- }
377
-
378
- static trie_node * trie_new_node() {
379
- return trie_new_node_with_char('s'); //insert most common starting letter here.
380
- }
381
-
382
- static void trie_traverse(trie_node * trie, void (* lambda_func)(void *)) {
383
- if (trie->next_sibling != NULL) {
384
- trie_traverse(trie->next_sibling, lambda_func);
385
- }
386
-
387
- if (trie->first_child != NULL) {
388
- trie_traverse(trie->first_child, lambda_func);
389
- }
390
-
391
- lambda_func(trie);
392
- }
393
-
394
- static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE)) {
395
- if (trie->next_sibling != NULL) {
396
- trie_traverse_with_context(trie->next_sibling, context, lambda_func);
397
- }
398
-
399
- if (trie->first_child != NULL) {
400
- trie_traverse_with_context(trie->first_child, context, lambda_func);
401
- }
402
-
403
- lambda_func(trie, context);
404
- }
405
-
406
- static void free_trie(trie_node * trie) {
407
- trie_traverse(trie, free);
408
- }