fast_trie 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,31 @@
1
+ /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
+ /*
3
+ * trie-private.h - Private utilities for trie implementation
4
+ * Created: 2007-08-25
5
+ * Author: Theppitak Karoonboonyanan <thep@linux.thai.net>
6
+ */
7
+
8
+ #ifndef __TRIE_PRIVATE_H
9
+ #define __TRIE_PRIVATE_H
10
+
11
+ #include "typedefs.h"
12
+
13
+ /**
14
+ * @file trie-private.h
15
+ * @brief Private utilities for trie implementation
16
+ */
17
+
18
/**
 * @brief Minimum value macro
 *
 * Expands to whichever of @a a and @a b compares smaller (@a b when they
 * compare equal). An argument may be evaluated twice, so avoid passing
 * expressions with side effects.
 */
#define MIN_VAL(a,b)  (((a) < (b)) ? (a) : (b))

/**
 * @brief Maximum value macro
 *
 * Expands to whichever of @a a and @a b compares larger (@a b when they
 * compare equal). An argument may be evaluated twice, so avoid passing
 * expressions with side effects.
 */
#define MAX_VAL(a,b)  (((a) > (b)) ? (a) : (b))
26
+
27
+ #endif /* __TRIE_PRIVATE_H */
28
+
29
+ /*
30
+ vi:ts=4:ai:expandtab
31
+ */
data/ext/trie/trie.c ADDED
@@ -0,0 +1,452 @@
1
+ #include "ruby.h"
2
+ #include "trie.h"
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+ #include <string.h>
6
+
7
+ VALUE cTrie, cTrieNode;
8
+
9
+ /*
10
+ * Document-class: Trie
11
+ *
12
+ * A key-value data structure for string keys that offers efficient memory usage and fast retrieval.
13
+ *
14
+ */
15
+
16
+ static VALUE rb_trie_alloc(VALUE klass) {
17
+ VALUE obj;
18
+ obj = Data_Wrap_Struct(klass, 0, trie_free, trie_new());
19
+ return obj;
20
+ }
21
+
22
+ /*
23
+ * call-seq:
24
+ * has_key?(key) -> true/false
25
+ *
26
+ * Determines whether or not a key exists in the Trie. Use this if you don't care about the value, as it
27
+ * is marginally faster than Trie#get.
28
+ *
29
+ */
30
+ static VALUE rb_trie_has_key(VALUE self, VALUE key) {
31
+ Trie *trie;
32
+ Data_Get_Struct(self, Trie, trie);
33
+
34
+ if(trie_has_key(trie, (TrieChar*)RSTRING(key)->ptr))
35
+ return Qtrue;
36
+ else
37
+ return Qnil;
38
+ }
39
+
40
+ /*
41
+ * call-seq:
42
+ * get(key) -> value
43
+ * [key] -> value
44
+ *
45
+ * Retrieves the value for a particular key (or nil) from the Trie.
46
+ *
47
+ */
48
+ static VALUE rb_trie_get(VALUE self, VALUE key) {
49
+ Trie *trie;
50
+ Data_Get_Struct(self, Trie, trie);
51
+
52
+ TrieData data;
53
+ if(trie_retrieve(trie, (TrieChar*)RSTRING(key)->ptr, &data))
54
+ return (VALUE)data;
55
+ else
56
+ return Qnil;
57
+ }
58
+
59
+ /*
60
+ * call-seq:
61
+ * add(key)
62
+ * add(key,value)
63
+ *
64
+ * Add a key, or a key and value to the Trie. If you add a key without a value it assumes true for the value.
65
+ *
66
+ */
67
+ static VALUE rb_trie_add(VALUE self, VALUE args) {
68
+ Trie *trie;
69
+ Data_Get_Struct(self, Trie, trie);
70
+
71
+ int size = RARRAY(args)->len;
72
+ if(size < 1 || size > 2)
73
+ return Qnil;
74
+
75
+ VALUE key;
76
+ key = RARRAY(args)->ptr[0];
77
+ TrieData value = size == 2 ? RARRAY(args)->ptr[1] : TRIE_DATA_ERROR;
78
+
79
+ if(trie_store(trie, (TrieChar*)RSTRING(key)->ptr, value))
80
+ return Qtrue;
81
+ else
82
+ return Qnil;
83
+ }
84
+
85
+ /*
86
+ * call-seq:
87
+ * delete(key)
88
+ *
89
+ * Delete a key from the Trie. Returns true if it deleted a key, nil otherwise.
90
+ *
91
+ */
92
+ static VALUE rb_trie_delete(VALUE self, VALUE key) {
93
+ Trie *trie;
94
+ Data_Get_Struct(self, Trie, trie);
95
+
96
+ if(trie_delete(trie, (TrieChar*)RSTRING(key)->ptr))
97
+ return Qtrue;
98
+ else
99
+ return Qnil;
100
+ }
101
+
102
+ static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
103
+ int c;
104
+ for(c = 1; c < 256; c++) {
105
+ if(trie_state_is_walkable(state,c)) {
106
+ TrieState *next_state = trie_state_clone(state);
107
+ trie_state_walk(next_state, c);
108
+
109
+ prefix[prefix_size] = c;
110
+ prefix[prefix_size + 1] = 0;
111
+
112
+ if(trie_state_is_terminal(next_state)) {
113
+ char *word = (char*) malloc(prefix_size + 2);
114
+ memcpy(word, prefix, prefix_size + 2);
115
+ rb_ary_push(children, rb_str_new2(word));
116
+ }
117
+
118
+ walk_all_paths(trie, children, next_state, prefix, prefix_size + 1);
119
+
120
+ prefix[prefix_size] = 0;
121
+ trie_state_free(next_state);
122
+ }
123
+ }
124
+ }
125
+
126
+ /*
127
+ * call-seq:
128
+ * children(prefix) -> [ key, ... ]
129
+ *
130
+ * Finds all keys in the Trie beginning with the given prefix.
131
+ *
132
+ */
133
+ static VALUE rb_trie_children(VALUE self, VALUE prefix) {
134
+ if(NIL_P(prefix))
135
+ return rb_ary_new();
136
+
137
+ Trie *trie;
138
+ Data_Get_Struct(self, Trie, trie);
139
+
140
+ int prefix_size = RSTRING(prefix)->len;
141
+ TrieState *state = trie_root(trie);
142
+ VALUE children = rb_ary_new();
143
+ TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
144
+
145
+ const TrieChar *iterator = char_prefix;
146
+ while(*iterator != 0) {
147
+ if(!trie_state_is_walkable(state, *iterator))
148
+ return children;
149
+ trie_state_walk(state, *iterator);
150
+ iterator++;
151
+ }
152
+
153
+ if(trie_state_is_terminal(state))
154
+ rb_ary_push(children, prefix);
155
+
156
+ char prefix_buffer[1024];
157
+ memcpy(prefix_buffer, char_prefix, prefix_size);
158
+ prefix_buffer[prefix_size] = 0;
159
+
160
+ walk_all_paths(trie, children, state, prefix_buffer, prefix_size);
161
+
162
+ trie_state_free(state);
163
+ return children;
164
+ }
165
+
166
+
167
+ static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
168
+ int c;
169
+ for(c = 1; c < 256; c++) {
170
+ if(trie_state_is_walkable(state,c)) {
171
+ TrieState *next_state = trie_state_clone(state);
172
+ trie_state_walk(next_state, c);
173
+
174
+ prefix[prefix_size] = c;
175
+ prefix[prefix_size + 1] = 0;
176
+
177
+ if(trie_state_is_terminal(next_state)) {
178
+ TrieState *end_state = trie_state_clone(next_state);
179
+ trie_state_walk(end_state, '\0');
180
+
181
+ char *word = (char*) malloc(prefix_size + 2);
182
+ memcpy(word, prefix, prefix_size + 2);
183
+
184
+ VALUE tuple = rb_ary_new();
185
+ rb_ary_push(tuple, rb_str_new2(word));
186
+
187
+ TrieData trie_data = trie_state_get_data(end_state);
188
+ rb_ary_push(tuple, (VALUE)trie_data);
189
+ rb_ary_push(children, tuple);
190
+
191
+ trie_state_free(end_state);
192
+ }
193
+
194
+ walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1);
195
+
196
+ prefix[prefix_size] = 0;
197
+ trie_state_free(next_state);
198
+ }
199
+ }
200
+ }
201
+
202
+ /*
203
+ * call-seq:
204
+ * children_with_values(key) -> [ [key,value], ... ]
205
+ *
206
+ * Finds all keys with their respective values in the Trie beginning with the given prefix.
207
+ *
208
+ */
209
+ static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
210
+ if(NIL_P(prefix))
211
+ return rb_ary_new();
212
+
213
+ Trie *trie;
214
+ Data_Get_Struct(self, Trie, trie);
215
+
216
+ int prefix_size = RSTRING(prefix)->len;
217
+ TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
218
+
219
+ VALUE children = rb_ary_new();
220
+
221
+ TrieState *state = trie_root(trie);
222
+
223
+ const TrieChar *iterator = char_prefix;
224
+ while(*iterator != 0) {
225
+ if(!trie_state_is_walkable(state, *iterator))
226
+ return rb_ary_new();
227
+ trie_state_walk(state, *iterator);
228
+ iterator++;
229
+ }
230
+
231
+ if(trie_state_is_terminal(state)) {
232
+ TrieState *end_state = trie_state_clone(state);
233
+ trie_state_walk(end_state, '\0');
234
+
235
+ VALUE tuple = rb_ary_new();
236
+ rb_ary_push(tuple, prefix);
237
+ TrieData trie_data = trie_state_get_data(end_state);
238
+ rb_ary_push(tuple, (VALUE)trie_data);
239
+ rb_ary_push(children, tuple);
240
+
241
+ trie_state_free(end_state);
242
+ }
243
+
244
+ char prefix_buffer[1024];
245
+ memcpy(prefix_buffer, char_prefix, prefix_size);
246
+ prefix_buffer[prefix_size] = 0;
247
+
248
+ walk_all_paths_with_values(trie, children, state, prefix_buffer, prefix_size);
249
+
250
+ trie_state_free(state);
251
+ return children;
252
+ }
253
+
254
+ static VALUE rb_trie_node_alloc(VALUE klass);
255
+
256
+ /*
257
+ * call-seq:
258
+ * root -> TrieNode
259
+ *
260
+ * Returns a TrieNode representing the root of the Trie.
261
+ *
262
+ */
263
+ static VALUE rb_trie_root(VALUE self) {
264
+ Trie *trie;
265
+ Data_Get_Struct(self, Trie, trie);
266
+
267
+ VALUE trie_node = rb_trie_node_alloc(cTrieNode);
268
+
269
+ TrieState *state = trie_root(trie);
270
+ RDATA(trie_node)->data = state;
271
+
272
+ rb_iv_set(trie_node, "@state", Qnil);
273
+ rb_iv_set(trie_node, "@full_state", rb_str_new2(""));
274
+ return trie_node;
275
+ }
276
+
277
+
278
+ /*
279
+ * Document-class: TrieNode
280
+ *
281
+ * Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
282
+ * You can grab a TrieNode for the root of the Trie by using Trie#root.
283
+ *
284
+ */
285
+
286
+ static VALUE rb_trie_node_alloc(VALUE klass) {
287
+ VALUE obj;
288
+ obj = Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
289
+ return obj;
290
+ }
291
+
292
+ /* nodoc */
293
+ static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
294
+ RDATA(self)->data = trie_state_clone(RDATA(from)->data);
295
+
296
+ rb_iv_set(self, "@state", rb_iv_get(from, "@state"));
297
+ rb_iv_set(self, "@full_state", rb_iv_get(from, "@full_state"));
298
+
299
+ return self;
300
+ }
301
+
302
+ /*
303
+ * call-seq:
304
+ * state -> single character
305
+ *
306
+ * Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e".
307
+ *
308
+ */
309
+ static VALUE rb_trie_node_get_state(VALUE self) {
310
+ return rb_iv_get(self, "@state");
311
+ }
312
+
313
+ /*
314
+ * call-seq:
315
+ * full_state -> string
316
+ *
317
+ * Returns the full string from the root of the Trie up to this node. So if the node pointing at the "e" in "monkeys",
318
+ * the full_state is "monke".
319
+ *
320
+ */
321
+ static VALUE rb_trie_node_get_full_state(VALUE self) {
322
+ return rb_iv_get(self, "@full_state");
323
+ }
324
+
325
+ /*
326
+ * call-seq:
327
+ * walk!(letter) -> TrieNode
328
+ *
329
+ * Tries to walk down a particular branch of the Trie. It modifies the node it is called on.
330
+ *
331
+ */
332
+ static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
333
+ TrieState *state;
334
+ Data_Get_Struct(self, TrieState, state);
335
+
336
+ if(RSTRING(rchar)->len != 1)
337
+ return Qnil;
338
+
339
+ Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
340
+
341
+ if(result) {
342
+ rb_iv_set(self, "@state", rchar);
343
+ VALUE full_state = rb_iv_get(self, "@full_state");
344
+ rb_str_append(full_state, rchar);
345
+ rb_iv_set(self, "@full_state", full_state);
346
+ return self;
347
+ } else
348
+ return Qnil;
349
+ }
350
+
351
+ /*
352
+ * call-seq:
353
+ * walk(letter) -> TrieNode
354
+ *
355
+ * Tries to walk down a particular branch of the Trie. It clones the node it is called on and
356
+ * walks with that one, leaving the original unchanged.
357
+ *
358
+ */
359
+ static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
360
+ VALUE new_node = rb_funcall(self, rb_intern("dup"), 0);
361
+
362
+ TrieState *state;
363
+ Data_Get_Struct(new_node, TrieState, state);
364
+
365
+ if(RSTRING(rchar)->len != 1)
366
+ return Qnil;
367
+
368
+ Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
369
+
370
+ if(result) {
371
+ rb_iv_set(new_node, "@state", rchar);
372
+ VALUE full_state = rb_iv_get(new_node, "@full_state");
373
+ rb_str_append(full_state, rchar);
374
+ rb_iv_set(new_node, "@full_state", full_state);
375
+ return self;
376
+ } else
377
+ return Qnil;
378
+ }
379
+
380
+ /*
381
+ * call-seq:
382
+ * value
383
+ *
384
+ * Attempts to get the value at this node of the Trie. This only works if the node is a terminal
385
+ * (i.e. end of a key), otherwise it returns nil.
386
+ *
387
+ */
388
+ static VALUE rb_trie_node_value(VALUE self) {
389
+ TrieState *state;
390
+ TrieState *dup;
391
+ Data_Get_Struct(self, TrieState, state);
392
+
393
+ dup = trie_state_clone(state);
394
+
395
+ trie_state_walk(dup, 0);
396
+ TrieData trie_data = trie_state_get_data(dup);
397
+ trie_state_free(dup);
398
+
399
+ return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data;
400
+ }
401
+
402
+ /*
403
+ * call-seq:
404
+ * terminal? -> true/false
405
+ *
406
+ * Returns true if this node is at the end of a key. So if you have two keys in your Trie, "he" and
407
+ * "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?.
408
+ *
409
+ */
410
+ static VALUE rb_trie_node_terminal(VALUE self) {
411
+ TrieState *state;
412
+ Data_Get_Struct(self, TrieState, state);
413
+
414
+ return trie_state_is_terminal(state) ? Qtrue : Qnil;
415
+ }
416
+
417
+ /*
418
+ * call-seq:
419
+ * leaf? -> true/false
420
+ *
421
+ * Returns true if there are no branches at this node.
422
+ */
423
+ static VALUE rb_trie_node_leaf(VALUE self) {
424
+ TrieState *state;
425
+ Data_Get_Struct(self, TrieState, state);
426
+
427
+ return trie_state_is_leaf(state) ? Qtrue : Qnil;
428
+ }
429
+
430
+
431
+ void Init_trie() {
432
+ cTrie = rb_define_class("Trie", rb_cObject);
433
+ rb_define_alloc_func(cTrie, rb_trie_alloc);
434
+ rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
435
+ rb_define_method(cTrie, "get", rb_trie_get, 1);
436
+ rb_define_method(cTrie, "add", rb_trie_add, -2);
437
+ rb_define_method(cTrie, "delete", rb_trie_delete, 1);
438
+ rb_define_method(cTrie, "children", rb_trie_children, 1);
439
+ rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
440
+ rb_define_method(cTrie, "root", rb_trie_root, 0);
441
+
442
+ cTrieNode = rb_define_class("TrieNode", rb_cObject);
443
+ rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
444
+ rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1);
445
+ rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
446
+ rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
447
+ rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
448
+ rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
449
+ rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
450
+ rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
451
+ rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
452
+ }