fast_trie 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
+ /*
3
+ * trie-private.h - Private utilities for trie implementation
4
+ * Created: 2007-08-25
5
+ * Author: Theppitak Karoonboonyanan <thep@linux.thai.net>
6
+ */
7
+
8
+ #ifndef __TRIE_PRIVATE_H
9
+ #define __TRIE_PRIVATE_H
10
+
11
+ #include "typedefs.h"
12
+
13
+ /**
14
+ * @file trie-private.h
15
+ * @brief Private utilities for trie implementation
16
+ */
17
+
18
+ /**
19
+ * @brief Minimum value macro
+ *
+ * Expands to @a a when (a) < (b) holds and to @a b otherwise, so @a b is
+ * the result when the two compare equal. Each argument appears twice in the
+ * expansion; an argument with side effects (e.g. i++) may therefore be
+ * evaluated more than once.
20
+ */
21
+ #define MIN_VAL(a,b) ((a)<(b)?(a):(b))
22
+ /**
23
+ * @brief Maximum value macro
+ *
+ * Expands to @a a when (a) > (b) holds and to @a b otherwise, so @a b is
+ * the result when the two compare equal. Each argument appears twice in the
+ * expansion; an argument with side effects (e.g. i++) may therefore be
+ * evaluated more than once.
24
+ */
25
+ #define MAX_VAL(a,b) ((a)>(b)?(a):(b))
26
+
27
+ #endif /* __TRIE_PRIVATE_H */
28
+
29
+ /*
30
+ vi:ts=4:ai:expandtab
31
+ */
data/ext/trie/trie.c ADDED
@@ -0,0 +1,452 @@
1
+ #include "ruby.h"
2
+ #include "trie.h"
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+ #include <string.h>
6
+
7
+ VALUE cTrie, cTrieNode; /* Ruby class objects for Trie and TrieNode; set up when Init_trie() runs */
8
+
9
+ /*
10
+ * Document-class: Trie
11
+ *
12
+ * A key-value data structure for string keys that is efficient in memory usage and fast at retrieval.
13
+ *
14
+ */
15
+
16
+ static VALUE rb_trie_alloc(VALUE klass) {
17
+ VALUE obj;
18
+ obj = Data_Wrap_Struct(klass, 0, trie_free, trie_new());
19
+ return obj;
20
+ }
21
+
22
+ /*
23
+ * call-seq:
24
+ * has_key?(key) -> true/false
25
+ *
26
+ * Determines whether or not a key exists in the Trie. Use this if you don't care about the value, as it
27
+ * is marginally faster than Trie#get.
28
+ *
29
+ */
30
+ static VALUE rb_trie_has_key(VALUE self, VALUE key) {
31
+ Trie *trie;
32
+ Data_Get_Struct(self, Trie, trie);
33
+
34
+ if(trie_has_key(trie, (TrieChar*)RSTRING(key)->ptr))
35
+ return Qtrue;
36
+ else
37
+ return Qnil;
38
+ }
39
+
40
+ /*
41
+ * call-seq:
42
+ * get(key) -> value
43
+ * [key] -> value
44
+ *
45
+ * Retrieves the value for a particular key (or nil) from the Trie.
46
+ *
47
+ */
48
+ static VALUE rb_trie_get(VALUE self, VALUE key) {
49
+ Trie *trie;
50
+ Data_Get_Struct(self, Trie, trie);
51
+
52
+ TrieData data;
53
+ if(trie_retrieve(trie, (TrieChar*)RSTRING(key)->ptr, &data))
54
+ return (VALUE)data;
55
+ else
56
+ return Qnil;
57
+ }
58
+
59
+ /*
60
+ * call-seq:
61
+ * add(key)
62
+ * add(key,value)
63
+ *
64
+ * Add a key, or a key and value to the Trie. If you add a key without a value it assumes true for the value.
65
+ *
66
+ */
67
+ static VALUE rb_trie_add(VALUE self, VALUE args) {
68
+ Trie *trie;
69
+ Data_Get_Struct(self, Trie, trie);
70
+
71
+ int size = RARRAY(args)->len;
72
+ if(size < 1 || size > 2)
73
+ return Qnil;
74
+
75
+ VALUE key;
76
+ key = RARRAY(args)->ptr[0];
77
+ TrieData value = size == 2 ? RARRAY(args)->ptr[1] : TRIE_DATA_ERROR;
78
+
79
+ if(trie_store(trie, (TrieChar*)RSTRING(key)->ptr, value))
80
+ return Qtrue;
81
+ else
82
+ return Qnil;
83
+ }
84
+
85
+ /*
86
+ * call-seq:
87
+ * delete(key)
88
+ *
89
+ * Delete a key from the Trie. Returns true if it deleted a key, nil otherwise.
90
+ *
91
+ */
92
+ static VALUE rb_trie_delete(VALUE self, VALUE key) {
93
+ Trie *trie;
94
+ Data_Get_Struct(self, Trie, trie);
95
+
96
+ if(trie_delete(trie, (TrieChar*)RSTRING(key)->ptr))
97
+ return Qtrue;
98
+ else
99
+ return Qnil;
100
+ }
101
+
102
+ static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
103
+ int c;
104
+ for(c = 1; c < 256; c++) {
105
+ if(trie_state_is_walkable(state,c)) {
106
+ TrieState *next_state = trie_state_clone(state);
107
+ trie_state_walk(next_state, c);
108
+
109
+ prefix[prefix_size] = c;
110
+ prefix[prefix_size + 1] = 0;
111
+
112
+ if(trie_state_is_terminal(next_state)) {
113
+ char *word = (char*) malloc(prefix_size + 2);
114
+ memcpy(word, prefix, prefix_size + 2);
115
+ rb_ary_push(children, rb_str_new2(word));
116
+ }
117
+
118
+ walk_all_paths(trie, children, next_state, prefix, prefix_size + 1);
119
+
120
+ prefix[prefix_size] = 0;
121
+ trie_state_free(next_state);
122
+ }
123
+ }
124
+ }
125
+
126
+ /*
127
+ * call-seq:
128
+ * children(prefix) -> [ key, ... ]
129
+ *
130
+ * Finds all keys in the Trie beginning with the given prefix.
131
+ *
132
+ */
133
+ static VALUE rb_trie_children(VALUE self, VALUE prefix) {
134
+ if(NIL_P(prefix))
135
+ return rb_ary_new();
136
+
137
+ Trie *trie;
138
+ Data_Get_Struct(self, Trie, trie);
139
+
140
+ int prefix_size = RSTRING(prefix)->len;
141
+ TrieState *state = trie_root(trie);
142
+ VALUE children = rb_ary_new();
143
+ TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
144
+
145
+ const TrieChar *iterator = char_prefix;
146
+ while(*iterator != 0) {
147
+ if(!trie_state_is_walkable(state, *iterator))
148
+ return children;
149
+ trie_state_walk(state, *iterator);
150
+ iterator++;
151
+ }
152
+
153
+ if(trie_state_is_terminal(state))
154
+ rb_ary_push(children, prefix);
155
+
156
+ char prefix_buffer[1024];
157
+ memcpy(prefix_buffer, char_prefix, prefix_size);
158
+ prefix_buffer[prefix_size] = 0;
159
+
160
+ walk_all_paths(trie, children, state, prefix_buffer, prefix_size);
161
+
162
+ trie_state_free(state);
163
+ return children;
164
+ }
165
+
166
+
167
+ static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
168
+ int c;
169
+ for(c = 1; c < 256; c++) {
170
+ if(trie_state_is_walkable(state,c)) {
171
+ TrieState *next_state = trie_state_clone(state);
172
+ trie_state_walk(next_state, c);
173
+
174
+ prefix[prefix_size] = c;
175
+ prefix[prefix_size + 1] = 0;
176
+
177
+ if(trie_state_is_terminal(next_state)) {
178
+ TrieState *end_state = trie_state_clone(next_state);
179
+ trie_state_walk(end_state, '\0');
180
+
181
+ char *word = (char*) malloc(prefix_size + 2);
182
+ memcpy(word, prefix, prefix_size + 2);
183
+
184
+ VALUE tuple = rb_ary_new();
185
+ rb_ary_push(tuple, rb_str_new2(word));
186
+
187
+ TrieData trie_data = trie_state_get_data(end_state);
188
+ rb_ary_push(tuple, (VALUE)trie_data);
189
+ rb_ary_push(children, tuple);
190
+
191
+ trie_state_free(end_state);
192
+ }
193
+
194
+ walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1);
195
+
196
+ prefix[prefix_size] = 0;
197
+ trie_state_free(next_state);
198
+ }
199
+ }
200
+ }
201
+
202
+ /*
203
+ * call-seq:
204
+ * children_with_values(key) -> [ [key,value], ... ]
205
+ *
206
+ * Finds all keys with their respective values in the Trie beginning with the given prefix.
207
+ *
208
+ */
209
+ static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
210
+ if(NIL_P(prefix))
211
+ return rb_ary_new();
212
+
213
+ Trie *trie;
214
+ Data_Get_Struct(self, Trie, trie);
215
+
216
+ int prefix_size = RSTRING(prefix)->len;
217
+ TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
218
+
219
+ VALUE children = rb_ary_new();
220
+
221
+ TrieState *state = trie_root(trie);
222
+
223
+ const TrieChar *iterator = char_prefix;
224
+ while(*iterator != 0) {
225
+ if(!trie_state_is_walkable(state, *iterator))
226
+ return rb_ary_new();
227
+ trie_state_walk(state, *iterator);
228
+ iterator++;
229
+ }
230
+
231
+ if(trie_state_is_terminal(state)) {
232
+ TrieState *end_state = trie_state_clone(state);
233
+ trie_state_walk(end_state, '\0');
234
+
235
+ VALUE tuple = rb_ary_new();
236
+ rb_ary_push(tuple, prefix);
237
+ TrieData trie_data = trie_state_get_data(end_state);
238
+ rb_ary_push(tuple, (VALUE)trie_data);
239
+ rb_ary_push(children, tuple);
240
+
241
+ trie_state_free(end_state);
242
+ }
243
+
244
+ char prefix_buffer[1024];
245
+ memcpy(prefix_buffer, char_prefix, prefix_size);
246
+ prefix_buffer[prefix_size] = 0;
247
+
248
+ walk_all_paths_with_values(trie, children, state, prefix_buffer, prefix_size);
249
+
250
+ trie_state_free(state);
251
+ return children;
252
+ }
253
+
254
+ static VALUE rb_trie_node_alloc(VALUE klass);
255
+
256
+ /*
257
+ * call-seq:
258
+ * root -> TrieNode
259
+ *
260
+ * Returns a TrieNode representing the root of the Trie.
261
+ *
262
+ */
263
+ static VALUE rb_trie_root(VALUE self) {
264
+ Trie *trie;
265
+ Data_Get_Struct(self, Trie, trie);
266
+
267
+ VALUE trie_node = rb_trie_node_alloc(cTrieNode);
268
+
269
+ TrieState *state = trie_root(trie);
270
+ RDATA(trie_node)->data = state;
271
+
272
+ rb_iv_set(trie_node, "@state", Qnil);
273
+ rb_iv_set(trie_node, "@full_state", rb_str_new2(""));
274
+ return trie_node;
275
+ }
276
+
277
+
278
+ /*
279
+ * Document-class: TrieNode
280
+ *
281
+ * Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
282
+ * You can grab a TrieNode for the root of the Trie by using Trie#root.
283
+ *
284
+ */
285
+
286
+ static VALUE rb_trie_node_alloc(VALUE klass) {
287
+ VALUE obj;
288
+ obj = Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
289
+ return obj;
290
+ }
291
+
292
+ /* nodoc */
293
+ static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
294
+ RDATA(self)->data = trie_state_clone(RDATA(from)->data);
295
+
296
+ rb_iv_set(self, "@state", rb_iv_get(from, "@state"));
297
+ rb_iv_set(self, "@full_state", rb_iv_get(from, "@full_state"));
298
+
299
+ return self;
300
+ }
301
+
302
+ /*
303
+ * call-seq:
304
+ * state -> single character
305
+ *
306
+ * Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e".
307
+ *
308
+ */
309
+ static VALUE rb_trie_node_get_state(VALUE self) {
310
+ return rb_iv_get(self, "@state");
311
+ }
312
+
313
+ /*
314
+ * call-seq:
315
+ * full_state -> string
316
+ *
317
+ * Returns the full string from the root of the Trie up to this node. So if the node pointing at the "e" in "monkeys",
318
+ * the full_state is "monke".
319
+ *
320
+ */
321
+ static VALUE rb_trie_node_get_full_state(VALUE self) {
322
+ return rb_iv_get(self, "@full_state");
323
+ }
324
+
325
+ /*
326
+ * call-seq:
327
+ * walk!(letter) -> TrieNode
328
+ *
329
+ * Tries to walk down a particular branch of the Trie. It modifies the node it is called on.
330
+ *
331
+ */
332
+ static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
333
+ TrieState *state;
334
+ Data_Get_Struct(self, TrieState, state);
335
+
336
+ if(RSTRING(rchar)->len != 1)
337
+ return Qnil;
338
+
339
+ Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
340
+
341
+ if(result) {
342
+ rb_iv_set(self, "@state", rchar);
343
+ VALUE full_state = rb_iv_get(self, "@full_state");
344
+ rb_str_append(full_state, rchar);
345
+ rb_iv_set(self, "@full_state", full_state);
346
+ return self;
347
+ } else
348
+ return Qnil;
349
+ }
350
+
351
+ /*
352
+ * call-seq:
353
+ * walk(letter) -> TrieNode
354
+ *
355
+ * Tries to walk down a particular branch of the Trie. It clones the node it is called on and
356
+ * walks with that one, leaving the original unchanged.
357
+ *
358
+ */
359
+ static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
360
+ VALUE new_node = rb_funcall(self, rb_intern("dup"), 0);
361
+
362
+ TrieState *state;
363
+ Data_Get_Struct(new_node, TrieState, state);
364
+
365
+ if(RSTRING(rchar)->len != 1)
366
+ return Qnil;
367
+
368
+ Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
369
+
370
+ if(result) {
371
+ rb_iv_set(new_node, "@state", rchar);
372
+ VALUE full_state = rb_iv_get(new_node, "@full_state");
373
+ rb_str_append(full_state, rchar);
374
+ rb_iv_set(new_node, "@full_state", full_state);
375
+ return self;
376
+ } else
377
+ return Qnil;
378
+ }
379
+
380
+ /*
381
+ * call-seq:
382
+ * value
383
+ *
384
+ * Attempts to get the value at this node of the Trie. This only works if the node is a terminal
385
+ * (i.e. end of a key), otherwise it returns nil.
386
+ *
387
+ */
388
+ static VALUE rb_trie_node_value(VALUE self) {
389
+ TrieState *state;
390
+ TrieState *dup;
391
+ Data_Get_Struct(self, TrieState, state);
392
+
393
+ dup = trie_state_clone(state);
394
+
395
+ trie_state_walk(dup, 0);
396
+ TrieData trie_data = trie_state_get_data(dup);
397
+ trie_state_free(dup);
398
+
399
+ return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data;
400
+ }
401
+
402
+ /*
403
+ * call-seq:
404
+ * terminal? -> true/false
405
+ *
406
+ * Returns true if this node is at the end of a key. So if you have two keys in your Trie, "he" and
407
+ * "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?.
408
+ *
409
+ */
410
+ static VALUE rb_trie_node_terminal(VALUE self) {
411
+ TrieState *state;
412
+ Data_Get_Struct(self, TrieState, state);
413
+
414
+ return trie_state_is_terminal(state) ? Qtrue : Qnil;
415
+ }
416
+
417
+ /*
418
+ * call-seq:
419
+ * leaf? -> true/false
420
+ *
421
+ * Returns true if there are no branches at this node.
422
+ */
423
+ static VALUE rb_trie_node_leaf(VALUE self) {
424
+ TrieState *state;
425
+ Data_Get_Struct(self, TrieState, state);
426
+
427
+ return trie_state_is_leaf(state) ? Qtrue : Qnil;
428
+ }
429
+
430
+
431
+ void Init_trie() {
432
+ cTrie = rb_define_class("Trie", rb_cObject);
433
+ rb_define_alloc_func(cTrie, rb_trie_alloc);
434
+ rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
435
+ rb_define_method(cTrie, "get", rb_trie_get, 1);
436
+ rb_define_method(cTrie, "add", rb_trie_add, -2);
437
+ rb_define_method(cTrie, "delete", rb_trie_delete, 1);
438
+ rb_define_method(cTrie, "children", rb_trie_children, 1);
439
+ rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
440
+ rb_define_method(cTrie, "root", rb_trie_root, 0);
441
+
442
+ cTrieNode = rb_define_class("TrieNode", rb_cObject);
443
+ rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
444
+ rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1);
445
+ rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
446
+ rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
447
+ rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
448
+ rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
449
+ rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
450
+ rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
451
+ rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
452
+ }