fast_trie 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +132 -0
- data/VERSION.yml +4 -0
- data/ext/trie/Makefile +149 -0
- data/ext/trie/darray.c +673 -0
- data/ext/trie/darray.h +233 -0
- data/ext/trie/extconf.rb +3 -0
- data/ext/trie/fileutils.c +151 -0
- data/ext/trie/fileutils.h +36 -0
- data/ext/trie/tail.c +340 -0
- data/ext/trie/tail.h +207 -0
- data/ext/trie/trie-private.c +299 -0
- data/ext/trie/trie-private.h +31 -0
- data/ext/trie/trie.c +452 -0
- data/ext/trie/trie.h +40 -0
- data/ext/trie/triedefs.h +73 -0
- data/ext/trie/typedefs.h +113 -0
- data/lib/trie.rb +1 -0
- data/spec/trie_spec.rb +266 -0
- metadata +80 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
2
|
+
/*
|
3
|
+
* trie-private.h - Private utilities for trie implementation
|
4
|
+
* Created: 2007-08-25
|
5
|
+
* Author: Theppitak Karoonboonyanan <thep@linux.thai.net>
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef __TRIE_PRIVATE_H
|
9
|
+
#define __TRIE_PRIVATE_H
|
10
|
+
|
11
|
+
#include "typedefs.h"
|
12
|
+
|
13
|
+
/**
|
14
|
+
* @file trie-private.h
|
15
|
+
* @brief Private utilities for trie implementation
|
16
|
+
*/
|
17
|
+
|
18
|
+
/**
|
19
|
+
* @brief Minimum value macro
|
20
|
+
*/
|
21
|
+
/* Caution: an argument may be evaluated twice, so pass expressions without side effects. */
#define MIN_VAL(a,b) (((a) < (b)) ? (a) : (b))
|
22
|
+
/**
|
23
|
+
* @brief Maximum value macro
|
24
|
+
*/
|
25
|
+
/* Caution: an argument may be evaluated twice, so pass expressions without side effects. */
#define MAX_VAL(a,b) (((a) > (b)) ? (a) : (b))
|
26
|
+
|
27
|
+
#endif /* __TRIE_PRIVATE_H */
|
28
|
+
|
29
|
+
/*
|
30
|
+
vi:ts=4:ai:expandtab
|
31
|
+
*/
|
data/ext/trie/trie.c
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "trie.h"
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
VALUE cTrie, cTrieNode;
|
8
|
+
|
9
|
+
/*
|
10
|
+
* Document-class: Trie
|
11
|
+
*
|
12
|
+
 * A key-value data structure for string keys that combines efficient memory usage with fast retrieval time.
|
13
|
+
*
|
14
|
+
*/
|
15
|
+
|
16
|
+
/*
 * Allocation function for Trie objects: creates a trie with trie_new() and
 * wraps it in a Ruby data object of class klass. trie_free is installed as
 * the free hook; no mark hook is installed (the mark slot is 0).
 *
 * NOTE(review): the result of trie_new() is wrapped without a NULL check --
 * confirm how trie_new() reports allocation failure.
 */
static VALUE rb_trie_alloc(VALUE klass) {
    return Data_Wrap_Struct(klass, 0, trie_free, trie_new());
}
|
21
|
+
|
22
|
+
/*
|
23
|
+
* call-seq:
|
24
|
+
* has_key?(key) -> true/false
|
25
|
+
*
|
26
|
+
* Determines whether or not a key exists in the Trie. Use this if you don't care about the value, as it
|
27
|
+
* is marginally faster than Trie#get.
|
28
|
+
*
|
29
|
+
*/
|
30
|
+
static VALUE rb_trie_has_key(VALUE self, VALUE key) {
|
31
|
+
Trie *trie;
|
32
|
+
Data_Get_Struct(self, Trie, trie);
|
33
|
+
|
34
|
+
if(trie_has_key(trie, (TrieChar*)RSTRING(key)->ptr))
|
35
|
+
return Qtrue;
|
36
|
+
else
|
37
|
+
return Qnil;
|
38
|
+
}
|
39
|
+
|
40
|
+
/*
|
41
|
+
* call-seq:
|
42
|
+
* get(key) -> value
|
43
|
+
* [key] -> value
|
44
|
+
*
|
45
|
+
* Retrieves the value for a particular key (or nil) from the Trie.
|
46
|
+
*
|
47
|
+
*/
|
48
|
+
static VALUE rb_trie_get(VALUE self, VALUE key) {
|
49
|
+
Trie *trie;
|
50
|
+
Data_Get_Struct(self, Trie, trie);
|
51
|
+
|
52
|
+
TrieData data;
|
53
|
+
if(trie_retrieve(trie, (TrieChar*)RSTRING(key)->ptr, &data))
|
54
|
+
return (VALUE)data;
|
55
|
+
else
|
56
|
+
return Qnil;
|
57
|
+
}
|
58
|
+
|
59
|
+
/*
|
60
|
+
* call-seq:
|
61
|
+
* add(key)
|
62
|
+
* add(key,value)
|
63
|
+
*
|
64
|
+
* Add a key, or a key and value to the Trie. If you add a key without a value it assumes true for the value.
|
65
|
+
*
|
66
|
+
*/
|
67
|
+
static VALUE rb_trie_add(VALUE self, VALUE args) {
|
68
|
+
Trie *trie;
|
69
|
+
Data_Get_Struct(self, Trie, trie);
|
70
|
+
|
71
|
+
int size = RARRAY(args)->len;
|
72
|
+
if(size < 1 || size > 2)
|
73
|
+
return Qnil;
|
74
|
+
|
75
|
+
VALUE key;
|
76
|
+
key = RARRAY(args)->ptr[0];
|
77
|
+
TrieData value = size == 2 ? RARRAY(args)->ptr[1] : TRIE_DATA_ERROR;
|
78
|
+
|
79
|
+
if(trie_store(trie, (TrieChar*)RSTRING(key)->ptr, value))
|
80
|
+
return Qtrue;
|
81
|
+
else
|
82
|
+
return Qnil;
|
83
|
+
}
|
84
|
+
|
85
|
+
/*
|
86
|
+
* call-seq:
|
87
|
+
* delete(key)
|
88
|
+
*
|
89
|
+
* Delete a key from the Trie. Returns true if it deleted a key, nil otherwise.
|
90
|
+
*
|
91
|
+
*/
|
92
|
+
static VALUE rb_trie_delete(VALUE self, VALUE key) {
|
93
|
+
Trie *trie;
|
94
|
+
Data_Get_Struct(self, Trie, trie);
|
95
|
+
|
96
|
+
if(trie_delete(trie, (TrieChar*)RSTRING(key)->ptr))
|
97
|
+
return Qtrue;
|
98
|
+
else
|
99
|
+
return Qnil;
|
100
|
+
}
|
101
|
+
|
102
|
+
/*
 * Depth-first walk over every branch reachable from state. For each position
 * where trie_state_is_terminal() holds, the bytes accumulated so far are
 * pushed onto children as a new Ruby String.
 *
 * trie        - unused by the walk itself.
 * children    - Ruby Array receiving the keys that are found.
 * state       - starting position; it is only cloned, never advanced.
 * prefix      - NUL-terminated scratch buffer holding the bytes walked so far.
 *               rb_trie_children passes a 1024-byte stack buffer, and that
 *               capacity is what the bound below enforces.
 * prefix_size - number of bytes currently held in prefix (without the NUL).
 *
 * Returns children. Raises RuntimeError instead of writing past the end of
 * the buffer when a stored key is too long for it; states cloned by outer
 * recursion levels are not released on that error path.
 */
static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
    enum { PREFIX_CAPACITY = 1024 };
    int c;

    for(c = 1; c < 256; c++) {
        if(!trie_state_is_walkable(state, c))
            continue;

        /* The next byte and its terminating NUL must both fit in the buffer. */
        if(prefix_size + 2 > PREFIX_CAPACITY)
            rb_raise(rb_eRuntimeError, "trie key is longer than %d bytes", PREFIX_CAPACITY - 1);

        TrieState *next_state = trie_state_clone(state);
        trie_state_walk(next_state, c);

        prefix[prefix_size] = c;
        prefix[prefix_size + 1] = 0;

        /* rb_str_new2 copies the bytes, so no temporary heap copy is required. */
        if(trie_state_is_terminal(next_state))
            rb_ary_push(children, rb_str_new2(prefix));

        walk_all_paths(trie, children, next_state, prefix, prefix_size + 1);

        /* Restore the buffer for the next sibling branch. */
        prefix[prefix_size] = 0;
        trie_state_free(next_state);
    }

    return children;
}
|
125
|
+
|
126
|
+
/*
|
127
|
+
* call-seq:
|
128
|
+
* children(prefix) -> [ key, ... ]
|
129
|
+
*
|
130
|
+
* Finds all keys in the Trie beginning with the given prefix.
|
131
|
+
*
|
132
|
+
*/
|
133
|
+
static VALUE rb_trie_children(VALUE self, VALUE prefix) {
|
134
|
+
if(NIL_P(prefix))
|
135
|
+
return rb_ary_new();
|
136
|
+
|
137
|
+
Trie *trie;
|
138
|
+
Data_Get_Struct(self, Trie, trie);
|
139
|
+
|
140
|
+
int prefix_size = RSTRING(prefix)->len;
|
141
|
+
TrieState *state = trie_root(trie);
|
142
|
+
VALUE children = rb_ary_new();
|
143
|
+
TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
|
144
|
+
|
145
|
+
const TrieChar *iterator = char_prefix;
|
146
|
+
while(*iterator != 0) {
|
147
|
+
if(!trie_state_is_walkable(state, *iterator))
|
148
|
+
return children;
|
149
|
+
trie_state_walk(state, *iterator);
|
150
|
+
iterator++;
|
151
|
+
}
|
152
|
+
|
153
|
+
if(trie_state_is_terminal(state))
|
154
|
+
rb_ary_push(children, prefix);
|
155
|
+
|
156
|
+
char prefix_buffer[1024];
|
157
|
+
memcpy(prefix_buffer, char_prefix, prefix_size);
|
158
|
+
prefix_buffer[prefix_size] = 0;
|
159
|
+
|
160
|
+
walk_all_paths(trie, children, state, prefix_buffer, prefix_size);
|
161
|
+
|
162
|
+
trie_state_free(state);
|
163
|
+
return children;
|
164
|
+
}
|
165
|
+
|
166
|
+
|
167
|
+
/*
 * Depth-first walk over every branch reachable from state. For each position
 * where trie_state_is_terminal() holds, a two-element Ruby Array
 * [key, value] is pushed onto children; the value is read from a clone of
 * that position after walking the '\0' terminator.
 *
 * trie        - unused by the walk itself.
 * children    - Ruby Array receiving the [key, value] pairs.
 * state       - starting position; it is only cloned, never advanced.
 * prefix      - NUL-terminated scratch buffer holding the bytes walked so far.
 *               rb_trie_children_with_values passes a 1024-byte stack buffer,
 *               and that capacity is what the bound below enforces.
 * prefix_size - number of bytes currently held in prefix (without the NUL).
 *
 * Returns children. Raises RuntimeError instead of writing past the end of
 * the buffer when a stored key is too long for it; states cloned by outer
 * recursion levels are not released on that error path.
 */
static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
    enum { PREFIX_CAPACITY = 1024 };
    int c;

    for(c = 1; c < 256; c++) {
        if(!trie_state_is_walkable(state, c))
            continue;

        /* The next byte and its terminating NUL must both fit in the buffer. */
        if(prefix_size + 2 > PREFIX_CAPACITY)
            rb_raise(rb_eRuntimeError, "trie key is longer than %d bytes", PREFIX_CAPACITY - 1);

        TrieState *next_state = trie_state_clone(state);
        trie_state_walk(next_state, c);

        prefix[prefix_size] = c;
        prefix[prefix_size + 1] = 0;

        if(trie_state_is_terminal(next_state)) {
            /* Read the value first and release the helper state before any Ruby allocation. */
            TrieState *end_state = trie_state_clone(next_state);
            trie_state_walk(end_state, '\0');
            TrieData trie_data = trie_state_get_data(end_state);
            trie_state_free(end_state);

            /* rb_str_new2 copies the bytes, so no temporary heap copy is required. */
            VALUE tuple = rb_ary_new();
            rb_ary_push(tuple, rb_str_new2(prefix));
            rb_ary_push(tuple, (VALUE)trie_data);
            rb_ary_push(children, tuple);
        }

        walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1);

        /* Restore the buffer for the next sibling branch. */
        prefix[prefix_size] = 0;
        trie_state_free(next_state);
    }

    return children;
}
|
201
|
+
|
202
|
+
/*
|
203
|
+
* call-seq:
|
204
|
+
* children_with_values(key) -> [ [key,value], ... ]
|
205
|
+
*
|
206
|
+
* Finds all keys with their respective values in the Trie beginning with the given prefix.
|
207
|
+
*
|
208
|
+
*/
|
209
|
+
static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
|
210
|
+
if(NIL_P(prefix))
|
211
|
+
return rb_ary_new();
|
212
|
+
|
213
|
+
Trie *trie;
|
214
|
+
Data_Get_Struct(self, Trie, trie);
|
215
|
+
|
216
|
+
int prefix_size = RSTRING(prefix)->len;
|
217
|
+
TrieChar *char_prefix = (TrieChar*)RSTRING(prefix)->ptr;
|
218
|
+
|
219
|
+
VALUE children = rb_ary_new();
|
220
|
+
|
221
|
+
TrieState *state = trie_root(trie);
|
222
|
+
|
223
|
+
const TrieChar *iterator = char_prefix;
|
224
|
+
while(*iterator != 0) {
|
225
|
+
if(!trie_state_is_walkable(state, *iterator))
|
226
|
+
return rb_ary_new();
|
227
|
+
trie_state_walk(state, *iterator);
|
228
|
+
iterator++;
|
229
|
+
}
|
230
|
+
|
231
|
+
if(trie_state_is_terminal(state)) {
|
232
|
+
TrieState *end_state = trie_state_clone(state);
|
233
|
+
trie_state_walk(end_state, '\0');
|
234
|
+
|
235
|
+
VALUE tuple = rb_ary_new();
|
236
|
+
rb_ary_push(tuple, prefix);
|
237
|
+
TrieData trie_data = trie_state_get_data(end_state);
|
238
|
+
rb_ary_push(tuple, (VALUE)trie_data);
|
239
|
+
rb_ary_push(children, tuple);
|
240
|
+
|
241
|
+
trie_state_free(end_state);
|
242
|
+
}
|
243
|
+
|
244
|
+
char prefix_buffer[1024];
|
245
|
+
memcpy(prefix_buffer, char_prefix, prefix_size);
|
246
|
+
prefix_buffer[prefix_size] = 0;
|
247
|
+
|
248
|
+
walk_all_paths_with_values(trie, children, state, prefix_buffer, prefix_size);
|
249
|
+
|
250
|
+
trie_state_free(state);
|
251
|
+
return children;
|
252
|
+
}
|
253
|
+
|
254
|
+
static VALUE rb_trie_node_alloc(VALUE klass);
|
255
|
+
|
256
|
+
/*
|
257
|
+
* call-seq:
|
258
|
+
* root -> TrieNode
|
259
|
+
*
|
260
|
+
* Returns a TrieNode representing the root of the Trie.
|
261
|
+
*
|
262
|
+
*/
|
263
|
+
static VALUE rb_trie_root(VALUE self) {
|
264
|
+
Trie *trie;
|
265
|
+
Data_Get_Struct(self, Trie, trie);
|
266
|
+
|
267
|
+
VALUE trie_node = rb_trie_node_alloc(cTrieNode);
|
268
|
+
|
269
|
+
TrieState *state = trie_root(trie);
|
270
|
+
RDATA(trie_node)->data = state;
|
271
|
+
|
272
|
+
rb_iv_set(trie_node, "@state", Qnil);
|
273
|
+
rb_iv_set(trie_node, "@full_state", rb_str_new2(""));
|
274
|
+
return trie_node;
|
275
|
+
}
|
276
|
+
|
277
|
+
|
278
|
+
/*
|
279
|
+
* Document-class: TrieNode
|
280
|
+
*
|
281
|
+
* Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
|
282
|
+
* You can grab a TrieNode for the root of the Trie by using Trie#root.
|
283
|
+
*
|
284
|
+
*/
|
285
|
+
|
286
|
+
/*
 * Allocation function for TrieNode objects: produces a Ruby data object of
 * class klass whose data pointer starts out as NULL. trie_state_free is
 * installed as the free hook; no mark hook is installed (the mark slot is 0).
 */
static VALUE rb_trie_node_alloc(VALUE klass) {
    return Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
}
|
291
|
+
|
292
|
+
/* nodoc */
|
293
|
+
static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
|
294
|
+
RDATA(self)->data = trie_state_clone(RDATA(from)->data);
|
295
|
+
|
296
|
+
rb_iv_set(self, "@state", rb_iv_get(from, "@state"));
|
297
|
+
rb_iv_set(self, "@full_state", rb_iv_get(from, "@full_state"));
|
298
|
+
|
299
|
+
return self;
|
300
|
+
}
|
301
|
+
|
302
|
+
/*
|
303
|
+
* call-seq:
|
304
|
+
* state -> single character
|
305
|
+
*
|
306
|
+
* Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e".
|
307
|
+
*
|
308
|
+
*/
|
309
|
+
static VALUE rb_trie_node_get_state(VALUE self) {
|
310
|
+
return rb_iv_get(self, "@state");
|
311
|
+
}
|
312
|
+
|
313
|
+
/*
|
314
|
+
* call-seq:
|
315
|
+
* full_state -> string
|
316
|
+
*
|
317
|
+
* Returns the full string from the root of the Trie up to this node. So if the node pointing at the "e" in "monkeys",
|
318
|
+
* the full_state is "monke".
|
319
|
+
*
|
320
|
+
*/
|
321
|
+
static VALUE rb_trie_node_get_full_state(VALUE self) {
|
322
|
+
return rb_iv_get(self, "@full_state");
|
323
|
+
}
|
324
|
+
|
325
|
+
/*
|
326
|
+
* call-seq:
|
327
|
+
* walk!(letter) -> TrieNode
|
328
|
+
*
|
329
|
+
* Tries to walk down a particular branch of the Trie. It modifies the node it is called on.
|
330
|
+
*
|
331
|
+
*/
|
332
|
+
static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
|
333
|
+
TrieState *state;
|
334
|
+
Data_Get_Struct(self, TrieState, state);
|
335
|
+
|
336
|
+
if(RSTRING(rchar)->len != 1)
|
337
|
+
return Qnil;
|
338
|
+
|
339
|
+
Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
|
340
|
+
|
341
|
+
if(result) {
|
342
|
+
rb_iv_set(self, "@state", rchar);
|
343
|
+
VALUE full_state = rb_iv_get(self, "@full_state");
|
344
|
+
rb_str_append(full_state, rchar);
|
345
|
+
rb_iv_set(self, "@full_state", full_state);
|
346
|
+
return self;
|
347
|
+
} else
|
348
|
+
return Qnil;
|
349
|
+
}
|
350
|
+
|
351
|
+
/*
|
352
|
+
* call-seq:
|
353
|
+
* walk(letter) -> TrieNode
|
354
|
+
*
|
355
|
+
* Tries to walk down a particular branch of the Trie. It clones the node it is called on and
|
356
|
+
* walks with that one, leaving the original unchanged.
|
357
|
+
*
|
358
|
+
*/
|
359
|
+
static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
|
360
|
+
VALUE new_node = rb_funcall(self, rb_intern("dup"), 0);
|
361
|
+
|
362
|
+
TrieState *state;
|
363
|
+
Data_Get_Struct(new_node, TrieState, state);
|
364
|
+
|
365
|
+
if(RSTRING(rchar)->len != 1)
|
366
|
+
return Qnil;
|
367
|
+
|
368
|
+
Bool result = trie_state_walk(state, *RSTRING(rchar)->ptr);
|
369
|
+
|
370
|
+
if(result) {
|
371
|
+
rb_iv_set(new_node, "@state", rchar);
|
372
|
+
VALUE full_state = rb_iv_get(new_node, "@full_state");
|
373
|
+
rb_str_append(full_state, rchar);
|
374
|
+
rb_iv_set(new_node, "@full_state", full_state);
|
375
|
+
return self;
|
376
|
+
} else
|
377
|
+
return Qnil;
|
378
|
+
}
|
379
|
+
|
380
|
+
/*
|
381
|
+
* call-seq:
|
382
|
+
* value
|
383
|
+
*
|
384
|
+
* Attempts to get the value at this node of the Trie. This only works if the node is a terminal
|
385
|
+
* (i.e. end of a key), otherwise it returns nil.
|
386
|
+
*
|
387
|
+
*/
|
388
|
+
static VALUE rb_trie_node_value(VALUE self) {
|
389
|
+
TrieState *state;
|
390
|
+
TrieState *dup;
|
391
|
+
Data_Get_Struct(self, TrieState, state);
|
392
|
+
|
393
|
+
dup = trie_state_clone(state);
|
394
|
+
|
395
|
+
trie_state_walk(dup, 0);
|
396
|
+
TrieData trie_data = trie_state_get_data(dup);
|
397
|
+
trie_state_free(dup);
|
398
|
+
|
399
|
+
return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data;
|
400
|
+
}
|
401
|
+
|
402
|
+
/*
|
403
|
+
* call-seq:
|
404
|
+
* terminal? -> true/false
|
405
|
+
*
|
406
|
+
* Returns true if this node is at the end of a key. So if you have two keys in your Trie, "he" and
|
407
|
+
* "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?.
|
408
|
+
*
|
409
|
+
*/
|
410
|
+
static VALUE rb_trie_node_terminal(VALUE self) {
|
411
|
+
TrieState *state;
|
412
|
+
Data_Get_Struct(self, TrieState, state);
|
413
|
+
|
414
|
+
return trie_state_is_terminal(state) ? Qtrue : Qnil;
|
415
|
+
}
|
416
|
+
|
417
|
+
/*
|
418
|
+
* call-seq:
|
419
|
+
* leaf? -> true/false
|
420
|
+
*
|
421
|
+
* Returns true if there are no branches at this node.
|
422
|
+
*/
|
423
|
+
static VALUE rb_trie_node_leaf(VALUE self) {
|
424
|
+
TrieState *state;
|
425
|
+
Data_Get_Struct(self, TrieState, state);
|
426
|
+
|
427
|
+
return trie_state_is_leaf(state) ? Qtrue : Qnil;
|
428
|
+
}
|
429
|
+
|
430
|
+
|
431
|
+
void Init_trie() {
|
432
|
+
cTrie = rb_define_class("Trie", rb_cObject);
|
433
|
+
rb_define_alloc_func(cTrie, rb_trie_alloc);
|
434
|
+
rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
|
435
|
+
rb_define_method(cTrie, "get", rb_trie_get, 1);
|
436
|
+
rb_define_method(cTrie, "add", rb_trie_add, -2);
|
437
|
+
rb_define_method(cTrie, "delete", rb_trie_delete, 1);
|
438
|
+
rb_define_method(cTrie, "children", rb_trie_children, 1);
|
439
|
+
rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
|
440
|
+
rb_define_method(cTrie, "root", rb_trie_root, 0);
|
441
|
+
|
442
|
+
cTrieNode = rb_define_class("TrieNode", rb_cObject);
|
443
|
+
rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
|
444
|
+
rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1);
|
445
|
+
rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
|
446
|
+
rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
|
447
|
+
rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
|
448
|
+
rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
|
449
|
+
rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
|
450
|
+
rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
|
451
|
+
rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
|
452
|
+
}
|