fast_trie 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +132 -0
- data/VERSION.yml +4 -0
- data/ext/trie/Makefile +149 -0
- data/ext/trie/darray.c +673 -0
- data/ext/trie/darray.h +233 -0
- data/ext/trie/extconf.rb +3 -0
- data/ext/trie/fileutils.c +151 -0
- data/ext/trie/fileutils.h +36 -0
- data/ext/trie/tail.c +340 -0
- data/ext/trie/tail.h +207 -0
- data/ext/trie/trie-private.c +299 -0
- data/ext/trie/trie-private.h +31 -0
- data/ext/trie/trie.c +452 -0
- data/ext/trie/trie.h +40 -0
- data/ext/trie/triedefs.h +73 -0
- data/ext/trie/typedefs.h +113 -0
- data/lib/trie.rb +1 -0
- data/spec/trie_spec.rb +266 -0
- metadata +80 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
2
|
+
/*
|
3
|
+
* trie-private.h - Private utilities for trie implementation
|
4
|
+
* Created: 2007-08-25
|
5
|
+
* Author: Theppitak Karoonboonyanan <thep@linux.thai.net>
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef __TRIE_PRIVATE_H
|
9
|
+
#define __TRIE_PRIVATE_H
|
10
|
+
|
11
|
+
#include "typedefs.h"
|
12
|
+
|
13
|
+
/**
|
14
|
+
* @file trie-private.h
|
15
|
+
* @brief Private utilities for trie implementation
|
16
|
+
*/
|
17
|
+
|
18
|
+
/**
|
19
|
+
* @brief Minimum value macro
|
20
|
+
*/
|
21
|
+
/* NOTE: function-like macro. The argument that is selected is evaluated a
 * second time, so do not pass expressions with side effects
 * (e.g. MIN_VAL(i++, j)). */
#define MIN_VAL(a,b) ((a)<(b)?(a):(b))
|
22
|
+
/**
|
23
|
+
* @brief Maximum value macro
|
24
|
+
*/
|
25
|
+
/* NOTE: function-like macro. The argument that is selected is evaluated a
 * second time, so do not pass expressions with side effects
 * (e.g. MAX_VAL(i++, j)). */
#define MAX_VAL(a,b) ((a)>(b)?(a):(b))
|
26
|
+
|
27
|
+
#endif /* __TRIE_PRIVATE_H */
|
28
|
+
|
29
|
+
/*
|
30
|
+
vi:ts=4:ai:expandtab
|
31
|
+
*/
|
data/ext/trie/trie.c
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "trie.h"
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
VALUE cTrie, cTrieNode;
|
8
|
+
|
9
|
+
/*
|
10
|
+
* Document-class: Trie
|
11
|
+
*
|
12
|
+
* A key-value data structure for string keys that offers efficient memory usage and fast retrieval.
|
13
|
+
*
|
14
|
+
*/
|
15
|
+
|
16
|
+
static VALUE rb_trie_alloc(VALUE klass) {
    /*
     * Allocator for Trie: wraps the result of trie_new() in a Ruby data
     * object of class `klass`. trie_free is registered as the free function
     * and no mark function is supplied (second argument is 0).
     *
     * NOTE(review): the result of trie_new() is not checked here -- confirm
     * whether it can fail and return NULL.
     */
    return Data_Wrap_Struct(klass, 0, trie_free, trie_new());
}
|
21
|
+
|
22
|
+
/*
|
23
|
+
* call-seq:
|
24
|
+
* has_key?(key) -> true/nil
|
25
|
+
*
|
26
|
+
* Determines whether or not a key exists in the Trie. Use this if you don't care about the value, as it
|
27
|
+
* is marginally faster than Trie#get.
|
28
|
+
*
|
29
|
+
*/
|
30
|
+
static VALUE rb_trie_has_key(VALUE self, VALUE key) {
    /*
     * Looks `key` up in the wrapped Trie.
     *
     * Returns Qtrue when trie_has_key() reports the key, Qnil otherwise.
     * Raises TypeError (via StringValue) when `key` is not a String and
     * cannot be converted to one.
     */
    Trie *trie;

    /* Validate the argument before touching RSTRING(key): reading ->ptr
     * from an arbitrary VALUE (nil, a Fixnum, ...) dereferences garbage. */
    StringValue(key);
    Data_Get_Struct(self, Trie, trie);

    return trie_has_key(trie, (TrieChar*)RSTRING(key)->ptr) ? Qtrue : Qnil;
}
|
39
|
+
|
40
|
+
/*
|
41
|
+
* call-seq:
|
42
|
+
* get(key) -> value
|
43
|
+
* [key] -> value
|
44
|
+
*
|
45
|
+
* Retrieves the value for a particular key (or nil) from the Trie.
|
46
|
+
*
|
47
|
+
*/
|
48
|
+
static VALUE rb_trie_get(VALUE self, VALUE key) {
    /*
     * Retrieves the data stored under `key`.
     *
     * Returns the stored TrieData cast back to VALUE when trie_retrieve()
     * succeeds, Qnil otherwise. Raises TypeError (via StringValue) when
     * `key` is not a String and cannot be converted to one.
     */
    Trie *trie;
    TrieData data;

    /* Validate the argument before touching RSTRING(key): reading ->ptr
     * from an arbitrary VALUE dereferences garbage. */
    StringValue(key);
    Data_Get_Struct(self, Trie, trie);

    if(!trie_retrieve(trie, (TrieChar*)RSTRING(key)->ptr, &data))
        return Qnil;

    return (VALUE)data;
}
|
58
|
+
|
59
|
+
/*
|
60
|
+
* call-seq:
|
61
|
+
* add(key)
|
62
|
+
* add(key,value)
|
63
|
+
*
|
64
|
+
* Adds a key, or a key and a value, to the Trie. If you add a key without a value, TRIE_DATA_ERROR is stored as its value.
|
65
|
+
*
|
66
|
+
*/
|
67
|
+
static VALUE rb_trie_add(VALUE self, VALUE args) {
    /*
     * Stores a key (and optionally a value) in the Trie.
     *
     * `args` is the Ruby Array of call arguments (the method is registered
     * with arity -2). With one element only the key is given and
     * TRIE_DATA_ERROR is stored as its data; with two elements the second
     * one is stored as the data.
     *
     * Returns Qtrue when trie_store() succeeds, Qnil when it fails or when
     * the argument count is not 1 or 2. Raises TypeError (via StringValue)
     * when the key is not a String and cannot be converted to one.
     *
     * NOTE(review): the Trie wrapper is created without a mark function, so
     * a VALUE stored here does not appear to be protected from garbage
     * collection by the Trie itself -- confirm.
     */
    Trie *trie;
    VALUE key;
    TrieData value;
    int size = RARRAY(args)->len;

    if(size < 1 || size > 2)
        return Qnil;

    key = RARRAY(args)->ptr[0];
    /* Validate the key before touching RSTRING(key): reading ->ptr from an
     * arbitrary VALUE dereferences garbage. */
    StringValue(key);

    value = size == 2 ? RARRAY(args)->ptr[1] : TRIE_DATA_ERROR;

    Data_Get_Struct(self, Trie, trie);

    return trie_store(trie, (TrieChar*)RSTRING(key)->ptr, value) ? Qtrue : Qnil;
}
|
84
|
+
|
85
|
+
/*
|
86
|
+
* call-seq:
|
87
|
+
* delete(key)
|
88
|
+
*
|
89
|
+
* Delete a key from the Trie. Returns true if it deleted a key, nil otherwise.
|
90
|
+
*
|
91
|
+
*/
|
92
|
+
static VALUE rb_trie_delete(VALUE self, VALUE key) {
    /*
     * Removes `key` from the Trie.
     *
     * Returns Qtrue when trie_delete() reports success, Qnil otherwise.
     * Raises TypeError (via StringValue) when `key` is not a String and
     * cannot be converted to one.
     */
    Trie *trie;

    /* Validate the argument before touching RSTRING(key): reading ->ptr
     * from an arbitrary VALUE dereferences garbage. */
    StringValue(key);
    Data_Get_Struct(self, Trie, trie);

    return trie_delete(trie, (TrieChar*)RSTRING(key)->ptr) ? Qtrue : Qnil;
}
|
101
|
+
|
102
|
+
/*
 * Depth-first helper for rb_trie_children.
 *
 * Tries every byte value 1..255 from `state`. For each walkable byte it
 * clones the state, walks the clone, appends the byte to `key_buffer` and,
 * if the clone is terminal, pushes a copy of the buffer onto `children`.
 * It then recurses from the clone and finally restores the buffer to the
 * length it had on entry.
 *
 * `key_buffer` is a Ruby String used as a growable scratch buffer holding
 * the key built so far, so keys of any length are handled without a
 * fixed-size C array. `trie` is accepted but not used.
 *
 * Returns `children`.
 */
static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, VALUE key_buffer) {
    const long depth = RSTRING(key_buffer)->len;
    int c;

    for(c = 1; c < 256; c++) {
        TrieState *next_state;
        char letter;

        if(!trie_state_is_walkable(state, c))
            continue;

        next_state = trie_state_clone(state);
        trie_state_walk(next_state, c);

        letter = (char)c;
        rb_str_cat(key_buffer, &letter, 1);

        /* rb_str_new copies the bytes, so no temporary C allocation is
         * needed for the key that is handed to Ruby. */
        if(trie_state_is_terminal(next_state))
            rb_ary_push(children, rb_str_new(RSTRING(key_buffer)->ptr, RSTRING(key_buffer)->len));

        walk_all_paths(trie, children, next_state, key_buffer);

        /* Drop the byte appended above before trying the next one. */
        rb_str_resize(key_buffer, depth);
        trie_state_free(next_state);
    }

    return children;
}

/*
 * call-seq:
 *   children(prefix) -> [ key, ... ]
 *
 * Finds all keys in the Trie beginning with the given prefix.
 *
 */
static VALUE rb_trie_children(VALUE self, VALUE prefix) {
    /*
     * Returns an empty Array when `prefix` is nil or when the prefix cannot
     * be walked from the root. Otherwise returns the prefix itself (if it
     * is a key) followed by every longer key found below it.
     * Raises TypeError (via StringValue) when `prefix` is neither nil nor
     * convertible to a String.
     */
    Trie *trie;
    TrieState *state;
    VALUE children;
    VALUE key_buffer;
    const TrieChar *start;
    const TrieChar *iterator;

    if(NIL_P(prefix))
        return rb_ary_new();

    /* Validate the argument before touching RSTRING(prefix). */
    StringValue(prefix);
    Data_Get_Struct(self, Trie, trie);

    children = rb_ary_new();
    state = trie_root(trie);

    start = (const TrieChar*)RSTRING(prefix)->ptr;
    for(iterator = start; *iterator != 0; iterator++) {
        if(!trie_state_is_walkable(state, *iterator)) {
            /* Release the state on this path too; it is owned here. */
            trie_state_free(state);
            return children;
        }
        trie_state_walk(state, *iterator);
    }

    if(trie_state_is_terminal(state))
        rb_ary_push(children, prefix);

    /* Seed the scratch buffer with exactly the bytes that were walked. */
    key_buffer = rb_str_new((const char*)start, (long)(iterator - start));

    walk_all_paths(trie, children, state, key_buffer);

    trie_state_free(state);
    return children;
}
|
165
|
+
|
166
|
+
|
167
|
+
/*
 * Depth-first helper for rb_trie_children_with_values.
 *
 * Tries every byte value 1..255 from `state`. For each walkable byte it
 * clones the state, walks the clone and appends the byte to `key_buffer`.
 * If the clone is terminal, it reads the data by walking a second clone
 * onto '\0' and pushes a [key, value] Array onto `children`. It then
 * recurses from the clone and finally restores the buffer to the length it
 * had on entry.
 *
 * `key_buffer` is a Ruby String used as a growable scratch buffer holding
 * the key built so far, so keys of any length are handled without a
 * fixed-size C array. `trie` is accepted but not used.
 *
 * Returns `children`.
 */
static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, VALUE key_buffer) {
    const long depth = RSTRING(key_buffer)->len;
    int c;

    for(c = 1; c < 256; c++) {
        TrieState *next_state;
        char letter;

        if(!trie_state_is_walkable(state, c))
            continue;

        next_state = trie_state_clone(state);
        trie_state_walk(next_state, c);

        letter = (char)c;
        rb_str_cat(key_buffer, &letter, 1);

        if(trie_state_is_terminal(next_state)) {
            TrieState *end_state = trie_state_clone(next_state);
            TrieData trie_data;
            VALUE tuple;

            trie_state_walk(end_state, '\0');
            trie_data = trie_state_get_data(end_state);
            /* Release the temporary state before allocating Ruby objects. */
            trie_state_free(end_state);

            tuple = rb_ary_new();
            /* rb_str_new copies the bytes, so no temporary C allocation is
             * needed for the key that is handed to Ruby. */
            rb_ary_push(tuple, rb_str_new(RSTRING(key_buffer)->ptr, RSTRING(key_buffer)->len));
            rb_ary_push(tuple, (VALUE)trie_data);
            rb_ary_push(children, tuple);
        }

        walk_all_paths_with_values(trie, children, next_state, key_buffer);

        /* Drop the byte appended above before trying the next one. */
        rb_str_resize(key_buffer, depth);
        trie_state_free(next_state);
    }

    return children;
}

/*
 * call-seq:
 *   children_with_values(key) -> [ [key,value], ... ]
 *
 * Finds all keys with their respective values in the Trie beginning with the given prefix.
 *
 */
static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
    /*
     * Returns an empty Array when `prefix` is nil or when the prefix cannot
     * be walked from the root. Otherwise returns [prefix, value] (if the
     * prefix is a key) followed by a [key, value] pair for every longer key
     * found below it.
     * Raises TypeError (via StringValue) when `prefix` is neither nil nor
     * convertible to a String.
     */
    Trie *trie;
    TrieState *state;
    VALUE children;
    VALUE key_buffer;
    const TrieChar *start;
    const TrieChar *iterator;

    if(NIL_P(prefix))
        return rb_ary_new();

    /* Validate the argument before touching RSTRING(prefix). */
    StringValue(prefix);
    Data_Get_Struct(self, Trie, trie);

    children = rb_ary_new();
    state = trie_root(trie);

    start = (const TrieChar*)RSTRING(prefix)->ptr;
    for(iterator = start; *iterator != 0; iterator++) {
        if(!trie_state_is_walkable(state, *iterator)) {
            /* Release the state on this path too; `children` is still
             * empty here, so it serves as the empty result. */
            trie_state_free(state);
            return children;
        }
        trie_state_walk(state, *iterator);
    }

    if(trie_state_is_terminal(state)) {
        TrieState *end_state = trie_state_clone(state);
        TrieData trie_data;
        VALUE tuple;

        trie_state_walk(end_state, '\0');
        trie_data = trie_state_get_data(end_state);
        trie_state_free(end_state);

        tuple = rb_ary_new();
        rb_ary_push(tuple, prefix);
        rb_ary_push(tuple, (VALUE)trie_data);
        rb_ary_push(children, tuple);
    }

    /* Seed the scratch buffer with exactly the bytes that were walked. */
    key_buffer = rb_str_new((const char*)start, (long)(iterator - start));

    walk_all_paths_with_values(trie, children, state, key_buffer);

    trie_state_free(state);
    return children;
}
|
253
|
+
|
254
|
+
static VALUE rb_trie_node_alloc(VALUE klass);
|
255
|
+
|
256
|
+
/*
|
257
|
+
* call-seq:
|
258
|
+
* root -> TrieNode
|
259
|
+
*
|
260
|
+
* Returns a TrieNode representing the root of the Trie.
|
261
|
+
*
|
262
|
+
*/
|
263
|
+
static VALUE rb_trie_root(VALUE self) {
    /*
     * Builds a TrieNode positioned at the root of this Trie.
     *
     * The node wraps the TrieState returned by trie_root(); its @state is
     * nil and its @full_state is a new empty String.
     */
    Trie *trie;
    VALUE trie_node;

    Data_Get_Struct(self, Trie, trie);

    trie_node = rb_trie_node_alloc(cTrieNode);
    RDATA(trie_node)->data = trie_root(trie);

    /* Keep the owning Trie reachable for as long as the node is alive.
     * Presumably the TrieState refers back to storage owned by the Trie
     * (the state-walking calls take no Trie argument), so the Trie must
     * not be collected first -- confirm against the TrieState definition.
     * Object#dup is expected to carry this instance variable over to
     * duplicated nodes. */
    rb_iv_set(trie_node, "@trie", self);

    rb_iv_set(trie_node, "@state", Qnil);
    rb_iv_set(trie_node, "@full_state", rb_str_new2(""));
    return trie_node;
}
|
276
|
+
|
277
|
+
|
278
|
+
/*
|
279
|
+
* Document-class: TrieNode
|
280
|
+
*
|
281
|
+
* Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
|
282
|
+
* You can grab a TrieNode for the root of the Trie by using Trie#root.
|
283
|
+
*
|
284
|
+
*/
|
285
|
+
|
286
|
+
static VALUE rb_trie_node_alloc(VALUE klass) {
    /*
     * Allocator for TrieNode: creates a data object of class `klass` with a
     * NULL data pointer. trie_state_free is registered as the free function
     * and no mark function is supplied. Callers such as Trie#root and
     * initialize_copy fill in the TrieState afterwards.
     */
    return Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
}
|
291
|
+
|
292
|
+
/* nodoc */
|
293
|
+
static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
    /*
     * Copy constructor used by dup/clone.
     *
     * Gives `self` its own clone of `from`'s TrieState and its own copies of
     * the @state and @full_state Strings. Returns `self`.
     */
    VALUE state;
    VALUE full_state;

    /* Copying a node onto itself would only replace (and leak) its state. */
    if(self == from)
        return self;

    RDATA(self)->data = trie_state_clone(RDATA(from)->data);

    /* Duplicate the Strings instead of sharing them: @full_state is
     * appended to in place when a node walks, which would otherwise also
     * change the node this one was copied from. */
    state = rb_iv_get(from, "@state");
    rb_iv_set(self, "@state", NIL_P(state) ? Qnil : rb_str_dup(state));

    full_state = rb_iv_get(from, "@full_state");
    rb_iv_set(self, "@full_state", NIL_P(full_state) ? Qnil : rb_str_dup(full_state));

    return self;
}
|
301
|
+
|
302
|
+
/*
|
303
|
+
* call-seq:
|
304
|
+
* state -> single character
|
305
|
+
*
|
306
|
+
* Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e".
|
307
|
+
*
|
308
|
+
*/
|
309
|
+
static VALUE rb_trie_node_get_state(VALUE self) {
    /* Reader for the @state instance variable of this node. */
    VALUE current_state = rb_iv_get(self, "@state");

    return current_state;
}
|
312
|
+
|
313
|
+
/*
|
314
|
+
* call-seq:
|
315
|
+
* full_state -> string
|
316
|
+
*
|
317
|
+
* Returns the full string from the root of the Trie up to this node. So if the node pointing at the "e" in "monkeys",
|
318
|
+
* the full_state is "monke".
|
319
|
+
*
|
320
|
+
*/
|
321
|
+
static VALUE rb_trie_node_get_full_state(VALUE self) {
    /* Reader for the @full_state instance variable of this node. */
    VALUE path_so_far = rb_iv_get(self, "@full_state");

    return path_so_far;
}
|
324
|
+
|
325
|
+
/*
|
326
|
+
* call-seq:
|
327
|
+
* walk!(letter) -> TrieNode
|
328
|
+
*
|
329
|
+
* Tries to walk down a particular branch of the Trie. It modifies the node it is called on.
|
330
|
+
*
|
331
|
+
*/
|
332
|
+
static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
    /*
     * Walks this node one step along `rchar`, modifying the node.
     *
     * Returns `self` when the walk succeeds; returns Qnil when `rchar` is
     * not exactly one byte long or when trie_state_walk() fails. On success
     * @state is set to `rchar` and @full_state is extended by it.
     * Raises TypeError (via StringValue) when `rchar` is not a String and
     * cannot be converted to one.
     */
    TrieState *state;
    VALUE full_state;

    /* Validate the argument before touching RSTRING(rchar). */
    StringValue(rchar);
    if(RSTRING(rchar)->len != 1)
        return Qnil;

    Data_Get_Struct(self, TrieState, state);

    if(!trie_state_walk(state, *RSTRING(rchar)->ptr))
        return Qnil;

    rb_iv_set(self, "@state", rchar);

    /* Build a new String rather than appending in place: the current
     * @full_state object may be shared with the node this one was
     * duplicated from. */
    full_state = rb_iv_get(self, "@full_state");
    rb_iv_set(self, "@full_state", rb_str_plus(full_state, rchar));

    return self;
}
|
350
|
+
|
351
|
+
/*
|
352
|
+
* call-seq:
|
353
|
+
* walk(letter) -> TrieNode
|
354
|
+
*
|
355
|
+
* Tries to walk down a particular branch of the Trie. It clones the node it is called on and
|
356
|
+
* walks with that one, leaving the original unchanged.
|
357
|
+
*
|
358
|
+
*/
|
359
|
+
static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
    /*
     * Non-destructive walk: duplicates this node, walks the duplicate one
     * step along `rchar` and returns the duplicate, leaving `self` as is.
     *
     * Returns Qnil when `rchar` is not exactly one byte long or when
     * trie_state_walk() fails.
     * Raises TypeError (via StringValue) when `rchar` is not a String and
     * cannot be converted to one.
     */
    VALUE new_node;
    VALUE full_state;
    TrieState *state;

    /* Validate the argument first so that no duplicate is created for a
     * call that cannot succeed. */
    StringValue(rchar);
    if(RSTRING(rchar)->len != 1)
        return Qnil;

    new_node = rb_funcall(self, rb_intern("dup"), 0);
    Data_Get_Struct(new_node, TrieState, state);

    if(!trie_state_walk(state, *RSTRING(rchar)->ptr))
        return Qnil;

    rb_iv_set(new_node, "@state", rchar);

    /* Build a new String rather than appending in place: the duplicate's
     * @full_state may be the very same object as the one held by `self`,
     * and `self` must stay unchanged. */
    full_state = rb_iv_get(new_node, "@full_state");
    rb_iv_set(new_node, "@full_state", rb_str_plus(full_state, rchar));

    /* The walked duplicate is the result, not the receiver. */
    return new_node;
}
|
379
|
+
|
380
|
+
/*
|
381
|
+
* call-seq:
|
382
|
+
* value
|
383
|
+
*
|
384
|
+
* Attempts to get the value at this node of the Trie. This only works if the node is a terminal
|
385
|
+
* (i.e. end of a key), otherwise it returns nil.
|
386
|
+
*
|
387
|
+
*/
|
388
|
+
static VALUE rb_trie_node_value(VALUE self) {
    /*
     * Returns the data stored at this node, or Qnil when there is none.
     *
     * The data is read from a temporary clone of the node's state that is
     * walked onto the terminator (0), so the node itself is not moved.
     * Qnil is returned when that walk fails or when the data equals
     * TRIE_DATA_ERROR.
     */
    TrieState *state;
    TrieState *dup;
    TrieData trie_data = TRIE_DATA_ERROR;

    Data_Get_Struct(self, TrieState, state);

    dup = trie_state_clone(state);

    /* Only read the data when the terminator could actually be walked;
     * otherwise the clone still sits on a non-terminal position and
     * whatever trie_state_get_data() reports there is not this key's
     * value. */
    if(trie_state_walk(dup, 0))
        trie_data = trie_state_get_data(dup);

    trie_state_free(dup);

    return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data;
}
|
401
|
+
|
402
|
+
/*
|
403
|
+
* call-seq:
|
404
|
+
* terminal? -> true/nil
|
405
|
+
*
|
406
|
+
* Returns true if this node is at the end of a key. So if you have two keys in your Trie, "he" and
|
407
|
+
* "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?.
|
408
|
+
*
|
409
|
+
*/
|
410
|
+
static VALUE rb_trie_node_terminal(VALUE self) {
    /* Qtrue when trie_state_is_terminal() holds for this node's state,
     * Qnil otherwise. */
    TrieState *state;

    Data_Get_Struct(self, TrieState, state);

    if(trie_state_is_terminal(state))
        return Qtrue;

    return Qnil;
}
|
416
|
+
|
417
|
+
/*
|
418
|
+
* call-seq:
|
419
|
+
* leaf? -> true/nil
|
420
|
+
*
|
421
|
+
* Returns true if there are no branches at this node.
|
422
|
+
*/
|
423
|
+
static VALUE rb_trie_node_leaf(VALUE self) {
    /* Qtrue when trie_state_is_leaf() holds for this node's state,
     * Qnil otherwise. */
    TrieState *state;

    Data_Get_Struct(self, TrieState, state);

    if(trie_state_is_leaf(state))
        return Qtrue;

    return Qnil;
}
|
429
|
+
|
430
|
+
|
431
|
+
void Init_trie() {
|
432
|
+
cTrie = rb_define_class("Trie", rb_cObject);
|
433
|
+
rb_define_alloc_func(cTrie, rb_trie_alloc);
|
434
|
+
rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
|
435
|
+
rb_define_method(cTrie, "get", rb_trie_get, 1);
|
436
|
+
rb_define_method(cTrie, "add", rb_trie_add, -2);
|
437
|
+
rb_define_method(cTrie, "delete", rb_trie_delete, 1);
|
438
|
+
rb_define_method(cTrie, "children", rb_trie_children, 1);
|
439
|
+
rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
|
440
|
+
rb_define_method(cTrie, "root", rb_trie_root, 0);
|
441
|
+
|
442
|
+
cTrieNode = rb_define_class("TrieNode", rb_cObject);
|
443
|
+
rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
|
444
|
+
rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1);
|
445
|
+
rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
|
446
|
+
rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
|
447
|
+
rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
|
448
|
+
rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
|
449
|
+
rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
|
450
|
+
rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
|
451
|
+
rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
|
452
|
+
}
|