fast_trie 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +132 -0
- data/VERSION.yml +4 -0
- data/ext/trie/Makefile +149 -0
- data/ext/trie/darray.c +673 -0
- data/ext/trie/darray.h +233 -0
- data/ext/trie/extconf.rb +3 -0
- data/ext/trie/fileutils.c +151 -0
- data/ext/trie/fileutils.h +36 -0
- data/ext/trie/tail.c +340 -0
- data/ext/trie/tail.h +207 -0
- data/ext/trie/trie-private.c +299 -0
- data/ext/trie/trie-private.h +31 -0
- data/ext/trie/trie.c +452 -0
- data/ext/trie/trie.h +40 -0
- data/ext/trie/triedefs.h +73 -0
- data/ext/trie/typedefs.h +113 -0
- data/lib/trie.rb +1 -0
- data/spec/trie_spec.rb +266 -0
- metadata +80 -0
data/ext/trie/tail.h
ADDED
@@ -0,0 +1,207 @@
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
2
|
+
/*
|
3
|
+
* tail.h - trie tail for keeping suffixes
|
4
|
+
* Created: 2006-08-12
|
5
|
+
* Author: Theppitak Karoonboonyanan <thep@linux.thai.net>
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef __TAIL_H
|
9
|
+
#define __TAIL_H
|
10
|
+
|
11
|
+
#include "triedefs.h"
|
12
|
+
|
13
|
+
/**
|
14
|
+
* @file tail.h
|
15
|
+
* @brief trie tail for keeping suffixes
|
16
|
+
*/
|
17
|
+
|
18
|
+
/**
|
19
|
+
 * @brief Tail structure type
|
20
|
+
*/
|
21
|
+
typedef struct _Tail Tail;
|
22
|
+
|
23
|
+
/**
|
24
|
+
* @brief Create a new tail object
|
25
|
+
*
|
26
|
+
* Create a new empty tail object.
|
27
|
+
*/
|
28
|
+
Tail * tail_new ();
|
29
|
+
|
30
|
+
/**
|
31
|
+
* @brief Read tail data from file
|
32
|
+
*
|
33
|
+
* @param file : the file to read
|
34
|
+
*
|
35
|
+
 * @return a pointer to the opened tail data, NULL on failure
|
36
|
+
*
|
37
|
+
* Read tail data from the opened file, starting from the current
|
38
|
+
* file pointer until the end of tail data block. On return, the
|
39
|
+
* file pointer is left at the position after the read block.
|
40
|
+
*/
|
41
|
+
Tail * tail_read (FILE *file);
|
42
|
+
|
43
|
+
/**
|
44
|
+
* @brief Free tail data
|
45
|
+
*
|
46
|
+
* @param t : the tail data
|
47
|
+
*
|
48
|
+
 * @return nothing (the function is declared void)
|
49
|
+
*
|
50
|
+
* Free the given tail data.
|
51
|
+
*/
|
52
|
+
void tail_free (Tail *t);
|
53
|
+
|
54
|
+
/**
|
55
|
+
* @brief Write tail data
|
56
|
+
*
|
57
|
+
* @param t : the tail data
|
58
|
+
* @param file : the file to write to
|
59
|
+
*
|
60
|
+
* @return 0 on success, non-zero on failure
|
61
|
+
*
|
62
|
+
* Write tail data to the given @a file, starting from the current file
|
63
|
+
* pointer. On return, the file pointer is left after the tail data block.
|
64
|
+
*/
|
65
|
+
int tail_write (const Tail *t, FILE *file);
|
66
|
+
|
67
|
+
|
68
|
+
/**
|
69
|
+
* @brief Get suffix
|
70
|
+
*
|
71
|
+
* @param t : the tail data
|
72
|
+
* @param index : the index of the suffix
|
73
|
+
*
|
74
|
+
* @return an allocated string of the indexed suffix.
|
75
|
+
*
|
76
|
+
* Get suffix from tail with given @a index. The returned string is allocated.
|
77
|
+
* The caller should free it with free().
|
78
|
+
*/
|
79
|
+
const TrieChar * tail_get_suffix (const Tail *t, TrieIndex index);
|
80
|
+
|
81
|
+
/**
|
82
|
+
* @brief Set suffix of existing entry
|
83
|
+
*
|
84
|
+
* @param t : the tail data
|
85
|
+
* @param index : the index of the suffix
|
86
|
+
* @param suffix : the new suffix
|
87
|
+
*
|
88
|
+
* Set suffix of existing entry of given @a index in tail.
|
89
|
+
*/
|
90
|
+
Bool tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix);
|
91
|
+
|
92
|
+
/**
|
93
|
+
* @brief Add a new suffix
|
94
|
+
*
|
95
|
+
* @param t : the tail data
|
96
|
+
* @param suffix : the new suffix
|
97
|
+
*
|
98
|
+
* @return the index of the newly added suffix.
|
99
|
+
*
|
100
|
+
* Add a new suffix entry to tail.
|
101
|
+
*/
|
102
|
+
TrieIndex tail_add_suffix (Tail *t, const TrieChar *suffix);
|
103
|
+
|
104
|
+
/**
|
105
|
+
* @brief Get data associated to suffix entry
|
106
|
+
*
|
107
|
+
* @param t : the tail data
|
108
|
+
* @param index : the index of the suffix
|
109
|
+
*
|
110
|
+
* @return the data associated to the suffix entry
|
111
|
+
*
|
112
|
+
* Get data associated to suffix entry @a index in tail data.
|
113
|
+
*/
|
114
|
+
TrieData tail_get_data (const Tail *t, TrieIndex index);
|
115
|
+
|
116
|
+
/**
|
117
|
+
* @brief Set data associated to suffix entry
|
118
|
+
*
|
119
|
+
* @param t : the tail data
|
120
|
+
* @param index : the index of the suffix
|
121
|
+
* @param data : the data to set
|
122
|
+
*
|
123
|
+
* @return boolean indicating success
|
124
|
+
*
|
125
|
+
* Set data associated to suffix entry @a index in tail data.
|
126
|
+
*/
|
127
|
+
Bool tail_set_data (Tail *t, TrieIndex index, TrieData data);
|
128
|
+
|
129
|
+
/**
|
130
|
+
* @brief Delete suffix entry
|
131
|
+
*
|
132
|
+
* @param t : the tail data
|
133
|
+
* @param index : the index of the suffix to delete
|
134
|
+
*
|
135
|
+
* Delete suffix entry from the tail data.
|
136
|
+
*/
|
137
|
+
void tail_delete (Tail *t, TrieIndex index);
|
138
|
+
|
139
|
+
/**
|
140
|
+
* @brief Walk in tail with a string
|
141
|
+
*
|
142
|
+
* @param t : the tail data
|
143
|
+
* @param s : the tail data index
|
144
|
+
* @param suffix_idx : pointer to current character index in suffix
|
145
|
+
* @param str : the string to use in walking
|
146
|
+
* @param len : total characters in @a str to walk
|
147
|
+
*
|
148
|
+
* @return total number of characters successfully walked
|
149
|
+
*
|
150
|
+
* Walk in the tail data @a t at entry @a s, from given character position
|
151
|
+
* @a *suffix_idx, using @a len characters of given string @a str. On return,
|
152
|
+
* @a *suffix_idx is updated to the position after the last successful walk,
|
153
|
+
 * and the function returns the total number of characters successfully walked.
|
154
|
+
*/
|
155
|
+
int tail_walk_str (const Tail *t,
|
156
|
+
TrieIndex s,
|
157
|
+
short *suffix_idx,
|
158
|
+
const TrieChar *str,
|
159
|
+
int len);
|
160
|
+
|
161
|
+
/**
|
162
|
+
* @brief Walk in tail with a character
|
163
|
+
*
|
164
|
+
* @param t : the tail data
|
165
|
+
* @param s : the tail data index
|
166
|
+
* @param suffix_idx : pointer to current character index in suffix
|
167
|
+
* @param c : the character to use in walking
|
168
|
+
*
|
169
|
+
* @return boolean indicating success
|
170
|
+
*
|
171
|
+
* Walk in the tail data @a t at entry @a s, from given character position
|
172
|
+
* @a *suffix_idx, using given character @a c. If the walk is successful,
|
173
|
+
* it returns TRUE, and @a *suffix_idx is updated to the next character.
|
174
|
+
* Otherwise, it returns FALSE, and @a *suffix_idx is left unchanged.
|
175
|
+
*/
|
176
|
+
Bool tail_walk_char (const Tail *t,
|
177
|
+
TrieIndex s,
|
178
|
+
short *suffix_idx,
|
179
|
+
TrieChar c);
|
180
|
+
|
181
|
+
/**
|
182
|
+
* @brief Test walkability in tail with a character
|
183
|
+
*
|
184
|
+
* @param t : the tail data
|
185
|
+
* @param s : the tail data index
|
186
|
+
* @param suffix_idx : current character index in suffix
|
187
|
+
* @param c : the character to test walkability
|
188
|
+
*
|
189
|
+
* @return boolean indicating walkability
|
190
|
+
*
|
191
|
+
* Test if the character @a c can be used to walk from given character
|
192
|
+
* position @a suffix_idx of entry @a s of the tail data @a t.
|
193
|
+
*/
|
194
|
+
/*
|
195
|
+
Bool tail_is_walkable_char (Tail *t,
|
196
|
+
TrieIndex s,
|
197
|
+
short suffix_idx,
|
198
|
+
const TrieChar c);
|
199
|
+
*/
|
200
|
+
/*
 * Macro form of the walkability test: fetches suffix entry s of tail t
 * with tail_get_suffix() and compares the character at position
 * suffix_idx with c.
 *
 * NOTE(review): the pointer returned by tail_get_suffix() is indexed
 * without a NULL check -- confirm callers only pass valid entries.
 */
#define tail_is_walkable_char(t,s,suffix_idx,c)                     \
    (tail_get_suffix ((t), (s)) [(suffix_idx)] == (c))
|
202
|
+
|
203
|
+
#endif /* __TAIL_H */
|
204
|
+
|
205
|
+
/*
|
206
|
+
vi:ts=4:ai:expandtab
|
207
|
+
*/
|
@@ -0,0 +1,299 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "darray.h"
|
5
|
+
#include "tail.h"
|
6
|
+
#include "trie.h"
|
7
|
+
|
8
|
+
Trie* trie_new() {
|
9
|
+
Trie *trie = (Trie*) malloc(sizeof(Trie));
|
10
|
+
trie->da = da_new();
|
11
|
+
trie->tail = tail_new();
|
12
|
+
return trie;
|
13
|
+
}
|
14
|
+
|
15
|
+
/**
 * Destroy a trie.
 *
 * Releases the double-array part with da_free(), the tail part with
 * tail_free(), and finally the Trie object itself.
 *
 * @param trie : the trie to destroy; passing NULL is a no-op, mirroring free()
 */
void trie_free(Trie *trie) {
    if (!trie)
        return;

    da_free(trie->da);
    tail_free(trie->tail);
    free(trie);
}
|
20
|
+
|
21
|
+
/*
 * Attach a new key ending below an existing double-array node.
 *
 * A branch labelled with the first character of suffix is inserted under
 * sep_node; the rest of suffix is stored as a new tail entry carrying data,
 * and the new branch node is pointed at that tail entry.
 *
 * Returns FALSE if the branch cannot be inserted, TRUE otherwise.
 */
static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
    const TrieChar *rest = suffix;
    TrieIndex branch_node;
    TrieIndex tail_slot;

    branch_node = da_insert_branch (trie->da, sep_node, *rest);
    if (branch_node == TRIE_INDEX_ERROR)
        return FALSE;

    /* The branch consumed the first character; unless that character was
     * the terminator itself, the tail only stores what follows it. */
    if (*rest != '\0')
        rest++;

    tail_slot = tail_add_suffix (trie->tail, rest);
    tail_set_data (trie->tail, tail_slot, data);
    trie_da_set_tail_index (trie->da, branch_node, tail_slot);

    return TRUE;
}
|
38
|
+
|
39
|
+
/*
 * Split an existing tail entry so that a new key can diverge from it.
 *
 * The characters shared by the stored suffix of sep_node and the new
 * suffix are turned into double-array branches. The remainder of the old
 * suffix is re-attached below the last shared node, and the new key is
 * then added at that node via trie_branch_in_branch().
 *
 * Returns FALSE if the stored suffix cannot be fetched or a branch cannot
 * be inserted (in the latter case the inserted branches are pruned and
 * sep_node is pointed back at its original tail entry).
 */
static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
    TrieIndex tail_slot;
    TrieIndex node;
    TrieIndex old_branch;
    const TrieChar *stored;

    /* locate the suffix currently hanging off the separate node */
    tail_slot = trie_da_get_tail_index (trie->da, sep_node);
    stored = tail_get_suffix (trie->tail, tail_slot);
    if (!stored)
        return FALSE;

    /* convert the common prefix into double-array branches */
    node = sep_node;
    while (*stored == *suffix) {
        TrieIndex next = da_insert_branch (trie->da, node, *stored);
        if (next == TRIE_INDEX_ERROR)
            goto undo;
        node = next;
        stored++;
        suffix++;
    }

    /* first differing character of the old key gets its own branch */
    old_branch = da_insert_branch (trie->da, node, *stored);
    if (old_branch == TRIE_INDEX_ERROR)
        goto undo;

    /* the old tail entry keeps only what follows that character */
    if (*stored != '\0')
        stored++;
    tail_set_suffix (trie->tail, tail_slot, stored);
    trie_da_set_tail_index (trie->da, old_branch, tail_slot);

    /* finally hang the new key off the divergence point */
    return trie_branch_in_branch (trie, node, suffix, data);

undo:
    /* roll back the branches added so far and restore the tail link */
    da_prune_upto (trie->da, sep_node, node);
    trie_da_set_tail_index (trie->da, sep_node, tail_slot);
    return FALSE;
}
|
74
|
+
|
75
|
+
/**
 * Store a key/data pair in the trie.
 *
 * The key is walked through the double-array branches first. If a
 * character cannot be walked, the rest of the key is added as a new branch.
 * Otherwise the remaining characters (terminator included) are matched
 * against the tail entry of the separate node: a partial match splits the
 * tail, a full match means the key already exists and its data is
 * overwritten.
 *
 * @param trie : the trie
 * @param key  : the null-terminated key
 * @param data : the data to associate with the key
 *
 * @return TRUE on success; otherwise whatever the branching helper reports.
 */
Bool trie_store (Trie *trie, const TrieChar *key, TrieData data) {
    TrieIndex s, t;
    short suffix_idx;
    const TrieChar *p;
    int len, walked;

    /* walk through branches */
    s = da_get_root (trie->da);
    for (p = key; !trie_da_is_separate (trie->da, s); p++) {
        if (!da_walk (trie->da, &s, *p))
            return trie_branch_in_branch (trie, s, p, data);
        if (0 == *p)
            break;
    }

    /* walk through tail */
    t = trie_da_get_tail_index (trie->da, s);
    suffix_idx = 0;
    /* tail_walk_str() takes and returns int, so keep the length in the
     * same type instead of comparing an int against a size_t */
    len = (int) strlen ((const char *) p) + 1; /* including null-terminator */
    walked = tail_walk_str (trie->tail, t, &suffix_idx, p, len);
    if (walked != len)
        return trie_branch_in_tail (trie, s, p, data);

    /* duplicated key, overwrite val */
    tail_set_data (trie->tail, t, data);
    return TRUE;
}
|
103
|
+
|
104
|
+
|
105
|
+
Bool trie_has_key (const Trie *trie, const TrieChar *key) {
|
106
|
+
TrieIndex s;
|
107
|
+
short suffix_idx;
|
108
|
+
const TrieChar *p;
|
109
|
+
|
110
|
+
/* walk through branches */
|
111
|
+
s = da_get_root (trie->da);
|
112
|
+
for (p = key; !trie_da_is_separate (trie->da, s); p++) {
|
113
|
+
if (!da_walk (trie->da, &s, *p))
|
114
|
+
return FALSE;
|
115
|
+
if (0 == *p)
|
116
|
+
break;
|
117
|
+
}
|
118
|
+
|
119
|
+
/* walk through tail */
|
120
|
+
s = trie_da_get_tail_index (trie->da, s);
|
121
|
+
suffix_idx = 0;
|
122
|
+
for ( ; ; p++) {
|
123
|
+
if (!tail_walk_char (trie->tail, s, &suffix_idx, *p))
|
124
|
+
return FALSE;
|
125
|
+
if (0 == *p)
|
126
|
+
break;
|
127
|
+
}
|
128
|
+
|
129
|
+
return TRUE;
|
130
|
+
}
|
131
|
+
|
132
|
+
|
133
|
+
/**
 * Look up the data stored for a key.
 *
 * @param trie   : the trie
 * @param key    : the null-terminated key to look for
 * @param o_data : where to store the data found; may be NULL when only
 *                 the presence of the key matters
 *
 * @return TRUE if the key is found (and *o_data is set when o_data is
 *         non-NULL); FALSE otherwise, leaving *o_data untouched.
 */
Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data) {
    const TrieChar *cursor = key;
    TrieIndex node = da_get_root (trie->da);
    TrieIndex tail_slot;
    short pos = 0;

    /* branch part: stop at a separate node or once the terminator is walked */
    while (!trie_da_is_separate (trie->da, node)) {
        if (!da_walk (trie->da, &node, *cursor))
            return FALSE;
        if (*cursor == 0)
            break;
        cursor++;
    }

    /* tail part: match the remaining characters up to and including the terminator */
    tail_slot = trie_da_get_tail_index (trie->da, node);
    for (;;) {
        const TrieChar ch = *cursor++;

        if (!tail_walk_char (trie->tail, tail_slot, &pos, ch))
            return FALSE;
        if (ch == 0)
            break;
    }

    /* key matched completely: hand back its data if the caller wants it */
    if (o_data)
        *o_data = tail_get_data (trie->tail, tail_slot);

    return TRUE;
}
|
162
|
+
|
163
|
+
/**
 * Remove a key from the trie.
 *
 * @param trie : the trie
 * @param key  : the null-terminated key to remove
 *
 * @return FALSE if the key cannot be walked completely (nothing is
 *         modified in that case); TRUE after the tail entry has been
 *         deleted and the double-array pruned from the separate node.
 */
Bool trie_delete (Trie *trie, const TrieChar *key) {
    const TrieChar *cursor = key;
    TrieIndex node = da_get_root (trie->da);
    TrieIndex tail_slot;
    short pos = 0;

    /* branch part: stop at a separate node or once the terminator is walked */
    while (!trie_da_is_separate (trie->da, node)) {
        if (!da_walk (trie->da, &node, *cursor))
            return FALSE;
        if (*cursor == 0)
            break;
        cursor++;
    }

    /* tail part: match the remaining characters up to and including the terminator */
    tail_slot = trie_da_get_tail_index (trie->da, node);
    for (;;) {
        const TrieChar ch = *cursor++;

        if (!tail_walk_char (trie->tail, tail_slot, &pos, ch))
            return FALSE;
        if (ch == 0)
            break;
    }

    /* drop the tail entry, detach the node from it, then prune the branch */
    tail_delete (trie->tail, tail_slot);
    da_set_base (trie->da, node, TRIE_INDEX_ERROR);
    da_prune (trie->da, node);

    return TRUE;
}
|
194
|
+
|
195
|
+
/*-------------------------------*
|
196
|
+
* STEPWISE QUERY OPERATIONS *
|
197
|
+
*-------------------------------*/
|
198
|
+
|
199
|
+
TrieState * trie_root (const Trie *trie) {
|
200
|
+
return trie_state_new (trie, da_get_root (trie->da), 0, FALSE);
|
201
|
+
}
|
202
|
+
|
203
|
+
/*----------------*
|
204
|
+
* TRIE STATE *
|
205
|
+
*----------------*/
|
206
|
+
|
207
|
+
/*
 * Allocate a TrieState and fill in all of its fields.
 *
 * trie       : the trie the state walks in
 * index      : current node index (or tail index when is_suffix is set)
 * suffix_idx : current character position inside the tail suffix
 * is_suffix  : whether the state is currently inside the tail
 *
 * Returns the new state, or NULL if malloc() fails.
 */
static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix) {
    TrieState *state = malloc (sizeof *state);

    if (state) {
        state->trie = trie;
        state->index = index;
        state->suffix_idx = suffix_idx;
        state->is_suffix = is_suffix;
    }

    return state;
}
|
221
|
+
|
222
|
+
TrieState * trie_state_clone (const TrieState *s) {
|
223
|
+
return trie_state_new (s->trie, s->index, s->suffix_idx, s->is_suffix);
|
224
|
+
}
|
225
|
+
|
226
|
+
/**
 * Release a walking state.
 *
 * @param s : the state to release; handed directly to free()
 */
void trie_state_free (TrieState *s)
{
    free (s);
}
|
229
|
+
|
230
|
+
/**
 * Put a walking state back at the root of its trie.
 *
 * Only the node index and the suffix flag are reset; suffix_idx is left
 * as it is.
 *
 * @param s : the state to rewind
 */
void trie_state_rewind (TrieState *s) {
    const TrieIndex root = da_get_root (s->trie->da);

    s->is_suffix = FALSE;
    s->index = root;
}
|
234
|
+
|
235
|
+
/**
 * Advance a walking state by one character.
 *
 * While the state is in the double-array part, the walk is done with
 * da_walk(); if it succeeds and lands on a separate node, the state
 * switches to the tail entry of that node at suffix position 0. Once in
 * the tail, the walk is delegated to tail_walk_char().
 *
 * @param s : the state to advance
 * @param c : the character to walk with
 *
 * @return the result of the underlying walk
 */
Bool trie_state_walk (TrieState *s, TrieChar c) {
    Bool walked;

    /* already inside the tail: just advance within the suffix */
    if (s->is_suffix)
        return tail_walk_char (s->trie->tail, s->index, &s->suffix_idx, c);

    walked = da_walk (s->trie->da, &s->index, c);

    /* reached a separate node: continue from the start of its tail entry */
    if (walked && trie_da_is_separate (s->trie->da, s->index)) {
        s->index = trie_da_get_tail_index (s->trie->da, s->index);
        s->suffix_idx = 0;
        s->is_suffix = TRUE;
    }

    return walked;
}
|
252
|
+
|
253
|
+
/**
 * Test whether a state could be advanced with a character, without
 * moving it.
 *
 * @param s : the state to test
 * @param c : the candidate character
 *
 * @return the result of tail_is_walkable_char() when the state is in the
 *         tail, of da_is_walkable() otherwise
 */
Bool trie_state_is_walkable (const TrieState *s, TrieChar c) {
    return s->is_suffix
        ? tail_is_walkable_char (s->trie->tail, s->index, s->suffix_idx, c)
        : da_is_walkable (s->trie->da, s->index, c);
}
|
259
|
+
|
260
|
+
Bool trie_state_is_leaf (const TrieState *s) {
|
261
|
+
return s->is_suffix && trie_state_is_terminal (s);
|
262
|
+
}
|
263
|
+
|
264
|
+
TrieData trie_state_get_data (const TrieState *s) {
|
265
|
+
return s->is_suffix ? tail_get_data (s->trie->tail, s->index) : TRIE_DATA_ERROR;
|
266
|
+
}
|
267
|
+
|
268
|
+
int main(void) {
|
269
|
+
Bool res;
|
270
|
+
TrieData *data = (TrieData*)malloc(sizeof(TrieData));
|
271
|
+
Trie *trie = trie_new();
|
272
|
+
|
273
|
+
|
274
|
+
trie_store(trie, (const TrieChar*)"hello", 1);
|
275
|
+
trie_store(trie, (const TrieChar*)"he", 4);
|
276
|
+
trie_store(trie, (const TrieChar*)"hel", 3);
|
277
|
+
trie_store(trie, (const TrieChar*)"h", 5);
|
278
|
+
trie_store(trie, (const TrieChar*)"hell", 2);
|
279
|
+
|
280
|
+
|
281
|
+
res = trie_retrieve(trie, (const TrieChar*)"hello", data);
|
282
|
+
printf(res ? "Win!\n" : "Fail!\n");
|
283
|
+
|
284
|
+
res = trie_retrieve(trie, (const TrieChar*)"hell", data);
|
285
|
+
printf(res ? "Win!\n" : "Fail!\n");
|
286
|
+
|
287
|
+
res = trie_retrieve(trie, (const TrieChar*)"hel", data);
|
288
|
+
printf(res ? "Win!\n" : "Fail!\n");
|
289
|
+
|
290
|
+
res = trie_retrieve(trie, (const TrieChar*)"he", data);
|
291
|
+
printf(res ? "Win!\n" : "Fail!\n");
|
292
|
+
|
293
|
+
res = trie_retrieve(trie, (const TrieChar*)"h", data);
|
294
|
+
printf(res ? "Win!\n" : "Fail!\n");
|
295
|
+
|
296
|
+
|
297
|
+
trie_free(trie);
|
298
|
+
return 0;
|
299
|
+
}
|