aurelian-ruby-ahocorasick 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +21 -0
- data/README.textile +36 -0
- data/examples/dict.rb +14 -0
- data/examples/elev.rb +19 -0
- data/examples/sample.c +94 -0
- data/examples/test.rb +46 -0
- data/ext/ac.c +623 -0
- data/ext/ac.h +36 -0
- data/ext/extconf.rb +4 -0
- data/ext/ruby-ahocorasick.c +329 -0
- data/spec/ahocorasick_spec.rb +183 -0
- metadata +69 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2008 Aurelian Oancea
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.textile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
h3. Introduction
|
2
|
+
|
3
|
+
This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
4
|
+
|
5
|
+
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so that it fits my needs: the search needed to return the current position in the searched string.
|
6
|
+
|
7
|
+
h3. What's the idea?
|
8
|
+
|
9
|
+
Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
|
10
|
+
|
11
|
+
h1. TBD
|
12
|
+
|
13
|
+
<pre>
|
14
|
+
[aurelian@stalingrad ext]$ time ./dict.rb
|
15
|
+
110196
|
16
|
+
711
|
17
|
+
|
18
|
+
real 0m0.538s
|
19
|
+
user 0m0.435s
|
20
|
+
sys 0m0.036s
|
21
|
+
</pre>
|
22
|
+
|
23
|
+
h3. Additional Reading / Implementations
|
24
|
+
|
25
|
+
Other suffix - tree
|
26
|
+
|
27
|
+
* "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
|
28
|
+
* Pytst / Ruby-Pytst
|
29
|
+
* "Aho-Corasick extension":http://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/
|
30
|
+
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
31
|
+
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
32
|
+
|
33
|
+
--
|
34
|
+
|
35
|
+
(c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
|
36
|
+
released under MIT-LICENSE
|
data/examples/dict.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Builds a keyword tree from the bundled English word list, searches the
# bundled news article with it and prints every hit as
# "<first char>..<last char> => <matched value>".

here = File.dirname(__FILE__)

require here + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.from_file(here + "/../spec/data/en.words")

text = File.read(here + "/../spec/data/news.txt")

matches = tree.search(text)

matches.each do |hit|
  # :ends_at is used as an exclusive offset, hence the -1 for the last char.
  first_char = text[hit[:starts_at]].chr
  last_char  = text[hit[:ends_at] - 1].chr
  puts first_char + ".." + last_char + " => " + hit[:value]
end
|
14
|
+
|
data/examples/elev.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Small demo: adds a handful of overlapping keywords to a tree and prints
# every match found in a sample sentence.

$kcode='UTF-8'

require File.dirname(__FILE__) + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.new

# Same insertion order as before; "data" and "base" are both contained in
# "database".
["I've", "data", "base", "database"].each do |keyword|
  tree << keyword
end

sentence = "I've moved my data to a database"

tree.search(sentence).each do |hit|
  first_char = sentence[hit[:starts_at]].chr
  last_char  = sentence[hit[:ends_at] - 1].chr
  puts "-> [ " + hit[:id].to_s + " ] " + hit[:value] + " / " + first_char + ".." + last_char
end
|
19
|
+
|
data/examples/sample.c
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
//
|
2
|
+
// Getting started with Aho-Corasick from Strmat
|
3
|
+
//
|
4
|
+
// lasick Makefile:
|
5
|
+
//
|
6
|
+
// --
|
7
|
+
// ac.o :
|
8
|
+
// gcc -c -fPIC -shared ac.c
|
9
|
+
// libasick : ac.o
|
10
|
+
// gcc -shared -Wl,-soname,libasick.so -o libasick.so.1.0.1
|
11
|
+
// ar rcs libasick.a ac.o
|
12
|
+
// clean :
|
13
|
+
// rm -rf *.o *.a *.so* *.dylib*
|
14
|
+
// --
|
15
|
+
//
|
16
|
+
// Compile this stuff - asick is the library name, generated with the above Makefile :)
|
17
|
+
//
|
18
|
+
// gcc sample.c -o ac-sample -I../ext/ -L../ext/ -lasick
|
19
|
+
//
|
20
|
+
|
21
|
+
#include <string.h>
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <stdlib.h>
|
24
|
+
#include "ac.h"
|
25
|
+
|
26
|
+
int main(int argc, char *argv[]) {
|
27
|
+
char * search;
|
28
|
+
char * remain;
|
29
|
+
char * result;
|
30
|
+
char word[1024];
|
31
|
+
|
32
|
+
FILE *dictionary;
|
33
|
+
FILE *input;
|
34
|
+
int lgt, id, n, i;
|
35
|
+
|
36
|
+
AC_STRUCT * tree;
|
37
|
+
|
38
|
+
input= fopen(argv[1], "r");
|
39
|
+
|
40
|
+
if(input == NULL) {
|
41
|
+
search= argv[1];
|
42
|
+
} else {
|
43
|
+
long lSize;
|
44
|
+
fseek (input , 0 , SEEK_END);
|
45
|
+
lSize= ftell (input);
|
46
|
+
rewind(input);
|
47
|
+
search = (char*) malloc (sizeof(char)*lSize);
|
48
|
+
if (search == NULL) { fputs ("Error: Memory error",stderr); exit(-2); }
|
49
|
+
fread(search, 1, lSize-1, input);
|
50
|
+
}
|
51
|
+
|
52
|
+
dictionary= fopen("../spec/data/dictionary.txt", "r");
|
53
|
+
|
54
|
+
if(dictionary == NULL) {
|
55
|
+
printf("Error: can't open file.\n");
|
56
|
+
return -1;
|
57
|
+
}
|
58
|
+
|
59
|
+
tree= ac_alloc();
|
60
|
+
|
61
|
+
// start counting from 1
|
62
|
+
n= 1;
|
63
|
+
|
64
|
+
printf("==> building dictionary ...");
|
65
|
+
|
66
|
+
while(fgets(word, 1024, dictionary) != NULL) {
|
67
|
+
// strip \n
|
68
|
+
ac_add_string(tree, word, strlen(word)-1, n++);
|
69
|
+
}
|
70
|
+
|
71
|
+
printf("%d entries added.\n",n);
|
72
|
+
|
73
|
+
ac_prep(tree);
|
74
|
+
|
75
|
+
printf("==> input text [%d]:\n--\n%s\n--\n", strlen(search), search);
|
76
|
+
|
77
|
+
ac_search_init(tree, search, strlen(search) );
|
78
|
+
|
79
|
+
while((remain= ac_search(tree, &lgt, &id)) != NULL) {
|
80
|
+
printf("`%d'", remain[lgt+1]);
|
81
|
+
result = (char*) malloc (sizeof(char)*lgt);
|
82
|
+
sprintf( result, "%.*s", lgt, remain);
|
83
|
+
// result: should read first lgt chars from remain.
|
84
|
+
printf("==> result: lenght=> %d, id=> %d [%s]\n", lgt, id, result);
|
85
|
+
free(result);
|
86
|
+
}
|
87
|
+
|
88
|
+
ac_free(tree);
|
89
|
+
fclose(dictionary);
|
90
|
+
free(search);
|
91
|
+
|
92
|
+
return 0;
|
93
|
+
}
|
94
|
+
|
data/examples/test.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Manual smoke test for AhoCorasick::KeywordTree: adds strings with and
# without explicit ids, prints the tree size along the way, exercises a few
# calls that are expected to raise, and dumps two search results.

require File.dirname(__FILE__) + '/../ext/ahocorasick'

# Runs the block and reports a RuntimeError raised by it; anything else
# (including no error at all) passes through untouched.
def report_runtime_error
  yield
rescue RuntimeError => err
  puts "[ok]==> got " + err.class.name + ": " + err.message
end

tree = AhoCorasick::KeywordTree.new

puts tree.size
tree.add_string("foo")

puts tree.size
tree.add_string("bar", 1991)

puts tree.size
tree.add_string("fomz")

report_runtime_error { tree.add_string("foo", -1) }

report_runtime_error { tree.add_string("foo", "bar") }

tree.add_string("timisoara", 22)

puts tree.size
report_runtime_error { tree.add_string("bucuresti", 22) }

tree << "bacau"

tree.search('am fost la bacau').each do |hit|
  puts hit.inspect
end

tree.search('din foo in foo').each do |hit|
  puts hit.inspect
end
|
46
|
+
|
data/ext/ac.c
ADDED
@@ -0,0 +1,623 @@
|
|
1
|
+
/*
|
2
|
+
* ac.c
|
3
|
+
*
|
4
|
+
* Implementation of the Aho-Corasick algorithm.
|
5
|
+
*
|
6
|
+
* NOTES:
|
7
|
+
* 8/94 - Original Implementation (Sean Davis)
|
8
|
+
* 9/94 - Redid Implementation (James Knight)
|
9
|
+
* 3/96 - Modularized the code (James Knight)
|
10
|
+
* 7/96 - Finished the modularization (James Knight)
|
11
|
+
*/
|
12
|
+
|
13
|
+
#include <stdio.h>
|
14
|
+
#include <stdlib.h>
|
15
|
+
#include <string.h>
|
16
|
+
#include "ac.h"
|
17
|
+
|
18
|
+
/*
|
19
|
+
* ac_alloc
|
20
|
+
*
|
21
|
+
* Creates a new AC_STRUCT structure and initializes its fields.
|
22
|
+
*
|
23
|
+
* Parameters: none.
|
24
|
+
*
|
25
|
+
* Returns: A dynamically allocated AC_STRUCT structure.
|
26
|
+
*/
|
27
|
+
AC_STRUCT *ac_alloc(void)
|
28
|
+
{
|
29
|
+
AC_STRUCT *node;
|
30
|
+
|
31
|
+
if ((node = malloc(sizeof(AC_STRUCT))) == NULL)
|
32
|
+
return NULL;
|
33
|
+
memset(node, 0, sizeof(AC_STRUCT));
|
34
|
+
|
35
|
+
if ((node->tree = malloc(sizeof(ACTREE_NODE))) == NULL) {
|
36
|
+
free(node);
|
37
|
+
return NULL;
|
38
|
+
}
|
39
|
+
memset(node->tree, 0, sizeof(ACTREE_NODE));
|
40
|
+
|
41
|
+
return node;
|
42
|
+
}
|
43
|
+
|
44
|
+
|
45
|
+
/*
|
46
|
+
* ac_add_string
|
47
|
+
*
|
48
|
+
* Adds a string to the AC_STRUCT structure's keyword tree.
|
49
|
+
*
|
50
|
+
* NOTE: The `id' value given must be unique to any of the strings
|
51
|
+
* added to the tree, and must be a small integer greater than
|
52
|
+
* 0 (since it is used to index an array holding information
|
53
|
+
* about each of the strings).
|
54
|
+
*
|
55
|
+
* The best id's to use are to number the strings from 1 to K.
|
56
|
+
*
|
57
|
+
* Parameters: node - an AC_STRUCT structure
|
58
|
+
* P - the sequence
|
59
|
+
* M - the sequence length
|
60
|
+
* id - the sequence identifier
|
61
|
+
*
|
62
|
+
* Returns: non-zero on success, zero on error.
|
63
|
+
*/
|
64
|
+
int ac_add_string(AC_STRUCT *node, char *P, int M, int id)
|
65
|
+
{
|
66
|
+
int i, j, newsize;
|
67
|
+
AC_TREE tnode, child, back, newnode, list, tail;
|
68
|
+
|
69
|
+
/*
|
70
|
+
* Return a zero if a previous error had occurred, or if the
|
71
|
+
* given id equals zero. An id value of zero is used by the
|
72
|
+
* algorithm to signal that no pattern ends at a node in the
|
73
|
+
* keyword tree. So, it can't be used as a pattern's id.
|
74
|
+
*/
|
75
|
+
if (node->errorflag || id == 0)
|
76
|
+
return 0;
|
77
|
+
|
78
|
+
P--; /* Shift to make sequence be P[1],...,P[M] */
|
79
|
+
|
80
|
+
/*
|
81
|
+
* Allocate space for the new string's information.
|
82
|
+
*/
|
83
|
+
if (node->Psize <= id) {
|
84
|
+
if (node->Psize == 0) {
|
85
|
+
newsize = (id >= 16 ? id + 1 : 16);
|
86
|
+
node->Plengths = malloc(newsize * sizeof(int));
|
87
|
+
}
|
88
|
+
else {
|
89
|
+
newsize = node->Psize + id + 1;
|
90
|
+
node->Plengths = realloc(node->Plengths, newsize * sizeof(int));
|
91
|
+
}
|
92
|
+
if (node->Plengths == NULL) {
|
93
|
+
node->errorflag = 1;
|
94
|
+
return 0;
|
95
|
+
}
|
96
|
+
|
97
|
+
for (i=node->Psize; i < newsize; i++)
|
98
|
+
node->Plengths[i] = 0;
|
99
|
+
node->Psize = newsize;
|
100
|
+
}
|
101
|
+
|
102
|
+
// duplicate id
|
103
|
+
if (node->Plengths[id] != 0)
|
104
|
+
return 0;
|
105
|
+
|
106
|
+
/*
|
107
|
+
* Add the string to the keyword tree.
|
108
|
+
*/
|
109
|
+
tnode = node->tree;
|
110
|
+
for (i=1; i <= M; i++) {
|
111
|
+
/*
|
112
|
+
* Find the child whose character is P[i].
|
113
|
+
*/
|
114
|
+
back = NULL;
|
115
|
+
child = tnode->children;
|
116
|
+
while (child != NULL && child->ch < P[i]) {
|
117
|
+
back = child;
|
118
|
+
child = child->sibling;
|
119
|
+
}
|
120
|
+
|
121
|
+
if (child == NULL || child->ch != P[i])
|
122
|
+
break;
|
123
|
+
|
124
|
+
tnode = child;
|
125
|
+
|
126
|
+
#ifdef STATS
|
127
|
+
node->prep_old_edges++;
|
128
|
+
#endif
|
129
|
+
|
130
|
+
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* If only part of the pattern exists in the tree, add the
|
134
|
+
* rest of the pattern to the tree.
|
135
|
+
*/
|
136
|
+
if (i <= M) {
|
137
|
+
list = tail = NULL;
|
138
|
+
for (j=i; j <= M; j++) {
|
139
|
+
if ((newnode = malloc(sizeof(ACTREE_NODE))) == NULL)
|
140
|
+
break;
|
141
|
+
memset(newnode, 0, sizeof(ACTREE_NODE));
|
142
|
+
newnode->ch = P[j];
|
143
|
+
|
144
|
+
if (list == NULL)
|
145
|
+
list = tail = newnode;
|
146
|
+
else
|
147
|
+
tail = tail->children = newnode;
|
148
|
+
|
149
|
+
#ifdef STATS
|
150
|
+
node->prep_new_edges++;
|
151
|
+
#endif
|
152
|
+
|
153
|
+
}
|
154
|
+
if (j <= M) {
|
155
|
+
while (list != NULL) {
|
156
|
+
tail = list->children;
|
157
|
+
free(list);
|
158
|
+
list = tail;
|
159
|
+
}
|
160
|
+
return 0;
|
161
|
+
}
|
162
|
+
|
163
|
+
list->sibling = child;
|
164
|
+
if (back == NULL)
|
165
|
+
tnode->children = list;
|
166
|
+
else
|
167
|
+
back->sibling = list;
|
168
|
+
|
169
|
+
tnode = tail;
|
170
|
+
}
|
171
|
+
|
172
|
+
tnode->matchid = id;
|
173
|
+
node->Plengths[id] = M;
|
174
|
+
node->ispreprocessed = 0;
|
175
|
+
|
176
|
+
return 1;
|
177
|
+
}
|
178
|
+
|
179
|
+
|
180
|
+
/*
|
181
|
+
* ac_del_string
|
182
|
+
*
|
183
|
+
* Deletes a string from the keyword tree.
|
184
|
+
*
|
185
|
+
* Parameters: node - an AC_STRUCT structure
|
186
|
+
* P - the sequence to be deleted
|
187
|
+
* M - its length
|
188
|
+
* id - its identifier
|
189
|
+
*
|
190
|
+
* Returns: non-zero on success, zero on error.
|
191
|
+
*/
|
192
|
+
int ac_del_string(AC_STRUCT *node, char *P, int M, int id)
|
193
|
+
{
|
194
|
+
int i, flag;
|
195
|
+
AC_TREE tnode, tlast, tback, child, back;
|
196
|
+
|
197
|
+
if (node->errorflag || id > node->Psize || node->Plengths[id] == 0)
|
198
|
+
return 0;
|
199
|
+
|
200
|
+
P--; /* Shift to make sequence be P[1],...,P[M] */
|
201
|
+
|
202
|
+
/*
|
203
|
+
* Scan the tree for the path corresponding to the keyword to be deleted.
|
204
|
+
*/
|
205
|
+
flag = 1;
|
206
|
+
tlast = tnode = node->tree;
|
207
|
+
tback = NULL;
|
208
|
+
|
209
|
+
for (i=1; i <= M; i++) {
|
210
|
+
/*
|
211
|
+
* Find the child matching P[i]. It must be there.
|
212
|
+
*/
|
213
|
+
child = tnode->children;
|
214
|
+
back = NULL;
|
215
|
+
while (child != NULL && child->ch != P[i]) {
|
216
|
+
back = child;
|
217
|
+
child = child->sibling;
|
218
|
+
}
|
219
|
+
|
220
|
+
if (child == NULL) {
|
221
|
+
fprintf(stderr, "Error in Aho-Corasick preprocessing. String to be "
|
222
|
+
"deleted is not in tree.\n");
|
223
|
+
return 0;
|
224
|
+
}
|
225
|
+
|
226
|
+
/*
|
227
|
+
* Try to find the point where the pattern to be deleted branches off
|
228
|
+
* from the paths of the other patterns in the tree. This point must
|
229
|
+
* be at the latest node which satisfies one of these two conditions:
|
230
|
+
*
|
231
|
+
* 1) Another pattern ends at that node (and so
|
232
|
+
* `child->matchid != 0'). In this case, the branch point is
|
233
|
+
* just below this node and so the children of this node
|
234
|
+
* should be removed.
|
235
|
+
* 2) A node has other siblings. In this case, the node itself
|
236
|
+
* is the branch point, and it and its children should be
|
237
|
+
* removed.
|
238
|
+
*/
|
239
|
+
if (i < M && child->matchid != 0) {
|
240
|
+
flag = 1;
|
241
|
+
tlast = child;
|
242
|
+
}
|
243
|
+
else if (back != NULL || child->sibling != NULL) {
|
244
|
+
flag = 2;
|
245
|
+
tlast = child;
|
246
|
+
tback = (back == NULL ? tnode : back);
|
247
|
+
}
|
248
|
+
|
249
|
+
tnode = child;
|
250
|
+
}
|
251
|
+
|
252
|
+
/*
|
253
|
+
* If the node corresponding to the end of the keyword has children,
|
254
|
+
* then the tree should not be altered, except to remove the keyword's
|
255
|
+
* identifier from the tree.
|
256
|
+
*
|
257
|
+
* Otherwise, apply the appropriate removal, as described above.
|
258
|
+
*/
|
259
|
+
if (tnode->children != NULL) {
|
260
|
+
tnode->matchid = 0;
|
261
|
+
}
|
262
|
+
else {
|
263
|
+
if (flag == 1) {
|
264
|
+
child = tlast->children;
|
265
|
+
tlast->children = NULL;
|
266
|
+
tlast = child;
|
267
|
+
}
|
268
|
+
else {
|
269
|
+
if (tback->children == tlast)
|
270
|
+
tback->children = tlast->sibling;
|
271
|
+
else
|
272
|
+
tback->sibling = tlast->sibling;
|
273
|
+
}
|
274
|
+
|
275
|
+
while (tlast != NULL) {
|
276
|
+
child = tlast->children;
|
277
|
+
free(tlast);
|
278
|
+
tlast = child;
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
node->Plengths[id] = 0;
|
283
|
+
node->ispreprocessed = 0;
|
284
|
+
|
285
|
+
return 1;
|
286
|
+
}
|
287
|
+
|
288
|
+
|
289
|
+
/*
|
290
|
+
* ac_prep
|
291
|
+
*
|
292
|
+
* Compute the failure and output links for the keyword tree.
|
293
|
+
*
|
294
|
+
* Parameters: node - an AC_STRUCT structure
|
295
|
+
*
|
296
|
+
* Returns: non-zero on success, zero on error.
|
297
|
+
*/
|
298
|
+
int ac_prep(AC_STRUCT *node)
|
299
|
+
{
|
300
|
+
char x;
|
301
|
+
AC_TREE v, vprime, w, wprime, root, front, back, child;
|
302
|
+
|
303
|
+
if (node->errorflag)
|
304
|
+
return 0;
|
305
|
+
|
306
|
+
/*
|
307
|
+
* The failure link and output link computation requires a breadth-first
|
308
|
+
* traversal of the keyword tree. And, to do that, we need a queue of
|
309
|
+
* the nodes yet to be processed.
|
310
|
+
*
|
311
|
+
* The `faillink' fields will be used as the pointers for the queue
|
312
|
+
* of nodes to be computed (since the failure link is only set after
|
313
|
+
* the node is removed from the queue).
|
314
|
+
*
|
315
|
+
* The `outlink' fields will be used as the pointers to a node's parent
|
316
|
+
* for nodes in the queue (since the output link is also only set after
|
317
|
+
* the node is removed from the queue).
|
318
|
+
*/
|
319
|
+
root = node->tree;
|
320
|
+
|
321
|
+
front = back = root;
|
322
|
+
front->faillink = NULL;
|
323
|
+
front->outlink = NULL;
|
324
|
+
|
325
|
+
while (front != NULL) {
|
326
|
+
v = front;
|
327
|
+
x = v->ch;
|
328
|
+
vprime = v->outlink;
|
329
|
+
|
330
|
+
/*
|
331
|
+
* Add the node's children to the queue.
|
332
|
+
*/
|
333
|
+
for (child=v->children; child != NULL; child=child->sibling) {
|
334
|
+
child->outlink = v;
|
335
|
+
back->faillink = child;
|
336
|
+
back = child;
|
337
|
+
}
|
338
|
+
back->faillink = NULL;
|
339
|
+
|
340
|
+
front = front->faillink;
|
341
|
+
v->faillink = v->outlink = NULL;
|
342
|
+
|
343
|
+
/*
|
344
|
+
* Set the failure and output links.
|
345
|
+
*/
|
346
|
+
if (v == root)
|
347
|
+
;
|
348
|
+
else if (vprime == root)
|
349
|
+
v->faillink = root;
|
350
|
+
else {
|
351
|
+
/*
|
352
|
+
* Find the find link in the failure link chain which has a child
|
353
|
+
* labeled with x.
|
354
|
+
*/
|
355
|
+
wprime = NULL;
|
356
|
+
w = vprime->faillink;
|
357
|
+
|
358
|
+
while (1) {
|
359
|
+
wprime = w->children;
|
360
|
+
while (wprime != NULL && wprime->ch < x)
|
361
|
+
wprime = wprime->sibling;
|
362
|
+
|
363
|
+
if ((wprime != NULL && wprime->ch == x) || w == root)
|
364
|
+
break;
|
365
|
+
|
366
|
+
w = w->faillink;
|
367
|
+
|
368
|
+
#ifdef STATS
|
369
|
+
node->prep_fail_compares++;
|
370
|
+
#endif
|
371
|
+
}
|
372
|
+
#ifdef STATS
|
373
|
+
node->prep_fail_compares++;
|
374
|
+
#endif
|
375
|
+
|
376
|
+
if (wprime != NULL && wprime->ch == x)
|
377
|
+
v->faillink = wprime;
|
378
|
+
else
|
379
|
+
v->faillink = root;
|
380
|
+
|
381
|
+
if (v->matchid != 0) {
|
382
|
+
if (v->faillink->matchid != 0)
|
383
|
+
v->outlink = v->faillink;
|
384
|
+
else
|
385
|
+
v->outlink = v->faillink->outlink;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
}
|
389
|
+
|
390
|
+
node->ispreprocessed = 1;
|
391
|
+
node->initflag = 0;
|
392
|
+
|
393
|
+
return 1;
|
394
|
+
}
|
395
|
+
|
396
|
+
|
397
|
+
/*
|
398
|
+
* ac_search_init
|
399
|
+
*
|
400
|
+
* Initializes the variables used during an Aho-Corasick search.
|
401
|
+
* See ac_search for an example of how it should be used.
|
402
|
+
*
|
403
|
+
* Parameters: node - an AC_STRUCT structure
|
404
|
+
* T - the sequence to be searched
|
405
|
+
* N - the length of the sequence
|
406
|
+
*
|
407
|
+
* Returns: nothing.
|
408
|
+
*/
|
409
|
+
void ac_search_init(AC_STRUCT *node, char *T, int N)
|
410
|
+
{
|
411
|
+
if (node->errorflag)
|
412
|
+
return;
|
413
|
+
else if (!node->ispreprocessed) {
|
414
|
+
fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
|
415
|
+
"has not been completed.\n");
|
416
|
+
return;
|
417
|
+
}
|
418
|
+
|
419
|
+
node->T = T - 1; /* Shift to make sequence be T[1],...,T[N] */
|
420
|
+
node->N = N;
|
421
|
+
node->c = 1;
|
422
|
+
node->w = node->tree;
|
423
|
+
node->output = NULL;
|
424
|
+
node->initflag = 1;
|
425
|
+
node->endflag = 0;
|
426
|
+
}
|
427
|
+
|
428
|
+
|
429
|
+
/*
|
430
|
+
* ac_search
|
431
|
+
*
|
432
|
+
* Scans a text to look for the next occurrence of one of the patterns
|
433
|
+
* in the text. An example of how this search should be used is the
|
434
|
+
* following:
|
435
|
+
*
|
436
|
+
* s = T;
|
437
|
+
* len = N;
|
438
|
+
* contflag = 0;
|
439
|
+
* ac_search_init(node, T, N);
|
440
|
+
* while ((s = ac_search(node, &matchlen, &matchid) != NULL) {
|
441
|
+
* >>> Pattern `matchid' matched from `s' to `s + matchlen - 1'. <<<
|
442
|
+
* }
|
443
|
+
*
|
444
|
+
* where `node', `T' and `N' are assumed to be initialized appropriately.
|
445
|
+
*
|
446
|
+
* Parameters: node - a preprocessed AC_STRUCT structure
|
447
|
+
* length_out - where to store the new match's length
|
448
|
+
* id_out - where to store the identifier of the
|
449
|
+
* pattern that matched
|
450
|
+
* ends_at - where to store the n-th matched char
|
451
|
+
*
|
452
|
+
* Returns: the left end of the text that matches a pattern, or NULL
|
453
|
+
* if no match occurs. (It also stores values in `*length_out',
|
454
|
+
* and `*id_out' giving the match's length and pattern identifier.
|
455
|
+
*/
|
456
|
+
char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at)
|
457
|
+
{
|
458
|
+
int c, N, id;
|
459
|
+
char *T;
|
460
|
+
AC_TREE w, wprime, root;
|
461
|
+
|
462
|
+
if (node->errorflag)
|
463
|
+
return NULL;
|
464
|
+
else if (!node->ispreprocessed) {
|
465
|
+
fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
|
466
|
+
"has not been completed.\n");
|
467
|
+
return NULL;
|
468
|
+
}
|
469
|
+
else if (!node->initflag) {
|
470
|
+
fprintf(stderr, "Error in Aho-Corasick search. ac_search_init was not "
|
471
|
+
"called.\n");
|
472
|
+
return NULL;
|
473
|
+
}
|
474
|
+
else if (node->endflag)
|
475
|
+
return NULL;
|
476
|
+
|
477
|
+
T = node->T;
|
478
|
+
N = node->N;
|
479
|
+
c = node->c;
|
480
|
+
w = node->w;
|
481
|
+
root = node->tree;
|
482
|
+
|
483
|
+
/*
|
484
|
+
* If the last call to ac_search returned a match, check for another
|
485
|
+
* match ending at the same right endpoint (denoted by a non-NULL
|
486
|
+
* output link).
|
487
|
+
*/
|
488
|
+
if (node->output != NULL) {
|
489
|
+
node->output = node->output->outlink;
|
490
|
+
|
491
|
+
#ifdef STATS
|
492
|
+
node->outlinks_traversed++;
|
493
|
+
#endif
|
494
|
+
|
495
|
+
if (node->output != NULL) {
|
496
|
+
id = node->output->matchid;
|
497
|
+
if (id_out)
|
498
|
+
*id_out = id;
|
499
|
+
if (length_out)
|
500
|
+
*length_out = node->Plengths[id];
|
501
|
+
if (ends_at)
|
502
|
+
*ends_at= c;
|
503
|
+
return &T[c] - node->Plengths[id];
|
504
|
+
}
|
505
|
+
|
506
|
+
}
|
507
|
+
|
508
|
+
/*
|
509
|
+
* Run the search algorithm, stopping at the first position where a
|
510
|
+
* match to one of the patterns occurs.
|
511
|
+
*/
|
512
|
+
while (c <= N) {
|
513
|
+
/*
|
514
|
+
* Try to match the next input character to a child in the tree.
|
515
|
+
*/
|
516
|
+
wprime = w->children;
|
517
|
+
while (wprime != NULL && wprime->ch != T[c])
|
518
|
+
wprime = wprime->sibling;
|
519
|
+
|
520
|
+
#ifdef STATS
|
521
|
+
node->num_compares++;
|
522
|
+
#endif
|
523
|
+
|
524
|
+
/*
|
525
|
+
* If the match fails, then either use the failure link (if not
|
526
|
+
* at the root), or move to the next character since no prefix
|
527
|
+
* of any pattern ends with character T[c].
|
528
|
+
*/
|
529
|
+
if (wprime == NULL) {
|
530
|
+
if (w == root)
|
531
|
+
c++;
|
532
|
+
else {
|
533
|
+
w = w->faillink;
|
534
|
+
|
535
|
+
#ifdef STATS
|
536
|
+
node->num_failures++;
|
537
|
+
#endif
|
538
|
+
|
539
|
+
}
|
540
|
+
}
|
541
|
+
else {
|
542
|
+
/*
|
543
|
+
* If we could match the input, move down the tree and to the
|
544
|
+
* next input character, and see if that match completes the
|
545
|
+
* match to a pattern (when matchid != 0 or outlink != NULL).
|
546
|
+
*/
|
547
|
+
c++;
|
548
|
+
w = wprime;
|
549
|
+
|
550
|
+
#ifdef STATS
|
551
|
+
node->edges_traversed++;
|
552
|
+
#endif
|
553
|
+
|
554
|
+
if (w->matchid != 0)
|
555
|
+
node->output = w;
|
556
|
+
else if (w->outlink != NULL) {
|
557
|
+
node->output = w->outlink;
|
558
|
+
|
559
|
+
#ifdef STATS
|
560
|
+
node->outlinks_traversed++;
|
561
|
+
#endif
|
562
|
+
|
563
|
+
}
|
564
|
+
|
565
|
+
if (node->output != NULL) {
|
566
|
+
id = node->output->matchid;
|
567
|
+
if (id_out)
|
568
|
+
*id_out = id;
|
569
|
+
if (length_out)
|
570
|
+
*length_out= node->Plengths[id];
|
571
|
+
if(ends_at)
|
572
|
+
*ends_at= c;
|
573
|
+
|
574
|
+
node->w = w;
|
575
|
+
node->c = c; // ends_at - length_out;
|
576
|
+
|
577
|
+
return &T[c] - node->Plengths[id];
|
578
|
+
}
|
579
|
+
}
|
580
|
+
}
|
581
|
+
|
582
|
+
node->c = c;
|
583
|
+
node->endflag = 1;
|
584
|
+
|
585
|
+
return NULL;
|
586
|
+
}
|
587
|
+
|
588
|
+
|
589
|
+
/*
|
590
|
+
* ac_free
|
591
|
+
*
|
592
|
+
* Free up the allocated AC_STRUCT structure.
|
593
|
+
*
|
594
|
+
* Parameters: node - a AC_STRUCT structure
|
595
|
+
*
|
596
|
+
* Returns: nothing.
|
597
|
+
*/
|
598
|
+
void ac_free(AC_STRUCT *node)
|
599
|
+
{
|
600
|
+
AC_TREE front, back, next;
|
601
|
+
|
602
|
+
if (node == NULL)
|
603
|
+
return;
|
604
|
+
|
605
|
+
if (node->tree != NULL) {
|
606
|
+
front = back = node->tree;
|
607
|
+
while (front != NULL) {
|
608
|
+
back->sibling = front->children;
|
609
|
+
while (back->sibling != NULL)
|
610
|
+
back = back->sibling;
|
611
|
+
|
612
|
+
next = front->sibling;
|
613
|
+
free(front);
|
614
|
+
front = next;
|
615
|
+
}
|
616
|
+
}
|
617
|
+
|
618
|
+
if (node->Plengths != NULL)
|
619
|
+
free(node->Plengths);
|
620
|
+
|
621
|
+
free(node);
|
622
|
+
}
|
623
|
+
|