aurelian-ruby-ahocorasick 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +21 -0
- data/README.textile +36 -0
- data/examples/dict.rb +14 -0
- data/examples/elev.rb +19 -0
- data/examples/sample.c +94 -0
- data/examples/test.rb +46 -0
- data/ext/ac.c +623 -0
- data/ext/ac.h +36 -0
- data/ext/extconf.rb +4 -0
- data/ext/ruby-ahocorasick.c +329 -0
- data/spec/ahocorasick_spec.rb +183 -0
- metadata +69 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2008 Aurelian Oancea
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.textile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
h3. Introduction
|
2
|
+
|
3
|
+
This library is a Ruby extension: a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C found in the "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
4
|
+
|
5
|
+
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so that it fits my needs: search needed to return the current position in the searched string.
|
6
|
+
|
7
|
+
h3. What's the idea?
|
8
|
+
|
9
|
+
Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
|
10
|
+
|
11
|
+
h1. TBD
|
12
|
+
|
13
|
+
<pre>
|
14
|
+
[aurelian@stalingrad ext]$ time ./dict.rb
|
15
|
+
110196
|
16
|
+
711
|
17
|
+
|
18
|
+
real 0m0.538s
|
19
|
+
user 0m0.435s
|
20
|
+
sys 0m0.036s
|
21
|
+
</pre>
|
22
|
+
|
23
|
+
h3. Additional Reading / Implementations
|
24
|
+
|
25
|
+
Other suffix-tree and keyword-search implementations:
|
26
|
+
|
27
|
+
* "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
|
28
|
+
* Pytst / Ruby-Pytst
|
29
|
+
* "Aho-Corasick extension":http://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/
|
30
|
+
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
31
|
+
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
32
|
+
|
33
|
+
--
|
34
|
+
|
35
|
+
(c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
|
36
|
+
released under the MIT license (see MIT-LICENSE)
|
data/examples/dict.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Loads the word list in spec/data/en.words into a keyword tree, searches
# spec/data/news.txt with it, and prints one line per hit showing the first
# and last matched characters followed by the matched value.

here = File.dirname(__FILE__)
require here + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.from_file(here + "/../spec/data/en.words")
text = File.read(here + "/../spec/data/news.txt")

tree.search(text).each do |hit|
  # :ends_at is used as an exclusive bound here, hence the -1 for the last character.
  first_char = text[hit[:starts_at]].chr
  last_char  = text[hit[:ends_at] - 1].chr
  puts first_char + ".." + last_char + " => " + hit[:value]
end
|
14
|
+
|
data/examples/elev.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Adds four overlapping keywords to a keyword tree and prints every match
# found in a sample sentence: the match id, the matched value, and the first
# and last matched characters.

# The interpreter's encoding switch is the upper-case global $KCODE; a
# lower-case $kcode is just an ordinary, unused global. $KCODE only has an
# effect on Ruby 1.8 (later versions warn and ignore it), so set it there only.
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'

require File.dirname(__FILE__) + '/../ext/ahocorasick'

k = AhoCorasick::KeywordTree.new

k << "I've"
k << "data"
k << "base"
k << "database"

query = "I've moved my data to a database"

k.search(query).each do |r|
  # :ends_at is used as an exclusive bound here, hence the -1 for the last character.
  puts "-> [ " + r[:id].to_s + " ] " + r[:value] + " / " + query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr
end
|
19
|
+
|
data/examples/sample.c
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
//
|
2
|
+
// Getting started with Aho-Corasick from Strmat
|
3
|
+
//
|
4
|
+
// lasick Makefile:
|
5
|
+
//
|
6
|
+
// --
|
7
|
+
// ac.o :
|
8
|
+
// gcc -c -fPIC -shared ac.c
|
9
|
+
// libasick : ac.o
|
10
|
+
// gcc -shared -Wl,-soname,libasick.so -o libasick.so.1.0.1
|
11
|
+
// ar rcs libasick.a ac.o
|
12
|
+
// clean :
|
13
|
+
// rm -rf *.o *.a *.so* *.dylib*
|
14
|
+
// --
|
15
|
+
//
|
16
|
+
// Compile this stuff - asick is the library name, generated with the above Makefile :)
|
17
|
+
//
|
18
|
+
// gcc sample.c -o ac-sample -I../ext/ -L../ext/ -lasick
|
19
|
+
//
|
20
|
+
|
21
|
+
#include <string.h>
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <stdlib.h>
|
24
|
+
#include "ac.h"
|
25
|
+
|
26
|
+
int main(int argc, char *argv[]) {
|
27
|
+
char * search;
|
28
|
+
char * remain;
|
29
|
+
char * result;
|
30
|
+
char word[1024];
|
31
|
+
|
32
|
+
FILE *dictionary;
|
33
|
+
FILE *input;
|
34
|
+
int lgt, id, n, i;
|
35
|
+
|
36
|
+
AC_STRUCT * tree;
|
37
|
+
|
38
|
+
input= fopen(argv[1], "r");
|
39
|
+
|
40
|
+
if(input == NULL) {
|
41
|
+
search= argv[1];
|
42
|
+
} else {
|
43
|
+
long lSize;
|
44
|
+
fseek (input , 0 , SEEK_END);
|
45
|
+
lSize= ftell (input);
|
46
|
+
rewind(input);
|
47
|
+
search = (char*) malloc (sizeof(char)*lSize);
|
48
|
+
if (search == NULL) { fputs ("Error: Memory error",stderr); exit(-2); }
|
49
|
+
fread(search, 1, lSize-1, input);
|
50
|
+
}
|
51
|
+
|
52
|
+
dictionary= fopen("../spec/data/dictionary.txt", "r");
|
53
|
+
|
54
|
+
if(dictionary == NULL) {
|
55
|
+
printf("Error: can't open file.\n");
|
56
|
+
return -1;
|
57
|
+
}
|
58
|
+
|
59
|
+
tree= ac_alloc();
|
60
|
+
|
61
|
+
// start counting from 1
|
62
|
+
n= 1;
|
63
|
+
|
64
|
+
printf("==> building dictionary ...");
|
65
|
+
|
66
|
+
while(fgets(word, 1024, dictionary) != NULL) {
|
67
|
+
// strip \n
|
68
|
+
ac_add_string(tree, word, strlen(word)-1, n++);
|
69
|
+
}
|
70
|
+
|
71
|
+
printf("%d entries added.\n",n);
|
72
|
+
|
73
|
+
ac_prep(tree);
|
74
|
+
|
75
|
+
printf("==> input text [%d]:\n--\n%s\n--\n", strlen(search), search);
|
76
|
+
|
77
|
+
ac_search_init(tree, search, strlen(search) );
|
78
|
+
|
79
|
+
while((remain= ac_search(tree, &lgt, &id)) != NULL) {
|
80
|
+
printf("`%d'", remain[lgt+1]);
|
81
|
+
result = (char*) malloc (sizeof(char)*lgt);
|
82
|
+
sprintf( result, "%.*s", lgt, remain);
|
83
|
+
// result: should read first lgt chars from remain.
|
84
|
+
printf("==> result: lenght=> %d, id=> %d [%s]\n", lgt, id, result);
|
85
|
+
free(result);
|
86
|
+
}
|
87
|
+
|
88
|
+
ac_free(tree);
|
89
|
+
fclose(dictionary);
|
90
|
+
free(search);
|
91
|
+
|
92
|
+
return 0;
|
93
|
+
}
|
94
|
+
|
data/examples/test.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Manual smoke test for AhoCorasick::KeywordTree: adds strings with and
# without explicit ids, prints the tree size along the way, reports the
# RuntimeErrors raised by some add_string calls, and finally dumps the
# results of two searches.

require File.dirname(__FILE__) + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.new

# Calls add_string with the given arguments; a RuntimeError is reported
# on stdout instead of aborting the script.
try_add = lambda do |*args|
  begin
    tree.add_string(*args)
  rescue RuntimeError => err
    puts "[ok]==> got " + err.class.name + ": " + err.message
  end
end

puts tree.size
tree.add_string("foo")

puts tree.size
tree.add_string("bar", 1991)

puts tree.size
tree.add_string("fomz")

try_add.call("foo", -1)
try_add.call("foo", "bar")

tree.add_string("timisoara", 22)

puts tree.size
try_add.call("bucuresti", 22)   # id 22 was already used for "timisoara"

tree << "bacau"

['am fost la bacau', 'din foo in foo'].each do |text|
  tree.search(text).each do |match|
    puts match.inspect
  end
end
|
46
|
+
|
data/ext/ac.c
ADDED
@@ -0,0 +1,623 @@
|
|
1
|
+
/*
|
2
|
+
* ac.c
|
3
|
+
*
|
4
|
+
* Implementation of the Aho-Corasick algorithm.
|
5
|
+
*
|
6
|
+
* NOTES:
|
7
|
+
* 8/94 - Original Implementation (Sean Davis)
|
8
|
+
* 9/94 - Redid Implementation (James Knight)
|
9
|
+
* 3/96 - Modularized the code (James Knight)
|
10
|
+
* 7/96 - Finished the modularization (James Knight)
|
11
|
+
*/
|
12
|
+
|
13
|
+
#include <stdio.h>
|
14
|
+
#include <stdlib.h>
|
15
|
+
#include <string.h>
|
16
|
+
#include "ac.h"
|
17
|
+
|
18
|
+
/*
|
19
|
+
* ac_alloc
|
20
|
+
*
|
21
|
+
* Creates a new AC_STRUCT structure and initializes its fields.
|
22
|
+
*
|
23
|
+
* Parameters: none.
|
24
|
+
*
|
25
|
+
* Returns: A dynamically allocated AC_STRUCT structure.
|
26
|
+
*/
|
27
|
+
AC_STRUCT *ac_alloc(void)
|
28
|
+
{
|
29
|
+
AC_STRUCT *node;
|
30
|
+
|
31
|
+
if ((node = malloc(sizeof(AC_STRUCT))) == NULL)
|
32
|
+
return NULL;
|
33
|
+
memset(node, 0, sizeof(AC_STRUCT));
|
34
|
+
|
35
|
+
if ((node->tree = malloc(sizeof(ACTREE_NODE))) == NULL) {
|
36
|
+
free(node);
|
37
|
+
return NULL;
|
38
|
+
}
|
39
|
+
memset(node->tree, 0, sizeof(ACTREE_NODE));
|
40
|
+
|
41
|
+
return node;
|
42
|
+
}
|
43
|
+
|
44
|
+
|
45
|
+
/*
|
46
|
+
* ac_add_string
|
47
|
+
*
|
48
|
+
* Adds a string to the AC_STRUCT structure's keyword tree.
|
49
|
+
*
|
50
|
+
* NOTE: The `id' value given must be unique to any of the strings
|
51
|
+
* added to the tree, and must be a small integer greater than
|
52
|
+
* 0 (since it is used to index an array holding information
|
53
|
+
* about each of the strings).
|
54
|
+
*
|
55
|
+
* The best id's to use are to number the strings from 1 to K.
|
56
|
+
*
|
57
|
+
* Parameters: node - an AC_STRUCT structure
|
58
|
+
* P - the sequence
|
59
|
+
* M - the sequence length
|
60
|
+
* id - the sequence identifier
|
61
|
+
*
|
62
|
+
* Returns: non-zero on success, zero on error.
|
63
|
+
*/
|
64
|
+
int ac_add_string(AC_STRUCT *node, char *P, int M, int id)
|
65
|
+
{
|
66
|
+
int i, j, newsize;
|
67
|
+
AC_TREE tnode, child, back, newnode, list, tail;
|
68
|
+
|
69
|
+
/*
|
70
|
+
* Return a zero if a previous error had occurred, or if the
|
71
|
+
* given id equals zero. An id value of zero is used by the
|
72
|
+
* algorithm to signal that no pattern ends at a node in the
|
73
|
+
* keyword tree. So, it can't be used as a pattern's id.
|
74
|
+
*/
|
75
|
+
if (node->errorflag || id == 0)
|
76
|
+
return 0;
|
77
|
+
|
78
|
+
P--; /* Shift to make sequence be P[1],...,P[M] */
|
79
|
+
|
80
|
+
/*
|
81
|
+
* Allocate space for the new string's information.
|
82
|
+
*/
|
83
|
+
if (node->Psize <= id) {
|
84
|
+
if (node->Psize == 0) {
|
85
|
+
newsize = (id >= 16 ? id + 1 : 16);
|
86
|
+
node->Plengths = malloc(newsize * sizeof(int));
|
87
|
+
}
|
88
|
+
else {
|
89
|
+
newsize = node->Psize + id + 1;
|
90
|
+
node->Plengths = realloc(node->Plengths, newsize * sizeof(int));
|
91
|
+
}
|
92
|
+
if (node->Plengths == NULL) {
|
93
|
+
node->errorflag = 1;
|
94
|
+
return 0;
|
95
|
+
}
|
96
|
+
|
97
|
+
for (i=node->Psize; i < newsize; i++)
|
98
|
+
node->Plengths[i] = 0;
|
99
|
+
node->Psize = newsize;
|
100
|
+
}
|
101
|
+
|
102
|
+
// duplicate id
|
103
|
+
if (node->Plengths[id] != 0)
|
104
|
+
return 0;
|
105
|
+
|
106
|
+
/*
|
107
|
+
* Add the string to the keyword tree.
|
108
|
+
*/
|
109
|
+
tnode = node->tree;
|
110
|
+
for (i=1; i <= M; i++) {
|
111
|
+
/*
|
112
|
+
* Find the child whose character is P[i].
|
113
|
+
*/
|
114
|
+
back = NULL;
|
115
|
+
child = tnode->children;
|
116
|
+
while (child != NULL && child->ch < P[i]) {
|
117
|
+
back = child;
|
118
|
+
child = child->sibling;
|
119
|
+
}
|
120
|
+
|
121
|
+
if (child == NULL || child->ch != P[i])
|
122
|
+
break;
|
123
|
+
|
124
|
+
tnode = child;
|
125
|
+
|
126
|
+
#ifdef STATS
|
127
|
+
node->prep_old_edges++;
|
128
|
+
#endif
|
129
|
+
|
130
|
+
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* If only part of the pattern exists in the tree, add the
|
134
|
+
* rest of the pattern to the tree.
|
135
|
+
*/
|
136
|
+
if (i <= M) {
|
137
|
+
list = tail = NULL;
|
138
|
+
for (j=i; j <= M; j++) {
|
139
|
+
if ((newnode = malloc(sizeof(ACTREE_NODE))) == NULL)
|
140
|
+
break;
|
141
|
+
memset(newnode, 0, sizeof(ACTREE_NODE));
|
142
|
+
newnode->ch = P[j];
|
143
|
+
|
144
|
+
if (list == NULL)
|
145
|
+
list = tail = newnode;
|
146
|
+
else
|
147
|
+
tail = tail->children = newnode;
|
148
|
+
|
149
|
+
#ifdef STATS
|
150
|
+
node->prep_new_edges++;
|
151
|
+
#endif
|
152
|
+
|
153
|
+
}
|
154
|
+
if (j <= M) {
|
155
|
+
while (list != NULL) {
|
156
|
+
tail = list->children;
|
157
|
+
free(list);
|
158
|
+
list = tail;
|
159
|
+
}
|
160
|
+
return 0;
|
161
|
+
}
|
162
|
+
|
163
|
+
list->sibling = child;
|
164
|
+
if (back == NULL)
|
165
|
+
tnode->children = list;
|
166
|
+
else
|
167
|
+
back->sibling = list;
|
168
|
+
|
169
|
+
tnode = tail;
|
170
|
+
}
|
171
|
+
|
172
|
+
tnode->matchid = id;
|
173
|
+
node->Plengths[id] = M;
|
174
|
+
node->ispreprocessed = 0;
|
175
|
+
|
176
|
+
return 1;
|
177
|
+
}
|
178
|
+
|
179
|
+
|
180
|
+
/*
|
181
|
+
* ac_del_string
|
182
|
+
*
|
183
|
+
* Deletes a string from the keyword tree.
|
184
|
+
*
|
185
|
+
* Parameters: node - an AC_STRUCT structure
|
186
|
+
* P - the sequence to be deleted
|
187
|
+
* M - its length
|
188
|
+
* id - its identifier
|
189
|
+
*
|
190
|
+
* Returns: non-zero on success, zero on error.
|
191
|
+
*/
|
192
|
+
int ac_del_string(AC_STRUCT *node, char *P, int M, int id)
|
193
|
+
{
|
194
|
+
int i, flag;
|
195
|
+
AC_TREE tnode, tlast, tback, child, back;
|
196
|
+
|
197
|
+
if (node->errorflag || id > node->Psize || node->Plengths[id] == 0)
|
198
|
+
return 0;
|
199
|
+
|
200
|
+
P--; /* Shift to make sequence be P[1],...,P[M] */
|
201
|
+
|
202
|
+
/*
|
203
|
+
* Scan the tree for the path corresponding to the keyword to be deleted.
|
204
|
+
*/
|
205
|
+
flag = 1;
|
206
|
+
tlast = tnode = node->tree;
|
207
|
+
tback = NULL;
|
208
|
+
|
209
|
+
for (i=1; i <= M; i++) {
|
210
|
+
/*
|
211
|
+
* Find the child matching P[i]. It must be there.
|
212
|
+
*/
|
213
|
+
child = tnode->children;
|
214
|
+
back = NULL;
|
215
|
+
while (child != NULL && child->ch != P[i]) {
|
216
|
+
back = child;
|
217
|
+
child = child->sibling;
|
218
|
+
}
|
219
|
+
|
220
|
+
if (child == NULL) {
|
221
|
+
fprintf(stderr, "Error in Aho-Corasick preprocessing. String to be "
|
222
|
+
"deleted is not in tree.\n");
|
223
|
+
return 0;
|
224
|
+
}
|
225
|
+
|
226
|
+
/*
|
227
|
+
* Try to find the point where the pattern to be deleted branches off
|
228
|
+
* from the paths of the other patterns in the tree. This point must
|
229
|
+
* be at the latest node which satisfies one of these two conditions:
|
230
|
+
*
|
231
|
+
* 1) Another pattern ends at that node (and so
|
232
|
+
* `child->matchid != 0'). In this case, the branch point is
|
233
|
+
* just below this node and so the children of this node
|
234
|
+
* should be removed.
|
235
|
+
* 2) A node has other siblings. In this case, the node itself
|
236
|
+
* is the branch point, and it and its children should be
|
237
|
+
* removed.
|
238
|
+
*/
|
239
|
+
if (i < M && child->matchid != 0) {
|
240
|
+
flag = 1;
|
241
|
+
tlast = child;
|
242
|
+
}
|
243
|
+
else if (back != NULL || child->sibling != NULL) {
|
244
|
+
flag = 2;
|
245
|
+
tlast = child;
|
246
|
+
tback = (back == NULL ? tnode : back);
|
247
|
+
}
|
248
|
+
|
249
|
+
tnode = child;
|
250
|
+
}
|
251
|
+
|
252
|
+
/*
|
253
|
+
* If the node corresponding to the end of the keyword has children,
|
254
|
+
* then the tree should not be altered, except to remove the keyword's
|
255
|
+
* identifier from the tree.
|
256
|
+
*
|
257
|
+
* Otherwise, apply the appropriate removal, as described above.
|
258
|
+
*/
|
259
|
+
if (tnode->children != NULL) {
|
260
|
+
tnode->matchid = 0;
|
261
|
+
}
|
262
|
+
else {
|
263
|
+
if (flag == 1) {
|
264
|
+
child = tlast->children;
|
265
|
+
tlast->children = NULL;
|
266
|
+
tlast = child;
|
267
|
+
}
|
268
|
+
else {
|
269
|
+
if (tback->children == tlast)
|
270
|
+
tback->children = tlast->sibling;
|
271
|
+
else
|
272
|
+
tback->sibling = tlast->sibling;
|
273
|
+
}
|
274
|
+
|
275
|
+
while (tlast != NULL) {
|
276
|
+
child = tlast->children;
|
277
|
+
free(tlast);
|
278
|
+
tlast = child;
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
node->Plengths[id] = 0;
|
283
|
+
node->ispreprocessed = 0;
|
284
|
+
|
285
|
+
return 1;
|
286
|
+
}
|
287
|
+
|
288
|
+
|
289
|
+
/*
|
290
|
+
* ac_prep
|
291
|
+
*
|
292
|
+
* Compute the failure and output links for the keyword tree.
|
293
|
+
*
|
294
|
+
* Parameters: node - an AC_STRUCT structure
|
295
|
+
*
|
296
|
+
* Returns: non-zero on success, zero on error.
|
297
|
+
*/
|
298
|
+
int ac_prep(AC_STRUCT *node)
|
299
|
+
{
|
300
|
+
char x;
|
301
|
+
AC_TREE v, vprime, w, wprime, root, front, back, child;
|
302
|
+
|
303
|
+
if (node->errorflag)
|
304
|
+
return 0;
|
305
|
+
|
306
|
+
/*
|
307
|
+
* The failure link and output link computation requires a breadth-first
|
308
|
+
* traversal of the keyword tree. And, to do that, we need a queue of
|
309
|
+
* the nodes yet to be processed.
|
310
|
+
*
|
311
|
+
* The `faillink' fields will be used as the pointers for the queue
|
312
|
+
* of nodes to be computed (since the failure link is only set after
|
313
|
+
* the node is removed from the queue).
|
314
|
+
*
|
315
|
+
* The `outlink' fields will be used as the pointers to a node's parent
|
316
|
+
* for nodes in the queue (since the output link is also only set after
|
317
|
+
* the node is removed from the queue).
|
318
|
+
*/
|
319
|
+
root = node->tree;
|
320
|
+
|
321
|
+
front = back = root;
|
322
|
+
front->faillink = NULL;
|
323
|
+
front->outlink = NULL;
|
324
|
+
|
325
|
+
while (front != NULL) {
|
326
|
+
v = front;
|
327
|
+
x = v->ch;
|
328
|
+
vprime = v->outlink;
|
329
|
+
|
330
|
+
/*
|
331
|
+
* Add the node's children to the queue.
|
332
|
+
*/
|
333
|
+
for (child=v->children; child != NULL; child=child->sibling) {
|
334
|
+
child->outlink = v;
|
335
|
+
back->faillink = child;
|
336
|
+
back = child;
|
337
|
+
}
|
338
|
+
back->faillink = NULL;
|
339
|
+
|
340
|
+
front = front->faillink;
|
341
|
+
v->faillink = v->outlink = NULL;
|
342
|
+
|
343
|
+
/*
|
344
|
+
* Set the failure and output links.
|
345
|
+
*/
|
346
|
+
if (v == root)
|
347
|
+
;
|
348
|
+
else if (vprime == root)
|
349
|
+
v->faillink = root;
|
350
|
+
else {
|
351
|
+
/*
|
352
|
+
* Find the find link in the failure link chain which has a child
|
353
|
+
* labeled with x.
|
354
|
+
*/
|
355
|
+
wprime = NULL;
|
356
|
+
w = vprime->faillink;
|
357
|
+
|
358
|
+
while (1) {
|
359
|
+
wprime = w->children;
|
360
|
+
while (wprime != NULL && wprime->ch < x)
|
361
|
+
wprime = wprime->sibling;
|
362
|
+
|
363
|
+
if ((wprime != NULL && wprime->ch == x) || w == root)
|
364
|
+
break;
|
365
|
+
|
366
|
+
w = w->faillink;
|
367
|
+
|
368
|
+
#ifdef STATS
|
369
|
+
node->prep_fail_compares++;
|
370
|
+
#endif
|
371
|
+
}
|
372
|
+
#ifdef STATS
|
373
|
+
node->prep_fail_compares++;
|
374
|
+
#endif
|
375
|
+
|
376
|
+
if (wprime != NULL && wprime->ch == x)
|
377
|
+
v->faillink = wprime;
|
378
|
+
else
|
379
|
+
v->faillink = root;
|
380
|
+
|
381
|
+
if (v->matchid != 0) {
|
382
|
+
if (v->faillink->matchid != 0)
|
383
|
+
v->outlink = v->faillink;
|
384
|
+
else
|
385
|
+
v->outlink = v->faillink->outlink;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
}
|
389
|
+
|
390
|
+
node->ispreprocessed = 1;
|
391
|
+
node->initflag = 0;
|
392
|
+
|
393
|
+
return 1;
|
394
|
+
}
|
395
|
+
|
396
|
+
|
397
|
+
/*
|
398
|
+
* ac_search_init
|
399
|
+
*
|
400
|
+
* Initializes the variables used during an Aho-Corasick search.
|
401
|
+
* See ac_search for an example of how it should be used.
|
402
|
+
*
|
403
|
+
* Parameters: node - an AC_STRUCT structure
|
404
|
+
* T - the sequence to be searched
|
405
|
+
* N - the length of the sequence
|
406
|
+
*
|
407
|
+
* Returns: nothing.
|
408
|
+
*/
|
409
|
+
void ac_search_init(AC_STRUCT *node, char *T, int N)
|
410
|
+
{
|
411
|
+
if (node->errorflag)
|
412
|
+
return;
|
413
|
+
else if (!node->ispreprocessed) {
|
414
|
+
fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
|
415
|
+
"has not been completed.\n");
|
416
|
+
return;
|
417
|
+
}
|
418
|
+
|
419
|
+
node->T = T - 1; /* Shift to make sequence be T[1],...,T[N] */
|
420
|
+
node->N = N;
|
421
|
+
node->c = 1;
|
422
|
+
node->w = node->tree;
|
423
|
+
node->output = NULL;
|
424
|
+
node->initflag = 1;
|
425
|
+
node->endflag = 0;
|
426
|
+
}
|
427
|
+
|
428
|
+
|
429
|
+
/*
|
430
|
+
* ac_search
|
431
|
+
*
|
432
|
+
* Scans a text to look for the next occurrence of one of the patterns
|
433
|
+
* in the text. An example of how this search should be used is the
|
434
|
+
* following:
|
435
|
+
*
|
436
|
+
* s = T;
|
437
|
+
* len = N;
|
438
|
+
* contflag = 0;
|
439
|
+
* ac_search_init(node, T, N);
|
440
|
+
* while ((s = ac_search(node, &matchlen, &matchid) != NULL) {
|
441
|
+
* >>> Pattern `matchid' matched from `s' to `s + matchlen - 1'. <<<
|
442
|
+
* }
|
443
|
+
*
|
444
|
+
* where `node', `T' and `N' are assumed to be initialized appropriately.
|
445
|
+
*
|
446
|
+
* Parameters: node - a preprocessed AC_STRUCT structure
|
447
|
+
* length_out - where to store the new match's length
|
448
|
+
* id_out - where to store the identifier of the
|
449
|
+
* pattern that matched
|
450
|
+
* ends_at - where to store the n-th matched char
|
451
|
+
*
|
452
|
+
* Returns: the left end of the text that matches a pattern, or NULL
|
453
|
+
* if no match occurs. (It also stores values in `*length_out',
|
454
|
+
* and `*id_out' giving the match's length and pattern identifier.
|
455
|
+
*/
|
456
|
+
char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at)
|
457
|
+
{
|
458
|
+
int c, N, id;
|
459
|
+
char *T;
|
460
|
+
AC_TREE w, wprime, root;
|
461
|
+
|
462
|
+
if (node->errorflag)
|
463
|
+
return NULL;
|
464
|
+
else if (!node->ispreprocessed) {
|
465
|
+
fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
|
466
|
+
"has not been completed.\n");
|
467
|
+
return NULL;
|
468
|
+
}
|
469
|
+
else if (!node->initflag) {
|
470
|
+
fprintf(stderr, "Error in Aho-Corasick search. ac_search_init was not "
|
471
|
+
"called.\n");
|
472
|
+
return NULL;
|
473
|
+
}
|
474
|
+
else if (node->endflag)
|
475
|
+
return NULL;
|
476
|
+
|
477
|
+
T = node->T;
|
478
|
+
N = node->N;
|
479
|
+
c = node->c;
|
480
|
+
w = node->w;
|
481
|
+
root = node->tree;
|
482
|
+
|
483
|
+
/*
|
484
|
+
* If the last call to ac_search returned a match, check for another
|
485
|
+
* match ending at the same right endpoint (denoted by a non-NULL
|
486
|
+
* output link).
|
487
|
+
*/
|
488
|
+
if (node->output != NULL) {
|
489
|
+
node->output = node->output->outlink;
|
490
|
+
|
491
|
+
#ifdef STATS
|
492
|
+
node->outlinks_traversed++;
|
493
|
+
#endif
|
494
|
+
|
495
|
+
if (node->output != NULL) {
|
496
|
+
id = node->output->matchid;
|
497
|
+
if (id_out)
|
498
|
+
*id_out = id;
|
499
|
+
if (length_out)
|
500
|
+
*length_out = node->Plengths[id];
|
501
|
+
if (ends_at)
|
502
|
+
*ends_at= c;
|
503
|
+
return &T[c] - node->Plengths[id];
|
504
|
+
}
|
505
|
+
|
506
|
+
}
|
507
|
+
|
508
|
+
/*
|
509
|
+
* Run the search algorithm, stopping at the first position where a
|
510
|
+
* match to one of the patterns occurs.
|
511
|
+
*/
|
512
|
+
while (c <= N) {
|
513
|
+
/*
|
514
|
+
* Try to match the next input character to a child in the tree.
|
515
|
+
*/
|
516
|
+
wprime = w->children;
|
517
|
+
while (wprime != NULL && wprime->ch != T[c])
|
518
|
+
wprime = wprime->sibling;
|
519
|
+
|
520
|
+
#ifdef STATS
|
521
|
+
node->num_compares++;
|
522
|
+
#endif
|
523
|
+
|
524
|
+
/*
|
525
|
+
* If the match fails, then either use the failure link (if not
|
526
|
+
* at the root), or move to the next character since no prefix
|
527
|
+
* of any pattern ends with character T[c].
|
528
|
+
*/
|
529
|
+
if (wprime == NULL) {
|
530
|
+
if (w == root)
|
531
|
+
c++;
|
532
|
+
else {
|
533
|
+
w = w->faillink;
|
534
|
+
|
535
|
+
#ifdef STATS
|
536
|
+
node->num_failures++;
|
537
|
+
#endif
|
538
|
+
|
539
|
+
}
|
540
|
+
}
|
541
|
+
else {
|
542
|
+
/*
|
543
|
+
* If we could match the input, move down the tree and to the
|
544
|
+
* next input character, and see if that match completes the
|
545
|
+
* match to a pattern (when matchid != 0 or outlink != NULL).
|
546
|
+
*/
|
547
|
+
c++;
|
548
|
+
w = wprime;
|
549
|
+
|
550
|
+
#ifdef STATS
|
551
|
+
node->edges_traversed++;
|
552
|
+
#endif
|
553
|
+
|
554
|
+
if (w->matchid != 0)
|
555
|
+
node->output = w;
|
556
|
+
else if (w->outlink != NULL) {
|
557
|
+
node->output = w->outlink;
|
558
|
+
|
559
|
+
#ifdef STATS
|
560
|
+
node->outlinks_traversed++;
|
561
|
+
#endif
|
562
|
+
|
563
|
+
}
|
564
|
+
|
565
|
+
if (node->output != NULL) {
|
566
|
+
id = node->output->matchid;
|
567
|
+
if (id_out)
|
568
|
+
*id_out = id;
|
569
|
+
if (length_out)
|
570
|
+
*length_out= node->Plengths[id];
|
571
|
+
if(ends_at)
|
572
|
+
*ends_at= c;
|
573
|
+
|
574
|
+
node->w = w;
|
575
|
+
node->c = c; // ends_at - length_out;
|
576
|
+
|
577
|
+
return &T[c] - node->Plengths[id];
|
578
|
+
}
|
579
|
+
}
|
580
|
+
}
|
581
|
+
|
582
|
+
node->c = c;
|
583
|
+
node->endflag = 1;
|
584
|
+
|
585
|
+
return NULL;
|
586
|
+
}
|
587
|
+
|
588
|
+
|
589
|
+
/*
|
590
|
+
* ac_free
|
591
|
+
*
|
592
|
+
* Free up the allocated AC_STRUCT structure.
|
593
|
+
*
|
594
|
+
* Parameters: node - a AC_STRUCT structure
|
595
|
+
*
|
596
|
+
* Returns: nothing.
|
597
|
+
*/
|
598
|
+
void ac_free(AC_STRUCT *node)
|
599
|
+
{
|
600
|
+
AC_TREE front, back, next;
|
601
|
+
|
602
|
+
if (node == NULL)
|
603
|
+
return;
|
604
|
+
|
605
|
+
if (node->tree != NULL) {
|
606
|
+
front = back = node->tree;
|
607
|
+
while (front != NULL) {
|
608
|
+
back->sibling = front->children;
|
609
|
+
while (back->sibling != NULL)
|
610
|
+
back = back->sibling;
|
611
|
+
|
612
|
+
next = front->sibling;
|
613
|
+
free(front);
|
614
|
+
front = next;
|
615
|
+
}
|
616
|
+
}
|
617
|
+
|
618
|
+
if (node->Plengths != NULL)
|
619
|
+
free(node->Plengths);
|
620
|
+
|
621
|
+
free(node);
|
622
|
+
}
|
623
|
+
|