aurelian-ruby-ahocorasick 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2008 Aurelian Oancea
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.textile ADDED
@@ -0,0 +1,36 @@
1
+ h3. Introduction
2
+
3
+ This library is a Ruby extension: a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C found in the "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
4
+
5
+ The source code (ac.c and ac.h) was adapted from Strmat. In fact, I changed only 3-4 lines of the original implementation to fit my needs: search needed to return the current position in the searched string.
6
+
7
+ h3. What's the idea?
8
+
9
+ Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
10
+
11
+ h1. TBD
12
+
13
+ <pre>
14
+ [aurelian@stalingrad ext]$ time ./dict.rb
15
+ 110196
16
+ 711
17
+
18
+ real 0m0.538s
19
+ user 0m0.435s
20
+ sys 0m0.036s
21
+ </pre>
22
+
23
+ h3. Additional Reading / Implementations
24
+
25
+ Other suffix tree / keyword search implementations:
26
+
27
+ * "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
28
+ * Pytst / Ruby-Pytst
29
+ * "Aho-Corasick extension":http://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/
30
+ * "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
31
+ * "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
32
+
33
+ --
34
+
35
+ (c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
36
+ released under the MIT license (see MIT-LICENSE)
data/examples/dict.rb ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Builds a KeywordTree from spec/data/en.words (via KeywordTree.from_file),
# searches spec/data/news.txt with it, and prints one line per result: the
# first and last character of the matched span followed by the result's
# :value.

require File.dirname(__FILE__) + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
text = File.read(File.dirname(__FILE__) + "/../spec/data/news.txt")

hits = tree.search(text)

hits.each do |hit|
  # :ends_at is used as an exclusive offset here, hence the "- 1".
  first_char = text[hit[:starts_at]].chr
  last_char  = text[hit[:ends_at] - 1].chr
  puts first_char + ".." + last_char + " => " + hit[:value]
end
14
+
data/examples/elev.rb ADDED
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby

# Adds four keywords (two of them contained in a third) to a KeywordTree
# and prints every result found in a sample sentence: the result's :id and
# :value, plus the first and last character of the matched span.

# The Ruby 1.8 encoding switch is the upper-case global $KCODE; the
# lower-case `$kcode' is an ordinary, unused global.  Ruby 1.9+ ignores
# $KCODE and warns on assignment, so only set it on 1.8.
$KCODE = 'UTF-8' if RUBY_VERSION < '1.9'

require File.dirname(__FILE__) + '/../ext/ahocorasick'

k = AhoCorasick::KeywordTree.new

k << "I've"
k << "data"
k << "base"
k << "database"

query = "I've moved my data to a database"

k.search(query).each do |r|
  # :ends_at is used as an exclusive offset here, hence the "- 1".
  puts "-> [ " + r[:id].to_s + " ] " + r[:value] + " / " + query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr
end
19
+
data/examples/sample.c ADDED
@@ -0,0 +1,94 @@
1
+ //
2
+ // Getting started with Aho-Corasick from Strmat
3
+ //
4
+ // lasick Makefile:
5
+ //
6
+ // --
7
+ // ac.o :
8
+ // gcc -c -fPIC -shared ac.c
9
+ // libasick : ac.o
10
+ // gcc -shared -Wl,-soname,libasick.so -o libasick.so.1.0.1 ac.o
11
+ // ar rcs libasick.a ac.o
12
+ // clean :
13
+ // rm -rf *.o *.a *.so* *.dylib*
14
+ // --
15
+ //
16
+ // Compile this stuff - asick is the library name, generated with the above Makefile :)
17
+ //
18
+ // gcc sample.c -o ac-sample -I../ext/ -L../ext/ -lasick
19
+ //
20
+
21
+ #include <string.h>
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include "ac.h"
25
+
26
+ int main(int argc, char *argv[]) {
27
+ char * search;
28
+ char * remain;
29
+ char * result;
30
+ char word[1024];
31
+
32
+ FILE *dictionary;
33
+ FILE *input;
34
+ int lgt, id, n, i;
35
+
36
+ AC_STRUCT * tree;
37
+
38
+ input= fopen(argv[1], "r");
39
+
40
+ if(input == NULL) {
41
+ search= argv[1];
42
+ } else {
43
+ long lSize;
44
+ fseek (input , 0 , SEEK_END);
45
+ lSize= ftell (input);
46
+ rewind(input);
47
+ search = (char*) malloc (sizeof(char)*lSize);
48
+ if (search == NULL) { fputs ("Error: Memory error",stderr); exit(-2); }
49
+ fread(search, 1, lSize-1, input);
50
+ }
51
+
52
+ dictionary= fopen("../spec/data/dictionary.txt", "r");
53
+
54
+ if(dictionary == NULL) {
55
+ printf("Error: can't open file.\n");
56
+ return -1;
57
+ }
58
+
59
+ tree= ac_alloc();
60
+
61
+ // start counting from 1
62
+ n= 1;
63
+
64
+ printf("==> building dictionary ...");
65
+
66
+ while(fgets(word, 1024, dictionary) != NULL) {
67
+ // strip \n
68
+ ac_add_string(tree, word, strlen(word)-1, n++);
69
+ }
70
+
71
+ printf("%d entries added.\n",n);
72
+
73
+ ac_prep(tree);
74
+
75
+ printf("==> input text [%d]:\n--\n%s\n--\n", strlen(search), search);
76
+
77
+ ac_search_init(tree, search, strlen(search) );
78
+
79
+ while((remain= ac_search(tree, &lgt, &id)) != NULL) {
80
+ printf("`%d'", remain[lgt+1]);
81
+ result = (char*) malloc (sizeof(char)*lgt);
82
+ sprintf( result, "%.*s", lgt, remain);
83
+ // result: should read first lgt chars from remain.
84
+ printf("==> result: lenght=> %d, id=> %d [%s]\n", lgt, id, result);
85
+ free(result);
86
+ }
87
+
88
+ ac_free(tree);
89
+ fclose(dictionary);
90
+ free(search);
91
+
92
+ return 0;
93
+ }
94
+
data/examples/test.rb ADDED
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# Manual smoke test for KeywordTree: prints the tree size while strings are
# added, reports any RuntimeError raised by three add_string calls, then
# prints the results of two searches.

require File.dirname(__FILE__) + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.new

puts tree.size
tree.add_string("foo")

puts tree.size
tree.add_string("bar", 1991)

puts tree.size
tree.add_string("fomz")

# A negative id, then a non-numeric id; a RuntimeError is reported as "[ok]".
[["foo", -1], ["foo", "bar"]].each do |args|
  begin
    tree.add_string(*args)
  rescue RuntimeError => err
    puts "[ok]==> got " + err.class.name + ": " + err.message
  end
end

tree.add_string("timisoara", 22)

puts tree.size
# Re-uses id 22, which was just given to "timisoara".
begin
  tree.add_string("bucuresti", 22)
rescue RuntimeError => err
  puts "[ok]==> got " + err.class.name + ": " + err.message
end

tree << "bacau"

['am fost la bacau', 'din foo in foo'].each do |text|
  tree.search(text).each do |hit|
    puts hit.inspect
  end
end
46
+
data/ext/ac.c ADDED
@@ -0,0 +1,623 @@
1
+ /*
2
+ * ac.c
3
+ *
4
+ * Implementation of the Aho-Corasick algorithm.
5
+ *
6
+ * NOTES:
7
+ * 8/94 - Original Implementation (Sean Davis)
8
+ * 9/94 - Redid Implementation (James Knight)
9
+ * 3/96 - Modularized the code (James Knight)
10
+ * 7/96 - Finished the modularization (James Knight)
11
+ */
12
+
13
+ #include <stdio.h>
14
+ #include <stdlib.h>
15
+ #include <string.h>
16
+ #include "ac.h"
17
+
18
+ /*
19
+ * ac_alloc
20
+ *
21
+ * Creates a new AC_STRUCT structure and initializes its fields.
22
+ *
23
+ * Parameters: none.
24
+ *
25
+ * Returns: A dynamically allocated AC_STRUCT structure.
26
+ */
27
+ AC_STRUCT *ac_alloc(void)
28
+ {
29
+ AC_STRUCT *node;
30
+
31
+ if ((node = malloc(sizeof(AC_STRUCT))) == NULL)
32
+ return NULL;
33
+ memset(node, 0, sizeof(AC_STRUCT));
34
+
35
+ if ((node->tree = malloc(sizeof(ACTREE_NODE))) == NULL) {
36
+ free(node);
37
+ return NULL;
38
+ }
39
+ memset(node->tree, 0, sizeof(ACTREE_NODE));
40
+
41
+ return node;
42
+ }
43
+
44
+
45
+ /*
46
+ * ac_add_string
47
+ *
48
+ * Adds a string to the AC_STRUCT structure's keyword tree.
49
+ *
50
+ * NOTE: The `id' value given must be unique to any of the strings
51
+ * added to the tree, and must be a small integer greater than
52
+ * 0 (since it is used to index an array holding information
53
+ * about each of the strings).
54
+ *
55
+ * The best id's to use are to number the strings from 1 to K.
56
+ *
57
+ * Parameters: node - an AC_STRUCT structure
58
+ * P - the sequence
59
+ * M - the sequence length
60
+ * id - the sequence identifier
61
+ *
62
+ * Returns: non-zero on success, zero on error.
63
+ */
64
+ int ac_add_string(AC_STRUCT *node, char *P, int M, int id)
65
+ {
66
+ int i, j, newsize;
67
+ AC_TREE tnode, child, back, newnode, list, tail;
68
+
69
+ /*
70
+ * Return a zero if a previous error had occurred, or if the
71
+ * given id equals zero. An id value of zero is used by the
72
+ * algorithm to signal that no pattern ends at a node in the
73
+ * keyword tree. So, it can't be used as a pattern's id.
74
+ */
75
+ if (node->errorflag || id == 0)
76
+ return 0;
77
+
78
+ P--; /* Shift to make sequence be P[1],...,P[M] */
79
+
80
+ /*
81
+ * Allocate space for the new string's information.
82
+ */
83
+ if (node->Psize <= id) {
84
+ if (node->Psize == 0) {
85
+ newsize = (id >= 16 ? id + 1 : 16);
86
+ node->Plengths = malloc(newsize * sizeof(int));
87
+ }
88
+ else {
89
+ newsize = node->Psize + id + 1;
90
+ node->Plengths = realloc(node->Plengths, newsize * sizeof(int));
91
+ }
92
+ if (node->Plengths == NULL) {
93
+ node->errorflag = 1;
94
+ return 0;
95
+ }
96
+
97
+ for (i=node->Psize; i < newsize; i++)
98
+ node->Plengths[i] = 0;
99
+ node->Psize = newsize;
100
+ }
101
+
102
+ // duplicate id
103
+ if (node->Plengths[id] != 0)
104
+ return 0;
105
+
106
+ /*
107
+ * Add the string to the keyword tree.
108
+ */
109
+ tnode = node->tree;
110
+ for (i=1; i <= M; i++) {
111
+ /*
112
+ * Find the child whose character is P[i].
113
+ */
114
+ back = NULL;
115
+ child = tnode->children;
116
+ while (child != NULL && child->ch < P[i]) {
117
+ back = child;
118
+ child = child->sibling;
119
+ }
120
+
121
+ if (child == NULL || child->ch != P[i])
122
+ break;
123
+
124
+ tnode = child;
125
+
126
+ #ifdef STATS
127
+ node->prep_old_edges++;
128
+ #endif
129
+
130
+ }
131
+
132
+ /*
133
+ * If only part of the pattern exists in the tree, add the
134
+ * rest of the pattern to the tree.
135
+ */
136
+ if (i <= M) {
137
+ list = tail = NULL;
138
+ for (j=i; j <= M; j++) {
139
+ if ((newnode = malloc(sizeof(ACTREE_NODE))) == NULL)
140
+ break;
141
+ memset(newnode, 0, sizeof(ACTREE_NODE));
142
+ newnode->ch = P[j];
143
+
144
+ if (list == NULL)
145
+ list = tail = newnode;
146
+ else
147
+ tail = tail->children = newnode;
148
+
149
+ #ifdef STATS
150
+ node->prep_new_edges++;
151
+ #endif
152
+
153
+ }
154
+ if (j <= M) {
155
+ while (list != NULL) {
156
+ tail = list->children;
157
+ free(list);
158
+ list = tail;
159
+ }
160
+ return 0;
161
+ }
162
+
163
+ list->sibling = child;
164
+ if (back == NULL)
165
+ tnode->children = list;
166
+ else
167
+ back->sibling = list;
168
+
169
+ tnode = tail;
170
+ }
171
+
172
+ tnode->matchid = id;
173
+ node->Plengths[id] = M;
174
+ node->ispreprocessed = 0;
175
+
176
+ return 1;
177
+ }
178
+
179
+
180
+ /*
181
+ * ac_del_string
182
+ *
183
+ * Deletes a string from the keyword tree.
184
+ *
185
+ * Parameters: node - an AC_STRUCT structure
186
+ * P - the sequence to be deleted
187
+ * M - its length
188
+ * id - its identifier
189
+ *
190
+ * Returns: non-zero on success, zero on error.
191
+ */
192
+ int ac_del_string(AC_STRUCT *node, char *P, int M, int id)
193
+ {
194
+ int i, flag;
195
+ AC_TREE tnode, tlast, tback, child, back;
196
+
197
+ if (node->errorflag || id > node->Psize || node->Plengths[id] == 0)
198
+ return 0;
199
+
200
+ P--; /* Shift to make sequence be P[1],...,P[M] */
201
+
202
+ /*
203
+ * Scan the tree for the path corresponding to the keyword to be deleted.
204
+ */
205
+ flag = 1;
206
+ tlast = tnode = node->tree;
207
+ tback = NULL;
208
+
209
+ for (i=1; i <= M; i++) {
210
+ /*
211
+ * Find the child matching P[i]. It must be there.
212
+ */
213
+ child = tnode->children;
214
+ back = NULL;
215
+ while (child != NULL && child->ch != P[i]) {
216
+ back = child;
217
+ child = child->sibling;
218
+ }
219
+
220
+ if (child == NULL) {
221
+ fprintf(stderr, "Error in Aho-Corasick preprocessing. String to be "
222
+ "deleted is not in tree.\n");
223
+ return 0;
224
+ }
225
+
226
+ /*
227
+ * Try to find the point where the pattern to be deleted branches off
228
+ * from the paths of the other patterns in the tree. This point must
229
+ * be at the latest node which satisfies one of these two conditions:
230
+ *
231
+ * 1) Another pattern ends at that node (and so
232
+ * `child->matchid != 0'). In this case, the branch point is
233
+ * just below this node and so the children of this node
234
+ * should be removed.
235
+ * 2) A node has other siblings. In this case, the node itself
236
+ * is the branch point, and it and its children should be
237
+ * removed.
238
+ */
239
+ if (i < M && child->matchid != 0) {
240
+ flag = 1;
241
+ tlast = child;
242
+ }
243
+ else if (back != NULL || child->sibling != NULL) {
244
+ flag = 2;
245
+ tlast = child;
246
+ tback = (back == NULL ? tnode : back);
247
+ }
248
+
249
+ tnode = child;
250
+ }
251
+
252
+ /*
253
+ * If the node corresponding to the end of the keyword has children,
254
+ * then the tree should not be altered, except to remove the keyword's
255
+ * identifier from the tree.
256
+ *
257
+ * Otherwise, apply the appropriate removal, as described above.
258
+ */
259
+ if (tnode->children != NULL) {
260
+ tnode->matchid = 0;
261
+ }
262
+ else {
263
+ if (flag == 1) {
264
+ child = tlast->children;
265
+ tlast->children = NULL;
266
+ tlast = child;
267
+ }
268
+ else {
269
+ if (tback->children == tlast)
270
+ tback->children = tlast->sibling;
271
+ else
272
+ tback->sibling = tlast->sibling;
273
+ }
274
+
275
+ while (tlast != NULL) {
276
+ child = tlast->children;
277
+ free(tlast);
278
+ tlast = child;
279
+ }
280
+ }
281
+
282
+ node->Plengths[id] = 0;
283
+ node->ispreprocessed = 0;
284
+
285
+ return 1;
286
+ }
287
+
288
+
289
+ /*
290
+ * ac_prep
291
+ *
292
+ * Compute the failure and output links for the keyword tree.
293
+ *
294
+ * Parameters: node - an AC_STRUCT structure
295
+ *
296
+ * Returns: non-zero on success, zero on error.
297
+ */
298
+ int ac_prep(AC_STRUCT *node)
299
+ {
300
+ char x;
301
+ AC_TREE v, vprime, w, wprime, root, front, back, child;
302
+
303
+ if (node->errorflag)
304
+ return 0;
305
+
306
+ /*
307
+ * The failure link and output link computation requires a breadth-first
308
+ * traversal of the keyword tree. And, to do that, we need a queue of
309
+ * the nodes yet to be processed.
310
+ *
311
+ * The `faillink' fields will be used as the pointers for the queue
312
+ * of nodes to be computed (since the failure link is only set after
313
+ * the node is removed from the queue).
314
+ *
315
+ * The `outlink' fields will be used as the pointers to a node's parent
316
+ * for nodes in the queue (since the output link is also only set after
317
+ * the node is removed from the queue).
318
+ */
319
+ root = node->tree;
320
+
321
+ front = back = root;
322
+ front->faillink = NULL;
323
+ front->outlink = NULL;
324
+
325
+ while (front != NULL) {
326
+ v = front;
327
+ x = v->ch;
328
+ vprime = v->outlink;
329
+
330
+ /*
331
+ * Add the node's children to the queue.
332
+ */
333
+ for (child=v->children; child != NULL; child=child->sibling) {
334
+ child->outlink = v;
335
+ back->faillink = child;
336
+ back = child;
337
+ }
338
+ back->faillink = NULL;
339
+
340
+ front = front->faillink;
341
+ v->faillink = v->outlink = NULL;
342
+
343
+ /*
344
+ * Set the failure and output links.
345
+ */
346
+ if (v == root)
347
+ ;
348
+ else if (vprime == root)
349
+ v->faillink = root;
350
+ else {
351
+ /*
352
+ * Find the find link in the failure link chain which has a child
353
+ * labeled with x.
354
+ */
355
+ wprime = NULL;
356
+ w = vprime->faillink;
357
+
358
+ while (1) {
359
+ wprime = w->children;
360
+ while (wprime != NULL && wprime->ch < x)
361
+ wprime = wprime->sibling;
362
+
363
+ if ((wprime != NULL && wprime->ch == x) || w == root)
364
+ break;
365
+
366
+ w = w->faillink;
367
+
368
+ #ifdef STATS
369
+ node->prep_fail_compares++;
370
+ #endif
371
+ }
372
+ #ifdef STATS
373
+ node->prep_fail_compares++;
374
+ #endif
375
+
376
+ if (wprime != NULL && wprime->ch == x)
377
+ v->faillink = wprime;
378
+ else
379
+ v->faillink = root;
380
+
381
+ if (v->matchid != 0) {
382
+ if (v->faillink->matchid != 0)
383
+ v->outlink = v->faillink;
384
+ else
385
+ v->outlink = v->faillink->outlink;
386
+ }
387
+ }
388
+ }
389
+
390
+ node->ispreprocessed = 1;
391
+ node->initflag = 0;
392
+
393
+ return 1;
394
+ }
395
+
396
+
397
+ /*
398
+ * ac_search_init
399
+ *
400
+ * Initializes the variables used during an Aho-Corasick search.
401
+ * See ac_search for an example of how it should be used.
402
+ *
403
+ * Parameters: node - an AC_STRUCT structure
404
+ * T - the sequence to be searched
405
+ * N - the length of the sequence
406
+ *
407
+ * Returns: nothing.
408
+ */
409
+ void ac_search_init(AC_STRUCT *node, char *T, int N)
410
+ {
411
+ if (node->errorflag)
412
+ return;
413
+ else if (!node->ispreprocessed) {
414
+ fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
415
+ "has not been completed.\n");
416
+ return;
417
+ }
418
+
419
+ node->T = T - 1; /* Shift to make sequence be T[1],...,T[N] */
420
+ node->N = N;
421
+ node->c = 1;
422
+ node->w = node->tree;
423
+ node->output = NULL;
424
+ node->initflag = 1;
425
+ node->endflag = 0;
426
+ }
427
+
428
+
429
+ /*
430
+ * ac_search
431
+ *
432
+ * Scans a text to look for the next occurrence of one of the patterns
433
+ * in the text. An example of how this search should be used is the
434
+ * following:
435
+ *
436
+ * s = T;
437
+ * len = N;
438
+ * contflag = 0;
439
+ * ac_search_init(node, T, N);
440
+ * while ((s = ac_search(node, &matchlen, &matchid) != NULL) {
441
+ * >>> Pattern `matchid' matched from `s' to `s + matchlen - 1'. <<<
442
+ * }
443
+ *
444
+ * where `node', `T' and `N' are assumed to be initialized appropriately.
445
+ *
446
+ * Parameters: node - a preprocessed AC_STRUCT structure
447
+ * length_out - where to store the new match's length
448
+ * id_out - where to store the identifier of the
449
+ * pattern that matched
450
+ * ends_at - where to store the n-th matched char
451
+ *
452
+ * Returns: the left end of the text that matches a pattern, or NULL
453
+ * if no match occurs. (It also stores values in `*length_out',
454
+ * and `*id_out' giving the match's length and pattern identifier.
455
+ */
456
+ char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at)
457
+ {
458
+ int c, N, id;
459
+ char *T;
460
+ AC_TREE w, wprime, root;
461
+
462
+ if (node->errorflag)
463
+ return NULL;
464
+ else if (!node->ispreprocessed) {
465
+ fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
466
+ "has not been completed.\n");
467
+ return NULL;
468
+ }
469
+ else if (!node->initflag) {
470
+ fprintf(stderr, "Error in Aho-Corasick search. ac_search_init was not "
471
+ "called.\n");
472
+ return NULL;
473
+ }
474
+ else if (node->endflag)
475
+ return NULL;
476
+
477
+ T = node->T;
478
+ N = node->N;
479
+ c = node->c;
480
+ w = node->w;
481
+ root = node->tree;
482
+
483
+ /*
484
+ * If the last call to ac_search returned a match, check for another
485
+ * match ending at the same right endpoint (denoted by a non-NULL
486
+ * output link).
487
+ */
488
+ if (node->output != NULL) {
489
+ node->output = node->output->outlink;
490
+
491
+ #ifdef STATS
492
+ node->outlinks_traversed++;
493
+ #endif
494
+
495
+ if (node->output != NULL) {
496
+ id = node->output->matchid;
497
+ if (id_out)
498
+ *id_out = id;
499
+ if (length_out)
500
+ *length_out = node->Plengths[id];
501
+ if (ends_at)
502
+ *ends_at= c;
503
+ return &T[c] - node->Plengths[id];
504
+ }
505
+
506
+ }
507
+
508
+ /*
509
+ * Run the search algorithm, stopping at the first position where a
510
+ * match to one of the patterns occurs.
511
+ */
512
+ while (c <= N) {
513
+ /*
514
+ * Try to match the next input character to a child in the tree.
515
+ */
516
+ wprime = w->children;
517
+ while (wprime != NULL && wprime->ch != T[c])
518
+ wprime = wprime->sibling;
519
+
520
+ #ifdef STATS
521
+ node->num_compares++;
522
+ #endif
523
+
524
+ /*
525
+ * If the match fails, then either use the failure link (if not
526
+ * at the root), or move to the next character since no prefix
527
+ * of any pattern ends with character T[c].
528
+ */
529
+ if (wprime == NULL) {
530
+ if (w == root)
531
+ c++;
532
+ else {
533
+ w = w->faillink;
534
+
535
+ #ifdef STATS
536
+ node->num_failures++;
537
+ #endif
538
+
539
+ }
540
+ }
541
+ else {
542
+ /*
543
+ * If we could match the input, move down the tree and to the
544
+ * next input character, and see if that match completes the
545
+ * match to a pattern (when matchid != 0 or outlink != NULL).
546
+ */
547
+ c++;
548
+ w = wprime;
549
+
550
+ #ifdef STATS
551
+ node->edges_traversed++;
552
+ #endif
553
+
554
+ if (w->matchid != 0)
555
+ node->output = w;
556
+ else if (w->outlink != NULL) {
557
+ node->output = w->outlink;
558
+
559
+ #ifdef STATS
560
+ node->outlinks_traversed++;
561
+ #endif
562
+
563
+ }
564
+
565
+ if (node->output != NULL) {
566
+ id = node->output->matchid;
567
+ if (id_out)
568
+ *id_out = id;
569
+ if (length_out)
570
+ *length_out= node->Plengths[id];
571
+ if(ends_at)
572
+ *ends_at= c;
573
+
574
+ node->w = w;
575
+ node->c = c; // ends_at - length_out;
576
+
577
+ return &T[c] - node->Plengths[id];
578
+ }
579
+ }
580
+ }
581
+
582
+ node->c = c;
583
+ node->endflag = 1;
584
+
585
+ return NULL;
586
+ }
587
+
588
+
589
+ /*
590
+ * ac_free
591
+ *
592
+ * Free up the allocated AC_STRUCT structure.
593
+ *
594
+ * Parameters: node - a AC_STRUCT structure
595
+ *
596
+ * Returns: nothing.
597
+ */
598
+ void ac_free(AC_STRUCT *node)
599
+ {
600
+ AC_TREE front, back, next;
601
+
602
+ if (node == NULL)
603
+ return;
604
+
605
+ if (node->tree != NULL) {
606
+ front = back = node->tree;
607
+ while (front != NULL) {
608
+ back->sibling = front->children;
609
+ while (back->sibling != NULL)
610
+ back = back->sibling;
611
+
612
+ next = front->sibling;
613
+ free(front);
614
+ front = next;
615
+ }
616
+ }
617
+
618
+ if (node->Plengths != NULL)
619
+ free(node->Plengths);
620
+
621
+ free(node);
622
+ }
623
+