aurelian-ruby-ahocorasick 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2008 Aurelian Oancea
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.textile ADDED
@@ -0,0 +1,36 @@
1
+ h3. Introduction
2
+
3
+ This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
4
+
5
+ The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so that it would fit my needs: the search needed to return the current position in the searched string.
6
+
7
+ h3. What's the idea?
8
+
9
+ Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
10
+
11
+ h1. TBD
12
+
13
+ <pre>
14
+ [aurelian@stalingrad ext]$ time ./dict.rb
15
+ 110196
16
+ 711
17
+
18
+ real 0m0.538s
19
+ user 0m0.435s
20
+ sys 0m0.036s
21
+ </pre>
22
+
23
+ h3. Additional Reading / Implementations
24
+
25
+ Other suffix tree and string-matching implementations:
26
+
27
+ * "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
28
+ * Pytst / Ruby-Pytst
29
+ * "Aho-Corasick extension":http://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/
30
+ * "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
31
+ * "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
32
+
33
+ --
34
+
35
+ (c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
36
+ released under MIT-LICENSE
data/examples/dict.rb ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Builds a keyword tree from spec/data/en.words, searches spec/data/news.txt
# with it, and prints one line per match: the characters found at
# :starts_at and at :ends_at - 1 in the text, followed by the matched value.

base_dir = File.dirname(__FILE__)

require base_dir + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.from_file("#{base_dir}/../spec/data/en.words")
text = File.read("#{base_dir}/../spec/data/news.txt")

tree.search(text).each do |match|
  first_char = text[match[:starts_at]].chr
  last_char  = text[match[:ends_at] - 1].chr
  puts first_char + ".." + last_char + " => " + match[:value]
end
14
+
data/examples/elev.rb ADDED
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby

# Adds a few overlapping keywords ("data", "base", "database") to a keyword
# tree and prints every match found in a sample sentence.

# The interpreter's encoding switch is spelled $KCODE (upper case); a
# lower-case $kcode is just an unrelated global that nothing reads.
# $KCODE is only honoured by Ruby 1.8, so it is not touched on newer versions.
$KCODE = 'UTF-8' if RUBY_VERSION < '1.9'

require File.dirname(__FILE__) + '/../ext/ahocorasick'

tree = AhoCorasick::KeywordTree.new

["I've", "data", "base", "database"].each do |keyword|
  tree << keyword
end

query = "I've moved my data to a database"

tree.search(query).each do |match|
  # Characters at :starts_at and :ends_at - 1 delimit the match in the query.
  first_char = query[match[:starts_at]].chr
  last_char  = query[match[:ends_at] - 1].chr
  puts "-> [ " + match[:id].to_s + " ] " + match[:value] + " / " + first_char + ".." + last_char
end
19
+
data/examples/sample.c ADDED
@@ -0,0 +1,94 @@
1
+ //
2
+ // Getting started with Aho-Corasick from Strmat
3
+ //
4
+ // lasick Makefile:
5
+ //
6
+ // --
7
+ // ac.o :
8
+ // gcc -c -fPIC -shared ac.c
9
+ // libasick : ac.o
10
+ // gcc -shared -Wl,-soname,libasick.so -o libasick.so.1.0.1
11
+ // ar rcs libasick.a ac.o
12
+ // clean :
13
+ // rm -rf *.o *.a *.so* *.dylib*
14
+ // --
15
+ //
16
+ // Compile this stuff - asick is the library name, generated with the above Makefile :)
17
+ //
18
+ // gcc sample.c -o ac-sample -I../ext/ -L../ext/ -lasick
19
+ //
20
+
21
+ #include <string.h>
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include "ac.h"
25
+
26
+ int main(int argc, char *argv[]) {
27
+ char * search;
28
+ char * remain;
29
+ char * result;
30
+ char word[1024];
31
+
32
+ FILE *dictionary;
33
+ FILE *input;
34
+ int lgt, id, n, i;
35
+
36
+ AC_STRUCT * tree;
37
+
38
+ input= fopen(argv[1], "r");
39
+
40
+ if(input == NULL) {
41
+ search= argv[1];
42
+ } else {
43
+ long lSize;
44
+ fseek (input , 0 , SEEK_END);
45
+ lSize= ftell (input);
46
+ rewind(input);
47
+ search = (char*) malloc (sizeof(char)*lSize);
48
+ if (search == NULL) { fputs ("Error: Memory error",stderr); exit(-2); }
49
+ fread(search, 1, lSize-1, input);
50
+ }
51
+
52
+ dictionary= fopen("../spec/data/dictionary.txt", "r");
53
+
54
+ if(dictionary == NULL) {
55
+ printf("Error: can't open file.\n");
56
+ return -1;
57
+ }
58
+
59
+ tree= ac_alloc();
60
+
61
+ // start counting from 1
62
+ n= 1;
63
+
64
+ printf("==> building dictionary ...");
65
+
66
+ while(fgets(word, 1024, dictionary) != NULL) {
67
+ // strip \n
68
+ ac_add_string(tree, word, strlen(word)-1, n++);
69
+ }
70
+
71
+ printf("%d entries added.\n",n);
72
+
73
+ ac_prep(tree);
74
+
75
+ printf("==> input text [%d]:\n--\n%s\n--\n", strlen(search), search);
76
+
77
+ ac_search_init(tree, search, strlen(search) );
78
+
79
+ while((remain= ac_search(tree, &lgt, &id)) != NULL) {
80
+ printf("`%d'", remain[lgt+1]);
81
+ result = (char*) malloc (sizeof(char)*lgt);
82
+ sprintf( result, "%.*s", lgt, remain);
83
+ // result: should read first lgt chars from remain.
84
+ printf("==> result: lenght=> %d, id=> %d [%s]\n", lgt, id, result);
85
+ free(result);
86
+ }
87
+
88
+ ac_free(tree);
89
+ fclose(dictionary);
90
+ free(search);
91
+
92
+ return 0;
93
+ }
94
+
data/examples/test.rb ADDED
@@ -0,0 +1,46 @@
1
#!/usr/bin/env ruby

# Smoke test for AhoCorasick::KeywordTree: adds strings with and without
# explicit ids, shows the RuntimeError raised by rejected calls, and prints
# the results of two searches.

require "#{File.dirname(__FILE__)}/../ext/ahocorasick"

# Runs the block and, if it raises a RuntimeError, reports it instead of
# letting the script abort.
def report_runtime_error
  yield
rescue RuntimeError => err
  puts "[ok]==> got " + err.class.name + ": " + err.message
end

tree = AhoCorasick::KeywordTree.new

puts tree.size
tree.add_string("foo")

puts tree.size
tree.add_string("bar", 1991)

puts tree.size
tree.add_string("fomz")

report_runtime_error { tree.add_string("foo", -1) }
report_runtime_error { tree.add_string("foo", "bar") }

tree.add_string("timisoara", 22)

puts tree.size
report_runtime_error { tree.add_string("bucuresti", 22) }

tree << "bacau"

tree.search('am fost la bacau').each { |match| puts match.inspect }

tree.search('din foo in foo').each { |match| puts match.inspect }
46
+
data/ext/ac.c ADDED
@@ -0,0 +1,623 @@
1
+ /*
2
+ * ac.c
3
+ *
4
+ * Implementation of the Aho-Corasick algorithm.
5
+ *
6
+ * NOTES:
7
+ * 8/94 - Original Implementation (Sean Davis)
8
+ * 9/94 - Redid Implementation (James Knight)
9
+ * 3/96 - Modularized the code (James Knight)
10
+ * 7/96 - Finished the modularization (James Knight)
11
+ */
12
+
13
+ #include <stdio.h>
14
+ #include <stdlib.h>
15
+ #include <string.h>
16
+ #include "ac.h"
17
+
18
+ /*
19
+ * ac_alloc
20
+ *
21
+ * Creates a new AC_STRUCT structure and initializes its fields.
22
+ *
23
+ * Parameters: none.
24
+ *
25
+ * Returns: A dynamically allocated AC_STRUCT structure.
26
+ */
27
+ AC_STRUCT *ac_alloc(void)
28
+ {
29
+ AC_STRUCT *node;
30
+
31
+ if ((node = malloc(sizeof(AC_STRUCT))) == NULL)
32
+ return NULL;
33
+ memset(node, 0, sizeof(AC_STRUCT));
34
+
35
+ if ((node->tree = malloc(sizeof(ACTREE_NODE))) == NULL) {
36
+ free(node);
37
+ return NULL;
38
+ }
39
+ memset(node->tree, 0, sizeof(ACTREE_NODE));
40
+
41
+ return node;
42
+ }
43
+
44
+
45
+ /*
46
+ * ac_add_string
47
+ *
48
+ * Adds a string to the AC_STRUCT structure's keyword tree.
49
+ *
50
+ * NOTE: The `id' value given must be unique to any of the strings
51
+ * added to the tree, and must be a small integer greater than
52
+ * 0 (since it is used to index an array holding information
53
+ * about each of the strings).
54
+ *
55
+ * The best id's to use are to number the strings from 1 to K.
56
+ *
57
+ * Parameters: node - an AC_STRUCT structure
58
+ * P - the sequence
59
+ * M - the sequence length
60
+ * id - the sequence identifier
61
+ *
62
+ * Returns: non-zero on success, zero on error.
63
+ */
64
+ int ac_add_string(AC_STRUCT *node, char *P, int M, int id)
65
+ {
66
+ int i, j, newsize;
67
+ AC_TREE tnode, child, back, newnode, list, tail;
68
+
69
+ /*
70
+ * Return a zero if a previous error had occurred, or if the
71
+ * given id equals zero. An id value of zero is used by the
72
+ * algorithm to signal that no pattern ends at a node in the
73
+ * keyword tree. So, it can't be used as a pattern's id.
74
+ */
75
+ if (node->errorflag || id == 0)
76
+ return 0;
77
+
78
+ P--; /* Shift to make sequence be P[1],...,P[M] */
79
+
80
+ /*
81
+ * Allocate space for the new string's information.
82
+ */
83
+ if (node->Psize <= id) {
84
+ if (node->Psize == 0) {
85
+ newsize = (id >= 16 ? id + 1 : 16);
86
+ node->Plengths = malloc(newsize * sizeof(int));
87
+ }
88
+ else {
89
+ newsize = node->Psize + id + 1;
90
+ node->Plengths = realloc(node->Plengths, newsize * sizeof(int));
91
+ }
92
+ if (node->Plengths == NULL) {
93
+ node->errorflag = 1;
94
+ return 0;
95
+ }
96
+
97
+ for (i=node->Psize; i < newsize; i++)
98
+ node->Plengths[i] = 0;
99
+ node->Psize = newsize;
100
+ }
101
+
102
+ // duplicate id
103
+ if (node->Plengths[id] != 0)
104
+ return 0;
105
+
106
+ /*
107
+ * Add the string to the keyword tree.
108
+ */
109
+ tnode = node->tree;
110
+ for (i=1; i <= M; i++) {
111
+ /*
112
+ * Find the child whose character is P[i].
113
+ */
114
+ back = NULL;
115
+ child = tnode->children;
116
+ while (child != NULL && child->ch < P[i]) {
117
+ back = child;
118
+ child = child->sibling;
119
+ }
120
+
121
+ if (child == NULL || child->ch != P[i])
122
+ break;
123
+
124
+ tnode = child;
125
+
126
+ #ifdef STATS
127
+ node->prep_old_edges++;
128
+ #endif
129
+
130
+ }
131
+
132
+ /*
133
+ * If only part of the pattern exists in the tree, add the
134
+ * rest of the pattern to the tree.
135
+ */
136
+ if (i <= M) {
137
+ list = tail = NULL;
138
+ for (j=i; j <= M; j++) {
139
+ if ((newnode = malloc(sizeof(ACTREE_NODE))) == NULL)
140
+ break;
141
+ memset(newnode, 0, sizeof(ACTREE_NODE));
142
+ newnode->ch = P[j];
143
+
144
+ if (list == NULL)
145
+ list = tail = newnode;
146
+ else
147
+ tail = tail->children = newnode;
148
+
149
+ #ifdef STATS
150
+ node->prep_new_edges++;
151
+ #endif
152
+
153
+ }
154
+ if (j <= M) {
155
+ while (list != NULL) {
156
+ tail = list->children;
157
+ free(list);
158
+ list = tail;
159
+ }
160
+ return 0;
161
+ }
162
+
163
+ list->sibling = child;
164
+ if (back == NULL)
165
+ tnode->children = list;
166
+ else
167
+ back->sibling = list;
168
+
169
+ tnode = tail;
170
+ }
171
+
172
+ tnode->matchid = id;
173
+ node->Plengths[id] = M;
174
+ node->ispreprocessed = 0;
175
+
176
+ return 1;
177
+ }
178
+
179
+
180
+ /*
181
+ * ac_del_string
182
+ *
183
+ * Deletes a string from the keyword tree.
184
+ *
185
+ * Parameters: node - an AC_STRUCT structure
186
+ * P - the sequence to be deleted
187
+ * M - its length
188
+ * id - its identifier
189
+ *
190
+ * Returns: non-zero on success, zero on error.
191
+ */
192
+ int ac_del_string(AC_STRUCT *node, char *P, int M, int id)
193
+ {
194
+ int i, flag;
195
+ AC_TREE tnode, tlast, tback, child, back;
196
+
197
+ if (node->errorflag || id > node->Psize || node->Plengths[id] == 0)
198
+ return 0;
199
+
200
+ P--; /* Shift to make sequence be P[1],...,P[M] */
201
+
202
+ /*
203
+ * Scan the tree for the path corresponding to the keyword to be deleted.
204
+ */
205
+ flag = 1;
206
+ tlast = tnode = node->tree;
207
+ tback = NULL;
208
+
209
+ for (i=1; i <= M; i++) {
210
+ /*
211
+ * Find the child matching P[i]. It must be there.
212
+ */
213
+ child = tnode->children;
214
+ back = NULL;
215
+ while (child != NULL && child->ch != P[i]) {
216
+ back = child;
217
+ child = child->sibling;
218
+ }
219
+
220
+ if (child == NULL) {
221
+ fprintf(stderr, "Error in Aho-Corasick preprocessing. String to be "
222
+ "deleted is not in tree.\n");
223
+ return 0;
224
+ }
225
+
226
+ /*
227
+ * Try to find the point where the pattern to be deleted branches off
228
+ * from the paths of the other patterns in the tree. This point must
229
+ * be at the latest node which satisfies one of these two conditions:
230
+ *
231
+ * 1) Another pattern ends at that node (and so
232
+ * `child->matchid != 0'). In this case, the branch point is
233
+ * just below this node and so the children of this node
234
+ * should be removed.
235
+ * 2) A node has other siblings. In this case, the node itself
236
+ * is the branch point, and it and its children should be
237
+ * removed.
238
+ */
239
+ if (i < M && child->matchid != 0) {
240
+ flag = 1;
241
+ tlast = child;
242
+ }
243
+ else if (back != NULL || child->sibling != NULL) {
244
+ flag = 2;
245
+ tlast = child;
246
+ tback = (back == NULL ? tnode : back);
247
+ }
248
+
249
+ tnode = child;
250
+ }
251
+
252
+ /*
253
+ * If the node corresponding to the end of the keyword has children,
254
+ * then the tree should not be altered, except to remove the keyword's
255
+ * identifier from the tree.
256
+ *
257
+ * Otherwise, apply the appropriate removal, as described above.
258
+ */
259
+ if (tnode->children != NULL) {
260
+ tnode->matchid = 0;
261
+ }
262
+ else {
263
+ if (flag == 1) {
264
+ child = tlast->children;
265
+ tlast->children = NULL;
266
+ tlast = child;
267
+ }
268
+ else {
269
+ if (tback->children == tlast)
270
+ tback->children = tlast->sibling;
271
+ else
272
+ tback->sibling = tlast->sibling;
273
+ }
274
+
275
+ while (tlast != NULL) {
276
+ child = tlast->children;
277
+ free(tlast);
278
+ tlast = child;
279
+ }
280
+ }
281
+
282
+ node->Plengths[id] = 0;
283
+ node->ispreprocessed = 0;
284
+
285
+ return 1;
286
+ }
287
+
288
+
289
+ /*
290
+ * ac_prep
291
+ *
292
+ * Compute the failure and output links for the keyword tree.
293
+ *
294
+ * Parameters: node - an AC_STRUCT structure
295
+ *
296
+ * Returns: non-zero on success, zero on error.
297
+ */
298
+ int ac_prep(AC_STRUCT *node)
299
+ {
300
+ char x;
301
+ AC_TREE v, vprime, w, wprime, root, front, back, child;
302
+
303
+ if (node->errorflag)
304
+ return 0;
305
+
306
+ /*
307
+ * The failure link and output link computation requires a breadth-first
308
+ * traversal of the keyword tree. And, to do that, we need a queue of
309
+ * the nodes yet to be processed.
310
+ *
311
+ * The `faillink' fields will be used as the pointers for the queue
312
+ * of nodes to be computed (since the failure link is only set after
313
+ * the node is removed from the queue).
314
+ *
315
+ * The `outlink' fields will be used as the pointers to a node's parent
316
+ * for nodes in the queue (since the output link is also only set after
317
+ * the node is removed from the queue).
318
+ */
319
+ root = node->tree;
320
+
321
+ front = back = root;
322
+ front->faillink = NULL;
323
+ front->outlink = NULL;
324
+
325
+ while (front != NULL) {
326
+ v = front;
327
+ x = v->ch;
328
+ vprime = v->outlink;
329
+
330
+ /*
331
+ * Add the node's children to the queue.
332
+ */
333
+ for (child=v->children; child != NULL; child=child->sibling) {
334
+ child->outlink = v;
335
+ back->faillink = child;
336
+ back = child;
337
+ }
338
+ back->faillink = NULL;
339
+
340
+ front = front->faillink;
341
+ v->faillink = v->outlink = NULL;
342
+
343
+ /*
344
+ * Set the failure and output links.
345
+ */
346
+ if (v == root)
347
+ ;
348
+ else if (vprime == root)
349
+ v->faillink = root;
350
+ else {
351
+ /*
352
+ * Find the find link in the failure link chain which has a child
353
+ * labeled with x.
354
+ */
355
+ wprime = NULL;
356
+ w = vprime->faillink;
357
+
358
+ while (1) {
359
+ wprime = w->children;
360
+ while (wprime != NULL && wprime->ch < x)
361
+ wprime = wprime->sibling;
362
+
363
+ if ((wprime != NULL && wprime->ch == x) || w == root)
364
+ break;
365
+
366
+ w = w->faillink;
367
+
368
+ #ifdef STATS
369
+ node->prep_fail_compares++;
370
+ #endif
371
+ }
372
+ #ifdef STATS
373
+ node->prep_fail_compares++;
374
+ #endif
375
+
376
+ if (wprime != NULL && wprime->ch == x)
377
+ v->faillink = wprime;
378
+ else
379
+ v->faillink = root;
380
+
381
+ if (v->matchid != 0) {
382
+ if (v->faillink->matchid != 0)
383
+ v->outlink = v->faillink;
384
+ else
385
+ v->outlink = v->faillink->outlink;
386
+ }
387
+ }
388
+ }
389
+
390
+ node->ispreprocessed = 1;
391
+ node->initflag = 0;
392
+
393
+ return 1;
394
+ }
395
+
396
+
397
+ /*
398
+ * ac_search_init
399
+ *
400
+ * Initializes the variables used during an Aho-Corasick search.
401
+ * See ac_search for an example of how it should be used.
402
+ *
403
+ * Parameters: node - an AC_STRUCT structure
404
+ * T - the sequence to be searched
405
+ * N - the length of the sequence
406
+ *
407
+ * Returns: nothing.
408
+ */
409
+ void ac_search_init(AC_STRUCT *node, char *T, int N)
410
+ {
411
+ if (node->errorflag)
412
+ return;
413
+ else if (!node->ispreprocessed) {
414
+ fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
415
+ "has not been completed.\n");
416
+ return;
417
+ }
418
+
419
+ node->T = T - 1; /* Shift to make sequence be T[1],...,T[N] */
420
+ node->N = N;
421
+ node->c = 1;
422
+ node->w = node->tree;
423
+ node->output = NULL;
424
+ node->initflag = 1;
425
+ node->endflag = 0;
426
+ }
427
+
428
+
429
+ /*
430
+ * ac_search
431
+ *
432
+ * Scans a text to look for the next occurrence of one of the patterns
433
+ * in the text. An example of how this search should be used is the
434
+ * following:
435
+ *
436
+ * s = T;
437
+ * len = N;
438
+ * contflag = 0;
439
+ * ac_search_init(node, T, N);
440
+ * while ((s = ac_search(node, &matchlen, &matchid) != NULL) {
441
+ * >>> Pattern `matchid' matched from `s' to `s + matchlen - 1'. <<<
442
+ * }
443
+ *
444
+ * where `node', `T' and `N' are assumed to be initialized appropriately.
445
+ *
446
+ * Parameters: node - a preprocessed AC_STRUCT structure
447
+ * length_out - where to store the new match's length
448
+ * id_out - where to store the identifier of the
449
+ * pattern that matched
450
+ * ends_at - where to store the n-th matched char
451
+ *
452
+ * Returns: the left end of the text that matches a pattern, or NULL
453
+ * if no match occurs. (It also stores values in `*length_out',
454
+ * and `*id_out' giving the match's length and pattern identifier.
455
+ */
456
+ char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at)
457
+ {
458
+ int c, N, id;
459
+ char *T;
460
+ AC_TREE w, wprime, root;
461
+
462
+ if (node->errorflag)
463
+ return NULL;
464
+ else if (!node->ispreprocessed) {
465
+ fprintf(stderr, "Error in Aho-Corasick search. The preprocessing "
466
+ "has not been completed.\n");
467
+ return NULL;
468
+ }
469
+ else if (!node->initflag) {
470
+ fprintf(stderr, "Error in Aho-Corasick search. ac_search_init was not "
471
+ "called.\n");
472
+ return NULL;
473
+ }
474
+ else if (node->endflag)
475
+ return NULL;
476
+
477
+ T = node->T;
478
+ N = node->N;
479
+ c = node->c;
480
+ w = node->w;
481
+ root = node->tree;
482
+
483
+ /*
484
+ * If the last call to ac_search returned a match, check for another
485
+ * match ending at the same right endpoint (denoted by a non-NULL
486
+ * output link).
487
+ */
488
+ if (node->output != NULL) {
489
+ node->output = node->output->outlink;
490
+
491
+ #ifdef STATS
492
+ node->outlinks_traversed++;
493
+ #endif
494
+
495
+ if (node->output != NULL) {
496
+ id = node->output->matchid;
497
+ if (id_out)
498
+ *id_out = id;
499
+ if (length_out)
500
+ *length_out = node->Plengths[id];
501
+ if (ends_at)
502
+ *ends_at= c;
503
+ return &T[c] - node->Plengths[id];
504
+ }
505
+
506
+ }
507
+
508
+ /*
509
+ * Run the search algorithm, stopping at the first position where a
510
+ * match to one of the patterns occurs.
511
+ */
512
+ while (c <= N) {
513
+ /*
514
+ * Try to match the next input character to a child in the tree.
515
+ */
516
+ wprime = w->children;
517
+ while (wprime != NULL && wprime->ch != T[c])
518
+ wprime = wprime->sibling;
519
+
520
+ #ifdef STATS
521
+ node->num_compares++;
522
+ #endif
523
+
524
+ /*
525
+ * If the match fails, then either use the failure link (if not
526
+ * at the root), or move to the next character since no prefix
527
+ * of any pattern ends with character T[c].
528
+ */
529
+ if (wprime == NULL) {
530
+ if (w == root)
531
+ c++;
532
+ else {
533
+ w = w->faillink;
534
+
535
+ #ifdef STATS
536
+ node->num_failures++;
537
+ #endif
538
+
539
+ }
540
+ }
541
+ else {
542
+ /*
543
+ * If we could match the input, move down the tree and to the
544
+ * next input character, and see if that match completes the
545
+ * match to a pattern (when matchid != 0 or outlink != NULL).
546
+ */
547
+ c++;
548
+ w = wprime;
549
+
550
+ #ifdef STATS
551
+ node->edges_traversed++;
552
+ #endif
553
+
554
+ if (w->matchid != 0)
555
+ node->output = w;
556
+ else if (w->outlink != NULL) {
557
+ node->output = w->outlink;
558
+
559
+ #ifdef STATS
560
+ node->outlinks_traversed++;
561
+ #endif
562
+
563
+ }
564
+
565
+ if (node->output != NULL) {
566
+ id = node->output->matchid;
567
+ if (id_out)
568
+ *id_out = id;
569
+ if (length_out)
570
+ *length_out= node->Plengths[id];
571
+ if(ends_at)
572
+ *ends_at= c;
573
+
574
+ node->w = w;
575
+ node->c = c; // ends_at - length_out;
576
+
577
+ return &T[c] - node->Plengths[id];
578
+ }
579
+ }
580
+ }
581
+
582
+ node->c = c;
583
+ node->endflag = 1;
584
+
585
+ return NULL;
586
+ }
587
+
588
+
589
+ /*
590
+ * ac_free
591
+ *
592
+ * Free up the allocated AC_STRUCT structure.
593
+ *
594
+ * Parameters: node - a AC_STRUCT structure
595
+ *
596
+ * Returns: nothing.
597
+ */
598
+ void ac_free(AC_STRUCT *node)
599
+ {
600
+ AC_TREE front, back, next;
601
+
602
+ if (node == NULL)
603
+ return;
604
+
605
+ if (node->tree != NULL) {
606
+ front = back = node->tree;
607
+ while (front != NULL) {
608
+ back->sibling = front->children;
609
+ while (back->sibling != NULL)
610
+ back = back->sibling;
611
+
612
+ next = front->sibling;
613
+ free(front);
614
+ front = next;
615
+ }
616
+ }
617
+
618
+ if (node->Plengths != NULL)
619
+ free(node->Plengths);
620
+
621
+ free(node);
622
+ }
623
+