aurelian-ruby-ahocorasick 0.4.4 → 0.4.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +25 -13
- data/ext/ruby-ahocorasick.c +3 -7
- metadata +1 -1
data/README.textile
CHANGED
@@ -1,30 +1,36 @@
|
|
1
1
|
h1. This is a work in progress.
|
2
2
|
|
3
|
-
|
3
|
+
|
4
|
+
h2. Introduction
|
4
5
|
|
5
6
|
This library is a "Ruby":http://ruby-lang.org extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in the "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
6
7
|
|
7
8
|
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so that it fits my needs: the search needed to return the current position in the searched string.
|
8
9
|
|
9
|
-
|
10
|
+
|
11
|
+
h2. Okay, so what's the idea?
|
10
12
|
|
11
13
|
Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
|
12
14
|
|
13
15
|
The algorithm has 2 stages: in the first, an internal tree is built from the given dictionary; the search itself is left to the second.
|
14
16
|
|
15
|
-
|
17
|
+
|
18
|
+
h2. Okay, so where can I use this?
|
16
19
|
|
17
20
|
Well, you can do some crazy things with it: for example, you can look up DNA patterns or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or do domestic stuff like building contextual links on your blog posts to enrich your users' experience.
|
18
21
|
|
19
|
-
h3. Okay, so how can I install it?
|
20
22
|
|
21
|
-
|
23
|
+
h2. Okay, so how can I install it?
|
24
|
+
|
25
|
+
|
26
|
+
h3. Rubygems - Development Version
|
22
27
|
|
23
28
|
<pre>
|
24
29
|
gem install aurelian-ruby-ahocorasick --source=http://gems.github.com
|
25
30
|
</pre>
|
26
31
|
|
27
|
-
|
32
|
+
|
33
|
+
h3. Build it from source
|
28
34
|
|
29
35
|
<pre>
|
30
36
|
$ git clone git://github.com/aurelian/ruby-ahocorasick.git
|
@@ -37,18 +43,21 @@ To build and install the gem on your machine (run with sudo if needed):
|
|
37
43
|
$ rake install
|
38
44
|
</pre>
|
39
45
|
|
40
|
-
@rake -T@
|
46
|
+
@rake -T@ will list other cool tasks.
|
41
47
|
|
42
|
-
|
48
|
+
|
49
|
+
h3. Rubygems - Stable Version
|
43
50
|
|
44
51
|
There's no stable version right now.
|
45
52
|
|
46
|
-
|
53
|
+
|
54
|
+
h4. Notes
|
47
55
|
|
48
56
|
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
|
49
57
|
Unfortunately, I don't have a Windows PC around, nor the required knowledge about Microsoft compilers.
|
50
58
|
|
51
|
-
|
59
|
+
|
60
|
+
h2. Okay, so how do I use it?
|
52
61
|
|
53
62
|
<pre>
|
54
63
|
require 'ahocorasick'
|
@@ -68,11 +77,13 @@ h3. Okay, so how do I use it?
|
|
68
77
|
|
69
78
|
You can get some API reference on the "wiki":http://github.com/aurelian/ruby-ahocorasick/wikis.
|
70
79
|
|
71
|
-
|
80
|
+
|
81
|
+
h2. Bugs? Suggestions? Ideas? Patches?
|
72
82
|
|
73
83
|
For now, just use the email address.
|
74
84
|
|
75
|
-
|
85
|
+
|
86
|
+
h2. Additional Reading
|
76
87
|
|
77
88
|
Other suffix - tree implementations:
|
78
89
|
|
@@ -82,7 +93,8 @@ Other suffix - tree implementations:
|
|
82
93
|
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
83
94
|
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
84
95
|
|
85
|
-
|
96
|
+
|
97
|
+
h2. License
|
86
98
|
|
87
99
|
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
88
100
|
|
data/ext/ruby-ahocorasick.c
CHANGED
@@ -131,12 +131,11 @@ rb_kwt_make(VALUE self)
|
|
131
131
|
static VALUE
|
132
132
|
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
133
133
|
{
|
134
|
-
char * result; // itermediate result
|
135
134
|
char * remain; // returned by ac_search, the remaing text to search
|
136
|
-
int lgt, id, ends_at; // filled in by ac_search, the id,
|
137
|
-
int starts_at;
|
135
|
+
int lgt, id, ends_at, starts_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
138
136
|
VALUE v_result; // one result, as hash
|
139
137
|
VALUE v_results; // all the results, an array
|
138
|
+
|
140
139
|
VALUE v_search; // search string, function argument
|
141
140
|
struct kwt_struct_data *kwt_data;
|
142
141
|
|
@@ -166,11 +165,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
166
165
|
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
167
166
|
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
168
167
|
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
169
|
-
|
170
|
-
sprintf( result, "%.*s", lgt, remain);
|
171
|
-
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
168
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(remain, lgt) );
|
172
169
|
rb_ary_push( v_results, v_result );
|
173
|
-
free(result);
|
174
170
|
}
|
175
171
|
// reopen the tree
|
176
172
|
kwt_data->is_frozen= 0;
|