aurelian-ruby-ahocorasick 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +25 -13
- data/ext/ruby-ahocorasick.c +3 -7
- metadata +1 -1
data/README.textile
CHANGED
@@ -1,30 +1,36 @@
|
|
1
1
|
h1. This is a work in progress.
|
2
2
|
|
3
|
-
|
3
|
+
|
4
|
+
h2. Introduction
|
4
5
|
|
5
6
|
This library is a "Ruby":http://ruby-lang.org extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
6
7
|
|
7
8
|
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so it will fit my needs: search needed to return the current position in the searched string.
|
8
9
|
|
9
|
-
|
10
|
+
|
11
|
+
h2. Okay, so what's the idea?
|
10
12
|
|
11
13
|
Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
|
12
14
|
|
13
15
|
The algorithm has 2 stages: one where an internal tree is being built from the given dictionary, leaving the search to the second step.
|
14
16
|
|
15
|
-
|
17
|
+
|
18
|
+
h2. Okay, so where can I use this?
|
16
19
|
|
17
20
|
Well, you can do some crazy things with it, like, you can look up DNA patterns or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or domestic stuff like building contextual links on your blog posts to enrich your users' experience.
|
18
21
|
|
19
|
-
h3. Okay, so how can I install it?
|
20
22
|
|
21
|
-
|
23
|
+
h2. Okay, so how can I install it?
|
24
|
+
|
25
|
+
|
26
|
+
h3. Rubygems - Development Version
|
22
27
|
|
23
28
|
<pre>
|
24
29
|
gem install aurelian-ruby-ahocorasick --source=http://gems.github.com
|
25
30
|
</pre>
|
26
31
|
|
27
|
-
|
32
|
+
|
33
|
+
h3. Build it from source
|
28
34
|
|
29
35
|
<pre>
|
30
36
|
$ git clone git://github.com/aurelian/ruby-ahocorasick.git
|
@@ -37,18 +43,21 @@ To build and install the gem on your machine (run with sudo if needed):
|
|
37
43
|
$ rake install
|
38
44
|
</pre>
|
39
45
|
|
40
|
-
@rake -T@
|
46
|
+
@rake -T@ will list other cool tasks.
|
41
47
|
|
42
|
-
|
48
|
+
|
49
|
+
h3. Rubygems - Stable Version
|
43
50
|
|
44
51
|
There's no stable version right now.
|
45
52
|
|
46
|
-
|
53
|
+
|
54
|
+
h4. Notes
|
47
55
|
|
48
56
|
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
|
49
57
|
Unfortunately I don't have a Windows PC around nor the required knowledge about Microsoft compilers.
|
50
58
|
|
51
|
-
|
59
|
+
|
60
|
+
h2. Okay, so how do I use it?
|
52
61
|
|
53
62
|
<pre>
|
54
63
|
require 'ahocorasick'
|
@@ -68,11 +77,13 @@ h3. Okay, so how do I use it?
|
|
68
77
|
|
69
78
|
You can get some API reference on the "wiki":http://github.com/aurelian/ruby-ahocorasick/wikis.
|
70
79
|
|
71
|
-
|
80
|
+
|
81
|
+
h2. Bugs? Suggestions? Ideas? Patches?
|
72
82
|
|
73
83
|
For now, just use the email address.
|
74
84
|
|
75
|
-
|
85
|
+
|
86
|
+
h2. Additional Reading
|
76
87
|
|
77
88
|
Other suffix - tree implementations:
|
78
89
|
|
@@ -82,7 +93,8 @@ Other suffix - tree implementations:
|
|
82
93
|
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
83
94
|
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
84
95
|
|
85
|
-
|
96
|
+
|
97
|
+
h2. License
|
86
98
|
|
87
99
|
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
88
100
|
|
data/ext/ruby-ahocorasick.c
CHANGED
@@ -131,12 +131,11 @@ rb_kwt_make(VALUE self)
|
|
131
131
|
static VALUE
|
132
132
|
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
133
133
|
{
|
134
|
-
char * result; // itermediate result
|
135
134
|
char * remain; // returned by ac_search, the remaing text to search
|
136
|
-
int lgt, id, ends_at; // filled in by ac_search, the id,
|
137
|
-
int starts_at;
|
135
|
+
int lgt, id, ends_at, starts_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
138
136
|
VALUE v_result; // one result, as hash
|
139
137
|
VALUE v_results; // all the results, an array
|
138
|
+
|
140
139
|
VALUE v_search; // search string, function argument
|
141
140
|
struct kwt_struct_data *kwt_data;
|
142
141
|
|
@@ -166,11 +165,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
166
165
|
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
167
166
|
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
168
167
|
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
169
|
-
|
170
|
-
sprintf( result, "%.*s", lgt, remain);
|
171
|
-
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
168
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(remain, lgt) );
|
172
169
|
rb_ary_push( v_results, v_result );
|
173
|
-
free(result);
|
174
170
|
}
|
175
171
|
// reopen the tree
|
176
172
|
kwt_data->is_frozen= 0;
|