aurelian-ruby-ahocorasick 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +47 -13
- data/examples/dict.rb +12 -1
- data/ext/ruby-ahocorasick.c +22 -4
- data/spec/ahocorasick_spec.rb +26 -14
- metadata +1 -1
data/README.textile
CHANGED
@@ -1,28 +1,61 @@
|
|
1
|
+
|
2
|
+
h1. This is a work in progress.
|
3
|
+
|
1
4
|
h3. Introduction
|
2
5
|
|
3
6
|
This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
4
7
|
|
5
8
|
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so it will fit my needs: search needed to return the current position in the searched string.
|
6
9
|
|
7
|
-
h3.
|
10
|
+
h3. Okay, so what's the idea?
|
11
|
+
|
12
|
+
Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
|
13
|
+
|
14
|
+
The algorithm has 2 stages: one where an internal tree is being built from the given dictionary, leaving the search to the second step.
|
15
|
+
|
16
|
+
h3. Okay, so where can I use this?
|
8
17
|
|
9
|
-
|
18
|
+
Well, you can do some crazy things with it, like, you can look up DNA patterns or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or domestic stuff like building contextual links on your blog posts to enrich your users' experience.
|
10
19
|
|
11
|
-
|
20
|
+
h3. Okay, so how can I install it?
|
21
|
+
|
22
|
+
If you don't have the github gem sources configured, type:
|
12
23
|
|
13
24
|
<pre>
|
14
|
-
|
15
|
-
|
16
|
-
|
25
|
+
gem sources -a http://gems.github.com
|
26
|
+
</pre>
|
27
|
+
|
28
|
+
then,
|
17
29
|
|
18
|
-
|
19
|
-
|
20
|
-
sys 0m0.036s
|
30
|
+
<pre>
|
31
|
+
gem install aurelian-ruby-ahocorasick
|
21
32
|
</pre>
|
22
33
|
|
23
|
-
|
34
|
+
h5. Note
|
35
|
+
|
36
|
+
It's known to compile, install, and work on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
|
24
37
|
|
25
|
-
|
38
|
+
h3. Okay, so how do I use it?
|
39
|
+
|
40
|
+
<pre>
|
41
|
+
require 'ahocorasick'
|
42
|
+
|
43
|
+
keyword_tree= AhoCorasick::KeywordTree.new # creates a new tree
|
44
|
+
keyword_tree.add_string( "foo-- Z@!bar" ) # adds a keyword to the tree
|
45
|
+
keyword_tree.add_string( "cervantes" ) # even more
|
46
|
+
|
47
|
+
results= keyword_tree.find_all( "1011000129 foo-- Z@!bar761 ! 001211 6xU" ).each do | result |
|
48
|
+
result[:value] # => "foo-- Z@!bar"
|
49
|
+
result[:starts_at] # => 11
|
50
|
+
result[:ends_at] # => 23
|
51
|
+
result[:id] # => 1
|
52
|
+
end
|
53
|
+
|
54
|
+
</pre>
|
55
|
+
|
56
|
+
h3. Additional Reading
|
57
|
+
|
58
|
+
Other suffix - tree implementations:
|
26
59
|
|
27
60
|
* "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
|
28
61
|
* Pytst / Ruby-Pytst
|
@@ -30,7 +63,8 @@ Other suffix - tree
|
|
30
63
|
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
31
64
|
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
32
65
|
|
33
|
-
|
66
|
+
h3. License
|
67
|
+
|
68
|
+
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
34
69
|
|
35
|
-
(c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
|
36
70
|
released under MIT-LICENCE
|
data/examples/dict.rb
CHANGED
@@ -1,13 +1,24 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'time'
|
4
|
+
|
3
5
|
require File.dirname(__FILE__) + '/../ext/ahocorasick'
|
4
6
|
|
7
|
+
t= Time.now
|
8
|
+
|
5
9
|
k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
|
6
10
|
|
7
|
-
|
11
|
+
t1= Time.now
|
12
|
+
|
13
|
+
puts "%d words added in %s seconds" % [k.size, (t1-t)]
|
14
|
+
|
15
|
+
query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
|
8
16
|
|
9
17
|
results= k.search query
|
10
18
|
|
19
|
+
puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
|
20
|
+
|
21
|
+
exit
|
11
22
|
results.each do | r |
|
12
23
|
puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
|
13
24
|
end
|
data/ext/ruby-ahocorasick.c
CHANGED
@@ -1,15 +1,22 @@
|
|
1
1
|
|
2
2
|
//
|
3
|
-
// (c) 2008, Aurelian Oancea <
|
3
|
+
// (c) 2008, Aurelian Oancea < oancea at gmail dot com >
|
4
4
|
//
|
5
5
|
// Released under MIT-LICENSE
|
6
6
|
//
|
7
7
|
|
8
8
|
//
|
9
9
|
// TODO: new methods?
|
10
|
+
//
|
10
11
|
// * kwt[id] = word
|
11
12
|
// * kwt.from_file (class instance method)
|
12
13
|
//
|
14
|
+
// * kwt.find_each ("str") {|r| .. }
|
15
|
+
// * kwt.find_first("str")
|
16
|
+
// * kwt.find_all ("str")
|
17
|
+
//
|
18
|
+
// TODO: rename search to find_all
|
19
|
+
//
|
13
20
|
|
14
21
|
#include <ruby.h>
|
15
22
|
#include "ac.h"
|
@@ -196,13 +203,18 @@ rb_kwt_size(VALUE self)
|
|
196
203
|
*
|
197
204
|
* kwt.add_string("foo1$21^ 98N3 ba>Z")
|
198
205
|
* kwt << "bar" # using the alias
|
199
|
-
*
|
206
|
+
*
|
200
207
|
* ==== Note: you can also specify the id, a number between 1 and k
|
201
208
|
*
|
202
|
-
* kwt.add_string "bar", 123
|
209
|
+
* kwt.add_string "bar", 123 # => 123
|
203
210
|
*
|
204
211
|
* This id should be unique in the context of the current tree.
|
205
212
|
*
|
213
|
+
* Returns the id of the inserted object.
|
214
|
+
*
|
215
|
+
* kwt.add_string("test", 18) # => 18
|
216
|
+
* kwt.add_string("baz") # => 19
|
217
|
+
*
|
206
218
|
*/
|
207
219
|
static VALUE
|
208
220
|
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
@@ -232,13 +244,16 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
232
244
|
id= NUM2INT(v_id);
|
233
245
|
}
|
234
246
|
|
247
|
+
// printf("[internal]==> %d\n", id);
|
248
|
+
|
235
249
|
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
236
250
|
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
237
251
|
}
|
238
252
|
|
239
253
|
kwt_data->last_id= id + 1;
|
240
254
|
kwt_data->dictionary_size++;
|
241
|
-
|
255
|
+
// printf("[internal]==> %d\n", id);
|
256
|
+
return id;
|
242
257
|
}
|
243
258
|
|
244
259
|
/*
|
@@ -316,7 +331,10 @@ void Init_ahocorasick() {
|
|
316
331
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
317
332
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
318
333
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
334
|
+
|
319
335
|
rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
|
336
|
+
rb_define_alias(rb_cKeywordTree, "find_all", "search");
|
337
|
+
|
320
338
|
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
321
339
|
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
322
340
|
|
data/spec/ahocorasick_spec.rb
CHANGED
@@ -23,28 +23,25 @@ describe KeywordTree do
|
|
23
23
|
after(:each) do
|
24
24
|
@kwt= nil
|
25
25
|
end
|
26
|
-
it "should return an array" do
|
26
|
+
it "find_all should return an array" do
|
27
27
|
@kwt << "foo"
|
28
|
-
@kwt.
|
28
|
+
@kwt.find_all("foo").class.should == Array
|
29
29
|
end
|
30
30
|
|
31
31
|
it "the array should contain hashes" do
|
32
|
-
@kwt << "bar"
|
33
|
-
@kwt
|
32
|
+
@kwt << "bar"
|
33
|
+
@kwt << "foo"
|
34
|
+
@kwt.find_all("foo")[0].class.should == Hash
|
34
35
|
end
|
35
36
|
|
36
|
-
# XXX: this is subject of ...talks. no yield at this point
|
37
|
-
# it "should return nil if block_given?" do
|
38
|
-
# @kwt.search("foo"){|r| r[:id]}.should == nil
|
39
|
-
# end
|
40
|
-
|
41
37
|
it "should return empty array if no results" do
|
42
|
-
@kwt.
|
38
|
+
@kwt.find_all("1a4a").should == []
|
43
39
|
end
|
44
40
|
|
45
41
|
it "each hash should have the required symbols values" do
|
46
|
-
@kwt << "bar"
|
47
|
-
@kwt
|
42
|
+
@kwt << "bar"
|
43
|
+
@kwt << "foo"
|
44
|
+
@kwt.find_all("foo").each do | r |
|
48
45
|
r[:id].class.should == Fixnum
|
49
46
|
r[:starts_at].class.should == Fixnum
|
50
47
|
r[:ends_at].class.should == Fixnum
|
@@ -57,7 +54,7 @@ describe KeywordTree do
|
|
57
54
|
# | |
|
58
55
|
@kwt << "data"
|
59
56
|
q= "data moved"
|
60
|
-
@kwt.
|
57
|
+
@kwt.find_all(q).each do | result |
|
61
58
|
result[:starts_at].should == 0
|
62
59
|
result[:ends_at].should == 4
|
63
60
|
end
|
@@ -86,6 +83,14 @@ describe KeywordTree do
|
|
86
83
|
(r[:ends_at]-r[:starts_at]).should == r[:value].size
|
87
84
|
end
|
88
85
|
end
|
86
|
+
|
87
|
+
it "even more unicode" do
|
88
|
+
@kwt << "șșt"
|
89
|
+
# 0124789
|
90
|
+
result= @kwt.search("mușștar").first
|
91
|
+
result[:starts_at].should == 2
|
92
|
+
result[:ends_at].should == result[:starts_at] + "șșt".size
|
93
|
+
end
|
89
94
|
|
90
95
|
it "checks for result length" do
|
91
96
|
@kwt << "foo"
|
@@ -101,7 +106,6 @@ describe KeywordTree do
|
|
101
106
|
describe "Context Match vs. Exact Word Match" do
|
102
107
|
|
103
108
|
before(:each) do
|
104
|
-
# data, base, database
|
105
109
|
@kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
|
106
110
|
end
|
107
111
|
|
@@ -118,6 +122,7 @@ describe KeywordTree do
|
|
118
122
|
kwt << "bar"
|
119
123
|
kwt.size.should == 2
|
120
124
|
end
|
125
|
+
|
121
126
|
it "should add 2 strings with id" do
|
122
127
|
kwt= KeywordTree.new
|
123
128
|
kwt.add_string "foo", 1
|
@@ -146,6 +151,13 @@ describe KeywordTree do
|
|
146
151
|
kwt.size.should == 3
|
147
152
|
end
|
148
153
|
|
154
|
+
it "should return the id" do
|
155
|
+
kwt= KeywordTree.new
|
156
|
+
kwt.add_string("foo").should == 1
|
157
|
+
kwt.add_string("bar", 2008).should == 2008
|
158
|
+
kwt.add_string("kwt").should == 2009
|
159
|
+
end
|
160
|
+
|
149
161
|
it "should add strings from file and manually" do
|
150
162
|
kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
|
151
163
|
kwt << "foo"
|