aurelian-ruby-ahocorasick 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +47 -13
- data/examples/dict.rb +12 -1
- data/ext/ruby-ahocorasick.c +22 -4
- data/spec/ahocorasick_spec.rb +26 -14
- metadata +1 -1
data/README.textile
CHANGED
@@ -1,28 +1,61 @@
|
|
1
|
+
|
2
|
+
h1. This is a work in progress.
|
3
|
+
|
1
4
|
h3. Introduction
|
2
5
|
|
3
6
|
This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
|
4
7
|
|
5
8
|
The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so it would fit my needs: the search needed to return the current position in the searched string.
|
6
9
|
|
7
|
-
h3.
|
10
|
+
h3. Okay, so what's the idea?
|
11
|
+
|
12
|
+
Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
|
13
|
+
|
14
|
+
The algorithm has 2 stages: in the first, an internal tree is built from the given dictionary; in the second, that tree is used to perform the search.
|
15
|
+
|
16
|
+
h3. Okay, so where can I use this?
|
8
17
|
|
9
|
-
|
18
|
+
Well, you can do some crazy things with it: you can look up DNA patterns, or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or do domestic stuff like building contextual links on your blog posts to enrich your users' experience.
|
10
19
|
|
11
|
-
|
20
|
+
h3. Okay, so how can I install it?
|
21
|
+
|
22
|
+
If you don't have github sources, type:
|
12
23
|
|
13
24
|
<pre>
|
14
|
-
|
15
|
-
|
16
|
-
|
25
|
+
gem sources -a http://gems.github.com
|
26
|
+
</pre>
|
27
|
+
|
28
|
+
then,
|
17
29
|
|
18
|
-
|
19
|
-
|
20
|
-
sys 0m0.036s
|
30
|
+
<pre>
|
31
|
+
gem install aurelian-ruby-ahocorasick
|
21
32
|
</pre>
|
22
33
|
|
23
|
-
|
34
|
+
h5. Note
|
35
|
+
|
36
|
+
It's known to work - compile - install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
|
24
37
|
|
25
|
-
|
38
|
+
h3. Okay, so how do I use it?
|
39
|
+
|
40
|
+
<pre>
|
41
|
+
require 'ahocorasick'
|
42
|
+
|
43
|
+
keyword_tree= AhoCorasick::KeywordTree.new # creates a new tree
|
44
|
+
keyword_tree.add_string( "foo-- Z@!bar" ) # adds a keyword to the tree
|
45
|
+
keyword_tree.add_string( "cervantes" ) # even more
|
46
|
+
|
47
|
+
results= keyword_tree.find_all( "1011000129 foo-- Z@!bar761 ! 001211 6xU" ).each do | result |
|
48
|
+
result[:value] # => "foo-- Z@!bar"
|
49
|
+
result[:starts_at] # => 11
|
50
|
+
result[:ends_at] # => 23
|
51
|
+
result[:id] # => 1
|
52
|
+
end
|
53
|
+
|
54
|
+
</pre>
|
55
|
+
|
56
|
+
h3. Additional Reading
|
57
|
+
|
58
|
+
Other suffix - tree implementations:
|
26
59
|
|
27
60
|
* "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
|
28
61
|
* Pytst / Ruby-Pytst
|
@@ -30,7 +63,8 @@ Other suffix - tree
|
|
30
63
|
* "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
|
31
64
|
* "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
|
32
65
|
|
33
|
-
|
66
|
+
h3. License
|
67
|
+
|
68
|
+
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
34
69
|
|
35
|
-
(c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
|
36
70
|
released under MIT-LICENCE
|
data/examples/dict.rb
CHANGED
@@ -1,13 +1,24 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'time'
|
4
|
+
|
3
5
|
require File.dirname(__FILE__) + '/../ext/ahocorasick'
|
4
6
|
|
7
|
+
t= Time.now
|
8
|
+
|
5
9
|
k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
|
6
10
|
|
7
|
-
|
11
|
+
t1= Time.now
|
12
|
+
|
13
|
+
puts "%d words added in %s seconds" % [k.size, (t1-t)]
|
14
|
+
|
15
|
+
query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
|
8
16
|
|
9
17
|
results= k.search query
|
10
18
|
|
19
|
+
puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
|
20
|
+
|
21
|
+
exit
|
11
22
|
results.each do | r |
|
12
23
|
puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
|
13
24
|
end
|
data/ext/ruby-ahocorasick.c
CHANGED
@@ -1,15 +1,22 @@
|
|
1
1
|
|
2
2
|
//
|
3
|
-
// (c) 2008, Aurelian Oancea <
|
3
|
+
// (c) 2008, Aurelian Oancea < oancea at gmail dot com >
|
4
4
|
//
|
5
5
|
// Released under MIT-LICENSE
|
6
6
|
//
|
7
7
|
|
8
8
|
//
|
9
9
|
// TODO: new methods?
|
10
|
+
//
|
10
11
|
// * kwt[id] = word
|
11
12
|
// * kwt.from_file (class instance method)
|
12
13
|
//
|
14
|
+
// * kwt.find_each ("str") {|r| .. }
|
15
|
+
// * kwt.find_first("str")
|
16
|
+
// * kwt.find_all ("str")
|
17
|
+
//
|
18
|
+
// TODO: rename search to find_all
|
19
|
+
//
|
13
20
|
|
14
21
|
#include <ruby.h>
|
15
22
|
#include "ac.h"
|
@@ -196,13 +203,18 @@ rb_kwt_size(VALUE self)
|
|
196
203
|
*
|
197
204
|
* kwt.add_string("foo1$21^ 98N3 ba>Z")
|
198
205
|
* kwt << "bar" # using the alias
|
199
|
-
*
|
206
|
+
*
|
200
207
|
* ==== Note: you can also specify the id, a number between 1 and k
|
201
208
|
*
|
202
|
-
* kwt.add_string "bar", 123
|
209
|
+
* kwt.add_string "bar", 123 # => 123
|
203
210
|
*
|
204
211
|
* This id should be unique in the context of the current tree.
|
205
212
|
*
|
213
|
+
* Returns the id of the inserted object.
|
214
|
+
*
|
215
|
+
* kwt.add_string("test", 18) # => 18
|
216
|
+
* kwt.add_string("baz") # => 19
|
217
|
+
*
|
206
218
|
*/
|
207
219
|
static VALUE
|
208
220
|
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
@@ -232,13 +244,16 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
232
244
|
id= NUM2INT(v_id);
|
233
245
|
}
|
234
246
|
|
247
|
+
// printf("[internal]==> %d\n", id);
|
248
|
+
|
235
249
|
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
236
250
|
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
237
251
|
}
|
238
252
|
|
239
253
|
kwt_data->last_id= id + 1;
|
240
254
|
kwt_data->dictionary_size++;
|
241
|
-
|
255
|
+
// printf("[internal]==> %d\n", id);
|
256
|
+
return id;
|
242
257
|
}
|
243
258
|
|
244
259
|
/*
|
@@ -316,7 +331,10 @@ void Init_ahocorasick() {
|
|
316
331
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
317
332
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
318
333
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
334
|
+
|
319
335
|
rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
|
336
|
+
rb_define_alias(rb_cKeywordTree, "find_all", "search");
|
337
|
+
|
320
338
|
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
321
339
|
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
322
340
|
|
data/spec/ahocorasick_spec.rb
CHANGED
@@ -23,28 +23,25 @@ describe KeywordTree do
|
|
23
23
|
after(:each) do
|
24
24
|
@kwt= nil
|
25
25
|
end
|
26
|
-
it "should return an array" do
|
26
|
+
it "find_all should return an array" do
|
27
27
|
@kwt << "foo"
|
28
|
-
@kwt.
|
28
|
+
@kwt.find_all("foo").class.should == Array
|
29
29
|
end
|
30
30
|
|
31
31
|
it "the array should contain hashes" do
|
32
|
-
@kwt << "bar"
|
33
|
-
@kwt
|
32
|
+
@kwt << "bar"
|
33
|
+
@kwt << "foo"
|
34
|
+
@kwt.find_all("foo")[0].class.should == Hash
|
34
35
|
end
|
35
36
|
|
36
|
-
# XXX: this is subject of ...talks. no yield at this point
|
37
|
-
# it "should return nil if block_given?" do
|
38
|
-
# @kwt.search("foo"){|r| r[:id]}.should == nil
|
39
|
-
# end
|
40
|
-
|
41
37
|
it "should return empty array if no results" do
|
42
|
-
@kwt.
|
38
|
+
@kwt.find_all("1a4a").should == []
|
43
39
|
end
|
44
40
|
|
45
41
|
it "each hash should have the required symbols values" do
|
46
|
-
@kwt << "bar"
|
47
|
-
@kwt
|
42
|
+
@kwt << "bar"
|
43
|
+
@kwt << "foo"
|
44
|
+
@kwt.find_all("foo").each do | r |
|
48
45
|
r[:id].class.should == Fixnum
|
49
46
|
r[:starts_at].class.should == Fixnum
|
50
47
|
r[:ends_at].class.should == Fixnum
|
@@ -57,7 +54,7 @@ describe KeywordTree do
|
|
57
54
|
# | |
|
58
55
|
@kwt << "data"
|
59
56
|
q= "data moved"
|
60
|
-
@kwt.
|
57
|
+
@kwt.find_all(q).each do | result |
|
61
58
|
result[:starts_at].should == 0
|
62
59
|
result[:ends_at].should == 4
|
63
60
|
end
|
@@ -86,6 +83,14 @@ describe KeywordTree do
|
|
86
83
|
(r[:ends_at]-r[:starts_at]).should == r[:value].size
|
87
84
|
end
|
88
85
|
end
|
86
|
+
|
87
|
+
it "even more unicode" do
|
88
|
+
@kwt << "șșt"
|
89
|
+
# 0124789
|
90
|
+
result= @kwt.search("mușștar").first
|
91
|
+
result[:starts_at].should == 2
|
92
|
+
result[:ends_at].should == result[:starts_at] + "șșt".size
|
93
|
+
end
|
89
94
|
|
90
95
|
it "checks for result length" do
|
91
96
|
@kwt << "foo"
|
@@ -101,7 +106,6 @@ describe KeywordTree do
|
|
101
106
|
describe "Context Match vs. Exact Word Match" do
|
102
107
|
|
103
108
|
before(:each) do
|
104
|
-
# data, base, database
|
105
109
|
@kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
|
106
110
|
end
|
107
111
|
|
@@ -118,6 +122,7 @@ describe KeywordTree do
|
|
118
122
|
kwt << "bar"
|
119
123
|
kwt.size.should == 2
|
120
124
|
end
|
125
|
+
|
121
126
|
it "should add 2 strings with id" do
|
122
127
|
kwt= KeywordTree.new
|
123
128
|
kwt.add_string "foo", 1
|
@@ -146,6 +151,13 @@ describe KeywordTree do
|
|
146
151
|
kwt.size.should == 3
|
147
152
|
end
|
148
153
|
|
154
|
+
it "should return the id" do
|
155
|
+
kwt= KeywordTree.new
|
156
|
+
kwt.add_string("foo").should == 1
|
157
|
+
kwt.add_string("bar", 2008).should == 2008
|
158
|
+
kwt.add_string("kwt").should == 2009
|
159
|
+
end
|
160
|
+
|
149
161
|
it "should add strings from file and manually" do
|
150
162
|
kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
|
151
163
|
kwt << "foo"
|