aurelian-ruby-ahocorasick 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,28 +1,61 @@
1
+
2
+ h1. This is a work in progress.
3
+
1
4
  h3. Introduction
2
5
 
3
6
  This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
4
7
 
5
8
  The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so it will fit my needs: search needed to return the current position in the searched string.
6
9
 
7
- h3. What's the idea?
10
+ h3. Okay, so what's the idea?
11
+
12
+ Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
13
+
14
+ The algorithm has 2 stages: in the first, an internal tree is built from the given dictionary, leaving the search to the second.
15
+
16
+ h3. Okay, so where can I use this?
8
17
 
9
- Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
18
+ Well, you can do some crazy things with it, like, you can look up DNA patterns or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or domestic stuff like building contextual links on your blog posts to enrich your users' experience.
10
19
 
11
- h1. TBD
20
+ h3. Okay, so how can I install it?
21
+
22
+ If you don't have github sources, type:
12
23
 
13
24
  <pre>
14
- [aurelian@stalingrad ext]$ time ./dict.rb
15
- 110196
16
- 711
25
+ gem sources -a http://gems.github.com
26
+ </pre>
27
+
28
+ then,
17
29
 
18
- real 0m0.538s
19
- user 0m0.435s
20
- sys 0m0.036s
30
+ <pre>
31
+ gem install aurelian-ruby-ahocorasick
21
32
  </pre>
22
33
 
23
- h3. Additional Reading / Implementations
34
+ h5. Note
35
+
36
+ It's known to work - compile - install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
24
37
 
25
- Other suffix - tree
38
+ h3. Okay, so how do I use it?
39
+
40
+ <pre>
41
+ require 'ahocorasick'
42
+
43
+ keyword_tree= AhoCorasick::KeywordTree.new # creates a new tree
44
+ keyword_tree.add_string( "foo-- Z@!bar" ) # adds a keyword to the tree
45
+ keyword_tree.add_string( "cervantes" ) # even more
46
+
47
+ results= keyword_tree.find_all( "1011000129 foo-- Z@!bar761 ! 001211 6xU" ).each do | result |
48
+ result[:value] # => "foo-- Z@!bar"
49
+ result[:starts_at] # => 11
50
+ result[:ends_at] # => 23
51
+ result[:id] # => 1
52
+ end
53
+
54
+ </pre>
55
+
56
+ h3. Additional Reading
57
+
58
+ Other suffix-tree implementations:
26
59
 
27
60
  * "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
28
61
  * Pytst / Ruby-Pytst
@@ -30,7 +63,8 @@ Other suffix - tree
30
63
  * "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
31
64
  * "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
32
65
 
33
- --
66
+ h3. License
67
+
68
+ (c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
34
69
 
35
- (c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
36
70
  released under MIT-LICENCE
data/examples/dict.rb CHANGED
@@ -1,13 +1,24 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'time'
4
+
3
5
  require File.dirname(__FILE__) + '/../ext/ahocorasick'
4
6
 
7
+ t= Time.now
8
+
5
9
  k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
6
10
 
7
- query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt")
11
+ t1= Time.now
12
+
13
+ puts "%d words added in %s seconds" % [k.size, (t1-t)]
14
+
15
+ query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
8
16
 
9
17
  results= k.search query
10
18
 
19
+ puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
20
+
21
+ exit
11
22
  results.each do | r |
12
23
  puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
13
24
  end
@@ -1,15 +1,22 @@
1
1
 
2
2
  //
3
- // (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
3
+ // (c) 2008, Aurelian Oancea < oancea at gmail dot com >
4
4
  //
5
5
  // Released under MIT-LICENSE
6
6
  //
7
7
 
8
8
  //
9
9
  // TODO: new methods?
10
+ //
10
11
  // * kwt[id] = word
11
12
  // * kwt.from_file (class instance method)
12
13
  //
14
+ // * kwt.find_each ("str") {|r| .. }
15
+ // * kwt.find_first("str")
16
+ // * kwt.find_all ("str")
17
+ //
18
+ // TODO: rename search to find_all
19
+ //
13
20
 
14
21
  #include <ruby.h>
15
22
  #include "ac.h"
@@ -196,13 +203,18 @@ rb_kwt_size(VALUE self)
196
203
  *
197
204
  * kwt.add_string("foo1$21^ 98N3 ba>Z")
198
205
  * kwt << "bar" # using the alias
199
- *
206
+ *
200
207
  * ==== Note: you can also specify the id, a number between 1 and k
201
208
  *
202
- * kwt.add_string "bar", 123
209
+ * kwt.add_string "bar", 123 # => 123
203
210
  *
204
211
  * This id should be unique in the context of the current tree.
205
212
  *
213
+ * Returns the id of the inserted object.
214
+ *
215
+ * kwt.add_string("test", 18) # => 18
216
+ * kwt.add_string("baz") # => 19
217
+ *
206
218
  */
207
219
  static VALUE
208
220
  rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
@@ -232,13 +244,16 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
232
244
  id= NUM2INT(v_id);
233
245
  }
234
246
 
247
+ // printf("[internal]==> %d\n", id);
248
+
235
249
  if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
236
250
  rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
237
251
  }
238
252
 
239
253
  kwt_data->last_id= id + 1;
240
254
  kwt_data->dictionary_size++;
241
- return self;
255
+ // printf("[internal]==> %d\n", id);
256
+ return id;
242
257
  }
243
258
 
244
259
  /*
@@ -316,7 +331,10 @@ void Init_ahocorasick() {
316
331
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
317
332
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
318
333
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
334
+
319
335
  rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
336
+ rb_define_alias(rb_cKeywordTree, "find_all", "search");
337
+
320
338
  rb_define_alias(rb_cKeywordTree, "<<", "add_string");
321
339
  rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
322
340
 
@@ -23,28 +23,25 @@ describe KeywordTree do
23
23
  after(:each) do
24
24
  @kwt= nil
25
25
  end
26
- it "should return an array" do
26
+ it "find_all should return an array" do
27
27
  @kwt << "foo"
28
- @kwt.search("bar").class.should == Array
28
+ @kwt.find_all("foo").class.should == Array
29
29
  end
30
30
 
31
31
  it "the array should contain hashes" do
32
- @kwt << "bar" << "foo"
33
- @kwt.search("foo")[0].class.should == Hash
32
+ @kwt << "bar"
33
+ @kwt << "foo"
34
+ @kwt.find_all("foo")[0].class.should == Hash
34
35
  end
35
36
 
36
- # XXX: this is subject of ...talks. no yield at this point
37
- # it "should return nil if block_given?" do
38
- # @kwt.search("foo"){|r| r[:id]}.should == nil
39
- # end
40
-
41
37
  it "should return empty array if no results" do
42
- @kwt.search("baba").should == []
38
+ @kwt.find_all("1a4a").should == []
43
39
  end
44
40
 
45
41
  it "each hash should have the required symbols values" do
46
- @kwt << "bar" << "foo"
47
- @kwt.search("foo").each do | r |
42
+ @kwt << "bar"
43
+ @kwt << "foo"
44
+ @kwt.find_all("foo").each do | r |
48
45
  r[:id].class.should == Fixnum
49
46
  r[:starts_at].class.should == Fixnum
50
47
  r[:ends_at].class.should == Fixnum
@@ -57,7 +54,7 @@ describe KeywordTree do
57
54
  # | |
58
55
  @kwt << "data"
59
56
  q= "data moved"
60
- @kwt.search(q).each do | result |
57
+ @kwt.find_all(q).each do | result |
61
58
  result[:starts_at].should == 0
62
59
  result[:ends_at].should == 4
63
60
  end
@@ -86,6 +83,14 @@ describe KeywordTree do
86
83
  (r[:ends_at]-r[:starts_at]).should == r[:value].size
87
84
  end
88
85
  end
86
+
87
+ it "even more unicode" do
88
+ @kwt << "șșt"
89
+ # 0124789
90
+ result= @kwt.search("mușștar").first
91
+ result[:starts_at].should == 2
92
+ result[:ends_at].should == result[:starts_at] + "șșt".size
93
+ end
89
94
 
90
95
  it "checks for result length" do
91
96
  @kwt << "foo"
@@ -101,7 +106,6 @@ describe KeywordTree do
101
106
  describe "Context Match vs. Exact Word Match" do
102
107
 
103
108
  before(:each) do
104
- # data, base, database
105
109
  @kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
106
110
  end
107
111
 
@@ -118,6 +122,7 @@ describe KeywordTree do
118
122
  kwt << "bar"
119
123
  kwt.size.should == 2
120
124
  end
125
+
121
126
  it "should add 2 strings with id" do
122
127
  kwt= KeywordTree.new
123
128
  kwt.add_string "foo", 1
@@ -146,6 +151,13 @@ describe KeywordTree do
146
151
  kwt.size.should == 3
147
152
  end
148
153
 
154
+ it "should return the id" do
155
+ kwt= KeywordTree.new
156
+ kwt.add_string("foo").should == 1
157
+ kwt.add_string("bar", 2008).should == 2008
158
+ kwt.add_string("kwt").should == 2009
159
+ end
160
+
149
161
  it "should add strings from file and manually" do
150
162
  kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
151
163
  kwt << "foo"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea