aurelian-ruby-ahocorasick 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,28 +1,61 @@
1
+
2
+ h1. This is a work in progress.
3
+
1
4
  h3. Introduction
2
5
 
3
6
  This library is a ruby extension, a wrapper around the "Aho-Corasick":http://en.wikipedia.org/wiki/Aho-Corasick_algorithm implementation in C, found in "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html package.
4
7
 
5
8
  The source code (ac.c and ac.h) was "adapted" from Strmat. In fact, I've changed only 3-4 lines of code from the original implementation so it will fit my needs: search needed to return the current position in the searched string.
6
9
 
7
- h3. What's the idea?
10
+ h3. Okay, so what's the idea?
11
+
12
+ Having a dictionary of known sentences (note: not *words*!), this kick ass algorithm can find individual patterns in an incoming stream of data. Kinda Fast.
13
+
14
+ The algorithm has 2 stages: in the first, an internal tree is built from the given dictionary; the search itself is left to the second stage.
15
+
16
+ h3. Okay, so where can I use this?
8
17
 
9
- Having a dictionary of known sentences, how can I find individual patterns in an incoming stream of data? Fast.
18
+ Well, you can do some crazy things with it: you can look up DNA patterns or maybe analyze network sequences (read: strange and maybe proprietary network protocols), or do domestic stuff like building contextual links on your blog posts to enrich your users' experience.
10
19
 
11
- h1. TBD
20
+ h3. Okay, so how can I install it?
21
+
22
+ If you don't have the GitHub gem sources added, type:
12
23
 
13
24
  <pre>
14
- [aurelian@stalingrad ext]$ time ./dict.rb
15
- 110196
16
- 711
25
+ gem sources -a http://gems.github.com
26
+ </pre>
27
+
28
+ then,
17
29
 
18
- real 0m0.538s
19
- user 0m0.435s
20
- sys 0m0.036s
30
+ <pre>
31
+ gem install aurelian-ruby-ahocorasick
21
32
  </pre>
22
33
 
23
- h3. Additional Reading / Implementations
34
+ h5. Note
35
+
36
+ It's known to work - compile - install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
24
37
 
25
- Other suffix - tree
38
+ h3. Okay, so how do I use it?
39
+
40
+ <pre>
41
+ require 'ahocorasick'
42
+
43
+ keyword_tree= AhoCorasick::KeywordTree.new # creates a new tree
44
+ keyword_tree.add_string( "foo-- Z@!bar" ) # adds a keyword to the tree
45
+ keyword_tree.add_string( "cervantes" ) # even more
46
+
47
+ results= keyword_tree.find_all( "1011000129 foo-- Z@!bar761 ! 001211 6xU" ).each do | result |
48
+ result[:value] # => "foo-- Z@!bar"
49
+ result[:starts_at] # => 11
50
+ result[:ends_at] # => 23
51
+ result[:id] # => 1
52
+ end
53
+
54
+ </pre>
55
+
56
+ h3. Additional Reading
57
+
58
+ Other suffix - tree implementations:
26
59
 
27
60
  * "Strmat":http://www.cs.ucdavis.edu/~gusfield/strmat.html
28
61
  * Pytst / Ruby-Pytst
@@ -30,7 +63,8 @@ Other suffix - tree
30
63
  * "Keyword Prospector":http://latimes.rubyforge.org/keyword_prospector/rdoc/
31
64
  * "libstree":http://www.cl.cam.ac.uk/~cpk25/libstree/
32
65
 
33
- --
66
+ h3. License
67
+
68
+ (c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
34
69
 
35
- (c) 2008 - Aurelian Oancea, < aurelian at locknet . ro >
36
70
  released under MIT-LICENCE
data/examples/dict.rb CHANGED
@@ -1,13 +1,24 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'time'
4
+
3
5
  require File.dirname(__FILE__) + '/../ext/ahocorasick'
4
6
 
7
+ t= Time.now
8
+
5
9
  k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
6
10
 
7
- query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt")
11
+ t1= Time.now
12
+
13
+ puts "%d words added in %s seconds" % [k.size, (t1-t)]
14
+
15
+ query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
8
16
 
9
17
  results= k.search query
10
18
 
19
+ puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
20
+
21
+ exit
11
22
  results.each do | r |
12
23
  puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
13
24
  end
@@ -1,15 +1,22 @@
1
1
 
2
2
  //
3
- // (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
3
+ // (c) 2008, Aurelian Oancea < oancea at gmail dot com >
4
4
  //
5
5
  // Released under MIT-LICENSE
6
6
  //
7
7
 
8
8
  //
9
9
  // TODO: new methods?
10
+ //
10
11
  // * kwt[id] = word
11
12
  // * kwt.from_file (class instance method)
12
13
  //
14
+ // * kwt.find_each ("str") {|r| .. }
15
+ // * kwt.find_first("str")
16
+ // * kwt.find_all ("str")
17
+ //
18
+ // TODO: rename search to find_all
19
+ //
13
20
 
14
21
  #include <ruby.h>
15
22
  #include "ac.h"
@@ -196,13 +203,18 @@ rb_kwt_size(VALUE self)
196
203
  *
197
204
  * kwt.add_string("foo1$21^ 98N3 ba>Z")
198
205
  * kwt << "bar" # using the alias
199
- *
206
+ *
200
207
  * ==== Note: you can also specify the id, a number between 1 and k
201
208
  *
202
- * kwt.add_string "bar", 123
209
+ * kwt.add_string "bar", 123 # => 123
203
210
  *
204
211
  * This id should be unique in the context of the current tree.
205
212
  *
213
+ * Returns the id of the inserted object.
214
+ *
215
+ * kwt.add_string("test", 18) # => 18
216
+ * kwt.add_string("baz") # => 19
217
+ *
206
218
  */
207
219
  static VALUE
208
220
  rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
@@ -232,13 +244,16 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
232
244
  id= NUM2INT(v_id);
233
245
  }
234
246
 
247
+ // printf("[internal]==> %d\n", id);
248
+
235
249
  if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
236
250
  rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
237
251
  }
238
252
 
239
253
  kwt_data->last_id= id + 1;
240
254
  kwt_data->dictionary_size++;
241
- return self;
255
+ // printf("[internal]==> %d\n", id);
256
+ return id;
242
257
  }
243
258
 
244
259
  /*
@@ -316,7 +331,10 @@ void Init_ahocorasick() {
316
331
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
317
332
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
318
333
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
334
+
319
335
  rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
336
+ rb_define_alias(rb_cKeywordTree, "find_all", "search");
337
+
320
338
  rb_define_alias(rb_cKeywordTree, "<<", "add_string");
321
339
  rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
322
340
 
@@ -23,28 +23,25 @@ describe KeywordTree do
23
23
  after(:each) do
24
24
  @kwt= nil
25
25
  end
26
- it "should return an array" do
26
+ it "find_all should return an array" do
27
27
  @kwt << "foo"
28
- @kwt.search("bar").class.should == Array
28
+ @kwt.find_all("foo").class.should == Array
29
29
  end
30
30
 
31
31
  it "the array should contain hashes" do
32
- @kwt << "bar" << "foo"
33
- @kwt.search("foo")[0].class.should == Hash
32
+ @kwt << "bar"
33
+ @kwt << "foo"
34
+ @kwt.find_all("foo")[0].class.should == Hash
34
35
  end
35
36
 
36
- # XXX: this is subject of ...talks. no yield at this point
37
- # it "should return nil if block_given?" do
38
- # @kwt.search("foo"){|r| r[:id]}.should == nil
39
- # end
40
-
41
37
  it "should return empty array if no results" do
42
- @kwt.search("baba").should == []
38
+ @kwt.find_all("1a4a").should == []
43
39
  end
44
40
 
45
41
  it "each hash should have the required symbols values" do
46
- @kwt << "bar" << "foo"
47
- @kwt.search("foo").each do | r |
42
+ @kwt << "bar"
43
+ @kwt << "foo"
44
+ @kwt.find_all("foo").each do | r |
48
45
  r[:id].class.should == Fixnum
49
46
  r[:starts_at].class.should == Fixnum
50
47
  r[:ends_at].class.should == Fixnum
@@ -57,7 +54,7 @@ describe KeywordTree do
57
54
  # | |
58
55
  @kwt << "data"
59
56
  q= "data moved"
60
- @kwt.search(q).each do | result |
57
+ @kwt.find_all(q).each do | result |
61
58
  result[:starts_at].should == 0
62
59
  result[:ends_at].should == 4
63
60
  end
@@ -86,6 +83,14 @@ describe KeywordTree do
86
83
  (r[:ends_at]-r[:starts_at]).should == r[:value].size
87
84
  end
88
85
  end
86
+
87
+ it "even more unicode" do
88
+ @kwt << "șșt"
89
+ # 0124789
90
+ result= @kwt.search("mușștar").first
91
+ result[:starts_at].should == 2
92
+ result[:ends_at].should == result[:starts_at] + "șșt".size
93
+ end
89
94
 
90
95
  it "checks for result length" do
91
96
  @kwt << "foo"
@@ -101,7 +106,6 @@ describe KeywordTree do
101
106
  describe "Context Match vs. Exact Word Match" do
102
107
 
103
108
  before(:each) do
104
- # data, base, database
105
109
  @kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
106
110
  end
107
111
 
@@ -118,6 +122,7 @@ describe KeywordTree do
118
122
  kwt << "bar"
119
123
  kwt.size.should == 2
120
124
  end
125
+
121
126
  it "should add 2 strings with id" do
122
127
  kwt= KeywordTree.new
123
128
  kwt.add_string "foo", 1
@@ -146,6 +151,13 @@ describe KeywordTree do
146
151
  kwt.size.should == 3
147
152
  end
148
153
 
154
+ it "should return the id" do
155
+ kwt= KeywordTree.new
156
+ kwt.add_string("foo").should == 1
157
+ kwt.add_string("bar", 2008).should == 2008
158
+ kwt.add_string("kwt").should == 2009
159
+ end
160
+
149
161
  it "should add strings from file and manually" do
150
162
  kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
151
163
  kwt << "foo"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea