aurelian-ruby-ahocorasick 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/ac.h ADDED
@@ -0,0 +1,36 @@
1
+
2
/*
 * ac.h - public interface of the Aho-Corasick keyword tree used by the
 * Ruby binding in this package.
 *
 * Call order used by the binding:
 *   ac_alloc -> ac_add_string (repeated) -> ac_prep ->
 *   ac_search_init -> ac_search (repeated until NULL) -> ac_free
 *
 * The include guard deliberately avoids a leading underscore followed by
 * an uppercase letter: such identifiers are reserved for the C
 * implementation.
 */
#ifndef AHOCORASICK_AC_H
#define AHOCORASICK_AC_H

/*
 * One node of the keyword tree.
 * NOTE(review): field roles below are inferred from their names only;
 * ac.c is authoritative.
 */
typedef struct actreenode {
  char ch;                                /* presumably the character on the edge into this node */
  int matchid;                            /* presumably the id of the pattern ending here */
  struct actreenode *outlink, *faillink;
  struct actreenode *children, *sibling;
} ACTREE_NODE, *AC_TREE;

/* Matcher state: the tree itself plus the cursor of the search in progress. */
typedef struct {
  AC_TREE tree;
  int ispreprocessed, errorflag;

  int Psize;
  int *Plengths;

  char *T;                                /* text handed to ac_search_init */
  int N, c, initflag, endflag;            /* N is the length handed to ac_search_init */
  AC_TREE w, output;
#ifdef STATS
  int prep_new_edges, prep_old_edges, prep_fail_compares;
  int num_compares, num_failures, edges_traversed, outlinks_traversed;
#endif
} AC_STRUCT;

/* Creates a new, empty matcher. */
AC_STRUCT *ac_alloc(void);

/*
 * Adds pattern P of length M under the identifier id.
 * The Ruby binding treats a return value of 0 as failure.
 */
int ac_add_string(AC_STRUCT *node, char *P, int M, int id);
// int ac_del_string(AC_STRUCT *node, char *P, int M, int id);

/* Prepares the tree for searching; the binding calls it once, after the last ac_add_string. */
int ac_prep(AC_STRUCT *node);

/* Starts a new search over the N bytes at T. */
void ac_search_init(AC_STRUCT *node, char *T, int N);

/*
 * Returns a pointer to the start of the next match inside the text, or
 * NULL when there are no more matches. On a match, *length_out, *id_out
 * and *ends_at are filled in; the binding reports (*ends_at - 1) as the
 * exclusive end offset of the match.
 */
char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at);

/* Releases a matcher obtained from ac_alloc. */
void ac_free(AC_STRUCT *node);

#endif
data/ext/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
# Build script for the native extension: asks mkmf to generate the
# Makefile for the extension named "ahocorasick".
require 'mkmf'

extension_name = "ahocorasick"
create_makefile(extension_name)
4
+
@@ -0,0 +1,329 @@
1
+
2
+ //
3
+ // (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
4
+ //
5
+ // Released under MIT-LICENSE
6
+ //
7
+
8
+ //
9
+ // TODO: new methods?
10
+ // * kwt[id] = word
11
+ // * kwt.from_file (class instance method)
12
+ //
13
+
14
+ #include <ruby.h>
15
+ #include "ac.h"
16
+
17
+ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
18
+
19
+ VALUE rb_mAhoCorasick;
20
+ VALUE rb_cKeywordTree;
21
+
22
+ #define KeywordTree(obj, kwt_data) {\
23
+ Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
24
+ }
25
+
26
+ struct kwt_struct_data {
27
+ AC_STRUCT * tree;
28
+ int last_id;
29
+ int dictionary_size;
30
+ int is_frozen;
31
+ };
32
+
33
+ /*
34
+ * call-seq: initialize
35
+ *
36
+ * Creates a new KeywordTree
37
+ *
38
+ * require 'ahocorasick'
39
+ * kwt = Ahocorasick::KeywordTree.new
40
+ *
41
+ */
42
+ static VALUE
43
+ rb_kwt_init(VALUE self)
44
+ {
45
+ AC_STRUCT * tree;
46
+ struct kwt_struct_data *kwt_data;
47
+
48
+ kwt_data = ALLOC(struct kwt_struct_data);
49
+ tree = ac_alloc();
50
+ DATA_PTR(self) = kwt_data;
51
+ kwt_data->tree = tree;
52
+ kwt_data->last_id = 1;
53
+ kwt_data->dictionary_size = 0;
54
+ kwt_data->is_frozen = 0;
55
+ return self;
56
+ }
57
+
58
+ /*
59
+ * Document-method: make
60
+ * call-seq: make
61
+ *
62
+ * It freezes the current KeywordTree. After this point, the tree will not accept any new entries.
63
+ *
64
+ * ==== Note: This method is called internally by search
65
+ *
66
+ * require 'ahocorasick'
67
+ *
68
+ * kwt = Ahocorasick::KeywordTree.new
69
+ *
70
+ * kwt.add_string("one")
71
+ * kwt.add_string("two")
72
+ * kwt.make()
73
+ */
74
+ static VALUE
75
+ rb_kwt_make(VALUE self)
76
+ {
77
+ struct kwt_struct_data *kwt_data;
78
+ KeywordTree(self, kwt_data);
79
+
80
+ ac_prep( kwt_data->tree );
81
+ kwt_data->is_frozen = 1;
82
+ return self;
83
+ }
84
+
85
+ /*
86
+ * Document-method: search
87
+ * call-seq: search
88
+ *
89
+ * Search the current tree.
90
+ *
91
+ * It returns an array on hashes, e.g.
92
+ *
93
+ * [ { :id => int, :value => int, :starts_at => int, :ends_at => int}, { ... } ]
94
+ *
95
+ * Returns an empty array when the search didn't return any result.
96
+ *
97
+ * # assuming a valid KeywordTree kwt object:
98
+ * kwt.add_string("one")
99
+ * kwt.add_string("two")
100
+ *
101
+ * kwt.search( "moved two times already" ).each do | result |
102
+ * result[:id] # => 2
103
+ * result[:ends_at] # => 9
104
+ * result[:starts_at] # => 6
105
+ * result[:value] # => two
106
+ * end # => 1
107
+ *
108
+ */
109
+ static VALUE
110
+ rb_kwt_search(int argc, VALUE *argv, VALUE self)
111
+ {
112
+ char * result; // itermediate result
113
+ char * remain; // returned by ac_search, the remaing text to search
114
+ int lgt, id, ends_at; // filled in by ac_search, the id, length and ends_at position
115
+ int starts_at;
116
+ VALUE v_result; // one result, as hash
117
+ VALUE v_results; // all the results, an array
118
+ VALUE v_search; // search string, function argument
119
+ struct kwt_struct_data *kwt_data;
120
+
121
+ // one mandatory argument.
122
+ rb_scan_args(argc, argv, "1", &v_search);
123
+ // it should be string.
124
+ Check_Type(v_search, T_STRING);
125
+ // get the structure
126
+ KeywordTree(self, kwt_data);
127
+ // freeze the tree, if not already
128
+ if(kwt_data->is_frozen == 0) {
129
+ ac_prep( kwt_data->tree );
130
+ kwt_data->is_frozen = 1;
131
+ }
132
+ // prepare the return value
133
+ // v_results= rb_block_given_p()? Qnil : rb_ary_new();
134
+ v_results= rb_ary_new();
135
+ // fail quickly and return the empty array
136
+ if(kwt_data->dictionary_size == 0)
137
+ return v_results;
138
+ // prepare the search
139
+ ac_search_init(kwt_data->tree, RSTRING( v_search )->ptr, RSTRING( v_search )->len);
140
+ // loop trought the results
141
+ while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
142
+ // this is an individual result as a hash
143
+ v_result= rb_hash_new();
144
+ rb_hash_aset( v_result, sym_id, INT2FIX(id) );
145
+ rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
146
+ rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
147
+ result = (char*) malloc (sizeof(char)*lgt);
148
+ sprintf( result, "%.*s", lgt, remain);
149
+ rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
150
+
151
+ // yield this hash or, add it to the results
152
+ // if(rb_block_given_p())
153
+ // rb_yield(v_result);
154
+ // else
155
+ rb_ary_push( v_results, v_result );
156
+ free(result);
157
+ }
158
+
159
+ // TODO: maybe the Tree can be re-opened to add new items to dictionary
160
+
161
+ // return the results or nil if none
162
+ // if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
163
+ return v_results;
164
+ // } else {
165
+ // return Qnil;
166
+ // }
167
+ }
168
+
169
+
170
+ /*
171
+ * Document-method: size
172
+ * call-seq: size
173
+ *
174
+ * Returns the size of this KeywordTree
175
+ *
176
+ * kwt.add_string("foo")
177
+ * kwt.add_string("bar")
178
+ * kwt.size #=> 2
179
+ *
180
+ */
181
+ static VALUE
182
+ rb_kwt_size(VALUE self)
183
+ {
184
+ struct kwt_struct_data *kwt_data;
185
+ KeywordTree(self, kwt_data);
186
+
187
+ return INT2FIX(kwt_data->dictionary_size);
188
+ }
189
+
190
+
191
+ /*
192
+ * Document-method: add_string
193
+ * call-seq: add_string
194
+ *
195
+ * Adds a sequence to this KeywordTree.
196
+ *
197
+ * kwt.add_string("foo1$21^ 98N3 ba>Z")
198
+ * kwt << "bar" # using the alias
199
+ *
200
+ * ==== Note: you can also specify the id, a number between 1 and k
201
+ *
202
+ * kwt.add_string "bar", 123
203
+ *
204
+ * This id should be unique in the context of the current tree.
205
+ *
206
+ */
207
+ static VALUE
208
+ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
209
+ {
210
+ VALUE v_string, v_id;
211
+ struct kwt_struct_data *kwt_data;
212
+ char * string;
213
+ int id;
214
+
215
+ rb_scan_args(argc, argv, "11", &v_string, &v_id);
216
+
217
+ Check_Type(v_string, T_STRING);
218
+ string= RSTRING(v_string)->ptr;
219
+
220
+ KeywordTree(self, kwt_data);
221
+
222
+ if(kwt_data->is_frozen == 1)
223
+ rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", string);
224
+
225
+ if(v_id == Qnil) {
226
+ id = kwt_data->last_id;
227
+ } else if(TYPE(v_id) != T_FIXNUM) {
228
+ rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", RSTRING(v_id)->ptr);
229
+ } else if(NUM2INT(v_id) <= 0) {
230
+ rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
231
+ } else {
232
+ id= NUM2INT(v_id);
233
+ }
234
+
235
+ if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
236
+ rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
237
+ }
238
+
239
+ kwt_data->last_id= id + 1;
240
+ kwt_data->dictionary_size++;
241
+ return self;
242
+ }
243
+
244
+ /*
245
+ * call-seq: from_file
246
+ *
247
+ * Creates a new KeywordTree and loads the dictionary from a file
248
+ *
249
+ * % cat dict0.txt
250
+ * foo
251
+ * bar
252
+ * base
253
+ *
254
+ * k= AhoCorasick::KeywordTree.from_file "dict0.txt"
255
+ * k.search("basement").size # => 1
256
+ *
257
+ */
258
+ static VALUE
259
+ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
260
+ {
261
+
262
+ // TODO:
263
+ // * use rb_kwt_add_string
264
+ // * use rb_io* to handle the file
265
+
266
+ struct kwt_struct_data *kwt_data;
267
+ char word[1024];
268
+ int id;
269
+ VALUE self;
270
+ VALUE f_string;
271
+ FILE *dictionary;
272
+
273
+ rb_scan_args(argc, argv, "10", &f_string);
274
+
275
+ id = 0;
276
+ SafeStringValue( f_string );
277
+ self= rb_class_new_instance( 0, NULL, klass );
278
+ KeywordTree( self, kwt_data );
279
+
280
+ dictionary = fopen( RSTRING( f_string )->ptr, "r" );
281
+ if(dictionary == NULL) {
282
+ rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
283
+ }
284
+
285
+ while(fgets(word, 1024, dictionary) != NULL) {
286
+ ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
287
+ kwt_data->dictionary_size++;
288
+ }
289
+ kwt_data->last_id= id+1;
290
+ fclose(dictionary);
291
+ return self;
292
+ }
293
+
294
+ static void
295
+ rb_kwt_struct_free(struct kwt_struct_data * kwt_data)
296
+ {
297
+ ac_free(kwt_data->tree);
298
+ }
299
+
300
+ static VALUE
301
+ rb_kwt_struct_alloc(VALUE klass)
302
+ {
303
+ return Data_Wrap_Struct(klass, 0, rb_kwt_struct_free, 0);
304
+ }
305
+
306
+ /*
307
+ * Blump.
308
+ */
309
+ void Init_ahocorasick() {
310
+ rb_mAhoCorasick = rb_define_module("AhoCorasick");
311
+ rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
312
+
313
+ rb_define_alloc_func(rb_cKeywordTree, rb_kwt_struct_alloc);
314
+
315
+ rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
316
+ rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
317
+ rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
318
+ rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
319
+ rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
320
+ rb_define_alias(rb_cKeywordTree, "<<", "add_string");
321
+ rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
322
+
323
+ sym_id= ID2SYM(rb_intern("id"));
324
+ sym_value= ID2SYM(rb_intern("value"));
325
+ sym_ends_at= ID2SYM( rb_intern("ends_at") );
326
+ sym_starts_at= ID2SYM( rb_intern("starts_at") );
327
+
328
+ }
329
+
@@ -0,0 +1,183 @@
1
+ require 'ext/ahocorasick'
2
+
3
+ include AhoCorasick
4
+
5
describe KeywordTree do

  # Fixture files live in the data directory next to this spec.
  data_dir = File.dirname(__FILE__) + "/data"
  dict0    = data_dir + "/dict0.txt"
  en_words = data_dir + "/en.words"

  # Positions reported by search are byte offsets, so expectations against
  # a string's length must use its byte length (String#bytesize does not
  # exist on the oldest supported rubies, where size already counts bytes).
  byte_length = lambda { |s| s.respond_to?(:bytesize) ? s.bytesize : s.size }

  describe "How to create a new KeywordTree" do
    it "should create a new KeywordTree" do
      KeywordTree.new.class.should == KeywordTree
    end
    it "should create a new KeywordTree from a file" do
      KeywordTree.from_file(dict0).class.should == KeywordTree
    end
  end

  describe "How to search" do

    before(:each) do
      @kwt= KeywordTree.new
    end

    it "should return an array" do
      @kwt << "foo"
      @kwt.search("bar").class.should == Array
    end

    it "the array should contain hashes" do
      @kwt << "bar" << "foo"
      @kwt.search("foo")[0].class.should == Hash
    end

    # XXX: this is subject of ...talks. no yield at this point
    # it "should return nil if block_given?" do
    #   @kwt.search("foo"){|r| r[:id]}.should == nil
    # end

    it "should return empty array if no results" do
      @kwt.search("baba").should == []
    end

    it "each hash should have the required symbols values" do
      @kwt << "bar" << "foo"
      results= @kwt.search("foo")
      # without this the expectations below would pass on an empty result
      results.should_not be_empty
      results.each do | r |
        # Integer rather than Fixnum: Fixnum is gone from current rubies
        r[:id].should be_kind_of(Integer)
        r[:starts_at].should be_kind_of(Integer)
        r[:ends_at].should be_kind_of(Integer)
        r[:value].should == "foo"
      end
    end

    it "should match position" do
      # "data" covers bytes 0..3, so ends_at (exclusive) is 4
      @kwt << "data"
      q= "data moved"
      results= @kwt.search(q)
      results.should_not be_empty
      results.each do | result |
        result[:starts_at].should == 0
        result[:ends_at].should == 4
      end
    end

    it "should match position with unicode" do
      # 9 characters but 10 bytes: the expected offsets below are 10 apart
      @kwt << "bucurești"
      q= "data moved to bucurești"
      results= @kwt.search(q)
      results.should_not be_empty
      results.each do | result |
        result[:starts_at].should == 14
        result[:ends_at].should == 24
      end
    end

    it "more unicode" do
      @kwt << "expected"
      q = "moved to bucurești as expected"
      results= @kwt.search(q)
      results.should_not be_empty
      results.each do | r |
        r[:starts_at].should == 23
        r[:ends_at].should == byte_length.call(q)
        (r[:ends_at]-r[:starts_at]).should == r[:value].size
      end
    end

    it "checks for result length" do
      @kwt << "foo"
      result= @kwt.search("foo").first
      (result[:ends_at]-result[:starts_at]).should == result[:value].size
      "foo"[result[:ends_at]].should == nil
      "foo"[result[:ends_at]-1].chr.should == "o"
    end

  end

  describe "Context Match vs. Exact Word Match" do

    before(:each) do
      # data, base, database
      @kwt= KeywordTree.from_file dict0
    end

    it "should match on context" do
      @kwt.search("I've moved the data to a new database").size.should == 4
    end

  end

  describe "How to add strings" do
    it "should add 2 strings" do
      kwt= KeywordTree.new
      kwt.add_string "foo"
      kwt << "bar"
      kwt.size.should == 2
    end
    it "should add 2 strings with id" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      kwt.add_string "bar", 2
      kwt.size.should == 2
    end

    it "should raise an error when adding same id twice" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      lambda{kwt.add_string("bar", 1)}.should raise_error(RuntimeError)
    end

    it "should raise an error when not using id's > 0" do
      kwt= KeywordTree.new
      lambda{kwt.add_string("bar", -1)}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", "a")}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", 0)}.should raise_error(RuntimeError)
    end

    it "should work to add a random id" do
      kwt= KeywordTree.new
      kwt << "baz"
      kwt.add_string "foo", 1990
      kwt << "bar"
      kwt.size.should == 3
    end

    it "should add strings from file and manually" do
      kwt= KeywordTree.from_file dict0
      kwt << "foo"
      kwt.size.should == File.readlines( dict0 ).size + 1
    end

    it "should raise an error when adding new strings after the tree is frozen" do
      kwt= KeywordTree.from_file dict0
      kwt.make
      lambda{kwt << "foo"}.should raise_error(RuntimeError)
    end

  end

  describe "Benchmarks. Loading from a file" do

    it "should be fast to load a bunch of english words" do
      start= Time.now
      k= KeywordTree.from_file en_words
      # take the measurement once, before any output is produced
      load_seconds= Time.now - start
      puts "\n%d words loaded in %s seconds" % [k.size, load_seconds]
      load_seconds.should < 0.2
    end

    it "should be fast to find" do
      start= Time.now
      k= KeywordTree.from_file en_words
      load_time= Time.now
      results= k.search( File.read( data_dir + "/melville-moby_dick.txt" ) )
      search_seconds= Time.now - load_time
      puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, search_seconds]
      search_seconds.should < 1.2
    end
  end


end