aurelian-ruby-ahocorasick 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/ext/ac.h ADDED
@@ -0,0 +1,36 @@
1
+
2
/*
 * ac.h -- types and entry points of the Aho-Corasick matcher used by the
 * Ruby extension.
 *
 * NOTE(review): the implementation (ac.c) is not part of this header; the
 * field descriptions below are inferred from names and from how the
 * extension calls these functions -- confirm against ac.c.
 */
#ifndef AC_H   /* was _AC_H_: underscore + uppercase names are reserved for the implementation */
#define AC_H

/*
 * One node of the keyword tree.  AC_TREE is a pointer to a node.
 */
typedef struct actreenode {
  char ch;                                /* presumably the character on the edge into this node */
  int matchid;                            /* presumably the id of the pattern ending here */
  struct actreenode *outlink, *faillink;
  struct actreenode *children, *sibling;
} ACTREE_NODE, *AC_TREE;

/*
 * Matcher handle: the keyword tree plus the state of the search in progress.
 */
typedef struct {
  AC_TREE tree;
  int ispreprocessed, errorflag;

  int Psize;
  int *Plengths;

  char *T;                                /* text handed to ac_search_init() */
  int N, c, initflag, endflag;            /* N: length handed to ac_search_init() */
  AC_TREE w, output;
#ifdef STATS
  int prep_new_edges, prep_old_edges, prep_fail_compares;
  int num_compares, num_failures, edges_traversed, outlinks_traversed;
#endif
} AC_STRUCT;

/* Allocates a new matcher handle. */
AC_STRUCT *ac_alloc(void);

/*
 * Adds the M-byte pattern P under identifier id.
 * The extension treats a return value of 0 as failure.
 */
int ac_add_string(AC_STRUCT *node, char *P, int M, int id);
// int ac_del_string(AC_STRUCT *node, char *P, int M, int id);

/* Preprocesses the tree; the extension calls it before the first search. */
int ac_prep(AC_STRUCT *node);

/* Starts a search over the N bytes at T. */
void ac_search_init(AC_STRUCT *node, char *T, int N);

/*
 * Returns a pointer to the next match, or NULL when there are no more.
 * Fills in the match length, the pattern id and the end position.
 */
char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at);

/* Releases a handle obtained from ac_alloc(). */
void ac_free(AC_STRUCT *node);

#endif /* AC_H */
data/ext/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
# Build script for the native extension: mkmf writes the Makefile used to
# compile the C sources that sit next to this file.
require "mkmf"

# Name passed to mkmf for the Makefile it generates.
extension_name = "ahocorasick"
create_makefile(extension_name)
4
+
@@ -0,0 +1,329 @@
1
+
2
+ //
3
+ // (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
4
+ //
5
+ // Released under MIT-LICENSE
6
+ //
7
+
8
+ //
9
+ // TODO: new methods?
10
+ // * kwt[id] = word
11
+ // * kwt.from_file (class instance method)
12
+ //
13
+
14
+ #include <ruby.h>
15
+ #include "ac.h"
16
+
17
+ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
18
+
19
+ VALUE rb_mAhoCorasick;
20
+ VALUE rb_cKeywordTree;
21
+
22
+ #define KeywordTree(obj, kwt_data) {\
23
+ Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
24
+ }
25
+
26
+ struct kwt_struct_data {
27
+ AC_STRUCT * tree;
28
+ int last_id;
29
+ int dictionary_size;
30
+ int is_frozen;
31
+ };
32
+
33
+ /*
34
+ * call-seq: initialize
35
+ *
36
+ * Creates a new KeywordTree
37
+ *
38
+ * require 'ahocorasick'
39
+ * kwt = Ahocorasick::KeywordTree.new
40
+ *
41
+ */
42
+ static VALUE
43
+ rb_kwt_init(VALUE self)
44
+ {
45
+ AC_STRUCT * tree;
46
+ struct kwt_struct_data *kwt_data;
47
+
48
+ kwt_data = ALLOC(struct kwt_struct_data);
49
+ tree = ac_alloc();
50
+ DATA_PTR(self) = kwt_data;
51
+ kwt_data->tree = tree;
52
+ kwt_data->last_id = 1;
53
+ kwt_data->dictionary_size = 0;
54
+ kwt_data->is_frozen = 0;
55
+ return self;
56
+ }
57
+
58
+ /*
59
+ * Document-method: make
60
+ * call-seq: make
61
+ *
62
+ * It freezes the current KeywordTree. After this point, the tree will not accept any new entries.
63
+ *
64
+ * ==== Note: This method is called internally by search
65
+ *
66
+ * require 'ahocorasick'
67
+ *
68
+ * kwt = Ahocorasick::KeywordTree.new
69
+ *
70
+ * kwt.add_string("one")
71
+ * kwt.add_string("two")
72
+ * kwt.make()
73
+ */
74
+ static VALUE
75
+ rb_kwt_make(VALUE self)
76
+ {
77
+ struct kwt_struct_data *kwt_data;
78
+ KeywordTree(self, kwt_data);
79
+
80
+ ac_prep( kwt_data->tree );
81
+ kwt_data->is_frozen = 1;
82
+ return self;
83
+ }
84
+
85
+ /*
86
+ * Document-method: search
87
+ * call-seq: search
88
+ *
89
+ * Search the current tree.
90
+ *
91
+ * It returns an array on hashes, e.g.
92
+ *
93
+ * [ { :id => int, :value => int, :starts_at => int, :ends_at => int}, { ... } ]
94
+ *
95
+ * Returns an empty array when the search didn't return any result.
96
+ *
97
+ * # assuming a valid KeywordTree kwt object:
98
+ * kwt.add_string("one")
99
+ * kwt.add_string("two")
100
+ *
101
+ * kwt.search( "moved two times already" ).each do | result |
102
+ * result[:id] # => 2
103
+ * result[:ends_at] # => 9
104
+ * result[:starts_at] # => 6
105
+ * result[:value] # => two
106
+ * end # => 1
107
+ *
108
+ */
109
+ static VALUE
110
+ rb_kwt_search(int argc, VALUE *argv, VALUE self)
111
+ {
112
+ char * result; // itermediate result
113
+ char * remain; // returned by ac_search, the remaing text to search
114
+ int lgt, id, ends_at; // filled in by ac_search, the id, length and ends_at position
115
+ int starts_at;
116
+ VALUE v_result; // one result, as hash
117
+ VALUE v_results; // all the results, an array
118
+ VALUE v_search; // search string, function argument
119
+ struct kwt_struct_data *kwt_data;
120
+
121
+ // one mandatory argument.
122
+ rb_scan_args(argc, argv, "1", &v_search);
123
+ // it should be string.
124
+ Check_Type(v_search, T_STRING);
125
+ // get the structure
126
+ KeywordTree(self, kwt_data);
127
+ // freeze the tree, if not already
128
+ if(kwt_data->is_frozen == 0) {
129
+ ac_prep( kwt_data->tree );
130
+ kwt_data->is_frozen = 1;
131
+ }
132
+ // prepare the return value
133
+ // v_results= rb_block_given_p()? Qnil : rb_ary_new();
134
+ v_results= rb_ary_new();
135
+ // fail quickly and return the empty array
136
+ if(kwt_data->dictionary_size == 0)
137
+ return v_results;
138
+ // prepare the search
139
+ ac_search_init(kwt_data->tree, RSTRING( v_search )->ptr, RSTRING( v_search )->len);
140
+ // loop trought the results
141
+ while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
142
+ // this is an individual result as a hash
143
+ v_result= rb_hash_new();
144
+ rb_hash_aset( v_result, sym_id, INT2FIX(id) );
145
+ rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
146
+ rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
147
+ result = (char*) malloc (sizeof(char)*lgt);
148
+ sprintf( result, "%.*s", lgt, remain);
149
+ rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
150
+
151
+ // yield this hash or, add it to the results
152
+ // if(rb_block_given_p())
153
+ // rb_yield(v_result);
154
+ // else
155
+ rb_ary_push( v_results, v_result );
156
+ free(result);
157
+ }
158
+
159
+ // TODO: maybe the Tree can be re-opened to add new items to dictionary
160
+
161
+ // return the results or nil if none
162
+ // if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
163
+ return v_results;
164
+ // } else {
165
+ // return Qnil;
166
+ // }
167
+ }
168
+
169
+
170
+ /*
171
+ * Document-method: size
172
+ * call-seq: size
173
+ *
174
+ * Returns the size of this KeywordTree
175
+ *
176
+ * kwt.add_string("foo")
177
+ * kwt.add_string("bar")
178
+ * kwt.size #=> 2
179
+ *
180
+ */
181
+ static VALUE
182
+ rb_kwt_size(VALUE self)
183
+ {
184
+ struct kwt_struct_data *kwt_data;
185
+ KeywordTree(self, kwt_data);
186
+
187
+ return INT2FIX(kwt_data->dictionary_size);
188
+ }
189
+
190
+
191
+ /*
192
+ * Document-method: add_string
193
+ * call-seq: add_string
194
+ *
195
+ * Adds a sequence to this KeywordTree.
196
+ *
197
+ * kwt.add_string("foo1$21^ 98N3 ba>Z")
198
+ * kwt << "bar" # using the alias
199
+ *
200
+ * ==== Note: you can also specify the id, a number between 1 and k
201
+ *
202
+ * kwt.add_string "bar", 123
203
+ *
204
+ * This id should be unique in the context of the current tree.
205
+ *
206
+ */
207
+ static VALUE
208
+ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
209
+ {
210
+ VALUE v_string, v_id;
211
+ struct kwt_struct_data *kwt_data;
212
+ char * string;
213
+ int id;
214
+
215
+ rb_scan_args(argc, argv, "11", &v_string, &v_id);
216
+
217
+ Check_Type(v_string, T_STRING);
218
+ string= RSTRING(v_string)->ptr;
219
+
220
+ KeywordTree(self, kwt_data);
221
+
222
+ if(kwt_data->is_frozen == 1)
223
+ rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", string);
224
+
225
+ if(v_id == Qnil) {
226
+ id = kwt_data->last_id;
227
+ } else if(TYPE(v_id) != T_FIXNUM) {
228
+ rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", RSTRING(v_id)->ptr);
229
+ } else if(NUM2INT(v_id) <= 0) {
230
+ rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
231
+ } else {
232
+ id= NUM2INT(v_id);
233
+ }
234
+
235
+ if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
236
+ rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
237
+ }
238
+
239
+ kwt_data->last_id= id + 1;
240
+ kwt_data->dictionary_size++;
241
+ return self;
242
+ }
243
+
244
+ /*
245
+ * call-seq: from_file
246
+ *
247
+ * Creates a new KeywordTree and loads the dictionary from a file
248
+ *
249
+ * % cat dict0.txt
250
+ * foo
251
+ * bar
252
+ * base
253
+ *
254
+ * k= AhoCorasick::KeywordTree.from_file "dict0.txt"
255
+ * k.search("basement").size # => 1
256
+ *
257
+ */
258
+ static VALUE
259
+ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
260
+ {
261
+
262
+ // TODO:
263
+ // * use rb_kwt_add_string
264
+ // * use rb_io* to handle the file
265
+
266
+ struct kwt_struct_data *kwt_data;
267
+ char word[1024];
268
+ int id;
269
+ VALUE self;
270
+ VALUE f_string;
271
+ FILE *dictionary;
272
+
273
+ rb_scan_args(argc, argv, "10", &f_string);
274
+
275
+ id = 0;
276
+ SafeStringValue( f_string );
277
+ self= rb_class_new_instance( 0, NULL, klass );
278
+ KeywordTree( self, kwt_data );
279
+
280
+ dictionary = fopen( RSTRING( f_string )->ptr, "r" );
281
+ if(dictionary == NULL) {
282
+ rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
283
+ }
284
+
285
+ while(fgets(word, 1024, dictionary) != NULL) {
286
+ ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
287
+ kwt_data->dictionary_size++;
288
+ }
289
+ kwt_data->last_id= id+1;
290
+ fclose(dictionary);
291
+ return self;
292
+ }
293
+
294
+ static void
295
+ rb_kwt_struct_free(struct kwt_struct_data * kwt_data)
296
+ {
297
+ ac_free(kwt_data->tree);
298
+ }
299
+
300
+ static VALUE
301
+ rb_kwt_struct_alloc(VALUE klass)
302
+ {
303
+ return Data_Wrap_Struct(klass, 0, rb_kwt_struct_free, 0);
304
+ }
305
+
306
+ /*
307
+ * Blump.
308
+ */
309
+ void Init_ahocorasick() {
310
+ rb_mAhoCorasick = rb_define_module("AhoCorasick");
311
+ rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
312
+
313
+ rb_define_alloc_func(rb_cKeywordTree, rb_kwt_struct_alloc);
314
+
315
+ rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
316
+ rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
317
+ rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
318
+ rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
319
+ rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
320
+ rb_define_alias(rb_cKeywordTree, "<<", "add_string");
321
+ rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
322
+
323
+ sym_id= ID2SYM(rb_intern("id"));
324
+ sym_value= ID2SYM(rb_intern("value"));
325
+ sym_ends_at= ID2SYM( rb_intern("ends_at") );
326
+ sym_starts_at= ID2SYM( rb_intern("starts_at") );
327
+
328
+ }
329
+
@@ -0,0 +1,183 @@
1
+ require 'ext/ahocorasick'
2
+
3
+ include AhoCorasick
4
+
5
# Specs for AhoCorasick::KeywordTree.
#
# Positions in search results are byte offsets: :starts_at is the offset of
# the first matched byte and :ends_at the offset just past the last one, so
# ends_at - starts_at equals the byte length of :value.
describe KeywordTree do

  describe "How to create a new KeywordTree" do
    it "should create a new KeywordTree" do
      KeywordTree.new.class.should == KeywordTree
    end

    # used to share its description with the example above, which made
    # failure reports ambiguous
    it "should create a new KeywordTree from a file" do
      KeywordTree.from_file( File.dirname(__FILE__) + "/data/dict0.txt").class.should == KeywordTree
    end
  end

  describe "How to search" do

    before(:each) do
      @kwt= KeywordTree.new
    end

    # XXX: is this useful?
    after(:each) do
      @kwt= nil
    end

    it "should return an array" do
      @kwt << "foo"
      @kwt.search("bar").class.should == Array
    end

    it "the array should contain hashes" do
      @kwt << "bar" << "foo"
      @kwt.search("foo")[0].class.should == Hash
    end

    # XXX: this is subject of ...talks. no yield at this point
    # it "should return nil if block_given?" do
    #   @kwt.search("foo"){|r| r[:id]}.should == nil
    # end

    it "should return empty array if no results" do
      @kwt.search("baba").should == []
    end

    it "each hash should have the required symbols values" do
      @kwt << "bar" << "foo"
      @kwt.search("foo").each do | r |
        r[:id].class.should == Fixnum
        r[:starts_at].class.should == Fixnum
        r[:ends_at].class.should == Fixnum
        r[:value].should == "foo"
      end
    end

    it "should match position" do
      # "data" covers bytes 0..3 of the text, so the match ends at offset 4
      @kwt << "data"
      q= "data moved"
      @kwt.search(q).each do | result |
        result[:starts_at].should == 0
        result[:ends_at].should == 4
      end
    end

    it "should match position with unicode" do
      # "bucurești" is 9 characters but 10 bytes in UTF-8 (ș takes two),
      # and it follows the 14 bytes of "data moved to "
      @kwt << "bucurești"
      q= "data moved to bucurești"
      @kwt.search(q).each do | result |
        result[:starts_at].should == 14
        result[:ends_at].should == 24
      end
    end

    it "more unicode" do
      @kwt << "expected"
      q = "moved to bucurești as expected"
      # Offsets are counted in bytes; String#size counts characters on
      # Ruby 1.9+, so compare against the byte length where it is available.
      byte_length = q.respond_to?(:bytesize) ? q.bytesize : q.size
      @kwt.search(q).each do | r |
        r[:starts_at].should == 23
        r[:ends_at].should == byte_length
        (r[:ends_at]-r[:starts_at]).should == r[:value].size
      end
    end

    it "checks for result length" do
      @kwt << "foo"
      result= @kwt.search("foo").first
      # ends_at is 3 and starts_at is 0 here
      (result[:ends_at]-result[:starts_at]).should == result[:value].size
      "foo"[result[:ends_at]].should == nil
      "foo"[result[:ends_at]-1].chr.should == "o"
    end

  end

  describe "Context Match vs. Exact Word Match" do

    before(:each) do
      # data, base, database
      @kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
    end

    it "should match on context" do
      @kwt.search("I've moved the data to a new database").size.should == 4
    end

  end

  describe "How to add strings" do
    it "should add 2 strings" do
      kwt= KeywordTree.new
      kwt.add_string "foo"
      kwt << "bar"
      kwt.size.should == 2
    end

    it "should add 2 strings with id" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      kwt.add_string "bar", 2
      kwt.size.should == 2
    end

    it "should rise an error when adding same id twice" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      lambda{kwt.add_string("bar", 1)}.should raise_error(RuntimeError)
    end

    it "should raise an error when not using id's > 0" do
      kwt= KeywordTree.new
      lambda{kwt.add_string("bar", -1)}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", "a")}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", 0)}.should raise_error(RuntimeError)
    end

    it "should work to add a random id" do
      kwt= KeywordTree.new
      kwt << "baz"
      kwt.add_string "foo", 1990
      kwt << "bar"
      kwt.size.should == 3
    end

    it "should add strings from file and manually" do
      kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
      kwt << "foo"
      kwt.size.should == File.readlines( File.dirname(__FILE__) + "/data/dict0.txt" ).size + 1
    end

    it "should raise an error when adding new strings after the tree is frozen" do
      kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
      kwt.make
      lambda{kwt << "foo"}.should raise_error(RuntimeError)
    end

  end

  # NOTE(review): the thresholds below are wall-clock limits and depend on
  # the machine running the specs.
  describe "Benchmarks. Loading from a file" do

    it "should be fast to load a bunch of english words" do
      start= Time.now
      k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
      # take the measurement once, before printing, so the report and the
      # assertion use the same number
      elapsed= Time.now - start
      puts "\n%d words loaded in %s seconds" % [k.size, elapsed]
      elapsed.should < 0.2
    end

    it "should be fast to find" do
      start= Time.now
      k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
      load_time= Time.now
      results= k.search( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
      search_elapsed= Time.now - load_time
      puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, search_elapsed]
      search_elapsed.should < 1.2
    end
  end


end