aurelian-ruby-ahocorasick 0.3.1 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/ruby-ahocorasick.c +23 -35
- data/spec/ahocorasick_spec.rb +9 -9
- metadata +2 -2
data/ext/ruby-ahocorasick.c
CHANGED
@@ -15,7 +15,7 @@
|
|
15
15
|
// * kwt.find_first("str")
|
16
16
|
// * kwt.find_all ("str")
|
17
17
|
//
|
18
|
-
// TODO:
|
18
|
+
// TODO: change last_id and dictionary_size to long
|
19
19
|
//
|
20
20
|
|
21
21
|
#include <ruby.h>
|
@@ -37,6 +37,15 @@ struct kwt_struct_data {
|
|
37
37
|
int is_frozen;
|
38
38
|
};
|
39
39
|
|
40
|
+
// int
|
41
|
+
// rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
|
42
|
+
// if(ac_add_string( kwt->tree, word, size, id ) == 0)
|
43
|
+
// return 0;
|
44
|
+
// kwt->dictionary_size++;
|
45
|
+
// kwt->last_id= id+1;
|
46
|
+
// return 1;
|
47
|
+
// }
|
48
|
+
|
40
49
|
/*
|
41
50
|
* call-seq: initialize
|
42
51
|
*
|
@@ -114,7 +123,7 @@ rb_kwt_make(VALUE self)
|
|
114
123
|
*
|
115
124
|
*/
|
116
125
|
static VALUE
|
117
|
-
|
126
|
+
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
118
127
|
{
|
119
128
|
char * result; // intermediate result
|
120
129
|
char * remain; // returned by ac_search: the remaining text to search
|
@@ -137,7 +146,6 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
|
137
146
|
kwt_data->is_frozen = 1;
|
138
147
|
}
|
139
148
|
// prepare the return value
|
140
|
-
// v_results= rb_block_given_p()? Qnil : rb_ary_new();
|
141
149
|
v_results= rb_ary_new();
|
142
150
|
// fail quickly and return the empty array
|
143
151
|
if(kwt_data->dictionary_size == 0)
|
@@ -148,32 +156,19 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
|
148
156
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
149
157
|
// this is an individual result as a hash
|
150
158
|
v_result= rb_hash_new();
|
151
|
-
rb_hash_aset( v_result, sym_id,
|
159
|
+
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
152
160
|
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
153
|
-
rb_hash_aset( v_result, sym_ends_at,
|
161
|
+
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
154
162
|
result = (char*) malloc (sizeof(char)*lgt);
|
155
163
|
sprintf( result, "%.*s", lgt, remain);
|
156
164
|
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
157
|
-
|
158
|
-
// yield this hash or, add it to the results
|
159
|
-
// if(rb_block_given_p())
|
160
|
-
// rb_yield(v_result);
|
161
|
-
// else
|
162
165
|
rb_ary_push( v_results, v_result );
|
163
166
|
free(result);
|
164
167
|
}
|
165
|
-
|
166
168
|
// TODO: maybe the Tree can be re-opened to add new items to dictionary
|
167
|
-
|
168
|
-
// return the results or nil if none
|
169
|
-
// if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
|
170
169
|
return v_results;
|
171
|
-
// } else {
|
172
|
-
// return Qnil;
|
173
|
-
// }
|
174
170
|
}
|
175
171
|
|
176
|
-
|
177
172
|
/*
|
178
173
|
* Document-method: size
|
179
174
|
* call-seq: size
|
@@ -194,7 +189,6 @@ rb_kwt_size(VALUE self)
|
|
194
189
|
return INT2FIX(kwt_data->dictionary_size);
|
195
190
|
}
|
196
191
|
|
197
|
-
|
198
192
|
/*
|
199
193
|
* Document-method: add_string
|
200
194
|
* call-seq: add_string
|
@@ -228,7 +222,6 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
228
222
|
|
229
223
|
Check_Type(v_string, T_STRING);
|
230
224
|
string= RSTRING(v_string)->ptr;
|
231
|
-
|
232
225
|
KeywordTree(self, kwt_data);
|
233
226
|
|
234
227
|
if(kwt_data->is_frozen == 1)
|
@@ -244,16 +237,12 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
244
237
|
id= NUM2INT(v_id);
|
245
238
|
}
|
246
239
|
|
247
|
-
|
248
|
-
|
249
|
-
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
240
|
+
if(ac_add_string(kwt_data->tree, string, strlen(string), id) == 0)
|
250
241
|
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
251
|
-
}
|
252
242
|
|
253
243
|
kwt_data->last_id= id + 1;
|
254
244
|
kwt_data->dictionary_size++;
|
255
|
-
|
256
|
-
return id;
|
245
|
+
return INT2FIX(id);
|
257
246
|
}
|
258
247
|
|
259
248
|
/*
|
@@ -293,14 +282,14 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
|
293
282
|
KeywordTree( self, kwt_data );
|
294
283
|
|
295
284
|
dictionary = fopen( RSTRING( f_string )->ptr, "r" );
|
296
|
-
if(dictionary == NULL)
|
285
|
+
if(dictionary == NULL)
|
297
286
|
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
|
298
|
-
}
|
299
287
|
|
300
288
|
while(fgets(word, 1024, dictionary) != NULL) {
|
301
289
|
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
302
290
|
kwt_data->dictionary_size++;
|
303
291
|
}
|
292
|
+
|
304
293
|
kwt_data->last_id= id+1;
|
305
294
|
fclose(dictionary);
|
306
295
|
return self;
|
@@ -331,17 +320,16 @@ void Init_ahocorasick() {
|
|
331
320
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
332
321
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
333
322
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
323
|
+
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
334
324
|
|
335
|
-
rb_define_method(rb_cKeywordTree, "
|
336
|
-
rb_define_alias(rb_cKeywordTree, "
|
325
|
+
rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
|
326
|
+
rb_define_alias(rb_cKeywordTree, "search", "find_all");
|
337
327
|
|
338
|
-
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
339
328
|
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
340
329
|
|
341
|
-
sym_id= ID2SYM(rb_intern("id"));
|
342
|
-
sym_value= ID2SYM(rb_intern("value"));
|
343
|
-
sym_ends_at= ID2SYM( rb_intern("ends_at") );
|
330
|
+
sym_id = ID2SYM(rb_intern("id"));
|
331
|
+
sym_value = ID2SYM(rb_intern("value"));
|
332
|
+
sym_ends_at = ID2SYM( rb_intern("ends_at") );
|
344
333
|
sym_starts_at= ID2SYM( rb_intern("starts_at") );
|
345
|
-
|
346
334
|
}
|
347
335
|
|
data/spec/ahocorasick_spec.rb
CHANGED
@@ -67,7 +67,7 @@ describe KeywordTree do
|
|
67
67
|
# 01234567890123456789023
|
68
68
|
# | |
|
69
69
|
q= "data moved to bucurești"
|
70
|
-
@kwt.
|
70
|
+
@kwt.find_all(q).each do | result |
|
71
71
|
result[:starts_at].should == 14
|
72
72
|
result[:ends_at].should == 24
|
73
73
|
end
|
@@ -77,7 +77,7 @@ describe KeywordTree do
|
|
77
77
|
@kwt << "expected"
|
78
78
|
# 012345678901234578901234567890
|
79
79
|
q = "moved to bucurești as expected"
|
80
|
-
@kwt.
|
80
|
+
@kwt.find_all(q).each do | r |
|
81
81
|
r[:starts_at].should == 23
|
82
82
|
r[:ends_at].should == q.size
|
83
83
|
(r[:ends_at]-r[:starts_at]).should == r[:value].size
|
@@ -86,15 +86,15 @@ describe KeywordTree do
|
|
86
86
|
|
87
87
|
it "even more unicode" do
|
88
88
|
@kwt << "șșt"
|
89
|
-
#
|
90
|
-
result= @kwt.
|
89
|
+
# 0124789
|
90
|
+
result= @kwt.find_all("mușștar").first
|
91
91
|
result[:starts_at].should == 2
|
92
92
|
result[:ends_at].should == result[:starts_at] + "șșt".size
|
93
93
|
end
|
94
94
|
|
95
95
|
it "checks for result length" do
|
96
96
|
@kwt << "foo"
|
97
|
-
result= @kwt.
|
97
|
+
result= @kwt.find_all("foo").first
|
98
98
|
# 4 0
|
99
99
|
(result[:ends_at]-result[:starts_at]).should == result[:value].size
|
100
100
|
"foo"[result[:ends_at]].should == nil
|
@@ -110,7 +110,7 @@ describe KeywordTree do
|
|
110
110
|
end
|
111
111
|
|
112
112
|
it "should match on context" do
|
113
|
-
@kwt.
|
113
|
+
@kwt.find_all("I've moved the data to a new database").size.should == 4
|
114
114
|
end
|
115
115
|
|
116
116
|
end
|
@@ -146,8 +146,8 @@ describe KeywordTree do
|
|
146
146
|
it "should work to add a random id" do
|
147
147
|
kwt= KeywordTree.new
|
148
148
|
kwt << "baz"
|
149
|
-
kwt.add_string
|
150
|
-
kwt
|
149
|
+
kwt.add_string("foo", 1990).should == 1990
|
150
|
+
kwt.add_string("bar").should == 1991
|
151
151
|
kwt.size.should == 3
|
152
152
|
end
|
153
153
|
|
@@ -185,7 +185,7 @@ describe KeywordTree do
|
|
185
185
|
start= Time.now
|
186
186
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
187
187
|
load_time= Time.now
|
188
|
-
results= k.
|
188
|
+
results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
|
189
189
|
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
|
190
190
|
(Time.now-load_time).should < 1.2
|
191
191
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aurelian-ruby-ahocorasick
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -14,7 +14,7 @@ default_executable:
|
|
14
14
|
dependencies: []
|
15
15
|
|
16
16
|
description: Expose Aho-Corasick implementation from Strmat to Ruby.
|
17
|
-
email:
|
17
|
+
email: oancea at gmail dot com
|
18
18
|
executables: []
|
19
19
|
|
20
20
|
extensions:
|