aurelian-ruby-ahocorasick 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/ruby-ahocorasick.c +23 -35
- data/spec/ahocorasick_spec.rb +9 -9
- metadata +2 -2
data/ext/ruby-ahocorasick.c
CHANGED
@@ -15,7 +15,7 @@
|
|
15
15
|
// * kwt.find_first("str")
|
16
16
|
// * kwt.find_all ("str")
|
17
17
|
//
|
18
|
-
// TODO:
|
18
|
+
// TODO: change last_id and dictionary_size to long
|
19
19
|
//
|
20
20
|
|
21
21
|
#include <ruby.h>
|
@@ -37,6 +37,15 @@ struct kwt_struct_data {
|
|
37
37
|
int is_frozen;
|
38
38
|
};
|
39
39
|
|
40
|
+
// int
|
41
|
+
// rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
|
42
|
+
// if(ac_add_string( kwt->tree, word, size, id ) == 0)
|
43
|
+
// return 0;
|
44
|
+
// kwt->dictionary_size++;
|
45
|
+
// kwt->last_id= id+1;
|
46
|
+
// return 1;
|
47
|
+
// }
|
48
|
+
|
40
49
|
/*
|
41
50
|
* call-seq: initialize
|
42
51
|
*
|
@@ -114,7 +123,7 @@ rb_kwt_make(VALUE self)
|
|
114
123
|
*
|
115
124
|
*/
|
116
125
|
static VALUE
|
117
|
-
|
126
|
+
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
118
127
|
{
|
119
128
|
char * result; // itermediate result
|
120
129
|
char * remain; // returned by ac_search, the remaing text to search
|
@@ -137,7 +146,6 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
|
137
146
|
kwt_data->is_frozen = 1;
|
138
147
|
}
|
139
148
|
// prepare the return value
|
140
|
-
// v_results= rb_block_given_p()? Qnil : rb_ary_new();
|
141
149
|
v_results= rb_ary_new();
|
142
150
|
// fail quickly and return the empty array
|
143
151
|
if(kwt_data->dictionary_size == 0)
|
@@ -148,32 +156,19 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
|
148
156
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
149
157
|
// this is an individual result as a hash
|
150
158
|
v_result= rb_hash_new();
|
151
|
-
rb_hash_aset( v_result, sym_id,
|
159
|
+
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
152
160
|
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
153
|
-
rb_hash_aset( v_result, sym_ends_at,
|
161
|
+
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
154
162
|
result = (char*) malloc (sizeof(char)*lgt);
|
155
163
|
sprintf( result, "%.*s", lgt, remain);
|
156
164
|
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
157
|
-
|
158
|
-
// yield this hash or, add it to the results
|
159
|
-
// if(rb_block_given_p())
|
160
|
-
// rb_yield(v_result);
|
161
|
-
// else
|
162
165
|
rb_ary_push( v_results, v_result );
|
163
166
|
free(result);
|
164
167
|
}
|
165
|
-
|
166
168
|
// TODO: maybe the Tree can be re-opened to add new items to dictionary
|
167
|
-
|
168
|
-
// return the results or nil if none
|
169
|
-
// if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
|
170
169
|
return v_results;
|
171
|
-
// } else {
|
172
|
-
// return Qnil;
|
173
|
-
// }
|
174
170
|
}
|
175
171
|
|
176
|
-
|
177
172
|
/*
|
178
173
|
* Document-method: size
|
179
174
|
* call-seq: size
|
@@ -194,7 +189,6 @@ rb_kwt_size(VALUE self)
|
|
194
189
|
return INT2FIX(kwt_data->dictionary_size);
|
195
190
|
}
|
196
191
|
|
197
|
-
|
198
192
|
/*
|
199
193
|
* Document-method: add_string
|
200
194
|
* call-seq: add_string
|
@@ -228,7 +222,6 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
228
222
|
|
229
223
|
Check_Type(v_string, T_STRING);
|
230
224
|
string= RSTRING(v_string)->ptr;
|
231
|
-
|
232
225
|
KeywordTree(self, kwt_data);
|
233
226
|
|
234
227
|
if(kwt_data->is_frozen == 1)
|
@@ -244,16 +237,12 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
244
237
|
id= NUM2INT(v_id);
|
245
238
|
}
|
246
239
|
|
247
|
-
|
248
|
-
|
249
|
-
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
240
|
+
if(ac_add_string(kwt_data->tree, string, strlen(string), id) == 0)
|
250
241
|
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
251
|
-
}
|
252
242
|
|
253
243
|
kwt_data->last_id= id + 1;
|
254
244
|
kwt_data->dictionary_size++;
|
255
|
-
|
256
|
-
return id;
|
245
|
+
return INT2FIX(id);
|
257
246
|
}
|
258
247
|
|
259
248
|
/*
|
@@ -293,14 +282,14 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
|
293
282
|
KeywordTree( self, kwt_data );
|
294
283
|
|
295
284
|
dictionary = fopen( RSTRING( f_string )->ptr, "r" );
|
296
|
-
if(dictionary == NULL)
|
285
|
+
if(dictionary == NULL)
|
297
286
|
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
|
298
|
-
}
|
299
287
|
|
300
288
|
while(fgets(word, 1024, dictionary) != NULL) {
|
301
289
|
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
302
290
|
kwt_data->dictionary_size++;
|
303
291
|
}
|
292
|
+
|
304
293
|
kwt_data->last_id= id+1;
|
305
294
|
fclose(dictionary);
|
306
295
|
return self;
|
@@ -331,17 +320,16 @@ void Init_ahocorasick() {
|
|
331
320
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
332
321
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
333
322
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
323
|
+
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
334
324
|
|
335
|
-
rb_define_method(rb_cKeywordTree, "
|
336
|
-
rb_define_alias(rb_cKeywordTree, "
|
325
|
+
rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
|
326
|
+
rb_define_alias(rb_cKeywordTree, "search", "find_all");
|
337
327
|
|
338
|
-
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
339
328
|
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
340
329
|
|
341
|
-
sym_id= ID2SYM(rb_intern("id"));
|
342
|
-
sym_value= ID2SYM(rb_intern("value"));
|
343
|
-
sym_ends_at= ID2SYM( rb_intern("ends_at") );
|
330
|
+
sym_id = ID2SYM(rb_intern("id"));
|
331
|
+
sym_value = ID2SYM(rb_intern("value"));
|
332
|
+
sym_ends_at = ID2SYM( rb_intern("ends_at") );
|
344
333
|
sym_starts_at= ID2SYM( rb_intern("starts_at") );
|
345
|
-
|
346
334
|
}
|
347
335
|
|
data/spec/ahocorasick_spec.rb
CHANGED
@@ -67,7 +67,7 @@ describe KeywordTree do
|
|
67
67
|
# 01234567890123456789023
|
68
68
|
# | |
|
69
69
|
q= "data moved to bucurești"
|
70
|
-
@kwt.
|
70
|
+
@kwt.find_all(q).each do | result |
|
71
71
|
result[:starts_at].should == 14
|
72
72
|
result[:ends_at].should == 24
|
73
73
|
end
|
@@ -77,7 +77,7 @@ describe KeywordTree do
|
|
77
77
|
@kwt << "expected"
|
78
78
|
# 012345678901234578901234567890
|
79
79
|
q = "moved to bucurești as expected"
|
80
|
-
@kwt.
|
80
|
+
@kwt.find_all(q).each do | r |
|
81
81
|
r[:starts_at].should == 23
|
82
82
|
r[:ends_at].should == q.size
|
83
83
|
(r[:ends_at]-r[:starts_at]).should == r[:value].size
|
@@ -86,15 +86,15 @@ describe KeywordTree do
|
|
86
86
|
|
87
87
|
it "even more unicode" do
|
88
88
|
@kwt << "șșt"
|
89
|
-
#
|
90
|
-
result= @kwt.
|
89
|
+
# 0124789
|
90
|
+
result= @kwt.find_all("mușștar").first
|
91
91
|
result[:starts_at].should == 2
|
92
92
|
result[:ends_at].should == result[:starts_at] + "șșt".size
|
93
93
|
end
|
94
94
|
|
95
95
|
it "checks for result length" do
|
96
96
|
@kwt << "foo"
|
97
|
-
result= @kwt.
|
97
|
+
result= @kwt.find_all("foo").first
|
98
98
|
# 4 0
|
99
99
|
(result[:ends_at]-result[:starts_at]).should == result[:value].size
|
100
100
|
"foo"[result[:ends_at]].should == nil
|
@@ -110,7 +110,7 @@ describe KeywordTree do
|
|
110
110
|
end
|
111
111
|
|
112
112
|
it "should match on context" do
|
113
|
-
@kwt.
|
113
|
+
@kwt.find_all("I've moved the data to a new database").size.should == 4
|
114
114
|
end
|
115
115
|
|
116
116
|
end
|
@@ -146,8 +146,8 @@ describe KeywordTree do
|
|
146
146
|
it "should work to add a random id" do
|
147
147
|
kwt= KeywordTree.new
|
148
148
|
kwt << "baz"
|
149
|
-
kwt.add_string
|
150
|
-
kwt
|
149
|
+
kwt.add_string("foo", 1990).should == 1990
|
150
|
+
kwt.add_string("bar").should == 1991
|
151
151
|
kwt.size.should == 3
|
152
152
|
end
|
153
153
|
|
@@ -185,7 +185,7 @@ describe KeywordTree do
|
|
185
185
|
start= Time.now
|
186
186
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
187
187
|
load_time= Time.now
|
188
|
-
results= k.
|
188
|
+
results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
|
189
189
|
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
|
190
190
|
(Time.now-load_time).should < 1.2
|
191
191
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aurelian-ruby-ahocorasick
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -14,7 +14,7 @@ default_executable:
|
|
14
14
|
dependencies: []
|
15
15
|
|
16
16
|
description: Expose Aho-Corasick implementation from Strmat to Ruby.
|
17
|
-
email:
|
17
|
+
email: oancea at gmail dot com
|
18
18
|
executables: []
|
19
19
|
|
20
20
|
extensions:
|