aurelian-ruby-ahocorasick 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +2 -2
- data/examples/dict.rb +14 -12
- data/examples/elev.rb +4 -2
- data/examples/test.rb +5 -1
- data/ext/ahocorasick/ruby-ahocorasick.c +63 -12
- data/lib/ahocorasick.rb +9 -0
- metadata +1 -1
data/README.textile
CHANGED
@@ -57,7 +57,7 @@ $ gem install ruby-ahocorasick
|
|
57
57
|
|
58
58
|
h4. Notes
|
59
59
|
|
60
|
-
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc
|
60
|
+
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc.
|
61
61
|
Unfortunately I don't have a Windows PC around nor the required knowledge about Microsoft compilers.
|
62
62
|
|
63
63
|
|
@@ -102,4 +102,4 @@ h2. License
|
|
102
102
|
|
103
103
|
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
104
104
|
|
105
|
-
released under MIT-LICENCE
|
105
|
+
released under MIT-LICENCE
|
data/examples/dict.rb
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
4
6
|
|
5
|
-
require
|
7
|
+
require "ahocorasick"
|
8
|
+
require "time"
|
6
9
|
|
7
10
|
t= Time.now
|
8
|
-
|
9
11
|
k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
|
10
|
-
|
11
12
|
t1= Time.now
|
12
|
-
|
13
|
-
puts "%d words added in %s seconds" % [k.size, (t1-t)]
|
13
|
+
puts "==> %d words added in %s seconds" % [k.size, (t1-t)]
|
14
14
|
|
15
15
|
query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
|
16
|
-
|
17
16
|
results= k.search query
|
17
|
+
puts "==> took %s seconds to find %d results in a streem with %d charachters" % \
|
18
|
+
[(Time.now-t1), results.size, query.size]
|
18
19
|
|
19
|
-
puts "
|
20
|
-
|
21
|
-
|
22
|
-
results.each do | r |
|
23
|
-
puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
|
20
|
+
puts "==> 20 results"
|
21
|
+
results[0..20].each do | r |
|
22
|
+
puts "-- #{query[r[:starts_at]].chr}..#{query[r[:ends_at]-1].chr} => #{r[:value]}"
|
24
23
|
end
|
25
24
|
|
25
|
+
puts "==> query was -first 100 chars:"
|
26
|
+
puts query[0..100]
|
27
|
+
|
data/examples/elev.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
4
6
|
|
5
|
-
require
|
7
|
+
require "ahocorasick"
|
6
8
|
|
7
9
|
k= AhoCorasick::KeywordTree.new
|
8
10
|
|
data/examples/test.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
6
|
+
|
7
|
+
require "ahocorasick"
|
4
8
|
|
5
9
|
k= AhoCorasick::KeywordTree.new
|
6
10
|
|
@@ -25,6 +25,7 @@ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
|
|
25
25
|
|
26
26
|
VALUE rb_mAhoCorasick;
|
27
27
|
VALUE rb_cKeywordTree;
|
28
|
+
VALUE rb_cResultFilter;
|
28
29
|
|
29
30
|
#define KeywordTree(obj, kwt_data) {\
|
30
31
|
Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
|
@@ -37,14 +38,26 @@ struct kwt_struct_data {
|
|
37
38
|
int is_frozen;
|
38
39
|
};
|
39
40
|
|
40
|
-
//
|
41
|
-
//
|
42
|
-
//
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
41
|
+
//
|
42
|
+
// ResultFilter interface
|
43
|
+
//
|
44
|
+
|
45
|
+
static VALUE
|
46
|
+
rb_rf_init(VALUE self) {
|
47
|
+
return self;
|
48
|
+
}
|
49
|
+
static VALUE
|
50
|
+
rb_rf_valid(int argc, VALUE *argv, VALUE self) {
|
51
|
+
VALUE result;
|
52
|
+
VALUE remain;
|
53
|
+
rb_scan_args(argc, argv, "20", &result, &remain);
|
54
|
+
rb_raise(rb_eNotImpError, "Method AhoCorasick::ResultFilter.valid?(<Hash> result, <String> remain) should be implemented in child classes.");
|
55
|
+
return Qtrue;
|
56
|
+
}
|
57
|
+
|
58
|
+
//
|
59
|
+
// ~ResultFilter
|
60
|
+
//
|
48
61
|
|
49
62
|
/*
|
50
63
|
* call-seq: initialize
|
@@ -68,6 +81,7 @@ rb_kwt_init(VALUE self)
|
|
68
81
|
kwt_data->last_id = 1;
|
69
82
|
kwt_data->dictionary_size = 0;
|
70
83
|
kwt_data->is_frozen = 0;
|
84
|
+
rb_iv_set( self, "@filter", Qnil );
|
71
85
|
return self;
|
72
86
|
}
|
73
87
|
|
@@ -135,6 +149,7 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
135
149
|
int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
136
150
|
VALUE v_result; // one result, as hash
|
137
151
|
VALUE v_results; // all the results, an array
|
152
|
+
VALUE filter; // filter to be applied to results
|
138
153
|
|
139
154
|
VALUE v_search; // search string, function argument
|
140
155
|
struct kwt_struct_data *kwt_data;
|
@@ -160,6 +175,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
160
175
|
return v_results;
|
161
176
|
// prepare the search
|
162
177
|
ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
|
178
|
+
// get the filter
|
179
|
+
filter= rb_iv_get(self, "@filter");
|
163
180
|
// loop through the results
|
164
181
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
165
182
|
// this is an individual result as a hash
|
@@ -168,7 +185,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
168
185
|
rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
|
169
186
|
rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
|
170
187
|
rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
|
171
|
-
|
188
|
+
if (filter == Qnil || rb_funcall( filter, rb_intern("valid?"), 2, v_result, rb_str_new(remain, (long)strlen(remain)) )!=Qfalse)
|
189
|
+
rb_ary_push( v_results, v_result );
|
172
190
|
}
|
173
191
|
// reopen the tree
|
174
192
|
kwt_data->is_frozen= 0;
|
@@ -250,6 +268,32 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
250
268
|
return INT2FIX(id);
|
251
269
|
}
|
252
270
|
|
271
|
+
static VALUE
|
272
|
+
rb_kwt_set_filter(int argc, VALUE *argv, VALUE self) {
|
273
|
+
struct kwt_struct_data *kwt_data;
|
274
|
+
VALUE filter;
|
275
|
+
|
276
|
+
rb_scan_args(argc, argv, "10", &filter);
|
277
|
+
|
278
|
+
if(rb_obj_is_kind_of(filter, rb_cResultFilter) == 0)
|
279
|
+
rb_raise(rb_eTypeError, "Type mismatch: required %s, %s given.", rb_class2name(rb_cResultFilter), rb_class2name(CLASS_OF(filter)));
|
280
|
+
|
281
|
+
KeywordTree( self, kwt_data );
|
282
|
+
rb_iv_set( self, "@filter", filter );
|
283
|
+
|
284
|
+
return filter;
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_kwt_get_filter(VALUE self) {
|
289
|
+
VALUE filter;
|
290
|
+
struct kwt_struct_data *kwt_data;
|
291
|
+
KeywordTree( self, kwt_data );
|
292
|
+
|
293
|
+
filter= rb_iv_get(self, "@filter");
|
294
|
+
return filter;
|
295
|
+
}
|
296
|
+
|
253
297
|
/*
|
254
298
|
* call-seq: from_file
|
255
299
|
*
|
@@ -323,17 +367,24 @@ void Init_native() {
|
|
323
367
|
rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
|
324
368
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
325
369
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
370
|
+
rb_define_method(rb_cKeywordTree, "filter=", rb_kwt_set_filter, -1);
|
371
|
+
rb_define_method(rb_cKeywordTree, "filter", rb_kwt_get_filter, 0);
|
372
|
+
|
326
373
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
327
374
|
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
328
375
|
|
329
376
|
rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
|
330
377
|
rb_define_alias(rb_cKeywordTree, "search", "find_all");
|
331
378
|
|
332
|
-
rb_define_singleton_method(rb_cKeywordTree, "
|
379
|
+
rb_define_singleton_method(rb_cKeywordTree, "_from_file", rb_kwt_new_from_file, -1);
|
380
|
+
|
381
|
+
rb_cResultFilter = rb_define_class_under(rb_mAhoCorasick, "ResultFilter", rb_cObject);
|
382
|
+
rb_define_method(rb_cResultFilter, "initialize", rb_rf_init, 0);
|
383
|
+
rb_define_method(rb_cResultFilter, "valid?", rb_rf_valid, -1);
|
333
384
|
|
334
385
|
sym_id = ID2SYM(rb_intern("id"));
|
335
386
|
sym_value = ID2SYM(rb_intern("value"));
|
336
|
-
sym_ends_at = ID2SYM(
|
337
|
-
sym_starts_at= ID2SYM(
|
387
|
+
sym_ends_at = ID2SYM(rb_intern("ends_at"));
|
388
|
+
sym_starts_at= ID2SYM(rb_intern("starts_at"));
|
338
389
|
}
|
339
390
|
|
data/lib/ahocorasick.rb
CHANGED