aurelian-ruby-ahocorasick 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +2 -2
- data/examples/dict.rb +14 -12
- data/examples/elev.rb +4 -2
- data/examples/test.rb +5 -1
- data/ext/ahocorasick/ruby-ahocorasick.c +63 -12
- data/lib/ahocorasick.rb +9 -0
- metadata +1 -1
data/README.textile
CHANGED
@@ -57,7 +57,7 @@ $ gem install ruby-ahocorasick
|
|
57
57
|
|
58
58
|
h4. Notes
|
59
59
|
|
60
|
-
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc
|
60
|
+
It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc.
|
61
61
|
Unfortunately I don't have a Windows PC around, nor the required knowledge about Microsoft compilers.
|
62
62
|
|
63
63
|
|
@@ -102,4 +102,4 @@ h2. License
|
|
102
102
|
|
103
103
|
(c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
|
104
104
|
|
105
|
-
released under MIT-LICENCE
|
105
|
+
released under MIT-LICENCE
|
data/examples/dict.rb
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
4
6
|
|
5
|
-
require
|
7
|
+
require "ahocorasick"
|
8
|
+
require "time"
|
6
9
|
|
7
10
|
t= Time.now
|
8
|
-
|
9
11
|
k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
|
10
|
-
|
11
12
|
t1= Time.now
|
12
|
-
|
13
|
-
puts "%d words added in %s seconds" % [k.size, (t1-t)]
|
13
|
+
puts "==> %d words added in %s seconds" % [k.size, (t1-t)]
|
14
14
|
|
15
15
|
query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
|
16
|
-
|
17
16
|
results= k.search query
|
17
|
+
puts "==> took %s seconds to find %d results in a streem with %d charachters" % \
|
18
|
+
[(Time.now-t1), results.size, query.size]
|
18
19
|
|
19
|
-
puts "
|
20
|
-
|
21
|
-
|
22
|
-
results.each do | r |
|
23
|
-
puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
|
20
|
+
puts "==> 20 results"
|
21
|
+
results[0..20].each do | r |
|
22
|
+
puts "-- #{query[r[:starts_at]].chr}..#{query[r[:ends_at]-1].chr} => #{r[:value]}"
|
24
23
|
end
|
25
24
|
|
25
|
+
puts "==> query was -first 100 chars:"
|
26
|
+
puts query[0..100]
|
27
|
+
|
data/examples/elev.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
4
6
|
|
5
|
-
require
|
7
|
+
require "ahocorasick"
|
6
8
|
|
7
9
|
k= AhoCorasick::KeywordTree.new
|
8
10
|
|
data/examples/test.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
%w(../lib ../ext).each do |path|
|
4
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
5
|
+
end
|
6
|
+
|
7
|
+
require "ahocorasick"
|
4
8
|
|
5
9
|
k= AhoCorasick::KeywordTree.new
|
6
10
|
|
@@ -25,6 +25,7 @@ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
|
|
25
25
|
|
26
26
|
VALUE rb_mAhoCorasick;
|
27
27
|
VALUE rb_cKeywordTree;
|
28
|
+
VALUE rb_cResultFilter;
|
28
29
|
|
29
30
|
#define KeywordTree(obj, kwt_data) {\
|
30
31
|
Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
|
@@ -37,14 +38,26 @@ struct kwt_struct_data {
|
|
37
38
|
int is_frozen;
|
38
39
|
};
|
39
40
|
|
40
|
-
//
|
41
|
-
//
|
42
|
-
//
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
41
|
+
//
|
42
|
+
// ResultFilter interface
|
43
|
+
//
|
44
|
+
|
45
|
+
static VALUE
|
46
|
+
rb_rf_init(VALUE self) {
|
47
|
+
return self;
|
48
|
+
}
|
49
|
+
static VALUE
|
50
|
+
rb_rf_valid(int argc, VALUE *argv, VALUE self) {
|
51
|
+
VALUE result;
|
52
|
+
VALUE remain;
|
53
|
+
rb_scan_args(argc, argv, "20", &result, &remain);
|
54
|
+
rb_raise(rb_eNotImpError, "Method AhoCorasick::ResultFilter.valid?(<Hash> result, <String> remain) should be implemented in child classes.");
|
55
|
+
return Qtrue;
|
56
|
+
}
|
57
|
+
|
58
|
+
//
|
59
|
+
// ~ResultFilter
|
60
|
+
//
|
48
61
|
|
49
62
|
/*
|
50
63
|
* call-seq: initialize
|
@@ -68,6 +81,7 @@ rb_kwt_init(VALUE self)
|
|
68
81
|
kwt_data->last_id = 1;
|
69
82
|
kwt_data->dictionary_size = 0;
|
70
83
|
kwt_data->is_frozen = 0;
|
84
|
+
rb_iv_set( self, "@filter", Qnil );
|
71
85
|
return self;
|
72
86
|
}
|
73
87
|
|
@@ -135,6 +149,7 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
135
149
|
int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
136
150
|
VALUE v_result; // one result, as hash
|
137
151
|
VALUE v_results; // all the results, an array
|
152
|
+
VALUE filter; // filter to be applied to results
|
138
153
|
|
139
154
|
VALUE v_search; // search string, function argument
|
140
155
|
struct kwt_struct_data *kwt_data;
|
@@ -160,6 +175,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
160
175
|
return v_results;
|
161
176
|
// prepare the search
|
162
177
|
ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
|
178
|
+
// get the filter
|
179
|
+
filter= rb_iv_get(self, "@filter");
|
163
180
|
// loop trought the results
|
164
181
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
165
182
|
// this is an individual result as a hash
|
@@ -168,7 +185,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
168
185
|
rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
|
169
186
|
rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
|
170
187
|
rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
|
171
|
-
|
188
|
+
if (filter == Qnil || rb_funcall( filter, rb_intern("valid?"), 2, v_result, rb_str_new(remain, (long)strlen(remain)) )!=Qfalse)
|
189
|
+
rb_ary_push( v_results, v_result );
|
172
190
|
}
|
173
191
|
// reopen the tree
|
174
192
|
kwt_data->is_frozen= 0;
|
@@ -250,6 +268,32 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
250
268
|
return INT2FIX(id);
|
251
269
|
}
|
252
270
|
|
271
|
+
static VALUE
|
272
|
+
rb_kwt_set_filter(int argc, VALUE *argv, VALUE self) {
|
273
|
+
struct kwt_struct_data *kwt_data;
|
274
|
+
VALUE filter;
|
275
|
+
|
276
|
+
rb_scan_args(argc, argv, "10", &filter);
|
277
|
+
|
278
|
+
if(rb_obj_is_kind_of(filter, rb_cResultFilter) == 0)
|
279
|
+
rb_raise(rb_eTypeError, "Type mismatch: required %s, %s given.", rb_class2name(rb_cResultFilter), rb_class2name(CLASS_OF(filter)));
|
280
|
+
|
281
|
+
KeywordTree( self, kwt_data );
|
282
|
+
rb_iv_set( self, "@filter", filter );
|
283
|
+
|
284
|
+
return filter;
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_kwt_get_filter(VALUE self) {
|
289
|
+
VALUE filter;
|
290
|
+
struct kwt_struct_data *kwt_data;
|
291
|
+
KeywordTree( self, kwt_data );
|
292
|
+
|
293
|
+
filter= rb_iv_get(self, "@filter");
|
294
|
+
return filter;
|
295
|
+
}
|
296
|
+
|
253
297
|
/*
|
254
298
|
* call-seq: from_file
|
255
299
|
*
|
@@ -323,17 +367,24 @@ void Init_native() {
|
|
323
367
|
rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
|
324
368
|
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
325
369
|
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
370
|
+
rb_define_method(rb_cKeywordTree, "filter=", rb_kwt_set_filter, -1);
|
371
|
+
rb_define_method(rb_cKeywordTree, "filter", rb_kwt_get_filter, 0);
|
372
|
+
|
326
373
|
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
327
374
|
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
328
375
|
|
329
376
|
rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
|
330
377
|
rb_define_alias(rb_cKeywordTree, "search", "find_all");
|
331
378
|
|
332
|
-
rb_define_singleton_method(rb_cKeywordTree, "
|
379
|
+
rb_define_singleton_method(rb_cKeywordTree, "_from_file", rb_kwt_new_from_file, -1);
|
380
|
+
|
381
|
+
rb_cResultFilter = rb_define_class_under(rb_mAhoCorasick, "ResultFilter", rb_cObject);
|
382
|
+
rb_define_method(rb_cResultFilter, "initialize", rb_rf_init, 0);
|
383
|
+
rb_define_method(rb_cResultFilter, "valid?", rb_rf_valid, -1);
|
333
384
|
|
334
385
|
sym_id = ID2SYM(rb_intern("id"));
|
335
386
|
sym_value = ID2SYM(rb_intern("value"));
|
336
|
-
sym_ends_at = ID2SYM(
|
337
|
-
sym_starts_at= ID2SYM(
|
387
|
+
sym_ends_at = ID2SYM(rb_intern("ends_at"));
|
388
|
+
sym_starts_at= ID2SYM(rb_intern("starts_at"));
|
338
389
|
}
|
339
390
|
|
data/lib/ahocorasick.rb
CHANGED