aurelian-ruby-ahocorasick 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -57,7 +57,7 @@ $ gem install ruby-ahocorasick
57
57
 
58
58
  h4. Notes
59
59
 
60
- It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
60
+ It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc.
61
61
  Unfortunately I don't have a Windows PC around, nor the required knowledge about Microsoft compilers.
62
62
 
63
63
 
@@ -102,4 +102,4 @@ h2. License
102
102
 
103
103
  (c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
104
104
 
105
- released under MIT-LICENCE
105
+ released under MIT-LICENCE
data/examples/dict.rb CHANGED
@@ -1,25 +1,27 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'time'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
4
6
 
5
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
7
+ require "ahocorasick"
8
+ require "time"
6
9
 
7
10
  t= Time.now
8
-
9
11
  k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
10
-
11
12
  t1= Time.now
12
-
13
- puts "%d words added in %s seconds" % [k.size, (t1-t)]
13
+ puts "==> %d words added in %s seconds" % [k.size, (t1-t)]
14
14
 
15
15
  query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
16
-
17
16
  results= k.search query
17
+ puts "==> took %s seconds to find %d results in a streem with %d charachters" % \
18
+ [(Time.now-t1), results.size, query.size]
18
19
 
19
- puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
20
-
21
- exit
22
- results.each do | r |
23
- puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
20
+ puts "==> 20 results"
21
+ results[0..20].each do | r |
22
+ puts "-- #{query[r[:starts_at]].chr}..#{query[r[:ends_at]-1].chr} => #{r[:value]}"
24
23
  end
25
24
 
25
+ puts "==> query was -first 100 chars:"
26
+ puts query[0..100]
27
+
data/examples/elev.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- $kcode='UTF-8'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
4
6
 
5
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
7
+ require "ahocorasick"
6
8
 
7
9
  k= AhoCorasick::KeywordTree.new
8
10
 
data/examples/test.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
6
+
7
+ require "ahocorasick"
4
8
 
5
9
  k= AhoCorasick::KeywordTree.new
6
10
 
@@ -25,6 +25,7 @@ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
25
25
 
26
26
  VALUE rb_mAhoCorasick;
27
27
  VALUE rb_cKeywordTree;
28
+ VALUE rb_cResultFilter;
28
29
 
29
30
  #define KeywordTree(obj, kwt_data) {\
30
31
  Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
@@ -37,14 +38,26 @@ struct kwt_struct_data {
37
38
  int is_frozen;
38
39
  };
39
40
 
40
- // int
41
- // rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
42
- // if(ac_add_string( kwt->tree, word, size, id ) == 0)
43
- // return 0;
44
- // kwt->dictionary_size++;
45
- // kwt->last_id= id+1;
46
- // return 1;
47
- // }
41
+ //
42
+ // ResultFilter interface
43
+ //
44
+
45
+ static VALUE
46
+ rb_rf_init(VALUE self) {
47
+ return self;
48
+ }
49
+ static VALUE
50
+ rb_rf_valid(int argc, VALUE *argv, VALUE self) {
51
+ VALUE result;
52
+ VALUE remain;
53
+ rb_scan_args(argc, argv, "20", &result, &remain);
54
+ rb_raise(rb_eNotImpError, "Method AhoCorasick::ResultFilter.valid?(<Hash> result, <String> remain) should be implemented in child classes.");
55
+ return Qtrue;
56
+ }
57
+
58
+ //
59
+ // ~ResultFilter
60
+ //
48
61
 
49
62
  /*
50
63
  * call-seq: initialize
@@ -68,6 +81,7 @@ rb_kwt_init(VALUE self)
68
81
  kwt_data->last_id = 1;
69
82
  kwt_data->dictionary_size = 0;
70
83
  kwt_data->is_frozen = 0;
84
+ rb_iv_set( self, "@filter", Qnil );
71
85
  return self;
72
86
  }
73
87
 
@@ -135,6 +149,7 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
135
149
  int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
136
150
  VALUE v_result; // one result, as hash
137
151
  VALUE v_results; // all the results, an array
152
+ VALUE filter; // filter to be applied to results
138
153
 
139
154
  VALUE v_search; // search string, function argument
140
155
  struct kwt_struct_data *kwt_data;
@@ -160,6 +175,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
160
175
  return v_results;
161
176
  // prepare the search
162
177
  ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
178
+ // get the filter
179
+ filter= rb_iv_get(self, "@filter");
163
180
  // loop through the results
164
181
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
165
182
  // this is an individual result as a hash
@@ -168,7 +185,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
168
185
  rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
169
186
  rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
170
187
  rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
171
- rb_ary_push( v_results, v_result );
188
+ if (filter == Qnil || rb_funcall( filter, rb_intern("valid?"), 2, v_result, rb_str_new(remain, (long)strlen(remain)) )!=Qfalse)
189
+ rb_ary_push( v_results, v_result );
172
190
  }
173
191
  // reopen the tree
174
192
  kwt_data->is_frozen= 0;
@@ -250,6 +268,32 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
250
268
  return INT2FIX(id);
251
269
  }
252
270
 
271
+ static VALUE
272
+ rb_kwt_set_filter(int argc, VALUE *argv, VALUE self) {
273
+ struct kwt_struct_data *kwt_data;
274
+ VALUE filter;
275
+
276
+ rb_scan_args(argc, argv, "10", &filter);
277
+
278
+ if(rb_obj_is_kind_of(filter, rb_cResultFilter) == 0)
279
+ rb_raise(rb_eTypeError, "Type mismatch: required %s, %s given.", rb_class2name(rb_cResultFilter), rb_class2name(CLASS_OF(filter)));
280
+
281
+ KeywordTree( self, kwt_data );
282
+ rb_iv_set( self, "@filter", filter );
283
+
284
+ return filter;
285
+ }
286
+
287
+ static VALUE
288
+ rb_kwt_get_filter(VALUE self) {
289
+ VALUE filter;
290
+ struct kwt_struct_data *kwt_data;
291
+ KeywordTree( self, kwt_data );
292
+
293
+ filter= rb_iv_get(self, "@filter");
294
+ return filter;
295
+ }
296
+
253
297
  /*
254
298
  * call-seq: from_file
255
299
  *
@@ -323,17 +367,24 @@ void Init_native() {
323
367
  rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
324
368
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
325
369
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
370
+ rb_define_method(rb_cKeywordTree, "filter=", rb_kwt_set_filter, -1);
371
+ rb_define_method(rb_cKeywordTree, "filter", rb_kwt_get_filter, 0);
372
+
326
373
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
327
374
  rb_define_alias(rb_cKeywordTree, "<<", "add_string");
328
375
 
329
376
  rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
330
377
  rb_define_alias(rb_cKeywordTree, "search", "find_all");
331
378
 
332
- rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
379
+ rb_define_singleton_method(rb_cKeywordTree, "_from_file", rb_kwt_new_from_file, -1);
380
+
381
+ rb_cResultFilter = rb_define_class_under(rb_mAhoCorasick, "ResultFilter", rb_cObject);
382
+ rb_define_method(rb_cResultFilter, "initialize", rb_rf_init, 0);
383
+ rb_define_method(rb_cResultFilter, "valid?", rb_rf_valid, -1);
333
384
 
334
385
  sym_id = ID2SYM(rb_intern("id"));
335
386
  sym_value = ID2SYM(rb_intern("value"));
336
- sym_ends_at = ID2SYM( rb_intern("ends_at") );
337
- sym_starts_at= ID2SYM( rb_intern("starts_at") );
387
+ sym_ends_at = ID2SYM(rb_intern("ends_at"));
388
+ sym_starts_at= ID2SYM(rb_intern("starts_at"));
338
389
  }
339
390
 
data/lib/ahocorasick.rb CHANGED
@@ -3,5 +3,14 @@ require 'ahocorasick/native'
3
3
 
4
4
  module AhoCorasick
5
5
  VERSION='0.5.0'
6
+
7
+ class KeywordTree
8
+
9
+ def self.from_file filename
10
+ self._from_file filename
11
+ end
12
+
13
+ end
14
+
6
15
  end
7
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea