aurelian-ruby-ahocorasick 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -57,7 +57,7 @@ $ gem install ruby-ahocorasick
57
57
 
58
58
  h4. Notes
59
59
 
60
- It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc around.
60
+ It's known to work / compile / install on Ubuntu 8.04 and Mac OS 10.4.*. It should work out of the box if you have gcc.
61
61
  Unfortunately I don't have a Windows PC around, nor the required knowledge about Microsoft compilers.
62
62
 
63
63
 
@@ -102,4 +102,4 @@ h2. License
102
102
 
103
103
  (c) 2008 - Aurelian Oancea, < oancea at gmail dot com >
104
104
 
105
- released under MIT-LICENCE
105
+ released under MIT-LICENCE
data/examples/dict.rb CHANGED
@@ -1,25 +1,27 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'time'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
4
6
 
5
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
7
+ require "ahocorasick"
8
+ require "time"
6
9
 
7
10
  t= Time.now
8
-
9
11
  k= AhoCorasick::KeywordTree.from_file(File.dirname(__FILE__) + "/../spec/data/en.words")
10
-
11
12
  t1= Time.now
12
-
13
- puts "%d words added in %s seconds" % [k.size, (t1-t)]
13
+ puts "==> %d words added in %s seconds" % [k.size, (t1-t)]
14
14
 
15
15
  query = File.read( File.dirname(__FILE__) + "/../spec/data/news.txt" )
16
-
17
16
  results= k.search query
17
+ puts "==> took %s seconds to find %d results in a streem with %d charachters" % \
18
+ [(Time.now-t1), results.size, query.size]
18
19
 
19
- puts "took %s seconds to find %d results in a streem with %d charachters" % [(Time.now-t1), results.size, query.size]
20
-
21
- exit
22
- results.each do | r |
23
- puts query[r[:starts_at]].chr + ".." + query[r[:ends_at]-1].chr + " => " + r[:value]
20
+ puts "==> 20 results"
21
+ results[0..20].each do | r |
22
+ puts "-- #{query[r[:starts_at]].chr}..#{query[r[:ends_at]-1].chr} => #{r[:value]}"
24
23
  end
25
24
 
25
+ puts "==> query was -first 100 chars:"
26
+ puts query[0..100]
27
+
data/examples/elev.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- $kcode='UTF-8'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
4
6
 
5
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
7
+ require "ahocorasick"
6
8
 
7
9
  k= AhoCorasick::KeywordTree.new
8
10
 
data/examples/test.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.dirname(__FILE__) + '/../ext/ahocorasick'
3
+ %w(../lib ../ext).each do |path|
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
5
+ end
6
+
7
+ require "ahocorasick"
4
8
 
5
9
  k= AhoCorasick::KeywordTree.new
6
10
 
@@ -25,6 +25,7 @@ static VALUE sym_id, sym_value, sym_ends_at, sym_starts_at;
25
25
 
26
26
  VALUE rb_mAhoCorasick;
27
27
  VALUE rb_cKeywordTree;
28
+ VALUE rb_cResultFilter;
28
29
 
29
30
  #define KeywordTree(obj, kwt_data) {\
30
31
  Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
@@ -37,14 +38,26 @@ struct kwt_struct_data {
37
38
  int is_frozen;
38
39
  };
39
40
 
40
- // int
41
- // rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
42
- // if(ac_add_string( kwt->tree, word, size, id ) == 0)
43
- // return 0;
44
- // kwt->dictionary_size++;
45
- // kwt->last_id= id+1;
46
- // return 1;
47
- // }
41
+ //
42
+ // ResultFilter interface
43
+ //
44
+
45
+ static VALUE
46
+ rb_rf_init(VALUE self) {
47
+ return self;
48
+ }
49
+ static VALUE
50
+ rb_rf_valid(int argc, VALUE *argv, VALUE self) {
51
+ VALUE result;
52
+ VALUE remain;
53
+ rb_scan_args(argc, argv, "20", &result, &remain);
54
+ rb_raise(rb_eNotImpError, "Method AhoCorasick::ResultFilter.valid?(<Hash> result, <String> remain) should be implemented in child classes.");
55
+ return Qtrue;
56
+ }
57
+
58
+ //
59
+ // ~ResultFilter
60
+ //
48
61
 
49
62
  /*
50
63
  * call-seq: initialize
@@ -68,6 +81,7 @@ rb_kwt_init(VALUE self)
68
81
  kwt_data->last_id = 1;
69
82
  kwt_data->dictionary_size = 0;
70
83
  kwt_data->is_frozen = 0;
84
+ rb_iv_set( self, "@filter", Qnil );
71
85
  return self;
72
86
  }
73
87
 
@@ -135,6 +149,7 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
135
149
  int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
136
150
  VALUE v_result; // one result, as hash
137
151
  VALUE v_results; // all the results, an array
152
+ VALUE filter; // filter to be applied to results
138
153
 
139
154
  VALUE v_search; // search string, function argument
140
155
  struct kwt_struct_data *kwt_data;
@@ -160,6 +175,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
160
175
  return v_results;
161
176
  // prepare the search
162
177
  ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
178
+ // get the filter
179
+ filter= rb_iv_get(self, "@filter");
163
180
  // loop trought the results
164
181
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
165
182
  // this is an individual result as a hash
@@ -168,7 +185,8 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
168
185
  rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
169
186
  rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
170
187
  rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
171
- rb_ary_push( v_results, v_result );
188
+ if (filter == Qnil || rb_funcall( filter, rb_intern("valid?"), 2, v_result, rb_str_new(remain, (long)strlen(remain)) )!=Qfalse)
189
+ rb_ary_push( v_results, v_result );
172
190
  }
173
191
  // reopen the tree
174
192
  kwt_data->is_frozen= 0;
@@ -250,6 +268,32 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
250
268
  return INT2FIX(id);
251
269
  }
252
270
 
271
+ static VALUE
272
+ rb_kwt_set_filter(int argc, VALUE *argv, VALUE self) {
273
+ struct kwt_struct_data *kwt_data;
274
+ VALUE filter;
275
+
276
+ rb_scan_args(argc, argv, "10", &filter);
277
+
278
+ if(rb_obj_is_kind_of(filter, rb_cResultFilter) == 0)
279
+ rb_raise(rb_eTypeError, "Type mismatch: required %s, %s given.", rb_class2name(rb_cResultFilter), rb_class2name(CLASS_OF(filter)));
280
+
281
+ KeywordTree( self, kwt_data );
282
+ rb_iv_set( self, "@filter", filter );
283
+
284
+ return filter;
285
+ }
286
+
287
+ static VALUE
288
+ rb_kwt_get_filter(VALUE self) {
289
+ VALUE filter;
290
+ struct kwt_struct_data *kwt_data;
291
+ KeywordTree( self, kwt_data );
292
+
293
+ filter= rb_iv_get(self, "@filter");
294
+ return filter;
295
+ }
296
+
253
297
  /*
254
298
  * call-seq: from_file
255
299
  *
@@ -323,17 +367,24 @@ void Init_native() {
323
367
  rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
324
368
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
325
369
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
370
+ rb_define_method(rb_cKeywordTree, "filter=", rb_kwt_set_filter, -1);
371
+ rb_define_method(rb_cKeywordTree, "filter", rb_kwt_get_filter, 0);
372
+
326
373
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
327
374
  rb_define_alias(rb_cKeywordTree, "<<", "add_string");
328
375
 
329
376
  rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
330
377
  rb_define_alias(rb_cKeywordTree, "search", "find_all");
331
378
 
332
- rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
379
+ rb_define_singleton_method(rb_cKeywordTree, "_from_file", rb_kwt_new_from_file, -1);
380
+
381
+ rb_cResultFilter = rb_define_class_under(rb_mAhoCorasick, "ResultFilter", rb_cObject);
382
+ rb_define_method(rb_cResultFilter, "initialize", rb_rf_init, 0);
383
+ rb_define_method(rb_cResultFilter, "valid?", rb_rf_valid, -1);
333
384
 
334
385
  sym_id = ID2SYM(rb_intern("id"));
335
386
  sym_value = ID2SYM(rb_intern("value"));
336
- sym_ends_at = ID2SYM( rb_intern("ends_at") );
337
- sym_starts_at= ID2SYM( rb_intern("starts_at") );
387
+ sym_ends_at = ID2SYM(rb_intern("ends_at"));
388
+ sym_starts_at= ID2SYM(rb_intern("starts_at"));
338
389
  }
339
390
 
data/lib/ahocorasick.rb CHANGED
@@ -3,5 +3,14 @@ require 'ahocorasick/native'
3
3
 
4
4
  module AhoCorasick
5
5
  VERSION='0.5.0'
6
+
7
+ class KeywordTree
8
+
9
+ def self.from_file filename
10
+ self._from_file filename
11
+ end
12
+
13
+ end
14
+
6
15
  end
7
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea