aurelian-ruby-ahocorasick 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -15,7 +15,7 @@
15
15
  // * kwt.find_first("str")
16
16
  // * kwt.find_all ("str")
17
17
  //
18
- // TODO: rename search to find_all
18
+ // TODO: change last_id and dictionary_size to long
19
19
  //
20
20
 
21
21
  #include <ruby.h>
@@ -37,6 +37,15 @@ struct kwt_struct_data {
37
37
  int is_frozen;
38
38
  };
39
39
 
40
+ // int
41
+ // rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
42
+ // if(ac_add_string( kwt->tree, word, size, id ) == 0)
43
+ // return 0;
44
+ // kwt->dictionary_size++;
45
+ // kwt->last_id= id+1;
46
+ // return 1;
47
+ // }
48
+
40
49
  /*
41
50
  * call-seq: initialize
42
51
  *
@@ -114,7 +123,7 @@ rb_kwt_make(VALUE self)
114
123
  *
115
124
  */
116
125
  static VALUE
117
- rb_kwt_search(int argc, VALUE *argv, VALUE self)
126
+ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
118
127
  {
119
128
  char * result; // intermediate result
120
129
  char * remain; // returned by ac_search, the remaining text to search
@@ -137,7 +146,6 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
137
146
  kwt_data->is_frozen = 1;
138
147
  }
139
148
  // prepare the return value
140
- // v_results= rb_block_given_p()? Qnil : rb_ary_new();
141
149
  v_results= rb_ary_new();
142
150
  // fail quickly and return the empty array
143
151
  if(kwt_data->dictionary_size == 0)
@@ -148,32 +156,19 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
148
156
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
149
157
  // this is an individual result as a hash
150
158
  v_result= rb_hash_new();
151
- rb_hash_aset( v_result, sym_id, INT2FIX(id) );
159
+ rb_hash_aset( v_result, sym_id, INT2FIX(id) );
152
160
  rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
153
- rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
161
+ rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
154
162
  result = (char*) malloc (sizeof(char)*lgt);
155
163
  sprintf( result, "%.*s", lgt, remain);
156
164
  rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
157
-
158
- // yield this hash or, add it to the results
159
- // if(rb_block_given_p())
160
- // rb_yield(v_result);
161
- // else
162
165
  rb_ary_push( v_results, v_result );
163
166
  free(result);
164
167
  }
165
-
166
168
  // TODO: maybe the Tree can be re-opened to add new items to dictionary
167
-
168
- // return the results or nil if none
169
- // if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
170
169
  return v_results;
171
- // } else {
172
- // return Qnil;
173
- // }
174
170
  }
175
171
 
176
-
177
172
  /*
178
173
  * Document-method: size
179
174
  * call-seq: size
@@ -194,7 +189,6 @@ rb_kwt_size(VALUE self)
194
189
  return INT2FIX(kwt_data->dictionary_size);
195
190
  }
196
191
 
197
-
198
192
  /*
199
193
  * Document-method: add_string
200
194
  * call-seq: add_string
@@ -228,7 +222,6 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
228
222
 
229
223
  Check_Type(v_string, T_STRING);
230
224
  string= RSTRING(v_string)->ptr;
231
-
232
225
  KeywordTree(self, kwt_data);
233
226
 
234
227
  if(kwt_data->is_frozen == 1)
@@ -244,16 +237,12 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
244
237
  id= NUM2INT(v_id);
245
238
  }
246
239
 
247
- // printf("[internal]==> %d\n", id);
248
-
249
- if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
240
+ if(ac_add_string(kwt_data->tree, string, strlen(string), id) == 0)
250
241
  rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
251
- }
252
242
 
253
243
  kwt_data->last_id= id + 1;
254
244
  kwt_data->dictionary_size++;
255
- // printf("[internal]==> %d\n", id);
256
- return id;
245
+ return INT2FIX(id);
257
246
  }
258
247
 
259
248
  /*
@@ -293,14 +282,14 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
293
282
  KeywordTree( self, kwt_data );
294
283
 
295
284
  dictionary = fopen( RSTRING( f_string )->ptr, "r" );
296
- if(dictionary == NULL) {
285
+ if(dictionary == NULL)
297
286
  rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
298
- }
299
287
 
300
288
  while(fgets(word, 1024, dictionary) != NULL) {
301
289
  ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
302
290
  kwt_data->dictionary_size++;
303
291
  }
292
+
304
293
  kwt_data->last_id= id+1;
305
294
  fclose(dictionary);
306
295
  return self;
@@ -331,17 +320,16 @@ void Init_ahocorasick() {
331
320
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
332
321
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
333
322
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
323
+ rb_define_alias(rb_cKeywordTree, "<<", "add_string");
334
324
 
335
- rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
336
- rb_define_alias(rb_cKeywordTree, "find_all", "search");
325
+ rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
326
+ rb_define_alias(rb_cKeywordTree, "search", "find_all");
337
327
 
338
- rb_define_alias(rb_cKeywordTree, "<<", "add_string");
339
328
  rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
340
329
 
341
- sym_id= ID2SYM(rb_intern("id"));
342
- sym_value= ID2SYM(rb_intern("value"));
343
- sym_ends_at= ID2SYM( rb_intern("ends_at") );
330
+ sym_id = ID2SYM(rb_intern("id"));
331
+ sym_value = ID2SYM(rb_intern("value"));
332
+ sym_ends_at = ID2SYM( rb_intern("ends_at") );
344
333
  sym_starts_at= ID2SYM( rb_intern("starts_at") );
345
-
346
334
  }
347
335
 
@@ -67,7 +67,7 @@ describe KeywordTree do
67
67
  # 01234567890123456789023
68
68
  # | |
69
69
  q= "data moved to bucurești"
70
- @kwt.search(q).each do | result |
70
+ @kwt.find_all(q).each do | result |
71
71
  result[:starts_at].should == 14
72
72
  result[:ends_at].should == 24
73
73
  end
@@ -77,7 +77,7 @@ describe KeywordTree do
77
77
  @kwt << "expected"
78
78
  # 012345678901234578901234567890
79
79
  q = "moved to bucurești as expected"
80
- @kwt.search(q).each do | r |
80
+ @kwt.find_all(q).each do | r |
81
81
  r[:starts_at].should == 23
82
82
  r[:ends_at].should == q.size
83
83
  (r[:ends_at]-r[:starts_at]).should == r[:value].size
@@ -86,15 +86,15 @@ describe KeywordTree do
86
86
 
87
87
  it "even more unicode" do
88
88
  @kwt << "șșt"
89
- # 0124789
90
- result= @kwt.search("mușștar").first
89
+ # 0124789
90
+ result= @kwt.find_all("mușștar").first
91
91
  result[:starts_at].should == 2
92
92
  result[:ends_at].should == result[:starts_at] + "șșt".size
93
93
  end
94
94
 
95
95
  it "checks for result length" do
96
96
  @kwt << "foo"
97
- result= @kwt.search("foo").first
97
+ result= @kwt.find_all("foo").first
98
98
  # 4 0
99
99
  (result[:ends_at]-result[:starts_at]).should == result[:value].size
100
100
  "foo"[result[:ends_at]].should == nil
@@ -110,7 +110,7 @@ describe KeywordTree do
110
110
  end
111
111
 
112
112
  it "should match on context" do
113
- @kwt.search("I've moved the data to a new database").size.should == 4
113
+ @kwt.find_all("I've moved the data to a new database").size.should == 4
114
114
  end
115
115
 
116
116
  end
@@ -146,8 +146,8 @@ describe KeywordTree do
146
146
  it "should work to add a random id" do
147
147
  kwt= KeywordTree.new
148
148
  kwt << "baz"
149
- kwt.add_string "foo", 1990
150
- kwt << "bar"
149
+ kwt.add_string("foo", 1990).should == 1990
150
+ kwt.add_string("bar").should == 1991
151
151
  kwt.size.should == 3
152
152
  end
153
153
 
@@ -185,7 +185,7 @@ describe KeywordTree do
185
185
  start= Time.now
186
186
  k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
187
187
  load_time= Time.now
188
- results= k.search( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
188
+ results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
189
189
  puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
190
190
  (Time.now-load_time).should < 1.2
191
191
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea
@@ -14,7 +14,7 @@ default_executable:
14
14
  dependencies: []
15
15
 
16
16
  description: Expose Aho-Corasick implementation from Strmat to Ruby.
17
- email: aurelian [ at ] locknet.ro
17
+ email: oancea at gmail dot com
18
18
  executables: []
19
19
 
20
20
  extensions: