aurelian-ruby-ahocorasick 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,7 @@
15
15
  // * kwt.find_first("str")
16
16
  // * kwt.find_all ("str")
17
17
  //
18
- // TODO: rename search to find_all
18
+ // TODO: change last_id and dictionary_size to long
19
19
  //
20
20
 
21
21
  #include <ruby.h>
@@ -37,6 +37,15 @@ struct kwt_struct_data {
37
37
  int is_frozen;
38
38
  };
39
39
 
40
+ // int
41
+ // rb_add_string(struct kwt_struct_data *kwt, char *word, int size, int id) {
42
+ // if(ac_add_string( kwt->tree, word, size, id ) == 0)
43
+ // return 0;
44
+ // kwt->dictionary_size++;
45
+ // kwt->last_id= id+1;
46
+ // return 1;
47
+ // }
48
+
40
49
  /*
41
50
  * call-seq: initialize
42
51
  *
@@ -114,7 +123,7 @@ rb_kwt_make(VALUE self)
114
123
  *
115
124
  */
116
125
  static VALUE
117
- rb_kwt_search(int argc, VALUE *argv, VALUE self)
126
+ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
118
127
  {
119
128
  char * result; // intermediate result
120
129
  char * remain; // returned by ac_search, the remaining text to search
@@ -137,7 +146,6 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
137
146
  kwt_data->is_frozen = 1;
138
147
  }
139
148
  // prepare the return value
140
- // v_results= rb_block_given_p()? Qnil : rb_ary_new();
141
149
  v_results= rb_ary_new();
142
150
  // fail quickly and return the empty array
143
151
  if(kwt_data->dictionary_size == 0)
@@ -148,32 +156,19 @@ rb_kwt_search(int argc, VALUE *argv, VALUE self)
148
156
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
149
157
  // this is an individual result as a hash
150
158
  v_result= rb_hash_new();
151
- rb_hash_aset( v_result, sym_id, INT2FIX(id) );
159
+ rb_hash_aset( v_result, sym_id, INT2FIX(id) );
152
160
  rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
153
- rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
161
+ rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
154
162
  result = (char*) malloc (sizeof(char)*lgt);
155
163
  sprintf( result, "%.*s", lgt, remain);
156
164
  rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
157
-
158
- // yield this hash or, add it to the results
159
- // if(rb_block_given_p())
160
- // rb_yield(v_result);
161
- // else
162
165
  rb_ary_push( v_results, v_result );
163
166
  free(result);
164
167
  }
165
-
166
168
  // TODO: maybe the Tree can be re-opened to add new items to dictionary
167
-
168
- // return the results or nil if none
169
- // if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
170
169
  return v_results;
171
- // } else {
172
- // return Qnil;
173
- // }
174
170
  }
175
171
 
176
-
177
172
  /*
178
173
  * Document-method: size
179
174
  * call-seq: size
@@ -194,7 +189,6 @@ rb_kwt_size(VALUE self)
194
189
  return INT2FIX(kwt_data->dictionary_size);
195
190
  }
196
191
 
197
-
198
192
  /*
199
193
  * Document-method: add_string
200
194
  * call-seq: add_string
@@ -228,7 +222,6 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
228
222
 
229
223
  Check_Type(v_string, T_STRING);
230
224
  string= RSTRING(v_string)->ptr;
231
-
232
225
  KeywordTree(self, kwt_data);
233
226
 
234
227
  if(kwt_data->is_frozen == 1)
@@ -244,16 +237,12 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
244
237
  id= NUM2INT(v_id);
245
238
  }
246
239
 
247
- // printf("[internal]==> %d\n", id);
248
-
249
- if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
240
+ if(ac_add_string(kwt_data->tree, string, strlen(string), id) == 0)
250
241
  rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
251
- }
252
242
 
253
243
  kwt_data->last_id= id + 1;
254
244
  kwt_data->dictionary_size++;
255
- // printf("[internal]==> %d\n", id);
256
- return id;
245
+ return INT2FIX(id);
257
246
  }
258
247
 
259
248
  /*
@@ -293,14 +282,14 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
293
282
  KeywordTree( self, kwt_data );
294
283
 
295
284
  dictionary = fopen( RSTRING( f_string )->ptr, "r" );
296
- if(dictionary == NULL) {
285
+ if(dictionary == NULL)
297
286
  rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
298
- }
299
287
 
300
288
  while(fgets(word, 1024, dictionary) != NULL) {
301
289
  ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
302
290
  kwt_data->dictionary_size++;
303
291
  }
292
+
304
293
  kwt_data->last_id= id+1;
305
294
  fclose(dictionary);
306
295
  return self;
@@ -331,17 +320,16 @@ void Init_ahocorasick() {
331
320
  rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
332
321
  rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
333
322
  rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
323
+ rb_define_alias(rb_cKeywordTree, "<<", "add_string");
334
324
 
335
- rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
336
- rb_define_alias(rb_cKeywordTree, "find_all", "search");
325
+ rb_define_method(rb_cKeywordTree, "find_all", rb_kwt_find_all, -1);
326
+ rb_define_alias(rb_cKeywordTree, "search", "find_all");
337
327
 
338
- rb_define_alias(rb_cKeywordTree, "<<", "add_string");
339
328
  rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
340
329
 
341
- sym_id= ID2SYM(rb_intern("id"));
342
- sym_value= ID2SYM(rb_intern("value"));
343
- sym_ends_at= ID2SYM( rb_intern("ends_at") );
330
+ sym_id = ID2SYM(rb_intern("id"));
331
+ sym_value = ID2SYM(rb_intern("value"));
332
+ sym_ends_at = ID2SYM( rb_intern("ends_at") );
344
333
  sym_starts_at= ID2SYM( rb_intern("starts_at") );
345
-
346
334
  }
347
335
 
@@ -67,7 +67,7 @@ describe KeywordTree do
67
67
  # 01234567890123456789023
68
68
  # | |
69
69
  q= "data moved to bucurești"
70
- @kwt.search(q).each do | result |
70
+ @kwt.find_all(q).each do | result |
71
71
  result[:starts_at].should == 14
72
72
  result[:ends_at].should == 24
73
73
  end
@@ -77,7 +77,7 @@ describe KeywordTree do
77
77
  @kwt << "expected"
78
78
  # 012345678901234578901234567890
79
79
  q = "moved to bucurești as expected"
80
- @kwt.search(q).each do | r |
80
+ @kwt.find_all(q).each do | r |
81
81
  r[:starts_at].should == 23
82
82
  r[:ends_at].should == q.size
83
83
  (r[:ends_at]-r[:starts_at]).should == r[:value].size
@@ -86,15 +86,15 @@ describe KeywordTree do
86
86
 
87
87
  it "even more unicode" do
88
88
  @kwt << "șșt"
89
- # 0124789
90
- result= @kwt.search("mușștar").first
89
+ # 0124789
90
+ result= @kwt.find_all("mușștar").first
91
91
  result[:starts_at].should == 2
92
92
  result[:ends_at].should == result[:starts_at] + "șșt".size
93
93
  end
94
94
 
95
95
  it "checks for result length" do
96
96
  @kwt << "foo"
97
- result= @kwt.search("foo").first
97
+ result= @kwt.find_all("foo").first
98
98
  # 4 0
99
99
  (result[:ends_at]-result[:starts_at]).should == result[:value].size
100
100
  "foo"[result[:ends_at]].should == nil
@@ -110,7 +110,7 @@ describe KeywordTree do
110
110
  end
111
111
 
112
112
  it "should match on context" do
113
- @kwt.search("I've moved the data to a new database").size.should == 4
113
+ @kwt.find_all("I've moved the data to a new database").size.should == 4
114
114
  end
115
115
 
116
116
  end
@@ -146,8 +146,8 @@ describe KeywordTree do
146
146
  it "should work to add a random id" do
147
147
  kwt= KeywordTree.new
148
148
  kwt << "baz"
149
- kwt.add_string "foo", 1990
150
- kwt << "bar"
149
+ kwt.add_string("foo", 1990).should == 1990
150
+ kwt.add_string("bar").should == 1991
151
151
  kwt.size.should == 3
152
152
  end
153
153
 
@@ -185,7 +185,7 @@ describe KeywordTree do
185
185
  start= Time.now
186
186
  k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
187
187
  load_time= Time.now
188
- results= k.search( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
188
+ results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
189
189
  puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
190
190
  (Time.now-load_time).should < 1.2
191
191
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aurelian-ruby-ahocorasick
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea
@@ -14,7 +14,7 @@ default_executable:
14
14
  dependencies: []
15
15
 
16
16
  description: Expose Aho-Corasick implementation from Strmat to Ruby.
17
- email: aurelian [ at ] locknet.ro
17
+ email: oancea at gmail dot com
18
18
  executables: []
19
19
 
20
20
  extensions: