aurelian-ruby-ahocorasick 0.4.5 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +5 -1
- data/ext/{ac.c → ahocorasick/ac.c} +0 -0
- data/ext/{ac.h → ahocorasick/ac.h} +0 -0
- data/ext/ahocorasick/extconf.rb +8 -0
- data/ext/{ruby-ahocorasick.c → ahocorasick/ruby-ahocorasick.c} +24 -24
- data/lib/ahocorasick.rb +7 -0
- data/spec/ahocorasick_spec.rb +62 -6
- metadata +8 -7
- data/ext/extconf.rb +0 -6
data/README.textile
CHANGED
@@ -48,7 +48,11 @@ $ rake install
|
|
48
48
|
|
49
49
|
h3. Rubygems - Stable Version
|
50
50
|
|
51
|
-
|
51
|
+
Get version 0.4.5 (released on 19 November 2008) from "rubyforge":http://rubyforge.org/frs/?group_id=4024&release_id=28421 :
|
52
|
+
|
53
|
+
<pre>
|
54
|
+
$ gem install ruby-ahocorasick
|
55
|
+
</pre>
|
52
56
|
|
53
57
|
|
54
58
|
h4. Notes
|
File without changes
|
File without changes
|
@@ -132,7 +132,7 @@ static VALUE
|
|
132
132
|
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
133
133
|
{
|
134
134
|
char * remain; // returned by ac_search, the remaining text to search
|
135
|
-
int lgt, id, ends_at
|
135
|
+
int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
136
136
|
VALUE v_result; // one result, as hash
|
137
137
|
VALUE v_results; // all the results, an array
|
138
138
|
|
@@ -143,12 +143,14 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
143
143
|
rb_scan_args(argc, argv, "1", &v_search);
|
144
144
|
// it should be string.
|
145
145
|
Check_Type(v_search, T_STRING);
|
146
|
+
v_search= StringValue( v_search );
|
147
|
+
|
146
148
|
// get the structure
|
147
149
|
KeywordTree(self, kwt_data);
|
148
150
|
// freeze the tree, if not already
|
149
151
|
if(kwt_data->is_frozen == 0) {
|
150
152
|
if(ac_prep( kwt_data->tree ) == 0)
|
151
|
-
rb_raise(rb_eRuntimeError, "Cannot freeze the tree");
|
153
|
+
rb_raise(rb_eRuntimeError, "Cannot freeze the tree!");
|
152
154
|
kwt_data->is_frozen = 1;
|
153
155
|
}
|
154
156
|
// prepare the return value
|
@@ -157,15 +159,15 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
157
159
|
if(kwt_data->dictionary_size == 0)
|
158
160
|
return v_results;
|
159
161
|
// prepare the search
|
160
|
-
ac_search_init(kwt_data->tree,
|
162
|
+
ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
|
161
163
|
// loop through the results
|
162
164
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
163
165
|
// this is an individual result as a hash
|
164
166
|
v_result= rb_hash_new();
|
165
|
-
rb_hash_aset( v_result, sym_id,
|
166
|
-
rb_hash_aset( v_result, sym_starts_at,
|
167
|
-
rb_hash_aset( v_result, sym_ends_at,
|
168
|
-
rb_hash_aset( v_result, sym_value, rb_str_new(remain, lgt) );
|
167
|
+
rb_hash_aset( v_result, sym_id, INT2NUM( (long)id ) );
|
168
|
+
rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
|
169
|
+
rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
|
170
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
|
169
171
|
rb_ary_push( v_results, v_result );
|
170
172
|
}
|
171
173
|
// reopen the tree
|
@@ -219,30 +221,29 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
219
221
|
{
|
220
222
|
VALUE v_string, v_id;
|
221
223
|
struct kwt_struct_data *kwt_data;
|
222
|
-
char * string;
|
224
|
+
// char * string;
|
223
225
|
int id;
|
224
226
|
|
225
227
|
rb_scan_args(argc, argv, "11", &v_string, &v_id);
|
226
228
|
|
227
229
|
Check_Type(v_string, T_STRING);
|
228
|
-
string=
|
230
|
+
// string= StringValuePtr(v_string);
|
229
231
|
KeywordTree(self, kwt_data);
|
230
232
|
|
231
233
|
if(kwt_data->is_frozen == 1)
|
232
|
-
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.",
|
234
|
+
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", StringValuePtr(v_string));
|
233
235
|
|
234
236
|
if(v_id == Qnil) {
|
235
237
|
id = kwt_data->last_id;
|
236
238
|
} else if(TYPE(v_id) != T_FIXNUM) {
|
237
|
-
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.",
|
239
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", StringValuePtr(v_id));
|
238
240
|
} else if(NUM2INT(v_id) <= 0) {
|
239
241
|
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
|
240
242
|
} else {
|
241
243
|
id= NUM2INT(v_id);
|
242
244
|
}
|
243
|
-
|
244
|
-
|
245
|
-
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
245
|
+
if(ac_add_string(kwt_data->tree, StringValuePtr(v_string), (int)NUM2INT(rb_funcall(v_string, rb_intern("length"), 0)), id) == 0)
|
246
|
+
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", StringValuePtr(v_string), id);
|
246
247
|
|
247
248
|
kwt_data->last_id= id + 1;
|
248
249
|
kwt_data->dictionary_size++;
|
@@ -273,24 +274,23 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
|
273
274
|
|
274
275
|
struct kwt_struct_data *kwt_data;
|
275
276
|
char word[1024];
|
276
|
-
int id;
|
277
|
+
int id = 0;
|
277
278
|
VALUE self;
|
278
|
-
VALUE
|
279
|
+
VALUE filename;
|
279
280
|
FILE *dictionary;
|
280
281
|
|
281
|
-
rb_scan_args(argc, argv, "10", &
|
282
|
-
|
283
|
-
|
284
|
-
SafeStringValue( f_string );
|
282
|
+
rb_scan_args(argc, argv, "10", &filename);
|
283
|
+
|
284
|
+
SafeStringValue(filename);
|
285
285
|
self= rb_class_new_instance( 0, NULL, klass );
|
286
286
|
KeywordTree( self, kwt_data );
|
287
287
|
|
288
|
-
dictionary
|
288
|
+
dictionary= fopen( StringValuePtr(filename), "r" );
|
289
289
|
if(dictionary == NULL)
|
290
|
-
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?",
|
290
|
+
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", StringValuePtr(filename));
|
291
291
|
|
292
292
|
while(fgets(word, 1024, dictionary) != NULL) {
|
293
|
-
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
293
|
+
ac_add_string(kwt_data->tree, word, (int)(strlen(word)-1), id++);
|
294
294
|
kwt_data->dictionary_size++;
|
295
295
|
}
|
296
296
|
|
@@ -314,7 +314,7 @@ rb_kwt_struct_alloc(VALUE klass)
|
|
314
314
|
/*
|
315
315
|
* Blump.
|
316
316
|
*/
|
317
|
-
void
|
317
|
+
void Init_native() {
|
318
318
|
rb_mAhoCorasick = rb_define_module("AhoCorasick");
|
319
319
|
rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
|
320
320
|
|
data/lib/ahocorasick.rb
ADDED
data/spec/ahocorasick_spec.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
|
1
|
+
%w(../lib ../ext).each do |path|
|
2
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
3
|
+
end
|
2
4
|
|
5
|
+
require 'ahocorasick'
|
3
6
|
include AhoCorasick
|
4
7
|
|
5
8
|
describe KeywordTree do
|
@@ -15,6 +18,51 @@ describe KeywordTree do
|
|
15
18
|
end
|
16
19
|
end
|
17
20
|
|
21
|
+
describe "not bugs" do
|
22
|
+
it "should return 3 results" do
|
23
|
+
tree= KeywordTree.new
|
24
|
+
tree.add_string "data"
|
25
|
+
tree.add_string "database"
|
26
|
+
results= tree.find_all "move all the data to a new database"
|
27
|
+
results.size.should == 3
|
28
|
+
end
|
29
|
+
it "should also return 3 results" do
|
30
|
+
tree= KeywordTree.new
|
31
|
+
tree.add_string "database"
|
32
|
+
tree.add_string "data"
|
33
|
+
results= tree.find_all "move all the data to a new database"
|
34
|
+
results.size.should == 3
|
35
|
+
end
|
36
|
+
it "should return 2 results" do
|
37
|
+
tree= KeywordTree.new
|
38
|
+
tree.add_string "base"
|
39
|
+
tree.add_string "database"
|
40
|
+
results= tree.find_all "move all the data to a new database"
|
41
|
+
results.size.should == 2
|
42
|
+
end
|
43
|
+
it "should also return 2 results" do
|
44
|
+
tree= KeywordTree.new
|
45
|
+
tree.add_string "database"
|
46
|
+
tree.add_string "base"
|
47
|
+
results= tree.find_all "move all the data to a new database"
|
48
|
+
results.size.should == 2
|
49
|
+
end
|
50
|
+
it "should return 2 results" do
|
51
|
+
tree= KeywordTree.new
|
52
|
+
tree.add_string "data"
|
53
|
+
results= tree.find_all "move all the data to a new database"
|
54
|
+
results.size.should == 2
|
55
|
+
end
|
56
|
+
it "should return 1 result on duplicates" do
|
57
|
+
tree= KeywordTree.new
|
58
|
+
tree.add_string "database"
|
59
|
+
tree.add_string "database"
|
60
|
+
results= tree.find_all "move all the data to a new database"
|
61
|
+
results.size.should == 1
|
62
|
+
results[0][:id] == 2
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
18
66
|
describe "How to create a new KeywordTree" do
|
19
67
|
it "should create a new KeywordTree" do
|
20
68
|
KeywordTree.new.class.should == KeywordTree
|
@@ -192,20 +240,28 @@ describe KeywordTree do
|
|
192
240
|
|
193
241
|
describe "Benchmarks. Loading from a file" do
|
194
242
|
|
243
|
+
before(:each) do
|
244
|
+
@start= Time.now
|
245
|
+
end
|
246
|
+
|
247
|
+
after(:each) do
|
248
|
+
@start=nil
|
249
|
+
end
|
250
|
+
|
195
251
|
it "should be fast to load a bunch of english words" do
|
196
|
-
start= Time.now
|
197
252
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
198
|
-
puts "\n%d words loaded in %s seconds" % [k.size, (Time.now - start)]
|
199
|
-
(Time.now
|
253
|
+
puts "\n%d words loaded in %s seconds" % [k.size, (Time.now - @start)]
|
254
|
+
(Time.now-@start).should < 0.2
|
200
255
|
end
|
201
256
|
|
202
257
|
it "should be fast to find" do
|
203
|
-
start= Time.now
|
258
|
+
# start= Time.now
|
204
259
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
205
260
|
load_time= Time.now
|
206
261
|
results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
|
207
|
-
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
|
262
|
+
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - @start), results.size, (Time.now-load_time)]
|
208
263
|
(Time.now-load_time).should < 1.3
|
264
|
+
puts results.last.inspect
|
209
265
|
end
|
210
266
|
end
|
211
267
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aurelian-ruby-ahocorasick
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -18,14 +18,15 @@ email: oancea at gmail dot com
|
|
18
18
|
executables: []
|
19
19
|
|
20
20
|
extensions:
|
21
|
-
- ext/extconf.rb
|
21
|
+
- ext/ahocorasick/extconf.rb
|
22
22
|
extra_rdoc_files: []
|
23
23
|
|
24
24
|
files:
|
25
|
-
- ext/extconf.rb
|
26
|
-
- ext/ruby-ahocorasick.c
|
27
|
-
- ext/ac.h
|
28
|
-
- ext/ac.c
|
25
|
+
- ext/ahocorasick/extconf.rb
|
26
|
+
- ext/ahocorasick/ruby-ahocorasick.c
|
27
|
+
- ext/ahocorasick/ac.h
|
28
|
+
- ext/ahocorasick/ac.c
|
29
|
+
- lib/ahocorasick.rb
|
29
30
|
- examples/dict.rb
|
30
31
|
- examples/test.rb
|
31
32
|
- examples/elev.rb
|
@@ -40,7 +41,7 @@ rdoc_options:
|
|
40
41
|
- --title
|
41
42
|
- Ruby-AhoCorasick
|
42
43
|
- --inline-source
|
43
|
-
- ext/ruby-ahocorasick.c
|
44
|
+
- ext/ahocorasick/ruby-ahocorasick.c
|
44
45
|
- README.textile
|
45
46
|
- --main
|
46
47
|
- README.textile
|