aurelian-ruby-ahocorasick 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +5 -1
- data/ext/{ac.c → ahocorasick/ac.c} +0 -0
- data/ext/{ac.h → ahocorasick/ac.h} +0 -0
- data/ext/ahocorasick/extconf.rb +8 -0
- data/ext/{ruby-ahocorasick.c → ahocorasick/ruby-ahocorasick.c} +24 -24
- data/lib/ahocorasick.rb +7 -0
- data/spec/ahocorasick_spec.rb +62 -6
- metadata +8 -7
- data/ext/extconf.rb +0 -6
data/README.textile
CHANGED
@@ -48,7 +48,11 @@ $ rake install
|
|
48
48
|
|
49
49
|
h3. Rubygems - Stable Version
|
50
50
|
|
51
|
-
|
51
|
+
Get version 0.4.5 (released on 19 November 2008) from "rubyforge":http://rubyforge.org/frs/?group_id=4024&release_id=28421 :
|
52
|
+
|
53
|
+
<pre>
|
54
|
+
$ gem install ruby-ahocorasick
|
55
|
+
</pre>
|
52
56
|
|
53
57
|
|
54
58
|
h4. Notes
|
File without changes
|
File without changes
|
@@ -132,7 +132,7 @@ static VALUE
|
|
132
132
|
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
133
133
|
{
|
134
134
|
char * remain; // returned by ac_search, the remaining text to search
|
135
|
-
int lgt, id, ends_at
|
135
|
+
int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and starts_at/ends_at position
|
136
136
|
VALUE v_result; // one result, as hash
|
137
137
|
VALUE v_results; // all the results, an array
|
138
138
|
|
@@ -143,12 +143,14 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
143
143
|
rb_scan_args(argc, argv, "1", &v_search);
|
144
144
|
// it should be string.
|
145
145
|
Check_Type(v_search, T_STRING);
|
146
|
+
v_search= StringValue( v_search );
|
147
|
+
|
146
148
|
// get the structure
|
147
149
|
KeywordTree(self, kwt_data);
|
148
150
|
// freeze the tree, if not already
|
149
151
|
if(kwt_data->is_frozen == 0) {
|
150
152
|
if(ac_prep( kwt_data->tree ) == 0)
|
151
|
-
rb_raise(rb_eRuntimeError, "Cannot freeze the tree");
|
153
|
+
rb_raise(rb_eRuntimeError, "Cannot freeze the tree!");
|
152
154
|
kwt_data->is_frozen = 1;
|
153
155
|
}
|
154
156
|
// prepare the return value
|
@@ -157,15 +159,15 @@ rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
|
|
157
159
|
if(kwt_data->dictionary_size == 0)
|
158
160
|
return v_results;
|
159
161
|
// prepare the search
|
160
|
-
ac_search_init(kwt_data->tree,
|
162
|
+
ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)NUM2INT(rb_funcall(v_search, rb_intern("length"), 0)));
|
161
163
|
// loop through the results
|
162
164
|
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
163
165
|
// this is an individual result as a hash
|
164
166
|
v_result= rb_hash_new();
|
165
|
-
rb_hash_aset( v_result, sym_id,
|
166
|
-
rb_hash_aset( v_result, sym_starts_at,
|
167
|
-
rb_hash_aset( v_result, sym_ends_at,
|
168
|
-
rb_hash_aset( v_result, sym_value, rb_str_new(remain, lgt) );
|
167
|
+
rb_hash_aset( v_result, sym_id, INT2NUM( (long)id ) );
|
168
|
+
rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
|
169
|
+
rb_hash_aset( v_result, sym_ends_at, INT2NUM( (long)(ends_at - 1) ) );
|
170
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
|
169
171
|
rb_ary_push( v_results, v_result );
|
170
172
|
}
|
171
173
|
// reopen the tree
|
@@ -219,30 +221,29 @@ rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
|
219
221
|
{
|
220
222
|
VALUE v_string, v_id;
|
221
223
|
struct kwt_struct_data *kwt_data;
|
222
|
-
char * string;
|
224
|
+
// char * string;
|
223
225
|
int id;
|
224
226
|
|
225
227
|
rb_scan_args(argc, argv, "11", &v_string, &v_id);
|
226
228
|
|
227
229
|
Check_Type(v_string, T_STRING);
|
228
|
-
string=
|
230
|
+
// string= StringValuePtr(v_string);
|
229
231
|
KeywordTree(self, kwt_data);
|
230
232
|
|
231
233
|
if(kwt_data->is_frozen == 1)
|
232
|
-
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.",
|
234
|
+
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", StringValuePtr(v_string));
|
233
235
|
|
234
236
|
if(v_id == Qnil) {
|
235
237
|
id = kwt_data->last_id;
|
236
238
|
} else if(TYPE(v_id) != T_FIXNUM) {
|
237
|
-
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.",
|
239
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", StringValuePtr(v_id));
|
238
240
|
} else if(NUM2INT(v_id) <= 0) {
|
239
241
|
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
|
240
242
|
} else {
|
241
243
|
id= NUM2INT(v_id);
|
242
244
|
}
|
243
|
-
|
244
|
-
|
245
|
-
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
245
|
+
if(ac_add_string(kwt_data->tree, StringValuePtr(v_string), (int)NUM2INT(rb_funcall(v_string, rb_intern("length"), 0)), id) == 0)
|
246
|
+
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", StringValuePtr(v_string), id);
|
246
247
|
|
247
248
|
kwt_data->last_id= id + 1;
|
248
249
|
kwt_data->dictionary_size++;
|
@@ -273,24 +274,23 @@ rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
|
273
274
|
|
274
275
|
struct kwt_struct_data *kwt_data;
|
275
276
|
char word[1024];
|
276
|
-
int id;
|
277
|
+
int id = 0;
|
277
278
|
VALUE self;
|
278
|
-
VALUE
|
279
|
+
VALUE filename;
|
279
280
|
FILE *dictionary;
|
280
281
|
|
281
|
-
rb_scan_args(argc, argv, "10", &
|
282
|
-
|
283
|
-
|
284
|
-
SafeStringValue( f_string );
|
282
|
+
rb_scan_args(argc, argv, "10", &filename);
|
283
|
+
|
284
|
+
SafeStringValue(filename);
|
285
285
|
self= rb_class_new_instance( 0, NULL, klass );
|
286
286
|
KeywordTree( self, kwt_data );
|
287
287
|
|
288
|
-
dictionary
|
288
|
+
dictionary= fopen( StringValuePtr(filename), "r" );
|
289
289
|
if(dictionary == NULL)
|
290
|
-
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?",
|
290
|
+
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", StringValuePtr(filename));
|
291
291
|
|
292
292
|
while(fgets(word, 1024, dictionary) != NULL) {
|
293
|
-
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
293
|
+
ac_add_string(kwt_data->tree, word, (int)(strlen(word)-1), id++);
|
294
294
|
kwt_data->dictionary_size++;
|
295
295
|
}
|
296
296
|
|
@@ -314,7 +314,7 @@ rb_kwt_struct_alloc(VALUE klass)
|
|
314
314
|
/*
|
315
315
|
* Blump.
|
316
316
|
*/
|
317
|
-
void
|
317
|
+
void Init_native() {
|
318
318
|
rb_mAhoCorasick = rb_define_module("AhoCorasick");
|
319
319
|
rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
|
320
320
|
|
data/lib/ahocorasick.rb
ADDED
data/spec/ahocorasick_spec.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
|
1
|
+
%w(../lib ../ext).each do |path|
|
2
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
|
3
|
+
end
|
2
4
|
|
5
|
+
require 'ahocorasick'
|
3
6
|
include AhoCorasick
|
4
7
|
|
5
8
|
describe KeywordTree do
|
@@ -15,6 +18,51 @@ describe KeywordTree do
|
|
15
18
|
end
|
16
19
|
end
|
17
20
|
|
21
|
+
describe "not bugs" do
|
22
|
+
it "should return 3 results" do
|
23
|
+
tree= KeywordTree.new
|
24
|
+
tree.add_string "data"
|
25
|
+
tree.add_string "database"
|
26
|
+
results= tree.find_all "move all the data to a new database"
|
27
|
+
results.size.should == 3
|
28
|
+
end
|
29
|
+
it "should also return 3 results" do
|
30
|
+
tree= KeywordTree.new
|
31
|
+
tree.add_string "database"
|
32
|
+
tree.add_string "data"
|
33
|
+
results= tree.find_all "move all the data to a new database"
|
34
|
+
results.size.should == 3
|
35
|
+
end
|
36
|
+
it "should return 2 results" do
|
37
|
+
tree= KeywordTree.new
|
38
|
+
tree.add_string "base"
|
39
|
+
tree.add_string "database"
|
40
|
+
results= tree.find_all "move all the data to a new database"
|
41
|
+
results.size.should == 2
|
42
|
+
end
|
43
|
+
it "should also return 2 results" do
|
44
|
+
tree= KeywordTree.new
|
45
|
+
tree.add_string "database"
|
46
|
+
tree.add_string "base"
|
47
|
+
results= tree.find_all "move all the data to a new database"
|
48
|
+
results.size.should == 2
|
49
|
+
end
|
50
|
+
it "should return 2 results" do
|
51
|
+
tree= KeywordTree.new
|
52
|
+
tree.add_string "data"
|
53
|
+
results= tree.find_all "move all the data to a new database"
|
54
|
+
results.size.should == 2
|
55
|
+
end
|
56
|
+
it "should return 1 result on duplicates" do
|
57
|
+
tree= KeywordTree.new
|
58
|
+
tree.add_string "database"
|
59
|
+
tree.add_string "database"
|
60
|
+
results= tree.find_all "move all the data to a new database"
|
61
|
+
results.size.should == 1
|
62
|
+
results[0][:id] == 2
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
18
66
|
describe "How to create a new KeywordTree" do
|
19
67
|
it "should create a new KeywordTree" do
|
20
68
|
KeywordTree.new.class.should == KeywordTree
|
@@ -192,20 +240,28 @@ describe KeywordTree do
|
|
192
240
|
|
193
241
|
describe "Benchmarks. Loading from a file" do
|
194
242
|
|
243
|
+
before(:each) do
|
244
|
+
@start= Time.now
|
245
|
+
end
|
246
|
+
|
247
|
+
after(:each) do
|
248
|
+
@start=nil
|
249
|
+
end
|
250
|
+
|
195
251
|
it "should be fast to load a bunch of english words" do
|
196
|
-
start= Time.now
|
197
252
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
198
|
-
puts "\n%d words loaded in %s seconds" % [k.size, (Time.now - start)]
|
199
|
-
(Time.now
|
253
|
+
puts "\n%d words loaded in %s seconds" % [k.size, (Time.now - @start)]
|
254
|
+
(Time.now-@start).should < 0.2
|
200
255
|
end
|
201
256
|
|
202
257
|
it "should be fast to find" do
|
203
|
-
start= Time.now
|
258
|
+
# start= Time.now
|
204
259
|
k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
|
205
260
|
load_time= Time.now
|
206
261
|
results= k.find_all( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
|
207
|
-
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - start), results.size, (Time.now-load_time)]
|
262
|
+
puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (load_time - @start), results.size, (Time.now-load_time)]
|
208
263
|
(Time.now-load_time).should < 1.3
|
264
|
+
puts results.last.inspect
|
209
265
|
end
|
210
266
|
end
|
211
267
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aurelian-ruby-ahocorasick
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -18,14 +18,15 @@ email: oancea at gmail dot com
|
|
18
18
|
executables: []
|
19
19
|
|
20
20
|
extensions:
|
21
|
-
- ext/extconf.rb
|
21
|
+
- ext/ahocorasick/extconf.rb
|
22
22
|
extra_rdoc_files: []
|
23
23
|
|
24
24
|
files:
|
25
|
-
- ext/extconf.rb
|
26
|
-
- ext/ruby-ahocorasick.c
|
27
|
-
- ext/ac.h
|
28
|
-
- ext/ac.c
|
25
|
+
- ext/ahocorasick/extconf.rb
|
26
|
+
- ext/ahocorasick/ruby-ahocorasick.c
|
27
|
+
- ext/ahocorasick/ac.h
|
28
|
+
- ext/ahocorasick/ac.c
|
29
|
+
- lib/ahocorasick.rb
|
29
30
|
- examples/dict.rb
|
30
31
|
- examples/test.rb
|
31
32
|
- examples/elev.rb
|
@@ -40,7 +41,7 @@ rdoc_options:
|
|
40
41
|
- --title
|
41
42
|
- Ruby-AhoCorasick
|
42
43
|
- --inline-source
|
43
|
-
- ext/ruby-ahocorasick.c
|
44
|
+
- ext/ahocorasick/ruby-ahocorasick.c
|
44
45
|
- README.textile
|
45
46
|
- --main
|
46
47
|
- README.textile
|