aurelian-ruby-ahocorasick 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +21 -0
- data/README.textile +36 -0
- data/examples/dict.rb +14 -0
- data/examples/elev.rb +19 -0
- data/examples/sample.c +94 -0
- data/examples/test.rb +46 -0
- data/ext/ac.c +623 -0
- data/ext/ac.h +36 -0
- data/ext/extconf.rb +4 -0
- data/ext/ruby-ahocorasick.c +329 -0
- data/spec/ahocorasick_spec.rb +183 -0
- metadata +69 -0
data/ext/ac.h
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
|
2
|
+
#ifndef AC_H
#define AC_H

/*
 * Declarations for the Aho-Corasick keyword tree (presumably implemented in
 * ac.c -- confirm against the build).
 *
 * The include guard is spelled AC_H because identifiers that start with an
 * underscore followed by an uppercase letter are reserved for the C
 * implementation.
 */

/*
 * One node of the keyword tree.  Nodes refer to each other through the four
 * link pointers below; AC_TREE is the pointer-to-node alias used by the API.
 * NOTE(review): the precise role of each field is defined by the
 * implementation, which is not visible from this header.
 */
typedef struct actreenode {
  char ch;
  int matchid;
  struct actreenode *outlink, *faillink;
  struct actreenode *children, *sibling;
} ACTREE_NODE, *AC_TREE;

/*
 * A keyword tree together with its bookkeeping and the state of the search
 * currently in progress.  Instances are obtained from ac_alloc() and released
 * with ac_free().
 */
typedef struct {
  AC_TREE tree;
  int ispreprocessed, errorflag;

  int Psize;
  int *Plengths;

  /* NOTE(review): T and N share their names with the ac_search_init()
   * parameters, so they presumably hold the text being searched and its
   * length -- confirm in the implementation. */
  char *T;
  int N, c, initflag, endflag;
  AC_TREE w, output;
#ifdef STATS
  /* Counters that only exist when the library is built with STATS defined. */
  int prep_new_edges, prep_old_edges, prep_fail_compares;
  int num_compares, num_failures, edges_traversed, outlinks_traversed;
#endif
} AC_STRUCT;

/* Allocates a new, empty tree. */
AC_STRUCT *ac_alloc(void);

/* Adds the M-byte string P under identifier id.  Callers treat a return
 * value of 0 as failure. */
int ac_add_string(AC_STRUCT *node, char *P, int M, int id);
// int ac_del_string(AC_STRUCT *node, char *P, int M, int id);

/* Prepares the tree for searching; call after the last ac_add_string(). */
int ac_prep(AC_STRUCT *node);

/* Starts a search over the N bytes at T. */
void ac_search_init(AC_STRUCT *node, char *T, int N);

/* Returns the next match, or NULL when there are none left.  On a match the
 * three out-parameters receive its length, its identifier and its end
 * position. */
char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at);

/* Releases a tree obtained from ac_alloc(). */
void ac_free(AC_STRUCT *node);

#endif /* AC_H */
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,329 @@
|
|
1
|
+
|
2
|
+
//
|
3
|
+
// (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
|
4
|
+
//
|
5
|
+
// Released under MIT-LICENSE
|
6
|
+
//
|
7
|
+
|
8
|
+
//
|
9
|
+
// TODO: new methods?
|
10
|
+
// * kwt[id] = word
|
11
|
+
// * kwt.from_file (class instance method)
|
12
|
+
//
|
13
|
+
|
14
|
+
#include <ruby.h>
|
15
|
+
#include "ac.h"
|
16
|
+
|
17
|
+
// Hash keys used for every search result; assigned once in Init_ahocorasick.
static VALUE sym_id;
static VALUE sym_value;
static VALUE sym_ends_at;
static VALUE sym_starts_at;

// The AhoCorasick module and its KeywordTree class; assigned in Init_ahocorasick.
VALUE rb_mAhoCorasick;
VALUE rb_cKeywordTree;
|
21
|
+
|
22
|
+
// Loads the struct kwt_struct_data wrapped by the Ruby object `obj` into the
// pointer variable `kwt_data`.  The do { } while (0) wrapper makes the
// expansion a single statement, so `KeywordTree(a, b);` stays correct inside
// an unbraced if/else.
#define KeywordTree(obj, kwt_data) do {\
  Data_Get_Struct((obj), struct kwt_struct_data, kwt_data);\
} while (0)
|
25
|
+
|
26
|
+
struct kwt_struct_data {
|
27
|
+
AC_STRUCT * tree;
|
28
|
+
int last_id;
|
29
|
+
int dictionary_size;
|
30
|
+
int is_frozen;
|
31
|
+
};
|
32
|
+
|
33
|
+
/*
|
34
|
+
* call-seq: initialize
|
35
|
+
*
|
36
|
+
* Creates a new KeywordTree
|
37
|
+
*
|
38
|
+
* require 'ahocorasick'
|
39
|
+
* kwt = Ahocorasick::KeywordTree.new
|
40
|
+
*
|
41
|
+
*/
|
42
|
+
static VALUE
|
43
|
+
rb_kwt_init(VALUE self)
|
44
|
+
{
|
45
|
+
AC_STRUCT * tree;
|
46
|
+
struct kwt_struct_data *kwt_data;
|
47
|
+
|
48
|
+
kwt_data = ALLOC(struct kwt_struct_data);
|
49
|
+
tree = ac_alloc();
|
50
|
+
DATA_PTR(self) = kwt_data;
|
51
|
+
kwt_data->tree = tree;
|
52
|
+
kwt_data->last_id = 1;
|
53
|
+
kwt_data->dictionary_size = 0;
|
54
|
+
kwt_data->is_frozen = 0;
|
55
|
+
return self;
|
56
|
+
}
|
57
|
+
|
58
|
+
/*
|
59
|
+
* Document-method: make
|
60
|
+
* call-seq: make
|
61
|
+
*
|
62
|
+
* It freezes the current KeywordTree. After this point, the tree will not accept any new entries.
|
63
|
+
*
|
64
|
+
* ==== Note: This method is called internally by search
|
65
|
+
*
|
66
|
+
* require 'ahocorasick'
|
67
|
+
*
|
68
|
+
* kwt = Ahocorasick::KeywordTree.new
|
69
|
+
*
|
70
|
+
* kwt.add_string("one")
|
71
|
+
* kwt.add_string("two")
|
72
|
+
* kwt.make()
|
73
|
+
*/
|
74
|
+
static VALUE
|
75
|
+
rb_kwt_make(VALUE self)
|
76
|
+
{
|
77
|
+
struct kwt_struct_data *kwt_data;
|
78
|
+
KeywordTree(self, kwt_data);
|
79
|
+
|
80
|
+
ac_prep( kwt_data->tree );
|
81
|
+
kwt_data->is_frozen = 1;
|
82
|
+
return self;
|
83
|
+
}
|
84
|
+
|
85
|
+
/*
|
86
|
+
* Document-method: search
|
87
|
+
* call-seq: search
|
88
|
+
*
|
89
|
+
* Search the current tree.
|
90
|
+
*
|
91
|
+
* It returns an array on hashes, e.g.
|
92
|
+
*
|
93
|
+
* [ { :id => int, :value => int, :starts_at => int, :ends_at => int}, { ... } ]
|
94
|
+
*
|
95
|
+
* Returns an empty array when the search didn't return any result.
|
96
|
+
*
|
97
|
+
* # assuming a valid KeywordTree kwt object:
|
98
|
+
* kwt.add_string("one")
|
99
|
+
* kwt.add_string("two")
|
100
|
+
*
|
101
|
+
* kwt.search( "moved two times already" ).each do | result |
|
102
|
+
* result[:id] # => 2
|
103
|
+
* result[:ends_at] # => 9
|
104
|
+
* result[:starts_at] # => 6
|
105
|
+
* result[:value] # => two
|
106
|
+
* end # => 1
|
107
|
+
*
|
108
|
+
*/
|
109
|
+
static VALUE
|
110
|
+
rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
111
|
+
{
|
112
|
+
char * result; // itermediate result
|
113
|
+
char * remain; // returned by ac_search, the remaing text to search
|
114
|
+
int lgt, id, ends_at; // filled in by ac_search, the id, length and ends_at position
|
115
|
+
int starts_at;
|
116
|
+
VALUE v_result; // one result, as hash
|
117
|
+
VALUE v_results; // all the results, an array
|
118
|
+
VALUE v_search; // search string, function argument
|
119
|
+
struct kwt_struct_data *kwt_data;
|
120
|
+
|
121
|
+
// one mandatory argument.
|
122
|
+
rb_scan_args(argc, argv, "1", &v_search);
|
123
|
+
// it should be string.
|
124
|
+
Check_Type(v_search, T_STRING);
|
125
|
+
// get the structure
|
126
|
+
KeywordTree(self, kwt_data);
|
127
|
+
// freeze the tree, if not already
|
128
|
+
if(kwt_data->is_frozen == 0) {
|
129
|
+
ac_prep( kwt_data->tree );
|
130
|
+
kwt_data->is_frozen = 1;
|
131
|
+
}
|
132
|
+
// prepare the return value
|
133
|
+
// v_results= rb_block_given_p()? Qnil : rb_ary_new();
|
134
|
+
v_results= rb_ary_new();
|
135
|
+
// fail quickly and return the empty array
|
136
|
+
if(kwt_data->dictionary_size == 0)
|
137
|
+
return v_results;
|
138
|
+
// prepare the search
|
139
|
+
ac_search_init(kwt_data->tree, RSTRING( v_search )->ptr, RSTRING( v_search )->len);
|
140
|
+
// loop trought the results
|
141
|
+
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
142
|
+
// this is an individual result as a hash
|
143
|
+
v_result= rb_hash_new();
|
144
|
+
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
145
|
+
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
146
|
+
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
147
|
+
result = (char*) malloc (sizeof(char)*lgt);
|
148
|
+
sprintf( result, "%.*s", lgt, remain);
|
149
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
150
|
+
|
151
|
+
// yield this hash or, add it to the results
|
152
|
+
// if(rb_block_given_p())
|
153
|
+
// rb_yield(v_result);
|
154
|
+
// else
|
155
|
+
rb_ary_push( v_results, v_result );
|
156
|
+
free(result);
|
157
|
+
}
|
158
|
+
|
159
|
+
// TODO: maybe the Tree can be re-opened to add new items to dictionary
|
160
|
+
|
161
|
+
// return the results or nil if none
|
162
|
+
// if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
|
163
|
+
return v_results;
|
164
|
+
// } else {
|
165
|
+
// return Qnil;
|
166
|
+
// }
|
167
|
+
}
|
168
|
+
|
169
|
+
|
170
|
+
/*
|
171
|
+
* Document-method: size
|
172
|
+
* call-seq: size
|
173
|
+
*
|
174
|
+
* Returns the size of this KeywordTree
|
175
|
+
*
|
176
|
+
* kwt.add_string("foo")
|
177
|
+
* kwt.add_string("bar")
|
178
|
+
* kwt.size #=> 2
|
179
|
+
*
|
180
|
+
*/
|
181
|
+
static VALUE
|
182
|
+
rb_kwt_size(VALUE self)
|
183
|
+
{
|
184
|
+
struct kwt_struct_data *kwt_data;
|
185
|
+
KeywordTree(self, kwt_data);
|
186
|
+
|
187
|
+
return INT2FIX(kwt_data->dictionary_size);
|
188
|
+
}
|
189
|
+
|
190
|
+
|
191
|
+
/*
|
192
|
+
* Document-method: add_string
|
193
|
+
* call-seq: add_string
|
194
|
+
*
|
195
|
+
* Adds a sequence to this KeywordTree.
|
196
|
+
*
|
197
|
+
* kwt.add_string("foo1$21^ 98N3 ba>Z")
|
198
|
+
* kwt << "bar" # using the alias
|
199
|
+
*
|
200
|
+
* ==== Note: you can also specify the id, a number between 1 and k
|
201
|
+
*
|
202
|
+
* kwt.add_string "bar", 123
|
203
|
+
*
|
204
|
+
* This id should be unique in the context of the current tree.
|
205
|
+
*
|
206
|
+
*/
|
207
|
+
static VALUE
|
208
|
+
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
209
|
+
{
|
210
|
+
VALUE v_string, v_id;
|
211
|
+
struct kwt_struct_data *kwt_data;
|
212
|
+
char * string;
|
213
|
+
int id;
|
214
|
+
|
215
|
+
rb_scan_args(argc, argv, "11", &v_string, &v_id);
|
216
|
+
|
217
|
+
Check_Type(v_string, T_STRING);
|
218
|
+
string= RSTRING(v_string)->ptr;
|
219
|
+
|
220
|
+
KeywordTree(self, kwt_data);
|
221
|
+
|
222
|
+
if(kwt_data->is_frozen == 1)
|
223
|
+
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", string);
|
224
|
+
|
225
|
+
if(v_id == Qnil) {
|
226
|
+
id = kwt_data->last_id;
|
227
|
+
} else if(TYPE(v_id) != T_FIXNUM) {
|
228
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", RSTRING(v_id)->ptr);
|
229
|
+
} else if(NUM2INT(v_id) <= 0) {
|
230
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
|
231
|
+
} else {
|
232
|
+
id= NUM2INT(v_id);
|
233
|
+
}
|
234
|
+
|
235
|
+
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
236
|
+
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
237
|
+
}
|
238
|
+
|
239
|
+
kwt_data->last_id= id + 1;
|
240
|
+
kwt_data->dictionary_size++;
|
241
|
+
return self;
|
242
|
+
}
|
243
|
+
|
244
|
+
/*
|
245
|
+
* call-seq: from_file
|
246
|
+
*
|
247
|
+
* Creates a new KeywordTree and loads the dictionary from a file
|
248
|
+
*
|
249
|
+
* % cat dict0.txt
|
250
|
+
* foo
|
251
|
+
* bar
|
252
|
+
* base
|
253
|
+
*
|
254
|
+
* k= AhoCorasick::KeywordTree.from_file "dict0.txt"
|
255
|
+
* k.search("basement").size # => 1
|
256
|
+
*
|
257
|
+
*/
|
258
|
+
static VALUE
|
259
|
+
rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
260
|
+
{
|
261
|
+
|
262
|
+
// TODO:
|
263
|
+
// * use rb_kwt_add_string
|
264
|
+
// * use rb_io* to handle the file
|
265
|
+
|
266
|
+
struct kwt_struct_data *kwt_data;
|
267
|
+
char word[1024];
|
268
|
+
int id;
|
269
|
+
VALUE self;
|
270
|
+
VALUE f_string;
|
271
|
+
FILE *dictionary;
|
272
|
+
|
273
|
+
rb_scan_args(argc, argv, "10", &f_string);
|
274
|
+
|
275
|
+
id = 0;
|
276
|
+
SafeStringValue( f_string );
|
277
|
+
self= rb_class_new_instance( 0, NULL, klass );
|
278
|
+
KeywordTree( self, kwt_data );
|
279
|
+
|
280
|
+
dictionary = fopen( RSTRING( f_string )->ptr, "r" );
|
281
|
+
if(dictionary == NULL) {
|
282
|
+
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
|
283
|
+
}
|
284
|
+
|
285
|
+
while(fgets(word, 1024, dictionary) != NULL) {
|
286
|
+
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
287
|
+
kwt_data->dictionary_size++;
|
288
|
+
}
|
289
|
+
kwt_data->last_id= id+1;
|
290
|
+
fclose(dictionary);
|
291
|
+
return self;
|
292
|
+
}
|
293
|
+
|
294
|
+
static void
|
295
|
+
rb_kwt_struct_free(struct kwt_struct_data * kwt_data)
|
296
|
+
{
|
297
|
+
ac_free(kwt_data->tree);
|
298
|
+
}
|
299
|
+
|
300
|
+
static VALUE
|
301
|
+
rb_kwt_struct_alloc(VALUE klass)
|
302
|
+
{
|
303
|
+
return Data_Wrap_Struct(klass, 0, rb_kwt_struct_free, 0);
|
304
|
+
}
|
305
|
+
|
306
|
+
/*
|
307
|
+
* Blump.
|
308
|
+
*/
|
309
|
+
void Init_ahocorasick() {
|
310
|
+
rb_mAhoCorasick = rb_define_module("AhoCorasick");
|
311
|
+
rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
|
312
|
+
|
313
|
+
rb_define_alloc_func(rb_cKeywordTree, rb_kwt_struct_alloc);
|
314
|
+
|
315
|
+
rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
|
316
|
+
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
317
|
+
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
318
|
+
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
319
|
+
rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
|
320
|
+
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
321
|
+
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
322
|
+
|
323
|
+
sym_id= ID2SYM(rb_intern("id"));
|
324
|
+
sym_value= ID2SYM(rb_intern("value"));
|
325
|
+
sym_ends_at= ID2SYM( rb_intern("ends_at") );
|
326
|
+
sym_starts_at= ID2SYM( rb_intern("starts_at") );
|
327
|
+
|
328
|
+
}
|
329
|
+
|
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'ext/ahocorasick'
|
2
|
+
|
3
|
+
include AhoCorasick
|
4
|
+
|
5
|
+
describe KeywordTree do

  describe "How to create a new KeywordTree" do
    it "should create a new KeywordTree" do
      KeywordTree.new.class.should == KeywordTree
    end
    it "should create a new KeywordTree from a file" do
      KeywordTree.from_file( File.dirname(__FILE__) + "/data/dict0.txt").class.should == KeywordTree
    end
  end

  describe "How to search" do

    before(:each) do
      @kwt= KeywordTree.new
    end

    # XXX: is this useful?
    after(:each) do
      @kwt= nil
    end

    it "should return an array" do
      @kwt << "foo"
      @kwt.search("bar").class.should == Array
    end

    it "the array should contain hashes" do
      @kwt << "bar" << "foo"
      @kwt.search("foo")[0].class.should == Hash
    end

    # XXX: this is subject of ...talks. no yield at this point
    # it "should return nil if block_given?" do
    #   @kwt.search("foo"){|r| r[:id]}.should == nil
    # end

    it "should return empty array if no results" do
      @kwt.search("baba").should == []
    end

    it "each hash should have the required symbols values" do
      @kwt << "bar" << "foo"
      @kwt.search("foo").each do | r |
        r[:id].class.should == Fixnum
        r[:starts_at].class.should == Fixnum
        r[:ends_at].class.should == Fixnum
        r[:value].should == "foo"
      end
    end

    it "should match position" do
      # "data" covers offsets 0..3, so the match ends at offset 4
      @kwt << "data"
      q= "data moved"
      @kwt.search(q).each do | result |
        result[:starts_at].should == 0
        result[:ends_at].should == 4
      end
    end

    it "should match position with unicode" do
      # positions are byte offsets: with UTF-8 source, "bucurești" is 10 bytes
      # long and starts right after the 14 bytes of "data moved to "
      @kwt << "bucurești"
      q= "data moved to bucurești"
      @kwt.search(q).each do | result |
        result[:starts_at].should == 14
        result[:ends_at].should == 24
      end
    end

    it "more unicode" do
      @kwt << "expected"
      # "moved to bucurești as " takes 23 bytes in UTF-8
      q = "moved to bucurești as expected"
      @kwt.search(q).each do | r |
        r[:starts_at].should == 23
        r[:ends_at].should == q.size
        (r[:ends_at]-r[:starts_at]).should == r[:value].size
      end
    end

    it "checks for result length" do
      @kwt << "foo"
      result= @kwt.search("foo").first
      # ends_at is one past the last matched position
      (result[:ends_at]-result[:starts_at]).should == result[:value].size
      "foo"[result[:ends_at]].should == nil
      "foo"[result[:ends_at]-1].chr.should == "o"
    end

  end

  describe "Context Match vs. Exact Word Match" do

    before(:each) do
      # data, base, database
      @kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
    end

    it "should match on context" do
      @kwt.search("I've moved the data to a new database").size.should == 4
    end

  end

  describe "How to add strings" do
    it "should add 2 strings" do
      kwt= KeywordTree.new
      kwt.add_string "foo"
      kwt << "bar"
      kwt.size.should == 2
    end

    it "should add 2 strings with id" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      kwt.add_string "bar", 2
      kwt.size.should == 2
    end

    it "should raise an error when adding same id twice" do
      kwt= KeywordTree.new
      kwt.add_string "foo", 1
      lambda{kwt.add_string("bar", 1)}.should raise_error(RuntimeError)
    end

    it "should raise an error when not using id's > 0" do
      kwt= KeywordTree.new
      lambda{kwt.add_string("bar", -1)}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", "a")}.should raise_error(RuntimeError)
      lambda{kwt.add_string("bar", 0)}.should raise_error(RuntimeError)
    end

    it "should work to add a random id" do
      kwt= KeywordTree.new
      kwt << "baz"
      kwt.add_string "foo", 1990
      kwt << "bar"
      kwt.size.should == 3
    end

    it "should add strings from file and manually" do
      kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
      kwt << "foo"
      kwt.size.should == File.readlines( File.dirname(__FILE__) + "/data/dict0.txt" ).size + 1
    end

    it "should raise an error when adding new strings after the tree is frozen" do
      kwt= KeywordTree.from_file File.dirname(__FILE__) + "/data/dict0.txt"
      kwt.make
      lambda{kwt << "foo"}.should raise_error(RuntimeError)
    end

  end

  describe "Benchmarks. Loading from a file" do

    it "should be fast to load a bunch of english words" do
      start= Time.now
      k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
      # measure once, so the reported and the asserted durations are the same
      load_time= Time.now - start
      puts "\n%d words loaded in %s seconds" % [k.size, load_time]
      load_time.should < 0.2
    end

    it "should be fast to find" do
      start= Time.now
      k= KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
      loaded_at= Time.now
      results= k.search( File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" ) )
      # measure once, so the reported and the asserted durations are the same
      search_time= Time.now - loaded_at
      puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [k.size, (loaded_at - start), results.size, search_time]
      search_time.should < 1.2
    end
  end

end
|