aurelian-ruby-ahocorasick 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +21 -0
- data/README.textile +36 -0
- data/examples/dict.rb +14 -0
- data/examples/elev.rb +19 -0
- data/examples/sample.c +94 -0
- data/examples/test.rb +46 -0
- data/ext/ac.c +623 -0
- data/ext/ac.h +36 -0
- data/ext/extconf.rb +4 -0
- data/ext/ruby-ahocorasick.c +329 -0
- data/spec/ahocorasick_spec.rb +183 -0
- metadata +69 -0
data/ext/ac.h
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
|
2
|
+
#ifndef _AC_H_
|
3
|
+
#define _AC_H_
|
4
|
+
|
5
|
+
/*
 * One node of the keyword tree.
 *
 * NOTE(review): the roles of the link fields are inferred from their names
 * only (output/failure links, first child, next sibling) -- confirm in ac.c.
 */
struct actreenode {
  char ch;
  int matchid;
  struct actreenode *outlink;
  struct actreenode *faillink;
  struct actreenode *children;
  struct actreenode *sibling;
};

typedef struct actreenode ACTREE_NODE;
typedef struct actreenode *AC_TREE;
|
11
|
+
|
12
|
+
typedef struct {
|
13
|
+
AC_TREE tree;
|
14
|
+
int ispreprocessed, errorflag;
|
15
|
+
|
16
|
+
int Psize;
|
17
|
+
int *Plengths;
|
18
|
+
|
19
|
+
char *T;
|
20
|
+
int N, c, initflag, endflag;
|
21
|
+
AC_TREE w, output;
|
22
|
+
#ifdef STATS
|
23
|
+
int prep_new_edges, prep_old_edges, prep_fail_compares;
|
24
|
+
int num_compares, num_failures, edges_traversed, outlinks_traversed;
|
25
|
+
#endif
|
26
|
+
} AC_STRUCT;
|
27
|
+
|
28
|
+
/* Allocates a new keyword tree. */
extern AC_STRUCT *ac_alloc(void);

/*
 * Adds the pattern P of length M under the identifier id.
 * Callers in the Ruby binding treat a return value of 0 as failure.
 */
extern int ac_add_string(AC_STRUCT *node, char *P, int M, int id);

// int ac_del_string(AC_STRUCT *node, char *P, int M, int id);

/* Prepares the tree for searching; the binding calls it before any search. */
extern int ac_prep(AC_STRUCT *node);

/* Starts a search over the N bytes of the text T. */
extern void ac_search_init(AC_STRUCT *node, char *T, int N);

/*
 * Returns a pointer to the next match, filling in its length, pattern id and
 * end position; the binding stops iterating when NULL is returned.
 */
extern char *ac_search(AC_STRUCT *node, int *length_out, int *id_out, int *ends_at);

/* Releases a tree obtained from ac_alloc. */
extern void ac_free(AC_STRUCT *node);
|
35
|
+
|
36
|
+
#endif
|
data/ext/ruby-ahocorasick.c
ADDED
@@ -0,0 +1,329 @@
|
|
1
|
+
|
2
|
+
//
|
3
|
+
// (c) 2008, Aurelian Oancea < aurelian at locknet . ro >
|
4
|
+
//
|
5
|
+
// Released under MIT-LICENSE
|
6
|
+
//
|
7
|
+
|
8
|
+
//
|
9
|
+
// TODO: new methods?
|
10
|
+
// * kwt[id] = word
|
11
|
+
// * kwt.from_file (class instance method)
|
12
|
+
//
|
13
|
+
|
14
|
+
#include <ruby.h>
|
15
|
+
#include "ac.h"
|
16
|
+
|
17
|
+
/* Hash keys used for every search result; assigned in Init_ahocorasick. */
static VALUE sym_id;
static VALUE sym_value;
static VALUE sym_ends_at;
static VALUE sym_starts_at;

/* Handles of the AhoCorasick module and its KeywordTree class. */
VALUE rb_mAhoCorasick;
VALUE rb_cKeywordTree;
|
21
|
+
|
22
|
+
/*
 * Fetches the kwt_struct_data wrapped by the Ruby object `obj` into the
 * pointer variable `kwt_data`.
 *
 * The body is wrapped in do { ... } while (0) so that `KeywordTree(a, b);`
 * expands to exactly one statement and stays correct inside an unbraced
 * if/else.
 */
#define KeywordTree(obj, kwt_data) do {\
  Data_Get_Struct(obj, struct kwt_struct_data, kwt_data);\
} while (0)
|
25
|
+
|
26
|
+
struct kwt_struct_data {
|
27
|
+
AC_STRUCT * tree;
|
28
|
+
int last_id;
|
29
|
+
int dictionary_size;
|
30
|
+
int is_frozen;
|
31
|
+
};
|
32
|
+
|
33
|
+
/*
|
34
|
+
* call-seq: initialize
|
35
|
+
*
|
36
|
+
* Creates a new KeywordTree
|
37
|
+
*
|
38
|
+
* require 'ahocorasick'
|
39
|
+
* kwt = Ahocorasick::KeywordTree.new
|
40
|
+
*
|
41
|
+
*/
|
42
|
+
static VALUE
|
43
|
+
rb_kwt_init(VALUE self)
|
44
|
+
{
|
45
|
+
AC_STRUCT * tree;
|
46
|
+
struct kwt_struct_data *kwt_data;
|
47
|
+
|
48
|
+
kwt_data = ALLOC(struct kwt_struct_data);
|
49
|
+
tree = ac_alloc();
|
50
|
+
DATA_PTR(self) = kwt_data;
|
51
|
+
kwt_data->tree = tree;
|
52
|
+
kwt_data->last_id = 1;
|
53
|
+
kwt_data->dictionary_size = 0;
|
54
|
+
kwt_data->is_frozen = 0;
|
55
|
+
return self;
|
56
|
+
}
|
57
|
+
|
58
|
+
/*
|
59
|
+
* Document-method: make
|
60
|
+
* call-seq: make
|
61
|
+
*
|
62
|
+
* It freezes the current KeywordTree. After this point, the tree will not accept any new entries.
|
63
|
+
*
|
64
|
+
* ==== Note: This method is called internally by search
|
65
|
+
*
|
66
|
+
* require 'ahocorasick'
|
67
|
+
*
|
68
|
+
* kwt = Ahocorasick::KeywordTree.new
|
69
|
+
*
|
70
|
+
* kwt.add_string("one")
|
71
|
+
* kwt.add_string("two")
|
72
|
+
* kwt.make()
|
73
|
+
*/
|
74
|
+
static VALUE
|
75
|
+
rb_kwt_make(VALUE self)
|
76
|
+
{
|
77
|
+
struct kwt_struct_data *kwt_data;
|
78
|
+
KeywordTree(self, kwt_data);
|
79
|
+
|
80
|
+
ac_prep( kwt_data->tree );
|
81
|
+
kwt_data->is_frozen = 1;
|
82
|
+
return self;
|
83
|
+
}
|
84
|
+
|
85
|
+
/*
|
86
|
+
* Document-method: search
|
87
|
+
* call-seq: search
|
88
|
+
*
|
89
|
+
* Search the current tree.
|
90
|
+
*
|
91
|
+
* It returns an array on hashes, e.g.
|
92
|
+
*
|
93
|
+
* [ { :id => int, :value => int, :starts_at => int, :ends_at => int}, { ... } ]
|
94
|
+
*
|
95
|
+
* Returns an empty array when the search didn't return any result.
|
96
|
+
*
|
97
|
+
* # assuming a valid KeywordTree kwt object:
|
98
|
+
* kwt.add_string("one")
|
99
|
+
* kwt.add_string("two")
|
100
|
+
*
|
101
|
+
* kwt.search( "moved two times already" ).each do | result |
|
102
|
+
* result[:id] # => 2
|
103
|
+
* result[:ends_at] # => 9
|
104
|
+
* result[:starts_at] # => 6
|
105
|
+
* result[:value] # => two
|
106
|
+
* end # => 1
|
107
|
+
*
|
108
|
+
*/
|
109
|
+
static VALUE
|
110
|
+
rb_kwt_search(int argc, VALUE *argv, VALUE self)
|
111
|
+
{
|
112
|
+
char * result; // itermediate result
|
113
|
+
char * remain; // returned by ac_search, the remaing text to search
|
114
|
+
int lgt, id, ends_at; // filled in by ac_search, the id, length and ends_at position
|
115
|
+
int starts_at;
|
116
|
+
VALUE v_result; // one result, as hash
|
117
|
+
VALUE v_results; // all the results, an array
|
118
|
+
VALUE v_search; // search string, function argument
|
119
|
+
struct kwt_struct_data *kwt_data;
|
120
|
+
|
121
|
+
// one mandatory argument.
|
122
|
+
rb_scan_args(argc, argv, "1", &v_search);
|
123
|
+
// it should be string.
|
124
|
+
Check_Type(v_search, T_STRING);
|
125
|
+
// get the structure
|
126
|
+
KeywordTree(self, kwt_data);
|
127
|
+
// freeze the tree, if not already
|
128
|
+
if(kwt_data->is_frozen == 0) {
|
129
|
+
ac_prep( kwt_data->tree );
|
130
|
+
kwt_data->is_frozen = 1;
|
131
|
+
}
|
132
|
+
// prepare the return value
|
133
|
+
// v_results= rb_block_given_p()? Qnil : rb_ary_new();
|
134
|
+
v_results= rb_ary_new();
|
135
|
+
// fail quickly and return the empty array
|
136
|
+
if(kwt_data->dictionary_size == 0)
|
137
|
+
return v_results;
|
138
|
+
// prepare the search
|
139
|
+
ac_search_init(kwt_data->tree, RSTRING( v_search )->ptr, RSTRING( v_search )->len);
|
140
|
+
// loop trought the results
|
141
|
+
while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
|
142
|
+
// this is an individual result as a hash
|
143
|
+
v_result= rb_hash_new();
|
144
|
+
rb_hash_aset( v_result, sym_id, INT2FIX(id) );
|
145
|
+
rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
|
146
|
+
rb_hash_aset( v_result, sym_ends_at, INT2FIX( ends_at - 1 ) );
|
147
|
+
result = (char*) malloc (sizeof(char)*lgt);
|
148
|
+
sprintf( result, "%.*s", lgt, remain);
|
149
|
+
rb_hash_aset( v_result, sym_value, rb_str_new(result, lgt) );
|
150
|
+
|
151
|
+
// yield this hash or, add it to the results
|
152
|
+
// if(rb_block_given_p())
|
153
|
+
// rb_yield(v_result);
|
154
|
+
// else
|
155
|
+
rb_ary_push( v_results, v_result );
|
156
|
+
free(result);
|
157
|
+
}
|
158
|
+
|
159
|
+
// TODO: maybe the Tree can be re-opened to add new items to dictionary
|
160
|
+
|
161
|
+
// return the results or nil if none
|
162
|
+
// if( v_results != Qnil && RARRAY(v_results)->len > 0 ) {
|
163
|
+
return v_results;
|
164
|
+
// } else {
|
165
|
+
// return Qnil;
|
166
|
+
// }
|
167
|
+
}
|
168
|
+
|
169
|
+
|
170
|
+
/*
|
171
|
+
* Document-method: size
|
172
|
+
* call-seq: size
|
173
|
+
*
|
174
|
+
* Returns the size of this KeywordTree
|
175
|
+
*
|
176
|
+
* kwt.add_string("foo")
|
177
|
+
* kwt.add_string("bar")
|
178
|
+
* kwt.size #=> 2
|
179
|
+
*
|
180
|
+
*/
|
181
|
+
static VALUE
|
182
|
+
rb_kwt_size(VALUE self)
|
183
|
+
{
|
184
|
+
struct kwt_struct_data *kwt_data;
|
185
|
+
KeywordTree(self, kwt_data);
|
186
|
+
|
187
|
+
return INT2FIX(kwt_data->dictionary_size);
|
188
|
+
}
|
189
|
+
|
190
|
+
|
191
|
+
/*
|
192
|
+
* Document-method: add_string
|
193
|
+
* call-seq: add_string
|
194
|
+
*
|
195
|
+
* Adds a sequence to this KeywordTree.
|
196
|
+
*
|
197
|
+
* kwt.add_string("foo1$21^ 98N3 ba>Z")
|
198
|
+
* kwt << "bar" # using the alias
|
199
|
+
*
|
200
|
+
* ==== Note: you can also specify the id, a number between 1 and k
|
201
|
+
*
|
202
|
+
* kwt.add_string "bar", 123
|
203
|
+
*
|
204
|
+
* This id should be unique in the context of the current tree.
|
205
|
+
*
|
206
|
+
*/
|
207
|
+
static VALUE
|
208
|
+
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
|
209
|
+
{
|
210
|
+
VALUE v_string, v_id;
|
211
|
+
struct kwt_struct_data *kwt_data;
|
212
|
+
char * string;
|
213
|
+
int id;
|
214
|
+
|
215
|
+
rb_scan_args(argc, argv, "11", &v_string, &v_id);
|
216
|
+
|
217
|
+
Check_Type(v_string, T_STRING);
|
218
|
+
string= RSTRING(v_string)->ptr;
|
219
|
+
|
220
|
+
KeywordTree(self, kwt_data);
|
221
|
+
|
222
|
+
if(kwt_data->is_frozen == 1)
|
223
|
+
rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", string);
|
224
|
+
|
225
|
+
if(v_id == Qnil) {
|
226
|
+
id = kwt_data->last_id;
|
227
|
+
} else if(TYPE(v_id) != T_FIXNUM) {
|
228
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", RSTRING(v_id)->ptr);
|
229
|
+
} else if(NUM2INT(v_id) <= 0) {
|
230
|
+
rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
|
231
|
+
} else {
|
232
|
+
id= NUM2INT(v_id);
|
233
|
+
}
|
234
|
+
|
235
|
+
if( ac_add_string(kwt_data->tree, string, strlen(string), id) == 0 ) {
|
236
|
+
rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);
|
237
|
+
}
|
238
|
+
|
239
|
+
kwt_data->last_id= id + 1;
|
240
|
+
kwt_data->dictionary_size++;
|
241
|
+
return self;
|
242
|
+
}
|
243
|
+
|
244
|
+
/*
|
245
|
+
* call-seq: from_file
|
246
|
+
*
|
247
|
+
* Creates a new KeywordTree and loads the dictionary from a file
|
248
|
+
*
|
249
|
+
* % cat dict0.txt
|
250
|
+
* foo
|
251
|
+
* bar
|
252
|
+
* base
|
253
|
+
*
|
254
|
+
* k= AhoCorasick::KeywordTree.from_file "dict0.txt"
|
255
|
+
* k.search("basement").size # => 1
|
256
|
+
*
|
257
|
+
*/
|
258
|
+
static VALUE
|
259
|
+
rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
|
260
|
+
{
|
261
|
+
|
262
|
+
// TODO:
|
263
|
+
// * use rb_kwt_add_string
|
264
|
+
// * use rb_io* to handle the file
|
265
|
+
|
266
|
+
struct kwt_struct_data *kwt_data;
|
267
|
+
char word[1024];
|
268
|
+
int id;
|
269
|
+
VALUE self;
|
270
|
+
VALUE f_string;
|
271
|
+
FILE *dictionary;
|
272
|
+
|
273
|
+
rb_scan_args(argc, argv, "10", &f_string);
|
274
|
+
|
275
|
+
id = 0;
|
276
|
+
SafeStringValue( f_string );
|
277
|
+
self= rb_class_new_instance( 0, NULL, klass );
|
278
|
+
KeywordTree( self, kwt_data );
|
279
|
+
|
280
|
+
dictionary = fopen( RSTRING( f_string )->ptr, "r" );
|
281
|
+
if(dictionary == NULL) {
|
282
|
+
rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);
|
283
|
+
}
|
284
|
+
|
285
|
+
while(fgets(word, 1024, dictionary) != NULL) {
|
286
|
+
ac_add_string(kwt_data->tree, word, strlen(word)-1, id++);
|
287
|
+
kwt_data->dictionary_size++;
|
288
|
+
}
|
289
|
+
kwt_data->last_id= id+1;
|
290
|
+
fclose(dictionary);
|
291
|
+
return self;
|
292
|
+
}
|
293
|
+
|
294
|
+
static void
|
295
|
+
rb_kwt_struct_free(struct kwt_struct_data * kwt_data)
|
296
|
+
{
|
297
|
+
ac_free(kwt_data->tree);
|
298
|
+
}
|
299
|
+
|
300
|
+
static VALUE
|
301
|
+
rb_kwt_struct_alloc(VALUE klass)
|
302
|
+
{
|
303
|
+
return Data_Wrap_Struct(klass, 0, rb_kwt_struct_free, 0);
|
304
|
+
}
|
305
|
+
|
306
|
+
/*
|
307
|
+
* Blump.
|
308
|
+
*/
|
309
|
+
void Init_ahocorasick() {
|
310
|
+
rb_mAhoCorasick = rb_define_module("AhoCorasick");
|
311
|
+
rb_cKeywordTree = rb_define_class_under(rb_mAhoCorasick, "KeywordTree", rb_cObject);
|
312
|
+
|
313
|
+
rb_define_alloc_func(rb_cKeywordTree, rb_kwt_struct_alloc);
|
314
|
+
|
315
|
+
rb_define_method(rb_cKeywordTree, "initialize", rb_kwt_init, 0);
|
316
|
+
rb_define_method(rb_cKeywordTree, "size", rb_kwt_size, 0);
|
317
|
+
rb_define_method(rb_cKeywordTree, "make", rb_kwt_make, 0);
|
318
|
+
rb_define_method(rb_cKeywordTree, "add_string", rb_kwt_add_string, -1);
|
319
|
+
rb_define_method(rb_cKeywordTree, "search", rb_kwt_search, -1);
|
320
|
+
rb_define_alias(rb_cKeywordTree, "<<", "add_string");
|
321
|
+
rb_define_singleton_method(rb_cKeywordTree, "from_file", rb_kwt_new_from_file, -1);
|
322
|
+
|
323
|
+
sym_id= ID2SYM(rb_intern("id"));
|
324
|
+
sym_value= ID2SYM(rb_intern("value"));
|
325
|
+
sym_ends_at= ID2SYM( rb_intern("ends_at") );
|
326
|
+
sym_starts_at= ID2SYM( rb_intern("starts_at") );
|
327
|
+
|
328
|
+
}
|
329
|
+
|
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'ext/ahocorasick'
|
2
|
+
|
3
|
+
include AhoCorasick
|
4
|
+
|
5
|
+
describe KeywordTree do
|
6
|
+
|
7
|
+
describe "How to create a new KeywordTree" do
  # plain constructor
  it "should create a new KeywordTree" do
    tree = KeywordTree.new
    tree.class.should == KeywordTree
  end

  # constructor that loads its entries from a dictionary file
  it "should create a new KeywordTree" do
    dictionary = File.dirname(__FILE__) + "/data/dict0.txt"
    tree = KeywordTree.from_file(dictionary)
    tree.class.should == KeywordTree
  end
end
|
15
|
+
|
16
|
+
describe "How to search" do

  # every example starts with a fresh, empty tree
  before(:each) do
    @kwt = KeywordTree.new
  end

  # XXX: is this useful?
  after(:each) do
    @kwt = nil
  end

  it "should return an array" do
    @kwt << "foo"
    found = @kwt.search("bar")
    found.class.should == Array
  end

  it "the array should contain hashes" do
    @kwt << "bar"
    @kwt << "foo"
    first_hit = @kwt.search("foo")[0]
    first_hit.class.should == Hash
  end

  # XXX: yielding results to a block is still under discussion; not supported yet
  # it "should return nil if block_given?" do
  #   @kwt.search("foo"){|r| r[:id]}.should == nil
  # end

  it "should return empty array if no results" do
    found = @kwt.search("baba")
    found.should == []
  end

  it "each hash should have the required symbols values" do
    @kwt << "bar"
    @kwt << "foo"
    @kwt.search("foo").each do | hit |
      hit[:id].class.should == Fixnum
      hit[:starts_at].class.should == Fixnum
      hit[:ends_at].class.should == Fixnum
      hit[:value].should == "foo"
    end
  end

  it "should match position" do
    # "data" sits at offsets 0..3 of the text, so the match ends at 4
    @kwt << "data"
    text = "data moved"
    @kwt.search(text).each do | hit |
      hit[:starts_at].should == 0
      hit[:ends_at].should == 4
    end
  end

  it "should match position with unicode" do
    # the expected span is 10 for a 9-letter word: offsets are presumably
    # counted in bytes, with the non-ASCII letter taking two
    @kwt << "bucurești"
    text = "data moved to bucurești"
    @kwt.search(text).each do | hit |
      hit[:starts_at].should == 14
      hit[:ends_at].should == 24
    end
  end

  it "more unicode" do
    @kwt << "expected"
    text = "moved to bucurești as expected"
    @kwt.search(text).each do | hit |
      hit[:starts_at].should == 23
      hit[:ends_at].should == text.size
      (hit[:ends_at]-hit[:starts_at]).should == hit[:value].size
    end
  end

  it "checks for result length" do
    @kwt << "foo"
    hit = @kwt.search("foo").first
    first = hit[:starts_at]
    last = hit[:ends_at]
    # ends_at points one past the last matched character
    (last-first).should == hit[:value].size
    "foo"[last].should == nil
    "foo"[last-1].chr.should == "o"
  end

end
|
100
|
+
|
101
|
+
describe "Context Match vs. Exact Word Match" do

  before(:each) do
    # the dictionary holds: data, base, database
    dictionary = File.dirname(__FILE__) + "/data/dict0.txt"
    @kwt = KeywordTree.from_file dictionary
  end

  it "should match on context" do
    hits = @kwt.search("I've moved the data to a new database")
    hits.size.should == 4
  end

end
|
113
|
+
|
114
|
+
describe "How to add strings" do
  it "should add 2 strings" do
    tree = KeywordTree.new
    tree.add_string "foo"
    tree << "bar"
    tree.size.should == 2
  end

  it "should add 2 strings with id" do
    tree = KeywordTree.new
    tree.add_string "foo", 1
    tree.add_string "bar", 2
    tree.size.should == 2
  end

  it "should rise an error when adding same id twice" do
    tree = KeywordTree.new
    tree.add_string "foo", 1
    lambda{ tree.add_string("bar", 1) }.should raise_error(RuntimeError)
  end

  it "should raise an error when not using id's > 0" do
    tree = KeywordTree.new
    # negative, non-numeric and zero ids are all rejected
    lambda{ tree.add_string("bar", -1) }.should raise_error(RuntimeError)
    lambda{ tree.add_string("bar", "a") }.should raise_error(RuntimeError)
    lambda{ tree.add_string("bar", 0) }.should raise_error(RuntimeError)
  end

  it "should work to add a random id" do
    tree = KeywordTree.new
    tree << "baz"
    tree.add_string "foo", 1990
    tree << "bar"
    tree.size.should == 3
  end

  it "should add strings from file and manually" do
    dictionary = File.dirname(__FILE__) + "/data/dict0.txt"
    tree = KeywordTree.from_file dictionary
    tree << "foo"
    tree.size.should == File.readlines( File.dirname(__FILE__) + "/data/dict0.txt" ).size + 1
  end

  it "should raise an error when adding new strings after the tree is frozen" do
    dictionary = File.dirname(__FILE__) + "/data/dict0.txt"
    tree = KeywordTree.from_file dictionary
    tree.make
    lambda{ tree << "foo" }.should raise_error(RuntimeError)
  end

end
|
162
|
+
|
163
|
+
describe "Benchmarks. Loading from a file" do

  it "should be fast to load a bunch of english words" do
    started = Time.now
    tree = KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
    puts "\n%d words loaded in %s seconds" % [tree.size, (Time.now - started)]
    (Time.now-started).should < 0.2
  end

  it "should be fast to find" do
    started = Time.now
    tree = KeywordTree.from_file File.dirname(__FILE__) + "/data/en.words"
    loaded = Time.now
    text = File.read( File.dirname(__FILE__) + "/data/melville-moby_dick.txt" )
    hits = tree.search( text )
    puts "\n%d words re-loaded in %s seconds.\nGot %d results in %s seconds" % [tree.size, (loaded - started), hits.size, (Time.now-loaded)]
    (Time.now-loaded).should < 1.2
  end
end
|
181
|
+
|
182
|
+
|
183
|
+
end
|