whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,537 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
#include "whistlepig.h"
|
4
|
+
|
5
|
+
static VALUE m_whistlepig;
|
6
|
+
static VALUE c_index;
|
7
|
+
static VALUE c_entry;
|
8
|
+
static VALUE c_query;
|
9
|
+
static VALUE c_error;
|
10
|
+
static VALUE c_parseerror;
|
11
|
+
|
12
|
+
/*
 * File-local string duplication helper (presumably defined here because the
 * build's C dialect does not declare POSIX strdup -- confirm before removing).
 *
 * Returns a malloc()ed copy of +old+, including its terminating NUL, or NULL
 * if the allocation fails. The caller owns the returned buffer.
 */
static char* strdup(const char* old) {
  size_t len = strlen(old) + 1; // +1 for the terminating NUL
  char* copy = malloc(len);
  if(copy == NULL) return NULL; // copying into NULL would be undefined behavior
  return memcpy(copy, old, len);
}
|
17
|
+
|
18
|
+
/*
 * Free function handed to Data_Wrap_Struct for Index objects. Releases the
 * wrapped wp_index; if that reports an error, the error is printed to stderr
 * and then freed rather than raised.
 */
static void index_free(wp_index* index) {
  wp_error* e = wp_index_free(index);
  if(e == NULL) return;

  PRINT_ERROR(e, stderr);
  wp_error_free(e);
}
|
26
|
+
|
27
|
+
/*
 * Raises a Whistlepig::Error built from e->msg when +e+ is non-NULL, and does
 * nothing otherwise. The argument is evaluated exactly once. The exception
 * object is created before the wp_error is freed because the message is read
 * from the wp_error. Nothing placed after this macro runs on the error path,
 * so release any resources that need manual cleanup before invoking it.
 */
#define RAISE_IF_NECESSARY(e) do { \
  wp_error* wp_raise_err_ = (e); \
  if(wp_raise_err_ != NULL) { \
    VALUE wp_raise_exc_ = rb_exc_new2(c_error, wp_raise_err_->msg); \
    wp_error_free(wp_raise_err_); \
    rb_exc_raise(wp_raise_exc_); \
  } \
} while(0)
|
34
|
+
|
35
|
+
// Support both Ruby 1.9 and 1.8: define RSTRING_PTR only when the Ruby
// headers have not already done so. The expansion is parenthesized so it can
// be used safely inside larger expressions.
#ifndef RSTRING_PTR
#define RSTRING_PTR(v) (RSTRING(v)->ptr)
#endif
|
39
|
+
|
40
|
+
/*
|
41
|
+
* call-seq: Index.new(pathname_base)
|
42
|
+
*
|
43
|
+
* Creates or loads a new index. The on-disk representation will be multiple
|
44
|
+
* files starting * with +pathname_base+.
|
45
|
+
*
|
46
|
+
* The index may be later be explicitly closed with Index#close. It will also
|
47
|
+
* be automatically closed when Ruby exits.
|
48
|
+
*
|
49
|
+
*/
|
50
|
+
|
51
|
+
static VALUE index_new(VALUE class, VALUE v_pathname_base) {
|
52
|
+
Check_Type(v_pathname_base, T_STRING);
|
53
|
+
|
54
|
+
wp_index* index;
|
55
|
+
wp_error* e;
|
56
|
+
char* pathname_base = RSTRING_PTR(v_pathname_base);
|
57
|
+
|
58
|
+
if(wp_index_exists(pathname_base)) e = wp_index_load(&index, strdup(pathname_base));
|
59
|
+
else e = wp_index_create(&index, strdup(pathname_base));
|
60
|
+
RAISE_IF_NECESSARY(e);
|
61
|
+
|
62
|
+
VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
|
63
|
+
VALUE argv[1] = { v_pathname_base };
|
64
|
+
rb_obj_call_init(o_index, 1, argv);
|
65
|
+
return o_index;
|
66
|
+
}
|
67
|
+
|
68
|
+
/*
|
69
|
+
* call-seq: Index.create(pathname_base)
|
70
|
+
*
|
71
|
+
* Creates a new index, raising an error if it already exists. The on-disk
|
72
|
+
* representation will be multiple files starting with
|
73
|
+
* +pathname_base+.
|
74
|
+
*
|
75
|
+
*/
|
76
|
+
|
77
|
+
static VALUE index_create(VALUE class, VALUE v_pathname_base) {
|
78
|
+
Check_Type(v_pathname_base, T_STRING);
|
79
|
+
|
80
|
+
wp_index* index;
|
81
|
+
wp_error* e = wp_index_create(&index, strdup(RSTRING_PTR(v_pathname_base)));
|
82
|
+
//printf("# index create at %p, error is %p\n", index, e);
|
83
|
+
RAISE_IF_NECESSARY(e);
|
84
|
+
|
85
|
+
VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
|
86
|
+
VALUE argv[1] = { v_pathname_base };
|
87
|
+
rb_obj_call_init(o_index, 1, argv);
|
88
|
+
return o_index;
|
89
|
+
}
|
90
|
+
|
91
|
+
/*
|
92
|
+
* call-seq: Index.load(pathname_base)
|
93
|
+
*
|
94
|
+
* Loads a new index, raising an error if it doesn't exists. The on-disk *
|
95
|
+
* representation will be multiple files starting with
|
96
|
+
* +pathname_base+.
|
97
|
+
*
|
98
|
+
*/
|
99
|
+
|
100
|
+
static VALUE index_load(VALUE class, VALUE v_pathname_base) {
|
101
|
+
Check_Type(v_pathname_base, T_STRING);
|
102
|
+
|
103
|
+
wp_index* index;
|
104
|
+
wp_error* e = wp_index_load(&index, strdup(RSTRING_PTR(v_pathname_base)));
|
105
|
+
//printf("# index load at %p, error is %p\n", index, e);
|
106
|
+
RAISE_IF_NECESSARY(e);
|
107
|
+
|
108
|
+
VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
|
109
|
+
VALUE argv[1] = { v_pathname_base };
|
110
|
+
rb_obj_call_init(o_index, 1, argv);
|
111
|
+
return o_index;
|
112
|
+
}
|
113
|
+
|
114
|
+
/*
|
115
|
+
* call-seq: Index.exists?(pathname_base)
|
116
|
+
*
|
117
|
+
* Returns true iff an index with base pathname of +pathname_base+
|
118
|
+
* exists on disk.
|
119
|
+
*
|
120
|
+
*/
|
121
|
+
static VALUE index_exists(VALUE class, VALUE v_pathname_base) {
|
122
|
+
Check_Type(v_pathname_base, T_STRING);
|
123
|
+
|
124
|
+
if(wp_index_exists(RSTRING_PTR(v_pathname_base))) return Qtrue;
|
125
|
+
else return Qfalse;
|
126
|
+
}
|
127
|
+
|
128
|
+
/*
|
129
|
+
* call-seq: Index.delete!(pathname_base)
|
130
|
+
*
|
131
|
+
* Deletes the index with base pathname +pathname_base+ from disk.
|
132
|
+
* Does nothing if the index does not exist. If that index is currently loaded
|
133
|
+
* in memory, expect may to see segfaults when you try to access it.
|
134
|
+
*
|
135
|
+
*/
|
136
|
+
static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
|
137
|
+
Check_Type(v_pathname_base, T_STRING);
|
138
|
+
|
139
|
+
wp_error* e = wp_index_delete(RSTRING_PTR(v_pathname_base));
|
140
|
+
RAISE_IF_NECESSARY(e);
|
141
|
+
|
142
|
+
return v_pathname_base;
|
143
|
+
}
|
144
|
+
|
145
|
+
/*
|
146
|
+
* Returns the number of entries in the index.
|
147
|
+
*
|
148
|
+
*/
|
149
|
+
static VALUE index_size(VALUE self) {
|
150
|
+
wp_index* index;
|
151
|
+
Data_Get_Struct(self, wp_index, index);
|
152
|
+
return INT2NUM(wp_index_num_docs(index));
|
153
|
+
}
|
154
|
+
|
155
|
+
/*
 * Index#initialize: stores +v_pathname_base+ in the @pathname_base instance
 * variable and returns self.
 */
static VALUE index_init(VALUE self, VALUE v_pathname_base) {
  rb_ivar_set(self, rb_intern("@pathname_base"), v_pathname_base);
  return self;
}
|
159
|
+
|
160
|
+
/*
|
161
|
+
* call-seq: count(query)
|
162
|
+
*
|
163
|
+
* Returns the number of entries matched by +query+, which should be a Query object.
|
164
|
+
* Note that in the current implementation, this is almost as expensive as retrieving all the
|
165
|
+
* results directly.
|
166
|
+
*
|
167
|
+
*/
|
168
|
+
static VALUE index_count(VALUE self, VALUE v_query) {
|
169
|
+
if(CLASS_OF(v_query) != c_query) {
|
170
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
171
|
+
// not reached
|
172
|
+
}
|
173
|
+
|
174
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
175
|
+
wp_query* query; Data_Get_Struct(v_query, wp_query, query);
|
176
|
+
uint32_t num_results;
|
177
|
+
// clone the query because we don't want to interrupt any search state
|
178
|
+
// which may otherwise be being used for pagination.
|
179
|
+
wp_error* e = wp_index_count_results(index, wp_query_clone(query), &num_results);
|
180
|
+
RAISE_IF_NECESSARY(e);
|
181
|
+
|
182
|
+
return INT2NUM(num_results);
|
183
|
+
}
|
184
|
+
|
185
|
+
/*
|
186
|
+
* Closes the index, flushing all changes to disk. Future calls to this index
|
187
|
+
* may result in a segfault.
|
188
|
+
*
|
189
|
+
*/
|
190
|
+
static VALUE index_close(VALUE self) {
|
191
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
192
|
+
wp_error* e = wp_index_unload(index);
|
193
|
+
RAISE_IF_NECESSARY(e);
|
194
|
+
|
195
|
+
return Qnil;
|
196
|
+
}
|
197
|
+
|
198
|
+
/*
 * Free function handed to Data_Wrap_Struct for Entry objects. Releases the
 * wrapped wp_entry; if that reports an error, the error is printed to stderr
 * and then freed rather than raised.
 */
static void entry_free(wp_entry* entry) {
  wp_error* e = wp_entry_free(entry);
  if(e == NULL) return;

  PRINT_ERROR(e, stderr);
  wp_error_free(e);
}
|
206
|
+
|
207
|
+
/* Creates a new, empty entry. */
|
208
|
+
static VALUE entry_new(VALUE class) {
|
209
|
+
wp_entry* entry = wp_entry_new();
|
210
|
+
|
211
|
+
//printf("# entry create at %p\n", entry);
|
212
|
+
VALUE o_entry = Data_Wrap_Struct(class, NULL, entry_free, entry);
|
213
|
+
rb_obj_call_init(o_entry, 0, NULL);
|
214
|
+
return o_entry;
|
215
|
+
}
|
216
|
+
|
217
|
+
/*
|
218
|
+
* call-seq: add_token(field, token)
|
219
|
+
*
|
220
|
+
* Adds a single token +token+ with field +field</field> to an entry. Both
|
221
|
+
* +token+ and +field</field> must be strings.
|
222
|
+
*
|
223
|
+
* Returns itself.
|
224
|
+
*/
|
225
|
+
static VALUE entry_add_token(VALUE self, VALUE field, VALUE term) {
|
226
|
+
Check_Type(field, T_STRING);
|
227
|
+
Check_Type(term, T_STRING);
|
228
|
+
|
229
|
+
wp_entry* entry; Data_Get_Struct(self, wp_entry, entry);
|
230
|
+
wp_error* e = wp_entry_add_token(entry, RSTRING_PTR(field), RSTRING_PTR(term));
|
231
|
+
RAISE_IF_NECESSARY(e);
|
232
|
+
|
233
|
+
return self;
|
234
|
+
}
|
235
|
+
|
236
|
+
/*
|
237
|
+
* call-seq: add_string(field, string)
|
238
|
+
*
|
239
|
+
* Adds a String +string+ with field +field</field> to an entry. The string
|
240
|
+
* will be tokenized on whitespace. Both +token+ and +string</field> must be
|
241
|
+
* strings.
|
242
|
+
*
|
243
|
+
* Returns itself.
|
244
|
+
*/
|
245
|
+
static VALUE entry_add_string(VALUE self, VALUE field, VALUE string) {
|
246
|
+
Check_Type(field, T_STRING);
|
247
|
+
Check_Type(string, T_STRING);
|
248
|
+
|
249
|
+
wp_entry* entry; Data_Get_Struct(self, wp_entry, entry);
|
250
|
+
wp_error* e = wp_entry_add_string(entry, RSTRING_PTR(field), RSTRING_PTR(string));
|
251
|
+
RAISE_IF_NECESSARY(e);
|
252
|
+
|
253
|
+
return self;
|
254
|
+
}
|
255
|
+
|
256
|
+
/*
|
257
|
+
* Returns the number of tokens in the entry.
|
258
|
+
*/
|
259
|
+
static VALUE entry_size(VALUE self) {
|
260
|
+
wp_entry* entry; Data_Get_Struct(self, wp_entry, entry);
|
261
|
+
return INT2NUM(wp_entry_size(entry));
|
262
|
+
}
|
263
|
+
|
264
|
+
/*
|
265
|
+
* call-seq: add_entry(entry)
|
266
|
+
*
|
267
|
+
* Adds the entry +entry+ to the index. Returns the document id
|
268
|
+
* corresponding to this entry.
|
269
|
+
*/
|
270
|
+
static VALUE index_add_entry(VALUE self, VALUE v_entry) {
|
271
|
+
if(CLASS_OF(v_entry) != c_entry) {
|
272
|
+
rb_raise(rb_eTypeError, "entry must be a Whistlepig::Entry object"); // would be nice to support subclasses somehow...
|
273
|
+
// not reached
|
274
|
+
}
|
275
|
+
|
276
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
277
|
+
wp_entry* entry; Data_Get_Struct(v_entry, wp_entry, entry);
|
278
|
+
uint64_t doc_id;
|
279
|
+
wp_error* e = wp_index_add_entry(index, entry, &doc_id);
|
280
|
+
RAISE_IF_NECESSARY(e);
|
281
|
+
|
282
|
+
return INT2NUM(doc_id);
|
283
|
+
}
|
284
|
+
|
285
|
+
/*
|
286
|
+
* call-seq: add_label(doc_id, label)
|
287
|
+
*
|
288
|
+
* Adds the label +label+ to the document corresponding to doc id
|
289
|
+
* +doc_id+ in the index. +label+ must be a String.
|
290
|
+
* If the label has already been added to the document, does nothing.
|
291
|
+
*/
|
292
|
+
static VALUE index_add_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
|
293
|
+
Check_Type(v_doc_id, T_FIXNUM);
|
294
|
+
Check_Type(v_label, T_STRING);
|
295
|
+
|
296
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
297
|
+
wp_error* e = wp_index_add_label(index, RSTRING_PTR(v_label), NUM2INT(v_doc_id));
|
298
|
+
RAISE_IF_NECESSARY(e);
|
299
|
+
|
300
|
+
return v_label;
|
301
|
+
}
|
302
|
+
|
303
|
+
/*
|
304
|
+
* call-seq: remove_label(doc_id, label)
|
305
|
+
*
|
306
|
+
* Removes the label +label+ from the document corresponding to doc id
|
307
|
+
* +doc_id+ in the index. +label+ must be a String.
|
308
|
+
* If the label has not been added to the document, does nothing.
|
309
|
+
*/
|
310
|
+
static VALUE index_remove_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
|
311
|
+
Check_Type(v_doc_id, T_FIXNUM);
|
312
|
+
Check_Type(v_label, T_STRING);
|
313
|
+
|
314
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
315
|
+
wp_error* e = wp_index_remove_label(index, RSTRING_PTR(v_label), NUM2INT(v_doc_id));
|
316
|
+
RAISE_IF_NECESSARY(e);
|
317
|
+
|
318
|
+
return v_label;
|
319
|
+
}
|
320
|
+
|
321
|
+
/*
|
322
|
+
* call-seq: Query.new(default_field, query_string)
|
323
|
+
*
|
324
|
+
* Creates a new query by parsing the string +query_string+, which must be a
|
325
|
+
* String. Any non-fielded terms will used the field +default_field+, which
|
326
|
+
* must also be a String. Raises a ParseError if the query cannot be parsed.
|
327
|
+
*
|
328
|
+
*/
|
329
|
+
static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
|
330
|
+
Check_Type(default_field, T_STRING);
|
331
|
+
Check_Type(string, T_STRING);
|
332
|
+
|
333
|
+
wp_query* query;
|
334
|
+
wp_error* e = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &query);
|
335
|
+
if(e != NULL) {
|
336
|
+
VALUE exc = rb_exc_new2(c_parseerror, e->msg);
|
337
|
+
wp_error_free(e);
|
338
|
+
rb_exc_raise(exc);
|
339
|
+
}
|
340
|
+
|
341
|
+
VALUE o_query = Data_Wrap_Struct(class, NULL, wp_query_free, query);
|
342
|
+
VALUE argv[2] = { string, default_field };
|
343
|
+
rb_obj_call_init(o_query, 2, argv);
|
344
|
+
|
345
|
+
return o_query;
|
346
|
+
}
|
347
|
+
|
348
|
+
/*
|
349
|
+
* Returns a parsed representation of a String, useful for debugging.
|
350
|
+
*/
|
351
|
+
static VALUE query_to_s(VALUE self) {
|
352
|
+
char buf[1024];
|
353
|
+
|
354
|
+
wp_query* query; Data_Get_Struct(self, wp_query, query);
|
355
|
+
wp_query_to_s(query, 1024, buf);
|
356
|
+
|
357
|
+
return rb_str_new2(buf);
|
358
|
+
}
|
359
|
+
|
360
|
+
/*
|
361
|
+
* call-seq: and(other)
|
362
|
+
*
|
363
|
+
* Returns a new Query that is a conjunction of this query and +other+, which
|
364
|
+
* must also be a Query object.
|
365
|
+
*
|
366
|
+
*/
|
367
|
+
static VALUE query_and(VALUE self, VALUE v_other) {
|
368
|
+
if(CLASS_OF(v_other) != c_query) {
|
369
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
370
|
+
// not reached
|
371
|
+
}
|
372
|
+
|
373
|
+
wp_query* query; Data_Get_Struct(self, wp_query, query);
|
374
|
+
wp_query* other; Data_Get_Struct(v_other, wp_query, other);
|
375
|
+
|
376
|
+
wp_query* result = wp_query_new_conjunction();
|
377
|
+
result = wp_query_add(result, wp_query_clone(query));
|
378
|
+
result = wp_query_add(result, wp_query_clone(other));
|
379
|
+
|
380
|
+
VALUE o_result = Data_Wrap_Struct(c_query, NULL, wp_query_free, result);
|
381
|
+
VALUE argv[2] = { Qnil, Qnil }; // i guess
|
382
|
+
rb_obj_call_init(o_result, 2, argv);
|
383
|
+
|
384
|
+
return o_result;
|
385
|
+
}
|
386
|
+
|
387
|
+
/*
|
388
|
+
* call-seq: or(other)
|
389
|
+
*
|
390
|
+
* Returns a new Query that is a disjunction of this query and +other+, which
|
391
|
+
* must also be a Query object.
|
392
|
+
*
|
393
|
+
*/
|
394
|
+
static VALUE query_or(VALUE self, VALUE v_other) {
|
395
|
+
if(CLASS_OF(v_other) != c_query) {
|
396
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
397
|
+
// not reached
|
398
|
+
}
|
399
|
+
|
400
|
+
wp_query* query; Data_Get_Struct(self, wp_query, query);
|
401
|
+
wp_query* other; Data_Get_Struct(v_other, wp_query, other);
|
402
|
+
|
403
|
+
wp_query* result = wp_query_new_disjunction();
|
404
|
+
result = wp_query_add(result, wp_query_clone(query));
|
405
|
+
result = wp_query_add(result, wp_query_clone(other));
|
406
|
+
|
407
|
+
VALUE o_result = Data_Wrap_Struct(c_query, NULL, wp_query_free, result);
|
408
|
+
VALUE argv[2] = { Qnil, Qnil }; // i guess
|
409
|
+
rb_obj_call_init(o_result, 2, argv);
|
410
|
+
|
411
|
+
return o_result;
|
412
|
+
}
|
413
|
+
|
414
|
+
/*
 * Query#initialize: stores the first argument in @query and returns self.
 *
 * The method is registered with rb_define_method as taking two arguments,
 * so the C function must declare both; calling a function through a pointer
 * with a different parameter count is undefined behavior. The second
 * argument is accepted but not stored.
 */
static VALUE query_init(VALUE self, VALUE query, VALUE default_field) {
  (void)default_field; // intentionally unused
  rb_iv_set(self, "@query", query);
  return self;
}
|
418
|
+
|
419
|
+
/*
|
420
|
+
* call-seq: setup_query(query)
|
421
|
+
*
|
422
|
+
* Initializes query for use with run_query. If you do not call teardown_query
|
423
|
+
* on this query later, you will leak memory.
|
424
|
+
*/
|
425
|
+
static VALUE index_setup_query(VALUE self, VALUE v_query) {
|
426
|
+
if(CLASS_OF(v_query) != c_query) {
|
427
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
428
|
+
// not reached
|
429
|
+
}
|
430
|
+
|
431
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
432
|
+
wp_query* query; Data_Get_Struct(v_query, wp_query, query);
|
433
|
+
wp_error* e = wp_index_setup_query(index, query);
|
434
|
+
RAISE_IF_NECESSARY(e);
|
435
|
+
|
436
|
+
return self;
|
437
|
+
}
|
438
|
+
|
439
|
+
/*
|
440
|
+
* call-seq: teardown_query(query)
|
441
|
+
*
|
442
|
+
* Releases any held state used by the query, if it has been first passed to
|
443
|
+
* setup_query. If you call run_query on this query after calling this
|
444
|
+
* function, terrible things will happen.
|
445
|
+
*/
|
446
|
+
static VALUE index_teardown_query(VALUE self, VALUE v_query) {
|
447
|
+
if(CLASS_OF(v_query) != c_query) {
|
448
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
449
|
+
// not reached
|
450
|
+
}
|
451
|
+
|
452
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
453
|
+
wp_query* query; Data_Get_Struct(v_query, wp_query, query);
|
454
|
+
wp_error* e = wp_index_teardown_query(index, query);
|
455
|
+
RAISE_IF_NECESSARY(e);
|
456
|
+
|
457
|
+
return self;
|
458
|
+
}
|
459
|
+
|
460
|
+
/*
|
461
|
+
* call-seq: run_query(query, max_num_results)
|
462
|
+
*
|
463
|
+
* Runs a query which has been first passed to setup_query, and returns an
|
464
|
+
* array of at most +max_num_results+ doc ids. Can be called
|
465
|
+
* multiple times to retrieve successive results from the query. The query
|
466
|
+
* must have been passed to setup_query first, or terrible things will happen.
|
467
|
+
* The query must be passed to teardown_query when done, or memory leaks will
|
468
|
+
* occur.
|
469
|
+
*
|
470
|
+
*/
|
471
|
+
static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results) {
|
472
|
+
Check_Type(v_max_num_results, T_FIXNUM);
|
473
|
+
if(CLASS_OF(v_query) != c_query) {
|
474
|
+
rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
|
475
|
+
// not reached
|
476
|
+
}
|
477
|
+
|
478
|
+
wp_index* index; Data_Get_Struct(self, wp_index, index);
|
479
|
+
wp_query* query; Data_Get_Struct(v_query, wp_query, query);
|
480
|
+
|
481
|
+
uint32_t max_num_results = NUM2INT(v_max_num_results);
|
482
|
+
uint32_t num_results;
|
483
|
+
uint64_t* results = malloc(sizeof(uint64_t) * max_num_results);
|
484
|
+
|
485
|
+
wp_error* e = wp_index_run_query(index, query, max_num_results, &num_results, results);
|
486
|
+
RAISE_IF_NECESSARY(e);
|
487
|
+
|
488
|
+
VALUE array = rb_ary_new2(num_results);
|
489
|
+
for(uint32_t i = 0; i < num_results; i++) {
|
490
|
+
rb_ary_store(array, i, INT2NUM(results[i]));
|
491
|
+
}
|
492
|
+
free(results);
|
493
|
+
|
494
|
+
return array;
|
495
|
+
}
|
496
|
+
|
497
|
+
void Init_whistlepigc() {
|
498
|
+
VALUE m_whistlepig;
|
499
|
+
|
500
|
+
m_whistlepig = rb_define_module("Whistlepig");
|
501
|
+
|
502
|
+
c_index = rb_define_class_under(m_whistlepig, "Index", rb_cObject);
|
503
|
+
rb_define_singleton_method(c_index, "new", index_new, 1);
|
504
|
+
rb_define_singleton_method(c_index, "create", index_create, 1);
|
505
|
+
rb_define_singleton_method(c_index, "load", index_load, 1);
|
506
|
+
rb_define_singleton_method(c_index, "delete!", index_delete, 1);
|
507
|
+
rb_define_singleton_method(c_index, "exists?", index_exists, 1);
|
508
|
+
rb_define_method(c_index, "initialize", index_init, 1);
|
509
|
+
rb_define_method(c_index, "close", index_close, 0);
|
510
|
+
rb_define_method(c_index, "size", index_size, 0);
|
511
|
+
rb_define_method(c_index, "add_entry", index_add_entry, 1);
|
512
|
+
rb_define_method(c_index, "add_label", index_add_label, 2);
|
513
|
+
rb_define_method(c_index, "remove_label", index_remove_label, 2);
|
514
|
+
rb_define_method(c_index, "count", index_count, 1);
|
515
|
+
rb_define_method(c_index, "setup_query", index_setup_query, 1);
|
516
|
+
rb_define_method(c_index, "run_query", index_run_query, 2);
|
517
|
+
rb_define_method(c_index, "teardown_query", index_teardown_query, 1);
|
518
|
+
rb_define_attr(c_index, "pathname_base", 1, 0);
|
519
|
+
|
520
|
+
c_entry = rb_define_class_under(m_whistlepig, "Entry", rb_cObject);
|
521
|
+
rb_define_singleton_method(c_entry, "new", entry_new, 0);
|
522
|
+
rb_define_method(c_entry, "size", entry_size, 0);
|
523
|
+
rb_define_method(c_entry, "add_token", entry_add_token, 2);
|
524
|
+
rb_define_method(c_entry, "add_string", entry_add_string, 2);
|
525
|
+
//rb_define_method(c_entry, "add_file", entry_add_file, 2);
|
526
|
+
|
527
|
+
c_query = rb_define_class_under(m_whistlepig, "Query", rb_cObject);
|
528
|
+
rb_define_singleton_method(c_query, "new", query_new, 2);
|
529
|
+
rb_define_method(c_query, "initialize", query_init, 2);
|
530
|
+
rb_define_method(c_query, "and", query_and, 1);
|
531
|
+
rb_define_method(c_query, "or", query_or, 1);
|
532
|
+
rb_define_method(c_query, "to_s", query_to_s, 0);
|
533
|
+
rb_define_attr(c_query, "query", 1, 0);
|
534
|
+
|
535
|
+
c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
|
536
|
+
c_parseerror = rb_define_class_under(m_whistlepig, "ParseError", rb_eStandardError);
|
537
|
+
}
|
data/lib/whistlepig.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require "whistlepigc"
|
2
|
+
|
3
|
+
module Whistlepig
|
4
|
+
## A full-text index. You can add entries to it, and you can run queries
|
5
|
+
## against it.
|
6
|
+
##
|
7
|
+
## To add documents, create Entry objects and call add_entry. Entries
|
8
|
+
## represent the document before addition; add_entry will return an integer
|
9
|
+
## docid and the entry can be discarded at that point.
|
10
|
+
##
|
11
|
+
## To run queries, the simplest option is to call Index#search or
|
12
|
+
## Index#each_result_for.
|
13
|
+
##
|
14
|
+
## The more complex option is to use setup_query, run_query, and
|
15
|
+
## teardown_query, in that order. The advantage of this approach is that
|
16
|
+
## run_query can be called multiple times, and each call will return more
|
17
|
+
## results, allowing for query pagination.
|
18
|
+
class Index
  ## Runs a query and yields each matching doc id. Handles the mechanics of
  ## setting up and tearing down the query; the query is torn down even if
  ## the block raises.
  ##
  ## Results are fetched +chunk_size+ at a time, and iteration stops at the
  ## first chunk that comes back short. An Integer +chunk_size+ below 1 is
  ## rejected with ArgumentError, because a zero-sized chunk can never come
  ## back short and the loop would not terminate.
  ##
  ## Returns the index itself.
  def each_result_for query, chunk_size=10
    if chunk_size.is_a?(Integer) && chunk_size < 1
      raise ArgumentError, "chunk_size must be positive"
    end

    setup_query query
    begin
      while true
        results = run_query query, chunk_size
        results.each { |r| yield r }
        break if results.size < chunk_size
      end
    ensure
      teardown_query query
    end
    self
  end

  ## Convenience method. Runs a query and returns up to +max_results+
  ## matching doc ids, or all matching doc ids if +max_results+ is nil.
  ## Handles the mechanics of setting up and tearing down the query.
  def search query, max_results=nil
    setup_query query
    ret = []
    num_per_call = max_results || 100
    begin
      while true
        results = run_query query, num_per_call
        # concat appends in place; += would copy the accumulated array on
        # every iteration.
        ret.concat results
        # With an explicit limit a single call is enough; otherwise keep
        # going until a chunk comes back short.
        break if max_results || results.size < num_per_call
      end
    ensure
      teardown_query query
    end

    ret
  end
end
|
55
|
+
|
56
|
+
## Represents a document before it is added to the index.
|
57
|
+
##
|
58
|
+
## Entries allow you to build up a document in memory before indexing it.
|
59
|
+
## Once you've built it, pass it to Index#add_entry.
|
60
|
+
class Entry
|
61
|
+
end
|
62
|
+
|
63
|
+
## A generic error.
|
64
|
+
class Error
|
65
|
+
end
|
66
|
+
|
67
|
+
## A parser error.
|
68
|
+
class ParseError
|
69
|
+
end
|
70
|
+
|
71
|
+
## A query. Queries are created from strings with Query.new. If parsing the
|
72
|
+
## string fails, a ParseError is raised.
|
73
|
+
##
|
74
|
+
## At the lowest level, queries are composed of space-separated terms.
|
75
|
+
## Matches against that term are restricted to the default field specified at
|
76
|
+
## parse time.
|
77
|
+
##
|
78
|
+
## hello # search for "hello" in the default field
|
79
|
+
##
|
80
|
+
## Term matches can be restricted to another field by
|
81
|
+
## prefixing them with the field name and ":", e.g. "subject:hello".
|
82
|
+
##
|
83
|
+
## subject:hello # search for "hello" in the "subject" field
|
84
|
+
##
|
85
|
+
## Multiple terms are considered conjunctive (i.e. all must match) unless the
|
86
|
+
## special token "OR" appears between them. The "OR" must be capitalized
|
87
|
+
## in this case.
|
88
|
+
## word1 word2 # search for word1 and word2
|
89
|
+
## word1 OR word2 # search for word1 or word2
|
90
|
+
## subject:hello bob # "hello" in the subject field and "bob" in the
|
91
|
+
## # default field
|
92
|
+
##
|
93
|
+
## Parentheses can be used to group disjunctions, conjunctions or fields.
|
94
|
+
## (word1 OR word2) word3 # "word3" and either "word1" or "word2"
|
95
|
+
## field:(word1 OR word2) # "word1" or "word2" in field "field"
|
96
|
+
##
|
97
|
+
## Phrases are specified by surrounding the terms with double quotes.
|
98
|
+
## "bob jones" # documents with the phrase "bob jones"
|
99
|
+
##
|
100
|
+
## Negations can be specified with a - prefix.
|
101
|
+
## -word # docs without "word"
|
102
|
+
## -subject:(bob OR joe) # docs with neither "bob" nor "joe" in subject
|
103
|
+
##
|
104
|
+
## Labels are specified with a ~ prefix. Labels do not have fields.
|
105
|
+
## ~inbox # docs with the "inbox" label
|
106
|
+
## -~inbox # docs without the "inbox" label
|
107
|
+
## -~inbox subject:hello # docs with subject "hello" and without the
|
108
|
+
## # inbox label
|
109
|
+
##
|
110
|
+
## All of the above can be mixed and matched, of course.
|
111
|
+
## -subject:"spam email" ~inbox (money OR cash)
|
112
|
+
## ("love you" OR "hate you") -(~deleted OR ~spam)
|
113
|
+
## etc...
|
114
|
+
##
|
115
|
+
## Existing query objects can also be altered programmatically, at least to
|
116
|
+
## a limited extent, by calling Query#and and Query#or.
|
117
|
+
class Query
|
118
|
+
end
|
119
|
+
end
|