whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,537 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
#include "whistlepig.h"
|
4
|
+
|
5
|
+
static VALUE m_whistlepig;
|
6
|
+
static VALUE c_index;
|
7
|
+
static VALUE c_entry;
|
8
|
+
static VALUE c_query;
|
9
|
+
static VALUE c_error;
|
10
|
+
static VALUE c_parseerror;
|
11
|
+
|
12
|
+
/*
 * File-local string duplicator: returns a freshly malloc'd copy of the
 * NUL-terminated string +old+, or NULL if the allocation fails. The caller
 * owns the returned buffer.
 *
 * NOTE(review): presumably defined here because the libc prototype is not
 * visible in the build's C standard mode -- confirm before removing.
 */
static char* strdup(const char* old) {
  size_t len = strlen(old) + 1; // include the terminating NUL
  char* copy = malloc(len);
  if(copy == NULL) return NULL; // never hand NULL to memcpy

  memcpy(copy, old, len);
  return copy;
}
|
17
|
+
|
18
|
+
/*
 * Free function passed to Data_Wrap_Struct for Index objects. Releases the
 * wrapped wp_index; if wp_index_free reports an error it is printed to
 * stderr and then released.
 */
static void index_free(wp_index* index) {
  wp_error* err = wp_index_free(index);
  if(err == NULL) return;

  PRINT_ERROR(err, stderr);
  wp_error_free(err);
}
|
26
|
+
|
27
|
+
/*
 * If +e+ is a non-NULL wp_error*, builds a Whistlepig::Error from its msg,
 * frees the wp_error and raises the exception; otherwise does nothing.
 * The argument is evaluated exactly once, so it may be any expression.
 */
#define RAISE_IF_NECESSARY(e) do { \
  wp_error* raise_err_ = (e); \
  if(raise_err_ != NULL) { \
    VALUE raise_exc_ = rb_exc_new2(c_error, raise_err_->msg); \
    wp_error_free(raise_err_); \
    rb_exc_raise(raise_exc_); \
  } \
} while(0)
|
34
|
+
|
35
|
+
// support 1.9 and 1.8
|
36
|
+
/* Fallback for Ruby headers that do not provide RSTRING_PTR (1.8). The
 * expansion is parenthesized so it composes safely inside larger expressions. */
#ifndef RSTRING_PTR
#define RSTRING_PTR(v) (RSTRING(v)->ptr)
#endif
|
39
|
+
|
40
|
+
/*
|
41
|
+
* call-seq: Index.new(pathname_base)
|
42
|
+
*
|
43
|
+
* Creates or loads a new index. The on-disk representation will be multiple
|
44
|
+
* files starting with +pathname_base+.
|
45
|
+
*
|
46
|
+
* The index may later be explicitly closed with Index#close. It will also
|
47
|
+
* be automatically closed when Ruby exits.
|
48
|
+
*
|
49
|
+
*/
|
50
|
+
|
51
|
+
static VALUE index_new(VALUE class, VALUE v_pathname_base) {
  /* Index.new: load the index if it exists on disk, otherwise create it. */
  Check_Type(v_pathname_base, T_STRING);

  char* path = RSTRING_PTR(v_pathname_base);
  wp_index* idx;
  wp_error* err;

  /* Either way the library receives its own copy of the path. */
  if(wp_index_exists(path)) {
    err = wp_index_load(&idx, strdup(path));
  }
  else {
    err = wp_index_create(&idx, strdup(path));
  }
  RAISE_IF_NECESSARY(err);

  /* Wrap the handle (index_free is its free function) and run #initialize. */
  VALUE wrapped = Data_Wrap_Struct(class, NULL, index_free, idx);
  VALUE init_args[1] = { v_pathname_base };
  rb_obj_call_init(wrapped, 1, init_args);

  return wrapped;
}
|
67
|
+
|
68
|
+
/*
|
69
|
+
* call-seq: Index.create(pathname_base)
|
70
|
+
*
|
71
|
+
* Creates a new index, raising an error if it already exists. The on-disk
|
72
|
+
* representation will be multiple files starting with
|
73
|
+
* +pathname_base+.
|
74
|
+
*
|
75
|
+
*/
|
76
|
+
|
77
|
+
static VALUE index_create(VALUE class, VALUE v_pathname_base) {
  /* Index.create: always goes through wp_index_create; any error it
   * reports is raised as Whistlepig::Error. */
  Check_Type(v_pathname_base, T_STRING);

  char* path_copy = strdup(RSTRING_PTR(v_pathname_base));
  wp_index* idx;
  wp_error* err = wp_index_create(&idx, path_copy);
  RAISE_IF_NECESSARY(err);

  /* Wrap the handle (index_free is its free function) and run #initialize. */
  VALUE wrapped = Data_Wrap_Struct(class, NULL, index_free, idx);
  VALUE init_args[1] = { v_pathname_base };
  rb_obj_call_init(wrapped, 1, init_args);

  return wrapped;
}
|
90
|
+
|
91
|
+
/*
|
92
|
+
* call-seq: Index.load(pathname_base)
|
93
|
+
*
|
94
|
+
* Loads an existing index, raising an error if it doesn't exist. The on-disk
|
95
|
+
* representation will be multiple files starting with
|
96
|
+
* +pathname_base+.
|
97
|
+
*
|
98
|
+
*/
|
99
|
+
|
100
|
+
static VALUE index_load(VALUE class, VALUE v_pathname_base) {
  /* Index.load: always goes through wp_index_load; any error it reports
   * is raised as Whistlepig::Error. */
  Check_Type(v_pathname_base, T_STRING);

  char* path_copy = strdup(RSTRING_PTR(v_pathname_base));
  wp_index* idx;
  wp_error* err = wp_index_load(&idx, path_copy);
  RAISE_IF_NECESSARY(err);

  /* Wrap the handle (index_free is its free function) and run #initialize. */
  VALUE wrapped = Data_Wrap_Struct(class, NULL, index_free, idx);
  VALUE init_args[1] = { v_pathname_base };
  rb_obj_call_init(wrapped, 1, init_args);

  return wrapped;
}
|
113
|
+
|
114
|
+
/*
|
115
|
+
* call-seq: Index.exists?(pathname_base)
|
116
|
+
*
|
117
|
+
* Returns true iff an index with base pathname of +pathname_base+
|
118
|
+
* exists on disk.
|
119
|
+
*
|
120
|
+
*/
|
121
|
+
static VALUE index_exists(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  /* Map the C truth value from wp_index_exists onto Ruby's true/false. */
  return wp_index_exists(RSTRING_PTR(v_pathname_base)) ? Qtrue : Qfalse;
}
|
127
|
+
|
128
|
+
/*
|
129
|
+
* call-seq: Index.delete!(pathname_base)
|
130
|
+
*
|
131
|
+
* Deletes the index with base pathname +pathname_base+ from disk.
|
132
|
+
* Does nothing if the index does not exist. If that index is currently loaded
|
133
|
+
* in memory, expect to see segfaults when you try to access it.
|
134
|
+
*
|
135
|
+
*/
|
136
|
+
static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
  /* Index.delete!: hands the path to wp_index_delete, raises
   * Whistlepig::Error on failure, and returns the path argument. */
  Check_Type(v_pathname_base, T_STRING);

  char* path = RSTRING_PTR(v_pathname_base);
  wp_error* err = wp_index_delete(path);
  RAISE_IF_NECESSARY(err);

  return v_pathname_base;
}
|
144
|
+
|
145
|
+
/*
|
146
|
+
* Returns the number of entries in the index.
|
147
|
+
*
|
148
|
+
*/
|
149
|
+
static VALUE index_size(VALUE self) {
  wp_index* idx;
  Data_Get_Struct(self, wp_index, idx);

  /* Box the document count reported by the index as a Ruby Integer. */
  return INT2NUM(wp_index_num_docs(idx));
}
|
154
|
+
|
155
|
+
/* Index#initialize: stores the base pathname in @pathname_base and returns
 * the receiver. */
static VALUE index_init(VALUE self, VALUE v_pathname_base) {
  rb_iv_set(self, "@pathname_base", v_pathname_base);

  return self;
}
|
159
|
+
|
160
|
+
/*
|
161
|
+
* call-seq: count(query)
|
162
|
+
*
|
163
|
+
* Returns the number of entries matched by +query+, which should be a Query object.
|
164
|
+
* Note that in the current implementation, this is almost as expensive as retrieving all the
|
165
|
+
* results directly.
|
166
|
+
*
|
167
|
+
*/
|
168
|
+
static VALUE index_count(VALUE self, VALUE v_query) {
  /* Index#count: returns the number of results for +v_query+ as a Ruby
   * Integer. Raises TypeError unless the argument's class is exactly
   * Whistlepig::Query, and Whistlepig::Error if the count fails. */
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);
  uint32_t num_results = 0;

  // clone the query because we don't want to interrupt any search state
  // which may otherwise be being used for pagination.
  // NOTE(review): the clone is never released here; confirm whether
  // wp_index_count_results takes ownership, otherwise it leaks.
  wp_error* e = wp_index_count_results(index, wp_query_clone(query), &num_results);
  RAISE_IF_NECESSARY(e);

  // num_results is unsigned 32-bit: UINT2NUM keeps counts above INT_MAX
  // from turning negative, which INT2NUM would do.
  return UINT2NUM(num_results);
}
|
184
|
+
|
185
|
+
/*
|
186
|
+
* Closes the index, flushing all changes to disk. Future calls to this index
|
187
|
+
* may result in a segfault.
|
188
|
+
*
|
189
|
+
*/
|
190
|
+
static VALUE index_close(VALUE self) {
  /* Index#close: calls wp_index_unload on the wrapped index, raises
   * Whistlepig::Error on failure, and returns nil. */
  wp_index* idx;
  Data_Get_Struct(self, wp_index, idx);

  wp_error* err = wp_index_unload(idx);
  RAISE_IF_NECESSARY(err);

  return Qnil;
}
|
197
|
+
|
198
|
+
/*
 * Free function passed to Data_Wrap_Struct for Entry objects. Releases the
 * wrapped wp_entry; if wp_entry_free reports an error it is printed to
 * stderr and then released.
 */
static void entry_free(wp_entry* entry) {
  wp_error* err = wp_entry_free(entry);
  if(err == NULL) return;

  PRINT_ERROR(err, stderr);
  wp_error_free(err);
}
|
206
|
+
|
207
|
+
/* Creates a new, empty entry. */
|
208
|
+
static VALUE entry_new(VALUE class) {
  /* Entry.new: allocate a wp_entry, wrap it with entry_free as its free
   * function, then run #initialize with no arguments. */
  wp_entry* fresh = wp_entry_new();
  VALUE wrapped = Data_Wrap_Struct(class, NULL, entry_free, fresh);

  rb_obj_call_init(wrapped, 0, NULL);

  return wrapped;
}
|
216
|
+
|
217
|
+
/*
|
218
|
+
* call-seq: add_token(field, token)
|
219
|
+
*
|
220
|
+
* Adds a single token +token+ with field +field+ to an entry. Both
|
221
|
+
* +token+ and +field+ must be strings.
|
222
|
+
*
|
223
|
+
* Returns itself.
|
224
|
+
*/
|
225
|
+
static VALUE entry_add_token(VALUE self, VALUE field, VALUE term) {
  /* Both arguments must be Ruby Strings; anything else raises TypeError. */
  Check_Type(field, T_STRING);
  Check_Type(term, T_STRING);

  wp_entry* target;
  Data_Get_Struct(self, wp_entry, target);

  char* field_str = RSTRING_PTR(field);
  char* term_str = RSTRING_PTR(term);
  wp_error* err = wp_entry_add_token(target, field_str, term_str);
  RAISE_IF_NECESSARY(err);

  /* Return the receiver. */
  return self;
}
|
235
|
+
|
236
|
+
/*
|
237
|
+
* call-seq: add_string(field, string)
|
238
|
+
*
|
239
|
+
* Adds a String +string+ with field +field+ to an entry. The string
|
240
|
+
* will be tokenized on whitespace. Both +field+ and +string+ must be
|
241
|
+
* strings.
|
242
|
+
*
|
243
|
+
* Returns itself.
|
244
|
+
*/
|
245
|
+
static VALUE entry_add_string(VALUE self, VALUE field, VALUE string) {
  /* Both arguments must be Ruby Strings; anything else raises TypeError. */
  Check_Type(field, T_STRING);
  Check_Type(string, T_STRING);

  wp_entry* target;
  Data_Get_Struct(self, wp_entry, target);

  char* field_str = RSTRING_PTR(field);
  char* text = RSTRING_PTR(string);
  wp_error* err = wp_entry_add_string(target, field_str, text);
  RAISE_IF_NECESSARY(err);

  /* Return the receiver. */
  return self;
}
|
255
|
+
|
256
|
+
/*
|
257
|
+
* Returns the number of tokens in the entry.
|
258
|
+
*/
|
259
|
+
static VALUE entry_size(VALUE self) {
  wp_entry* target;
  Data_Get_Struct(self, wp_entry, target);

  /* Box whatever wp_entry_size reports as a Ruby Integer. */
  return INT2NUM(wp_entry_size(target));
}
|
263
|
+
|
264
|
+
/*
|
265
|
+
* call-seq: add_entry(entry)
|
266
|
+
*
|
267
|
+
* Adds the entry +entry+ to the index. Returns the document id
|
268
|
+
* corresponding to this entry.
|
269
|
+
*/
|
270
|
+
static VALUE index_add_entry(VALUE self, VALUE v_entry) {
  /* Index#add_entry: adds the wrapped wp_entry to the index and returns the
   * doc id filled in by wp_index_add_entry. Raises TypeError unless the
   * argument's class is exactly Whistlepig::Entry, and Whistlepig::Error if
   * the add fails. */
  if(CLASS_OF(v_entry) != c_entry) {
    rb_raise(rb_eTypeError, "entry must be a Whistlepig::Entry object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_entry* entry; Data_Get_Struct(v_entry, wp_entry, entry);
  uint64_t doc_id = 0;
  wp_error* e = wp_index_add_entry(index, entry, &doc_id);
  RAISE_IF_NECESSARY(e);

  // doc_id is unsigned 64-bit: ULL2NUM preserves the full value, whereas
  // INT2NUM would truncate it to an int.
  return ULL2NUM(doc_id);
}
|
284
|
+
|
285
|
+
/*
|
286
|
+
* call-seq: add_label(doc_id, label)
|
287
|
+
*
|
288
|
+
* Adds the label +label+ to the document corresponding to doc id
|
289
|
+
* +doc_id+ in the index. +label+ must be a String.
|
290
|
+
* If the label has already been added to the document, does nothing.
|
291
|
+
*/
|
292
|
+
static VALUE index_add_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  /* Index#add_label: the doc id must be a Fixnum and the label a String. */
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  wp_index* idx;
  Data_Get_Struct(self, wp_index, idx);

  /* NOTE(review): NUM2INT limits doc ids to the int range although
   * add_entry produces uint64_t ids -- confirm the intended width. */
  wp_error* err = wp_index_add_label(idx, RSTRING_PTR(v_label), NUM2INT(v_doc_id));
  RAISE_IF_NECESSARY(err);

  /* The label is handed back to the caller. */
  return v_label;
}
|
302
|
+
|
303
|
+
/*
|
304
|
+
* call-seq: remove_label(doc_id, label)
|
305
|
+
*
|
306
|
+
* Removes the label +label+ from the document corresponding to doc id
|
307
|
+
* +doc_id+ in the index. +label+ must be a String.
|
308
|
+
* If the label has not been added to the document, does nothing.
|
309
|
+
*/
|
310
|
+
static VALUE index_remove_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  /* Index#remove_label: the doc id must be a Fixnum and the label a String. */
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  wp_index* idx;
  Data_Get_Struct(self, wp_index, idx);

  /* NOTE(review): NUM2INT limits doc ids to the int range although
   * add_entry produces uint64_t ids -- confirm the intended width. */
  wp_error* err = wp_index_remove_label(idx, RSTRING_PTR(v_label), NUM2INT(v_doc_id));
  RAISE_IF_NECESSARY(err);

  /* The label is handed back to the caller. */
  return v_label;
}
|
320
|
+
|
321
|
+
/*
|
322
|
+
* call-seq: Query.new(default_field, query_string)
|
323
|
+
*
|
324
|
+
* Creates a new query by parsing the string +query_string+, which must be a
|
325
|
+
* String. Any non-fielded terms will use the field +default_field+, which
|
326
|
+
* must also be a String. Raises a ParseError if the query cannot be parsed.
|
327
|
+
*
|
328
|
+
*/
|
329
|
+
static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
  /* Query.new: both arguments must be Ruby Strings. */
  Check_Type(default_field, T_STRING);
  Check_Type(string, T_STRING);

  wp_query* parsed;
  wp_error* err = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &parsed);
  if(err != NULL) {
    /* Parse failures are raised as ParseError rather than Error, so the
     * RAISE_IF_NECESSARY macro is not used here. */
    VALUE parse_exc = rb_exc_new2(c_parseerror, err->msg);
    wp_error_free(err);
    rb_exc_raise(parse_exc);
  }

  /* Wrap the parsed query (wp_query_free is its free function) and run
   * #initialize with the query string first, then the default field. */
  VALUE wrapped = Data_Wrap_Struct(class, NULL, wp_query_free, parsed);
  VALUE init_args[2] = { string, default_field };
  rb_obj_call_init(wrapped, 2, init_args);

  return wrapped;
}
|
347
|
+
|
348
|
+
/*
|
349
|
+
* Returns a String representation of the parsed query, useful for debugging.
|
350
|
+
*/
|
351
|
+
static VALUE query_to_s(VALUE self) {
  /* Query#to_s: renders the wrapped query into a fixed-size stack buffer
   * and returns its contents as a new Ruby String. */
  enum { TEXT_CAPACITY = 1024 };

  wp_query* q;
  Data_Get_Struct(self, wp_query, q);

  char text[TEXT_CAPACITY];
  wp_query_to_s(q, TEXT_CAPACITY, text);

  return rb_str_new2(text);
}
|
359
|
+
|
360
|
+
/*
|
361
|
+
* call-seq: and(other)
|
362
|
+
*
|
363
|
+
* Returns a new Query that is a conjunction of this query and +other+, which
|
364
|
+
* must also be a Query object.
|
365
|
+
*
|
366
|
+
*/
|
367
|
+
static VALUE query_and(VALUE self, VALUE v_other) {
  /* Query#and: the argument's class must be exactly Whistlepig::Query. */
  if(CLASS_OF(v_other) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
    /* not reached */
  }

  wp_query* lhs;
  wp_query* rhs;
  Data_Get_Struct(self, wp_query, lhs);
  Data_Get_Struct(v_other, wp_query, rhs);

  /* Start from an empty conjunction and add a clone of each operand. */
  wp_query* combined = wp_query_new_conjunction();
  combined = wp_query_add(combined, wp_query_clone(lhs));
  combined = wp_query_add(combined, wp_query_clone(rhs));

  /* The combined query has no source string, so #initialize gets nils. */
  VALUE wrapped = Data_Wrap_Struct(c_query, NULL, wp_query_free, combined);
  VALUE init_args[2] = { Qnil, Qnil };
  rb_obj_call_init(wrapped, 2, init_args);

  return wrapped;
}
|
386
|
+
|
387
|
+
/*
|
388
|
+
* call-seq: or(other)
|
389
|
+
*
|
390
|
+
* Returns a new Query that is a disjunction of this query and +other+, which
|
391
|
+
* must also be a Query object.
|
392
|
+
*
|
393
|
+
*/
|
394
|
+
static VALUE query_or(VALUE self, VALUE v_other) {
  /* Query#or: the argument's class must be exactly Whistlepig::Query. */
  if(CLASS_OF(v_other) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
    /* not reached */
  }

  wp_query* lhs;
  wp_query* rhs;
  Data_Get_Struct(self, wp_query, lhs);
  Data_Get_Struct(v_other, wp_query, rhs);

  /* Start from an empty disjunction and add a clone of each operand. */
  wp_query* combined = wp_query_new_disjunction();
  combined = wp_query_add(combined, wp_query_clone(lhs));
  combined = wp_query_add(combined, wp_query_clone(rhs));

  /* The combined query has no source string, so #initialize gets nils. */
  VALUE wrapped = Data_Wrap_Struct(c_query, NULL, wp_query_free, combined);
  VALUE init_args[2] = { Qnil, Qnil };
  rb_obj_call_init(wrapped, 2, init_args);

  return wrapped;
}
|
413
|
+
|
414
|
+
/*
 * Query#initialize(query, default_field): stores the first argument in
 * @query and returns the receiver.
 *
 * The method is registered with arity 2 and is always invoked with two
 * arguments (the query string and the default field), so the C function
 * must declare both; the second is currently unused.
 */
static VALUE query_init(VALUE self, VALUE query, VALUE default_field) {
  (void)default_field; // accepted to match the registered arity
  rb_iv_set(self, "@query", query);
  return self;
}
|
418
|
+
|
419
|
+
/*
|
420
|
+
* call-seq: setup_query(query)
|
421
|
+
*
|
422
|
+
* Initializes query for use with run_query. If you do not call teardown_query
|
423
|
+
* on this query later, you will leak memory.
|
424
|
+
*/
|
425
|
+
static VALUE index_setup_query(VALUE self, VALUE v_query) {
  /* Index#setup_query: the argument's class must be exactly
   * Whistlepig::Query. */
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
    /* not reached */
  }

  wp_index* idx;
  wp_query* q;
  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, q);

  wp_error* err = wp_index_setup_query(idx, q);
  RAISE_IF_NECESSARY(err);

  /* Return the index itself. */
  return self;
}
|
438
|
+
|
439
|
+
/*
|
440
|
+
* call-seq: teardown_query(query)
|
441
|
+
*
|
442
|
+
* Releases any held state used by the query, if it has been first passed to
|
443
|
+
* setup_query. If you call run_query on this query after calling this
|
444
|
+
* function, terrible things will happen.
|
445
|
+
*/
|
446
|
+
static VALUE index_teardown_query(VALUE self, VALUE v_query) {
  /* Index#teardown_query: the argument's class must be exactly
   * Whistlepig::Query. */
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
    /* not reached */
  }

  wp_index* idx;
  wp_query* q;
  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, q);

  wp_error* err = wp_index_teardown_query(idx, q);
  RAISE_IF_NECESSARY(err);

  /* Return the index itself. */
  return self;
}
|
459
|
+
|
460
|
+
/*
|
461
|
+
* call-seq: run_query(query, max_num_results)
|
462
|
+
*
|
463
|
+
* Runs a query which has been first passed to setup_query, and returns an
|
464
|
+
* array of at most +max_num_results+ doc ids. Can be called
|
465
|
+
* multiple times to retrieve successive results from the query. The query
|
466
|
+
* must have been passed to setup_query first, or terrible things will happen.
|
467
|
+
* The query must be passed to teardown_query when done, or memory leaks will
|
468
|
+
* occur.
|
469
|
+
*
|
470
|
+
*/
|
471
|
+
static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results) {
  /* Index#run_query: fetches up to +v_max_num_results+ doc ids for
   * +v_query+ and returns them as a Ruby Array of Integers.
   *
   * Raises TypeError for a non-Fixnum limit or a non-Query query,
   * ArgumentError for a negative limit, NoMemoryError if the result buffer
   * cannot be allocated, and Whistlepig::Error if the query itself fails. */
  Check_Type(v_max_num_results, T_FIXNUM);
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);

  // Reject negative limits before they wrap around to a huge uint32_t.
  int requested = NUM2INT(v_max_num_results);
  if(requested < 0) {
    rb_raise(rb_eArgError, "max_num_results must be non-negative");
    // not reached
  }

  uint32_t max_num_results = (uint32_t)requested;
  uint32_t num_results = 0;

  // calloc guards the count * size multiplication against overflow.
  // A NULL result is only an error when a non-empty buffer was requested.
  uint64_t* results = calloc(max_num_results, sizeof *results);
  if(results == NULL && max_num_results > 0) rb_memerror();

  wp_error* e = wp_index_run_query(index, query, max_num_results, &num_results, results);
  if(e != NULL) {
    free(results); // raising does not return, so release the buffer first
    RAISE_IF_NECESSARY(e);
  }

  VALUE array = rb_ary_new2(num_results);
  for(uint32_t i = 0; i < num_results; i++) {
    // doc ids are unsigned 64-bit; ULL2NUM keeps the full value
    rb_ary_store(array, i, ULL2NUM(results[i]));
  }
  free(results);

  return array;
}
|
496
|
+
|
497
|
+
void Init_whistlepigc() {
|
498
|
+
VALUE m_whistlepig;
|
499
|
+
|
500
|
+
m_whistlepig = rb_define_module("Whistlepig");
|
501
|
+
|
502
|
+
c_index = rb_define_class_under(m_whistlepig, "Index", rb_cObject);
|
503
|
+
rb_define_singleton_method(c_index, "new", index_new, 1);
|
504
|
+
rb_define_singleton_method(c_index, "create", index_create, 1);
|
505
|
+
rb_define_singleton_method(c_index, "load", index_load, 1);
|
506
|
+
rb_define_singleton_method(c_index, "delete!", index_delete, 1);
|
507
|
+
rb_define_singleton_method(c_index, "exists?", index_exists, 1);
|
508
|
+
rb_define_method(c_index, "initialize", index_init, 1);
|
509
|
+
rb_define_method(c_index, "close", index_close, 0);
|
510
|
+
rb_define_method(c_index, "size", index_size, 0);
|
511
|
+
rb_define_method(c_index, "add_entry", index_add_entry, 1);
|
512
|
+
rb_define_method(c_index, "add_label", index_add_label, 2);
|
513
|
+
rb_define_method(c_index, "remove_label", index_remove_label, 2);
|
514
|
+
rb_define_method(c_index, "count", index_count, 1);
|
515
|
+
rb_define_method(c_index, "setup_query", index_setup_query, 1);
|
516
|
+
rb_define_method(c_index, "run_query", index_run_query, 2);
|
517
|
+
rb_define_method(c_index, "teardown_query", index_teardown_query, 1);
|
518
|
+
rb_define_attr(c_index, "pathname_base", 1, 0);
|
519
|
+
|
520
|
+
c_entry = rb_define_class_under(m_whistlepig, "Entry", rb_cObject);
|
521
|
+
rb_define_singleton_method(c_entry, "new", entry_new, 0);
|
522
|
+
rb_define_method(c_entry, "size", entry_size, 0);
|
523
|
+
rb_define_method(c_entry, "add_token", entry_add_token, 2);
|
524
|
+
rb_define_method(c_entry, "add_string", entry_add_string, 2);
|
525
|
+
//rb_define_method(c_entry, "add_file", entry_add_file, 2);
|
526
|
+
|
527
|
+
c_query = rb_define_class_under(m_whistlepig, "Query", rb_cObject);
|
528
|
+
rb_define_singleton_method(c_query, "new", query_new, 2);
|
529
|
+
rb_define_method(c_query, "initialize", query_init, 2);
|
530
|
+
rb_define_method(c_query, "and", query_and, 1);
|
531
|
+
rb_define_method(c_query, "or", query_or, 1);
|
532
|
+
rb_define_method(c_query, "to_s", query_to_s, 0);
|
533
|
+
rb_define_attr(c_query, "query", 1, 0);
|
534
|
+
|
535
|
+
c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
|
536
|
+
c_parseerror = rb_define_class_under(m_whistlepig, "ParseError", rb_eStandardError);
|
537
|
+
}
|
data/lib/whistlepig.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require "whistlepigc"
|
2
|
+
|
3
|
+
module Whistlepig
|
4
|
+
## A full-text index. You can add entries to it, and you can run queries
|
5
|
+
## against it.
|
6
|
+
##
|
7
|
+
## To add documents, create Entry objects and call add_entry. Entries
|
8
|
+
## represent the document before addition; add_entry will return an integer
|
9
|
+
## docid and the entry can be discarded at that point.
|
10
|
+
##
|
11
|
+
## To run queries, the simplest option is to call Index#search or
|
12
|
+
## Index#each_result_for.
|
13
|
+
##
|
14
|
+
## The more complex option is to use setup_query, run_query, and
|
15
|
+
## teardown_query, in that order. The advantage of this approach is that
|
16
|
+
## run_query can be called multiple times, and each call will return more
|
17
|
+
## results, allowing for query pagination.
|
18
|
+
class Index
|
19
|
+
## Runs a query and yield each matching doc id. Handles the mechanics of
|
20
|
+
## setting up and tearing down the query.
|
21
|
+
def each_result_for query, chunk_size=10
  ## Fetches results +chunk_size+ at a time and yields every doc id. A
  ## short page (fewer than +chunk_size+ ids) marks the end of the results.
  ## The query is always torn down, even if the block raises. Returns self.
  setup_query query
  begin
    exhausted = false
    until exhausted
      batch = run_query query, chunk_size
      batch.each { |doc_id| yield doc_id }
      exhausted = batch.size < chunk_size
    end
  ensure
    teardown_query query
  end
  self
end
|
34
|
+
|
35
|
+
## Convenience method. Runs a query and returns up to +max_results+
|
36
|
+
## matching doc ids. Handles the mechanics of setting up and tearing down
|
37
|
+
## the query.
|
38
|
+
def search query, max_results=nil
  ## Runs +query+ and returns an Array of matching doc ids.
  ##
  ## With +max_results+ given, a single run_query call asks for that many
  ## ids and its result is returned. Without it, results are fetched 100 at
  ## a time until a short page signals the end. The query is always torn
  ## down, even if run_query raises.
  setup_query query
  ret = []
  num_per_call = max_results || 100
  begin
    while true
      results = run_query query, num_per_call
      ## concat appends in place; += would copy the whole accumulator on
      ## every page.
      ret.concat results
      break if max_results || results.size < num_per_call
    end
  ensure
    teardown_query query
  end

  ret
end
|
54
|
+
end
|
55
|
+
|
56
|
+
## Represents a document, before being added to the index.
|
57
|
+
##
|
58
|
+
## Entries allow you to build up a document in memory before indexing it.
|
59
|
+
## Once you've built it, pass it to Index#add_entry.
|
60
|
+
class Entry
|
61
|
+
end
|
62
|
+
|
63
|
+
## A generic error.
|
64
|
+
class Error
|
65
|
+
end
|
66
|
+
|
67
|
+
## A parser error.
|
68
|
+
class ParseError
|
69
|
+
end
|
70
|
+
|
71
|
+
## A query. Queries are created from strings with Query#new. If parsing the
|
72
|
+
## string fails, a ParseError is thrown.
|
73
|
+
##
|
74
|
+
## At the lowest level, queries are composed of space-separated terms.
|
75
|
+
## Matches against that term are restricted to the default field specified at
|
76
|
+
## parse time.
|
77
|
+
##
|
78
|
+
## hello # search for "hello" in the default field
|
79
|
+
##
|
80
|
+
## Term matches can be restricted to another field by
|
81
|
+
## prefixing them with the field name and ":", e.g. "subject:hello".
|
82
|
+
##
|
83
|
+
## subject:hello # search for "hello" in the "subject" field
|
84
|
+
##
|
85
|
+
## Multiple terms are considered conjunctive (i.e. all must match) unless the
|
86
|
+
## special token "OR" appears between them. The "OR" must be capitalized
|
87
|
+
## in this case.
|
88
|
+
## word1 word2 # search for word1 and word2
|
89
|
+
## word1 OR word2 # search for word1 or word2
|
90
|
+
## subject:hello bob # "hello" in the subject field and "bob" in the
|
91
|
+
## # default field
|
92
|
+
##
|
93
|
+
## Parentheses can be used to group disjunctions, conjunctions or fields.
|
94
|
+
## (word1 OR word2) word3 # "word3" and either "word1" or "word2"
|
95
|
+
## field:(word1 OR word2) # "word1" or "word2" in field "field"
|
96
|
+
##
|
97
|
+
## Phrases are specified by surrounding the terms with double quotes.
|
98
|
+
## "bob jones" # documents with the phrase "bob jones"
|
99
|
+
##
|
100
|
+
## Negations can be specified with a - prefix.
|
101
|
+
## -word # docs without "word"
|
102
|
+
## -subject:(bob OR joe) # docs with neither "bob" nor "joe" in subject
|
103
|
+
##
|
104
|
+
## Labels are specified with a ~ prefix. Labels do not have fields.
|
105
|
+
## ~inbox # docs with the "inbox" label
|
106
|
+
## -~inbox # docs without the "inbox" label
|
107
|
+
## -~inbox subject:hello # docs with subject "hello" and without the
|
108
|
+
## # inbox label
|
109
|
+
##
|
110
|
+
## All of the above can be mixed and matched, of course.
|
111
|
+
## -subject:"spam email" ~inbox (money OR cash)
|
112
|
+
## ("love you" OR "hate you") -(~deleted OR ~spam)
|
113
|
+
## etc...
|
114
|
+
##
|
115
|
+
## Existing query objects can also be altered programmatically, at least to
|
116
|
+
## a limited extent, by calling Query#and and Query#or.
|
117
|
+
class Query
|
118
|
+
end
|
119
|
+
end
|