whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,746 @@
|
|
1
|
+
#include "whistlepig.h"
|
2
|
+
|
3
|
+
/********* search states *********/
|
4
|
+
typedef struct term_search_state {
|
5
|
+
posting posting;
|
6
|
+
int started;
|
7
|
+
int done;
|
8
|
+
int label; // 1 if a label; 0 if a term
|
9
|
+
} term_search_state;
|
10
|
+
|
11
|
+
typedef struct neg_search_state {
|
12
|
+
docid_t next; // the next document in the child stream. we will never return this document.
|
13
|
+
docid_t cur; // the last doc we returned
|
14
|
+
} neg_search_state;
|
15
|
+
|
16
|
+
#define DISJ_SEARCH_STATE_EMPTY 0
|
17
|
+
#define DISJ_SEARCH_STATE_FILLED 1
|
18
|
+
#define DISJ_SEARCH_STATE_DONE 2
|
19
|
+
|
20
|
+
typedef struct disj_search_state {
|
21
|
+
docid_t last_docid;
|
22
|
+
uint8_t* states; // whether the search result has been initialized or not
|
23
|
+
search_result* results; // array of search results, one per child
|
24
|
+
} disj_search_state;
|
25
|
+
|
26
|
+
// Releases the memory owned by a search result: the positions array of every
// doc match, then the doc match array itself. The search_result struct is not
// freed and its fields are left untouched.
void wp_search_result_free(search_result* result) {
  doc_match* match = result->doc_matches;
  int remaining = result->num_doc_matches;

  while(remaining > 0) {
    free(match->positions);
    match++;
    remaining--;
  }

  free(result->doc_matches);
}
|
33
|
+
|
34
|
+
// Fills `result` with a single doc match describing `posting`.
//
// The positions are deep-copied, so the result stays valid after the posting is
// re-read or freed; `field` and `word` are stored as borrowed pointers. The
// caller releases the result with wp_search_result_free().
//
// Raises an error if memory cannot be allocated; `result` is left untouched in
// that case.
RAISING_STATIC(search_result_init(search_result* result, const char* field, const char* word, posting* posting)) {
  size_t size = sizeof(pos_t) * posting->num_positions;

  doc_match* dm = malloc(sizeof(doc_match));
  if(dm == NULL) RAISE_ERROR("out of memory");

  dm->field = field;
  dm->word = word;
  dm->num_positions = posting->num_positions;
  dm->positions = malloc(size);
  //printf("for result at %p, allocated %u bytes for positions at %p\n", result, size, dm->positions);

  // malloc(0) may legitimately return NULL, and memcpy must not be handed a
  // NULL pointer even for a zero-byte copy, so only check and copy when there
  // is something to copy.
  if(size > 0) {
    if(dm->positions == NULL) {
      free(dm);
      RAISE_ERROR("out of memory");
    }
    memcpy(dm->positions, posting->positions, size);
  }

  result->doc_id = posting->doc_id;
  result->num_doc_matches = 1;
  result->doc_matches = dm;

  return NO_ERROR;
}
|
49
|
+
|
50
|
+
// Merges the results of `num_child_results` children into `result`, taking the
// doc id from the first child and one doc match per child.
//
// Only the first doc match of each child is kept. Its contents (including the
// positions array) are MOVED into `result`: the child's doc_matches container
// and any additional matches it held are freed here, and the child is reset to
// an empty result so nothing is leaked or freed twice. A child without doc
// matches contributes an empty match.
//
// Raises an error if there are no children or memory cannot be allocated; the
// children are untouched in that case.
RAISING_STATIC(search_result_combine_into(search_result* result, search_result* child_results, int num_child_results)) {
  if(num_child_results <= 0) RAISE_ERROR("no child results");

  doc_match* combined = malloc(sizeof(doc_match) * (size_t)num_child_results);
  if(combined == NULL) RAISE_ERROR("out of memory");

  for(int i = 0; i < num_child_results; i++) {
    search_result* child = &child_results[i];

    if(child->doc_matches == NULL) {
      combined[i].field = NULL;
      combined[i].word = NULL;
      combined[i].num_positions = 0;
      combined[i].positions = NULL;
    }
    else {
      combined[i] = child->doc_matches[0]; // positions now belong to `result`

      // matches beyond the first are not carried over, so release them
      for(int j = 1; j < child->num_doc_matches; j++) free(child->doc_matches[j].positions);
      free(child->doc_matches);
      child->doc_matches = NULL;
      child->num_doc_matches = 0;
    }
  }

  result->doc_id = child_results[0].doc_id;
  result->num_doc_matches = num_child_results;
  result->doc_matches = combined;

  return NO_ERROR;
}
|
67
|
+
|
68
|
+
/*
|
69
|
+
* we provide two functions for iterating through result streams: next() and
|
70
|
+
* advance().
|
71
|
+
*
|
72
|
+
* next() returns results one at a time. it will set done = true if you're at
|
73
|
+
* the end of the stream. otherwise, it will give you a result. the next
|
74
|
+
* call to next() will give you the next result (or set done = true).
|
75
|
+
*
|
76
|
+
* advance() is given a docid and advances the stream to just *after* that
|
77
|
+
* document, and tells you whether it saw the docid on the way (and sets a result
|
78
|
+
* if so for your convenience).
|
79
|
+
*
|
80
|
+
* a next() followed by one or more advance() calls with the returned docid
|
81
|
+
* will set found = true and will not advance the stream beyond where it
|
82
|
+
* already is.
|
83
|
+
*
|
84
|
+
* however, an advance() to a docid followed by next() may skip a document in
|
85
|
+
* the stream. you probably don't want this.
|
86
|
+
*
|
87
|
+
* so advance is only useful if you have a particular doc_id in mind, and you
|
88
|
+
* want to see if this stream contains it. if you want to actually see all the
|
89
|
+
* docids in a stream, you must use next().
|
90
|
+
*
|
91
|
+
*/
|
92
|
+
|
93
|
+
/********** dispatch functions ***********/
|
94
|
+
static wp_error* term_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
|
95
|
+
static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
|
96
|
+
static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
|
97
|
+
static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
|
98
|
+
static wp_error* neg_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
|
99
|
+
static wp_error* term_release_search_state(wp_query* q) RAISES_ERROR;
|
100
|
+
static wp_error* conj_release_search_state(wp_query* q) RAISES_ERROR;
|
101
|
+
static wp_error* disj_release_search_state(wp_query* q) RAISES_ERROR;
|
102
|
+
static wp_error* phrase_release_search_state(wp_query* q) RAISES_ERROR;
|
103
|
+
static wp_error* neg_release_search_state(wp_query* q) RAISES_ERROR;
|
104
|
+
static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
|
105
|
+
static wp_error* conj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
|
106
|
+
static wp_error* disj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
|
107
|
+
static wp_error* phrase_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
|
108
|
+
static wp_error* neg_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
|
109
|
+
static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
|
110
|
+
static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
|
111
|
+
static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
|
112
|
+
static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
|
113
|
+
static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
|
114
|
+
|
115
|
+
// Calls the <kind>_<suffix> implementation matching a query node type and
// relays any error it returns; raises an error for an unknown type.
//
// the term_* functions also handle labels
// we use conj for empty queries as well (why not)
//
// This expands to a bare switch statement that can return from the enclosing
// function, so it may only be used as a full statement inside a function that
// returns wp_error*.
#define DISPATCH(type, suffix, ...) \
  switch(type) { \
    case WP_QUERY_TERM: \
    case WP_QUERY_LABEL: \
      RELAY_ERROR(term_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_EMPTY: \
    case WP_QUERY_CONJ: \
      RELAY_ERROR(conj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_DISJ: \
      RELAY_ERROR(disj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_PHRASE: \
      RELAY_ERROR(phrase_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_NEG: \
      RELAY_ERROR(neg_##suffix(__VA_ARGS__)); \
      break; \
    default: \
      RAISE_ERROR("unknown query node type %d", type); \
  }
|
128
|
+
|
129
|
+
// Sets up the search state of query node `q` for a search over segment `s` by
// handing off to the init function for the node's type.
wp_error* wp_search_init_search_state(wp_query* q, wp_segment* s) {
  DISPATCH(q->type, init_search_state, q, s);

  return NO_ERROR;
}
|
133
|
+
|
134
|
+
// Tears down the search state of query node `q` by handing off to the release
// function for the node's type.
wp_error* wp_search_release_search_state(wp_query* q) {
  DISPATCH(q->type, release_search_state, q);

  return NO_ERROR;
}
|
138
|
+
|
139
|
+
// Type-dispatching next(): asks node `q` for its next result. On return *done
// tells whether the stream has ended; otherwise `result` has been filled in.
RAISING_STATIC(query_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done)) {
  DISPATCH(q->type, next_doc, q, s, result, done);

#ifdef DEBUGOUTPUT
  char buf[1024];
  wp_query_to_s(q, 1024, buf);

  if(!*done) DEBUG("query %s has doc %u", buf, result->doc_id);
  else DEBUG("query %s is done", buf);
#endif

  return NO_ERROR;
}
|
150
|
+
|
151
|
+
// Type-dispatching advance(): moves node `q` towards `doc_id` and reports via
// *found whether the node matches that document and via *done whether its
// stream has ended.
RAISING_STATIC(query_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done)) {
  DISPATCH(q->type, advance_to_doc, q, s, doc_id, result, found, done);

#ifdef DEBUGOUTPUT
  char buf[1024];
  wp_query_to_s(q, 1024, buf);

  if(*done) DEBUG("query %s is done", buf);
  else if(*found) DEBUG("query %s has doc %u", buf, doc_id);
  else DEBUG("query %s does not have doc %u", buf, doc_id);
#endif

  return NO_ERROR;
}
|
165
|
+
|
166
|
+
/************** init functions *************/
|
167
|
+
|
168
|
+
// Initializes the search state of every child of `q`, in list order, stopping
// at the first child that reports an error.
RAISING_STATIC(init_children(wp_query* q, wp_segment* s)) {
  wp_query* node = q->children;

  while(node != NULL) {
    RELAY_ERROR(wp_search_init_search_state(node, s));
    node = node->next;
  }

  return NO_ERROR;
}
|
172
|
+
|
173
|
+
// Releases the search state of every child of `q`, in list order, stopping at
// the first child that reports an error.
RAISING_STATIC(release_children(wp_query* q)) {
  wp_query* node = q->children;

  while(node != NULL) {
    RELAY_ERROR(wp_search_release_search_state(node));
    node = node->next;
  }

  return NO_ERROR;
}
|
177
|
+
|
178
|
+
// Allocates the cursor for a term or label node and loads the first posting of
// its posting list from the segment, if the term has one.
//
// The state is marked done (and holds no posting) until a posting has actually
// been read, so a failed read never leaves behind a state that
// term_release_search_state would try to free a posting from.
//
// Raises an error if memory cannot be allocated or the segment read fails.
static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
  stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
  termhash* th = MMAP_OBJ(seg->termhash, termhash);

  term_search_state* state = malloc(sizeof(term_search_state));
  q->search_data = state;
  if(state == NULL) RAISE_ERROR("out of memory");

  state->started = 0;
  state->done = 1; // nothing loaded yet
  state->posting.positions = NULL;
  state->label = q->type == WP_QUERY_LABEL ? 1 : 0;

  // build the termhash key; labels use field 0
  term t;
  if(state->label) t.field_s = 0;
  else t.field_s = stringmap_string_to_int(sh, q->field); // will be -1 if not found
  t.word_s = stringmap_string_to_int(sh, q->word);

  uint32_t offset = termhash_get_val(th, t);
  if(offset == (uint32_t)-1) offset = OFFSET_NONE;

  // no entry in the term hash means the stream is empty and stays done
  if(offset != OFFSET_NONE) {
    if(state->label) RELAY_ERROR(wp_segment_read_label(seg, offset, &state->posting));
    else RELAY_ERROR(wp_segment_read_posting(seg, offset, &state->posting, 1));
    state->done = 0;
  }

  RELAY_ERROR(init_children(q, seg));

  return NO_ERROR;
}
|
206
|
+
|
207
|
+
// Frees the cursor of a term or label node. The positions of the current
// posting are only freed while the stream is not marked done. The children are
// released afterwards.
static wp_error* term_release_search_state(wp_query* q) {
  term_search_state* cursor = q->search_data;
  int holds_posting = !cursor->done;

  if(holds_posting) free(cursor->posting.positions);
  free(cursor);

  RELAY_ERROR(release_children(q));

  return NO_ERROR;
}
|
214
|
+
|
215
|
+
// Conjunctions keep no state of their own; only the children are initialized.
static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) {
  q->search_data = NULL;

  RELAY_ERROR(init_children(q, s));

  return NO_ERROR;
}
|
220
|
+
|
221
|
+
// Conjunctions own no state, so releasing one just releases its children.
static wp_error* conj_release_search_state(wp_query* q) {
  RELAY_ERROR(release_children(q));

  return NO_ERROR;
}
|
225
|
+
|
226
|
+
// Allocates the state of a disjunction node and initializes its children. The
// per-child buffers are left NULL here; disj_next_doc allocates them on first
// use.
//
// Raises an error if memory cannot be allocated.
static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) {
  disj_search_state* state = malloc(sizeof(disj_search_state));
  q->search_data = state;
  if(state == NULL) RAISE_ERROR("out of memory");

  state->states = NULL;
  state->results = NULL;
  state->last_docid = DOCID_NONE;

  RELAY_ERROR(init_children(q, s));

  return NO_ERROR;
}
|
234
|
+
|
235
|
+
// Frees the state of a disjunction node, including any child results that are
// still sitting in its buffer, and then releases the children.
static wp_error* disj_release_search_state(wp_query* q) {
  disj_search_state* buffer = (disj_search_state*)q->search_data;

  if(buffer->states != NULL) {
    // only FILLED slots own a result that still has to be freed
    for(uint16_t slot = 0; slot < q->num_children; slot++) {
      if(buffer->states[slot] != DISJ_SEARCH_STATE_FILLED) continue;
      wp_search_result_free(&buffer->results[slot]);
    }
    free(buffer->states);
    free(buffer->results);
  }
  free(buffer);

  RELAY_ERROR(release_children(q));

  return NO_ERROR;
}
|
249
|
+
|
250
|
+
// Phrases keep no state of their own; only the children are initialized.
static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) {
  q->search_data = NULL;

  RELAY_ERROR(init_children(q, s));

  return NO_ERROR;
}
|
255
|
+
|
256
|
+
// Phrases own no state, so releasing one just releases its children.
static wp_error* phrase_release_search_state(wp_query* q) {
  RELAY_ERROR(release_children(q));

  return NO_ERROR;
}
|
260
|
+
|
261
|
+
// Sets up a negation node: initializes its single child, positions the virtual
// doc pointer one past the segment's document count, and pre-reads the child's
// first document so the negation knows which doc id to skip first.
//
// Raises an error if the node does not have exactly one child, if memory
// cannot be allocated, or if the child fails.
static wp_error* neg_init_search_state(wp_query* q, wp_segment* seg) {
  if(q->num_children != 1) RAISE_ERROR("negations currently only operate on single children");

  RELAY_ERROR(wp_search_init_search_state(q->children, seg));

  postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
  neg_search_state* state = malloc(sizeof(neg_search_state));
  q->search_data = state;
  if(state == NULL) RAISE_ERROR("out of memory");

  state->cur = pr->num_docs + 1;
  state->next = DOCID_NONE; // keeps the state well-defined if the read below fails

  search_result result;
  int done = 0;
  RELAY_ERROR(query_next_doc(q->children, seg, &result, &done));
  if(!done) {
    // only the doc id is needed; the match details are discarded
    state->next = result.doc_id;
    wp_search_result_free(&result);
  }
  DEBUG("initialized with cur %u and next %u", state->cur, state->next);

  return NO_ERROR;
}
|
282
|
+
|
283
|
+
// Releases the negation's single child first, then frees the negation's own
// state. If the child release fails, the error is relayed and the state is not
// freed.
static wp_error* neg_release_search_state(wp_query* q) {
  wp_query* only_child = q->children;

  RELAY_ERROR(wp_search_release_search_state(only_child));
  free(q->search_data);

  return NO_ERROR;
}
|
288
|
+
|
289
|
+
/********** search functions **********/
|
290
|
+
|
291
|
+
// next() for term and label nodes.
//
// The first call returns the posting loaded by term_init_search_state; each
// later call moves to the following posting in the list. Sets *done = 1 once
// the list is exhausted, otherwise fills `result` with a copy of the current
// posting (release it with wp_search_result_free()).
static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) {
  term_search_state* state = (term_search_state*)q->search_data;

  DEBUG("[%s:'%s'] before: started is %d, done is %d", q->field, q->word, state->started, state->done);
  if(state->done) {
    *done = 1;
    return NO_ERROR;
  }

  *done = 0;
  if(!state->started) { // start
    state->started = 1;
    RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
  }
  else { // advance
    free(state->posting.positions);
    // clear the pointer so a failed read below cannot lead to a double free
    // when the state is released later
    state->posting.positions = NULL;

    if(state->posting.next_offset == OFFSET_NONE) { // end of stream
      *done = state->done = 1;
    }
    else {
      if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
      else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
      RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
    }
  }
  DEBUG("[%s:'%s'] after: doc id %u, done is %d, started is %d", q->field, q->word, (state->started && !state->done && result) ? result->doc_id : 0, *done, state->started);

  return NO_ERROR;
}
|
320
|
+
|
321
|
+
// advance() for term and label nodes.
//
// Skips postings whose doc id is greater than `doc_id` (the loop below only
// moves while the current doc id is larger). Sets *found = 1 and fills
// `result` when the cursor lands exactly on `doc_id`; sets *done = 1 when the
// list runs out first.
static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
  term_search_state* state = (term_search_state*)q->search_data;
  DEBUG("[%s:'%s'] seeking through postings for doc %u", q->field, q->word, doc_id);

  if(state->done) { // end of stream
    *found = 0;
    *done = 1;
    return NO_ERROR;
  }

  while(state->posting.doc_id > doc_id) {
    free(state->posting.positions);
    // clear the pointer so a failed read below cannot lead to a double free
    // when the state is released later
    state->posting.positions = NULL;
    DEBUG("skipping doc_id %u", state->posting.doc_id);

    if(state->posting.next_offset == OFFSET_NONE) {
      state->done = 1;
      break;
    }

    if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
    else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
  }

  if(state->done) {
    DEBUG("[%s:'%s'] posting list exhausted", q->field, q->word);
    *found = 0;
    *done = 1;
  }
  else {
    *done = 0;
    DEBUG("[%s:'%s'] posting advanced to that of doc %u", q->field, q->word, state->posting.doc_id);
    *found = (doc_id == state->posting.doc_id ? 1 : 0);
    if(*found) RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
  }

  return NO_ERROR;
}
|
358
|
+
|
359
|
+
// this advances all children *until* it finds a child that doesn't have the
|
360
|
+
// doc. at that point it stops. so it will return found=0 if any single child
|
361
|
+
// doesn't have the doc, and done=1 if any single child is done.
|
362
|
+
//
|
363
|
+
// this is used by both phrasal and conjunctive queries.
|
364
|
+
static wp_error* advance_all_children(wp_query* q, wp_segment* seg, docid_t search_doc, search_result* child_results, int* found, int* done) {
|
365
|
+
int num_children_searched = 0;
|
366
|
+
*found = 1;
|
367
|
+
|
368
|
+
DEBUG("advancing all children to doc %u with early termination", search_doc);
|
369
|
+
|
370
|
+
for(wp_query* child = q->children; child != NULL; child = child->next) {
|
371
|
+
RELAY_ERROR(query_advance_to_doc(child, seg, search_doc, &child_results[num_children_searched], found, done));
|
372
|
+
num_children_searched++;
|
373
|
+
if(!*found) break;
|
374
|
+
}
|
375
|
+
|
376
|
+
if(!*found) for(int i = 0; i < num_children_searched - 1; i++) wp_search_result_free(&child_results[i]);
|
377
|
+
|
378
|
+
return NO_ERROR;
|
379
|
+
}
|
380
|
+
|
381
|
+
// next() for disjunctions.
//
// Keeps at most one pending result per child in a buffer, and on every call
// returns the buffered result with the largest doc id. Results equal to the
// doc id returned by the previous call are duplicates (another child matched
// the same document) and are dropped.
//
// A child whose duplicate was dropped is asked for its next result right away,
// before the largest doc id is picked. Otherwise a larger doc id still waiting
// in that child could be returned after a smaller one, or the node could
// report done while the child still had documents.
//
// Raises an error if the buffers cannot be allocated or a child fails.
static wp_error* disj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
  if(q->children == NULL) {
    *done = 1;
    return NO_ERROR;
  }

  // allocate search state if necessary
  disj_search_state* state = (disj_search_state*)q->search_data;
  if(state->states == NULL) {
    uint8_t* states = malloc(sizeof(uint8_t) * q->num_children);
    search_result* results = malloc(sizeof(search_result) * q->num_children);
    if((states == NULL) || (results == NULL)) {
      free(states);
      free(results);
      RAISE_ERROR("out of memory");
    }
    memset(states, DISJ_SEARCH_STATE_EMPTY, sizeof(uint8_t) * q->num_children);
    state->states = states;
    state->results = results;
  }

  // make sure every child that is not done has a fresh (non-duplicate) result
  // in the buffer
  uint16_t i = 0;
  for(wp_query* child = q->children; child != NULL; child = child->next) {
    if((state->states[i] == DISJ_SEARCH_STATE_FILLED) && (state->results[i].doc_id == state->last_docid)) { // discard dupes
      DEBUG("child %d has old result %u; voiding", i, state->last_docid);
      wp_search_result_free(&state->results[i]);
      state->states[i] = DISJ_SEARCH_STATE_EMPTY;
    }

    while(state->states[i] == DISJ_SEARCH_STATE_EMPTY) {
      int thisdone = 0;
      DEBUG("recursing on child %d", i);
      RELAY_ERROR(query_next_doc(child, seg, &(state->results[i]), &thisdone));
      if(thisdone) state->states[i] = DISJ_SEARCH_STATE_DONE;
      else if(state->results[i].doc_id == state->last_docid) wp_search_result_free(&state->results[i]); // dupe: stay EMPTY and ask again
      else state->states[i] = DISJ_SEARCH_STATE_FILLED;
      DEBUG("after recurse, state %d is marked %d", i, state->states[i]);
    }
    i++;
  }

  // now find the largest
  uint16_t max_doc_idx = 0;
  docid_t max_docid = 0;

  *done = 1;
  i = 0;
  for(wp_query* child = q->children; child != NULL; child = child->next) {
    DEBUG("child %d is marked as %d", i, state->states[i]);
    if(state->states[i] == DISJ_SEARCH_STATE_FILLED) {
      if((*done == 1) || (state->results[i].doc_id > max_docid)) {
        *done = 0;
        max_docid = state->results[i].doc_id;
        max_doc_idx = i;
      }
    }
    i++;
  }

  // finally, hand the winning result over to the caller; the buffer slot no
  // longer owns it
  if(*done == 0) {
    DEBUG("returning doc %d at index %d", max_docid, max_doc_idx);
    memcpy(result, &state->results[max_doc_idx], sizeof(search_result));
    state->states[max_doc_idx] = DISJ_SEARCH_STATE_EMPTY;
    state->last_docid = result->doc_id;
  }

  return NO_ERROR;
}
|
445
|
+
|
446
|
+
// next() for conjunctions (and empty queries).
//
// The first child acts as the "master" stream: each document it yields is
// checked against all children with conj_advance_to_doc, until one is found
// everywhere or a stream ends. A node without children is done immediately.
static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
  // start with the first child's first doc
  // TODO: find smallest postings list and use that instead
  wp_query* master = q->children;
  int found = 0;

  if(master == NULL) {
    *done = 1;
    return NO_ERROR;
  }

  *done = 0;
  do {
    RELAY_ERROR(query_next_doc(master, seg, result, done));
    DEBUG("master reports doc %u done %d", result->doc_id, *done);
    if(!*done) {
      // only the doc id of the master's result is needed; the combined result
      // is rebuilt from all children below
      docid_t candidate = result->doc_id;
      wp_search_result_free(result); // sigh
      RELAY_ERROR(conj_advance_to_doc(q, seg, candidate, result, &found, done));
    }
    DEBUG("after search, found is %d and done is %d", found, *done);
  } while(!found && !*done);

  return NO_ERROR;
}
|
469
|
+
|
470
|
+
// advance() for conjunctions: advances every child to `doc_id` and, if all of
// them contain it, combines their results into `result`.
//
// The temporary per-child result array is released on every path, including
// when a child or the combination step reports an error.
static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
  search_result* child_results = malloc(sizeof(search_result) * q->num_children);
  // malloc(0) may return NULL, so only treat NULL as failure when we asked for space
  if((child_results == NULL) && (q->num_children > 0)) RAISE_ERROR("out of memory");

  wp_error* failure = advance_all_children(q, s, doc_id, child_results, found, done);

  if((failure == NO_ERROR) && *found) {
    DEBUG("successfully found doc %u", doc_id);
    failure = search_result_combine_into(result, child_results, q->num_children);
    if(failure != NO_ERROR) {
      // nothing was handed over to `result`, so the child results are still ours
      for(int i = 0; i < q->num_children; i++) wp_search_result_free(&child_results[i]);
    }
  }

  free(child_results);
  RELAY_ERROR(failure);

  return NO_ERROR;
}
|
482
|
+
|
483
|
+
// advance() for disjunctions: advances every child to `doc_id`.
//
// *found is set if at least one child contains the document; `result` then
// holds the result of the first such child, and the results of any further
// matching children are freed. *done is set only if ALL children are done (a
// node without children is therefore done).
//
// Buffered disj_next_doc results with a doc id greater than `doc_id` are
// dropped, since the children have moved past them.
static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
  DEBUG("advancing all to %d", doc_id);

  *found = 0;
  *done = 1; // stays 1 only if every child reports done
  uint16_t i = 0;
  for(wp_query* child = q->children; child != NULL; child = child->next) {
    search_result child_result;
    int child_found = 0;
    int child_done = 0;

    RELAY_ERROR(query_advance_to_doc(child, seg, doc_id, &child_result, &child_found, &child_done));
    DEBUG("child %u reports found %d and done %d", i, child_found, child_done);
    *done = *done && child_done; // we're only done if ALL children are done

    if(child_found) {
      if(!*found) {
        *found = 1;
        *result = child_result; // ownership moves to the caller
      }
      else wp_search_result_free(&child_result); // we already have a result for this doc
    }

    i += 1;
  }

#ifdef DEBUGOUTPUT
  if(*found) DEBUG("successfully found doc %u", doc_id);
  else DEBUG("did not find doc %u", doc_id);
#endif

  // now release any buffered results if they're > doc_id
  disj_search_state* state = (disj_search_state*)q->search_data;
  if(state->states != NULL) {
    uint16_t slot = 0;
    for(wp_query* child = q->children; child != NULL; child = child->next) {
      if((state->states[slot] == DISJ_SEARCH_STATE_FILLED) && (state->results[slot].doc_id > doc_id)) {
        wp_search_result_free(&state->results[slot]);
        state->states[slot] = DISJ_SEARCH_STATE_EMPTY;
      }
      slot++;
    }
  }

  return NO_ERROR;
}
|
526
|
+
|
527
|
+
// sadly, this is basically a copy of conj_next_doc right now. all the
|
528
|
+
// interesting phrasal checking is done by phrase_advance_to_doc.
|
529
|
+
static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
|
530
|
+
#ifdef DEBUGOUTPUT
|
531
|
+
char query_s[1024];
|
532
|
+
wp_query_to_s(q, 1024, query_s);
|
533
|
+
DEBUG("called on %s", query_s);
|
534
|
+
#endif
|
535
|
+
|
536
|
+
docid_t search_doc;
|
537
|
+
int found = 0;
|
538
|
+
*done = 0;
|
539
|
+
|
540
|
+
// start with the first child's first doc
|
541
|
+
// TODO: find smallest postings list and use that instead
|
542
|
+
wp_query* master = q->children;
|
543
|
+
if(master == NULL) *done = 1;
|
544
|
+
|
545
|
+
while(!found && !*done) {
|
546
|
+
RELAY_ERROR(query_next_doc(master, seg, result, done));
|
547
|
+
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
548
|
+
if(!*done) {
|
549
|
+
search_doc = result->doc_id;
|
550
|
+
wp_search_result_free(result); // sigh
|
551
|
+
RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
|
552
|
+
}
|
553
|
+
DEBUG("after search, found is %d and done is %d", found, *done);
|
554
|
+
}
|
555
|
+
|
556
|
+
return NO_ERROR;
|
557
|
+
}
|
558
|
+
|
559
|
+
// Frees the first `count` child results and the array that holds them.
static void phrase_discard_child_results(search_result* child_results, int count) {
  for(int i = 0; i < count; i++) wp_search_result_free(&child_results[i]);
  free(child_results);
}

// advance() for phrases.
//
// First advances every child to `doc_id`. If they all contain it, checks for
// positions p of the first term such that the j-th term occurs at position
// p + j for every j. *found is set only if at least one such position exists;
// `result` then holds a single doc match (NULL field and word) listing those
// positions.
//
// A phrase without children never matches and reports done. All temporary
// allocations are released on every path, including the error paths.
static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
#ifdef DEBUGOUTPUT
  char query_s[1024];
  wp_query_to_s(q, 1024, query_s);
  DEBUG("called on %s", query_s);
#endif

  // without children there is no first term to base the position check on
  if(q->num_children == 0) {
    *found = 0;
    *done = 1;
    return NO_ERROR;
  }

  search_result* child_results = malloc(sizeof(search_result) * q->num_children);
  if(child_results == NULL) RAISE_ERROR("out of memory");

  DEBUG("will be searching for doc %u", doc_id);
  wp_error* failure = advance_all_children(q, seg, doc_id, child_results, found, done);
  if(failure != NO_ERROR) {
    free(child_results); // the child results themselves were already cleaned up
    RELAY_ERROR(failure);
  }

  if(!*found) {
    free(child_results);
    return NO_ERROR;
  }

  DEBUG("found doc %u. now checking for positional matches", doc_id);

  // TODO remove this once we're less paranoid
  for(int i = 0; i < q->num_children; i++) {
    if(child_results[i].num_doc_matches != 1) {
      int bad_count = child_results[i].num_doc_matches; // read before freeing
      phrase_discard_child_results(child_results, q->num_children);
      RAISE_ERROR("invalid state: %d results", bad_count);
    }
    if(child_results[i].doc_id != doc_id) {
      docid_t bad_doc_id = child_results[i].doc_id; // read before freeing
      phrase_discard_child_results(child_results, q->num_children);
      RAISE_ERROR("invalid state: doc id %u vs searched-for %u", bad_doc_id, doc_id);
    }
  }

  /* the following can be optimized in several ways:

     1. choose the doc with the smallest number of term matches, rather than always picking the first.
     2. do a binary search to find the position (since the array is sorted), rather than a linear
        scan.

     this is simply the simplest, stupidest, first-approach implementation.
  */

  // we'll base everything off of this guy
  doc_match* first_dm = &child_results[0].doc_matches[0];

  // allocate enough space to hold the maximum number of positions
  pos_t* phrase_positions = malloc(sizeof(pos_t) * first_dm->num_positions);
  if((phrase_positions == NULL) && (first_dm->num_positions > 0)) {
    phrase_discard_child_results(child_results, q->num_children);
    RAISE_ERROR("out of memory");
  }
  int num_positions_found = 0;

  for(int i = 0; i < first_dm->num_positions; i++) {
    pos_t position = first_dm->positions[i];
    DEBUG("try %d: match by term 0 at position %u", i, position);

    int found_in_this_position = 1;
    for(int j = 1; j < q->num_children; j++) {
      doc_match* this_dm = &child_results[j].doc_matches[0];

      // term j must occur exactly j positions after the first term
      int k, found_in_doc = 0;
      for(k = 0; k < this_dm->num_positions; k++) {
        if(this_dm->positions[k] == (position + j)) {
          found_in_doc = 1;
          break;
        }
      }

      if(!found_in_doc) {
        found_in_this_position = 0;
        DEBUG("term %d did NOT match at position %u after %d comparisons", j, position + j, k + 1);
        break;
      }
#ifdef DEBUGOUTPUT
      else DEBUG("term %d matched at position %u after %d/%d comparisons", j, position + j, k + 1, this_dm->num_positions);
#endif
    }

    if(found_in_this_position) phrase_positions[num_positions_found++] = position; // got a match!
  }

  if(num_positions_found > 0) {
    doc_match* phrase_dm = malloc(sizeof(doc_match));
    if(phrase_dm == NULL) {
      free(phrase_positions);
      phrase_discard_child_results(child_results, q->num_children);
      RAISE_ERROR("out of memory");
    }

    // fill in the result; it takes ownership of phrase_positions
    phrase_dm->field = NULL;
    phrase_dm->word = NULL;
    phrase_dm->num_positions = num_positions_found;
    phrase_dm->positions = phrase_positions;

    result->doc_id = doc_id;
    result->num_doc_matches = 1;
    result->doc_matches = phrase_dm;
  }
  else {
    // all terms are in the doc, but never in phrase order
    *found = 0;
    free(phrase_positions);
  }

  phrase_discard_child_results(child_results, q->num_children);
  return NO_ERROR;
}
|
645
|
+
|
646
|
+
// next() for negations.
//
// Walks a virtual doc pointer (`cur`) downwards one doc id at a time and
// returns every doc id that the child stream does NOT contain. `next` tracks
// the child stream's upcoming doc id; whenever the pointer lands on it, that
// doc id is skipped and the child's following document is loaded. Results
// carry no doc matches.
static wp_error* neg_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
  neg_search_state* neg = (neg_search_state*)q->search_data;

  DEBUG("called with cur %u and next %u", neg->cur, neg->next);

  if(neg->cur != DOCID_NONE) {
    neg->cur--; // advance virtual doc pointer

    // if cur == next, we need to load the substream's next document,
    // decrement our cur, and recheck.
    while((neg->cur > DOCID_NONE) && (neg->cur == neg->next)) { // need to advance the child stream
      int child_done;

      neg->cur--; // can't use the previous value because == next; decrement

      // `result` doubles as scratch space for the child's result here
      RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
      if(!child_done) {
        neg->next = result->doc_id;
        wp_search_result_free(result);
      }
      else neg->next = DOCID_NONE; // child stream is done

      DEBUG("after bump, cur %u and next %u", neg->cur, neg->next);
    }
  }

  // covers both "was already exhausted" and "ran out while skipping"
  if(neg->cur == DOCID_NONE) {
    *done = 1;
    return NO_ERROR;
  }

  DEBUG("returning doc %u", neg->cur);
  *done = 0;
  result->doc_id = neg->cur;
  result->num_doc_matches = 0;
  result->doc_matches = NULL;
  return NO_ERROR;
}
|
687
|
+
|
688
|
+
// advance() for negations.
//
// Moves the child stream forward until its upcoming doc id is <= `doc_id`,
// then reports *found = 1 exactly when the child does NOT contain `doc_id`.
// A found result carries no doc matches. The child results read while seeking
// are only needed for their doc ids and are freed immediately.
static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
  neg_search_state* state = (neg_search_state*)q->search_data;

  DEBUG("in search for %u, called with cur %u and next %u", doc_id, state->cur, state->next);

  if(state->cur == DOCID_NONE) {
    *done = 1;
    *found = 0;
    return NO_ERROR;
  }

  // seek through child stream until we find a docid it contains that's <= doc_id
  while(state->next > doc_id) { // need to advance child stream
    int child_done = 0;
    // `result` doubles as scratch space for the child's result here
    RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
    if(child_done) state->next = DOCID_NONE; // will break the loop too
    else {
      state->next = result->doc_id;
      wp_search_result_free(result); // don't leak the child's matches
    }
  }

  DEBUG("in search for %u, intermediate state is cur %u and next %u", doc_id, state->cur, state->next);

  // at this point we know state->next, our child pointer, is <= doc_id
  state->cur = doc_id;
  if(state->next == doc_id) *found = 0; // opposite day
  else {
    *found = 1;
    result->doc_id = doc_id;
    result->num_doc_matches = 0;
    result->doc_matches = NULL;
  }

  *done = state->cur == DOCID_NONE ? 1 : 0;

  DEBUG("finally, state is cur %u and next %u and found is %d and done is %d", state->cur, state->next, *found, *done);
  return NO_ERROR;
}
|
724
|
+
|
725
|
+
// Runs query `q` against segment `s`, collecting up to `max_num_results`
// results.
//
// `results` must have room for `max_num_results` entries. On return
// *num_results holds how many were filled in; each of those is released with
// wp_search_result_free(). The search stops early when the query reports done.
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
  int done;

  *num_results = 0;

  // guarded by DEBUGOUTPUT like the rest of this file, so the query is only
  // stringified when debug output is actually enabled
#ifdef DEBUGOUTPUT
  char buf[1024];
  wp_query_to_s(q, 1024, buf);
  DEBUG("running query %s", buf);
#endif

  while(*num_results < max_num_results) {
    RELAY_ERROR(query_next_doc(q, s, &results[*num_results], &done));
    if(done) break;
    DEBUG("got result %u (%u doc matches)", results[*num_results].doc_id, results[*num_results].num_doc_matches);
    (*num_results)++;
    DEBUG("num results now %d", *num_results);
  }

  return NO_ERROR;
}
|
746
|
+
|