whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,746 @@
1
+ #include "whistlepig.h"
2
+
3
+ /********* search states *********/
4
+ typedef struct term_search_state {
5
+ posting posting;
6
+ int started;
7
+ int done;
8
+ int label; // 1 if a label; 0 if a term
9
+ } term_search_state;
10
+
11
+ typedef struct neg_search_state {
12
+ docid_t next; // the next document in the child stream. we will never return this document.
13
+ docid_t cur; // the last doc we returned
14
+ } neg_search_state;
15
+
16
+ #define DISJ_SEARCH_STATE_EMPTY 0
17
+ #define DISJ_SEARCH_STATE_FILLED 1
18
+ #define DISJ_SEARCH_STATE_DONE 2
19
+
20
+ typedef struct disj_search_state {
21
+ docid_t last_docid;
22
+ uint8_t* states; // whether the search result has been initialized or not
23
+ search_result* results; // array of search results, one per child
24
+ } disj_search_state;
25
+
26
+ void wp_search_result_free(search_result* result) {
27
+ for(int i = 0; i < result->num_doc_matches; i++) {
28
+ //printf("for result at %p (dm %d), freeing positions at %p\n", result, i, result->doc_matches[i].positions);
29
+ free(result->doc_matches[i].positions);
30
+ }
31
+ free(result->doc_matches);
32
+ }
33
+
34
+ RAISING_STATIC(search_result_init(search_result* result, const char* field, const char* word, posting* posting)) {
35
+ result->doc_id = posting->doc_id;
36
+ result->num_doc_matches = 1;
37
+ result->doc_matches = malloc(sizeof(doc_match));
38
+ result->doc_matches[0].field = field;
39
+ result->doc_matches[0].word = word;
40
+ result->doc_matches[0].num_positions = posting->num_positions;
41
+
42
+ size_t size = sizeof(pos_t) * posting->num_positions;
43
+ result->doc_matches[0].positions = malloc(size);
44
+ //printf("for result at %p, allocated %u bytes for positions at %p\n", result, size, result->doc_matches[0].positions);
45
+ memcpy(result->doc_matches[0].positions, posting->positions, size);
46
+
47
+ return NO_ERROR;
48
+ }
49
+
50
+ RAISING_STATIC(search_result_combine_into(search_result* result, search_result* child_results, int num_child_results)) {
51
+ if(num_child_results <= 0) RAISE_ERROR("no child results");
52
+ result->doc_id = child_results[0].doc_id;
53
+ result->num_doc_matches = num_child_results;
54
+ result->doc_matches = malloc(sizeof(doc_match) * num_child_results);
55
+ for(int i = 0; i < num_child_results; i++) {
56
+ if(child_results[i].doc_matches == NULL) {
57
+ result->doc_matches[i].field = NULL;
58
+ result->doc_matches[i].word = NULL;
59
+ result->doc_matches[i].num_positions = 0;
60
+ result->doc_matches[i].positions = NULL;
61
+ }
62
+ else result->doc_matches[i] = child_results[i].doc_matches[0];
63
+ }
64
+
65
+ return NO_ERROR;
66
+ }
67
+
68
+ /*
69
+ * we provide two functions for iterating through result streams: next() and
70
+ * advance().
71
+ *
72
+ * next() returns results one at a time. it will set done = true if you're at
73
+ * the end of the stream. otherwise, it will give you a result. the next
74
+ * call to next() will give you the next result (or set done = true).
75
+ *
76
+ * advance() is given a docid and advances the stream to just *after* that
77
+ * document, and tells you whether it saw the docid on the way (and sets a result
78
+ * if so for your convenience).
79
+ *
80
+ * a next() followed by one or more advance() calls with the returned docid
81
+ * will set found = true and will not advance the stream beyond where it
82
+ * already is.
83
+ *
84
+ * however, an advance() to a docid followed by next() may skip a document in
85
+ * the stream. you probably don't want this.
86
+ *
87
+ * so advance is only useful if you have a particular doc_id in mind, and you
88
+ * want to see if this stream contains it. if you want to actually see all the
89
+ * docids in a stream, you must use next().
90
+ *
91
+ */
92
+
93
+ /********** dispatch functions ***********/
94
+ static wp_error* term_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
95
+ static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
96
+ static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
97
+ static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
98
+ static wp_error* neg_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
99
+ static wp_error* term_release_search_state(wp_query* q) RAISES_ERROR;
100
+ static wp_error* conj_release_search_state(wp_query* q) RAISES_ERROR;
101
+ static wp_error* disj_release_search_state(wp_query* q) RAISES_ERROR;
102
+ static wp_error* phrase_release_search_state(wp_query* q) RAISES_ERROR;
103
+ static wp_error* neg_release_search_state(wp_query* q) RAISES_ERROR;
104
+ static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
105
+ static wp_error* conj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
106
+ static wp_error* disj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
107
+ static wp_error* phrase_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
108
+ static wp_error* neg_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
109
+ static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
110
+ static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
111
+ static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
112
+ static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
113
+ static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
114
+
115
// Route a call to the <kind>_<suffix> implementation that matches a query
// node's type, relaying any error it returns.
//   - term and label nodes share the term_* functions
//   - empty queries share the conj_* functions (why not)
// An unrecognized node type raises an error.
#define DISPATCH(type, suffix, ...) \
  switch(type) { \
    case WP_QUERY_TERM: \
    case WP_QUERY_LABEL: \
      RELAY_ERROR(term_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_EMPTY: \
    case WP_QUERY_CONJ: \
      RELAY_ERROR(conj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_DISJ: \
      RELAY_ERROR(disj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_PHRASE: \
      RELAY_ERROR(phrase_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_NEG: \
      RELAY_ERROR(neg_##suffix(__VA_ARGS__)); \
      break; \
    default: \
      RAISE_ERROR("unknown query node type %d", type); \
  }
128
+
129
+ wp_error* wp_search_init_search_state(wp_query* q, wp_segment* s) {
130
+ DISPATCH(q->type, init_search_state, q, s);
131
+ return NO_ERROR;
132
+ }
133
+
134
+ wp_error* wp_search_release_search_state(wp_query* q) {
135
+ DISPATCH(q->type, release_search_state, q)
136
+ return NO_ERROR;
137
+ }
138
+
139
+ RAISING_STATIC(query_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done)) {
140
+ DISPATCH(q->type, next_doc, q, s, result, done);
141
+ #ifdef DEBUGOUTPUT
142
+ char buf[1024];
143
+ wp_query_to_s(q, 1024, buf);
144
+
145
+ if(*done) DEBUG("query %s is done", buf);
146
+ else DEBUG("query %s has doc %u", buf, result->doc_id);
147
+ #endif
148
+ return NO_ERROR;
149
+ }
150
+
151
+ RAISING_STATIC(query_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done)) {
152
+ DISPATCH(q->type, advance_to_doc, q, s, doc_id, result, found, done);
153
+ #ifdef DEBUGOUTPUT
154
+ char buf[1024];
155
+ wp_query_to_s(q, 1024, buf);
156
+
157
+ if(*done) DEBUG("query %s is done", buf);
158
+ else {
159
+ if(*found) DEBUG("query %s has doc %u", buf, doc_id);
160
+ else DEBUG("query %s does not have doc %u", buf, doc_id);
161
+ }
162
+ #endif
163
+ return NO_ERROR;
164
+ }
165
+
166
+ /************** init functions *************/
167
+
168
+ RAISING_STATIC(init_children(wp_query* q, wp_segment* s)) {
169
+ for(wp_query* child = q->children; child != NULL; child = child->next) RELAY_ERROR(wp_search_init_search_state(child, s));
170
+ return NO_ERROR;
171
+ }
172
+
173
+ RAISING_STATIC(release_children(wp_query* q)) {
174
+ for(wp_query* child = q->children; child != NULL; child = child->next) RELAY_ERROR(wp_search_release_search_state(child));
175
+ return NO_ERROR;
176
+ }
177
+
178
+ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
179
+ term t;
180
+ stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
181
+ termhash* th = MMAP_OBJ(seg->termhash, termhash);
182
+
183
+ term_search_state* state = q->search_data = malloc(sizeof(term_search_state));
184
+ state->started = 0;
185
+
186
+ state->label = q->type == WP_QUERY_LABEL ? 1 : 0;
187
+ if(state->label) t.field_s = 0;
188
+ else t.field_s = stringmap_string_to_int(sh, q->field); // will be -1 if not found
189
+
190
+ t.word_s = stringmap_string_to_int(sh, q->word);
191
+
192
+ uint32_t offset = termhash_get_val(th, t);
193
+ if(offset == (uint32_t)-1) offset = OFFSET_NONE;
194
+
195
+ if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
196
+ else {
197
+ state->done = 0;
198
+ if(state->label) RELAY_ERROR(wp_segment_read_label(seg, offset, &state->posting));
199
+ else RELAY_ERROR(wp_segment_read_posting(seg, offset, &state->posting, 1));
200
+ }
201
+
202
+ RELAY_ERROR(init_children(q, seg));
203
+
204
+ return NO_ERROR;
205
+ }
206
+
207
+ static wp_error* term_release_search_state(wp_query* q) {
208
+ term_search_state* state = q->search_data;
209
+ if(!state->done) free(state->posting.positions);
210
+ free(state);
211
+ RELAY_ERROR(release_children(q));
212
+ return NO_ERROR;
213
+ }
214
+
215
+ static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) {
216
+ q->search_data = NULL; // no state needed
217
+ RELAY_ERROR(init_children(q, s));
218
+ return NO_ERROR;
219
+ }
220
+
221
+ static wp_error* conj_release_search_state(wp_query* q) {
222
+ RELAY_ERROR(release_children(q));
223
+ return NO_ERROR;
224
+ }
225
+
226
+ static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) {
227
+ disj_search_state* state = q->search_data = malloc(sizeof(disj_search_state));
228
+ state->states = NULL;
229
+ state->results = NULL;
230
+ state->last_docid = DOCID_NONE;
231
+ RELAY_ERROR(init_children(q, s));
232
+ return NO_ERROR;
233
+ }
234
+
235
+ static wp_error* disj_release_search_state(wp_query* q) {
236
+ disj_search_state* state = (disj_search_state*)q->search_data;
237
+ if(state->states) {
238
+ // free any remaining search results in the buffer
239
+ for(uint16_t i = 0; i < q->num_children; i++) {
240
+ if(state->states[i] == DISJ_SEARCH_STATE_FILLED) wp_search_result_free(&state->results[i]);
241
+ }
242
+ free(state->states);
243
+ free(state->results);
244
+ }
245
+ free(state);
246
+ RELAY_ERROR(release_children(q));
247
+ return NO_ERROR;
248
+ }
249
+
250
+ static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) {
251
+ q->search_data = NULL; // no state needed
252
+ RELAY_ERROR(init_children(q, s));
253
+ return NO_ERROR;
254
+ }
255
+
256
+ static wp_error* phrase_release_search_state(wp_query* q) {
257
+ RELAY_ERROR(release_children(q));
258
+ return NO_ERROR;
259
+ }
260
+
261
+ static wp_error* neg_init_search_state(wp_query* q, wp_segment* seg) {
262
+ if(q->num_children != 1) RAISE_ERROR("negations currently only operate on single children");
263
+
264
+ RELAY_ERROR(wp_search_init_search_state(q->children, seg));
265
+
266
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
267
+ neg_search_state* state = q->search_data = malloc(sizeof(neg_search_state));
268
+
269
+ state->cur = pr->num_docs + 1;
270
+ search_result result;
271
+ int done;
272
+ RELAY_ERROR(query_next_doc(q->children, seg, &result, &done));
273
+ if(done) state->next = DOCID_NONE;
274
+ else {
275
+ state->next = result.doc_id;
276
+ wp_search_result_free(&result);
277
+ }
278
+ DEBUG("initialized with cur %u and next %u", state->cur, state->next);
279
+
280
+ return NO_ERROR;
281
+ }
282
+
283
+ static wp_error* neg_release_search_state(wp_query* q) {
284
+ RELAY_ERROR(wp_search_release_search_state(q->children));
285
+ free(q->search_data);
286
+ return NO_ERROR;
287
+ }
288
+
289
+ /********** search functions **********/
290
+
291
+ static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) {
292
+ term_search_state* state = (term_search_state*)q->search_data;
293
+
294
+ DEBUG("[%s:'%s'] before: started is %d, done is %d", q->field, q->word, state->started, state->done);
295
+ if(state->done) {
296
+ *done = 1;
297
+ return NO_ERROR;
298
+ }
299
+
300
+ *done = 0;
301
+ if(!state->started) { // start
302
+ state->started = 1;
303
+ RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
304
+ }
305
+ else { // advance
306
+ free(state->posting.positions);
307
+ if(state->posting.next_offset == OFFSET_NONE) { // end of stream
308
+ *done = state->done = 1;
309
+ }
310
+ else {
311
+ if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
312
+ else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
313
+ RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
314
+ }
315
+ }
316
+ DEBUG("[%s:'%s'] after: doc id %u, done is %d, started is %d", q->field, q->word, (state->started && !state->done && result) ? result->doc_id : 0, *done, state->started);
317
+
318
+ return NO_ERROR;
319
+ }
320
+
321
+ static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
322
+ term_search_state* state = (term_search_state*)q->search_data;
323
+ DEBUG("[%s:'%s'] seeking through postings for doc %u", q->field, q->word, doc_id);
324
+
325
+ if(state->done) { // end of stream
326
+ *found = 0;
327
+ *done = 1;
328
+ return NO_ERROR;
329
+ }
330
+
331
+ while(state->posting.doc_id > doc_id) {
332
+ free(state->posting.positions);
333
+ DEBUG("skipping doc_id %u", state->posting.doc_id);
334
+ if(state->posting.next_offset == OFFSET_NONE) {
335
+ state->done = 1;
336
+ break;
337
+ }
338
+
339
+ if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
340
+ else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
341
+ //DEBUG("advanced posting to %p", state->posting);
342
+ }
343
+
344
+ if(state->done) {
345
+ DEBUG("[%s:'%s'] posting list exhausted", q->field, q->word);
346
+ *found = 0;
347
+ *done = 1;
348
+ }
349
+ else {
350
+ *done = 0;
351
+ DEBUG("[%s:'%s'] posting advanced to that of doc %u", q->field, q->word, state->posting.doc_id);
352
+ *found = (doc_id == state->posting.doc_id ? 1 : 0);
353
+ if(*found) RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
354
+ }
355
+
356
+ return NO_ERROR;
357
+ }
358
+
359
+ // this advances all children *until* it finds a child that doesn't have the
360
+ // doc. at that point it stops. so it will return found=0 if any single child
361
+ // doesn't have the doc, and done=1 if any single child is done.
362
+ //
363
+ // this is used by both phrasal and conjunctive queries.
364
+ static wp_error* advance_all_children(wp_query* q, wp_segment* seg, docid_t search_doc, search_result* child_results, int* found, int* done) {
365
+ int num_children_searched = 0;
366
+ *found = 1;
367
+
368
+ DEBUG("advancing all children to doc %u with early termination", search_doc);
369
+
370
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
371
+ RELAY_ERROR(query_advance_to_doc(child, seg, search_doc, &child_results[num_children_searched], found, done));
372
+ num_children_searched++;
373
+ if(!*found) break;
374
+ }
375
+
376
+ if(!*found) for(int i = 0; i < num_children_searched - 1; i++) wp_search_result_free(&child_results[i]);
377
+
378
+ return NO_ERROR;
379
+ }
380
+
381
+ static wp_error* disj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
382
+ if(q->children == NULL) {
383
+ *done = 1;
384
+ return NO_ERROR;
385
+ }
386
+
387
+ // allocate search state if necessary
388
+ disj_search_state* state = (disj_search_state*)q->search_data;
389
+ if(state->states == NULL) {
390
+ state->states = malloc(sizeof(uint8_t) * q->num_children);
391
+ state->results = malloc(sizeof(search_result) * q->num_children);
392
+ memset(state->states, DISJ_SEARCH_STATE_EMPTY, sizeof(uint8_t) * q->num_children);
393
+ }
394
+
395
+ // fill all the results we can into the buffer by calling next_doc on all
396
+ // non-done children
397
+ uint16_t i = 0;
398
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
399
+ if(state->states[i] == DISJ_SEARCH_STATE_EMPTY) {
400
+ int thisdone = 0;
401
+ DEBUG("recursing on child %d", i);
402
+ RELAY_ERROR(query_next_doc(child, seg, &(state->results[i]), &thisdone));
403
+ if(thisdone == 1) state->states[i] = DISJ_SEARCH_STATE_DONE;
404
+ else state->states[i] = DISJ_SEARCH_STATE_FILLED;
405
+ DEBUG("after recurse, state %d is marked %d", i, state->states[i]);
406
+ }
407
+ i++;
408
+ }
409
+
410
+ // now find the largest
411
+ uint16_t max_doc_idx = 0;
412
+ docid_t max_docid = 0;
413
+
414
+ *done = 1;
415
+ i = 0;
416
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
417
+ DEBUG("child %d is marked as %d", i, state->states[i]);
418
+ if(state->states[i] == DISJ_SEARCH_STATE_FILLED) {
419
+ if((*done == 1) || (state->results[i].doc_id > max_docid)) {
420
+ if(state->results[i].doc_id == state->last_docid) { // discard dupes
421
+ DEBUG("child %d has old result %u; voiding", i, state->last_docid);
422
+ wp_search_result_free(&state->results[i]);
423
+ state->states[i] = DISJ_SEARCH_STATE_EMPTY;
424
+ }
425
+ else {
426
+ *done = 0;
427
+ max_docid = state->results[i].doc_id;
428
+ max_doc_idx = i;
429
+ }
430
+ }
431
+ }
432
+ i++;
433
+ }
434
+
435
+ // finally, copy the result
436
+ if(*done == 0) {
437
+ DEBUG("returning doc %d at index %d", max_docid, max_doc_idx);
438
+ memcpy(result, &state->results[max_doc_idx], sizeof(search_result));
439
+ state->states[max_doc_idx] = DISJ_SEARCH_STATE_EMPTY;
440
+ state->last_docid = result->doc_id;
441
+ }
442
+
443
+ return NO_ERROR;
444
+ }
445
+
446
+ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
447
+ docid_t search_doc;
448
+ int found = 0;
449
+ *done = 0;
450
+
451
+ // start with the first child's first doc
452
+ // TODO: find smallest postings list and use that instead
453
+ wp_query* master = q->children;
454
+ if(master == NULL) *done = 1;
455
+
456
+ while(!found && !*done) {
457
+ RELAY_ERROR(query_next_doc(master, seg, result, done));
458
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
459
+ if(!*done) {
460
+ search_doc = result->doc_id;
461
+ wp_search_result_free(result); // sigh
462
+ RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
463
+ }
464
+ DEBUG("after search, found is %d and done is %d", found, *done);
465
+ }
466
+
467
+ return NO_ERROR;
468
+ }
469
+
470
+ static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
471
+ search_result* child_results = malloc(sizeof(search_result) * q->num_children);
472
+ RELAY_ERROR(advance_all_children(q, s, doc_id, child_results, found, done));
473
+
474
+ if(*found) {
475
+ DEBUG("successfully found doc %u", doc_id);
476
+ RELAY_ERROR(search_result_combine_into(result, child_results, q->num_children));
477
+ }
478
+
479
+ free(child_results);
480
+ return NO_ERROR;
481
+ }
482
+
483
+ static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
484
+ search_result child_result;
485
+ int child_found;
486
+
487
+ DEBUG("advancing all to %d", doc_id);
488
+
489
+ *found = 0;
490
+ *done = 0;
491
+ uint16_t i = 0;
492
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
493
+ int child_done;
494
+ RELAY_ERROR(query_advance_to_doc(child, seg, doc_id, &child_result, &child_found, &child_done));
495
+ DEBUG("child %u reports found %d and done %d", i, child_found, child_done);
496
+ *done = *done && child_done; // we're only done if ALL children are done
497
+ if(child_found && !*found) {
498
+ *found = 1;
499
+ *result = child_result;
500
+ }
501
+
502
+ i += 1;
503
+ // TODO XXXXXXXXXX does this leak memory when multiple children all return results?
504
+ }
505
+
506
+ #ifdef DEBUGOUTPUT
507
+ if(*found) DEBUG("successfully found doc %u", doc_id);
508
+ else DEBUG("did not find doc %u", doc_id);
509
+ #endif
510
+
511
+ // now release any buffered results if they're > doc_id
512
+ disj_search_state* state = (disj_search_state*)q->search_data;
513
+ if(state->states != NULL) {
514
+ uint16_t i = 0;
515
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
516
+ if((state->states[i] == DISJ_SEARCH_STATE_FILLED) && (state->results[i].doc_id > doc_id)) {
517
+ wp_search_result_free(&state->results[i]);
518
+ state->states[i] = DISJ_SEARCH_STATE_EMPTY;
519
+ }
520
+ i++;
521
+ }
522
+ }
523
+
524
+ return NO_ERROR;
525
+ }
526
+
527
+ // sadly, this is basically a copy of conj_next_doc right now. all the
528
+ // interesting phrasal checking is done by phrase_advance_to_doc.
529
+ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
530
+ #ifdef DEBUGOUTPUT
531
+ char query_s[1024];
532
+ wp_query_to_s(q, 1024, query_s);
533
+ DEBUG("called on %s", query_s);
534
+ #endif
535
+
536
+ docid_t search_doc;
537
+ int found = 0;
538
+ *done = 0;
539
+
540
+ // start with the first child's first doc
541
+ // TODO: find smallest postings list and use that instead
542
+ wp_query* master = q->children;
543
+ if(master == NULL) *done = 1;
544
+
545
+ while(!found && !*done) {
546
+ RELAY_ERROR(query_next_doc(master, seg, result, done));
547
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
548
+ if(!*done) {
549
+ search_doc = result->doc_id;
550
+ wp_search_result_free(result); // sigh
551
+ RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
552
+ }
553
+ DEBUG("after search, found is %d and done is %d", found, *done);
554
+ }
555
+
556
+ return NO_ERROR;
557
+ }
558
+
559
+ static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
560
+ #ifdef DEBUGOUTPUT
561
+ char query_s[1024];
562
+ wp_query_to_s(q, 1024, query_s);
563
+ DEBUG("called on %s", query_s);
564
+ #endif
565
+
566
+ search_result* child_results = malloc(sizeof(search_result) * q->num_children);
567
+
568
+ DEBUG("will be searching for doc %u", doc_id);
569
+ RELAY_ERROR(advance_all_children(q, seg, doc_id, child_results, found, done));
570
+
571
+ if(*found) {
572
+ DEBUG("found doc %u. now checking for positional matches", doc_id);
573
+
574
+ // TODO remove this once we're less paranoid
575
+ for(int i = 0; i < q->num_children; i++) {
576
+ if(child_results[i].num_doc_matches != 1) RAISE_ERROR("invalid state: %d results", child_results[i].num_doc_matches);
577
+ if(child_results[i].doc_id != doc_id) RAISE_ERROR("invalid state: doc id %u vs searched-for %u", child_results[i].doc_id, doc_id);
578
+ }
579
+
580
+ /* the following can be optimized in several ways:
581
+
582
+ 1. choose the doc with the smallest number of term matches, rather than aways picking the first.
583
+ 2. do a binary search to find the position (since the array is sorted), rather than a linear
584
+ scan.
585
+
586
+ this is simply the simplest, stupidest, first-approach implementation.
587
+ */
588
+
589
+ // we'll base everything off of this guy
590
+ doc_match* first_dm = &child_results[0].doc_matches[0];
591
+
592
+ // allocate enough space to hold the maximum number of positions
593
+ pos_t* phrase_positions = malloc(sizeof(pos_t) * first_dm->num_positions);
594
+ int num_positions_found = 0;
595
+
596
+ for(int i = 0; i < first_dm->num_positions; i++) {
597
+ pos_t position = first_dm->positions[i];
598
+ DEBUG("try %d: match by term 0 at position %u", i, position);
599
+
600
+ int found_in_this_position = 1;
601
+ for(int j = 1; j < q->num_children; j++) {
602
+ doc_match* this_dm = &child_results[j].doc_matches[0];
603
+
604
+ int k, found_in_doc = 0;
605
+ for(k = 0; k < this_dm->num_positions; k++) {
606
+ if(this_dm->positions[k] == (position + j)) {
607
+ found_in_doc = 1;
608
+ break;
609
+ }
610
+ }
611
+
612
+ if(!found_in_doc) {
613
+ found_in_this_position = 0;
614
+ DEBUG("term %d did NOT match at position %u after %d comparisons", j, position + j, k + 1);
615
+ break;
616
+ }
617
+ #ifdef DEBUGOUTPUT
618
+ else DEBUG("term %d matched at position %u after %d/%d comparisons", j, position + j, k + 1, this_dm->num_positions);
619
+ #endif
620
+ }
621
+
622
+ if(found_in_this_position) phrase_positions[num_positions_found++] = position; // got a match!
623
+ }
624
+
625
+ if(num_positions_found > 0) {
626
+ // fill in the result
627
+ result->doc_id = doc_id;
628
+ result->num_doc_matches = 1;
629
+ result->doc_matches = malloc(sizeof(doc_match));
630
+ result->doc_matches[0].field = NULL;
631
+ result->doc_matches[0].word = NULL;
632
+ result->doc_matches[0].num_positions = num_positions_found;
633
+ result->doc_matches[0].positions = phrase_positions;
634
+ }
635
+ else {
636
+ *found = 0;
637
+ free(phrase_positions);
638
+ }
639
+ for(int i = 0; i < q->num_children; i++) wp_search_result_free(&child_results[i]);
640
+ }
641
+
642
+ free(child_results);
643
+ return NO_ERROR;
644
+ }
645
+
646
+ static wp_error* neg_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
647
+ neg_search_state* state = (neg_search_state*)q->search_data;
648
+
649
+ DEBUG("called with cur %u and next %u", state->cur, state->next);
650
+
651
+ if(state->cur == DOCID_NONE) {
652
+ *done = 1;
653
+ return NO_ERROR;
654
+ }
655
+
656
+ state->cur--; // advance virtual doc pointer
657
+
658
+ // if state->cur == state->next, we need to load the substream's next
659
+ // document, decrement our cur, and recheck.
660
+ while((state->cur > DOCID_NONE) && (state->cur == state->next)) { // need to advance the child stream
661
+ state->cur--; // can't use the previous value because == next; decrement
662
+
663
+ int child_done;
664
+ RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
665
+ if(child_done) state->next = DOCID_NONE; // child stream is done
666
+ else {
667
+ state->next = result->doc_id;
668
+ wp_search_result_free(result);
669
+ }
670
+
671
+ DEBUG("after bump, cur %u and next %u", state->cur, state->next);
672
+ }
673
+
674
+ // check again... sigh
675
+ if(state->cur == DOCID_NONE) {
676
+ *done = 1;
677
+ return NO_ERROR;
678
+ }
679
+
680
+ DEBUG("returning doc %u", state->cur);
681
+ result->doc_id = state->cur;
682
+ result->num_doc_matches = 0;
683
+ result->doc_matches = NULL;
684
+ *done = 0;
685
+ return NO_ERROR;
686
+ }
687
+
688
+ static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
689
+ neg_search_state* state = (neg_search_state*)q->search_data;
690
+
691
+ DEBUG("in search for %u, called with cur %u and next %u", doc_id, state->cur, state->next);
692
+
693
+ if(state->cur == DOCID_NONE) {
694
+ *done = 1;
695
+ *found = 0;
696
+ return NO_ERROR;
697
+ }
698
+
699
+ // seek through child stream until we find a docid it contains that's <= doc_id
700
+ while(state->next > doc_id) { // need to advance child stream
701
+ int child_done;
702
+ RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
703
+ if(child_done) state->next = DOCID_NONE; // will break the loop too
704
+ else state->next = result->doc_id;
705
+ }
706
+
707
+ DEBUG("in search for %u, intermediate state is cur %u and next %u", doc_id, state->cur, state->next);
708
+
709
+ // at this point we know state->next, our child pointer, is <= doc_id
710
+ state->cur = doc_id;
711
+ if(state->next == doc_id) *found = 0; // opposite day
712
+ else {
713
+ *found = 1;
714
+ result->doc_id = doc_id;
715
+ result->num_doc_matches = 0;
716
+ result->doc_matches = NULL;
717
+ }
718
+
719
+ *done = state->cur == DOCID_NONE ? 1 : 0;
720
+
721
+ DEBUG("finally, state is cur %u and next %u and found is %d and done is %d", state->cur, state->next, *found, *done);
722
+ return NO_ERROR;
723
+ }
724
+
725
+ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
726
+ int done;
727
+
728
+ *num_results = 0;
729
+
730
+ #ifdef DEBUG
731
+ char buf[1024];
732
+ wp_query_to_s(q, 1024, buf);
733
+ DEBUG("running query %s", buf);
734
+ #endif
735
+
736
+ while(*num_results < max_num_results) {
737
+ RELAY_ERROR(query_next_doc(q, s, &results[*num_results], &done));
738
+ if(done) break;
739
+ DEBUG("got result %u (%u doc matches)", results[*num_results].doc_id, results[*num_results].num_doc_matches);
740
+ (*num_results)++;
741
+ DEBUG("num results now %d", *num_results);
742
+ }
743
+
744
+ return NO_ERROR;
745
+ }
746
+