whistlepig 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,746 @@
1
+ #include "whistlepig.h"
2
+
3
+ /********* search states *********/
4
+ typedef struct term_search_state {
5
+ posting posting;
6
+ int started;
7
+ int done;
8
+ int label; // 1 if a label; 0 if a term
9
+ } term_search_state;
10
+
11
+ typedef struct neg_search_state {
12
+ docid_t next; // the next document in the child stream. we will never return this document.
13
+ docid_t cur; // the last doc we returned
14
+ } neg_search_state;
15
+
16
/* Per-child buffer slot states stored in disj_search_state.states. */
enum {
  DISJ_SEARCH_STATE_EMPTY  = 0, // nothing buffered; the child must be polled
  DISJ_SEARCH_STATE_FILLED = 1, // a result for the child is buffered
  DISJ_SEARCH_STATE_DONE   = 2  // the child stream reported done
};
19
+
20
+ typedef struct disj_search_state {
21
+ docid_t last_docid;
22
+ uint8_t* states; // whether the search result has been initialized or not
23
+ search_result* results; // array of search results, one per child
24
+ } disj_search_state;
25
+
26
+ void wp_search_result_free(search_result* result) {
27
+ for(int i = 0; i < result->num_doc_matches; i++) {
28
+ //printf("for result at %p (dm %d), freeing positions at %p\n", result, i, result->doc_matches[i].positions);
29
+ free(result->doc_matches[i].positions);
30
+ }
31
+ free(result->doc_matches);
32
+ }
33
+
34
+ RAISING_STATIC(search_result_init(search_result* result, const char* field, const char* word, posting* posting)) {
35
+ result->doc_id = posting->doc_id;
36
+ result->num_doc_matches = 1;
37
+ result->doc_matches = malloc(sizeof(doc_match));
38
+ result->doc_matches[0].field = field;
39
+ result->doc_matches[0].word = word;
40
+ result->doc_matches[0].num_positions = posting->num_positions;
41
+
42
+ size_t size = sizeof(pos_t) * posting->num_positions;
43
+ result->doc_matches[0].positions = malloc(size);
44
+ //printf("for result at %p, allocated %u bytes for positions at %p\n", result, size, result->doc_matches[0].positions);
45
+ memcpy(result->doc_matches[0].positions, posting->positions, size);
46
+
47
+ return NO_ERROR;
48
+ }
49
+
50
+ RAISING_STATIC(search_result_combine_into(search_result* result, search_result* child_results, int num_child_results)) {
51
+ if(num_child_results <= 0) RAISE_ERROR("no child results");
52
+ result->doc_id = child_results[0].doc_id;
53
+ result->num_doc_matches = num_child_results;
54
+ result->doc_matches = malloc(sizeof(doc_match) * num_child_results);
55
+ for(int i = 0; i < num_child_results; i++) {
56
+ if(child_results[i].doc_matches == NULL) {
57
+ result->doc_matches[i].field = NULL;
58
+ result->doc_matches[i].word = NULL;
59
+ result->doc_matches[i].num_positions = 0;
60
+ result->doc_matches[i].positions = NULL;
61
+ }
62
+ else result->doc_matches[i] = child_results[i].doc_matches[0];
63
+ }
64
+
65
+ return NO_ERROR;
66
+ }
67
+
68
+ /*
69
+ * we provide two functions for iterating through result streams: next() and
70
+ * advance().
71
+ *
72
+ * next() returns results one at a time. it will set done = true if you're at
73
+ * the end of the stream. otherwise, it will give you a result. the next
74
+ * call to next() will give you the next result (or set done = true).
75
+ *
76
+ * advance() is given a docid and advances the stream to just *after* that
77
+ * document, and tells you whether it saw the docid on the way (and sets a result
78
+ * if so for your convenience).
79
+ *
80
+ * a next() followed by one or more advance() calls with the returned docid
81
+ * will set found = true and will not advance the stream beyond where it
82
+ * already is.
83
+ *
84
+ * however, an advance() to a docid followed by next() may skip a document in
85
+ * the stream. you probably don't want this.
86
+ *
87
+ * so advance is only useful if you have a particular doc_id in mind, and you
88
+ * want to see if this stream contains it. if you want to actually see all the
89
+ * docids in a stream, you must use next().
90
+ *
91
+ */
92
+
93
+ /********** dispatch functions ***********/
94
+ static wp_error* term_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
95
+ static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
96
+ static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
97
+ static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
98
+ static wp_error* neg_init_search_state(wp_query* q, wp_segment* s) RAISES_ERROR;
99
+ static wp_error* term_release_search_state(wp_query* q) RAISES_ERROR;
100
+ static wp_error* conj_release_search_state(wp_query* q) RAISES_ERROR;
101
+ static wp_error* disj_release_search_state(wp_query* q) RAISES_ERROR;
102
+ static wp_error* phrase_release_search_state(wp_query* q) RAISES_ERROR;
103
+ static wp_error* neg_release_search_state(wp_query* q) RAISES_ERROR;
104
+ static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
105
+ static wp_error* conj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
106
+ static wp_error* disj_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
107
+ static wp_error* phrase_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
108
+ static wp_error* neg_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) RAISES_ERROR;
109
+ static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
110
+ static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
111
+ static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
112
+ static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
113
+ static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) RAISES_ERROR;
114
+
115
/*
 * Routes a call to the <kind>_<suffix> handler that matches a query node
 * type, relaying any error the handler returns. Label nodes are served by
 * the term_* handlers, and empty queries by the conj_* handlers. An
 * unrecognized type raises an error.
 */
#define DISPATCH(type, suffix, ...) \
  switch(type) { \
    case WP_QUERY_TERM: \
    case WP_QUERY_LABEL: \
      RELAY_ERROR(term_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_EMPTY: \
    case WP_QUERY_CONJ: \
      RELAY_ERROR(conj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_DISJ: \
      RELAY_ERROR(disj_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_PHRASE: \
      RELAY_ERROR(phrase_##suffix(__VA_ARGS__)); \
      break; \
    case WP_QUERY_NEG: \
      RELAY_ERROR(neg_##suffix(__VA_ARGS__)); \
      break; \
    default: \
      RAISE_ERROR("unknown query node type %d", type); \
  }
128
+
129
+ wp_error* wp_search_init_search_state(wp_query* q, wp_segment* s) {
130
+ DISPATCH(q->type, init_search_state, q, s);
131
+ return NO_ERROR;
132
+ }
133
+
134
+ wp_error* wp_search_release_search_state(wp_query* q) {
135
+ DISPATCH(q->type, release_search_state, q)
136
+ return NO_ERROR;
137
+ }
138
+
139
+ RAISING_STATIC(query_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done)) {
140
+ DISPATCH(q->type, next_doc, q, s, result, done);
141
+ #ifdef DEBUGOUTPUT
142
+ char buf[1024];
143
+ wp_query_to_s(q, 1024, buf);
144
+
145
+ if(*done) DEBUG("query %s is done", buf);
146
+ else DEBUG("query %s has doc %u", buf, result->doc_id);
147
+ #endif
148
+ return NO_ERROR;
149
+ }
150
+
151
+ RAISING_STATIC(query_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done)) {
152
+ DISPATCH(q->type, advance_to_doc, q, s, doc_id, result, found, done);
153
+ #ifdef DEBUGOUTPUT
154
+ char buf[1024];
155
+ wp_query_to_s(q, 1024, buf);
156
+
157
+ if(*done) DEBUG("query %s is done", buf);
158
+ else {
159
+ if(*found) DEBUG("query %s has doc %u", buf, doc_id);
160
+ else DEBUG("query %s does not have doc %u", buf, doc_id);
161
+ }
162
+ #endif
163
+ return NO_ERROR;
164
+ }
165
+
166
+ /************** init functions *************/
167
+
168
+ RAISING_STATIC(init_children(wp_query* q, wp_segment* s)) {
169
+ for(wp_query* child = q->children; child != NULL; child = child->next) RELAY_ERROR(wp_search_init_search_state(child, s));
170
+ return NO_ERROR;
171
+ }
172
+
173
+ RAISING_STATIC(release_children(wp_query* q)) {
174
+ for(wp_query* child = q->children; child != NULL; child = child->next) RELAY_ERROR(wp_search_release_search_state(child));
175
+ return NO_ERROR;
176
+ }
177
+
178
+ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
179
+ term t;
180
+ stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
181
+ termhash* th = MMAP_OBJ(seg->termhash, termhash);
182
+
183
+ term_search_state* state = q->search_data = malloc(sizeof(term_search_state));
184
+ state->started = 0;
185
+
186
+ state->label = q->type == WP_QUERY_LABEL ? 1 : 0;
187
+ if(state->label) t.field_s = 0;
188
+ else t.field_s = stringmap_string_to_int(sh, q->field); // will be -1 if not found
189
+
190
+ t.word_s = stringmap_string_to_int(sh, q->word);
191
+
192
+ uint32_t offset = termhash_get_val(th, t);
193
+ if(offset == (uint32_t)-1) offset = OFFSET_NONE;
194
+
195
+ if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
196
+ else {
197
+ state->done = 0;
198
+ if(state->label) RELAY_ERROR(wp_segment_read_label(seg, offset, &state->posting));
199
+ else RELAY_ERROR(wp_segment_read_posting(seg, offset, &state->posting, 1));
200
+ }
201
+
202
+ RELAY_ERROR(init_children(q, seg));
203
+
204
+ return NO_ERROR;
205
+ }
206
+
207
+ static wp_error* term_release_search_state(wp_query* q) {
208
+ term_search_state* state = q->search_data;
209
+ if(!state->done) free(state->posting.positions);
210
+ free(state);
211
+ RELAY_ERROR(release_children(q));
212
+ return NO_ERROR;
213
+ }
214
+
215
+ static wp_error* conj_init_search_state(wp_query* q, wp_segment* s) {
216
+ q->search_data = NULL; // no state needed
217
+ RELAY_ERROR(init_children(q, s));
218
+ return NO_ERROR;
219
+ }
220
+
221
+ static wp_error* conj_release_search_state(wp_query* q) {
222
+ RELAY_ERROR(release_children(q));
223
+ return NO_ERROR;
224
+ }
225
+
226
+ static wp_error* disj_init_search_state(wp_query* q, wp_segment* s) {
227
+ disj_search_state* state = q->search_data = malloc(sizeof(disj_search_state));
228
+ state->states = NULL;
229
+ state->results = NULL;
230
+ state->last_docid = DOCID_NONE;
231
+ RELAY_ERROR(init_children(q, s));
232
+ return NO_ERROR;
233
+ }
234
+
235
+ static wp_error* disj_release_search_state(wp_query* q) {
236
+ disj_search_state* state = (disj_search_state*)q->search_data;
237
+ if(state->states) {
238
+ // free any remaining search results in the buffer
239
+ for(uint16_t i = 0; i < q->num_children; i++) {
240
+ if(state->states[i] == DISJ_SEARCH_STATE_FILLED) wp_search_result_free(&state->results[i]);
241
+ }
242
+ free(state->states);
243
+ free(state->results);
244
+ }
245
+ free(state);
246
+ RELAY_ERROR(release_children(q));
247
+ return NO_ERROR;
248
+ }
249
+
250
+ static wp_error* phrase_init_search_state(wp_query* q, wp_segment* s) {
251
+ q->search_data = NULL; // no state needed
252
+ RELAY_ERROR(init_children(q, s));
253
+ return NO_ERROR;
254
+ }
255
+
256
+ static wp_error* phrase_release_search_state(wp_query* q) {
257
+ RELAY_ERROR(release_children(q));
258
+ return NO_ERROR;
259
+ }
260
+
261
+ static wp_error* neg_init_search_state(wp_query* q, wp_segment* seg) {
262
+ if(q->num_children != 1) RAISE_ERROR("negations currently only operate on single children");
263
+
264
+ RELAY_ERROR(wp_search_init_search_state(q->children, seg));
265
+
266
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
267
+ neg_search_state* state = q->search_data = malloc(sizeof(neg_search_state));
268
+
269
+ state->cur = pr->num_docs + 1;
270
+ search_result result;
271
+ int done;
272
+ RELAY_ERROR(query_next_doc(q->children, seg, &result, &done));
273
+ if(done) state->next = DOCID_NONE;
274
+ else {
275
+ state->next = result.doc_id;
276
+ wp_search_result_free(&result);
277
+ }
278
+ DEBUG("initialized with cur %u and next %u", state->cur, state->next);
279
+
280
+ return NO_ERROR;
281
+ }
282
+
283
+ static wp_error* neg_release_search_state(wp_query* q) {
284
+ RELAY_ERROR(wp_search_release_search_state(q->children));
285
+ free(q->search_data);
286
+ return NO_ERROR;
287
+ }
288
+
289
+ /********** search functions **********/
290
+
291
+ static wp_error* term_next_doc(wp_query* q, wp_segment* s, search_result* result, int* done) {
292
+ term_search_state* state = (term_search_state*)q->search_data;
293
+
294
+ DEBUG("[%s:'%s'] before: started is %d, done is %d", q->field, q->word, state->started, state->done);
295
+ if(state->done) {
296
+ *done = 1;
297
+ return NO_ERROR;
298
+ }
299
+
300
+ *done = 0;
301
+ if(!state->started) { // start
302
+ state->started = 1;
303
+ RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
304
+ }
305
+ else { // advance
306
+ free(state->posting.positions);
307
+ if(state->posting.next_offset == OFFSET_NONE) { // end of stream
308
+ *done = state->done = 1;
309
+ }
310
+ else {
311
+ if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
312
+ else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
313
+ RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
314
+ }
315
+ }
316
+ DEBUG("[%s:'%s'] after: doc id %u, done is %d, started is %d", q->field, q->word, (state->started && !state->done && result) ? result->doc_id : 0, *done, state->started);
317
+
318
+ return NO_ERROR;
319
+ }
320
+
321
+ static wp_error* term_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
322
+ term_search_state* state = (term_search_state*)q->search_data;
323
+ DEBUG("[%s:'%s'] seeking through postings for doc %u", q->field, q->word, doc_id);
324
+
325
+ if(state->done) { // end of stream
326
+ *found = 0;
327
+ *done = 1;
328
+ return NO_ERROR;
329
+ }
330
+
331
+ while(state->posting.doc_id > doc_id) {
332
+ free(state->posting.positions);
333
+ DEBUG("skipping doc_id %u", state->posting.doc_id);
334
+ if(state->posting.next_offset == OFFSET_NONE) {
335
+ state->done = 1;
336
+ break;
337
+ }
338
+
339
+ if(state->label) RELAY_ERROR(wp_segment_read_label(s, state->posting.next_offset, &state->posting));
340
+ else RELAY_ERROR(wp_segment_read_posting(s, state->posting.next_offset, &state->posting, 1));
341
+ //DEBUG("advanced posting to %p", state->posting);
342
+ }
343
+
344
+ if(state->done) {
345
+ DEBUG("[%s:'%s'] posting list exhausted", q->field, q->word);
346
+ *found = 0;
347
+ *done = 1;
348
+ }
349
+ else {
350
+ *done = 0;
351
+ DEBUG("[%s:'%s'] posting advanced to that of doc %u", q->field, q->word, state->posting.doc_id);
352
+ *found = (doc_id == state->posting.doc_id ? 1 : 0);
353
+ if(*found) RELAY_ERROR(search_result_init(result, q->field, q->word, &state->posting));
354
+ }
355
+
356
+ return NO_ERROR;
357
+ }
358
+
359
+ // this advances all children *until* it finds a child that doesn't have the
360
+ // doc. at that point it stops. so it will return found=0 if any single child
361
+ // doesn't have the doc, and done=1 if any single child is done.
362
+ //
363
+ // this is used by both phrasal and conjunctive queries.
364
+ static wp_error* advance_all_children(wp_query* q, wp_segment* seg, docid_t search_doc, search_result* child_results, int* found, int* done) {
365
+ int num_children_searched = 0;
366
+ *found = 1;
367
+
368
+ DEBUG("advancing all children to doc %u with early termination", search_doc);
369
+
370
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
371
+ RELAY_ERROR(query_advance_to_doc(child, seg, search_doc, &child_results[num_children_searched], found, done));
372
+ num_children_searched++;
373
+ if(!*found) break;
374
+ }
375
+
376
+ if(!*found) for(int i = 0; i < num_children_searched - 1; i++) wp_search_result_free(&child_results[i]);
377
+
378
+ return NO_ERROR;
379
+ }
380
+
381
+ static wp_error* disj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
382
+ if(q->children == NULL) {
383
+ *done = 1;
384
+ return NO_ERROR;
385
+ }
386
+
387
+ // allocate search state if necessary
388
+ disj_search_state* state = (disj_search_state*)q->search_data;
389
+ if(state->states == NULL) {
390
+ state->states = malloc(sizeof(uint8_t) * q->num_children);
391
+ state->results = malloc(sizeof(search_result) * q->num_children);
392
+ memset(state->states, DISJ_SEARCH_STATE_EMPTY, sizeof(uint8_t) * q->num_children);
393
+ }
394
+
395
+ // fill all the results we can into the buffer by calling next_doc on all
396
+ // non-done children
397
+ uint16_t i = 0;
398
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
399
+ if(state->states[i] == DISJ_SEARCH_STATE_EMPTY) {
400
+ int thisdone = 0;
401
+ DEBUG("recursing on child %d", i);
402
+ RELAY_ERROR(query_next_doc(child, seg, &(state->results[i]), &thisdone));
403
+ if(thisdone == 1) state->states[i] = DISJ_SEARCH_STATE_DONE;
404
+ else state->states[i] = DISJ_SEARCH_STATE_FILLED;
405
+ DEBUG("after recurse, state %d is marked %d", i, state->states[i]);
406
+ }
407
+ i++;
408
+ }
409
+
410
+ // now find the largest
411
+ uint16_t max_doc_idx = 0;
412
+ docid_t max_docid = 0;
413
+
414
+ *done = 1;
415
+ i = 0;
416
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
417
+ DEBUG("child %d is marked as %d", i, state->states[i]);
418
+ if(state->states[i] == DISJ_SEARCH_STATE_FILLED) {
419
+ if((*done == 1) || (state->results[i].doc_id > max_docid)) {
420
+ if(state->results[i].doc_id == state->last_docid) { // discard dupes
421
+ DEBUG("child %d has old result %u; voiding", i, state->last_docid);
422
+ wp_search_result_free(&state->results[i]);
423
+ state->states[i] = DISJ_SEARCH_STATE_EMPTY;
424
+ }
425
+ else {
426
+ *done = 0;
427
+ max_docid = state->results[i].doc_id;
428
+ max_doc_idx = i;
429
+ }
430
+ }
431
+ }
432
+ i++;
433
+ }
434
+
435
+ // finally, copy the result
436
+ if(*done == 0) {
437
+ DEBUG("returning doc %d at index %d", max_docid, max_doc_idx);
438
+ memcpy(result, &state->results[max_doc_idx], sizeof(search_result));
439
+ state->states[max_doc_idx] = DISJ_SEARCH_STATE_EMPTY;
440
+ state->last_docid = result->doc_id;
441
+ }
442
+
443
+ return NO_ERROR;
444
+ }
445
+
446
+ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
447
+ docid_t search_doc;
448
+ int found = 0;
449
+ *done = 0;
450
+
451
+ // start with the first child's first doc
452
+ // TODO: find smallest postings list and use that instead
453
+ wp_query* master = q->children;
454
+ if(master == NULL) *done = 1;
455
+
456
+ while(!found && !*done) {
457
+ RELAY_ERROR(query_next_doc(master, seg, result, done));
458
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
459
+ if(!*done) {
460
+ search_doc = result->doc_id;
461
+ wp_search_result_free(result); // sigh
462
+ RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
463
+ }
464
+ DEBUG("after search, found is %d and done is %d", found, *done);
465
+ }
466
+
467
+ return NO_ERROR;
468
+ }
469
+
470
+ static wp_error* conj_advance_to_doc(wp_query* q, wp_segment* s, docid_t doc_id, search_result* result, int* found, int* done) {
471
+ search_result* child_results = malloc(sizeof(search_result) * q->num_children);
472
+ RELAY_ERROR(advance_all_children(q, s, doc_id, child_results, found, done));
473
+
474
+ if(*found) {
475
+ DEBUG("successfully found doc %u", doc_id);
476
+ RELAY_ERROR(search_result_combine_into(result, child_results, q->num_children));
477
+ }
478
+
479
+ free(child_results);
480
+ return NO_ERROR;
481
+ }
482
+
483
+ static wp_error* disj_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
484
+ search_result child_result;
485
+ int child_found;
486
+
487
+ DEBUG("advancing all to %d", doc_id);
488
+
489
+ *found = 0;
490
+ *done = 0;
491
+ uint16_t i = 0;
492
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
493
+ int child_done;
494
+ RELAY_ERROR(query_advance_to_doc(child, seg, doc_id, &child_result, &child_found, &child_done));
495
+ DEBUG("child %u reports found %d and done %d", i, child_found, child_done);
496
+ *done = *done && child_done; // we're only done if ALL children are done
497
+ if(child_found && !*found) {
498
+ *found = 1;
499
+ *result = child_result;
500
+ }
501
+
502
+ i += 1;
503
+ // TODO XXXXXXXXXX does this leak memory when multiple children all return results?
504
+ }
505
+
506
+ #ifdef DEBUGOUTPUT
507
+ if(*found) DEBUG("successfully found doc %u", doc_id);
508
+ else DEBUG("did not find doc %u", doc_id);
509
+ #endif
510
+
511
+ // now release any buffered results if they're > doc_id
512
+ disj_search_state* state = (disj_search_state*)q->search_data;
513
+ if(state->states != NULL) {
514
+ uint16_t i = 0;
515
+ for(wp_query* child = q->children; child != NULL; child = child->next) {
516
+ if((state->states[i] == DISJ_SEARCH_STATE_FILLED) && (state->results[i].doc_id > doc_id)) {
517
+ wp_search_result_free(&state->results[i]);
518
+ state->states[i] = DISJ_SEARCH_STATE_EMPTY;
519
+ }
520
+ i++;
521
+ }
522
+ }
523
+
524
+ return NO_ERROR;
525
+ }
526
+
527
+ // sadly, this is basically a copy of conj_next_doc right now. all the
528
+ // interesting phrasal checking is done by phrase_advance_to_doc.
529
+ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
530
+ #ifdef DEBUGOUTPUT
531
+ char query_s[1024];
532
+ wp_query_to_s(q, 1024, query_s);
533
+ DEBUG("called on %s", query_s);
534
+ #endif
535
+
536
+ docid_t search_doc;
537
+ int found = 0;
538
+ *done = 0;
539
+
540
+ // start with the first child's first doc
541
+ // TODO: find smallest postings list and use that instead
542
+ wp_query* master = q->children;
543
+ if(master == NULL) *done = 1;
544
+
545
+ while(!found && !*done) {
546
+ RELAY_ERROR(query_next_doc(master, seg, result, done));
547
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
548
+ if(!*done) {
549
+ search_doc = result->doc_id;
550
+ wp_search_result_free(result); // sigh
551
+ RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
552
+ }
553
+ DEBUG("after search, found is %d and done is %d", found, *done);
554
+ }
555
+
556
+ return NO_ERROR;
557
+ }
558
+
559
+ static wp_error* phrase_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
560
+ #ifdef DEBUGOUTPUT
561
+ char query_s[1024];
562
+ wp_query_to_s(q, 1024, query_s);
563
+ DEBUG("called on %s", query_s);
564
+ #endif
565
+
566
+ search_result* child_results = malloc(sizeof(search_result) * q->num_children);
567
+
568
+ DEBUG("will be searching for doc %u", doc_id);
569
+ RELAY_ERROR(advance_all_children(q, seg, doc_id, child_results, found, done));
570
+
571
+ if(*found) {
572
+ DEBUG("found doc %u. now checking for positional matches", doc_id);
573
+
574
+ // TODO remove this once we're less paranoid
575
+ for(int i = 0; i < q->num_children; i++) {
576
+ if(child_results[i].num_doc_matches != 1) RAISE_ERROR("invalid state: %d results", child_results[i].num_doc_matches);
577
+ if(child_results[i].doc_id != doc_id) RAISE_ERROR("invalid state: doc id %u vs searched-for %u", child_results[i].doc_id, doc_id);
578
+ }
579
+
580
+ /* the following can be optimized in several ways:
581
+
582
+ 1. choose the doc with the smallest number of term matches, rather than aways picking the first.
583
+ 2. do a binary search to find the position (since the array is sorted), rather than a linear
584
+ scan.
585
+
586
+ this is simply the simplest, stupidest, first-approach implementation.
587
+ */
588
+
589
+ // we'll base everything off of this guy
590
+ doc_match* first_dm = &child_results[0].doc_matches[0];
591
+
592
+ // allocate enough space to hold the maximum number of positions
593
+ pos_t* phrase_positions = malloc(sizeof(pos_t) * first_dm->num_positions);
594
+ int num_positions_found = 0;
595
+
596
+ for(int i = 0; i < first_dm->num_positions; i++) {
597
+ pos_t position = first_dm->positions[i];
598
+ DEBUG("try %d: match by term 0 at position %u", i, position);
599
+
600
+ int found_in_this_position = 1;
601
+ for(int j = 1; j < q->num_children; j++) {
602
+ doc_match* this_dm = &child_results[j].doc_matches[0];
603
+
604
+ int k, found_in_doc = 0;
605
+ for(k = 0; k < this_dm->num_positions; k++) {
606
+ if(this_dm->positions[k] == (position + j)) {
607
+ found_in_doc = 1;
608
+ break;
609
+ }
610
+ }
611
+
612
+ if(!found_in_doc) {
613
+ found_in_this_position = 0;
614
+ DEBUG("term %d did NOT match at position %u after %d comparisons", j, position + j, k + 1);
615
+ break;
616
+ }
617
+ #ifdef DEBUGOUTPUT
618
+ else DEBUG("term %d matched at position %u after %d/%d comparisons", j, position + j, k + 1, this_dm->num_positions);
619
+ #endif
620
+ }
621
+
622
+ if(found_in_this_position) phrase_positions[num_positions_found++] = position; // got a match!
623
+ }
624
+
625
+ if(num_positions_found > 0) {
626
+ // fill in the result
627
+ result->doc_id = doc_id;
628
+ result->num_doc_matches = 1;
629
+ result->doc_matches = malloc(sizeof(doc_match));
630
+ result->doc_matches[0].field = NULL;
631
+ result->doc_matches[0].word = NULL;
632
+ result->doc_matches[0].num_positions = num_positions_found;
633
+ result->doc_matches[0].positions = phrase_positions;
634
+ }
635
+ else {
636
+ *found = 0;
637
+ free(phrase_positions);
638
+ }
639
+ for(int i = 0; i < q->num_children; i++) wp_search_result_free(&child_results[i]);
640
+ }
641
+
642
+ free(child_results);
643
+ return NO_ERROR;
644
+ }
645
+
646
+ static wp_error* neg_next_doc(wp_query* q, wp_segment* seg, search_result* result, int* done) {
647
+ neg_search_state* state = (neg_search_state*)q->search_data;
648
+
649
+ DEBUG("called with cur %u and next %u", state->cur, state->next);
650
+
651
+ if(state->cur == DOCID_NONE) {
652
+ *done = 1;
653
+ return NO_ERROR;
654
+ }
655
+
656
+ state->cur--; // advance virtual doc pointer
657
+
658
+ // if state->cur == state->next, we need to load the substream's next
659
+ // document, decrement our cur, and recheck.
660
+ while((state->cur > DOCID_NONE) && (state->cur == state->next)) { // need to advance the child stream
661
+ state->cur--; // can't use the previous value because == next; decrement
662
+
663
+ int child_done;
664
+ RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
665
+ if(child_done) state->next = DOCID_NONE; // child stream is done
666
+ else {
667
+ state->next = result->doc_id;
668
+ wp_search_result_free(result);
669
+ }
670
+
671
+ DEBUG("after bump, cur %u and next %u", state->cur, state->next);
672
+ }
673
+
674
+ // check again... sigh
675
+ if(state->cur == DOCID_NONE) {
676
+ *done = 1;
677
+ return NO_ERROR;
678
+ }
679
+
680
+ DEBUG("returning doc %u", state->cur);
681
+ result->doc_id = state->cur;
682
+ result->num_doc_matches = 0;
683
+ result->doc_matches = NULL;
684
+ *done = 0;
685
+ return NO_ERROR;
686
+ }
687
+
688
+ static wp_error* neg_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_id, search_result* result, int* found, int* done) {
689
+ neg_search_state* state = (neg_search_state*)q->search_data;
690
+
691
+ DEBUG("in search for %u, called with cur %u and next %u", doc_id, state->cur, state->next);
692
+
693
+ if(state->cur == DOCID_NONE) {
694
+ *done = 1;
695
+ *found = 0;
696
+ return NO_ERROR;
697
+ }
698
+
699
+ // seek through child stream until we find a docid it contains that's <= doc_id
700
+ while(state->next > doc_id) { // need to advance child stream
701
+ int child_done;
702
+ RELAY_ERROR(query_next_doc(q->children, seg, result, &child_done));
703
+ if(child_done) state->next = DOCID_NONE; // will break the loop too
704
+ else state->next = result->doc_id;
705
+ }
706
+
707
+ DEBUG("in search for %u, intermediate state is cur %u and next %u", doc_id, state->cur, state->next);
708
+
709
+ // at this point we know state->next, our child pointer, is <= doc_id
710
+ state->cur = doc_id;
711
+ if(state->next == doc_id) *found = 0; // opposite day
712
+ else {
713
+ *found = 1;
714
+ result->doc_id = doc_id;
715
+ result->num_doc_matches = 0;
716
+ result->doc_matches = NULL;
717
+ }
718
+
719
+ *done = state->cur == DOCID_NONE ? 1 : 0;
720
+
721
+ DEBUG("finally, state is cur %u and next %u and found is %d and done is %d", state->cur, state->next, *found, *done);
722
+ return NO_ERROR;
723
+ }
724
+
725
+ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
726
+ int done;
727
+
728
+ *num_results = 0;
729
+
730
+ #ifdef DEBUG
731
+ char buf[1024];
732
+ wp_query_to_s(q, 1024, buf);
733
+ DEBUG("running query %s", buf);
734
+ #endif
735
+
736
+ while(*num_results < max_num_results) {
737
+ RELAY_ERROR(query_next_doc(q, s, &results[*num_results], &done));
738
+ if(done) break;
739
+ DEBUG("got result %u (%u doc matches)", results[*num_results].doc_id, results[*num_results].num_doc_matches);
740
+ (*num_results)++;
741
+ DEBUG("num results now %d", *num_results);
742
+ }
743
+
744
+ return NO_ERROR;
745
+ }
746
+