whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ #ifndef WP_SEARCH_H_
2
+ #define WP_SEARCH_H_
3
+
4
+ // whistlepig search code
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // what you need to know about search:
8
+ // 1. it runs on a per-segment basis; and
9
+ // 2. query objects maintain search state internally.
10
+ //
11
+ // to run a query on a segment, you need to use this call sequence:
12
+ //
13
+ // 1. wp_search_init_search_state
14
+ // 2. wp_search_run_query_on_segment (zero or more times)
15
+ // 3. wp_search_release_search_state
16
+ //
17
+ // because the query objects maintain state, you can repeat step 2 as much as
18
+ // you'd like to get more results without doing any duplicate work. if you
19
+ // don't do step 3, you'll leak memory.
20
+ //
21
+ // the corollary is that if you want to do multithreaded search across segments
22
+ // in parallel, you will have to clone the query for each segment to avoid
23
+ // sharing state.
24
+ //
25
+ // (right now the index does a serial search across segments, so cloning is not
26
+ // required.)
27
+
28
+ #include <stdint.h>
29
+
30
+ #include "defaults.h"
31
+ #include "segment.h"
32
+ #include "query.h"
33
+ #include "error.h"
34
+
35
+ // a match of a particular fielded phrase on a particular document
36
+ typedef struct doc_match {
37
+ const char* field;
38
+ const char* word;
39
+ uint16_t num_positions;
40
+ pos_t* positions;
41
+ } doc_match;
42
+
43
+ // a generic match on a document of a search stream
44
+ typedef struct search_result {
45
+ docid_t doc_id;
46
+ uint16_t num_doc_matches;
47
+ doc_match* doc_matches;
48
+ } search_result;
49
+
50
+ struct wp_segment;
51
+ struct wp_query;
52
+ struct wp_error;
53
+
54
+ // API methods
55
+
56
+ // initialize the query search state for running on segment s. this must precede any call
57
+ // to wp_search_run_query_on_segment.
58
+ wp_error* wp_search_init_search_state(struct wp_query* q, struct wp_segment* s) RAISES_ERROR;
59
+
60
+ // release any query search state. this must follow any call to wp_search_run_query_on_segment.
61
+ wp_error* wp_search_release_search_state(struct wp_query* q) RAISES_ERROR;
62
+
63
+ // run a query on a segment, filling at most max_num_results slots in results.
64
+ // this is the main entry point into the actual search logic, and is called by
65
+ // index.c in various ways. this must be preceded by an init_search_state and
66
+ // followed by a release_search_state.
67
+ //
68
+ // if you get num_results > 0, you should call wp_search_result_free on each of the
69
+ // results when you're done with them.
70
+ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) RAISES_ERROR;
71
+
72
+ // if you got non-zero num_results from wp_search_run_query_on_segment, call
73
+ // this on each result when you're done with it.
74
+ void wp_search_result_free(search_result* result);
75
+
76
+ #endif
@@ -0,0 +1,615 @@
1
+ #include <sys/stat.h>
2
+ #include <fcntl.h>
3
+ #include <unistd.h>
4
+ #include "whistlepig.h"
5
+
6
// type tags kept in postings_region.index_type_and_flags: written when a
// region is initialized and checked when a segment is loaded.
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable

// the label_posting located `offset` bytes into posting_region's postings
// array. both arguments are parenthesized so that arbitrary expressions
// (e.g. `base + delta`) expand with the intended precedence.
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)((posting_region)->postings + (offset)))
10
+
11
+ static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t index_type_and_flags) {
12
+ pr->index_type_and_flags = index_type_and_flags;
13
+ pr->num_docs = 0;
14
+ pr->num_postings = 0;
15
+ pr->postings_head = 1; // skip one byte, which is reserved as OFFSET_NONE
16
+ pr->postings_tail = initial_size;
17
+ }
18
+
19
+ RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t index_type_and_flags)) {
20
+ if(pr->index_type_and_flags != index_type_and_flags) RAISE_ERROR("segment has index type %u; expecting type %u", pr->index_type_and_flags, index_type_and_flags);
21
+ return NO_ERROR;
22
+ }
23
+
24
+ #define INITIAL_POSTINGS_SIZE 2048
25
+ #define FN_SIZE 1024
26
+
27
+ wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) {
28
+ char fn[FN_SIZE];
29
+
30
+ // open the string pool
31
+ snprintf(fn, 128, "%s.sp", pathname_base);
32
+ RELAY_ERROR(mmap_obj_load(&segment->stringpool, "ti/stringpool", fn));
33
+
34
+ // open the string hash
35
+ snprintf(fn, 128, "%s.sh_", pathname_base);
36
+ RELAY_ERROR(mmap_obj_load(&segment->stringmap, "ti/stringmap", fn));
37
+ stringmap_setup(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));
38
+
39
+ // open the term hash
40
+ snprintf(fn, 128, "%s.th", pathname_base);
41
+ RELAY_ERROR(mmap_obj_load(&segment->termhash, "ti/termhash", fn));
42
+ termhash_setup(MMAP_OBJ(segment->termhash, termhash));
43
+
44
+ // open the postings region
45
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
46
+ RELAY_ERROR(mmap_obj_load(&segment->postings, "ti/postings", fn));
47
+ RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->postings, postings_region), POSTINGS_REGION_TYPE_IMMUTABLE_VBE));
48
+
49
+ // open the labels postings region
50
+ snprintf(fn, 128, "%s.lb", pathname_base);
51
+ RELAY_ERROR(mmap_obj_load(&segment->labels, "ti/labels", fn));
52
+ RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->labels, postings_region), POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS));
53
+
54
+ return NO_ERROR;
55
+ }
56
+
57
+ wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) {
58
+ char fn[FN_SIZE];
59
+
60
+ // create the string pool
61
+ snprintf(fn, 128, "%s.sp", pathname_base);
62
+ RELAY_ERROR(mmap_obj_create(&segment->stringpool, "ti/stringpool", fn, stringpool_initial_size()));
63
+ stringpool_init(MMAP_OBJ(segment->stringpool, stringpool));
64
+
65
+ // create the string hash
66
+ snprintf(fn, 128, "%s.sh_", pathname_base);
67
+ RELAY_ERROR(mmap_obj_create(&segment->stringmap, "ti/stringmap", fn, stringmap_initial_size()));
68
+ stringmap_init(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));
69
+
70
+ // create the term hash
71
+ snprintf(fn, 128, "%s.th", pathname_base);
72
+ RELAY_ERROR(mmap_obj_create(&segment->termhash, "ti/termhash", fn, termhash_initial_size()));
73
+ termhash_init(MMAP_OBJ(segment->termhash, termhash));
74
+
75
+ // create the postings region
76
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
77
+ RELAY_ERROR(mmap_obj_create(&segment->postings, "ti/postings", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
78
+ postings_region_init(MMAP_OBJ(segment->postings, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_IMMUTABLE_VBE);
79
+
80
+ // create the labels postings region
81
+ snprintf(fn, 128, "%s.lb", pathname_base);
82
+ RELAY_ERROR(mmap_obj_create(&segment->labels, "ti/labels", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
83
+ postings_region_init(MMAP_OBJ(segment->labels, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS);
84
+
85
+ return NO_ERROR;
86
+ }
87
+
88
+ int wp_segment_exists(const char* pathname_base) {
89
+ struct stat fstat;
90
+ char fn[FN_SIZE];
91
+
92
+ snprintf(fn, 128, "%s.sp", pathname_base);
93
+ return !stat(fn, &fstat);
94
+ }
95
+
96
+ wp_error* wp_segment_delete(const char* pathname_base) {
97
+ char fn[FN_SIZE];
98
+
99
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
100
+ unlink(fn);
101
+ snprintf(fn, 128, "%s.sp", pathname_base);
102
+ unlink(fn);
103
+ snprintf(fn, 128, "%s.sh_", pathname_base);
104
+ unlink(fn);
105
+ snprintf(fn, 128, "%s.th", pathname_base);
106
+ unlink(fn);
107
+ snprintf(fn, 128, "%s.lb", pathname_base);
108
+ unlink(fn);
109
+
110
+ return NO_ERROR;
111
+ }
112
+
113
+ wp_error* wp_segment_unload(wp_segment* s) {
114
+ RELAY_ERROR(mmap_obj_unload(&s->stringpool));
115
+ RELAY_ERROR(mmap_obj_unload(&s->stringmap));
116
+ RELAY_ERROR(mmap_obj_unload(&s->termhash));
117
+ RELAY_ERROR(mmap_obj_unload(&s->postings));
118
+ RELAY_ERROR(mmap_obj_unload(&s->labels));
119
+ return NO_ERROR;
120
+ }
121
+
122
+ RAISING_STATIC(bump_stringmap(wp_segment* s, int* success)) {
123
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
124
+
125
+ *success = 1;
126
+ if(stringmap_needs_bump(sh)) {
127
+ DEBUG("bumping stringmap size");
128
+ uint32_t next_size = stringmap_next_size(sh);
129
+ if(next_size <= stringmap_size(sh)) {
130
+ DEBUG("stringmap can't be bumped no more!");
131
+ *success = 0;
132
+ }
133
+ else {
134
+ RELAY_ERROR(mmap_obj_resize(&s->stringmap, next_size));
135
+ sh = MMAP_OBJ(s->stringmap, stringmap); // this could have changed!
136
+ stringmap_setup(sh, MMAP_OBJ(s->stringpool, stringpool));
137
+ RELAY_ERROR(stringmap_bump_size(sh));
138
+ }
139
+ }
140
+
141
+ return NO_ERROR;
142
+ }
143
+
144
+ RAISING_STATIC(bump_stringpool(wp_segment* s, int* success)) {
145
+ stringpool* sp = MMAP_OBJ(s->stringpool, stringpool);
146
+
147
+ *success = 1;
148
+ if(stringpool_needs_bump(sp)) {
149
+ DEBUG("bumping stringpool size");
150
+ uint32_t next_size = stringpool_next_size(sp);
151
+ if(next_size <= stringpool_size(sp)) {
152
+ DEBUG("stringpool can't be bumped no more!");
153
+ *success = 0;
154
+ }
155
+ else {
156
+ RELAY_ERROR(mmap_obj_resize(&s->stringpool, next_size));
157
+ sp = MMAP_OBJ(s->stringpool, stringpool); // may have changed!
158
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
159
+ sh->pool = sp; // need to update it here too
160
+ stringpool_bump_size(sp);
161
+ }
162
+ }
163
+
164
+ return NO_ERROR;
165
+ }
166
+
167
+ RAISING_STATIC(bump_termhash(wp_segment* s, int* success)) {
168
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
169
+
170
+ *success = 1;
171
+ if(termhash_needs_bump(th)) {
172
+ DEBUG("bumping termhash size");
173
+ uint32_t next_size = termhash_next_size(th);
174
+ if(next_size <= termhash_size(th)) {
175
+ DEBUG("termhash can't be bumped no more!");
176
+ *success = 0;
177
+ }
178
+ else {
179
+ RELAY_ERROR(mmap_obj_resize(&s->termhash, next_size));
180
+ th = MMAP_OBJ(s->termhash, termhash); // could have changed!
181
+ termhash_setup(th);
182
+ RELAY_ERROR(termhash_bump_size(th));
183
+ *success = 1;
184
+ }
185
+ }
186
+
187
+ return NO_ERROR;
188
+ }
189
+
190
+ RAISING_STATIC(postings_region_ensure_fit(mmap_obj* mmopr, uint32_t postings_bytes, int* success)) {
191
+ postings_region* pr = MMAP_OBJ_PTR(mmopr, postings_region);
192
+ uint32_t new_head = pr->postings_head + postings_bytes;
193
+
194
+ DEBUG("ensuring fit for %u postings bytes", postings_bytes);
195
+
196
+ uint32_t new_tail = pr->postings_tail;
197
+ while(new_tail <= new_head) new_tail = new_tail * 2;
198
+
199
+ if(new_tail > MAX_POSTINGS_REGION_SIZE) new_tail = MAX_POSTINGS_REGION_SIZE;
200
+ DEBUG("new tail will be %u, current is %u, max is %u", new_tail, pr->postings_tail, MAX_POSTINGS_REGION_SIZE);
201
+
202
+ if(new_tail <= new_head) { // can't increase enough
203
+ *success = 0;
204
+ return NO_ERROR;
205
+ }
206
+
207
+ if(new_tail != pr->postings_tail) { // need to resize
208
+ DEBUG("request for %u postings bytes, old tail is %u, new tail will be %u, max is %u\n", postings_bytes, pr->postings_tail, new_tail, MAX_POSTINGS_REGION_SIZE);
209
+ RELAY_ERROR(mmap_obj_resize(mmopr, new_tail));
210
+ pr = MMAP_OBJ_PTR(mmopr, postings_region); // may have changed!
211
+ pr->postings_tail = new_tail;
212
+ }
213
+
214
+ *success = 1;
215
+ return NO_ERROR;
216
+ }
217
+
218
+ // TODO make this function take the number of stringpool entries, the number of
219
+ // terms, etc rather than just being a heuristic for everything except for the
220
+ // postings list
221
+ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) {
222
+ RELAY_ERROR(postings_region_ensure_fit(&seg->postings, postings_bytes, success));
223
+ if(!*success) return NO_ERROR;
224
+
225
+ RELAY_ERROR(postings_region_ensure_fit(&seg->labels, label_bytes, success));
226
+ if(!*success) return NO_ERROR;
227
+
228
+ RELAY_ERROR(bump_stringmap(seg, success));
229
+ if(!*success) return NO_ERROR;
230
+
231
+ RELAY_ERROR(bump_stringpool(seg, success));
232
+ if(!*success) return NO_ERROR;
233
+
234
+ RELAY_ERROR(bump_termhash(seg, success));
235
+ if(!*success) return NO_ERROR;
236
+
237
+ DEBUG("fit of %u postings bytes ensured", postings_bytes);
238
+
239
+ return NO_ERROR;
240
+ }
241
+
242
+ static uint32_t size_of(uint32_t num_positions, pos_t positions[]) {
243
+ (void)positions;
244
+ uint32_t position_size = sizeof(pos_t) * num_positions;
245
+ uint32_t size = sizeof(posting) - sizeof(pos_t*) + position_size;
246
+
247
+ return size;
248
+ }
249
+
250
+ wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) {
251
+ (void)seg;
252
+ *size = size_of(num_positions, positions);
253
+ return NO_ERROR;
254
+ }
255
+
256
+ #define BITMASK 0x7f
257
+
258
+ RAISING_STATIC(write_multibyte(uint8_t* location, uint32_t val, uint32_t* size)) {
259
+ //printf("xx writing %u to position %p as:\n", val, location);
260
+ uint8_t* start = location;
261
+
262
+ while(val > BITMASK) {
263
+ uint8_t c = (val & BITMASK) | 0x80;
264
+ *location = c;
265
+ //printf("xx %d = %d | %d at %p\n", c, val & BITMASK, 0x80, location);
266
+ location++;
267
+ val >>= 7;
268
+ }
269
+ uint8_t c = (val & BITMASK);
270
+ *location = c;
271
+ //printf("xx %d at %p\n", c, location);
272
+ *size = location + 1 - start;
273
+ //printf("xx total %u bytes\n", *size);
274
+ return NO_ERROR;
275
+ }
276
+
277
+ RAISING_STATIC(read_multibyte(uint8_t* location, uint32_t* val, uint32_t* size)) {
278
+ uint8_t* start = location;
279
+ uint32_t shift = 0;
280
+
281
+ *val = 0;
282
+ while(*location & 0x80) {
283
+ //printf("yy read continue byte %d -> %d at %p\n", *location, *location & ~0x80, location);
284
+ *val |= (*location & ~0x80) << shift;
285
+ shift += 7;
286
+ location++;
287
+ }
288
+ *val |= *location << shift;
289
+ //printf("yy read final byte %d at %p\n", *location, location);
290
+ *size = location + 1 - start;
291
+ //printf("yy total %d bytes, val = %d\n\n", *size, *val);
292
+ return NO_ERROR;
293
+ }
294
+
295
+ /* write posting entry using a variable-byte encoding
296
+
297
+ unfortunately we can't write doc_id deltas, which is what would really make
298
+ this encoding pay off, because we write the entries in increasing doc_id
299
+ order but read them in decreasing order. so we write doc_ids raw.
300
+
301
+ for next_offsets, we write the delta against the current offset. since the
302
+ next_offset is guaranteed to be less than the current offset, we subtract
303
+ next from current.
304
+
305
+ positions are written as deltas.
306
+ */
307
+
308
+ RAISING_STATIC(write_posting(wp_segment* seg, posting* po, pos_t positions[])) {
309
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
310
+
311
+ uint32_t size;
312
+ uint32_t offset = pr->postings_head;
313
+
314
+ if(po->next_offset >= pr->postings_head) RAISE_ERROR("next_offset %u >= postings_head %u", po->next_offset, pr->postings_head);
315
+ if(po->num_positions == 0) RAISE_ERROR("num_positions == 0");
316
+
317
+ uint32_t doc_id = po->doc_id << 1;
318
+ if(po->num_positions == 1) doc_id |= 1; // marker for single postings
319
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], doc_id, &size));
320
+ pr->postings_head += size;
321
+ //printf("wrote %u-byte doc_id %u (np1 == %d)\n", size, doc_id, po->num_positions == 1 ? 1 : 0);
322
+
323
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], offset - po->next_offset, &size));
324
+ pr->postings_head += size;
325
+ //printf("wrote %u-byte offset %u\n", size, offset - po->next_offset);
326
+
327
+ if(po->num_positions > 1) {
328
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], po->num_positions, &size));
329
+ pr->postings_head += size;
330
+ //printf("wrote %u-byte num positions %u\n", size, po->num_positions);
331
+ }
332
+
333
+ for(uint32_t i = 0; i < po->num_positions; i++) {
334
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], positions[i] - (i == 0 ? 0 : positions[i - 1]), &size));
335
+ pr->postings_head += size;
336
+ //printf("wrote %u-byte positions %u\n", size, positions[i] - (i == 0 ? 0 : positions[i - 1]));
337
+ }
338
+
339
+ //printf("done writing posting\n\n");
340
+
341
+ //printf(">>> done writing posting %d %d %d to %p\n\n", (prev_docid == 0 ? po->doc_id : prev_docid - po->doc_id), offset - po->next_offset, po->num_positions, &pr->postings[pl->postings_head]);
342
+ pr->num_postings++;
343
+
344
+ return NO_ERROR;
345
+ }
346
+
347
+ /* if include_positions is true, will malloc the positions array for you, and
348
+ * you must free it when done (assuming num_positions > 0)!
349
+ */
350
+
351
+ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) {
352
+ uint32_t size;
353
+ uint32_t orig_offset = offset;
354
+ postings_region* pr = MMAP_OBJ(s->postings, postings_region);
355
+
356
+ //DEBUG("reading posting from offset %u -> %p (pr %p base %p)", offset, &pr->postings[offset], pr, &pr->postings);
357
+
358
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->doc_id, &size));
359
+ int is_single_posting = po->doc_id & 1;
360
+ po->doc_id = po->doc_id >> 1;
361
+ //DEBUG("read doc_id %u (%u bytes)", po->doc_id, size);
362
+ offset += size;
363
+
364
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->next_offset, &size));
365
+ //DEBUG("read next_offset %u -> %u (%u bytes)", po->next_offset, orig_offset - po->next_offset, size);
366
+ if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and < %u", po->next_offset, orig_offset);
367
+ po->next_offset = orig_offset - po->next_offset;
368
+ offset += size;
369
+
370
+ if(include_positions) {
371
+ if(is_single_posting) po->num_positions = 1;
372
+ else {
373
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->num_positions, &size));
374
+ //DEBUG("read num_positions: %u (%u bytes)", po->num_positions, size);
375
+ offset += size;
376
+ }
377
+
378
+ po->positions = malloc(po->num_positions * sizeof(pos_t));
379
+
380
+ for(uint32_t i = 0; i < po->num_positions; i++) {
381
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->positions[i], &size));
382
+ offset += size;
383
+ po->positions[i] += (i == 0 ? 0 : po->positions[i - 1]);
384
+ //DEBUG("read position %u (%u bytes)", po->positions[i], size);
385
+ }
386
+ }
387
+ else {
388
+ po->num_positions = 0;
389
+ po->positions = NULL;
390
+ }
391
+ //DEBUG("total record took %u bytes", offset - orig_offset);
392
+ //printf("*** read posting %u %u %u from %u\n", po->doc_id, po->next_offset, po->num_positions, orig_offset);
393
+
394
+ return NO_ERROR;
395
+ }
396
+
397
+ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
398
+ // TODO move this logic up to ensure_fit()
399
+ int success;
400
+ RELAY_ERROR(bump_stringmap(s, &success));
401
+ RELAY_ERROR(bump_stringpool(s, &success));
402
+ RELAY_ERROR(bump_termhash(s, &success));
403
+
404
+ DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
405
+
406
+ postings_region* pr = MMAP_OBJ(s->postings, postings_region);
407
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
408
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
409
+
410
+ // construct the term object
411
+ term t;
412
+ RELAY_ERROR(stringmap_add(sh, field, &t.field_s));
413
+ RELAY_ERROR(stringmap_add(sh, word, &t.word_s));
414
+
415
+ // find the offset of the next posting
416
+ posting po;
417
+ uint32_t next_offset = termhash_get_val(th, t);
418
+ if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
419
+ if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
420
+ RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
421
+ if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
422
+ }
423
+
424
+ // write the entry to the postings region
425
+ uint32_t entry_offset = pr->postings_head;
426
+ //DEBUG("entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
427
+ po.doc_id = doc_id;
428
+ po.next_offset = next_offset;
429
+ po.num_positions = num_positions;
430
+ RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
431
+ DEBUG("postings list head now at %u", pr->postings_head);
432
+
433
+ // really finally, update the tail pointer so that readers can access this posting
434
+ RELAY_ERROR(termhash_put_val(th, t, entry_offset));
435
+
436
+ return NO_ERROR;
437
+ }
438
+
439
+ /*
440
+ * currently, labels are implemented as a separate postings space and separate
441
+ * postings structure, but with the same term hash (the offsets just are
442
+ * relative to the different space).
443
+ *
444
+ * we use the sentinel field value 0 to demarcate a label. since no strings have
445
+ * stringmap value 0, this is safe.
446
+ *
447
+ * we also maintain a free list of unused label postings. since all label
448
+ * postings are the same size, we can do this to reuse them and avoid losing
449
+ * space in this area; since label postings can be changed frequently, this is
450
+ * desirable. we use the sentinel postings value field=0 word=0 to keep track
451
+ * of this list.
452
+ *
453
+ */
454
+ wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
455
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
456
+
457
+ label_posting* lp = wp_segment_label_posting_at(pr, offset);
458
+ po->doc_id = lp->doc_id;
459
+ po->next_offset = lp->next_offset;
460
+ po->num_positions = 0;
461
+ po->positions = NULL;
462
+
463
+ return NO_ERROR;
464
+ }
465
+
466
+ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
467
+ // TODO move this logic up to ensure_fit()
468
+ int success;
469
+ RELAY_ERROR(bump_stringmap(s, &success));
470
+ RELAY_ERROR(bump_stringpool(s, &success));
471
+ RELAY_ERROR(bump_termhash(s, &success));
472
+
473
+ DEBUG("adding label %s to doc %u", label, doc_id);
474
+
475
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
476
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
477
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
478
+
479
+ // construct the term object. term objects for labels have the special
480
+ // sentinel field value 0
481
+ term t;
482
+ t.field_s = 0; // label sentinel value
483
+ RELAY_ERROR(stringmap_add(sh, label, &t.word_s)); // get word key
484
+
485
+ // find the previous and next label postings, between which we'll insert this
486
+ // posting
487
+ uint32_t prev_offset = OFFSET_NONE;
488
+ uint32_t next_offset = termhash_get_val(th, t);
489
+ if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
490
+
491
+ while(next_offset != OFFSET_NONE) {
492
+ label_posting* po = wp_segment_label_posting_at(pr, next_offset);
493
+ if(po->doc_id == doc_id) {
494
+ DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
495
+ return NO_ERROR;
496
+ }
497
+ else if(po->doc_id < doc_id) break;
498
+ prev_offset = next_offset;
499
+ next_offset = po->next_offset;
500
+ }
501
+
502
+ // find a space for the posting by first checking for a free postings in the
503
+ // dead list. the dead list is the list stored under the sentinel term
504
+ // with field 0 and word 0.
505
+ term dead_term = { .field_s = 0, .word_s = 0 };
506
+ uint32_t entry_offset;
507
+ uint32_t dead_offset = termhash_get_val(th, dead_term);
508
+ if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
509
+
510
+ if(dead_offset == OFFSET_NONE) { // make a new posting
511
+ entry_offset = pr->postings_head;
512
+ }
513
+ else { // we'll use this one; remove it from the linked list
514
+ DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
515
+ entry_offset = dead_offset;
516
+ RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
517
+ }
518
+
519
+ // finally, write the entry to the label postings region
520
+ DEBUG("label entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
521
+ label_posting* po = wp_segment_label_posting_at(pr, entry_offset);
522
+ po->doc_id = doc_id;
523
+ po->next_offset = next_offset;
524
+
525
+ pr->postings_head += sizeof(label_posting);
526
+ DEBUG("label postings list head now at %u", pr->postings_head);
527
+
528
+ // really finally, update either the previous offset or the tail pointer
529
+ // for this label so that readers can access this posting
530
+ if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
531
+ else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
532
+
533
+ return NO_ERROR;
534
+ }
535
+
536
+ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) {
537
+ // TODO move this logic to ensure_fit
538
+ int success;
539
+ RELAY_ERROR(bump_termhash(s, &success)); // we might add an entry for the dead list
540
+
541
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
542
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
543
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
544
+
545
+ // construct the term object. term objects for labels have the special
546
+ // sentinel field value 0
547
+ term t;
548
+ t.field_s = 0; // label sentinel value
549
+ t.word_s = stringmap_string_to_int(sh, label); // will be -1 if not there
550
+
551
+ // find the posting and the previous posting in the list, if any
552
+ uint32_t prev_offset = OFFSET_NONE;
553
+ uint32_t offset = termhash_get_val(th, t);
554
+ if(offset == (uint32_t)-1) offset = OFFSET_NONE;
555
+ label_posting* lp = NULL;
556
+
557
+ while(offset != OFFSET_NONE) {
558
+ lp = wp_segment_label_posting_at(pr, offset);
559
+ if(lp->doc_id < doc_id) offset = OFFSET_NONE; // nasty hack to induce failure
560
+ if(lp->doc_id <= doc_id) break;
561
+ prev_offset = offset;
562
+ offset = lp->next_offset;
563
+ }
564
+
565
+ DEBUG("found label posting for doc %u at offset %u; prev_offset is %u", doc_id, offset, prev_offset);
566
+
567
+ if(offset == OFFSET_NONE) {
568
+ DEBUG("no label %s found for doc %u", label, doc_id);
569
+ return NO_ERROR;
570
+ }
571
+
572
+ // we've found the posting; now remove it from the list
573
+ if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, lp->next_offset));
574
+ else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
575
+
576
+ // now add it to the dead list for later reclamation
577
+ term dead_term = { .field_s = 0, .word_s = 0 };
578
+ uint32_t dead_offset = termhash_get_val(th, dead_term);
579
+ if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
580
+
581
+ lp->next_offset = dead_offset;
582
+ DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
583
+ RELAY_ERROR(termhash_put_val(th, dead_term, offset));
584
+
585
+ return NO_ERROR;
586
+ }
587
+
588
+ wp_error* wp_segment_grab_docid(wp_segment* segment, docid_t* doc_id) {
589
+ postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
590
+ *doc_id = ++pr->num_docs;
591
+ return NO_ERROR;
592
+ }
593
+
594
+ wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
595
+ postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
596
+ stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
597
+ stringpool* sp = MMAP_OBJ(segment->stringpool, stringpool);
598
+ termhash* th = MMAP_OBJ(segment->termhash, termhash);
599
+
600
+ #define p(a, b) 100.0 * (float)a / (float)b
601
+
602
+ fprintf(stream, "segment has type %u\n", pr->index_type_and_flags);
603
+ fprintf(stream, "segment has %u docs and %u postings\n", pr->num_docs, pr->num_postings);
604
+ fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.header->size / 1024, p(pr->postings_head, pr->postings_tail));
605
+ fprintf(stream, " string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.header->size / 1024, p(sh->n_occupied, sh->n_buckets));
606
+ fprintf(stream, " stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.header->size / 1024, p(sp->next, sp->size));
607
+ fprintf(stream, " term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.header->size / 1024, p(th->n_occupied, th->n_buckets));
608
+
609
+ return NO_ERROR;
610
+ }
611
+
612
+ uint64_t wp_segment_num_docs(wp_segment* seg) {
613
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
614
+ return pr->num_docs;
615
+ }