whistlepig 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,76 @@
1
+ #ifndef WP_SEARCH_H_
2
+ #define WP_SEARCH_H_
3
+
4
+ // whistlepig search code
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // what you need to know about search:
8
+ // 1. it runs on a per-segment basis; and
9
+ // 2. query objects maintain search state internally.
10
+ //
11
+ // to run a query on a segment, you need to use this call sequence:
12
+ //
13
+ // 1. wp_search_init_search_state
14
+ // 2. wp_search_run_query_on_segment (zero or more times)
15
+ // 3. wp_search_release_search_state
16
+ //
17
+ // because the query objects maintain state, you can repeat step 2 as much as
18
+ // you'd like to get more results without doing any duplicate work. if you
19
+ // don't do step 3, you'll leak memory.
20
+ //
21
+ // the corollary is that if you want to do multithreaded search across segments
22
+ // in parallel, you will have to clone the query for each segment to avoid
23
+ // sharing state.
24
+ //
25
+ // (right now the index does a serial search across segments, so cloning is not
26
+ // required.)
27
+
28
+ #include <stdint.h>
29
+
30
+ #include "defaults.h"
31
+ #include "segment.h"
32
+ #include "query.h"
33
+ #include "error.h"
34
+
35
+ // a match of a particular fielded phrase on a particular document
36
+ typedef struct doc_match {
37
+ const char* field;
38
+ const char* word;
39
+ uint16_t num_positions;
40
+ pos_t* positions;
41
+ } doc_match;
42
+
43
+ // a generic match on a document of a search stream
44
+ typedef struct search_result {
45
+ docid_t doc_id;
46
+ uint16_t num_doc_matches;
47
+ doc_match* doc_matches;
48
+ } search_result;
49
+
50
+ struct wp_segment;
51
+ struct wp_query;
52
+ struct wp_error;
53
+
54
+ // API methods
55
+
56
+ // initialize the query search state for running on segment s. this must precede any call
57
+ // to wp_search_run_query_on_segment.
58
+ wp_error* wp_search_init_search_state(struct wp_query* q, struct wp_segment* s) RAISES_ERROR;
59
+
60
+ // release any query search state. this must follow any call to wp_search_run_query_on_segment.
61
+ wp_error* wp_search_release_search_state(struct wp_query* q) RAISES_ERROR;
62
+
63
+ // run a query on a segment, filling at most max_num_results slots in results.
64
+ // this is the main entry point into the actual search logic, and is called by
65
+ // index.c in various ways. this must be preceded by an init_search_state and
66
+ // followed by a release_search_state.
67
+ //
68
+ // if you get num_results > 0, you should call wp_search_result_free on each of the
69
+ // results when you're done with them.
70
+ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) RAISES_ERROR;
71
+
72
+ // if you got non-zero num_results from wp_search_run_query_on_segment, call
73
+ // this on each result when you're done with it.
74
+ void wp_search_result_free(search_result* result);
75
+
76
+ #endif
@@ -0,0 +1,615 @@
1
+ #include <sys/stat.h>
2
+ #include <fcntl.h>
3
+ #include <unistd.h>
4
+ #include "whistlepig.h"
5
+
6
+ #define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
7
+ #define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
8
+
9
// the label_posting located `offset` elements into the postings array of
// `posting_region`. both arguments are parenthesized so that arbitrary
// expressions (e.g. `base + delta`) expand correctly.
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)((posting_region)->postings + (offset)))
10
+
11
+ static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t index_type_and_flags) {
12
+ pr->index_type_and_flags = index_type_and_flags;
13
+ pr->num_docs = 0;
14
+ pr->num_postings = 0;
15
+ pr->postings_head = 1; // skip one byte, which is reserved as OFFSET_NONE
16
+ pr->postings_tail = initial_size;
17
+ }
18
+
19
+ RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t index_type_and_flags)) {
20
+ if(pr->index_type_and_flags != index_type_and_flags) RAISE_ERROR("segment has index type %u; expecting type %u", pr->index_type_and_flags, index_type_and_flags);
21
+ return NO_ERROR;
22
+ }
23
+
24
+ #define INITIAL_POSTINGS_SIZE 2048
25
+ #define FN_SIZE 1024
26
+
27
+ wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) {
28
+ char fn[FN_SIZE];
29
+
30
+ // open the string pool
31
+ snprintf(fn, 128, "%s.sp", pathname_base);
32
+ RELAY_ERROR(mmap_obj_load(&segment->stringpool, "ti/stringpool", fn));
33
+
34
+ // open the string hash
35
+ snprintf(fn, 128, "%s.sh_", pathname_base);
36
+ RELAY_ERROR(mmap_obj_load(&segment->stringmap, "ti/stringmap", fn));
37
+ stringmap_setup(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));
38
+
39
+ // open the term hash
40
+ snprintf(fn, 128, "%s.th", pathname_base);
41
+ RELAY_ERROR(mmap_obj_load(&segment->termhash, "ti/termhash", fn));
42
+ termhash_setup(MMAP_OBJ(segment->termhash, termhash));
43
+
44
+ // open the postings region
45
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
46
+ RELAY_ERROR(mmap_obj_load(&segment->postings, "ti/postings", fn));
47
+ RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->postings, postings_region), POSTINGS_REGION_TYPE_IMMUTABLE_VBE));
48
+
49
+ // open the labels postings region
50
+ snprintf(fn, 128, "%s.lb", pathname_base);
51
+ RELAY_ERROR(mmap_obj_load(&segment->labels, "ti/labels", fn));
52
+ RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->labels, postings_region), POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS));
53
+
54
+ return NO_ERROR;
55
+ }
56
+
57
+ wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) {
58
+ char fn[FN_SIZE];
59
+
60
+ // create the string pool
61
+ snprintf(fn, 128, "%s.sp", pathname_base);
62
+ RELAY_ERROR(mmap_obj_create(&segment->stringpool, "ti/stringpool", fn, stringpool_initial_size()));
63
+ stringpool_init(MMAP_OBJ(segment->stringpool, stringpool));
64
+
65
+ // create the string hash
66
+ snprintf(fn, 128, "%s.sh_", pathname_base);
67
+ RELAY_ERROR(mmap_obj_create(&segment->stringmap, "ti/stringmap", fn, stringmap_initial_size()));
68
+ stringmap_init(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));
69
+
70
+ // create the term hash
71
+ snprintf(fn, 128, "%s.th", pathname_base);
72
+ RELAY_ERROR(mmap_obj_create(&segment->termhash, "ti/termhash", fn, termhash_initial_size()));
73
+ termhash_init(MMAP_OBJ(segment->termhash, termhash));
74
+
75
+ // create the postings region
76
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
77
+ RELAY_ERROR(mmap_obj_create(&segment->postings, "ti/postings", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
78
+ postings_region_init(MMAP_OBJ(segment->postings, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_IMMUTABLE_VBE);
79
+
80
+ // create the labels postings region
81
+ snprintf(fn, 128, "%s.lb", pathname_base);
82
+ RELAY_ERROR(mmap_obj_create(&segment->labels, "ti/labels", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
83
+ postings_region_init(MMAP_OBJ(segment->labels, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS);
84
+
85
+ return NO_ERROR;
86
+ }
87
+
88
+ int wp_segment_exists(const char* pathname_base) {
89
+ struct stat fstat;
90
+ char fn[FN_SIZE];
91
+
92
+ snprintf(fn, 128, "%s.sp", pathname_base);
93
+ return !stat(fn, &fstat);
94
+ }
95
+
96
+ wp_error* wp_segment_delete(const char* pathname_base) {
97
+ char fn[FN_SIZE];
98
+
99
+ snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
100
+ unlink(fn);
101
+ snprintf(fn, 128, "%s.sp", pathname_base);
102
+ unlink(fn);
103
+ snprintf(fn, 128, "%s.sh_", pathname_base);
104
+ unlink(fn);
105
+ snprintf(fn, 128, "%s.th", pathname_base);
106
+ unlink(fn);
107
+ snprintf(fn, 128, "%s.lb", pathname_base);
108
+ unlink(fn);
109
+
110
+ return NO_ERROR;
111
+ }
112
+
113
+ wp_error* wp_segment_unload(wp_segment* s) {
114
+ RELAY_ERROR(mmap_obj_unload(&s->stringpool));
115
+ RELAY_ERROR(mmap_obj_unload(&s->stringmap));
116
+ RELAY_ERROR(mmap_obj_unload(&s->termhash));
117
+ RELAY_ERROR(mmap_obj_unload(&s->postings));
118
+ RELAY_ERROR(mmap_obj_unload(&s->labels));
119
+ return NO_ERROR;
120
+ }
121
+
122
+ RAISING_STATIC(bump_stringmap(wp_segment* s, int* success)) {
123
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
124
+
125
+ *success = 1;
126
+ if(stringmap_needs_bump(sh)) {
127
+ DEBUG("bumping stringmap size");
128
+ uint32_t next_size = stringmap_next_size(sh);
129
+ if(next_size <= stringmap_size(sh)) {
130
+ DEBUG("stringmap can't be bumped no more!");
131
+ *success = 0;
132
+ }
133
+ else {
134
+ RELAY_ERROR(mmap_obj_resize(&s->stringmap, next_size));
135
+ sh = MMAP_OBJ(s->stringmap, stringmap); // this could have changed!
136
+ stringmap_setup(sh, MMAP_OBJ(s->stringpool, stringpool));
137
+ RELAY_ERROR(stringmap_bump_size(sh));
138
+ }
139
+ }
140
+
141
+ return NO_ERROR;
142
+ }
143
+
144
+ RAISING_STATIC(bump_stringpool(wp_segment* s, int* success)) {
145
+ stringpool* sp = MMAP_OBJ(s->stringpool, stringpool);
146
+
147
+ *success = 1;
148
+ if(stringpool_needs_bump(sp)) {
149
+ DEBUG("bumping stringpool size");
150
+ uint32_t next_size = stringpool_next_size(sp);
151
+ if(next_size <= stringpool_size(sp)) {
152
+ DEBUG("stringpool can't be bumped no more!");
153
+ *success = 0;
154
+ }
155
+ else {
156
+ RELAY_ERROR(mmap_obj_resize(&s->stringpool, next_size));
157
+ sp = MMAP_OBJ(s->stringpool, stringpool); // may have changed!
158
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
159
+ sh->pool = sp; // need to update it here too
160
+ stringpool_bump_size(sp);
161
+ }
162
+ }
163
+
164
+ return NO_ERROR;
165
+ }
166
+
167
+ RAISING_STATIC(bump_termhash(wp_segment* s, int* success)) {
168
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
169
+
170
+ *success = 1;
171
+ if(termhash_needs_bump(th)) {
172
+ DEBUG("bumping termhash size");
173
+ uint32_t next_size = termhash_next_size(th);
174
+ if(next_size <= termhash_size(th)) {
175
+ DEBUG("termhash can't be bumped no more!");
176
+ *success = 0;
177
+ }
178
+ else {
179
+ RELAY_ERROR(mmap_obj_resize(&s->termhash, next_size));
180
+ th = MMAP_OBJ(s->termhash, termhash); // could have changed!
181
+ termhash_setup(th);
182
+ RELAY_ERROR(termhash_bump_size(th));
183
+ *success = 1;
184
+ }
185
+ }
186
+
187
+ return NO_ERROR;
188
+ }
189
+
190
+ RAISING_STATIC(postings_region_ensure_fit(mmap_obj* mmopr, uint32_t postings_bytes, int* success)) {
191
+ postings_region* pr = MMAP_OBJ_PTR(mmopr, postings_region);
192
+ uint32_t new_head = pr->postings_head + postings_bytes;
193
+
194
+ DEBUG("ensuring fit for %u postings bytes", postings_bytes);
195
+
196
+ uint32_t new_tail = pr->postings_tail;
197
+ while(new_tail <= new_head) new_tail = new_tail * 2;
198
+
199
+ if(new_tail > MAX_POSTINGS_REGION_SIZE) new_tail = MAX_POSTINGS_REGION_SIZE;
200
+ DEBUG("new tail will be %u, current is %u, max is %u", new_tail, pr->postings_tail, MAX_POSTINGS_REGION_SIZE);
201
+
202
+ if(new_tail <= new_head) { // can't increase enough
203
+ *success = 0;
204
+ return NO_ERROR;
205
+ }
206
+
207
+ if(new_tail != pr->postings_tail) { // need to resize
208
+ DEBUG("request for %u postings bytes, old tail is %u, new tail will be %u, max is %u\n", postings_bytes, pr->postings_tail, new_tail, MAX_POSTINGS_REGION_SIZE);
209
+ RELAY_ERROR(mmap_obj_resize(mmopr, new_tail));
210
+ pr = MMAP_OBJ_PTR(mmopr, postings_region); // may have changed!
211
+ pr->postings_tail = new_tail;
212
+ }
213
+
214
+ *success = 1;
215
+ return NO_ERROR;
216
+ }
217
+
218
+ // TODO make this function take the number of stringpool entries, the number of
219
+ // terms, etc rather than just being a heuristic for everything except for the
220
+ // postings list
221
+ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) {
222
+ RELAY_ERROR(postings_region_ensure_fit(&seg->postings, postings_bytes, success));
223
+ if(!*success) return NO_ERROR;
224
+
225
+ RELAY_ERROR(postings_region_ensure_fit(&seg->labels, label_bytes, success));
226
+ if(!*success) return NO_ERROR;
227
+
228
+ RELAY_ERROR(bump_stringmap(seg, success));
229
+ if(!*success) return NO_ERROR;
230
+
231
+ RELAY_ERROR(bump_stringpool(seg, success));
232
+ if(!*success) return NO_ERROR;
233
+
234
+ RELAY_ERROR(bump_termhash(seg, success));
235
+ if(!*success) return NO_ERROR;
236
+
237
+ DEBUG("fit of %u postings bytes ensured", postings_bytes);
238
+
239
+ return NO_ERROR;
240
+ }
241
+
242
+ static uint32_t size_of(uint32_t num_positions, pos_t positions[]) {
243
+ (void)positions;
244
+ uint32_t position_size = sizeof(pos_t) * num_positions;
245
+ uint32_t size = sizeof(posting) - sizeof(pos_t*) + position_size;
246
+
247
+ return size;
248
+ }
249
+
250
+ wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) {
251
+ (void)seg;
252
+ *size = size_of(num_positions, positions);
253
+ return NO_ERROR;
254
+ }
255
+
256
+ #define BITMASK 0x7f
257
+
258
+ RAISING_STATIC(write_multibyte(uint8_t* location, uint32_t val, uint32_t* size)) {
259
+ //printf("xx writing %u to position %p as:\n", val, location);
260
+ uint8_t* start = location;
261
+
262
+ while(val > BITMASK) {
263
+ uint8_t c = (val & BITMASK) | 0x80;
264
+ *location = c;
265
+ //printf("xx %d = %d | %d at %p\n", c, val & BITMASK, 0x80, location);
266
+ location++;
267
+ val >>= 7;
268
+ }
269
+ uint8_t c = (val & BITMASK);
270
+ *location = c;
271
+ //printf("xx %d at %p\n", c, location);
272
+ *size = location + 1 - start;
273
+ //printf("xx total %u bytes\n", *size);
274
+ return NO_ERROR;
275
+ }
276
+
277
+ RAISING_STATIC(read_multibyte(uint8_t* location, uint32_t* val, uint32_t* size)) {
278
+ uint8_t* start = location;
279
+ uint32_t shift = 0;
280
+
281
+ *val = 0;
282
+ while(*location & 0x80) {
283
+ //printf("yy read continue byte %d -> %d at %p\n", *location, *location & ~0x80, location);
284
+ *val |= (*location & ~0x80) << shift;
285
+ shift += 7;
286
+ location++;
287
+ }
288
+ *val |= *location << shift;
289
+ //printf("yy read final byte %d at %p\n", *location, location);
290
+ *size = location + 1 - start;
291
+ //printf("yy total %d bytes, val = %d\n\n", *size, *val);
292
+ return NO_ERROR;
293
+ }
294
+
295
+ /* write posting entry using a variable-byte encoding
296
+
297
+ unfortunately we can't write doc_id deltas, which is what would really make
298
+ this encoding pay off, because we write the entries in increasing doc_id
299
+ order but read them in decreasing order. so we write doc_ids raw.
300
+
301
+ for next_offsets, we write the delta against the current offset. since the
302
+ next_offset is guaranteed to be less than the current offset, we subtract
303
+ next from current.
304
+
305
+ positions are written as deltas.
306
+ */
307
+
308
+ RAISING_STATIC(write_posting(wp_segment* seg, posting* po, pos_t positions[])) {
309
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
310
+
311
+ uint32_t size;
312
+ uint32_t offset = pr->postings_head;
313
+
314
+ if(po->next_offset >= pr->postings_head) RAISE_ERROR("next_offset %u >= postings_head %u", po->next_offset, pr->postings_head);
315
+ if(po->num_positions == 0) RAISE_ERROR("num_positions == 0");
316
+
317
+ uint32_t doc_id = po->doc_id << 1;
318
+ if(po->num_positions == 1) doc_id |= 1; // marker for single postings
319
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], doc_id, &size));
320
+ pr->postings_head += size;
321
+ //printf("wrote %u-byte doc_id %u (np1 == %d)\n", size, doc_id, po->num_positions == 1 ? 1 : 0);
322
+
323
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], offset - po->next_offset, &size));
324
+ pr->postings_head += size;
325
+ //printf("wrote %u-byte offset %u\n", size, offset - po->next_offset);
326
+
327
+ if(po->num_positions > 1) {
328
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], po->num_positions, &size));
329
+ pr->postings_head += size;
330
+ //printf("wrote %u-byte num positions %u\n", size, po->num_positions);
331
+ }
332
+
333
+ for(uint32_t i = 0; i < po->num_positions; i++) {
334
+ RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], positions[i] - (i == 0 ? 0 : positions[i - 1]), &size));
335
+ pr->postings_head += size;
336
+ //printf("wrote %u-byte positions %u\n", size, positions[i] - (i == 0 ? 0 : positions[i - 1]));
337
+ }
338
+
339
+ //printf("done writing posting\n\n");
340
+
341
+ //printf(">>> done writing posting %d %d %d to %p\n\n", (prev_docid == 0 ? po->doc_id : prev_docid - po->doc_id), offset - po->next_offset, po->num_positions, &pr->postings[pl->postings_head]);
342
+ pr->num_postings++;
343
+
344
+ return NO_ERROR;
345
+ }
346
+
347
+ /* if include_positions is true, will malloc the positions array for you, and
348
+ * you must free it when done (assuming num_positions > 0)!
349
+ */
350
+
351
+ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) {
352
+ uint32_t size;
353
+ uint32_t orig_offset = offset;
354
+ postings_region* pr = MMAP_OBJ(s->postings, postings_region);
355
+
356
+ //DEBUG("reading posting from offset %u -> %p (pr %p base %p)", offset, &pr->postings[offset], pr, &pr->postings);
357
+
358
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->doc_id, &size));
359
+ int is_single_posting = po->doc_id & 1;
360
+ po->doc_id = po->doc_id >> 1;
361
+ //DEBUG("read doc_id %u (%u bytes)", po->doc_id, size);
362
+ offset += size;
363
+
364
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->next_offset, &size));
365
+ //DEBUG("read next_offset %u -> %u (%u bytes)", po->next_offset, orig_offset - po->next_offset, size);
366
+ if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and < %u", po->next_offset, orig_offset);
367
+ po->next_offset = orig_offset - po->next_offset;
368
+ offset += size;
369
+
370
+ if(include_positions) {
371
+ if(is_single_posting) po->num_positions = 1;
372
+ else {
373
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->num_positions, &size));
374
+ //DEBUG("read num_positions: %u (%u bytes)", po->num_positions, size);
375
+ offset += size;
376
+ }
377
+
378
+ po->positions = malloc(po->num_positions * sizeof(pos_t));
379
+
380
+ for(uint32_t i = 0; i < po->num_positions; i++) {
381
+ RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->positions[i], &size));
382
+ offset += size;
383
+ po->positions[i] += (i == 0 ? 0 : po->positions[i - 1]);
384
+ //DEBUG("read position %u (%u bytes)", po->positions[i], size);
385
+ }
386
+ }
387
+ else {
388
+ po->num_positions = 0;
389
+ po->positions = NULL;
390
+ }
391
+ //DEBUG("total record took %u bytes", offset - orig_offset);
392
+ //printf("*** read posting %u %u %u from %u\n", po->doc_id, po->next_offset, po->num_positions, orig_offset);
393
+
394
+ return NO_ERROR;
395
+ }
396
+
397
+ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
398
+ // TODO move this logic up to ensure_fit()
399
+ int success;
400
+ RELAY_ERROR(bump_stringmap(s, &success));
401
+ RELAY_ERROR(bump_stringpool(s, &success));
402
+ RELAY_ERROR(bump_termhash(s, &success));
403
+
404
+ DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
405
+
406
+ postings_region* pr = MMAP_OBJ(s->postings, postings_region);
407
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
408
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
409
+
410
+ // construct the term object
411
+ term t;
412
+ RELAY_ERROR(stringmap_add(sh, field, &t.field_s));
413
+ RELAY_ERROR(stringmap_add(sh, word, &t.word_s));
414
+
415
+ // find the offset of the next posting
416
+ posting po;
417
+ uint32_t next_offset = termhash_get_val(th, t);
418
+ if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
419
+ if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
420
+ RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
421
+ if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
422
+ }
423
+
424
+ // write the entry to the postings region
425
+ uint32_t entry_offset = pr->postings_head;
426
+ //DEBUG("entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
427
+ po.doc_id = doc_id;
428
+ po.next_offset = next_offset;
429
+ po.num_positions = num_positions;
430
+ RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
431
+ DEBUG("postings list head now at %u", pr->postings_head);
432
+
433
+ // really finally, update the tail pointer so that readers can access this posting
434
+ RELAY_ERROR(termhash_put_val(th, t, entry_offset));
435
+
436
+ return NO_ERROR;
437
+ }
438
+
439
+ /*
440
+ * currently, labels are implemented as a separate postings space and separate
441
+ * postings structure, but with the same term hash (the offsets just are
442
+ * relative to the different space).
443
+ *
444
+ * we use the sentinel field value 0 to demarcate a label. since no strings have
445
+ * stringmap value 0, this is safe.
446
+ *
447
+ * we also maintain a free list of unused label postings. since all label
448
+ * postings are the same size, we can do this to reuse them and avoid losing
449
+ * space in this area; since label postings can be changed frequently, this is
450
+ * desirable. we use the sentinel postings value field=0 word=0 to keep track
451
+ * of this list.
452
+ *
453
+ */
454
+ wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
455
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
456
+
457
+ label_posting* lp = wp_segment_label_posting_at(pr, offset);
458
+ po->doc_id = lp->doc_id;
459
+ po->next_offset = lp->next_offset;
460
+ po->num_positions = 0;
461
+ po->positions = NULL;
462
+
463
+ return NO_ERROR;
464
+ }
465
+
466
+ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
467
+ // TODO move this logic up to ensure_fit()
468
+ int success;
469
+ RELAY_ERROR(bump_stringmap(s, &success));
470
+ RELAY_ERROR(bump_stringpool(s, &success));
471
+ RELAY_ERROR(bump_termhash(s, &success));
472
+
473
+ DEBUG("adding label %s to doc %u", label, doc_id);
474
+
475
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
476
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
477
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
478
+
479
+ // construct the term object. term objects for labels have the special
480
+ // sentinel field value 0
481
+ term t;
482
+ t.field_s = 0; // label sentinel value
483
+ RELAY_ERROR(stringmap_add(sh, label, &t.word_s)); // get word key
484
+
485
+ // find the previous and next label postings, between which we'll insert this
486
+ // posting
487
+ uint32_t prev_offset = OFFSET_NONE;
488
+ uint32_t next_offset = termhash_get_val(th, t);
489
+ if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
490
+
491
+ while(next_offset != OFFSET_NONE) {
492
+ label_posting* po = wp_segment_label_posting_at(pr, next_offset);
493
+ if(po->doc_id == doc_id) {
494
+ DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
495
+ return NO_ERROR;
496
+ }
497
+ else if(po->doc_id < doc_id) break;
498
+ prev_offset = next_offset;
499
+ next_offset = po->next_offset;
500
+ }
501
+
502
+ // find a space for the posting by first checking for a free postings in the
503
+ // dead list. the dead list is the list stored under the sentinel term
504
+ // with field 0 and word 0.
505
+ term dead_term = { .field_s = 0, .word_s = 0 };
506
+ uint32_t entry_offset;
507
+ uint32_t dead_offset = termhash_get_val(th, dead_term);
508
+ if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
509
+
510
+ if(dead_offset == OFFSET_NONE) { // make a new posting
511
+ entry_offset = pr->postings_head;
512
+ }
513
+ else { // we'll use this one; remove it from the linked list
514
+ DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
515
+ entry_offset = dead_offset;
516
+ RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
517
+ }
518
+
519
+ // finally, write the entry to the label postings region
520
+ DEBUG("label entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
521
+ label_posting* po = wp_segment_label_posting_at(pr, entry_offset);
522
+ po->doc_id = doc_id;
523
+ po->next_offset = next_offset;
524
+
525
+ pr->postings_head += sizeof(label_posting);
526
+ DEBUG("label postings list head now at %u", pr->postings_head);
527
+
528
+ // really finally, update either the previous offset or the tail pointer
529
+ // for this label so that readers can access this posting
530
+ if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
531
+ else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
532
+
533
+ return NO_ERROR;
534
+ }
535
+
536
+ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) {
537
+ // TODO move this logic to ensure_fit
538
+ int success;
539
+ RELAY_ERROR(bump_termhash(s, &success)); // we might add an entry for the dead list
540
+
541
+ postings_region* pr = MMAP_OBJ(s->labels, postings_region);
542
+ stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
543
+ termhash* th = MMAP_OBJ(s->termhash, termhash);
544
+
545
+ // construct the term object. term objects for labels have the special
546
+ // sentinel field value 0
547
+ term t;
548
+ t.field_s = 0; // label sentinel value
549
+ t.word_s = stringmap_string_to_int(sh, label); // will be -1 if not there
550
+
551
+ // find the posting and the previous posting in the list, if any
552
+ uint32_t prev_offset = OFFSET_NONE;
553
+ uint32_t offset = termhash_get_val(th, t);
554
+ if(offset == (uint32_t)-1) offset = OFFSET_NONE;
555
+ label_posting* lp = NULL;
556
+
557
+ while(offset != OFFSET_NONE) {
558
+ lp = wp_segment_label_posting_at(pr, offset);
559
+ if(lp->doc_id < doc_id) offset = OFFSET_NONE; // nasty hack to induce failure
560
+ if(lp->doc_id <= doc_id) break;
561
+ prev_offset = offset;
562
+ offset = lp->next_offset;
563
+ }
564
+
565
+ DEBUG("found label posting for doc %u at offset %u; prev_offset is %u", doc_id, offset, prev_offset);
566
+
567
+ if(offset == OFFSET_NONE) {
568
+ DEBUG("no label %s found for doc %u", label, doc_id);
569
+ return NO_ERROR;
570
+ }
571
+
572
+ // we've found the posting; now remove it from the list
573
+ if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, lp->next_offset));
574
+ else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
575
+
576
+ // now add it to the dead list for later reclamation
577
+ term dead_term = { .field_s = 0, .word_s = 0 };
578
+ uint32_t dead_offset = termhash_get_val(th, dead_term);
579
+ if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
580
+
581
+ lp->next_offset = dead_offset;
582
+ DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
583
+ RELAY_ERROR(termhash_put_val(th, dead_term, offset));
584
+
585
+ return NO_ERROR;
586
+ }
587
+
588
+ wp_error* wp_segment_grab_docid(wp_segment* segment, docid_t* doc_id) {
589
+ postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
590
+ *doc_id = ++pr->num_docs;
591
+ return NO_ERROR;
592
+ }
593
+
594
+ wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
595
+ postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
596
+ stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
597
+ stringpool* sp = MMAP_OBJ(segment->stringpool, stringpool);
598
+ termhash* th = MMAP_OBJ(segment->termhash, termhash);
599
+
600
+ #define p(a, b) 100.0 * (float)a / (float)b
601
+
602
+ fprintf(stream, "segment has type %u\n", pr->index_type_and_flags);
603
+ fprintf(stream, "segment has %u docs and %u postings\n", pr->num_docs, pr->num_postings);
604
+ fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.header->size / 1024, p(pr->postings_head, pr->postings_tail));
605
+ fprintf(stream, " string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.header->size / 1024, p(sh->n_occupied, sh->n_buckets));
606
+ fprintf(stream, " stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.header->size / 1024, p(sp->next, sp->size));
607
+ fprintf(stream, " term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.header->size / 1024, p(th->n_occupied, th->n_buckets));
608
+
609
+ return NO_ERROR;
610
+ }
611
+
612
+ uint64_t wp_segment_num_docs(wp_segment* seg) {
613
+ postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
614
+ return pr->num_docs;
615
+ }