whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef WP_SEARCH_H_
|
2
|
+
#define WP_SEARCH_H_
|
3
|
+
|
4
|
+
// whistlepig search code
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// what you need to know about search:
|
8
|
+
// 1. it runs on a per-segment basis; and
|
9
|
+
// 2. query objects maintain search state internally.
|
10
|
+
//
|
11
|
+
// to run a query on a segment, you need to use this call sequence:
|
12
|
+
//
|
13
|
+
// 1. wp_search_init_search_state
|
14
|
+
// 2. wp_search_run_query_on_segment (zero or more times)
|
15
|
+
// 3. wp_search_release_search_state
|
16
|
+
//
|
17
|
+
// because the query objects maintain state, you can repeat step 2 as much as
|
18
|
+
// you'd like to get more results without doing any duplicate work. if you
|
19
|
+
// don't do step 3, you'll leak memory.
|
20
|
+
//
|
21
|
+
// the corollary is that if you want to do multithreaded search across segments
|
22
|
+
// in parallel, you will have to clone the query for each segment to avoid
|
23
|
+
// sharing state.
|
24
|
+
//
|
25
|
+
// (right now the index does a serial search across segments, so cloning is not
|
26
|
+
// required.)
|
27
|
+
|
28
|
+
#include <stdint.h>
|
29
|
+
|
30
|
+
#include "defaults.h"
|
31
|
+
#include "segment.h"
|
32
|
+
#include "query.h"
|
33
|
+
#include "error.h"
|
34
|
+
|
35
|
+
// a match of a particular fielded phrase on a particular document
|
36
|
+
typedef struct doc_match {
|
37
|
+
const char* field;
|
38
|
+
const char* word;
|
39
|
+
uint16_t num_positions;
|
40
|
+
pos_t* positions;
|
41
|
+
} doc_match;
|
42
|
+
|
43
|
+
// a generic match on a document of a search stream
|
44
|
+
typedef struct search_result {
|
45
|
+
docid_t doc_id;
|
46
|
+
uint16_t num_doc_matches;
|
47
|
+
doc_match* doc_matches;
|
48
|
+
} search_result;
|
49
|
+
|
50
|
+
struct wp_segment;
|
51
|
+
struct wp_query;
|
52
|
+
struct wp_error;
|
53
|
+
|
54
|
+
// API methods
|
55
|
+
|
56
|
+
// initialize the query search state for running on segment s. this must precede any call
|
57
|
+
// to wp_search_run_query_on_segment.
|
58
|
+
wp_error* wp_search_init_search_state(struct wp_query* q, struct wp_segment* s) RAISES_ERROR;
|
59
|
+
|
60
|
+
// release any query search state. this must follow any call to wp_search_run_query_on_segment.
|
61
|
+
wp_error* wp_search_release_search_state(struct wp_query* q) RAISES_ERROR;
|
62
|
+
|
63
|
+
// run a query on a segment, filling at most max_num_results slots in results.
|
64
|
+
// this is the main entry point into the actual search logic, and is called by
|
65
|
+
// index.c in various ways. this must be preceded by an init_search_state and
|
66
|
+
// followed by a release_search_state.
|
67
|
+
//
|
68
|
+
// if you get num_results > 0, you should call wp_search_result_free on each of the
|
69
|
+
// results when you're done with them.
|
70
|
+
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) RAISES_ERROR;
|
71
|
+
|
72
|
+
// if you got non-zero num_results from wp_search_run_query_on_segment, call
|
73
|
+
// this on each result when you're done with it.
|
74
|
+
void wp_search_result_free(search_result* result);
|
75
|
+
|
76
|
+
#endif
|
@@ -0,0 +1,615 @@
|
|
1
|
+
#include <sys/stat.h>
|
2
|
+
#include <fcntl.h>
|
3
|
+
#include <unistd.h>
|
4
|
+
#include "whistlepig.h"
|
5
|
+
|
6
|
+
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
|
7
|
+
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
|
8
|
+
|
9
|
+
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
|
10
|
+
|
11
|
+
// Put a postings region into its empty state.
//
//   pr                    region to initialize
//   initial_size          stored as the region's postings_tail
//   index_type_and_flags  type tag stored in the region (checked later by
//                         postings_region_validate)
static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t index_type_and_flags) {
  // nothing has been indexed yet
  pr->num_postings = 0;
  pr->num_docs = 0;

  // skip one byte, which is reserved as OFFSET_NONE
  pr->postings_head = 1;
  pr->postings_tail = initial_size;

  pr->index_type_and_flags = index_type_and_flags;
}
|
18
|
+
|
19
|
+
// Check that a loaded postings region carries the expected type tag.
//
// Raises an error when pr->index_type_and_flags differs from
// index_type_and_flags; returns NO_ERROR otherwise.
RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t index_type_and_flags)) {
  if(pr->index_type_and_flags != index_type_and_flags) {
    RAISE_ERROR("segment has index type %u; expecting type %u", pr->index_type_and_flags, index_type_and_flags);
  }

  return NO_ERROR;
}
|
23
|
+
|
24
|
+
#define INITIAL_POSTINGS_SIZE 2048
|
25
|
+
#define FN_SIZE 1024
|
26
|
+
|
27
|
+
// Load an existing segment whose component files share the prefix
// pathname_base. Five files are opened with mmap_obj_load: the string pool
// (.sp), the string hash (.sh_), the term hash (.th), the postings region
// (. WP_SEGMENT_POSTING_REGION_PATH_SUFFIX) and the labels region (.lb).
// Both postings regions are validated against their expected type tag.
//
// Raises an error if any component filename does not fit in FN_SIZE bytes
// (previously such names were silently truncated to 127 characters), or if
// any load/validation step fails.
//
// NOTE(review): components that were already loaded are not unloaded when a
// later step fails -- confirm whether callers clean up with
// wp_segment_unload in that case.
wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) {
  char fn[FN_SIZE];
  int len;

  // open the string pool
  len = snprintf(fn, FN_SIZE, "%s.sp", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->stringpool, "ti/stringpool", fn));

  // open the string hash
  len = snprintf(fn, FN_SIZE, "%s.sh_", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->stringmap, "ti/stringmap", fn));
  stringmap_setup(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));

  // open the term hash
  len = snprintf(fn, FN_SIZE, "%s.th", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->termhash, "ti/termhash", fn));
  termhash_setup(MMAP_OBJ(segment->termhash, termhash));

  // open the postings region
  len = snprintf(fn, FN_SIZE, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->postings, "ti/postings", fn));
  RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->postings, postings_region), POSTINGS_REGION_TYPE_IMMUTABLE_VBE));

  // open the labels postings region
  len = snprintf(fn, FN_SIZE, "%s.lb", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->labels, "ti/labels", fn));
  RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->labels, postings_region), POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS));

  return NO_ERROR;
}
|
56
|
+
|
57
|
+
// Create a new, empty segment whose component files share the prefix
// pathname_base. Each component is created with mmap_obj_create and then
// initialized: string pool (.sp), string hash (.sh_), term hash (.th),
// postings region (. WP_SEGMENT_POSTING_REGION_PATH_SUFFIX) and labels
// region (.lb). Both postings regions start with INITIAL_POSTINGS_SIZE bytes
// of postings space.
//
// Raises an error if any component filename does not fit in FN_SIZE bytes
// (previously such names were silently truncated to 127 characters, which
// would have created files under the wrong name), or if any creation step
// fails.
wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) {
  char fn[FN_SIZE];
  int len;

  // create the string pool
  len = snprintf(fn, FN_SIZE, "%s.sp", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->stringpool, "ti/stringpool", fn, stringpool_initial_size()));
  stringpool_init(MMAP_OBJ(segment->stringpool, stringpool));

  // create the string hash
  len = snprintf(fn, FN_SIZE, "%s.sh_", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->stringmap, "ti/stringmap", fn, stringmap_initial_size()));
  stringmap_init(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));

  // create the term hash
  len = snprintf(fn, FN_SIZE, "%s.th", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->termhash, "ti/termhash", fn, termhash_initial_size()));
  termhash_init(MMAP_OBJ(segment->termhash, termhash));

  // create the postings region
  len = snprintf(fn, FN_SIZE, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->postings, "ti/postings", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
  postings_region_init(MMAP_OBJ(segment->postings, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_IMMUTABLE_VBE);

  // create the labels postings region
  len = snprintf(fn, FN_SIZE, "%s.lb", pathname_base);
  if(len < 0 || len >= FN_SIZE) RAISE_ERROR("pathname too long for segment: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->labels, "ti/labels", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
  postings_region_init(MMAP_OBJ(segment->labels, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS);

  return NO_ERROR;
}
|
87
|
+
|
88
|
+
int wp_segment_exists(const char* pathname_base) {
|
89
|
+
struct stat fstat;
|
90
|
+
char fn[FN_SIZE];
|
91
|
+
|
92
|
+
snprintf(fn, 128, "%s.sp", pathname_base);
|
93
|
+
return !stat(fn, &fstat);
|
94
|
+
}
|
95
|
+
|
96
|
+
wp_error* wp_segment_delete(const char* pathname_base) {
|
97
|
+
char fn[FN_SIZE];
|
98
|
+
|
99
|
+
snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
|
100
|
+
unlink(fn);
|
101
|
+
snprintf(fn, 128, "%s.sp", pathname_base);
|
102
|
+
unlink(fn);
|
103
|
+
snprintf(fn, 128, "%s.sh_", pathname_base);
|
104
|
+
unlink(fn);
|
105
|
+
snprintf(fn, 128, "%s.th", pathname_base);
|
106
|
+
unlink(fn);
|
107
|
+
snprintf(fn, 128, "%s.lb", pathname_base);
|
108
|
+
unlink(fn);
|
109
|
+
|
110
|
+
return NO_ERROR;
|
111
|
+
}
|
112
|
+
|
113
|
+
// Unload all five mmap-backed components of a segment, in the order
// stringpool, stringmap, termhash, postings, labels. Stops at, and relays,
// the first error; later components are not unloaded in that case.
wp_error* wp_segment_unload(wp_segment* s) {
  mmap_obj* components[] = {
    &s->stringpool,
    &s->stringmap,
    &s->termhash,
    &s->postings,
    &s->labels
  };

  for(unsigned int i = 0; i < sizeof(components) / sizeof(components[0]); i++) {
    RELAY_ERROR(mmap_obj_unload(components[i]));
  }

  return NO_ERROR;
}
|
121
|
+
|
122
|
+
// Grow the segment's stringmap if stringmap_needs_bump() says so.
//
// *success is set to 0 only when a bump is needed but the next size is not
// larger than the current one (the map cannot grow any further); in every
// other non-error case it is 1. Resize and bump errors are relayed.
RAISING_STATIC(bump_stringmap(wp_segment* s, int* success)) {
  stringmap* map = MMAP_OBJ(s->stringmap, stringmap);

  *success = 1;
  if(!stringmap_needs_bump(map)) return NO_ERROR;

  DEBUG("bumping stringmap size");
  uint32_t next_size = stringmap_next_size(map);

  if(next_size <= stringmap_size(map)) {
    DEBUG("stringmap can't be bumped no more!");
    *success = 0;
    return NO_ERROR;
  }

  RELAY_ERROR(mmap_obj_resize(&s->stringmap, next_size));

  // the resize could have moved the object, so re-fetch it before use
  map = MMAP_OBJ(s->stringmap, stringmap);
  stringmap_setup(map, MMAP_OBJ(s->stringpool, stringpool));
  RELAY_ERROR(stringmap_bump_size(map));

  return NO_ERROR;
}
|
143
|
+
|
144
|
+
// Grow the segment's stringpool if stringpool_needs_bump() says so.
//
// *success is set to 0 only when a bump is needed but the next size is not
// larger than the current one; otherwise it is 1. After a resize the
// stringmap's pool pointer is refreshed as well, since the pool may have
// moved.
RAISING_STATIC(bump_stringpool(wp_segment* s, int* success)) {
  stringpool* pool = MMAP_OBJ(s->stringpool, stringpool);

  *success = 1;
  if(!stringpool_needs_bump(pool)) return NO_ERROR;

  DEBUG("bumping stringpool size");
  uint32_t next_size = stringpool_next_size(pool);

  if(next_size <= stringpool_size(pool)) {
    DEBUG("stringpool can't be bumped no more!");
    *success = 0;
    return NO_ERROR;
  }

  RELAY_ERROR(mmap_obj_resize(&s->stringpool, next_size));

  // the pool may have changed location; re-fetch it and re-point the map at it
  pool = MMAP_OBJ(s->stringpool, stringpool);
  stringmap* map = MMAP_OBJ(s->stringmap, stringmap);
  map->pool = pool;
  stringpool_bump_size(pool);

  return NO_ERROR;
}
|
166
|
+
|
167
|
+
// Grow the segment's termhash if termhash_needs_bump() says so.
//
// *success is set to 0 only when a bump is needed but the next size is not
// larger than the current one; otherwise it is 1. Resize and bump errors are
// relayed.
RAISING_STATIC(bump_termhash(wp_segment* s, int* success)) {
  termhash* hash = MMAP_OBJ(s->termhash, termhash);

  *success = 1;
  if(!termhash_needs_bump(hash)) return NO_ERROR;

  DEBUG("bumping termhash size");
  uint32_t next_size = termhash_next_size(hash);

  if(next_size <= termhash_size(hash)) {
    DEBUG("termhash can't be bumped no more!");
    *success = 0;
    return NO_ERROR;
  }

  RELAY_ERROR(mmap_obj_resize(&s->termhash, next_size));

  // the resize could have moved the object, so re-fetch it before use
  hash = MMAP_OBJ(s->termhash, termhash);
  termhash_setup(hash);
  RELAY_ERROR(termhash_bump_size(hash));

  return NO_ERROR;
}
|
189
|
+
|
190
|
+
// Make sure a postings region has room for postings_bytes more bytes past
// its current head, growing the backing mmap object if needed.
//
// The tail is doubled until it exceeds head + postings_bytes, then capped at
// MAX_POSTINGS_REGION_SIZE. If even the capped tail is not beyond the needed
// head, *success is set to 0 and nothing is resized; otherwise *success is 1.
//
// The size arithmetic is done in 64 bits: in 32 bits, head + postings_bytes
// could wrap around (making a too-small region look big enough) and the
// doubling could wrap to 0 and loop forever. A stored tail of 0, which would
// also never terminate the doubling, is reported as an error.
//
// NOTE(review): the region is created with sizeof(postings_region) +
// INITIAL_POSTINGS_SIZE bytes but resized to new_tail bytes only -- confirm
// against mmap_obj_resize whether the struct header needs to be added here.
RAISING_STATIC(postings_region_ensure_fit(mmap_obj* mmopr, uint32_t postings_bytes, int* success)) {
  postings_region* pr = MMAP_OBJ_PTR(mmopr, postings_region);

  DEBUG("ensuring fit for %u postings bytes", postings_bytes);

  if(pr->postings_tail == 0) RAISE_ERROR("postings region has a zero-sized tail");

  uint64_t needed_head = (uint64_t)pr->postings_head + postings_bytes;
  uint64_t wanted_tail = pr->postings_tail;
  while(wanted_tail <= needed_head) wanted_tail *= 2;

  if(wanted_tail > MAX_POSTINGS_REGION_SIZE) wanted_tail = MAX_POSTINGS_REGION_SIZE;
  if(wanted_tail > UINT32_MAX) wanted_tail = UINT32_MAX; // offsets are 32-bit
  uint32_t new_tail = (uint32_t)wanted_tail;
  DEBUG("new tail will be %u, current is %u, max is %u", new_tail, pr->postings_tail, MAX_POSTINGS_REGION_SIZE);

  if(new_tail <= needed_head) { // can't increase enough
    *success = 0;
    return NO_ERROR;
  }

  if(new_tail != pr->postings_tail) { // need to resize
    DEBUG("request for %u postings bytes, old tail is %u, new tail will be %u, max is %u\n", postings_bytes, pr->postings_tail, new_tail, MAX_POSTINGS_REGION_SIZE);
    RELAY_ERROR(mmap_obj_resize(mmopr, new_tail));
    pr = MMAP_OBJ_PTR(mmopr, postings_region); // may have changed!
    pr->postings_tail = new_tail;
  }

  *success = 1;
  return NO_ERROR;
}
|
217
|
+
|
218
|
+
// TODO make this function take the number of stringpool entries, the number of
|
219
|
+
// terms, etc rather than just being a heuristic for everything except for the
|
220
|
+
// postings list
|
221
|
+
wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) {
|
222
|
+
RELAY_ERROR(postings_region_ensure_fit(&seg->postings, postings_bytes, success));
|
223
|
+
if(!*success) return NO_ERROR;
|
224
|
+
|
225
|
+
RELAY_ERROR(postings_region_ensure_fit(&seg->labels, label_bytes, success));
|
226
|
+
if(!*success) return NO_ERROR;
|
227
|
+
|
228
|
+
RELAY_ERROR(bump_stringmap(seg, success));
|
229
|
+
if(!*success) return NO_ERROR;
|
230
|
+
|
231
|
+
RELAY_ERROR(bump_stringpool(seg, success));
|
232
|
+
if(!*success) return NO_ERROR;
|
233
|
+
|
234
|
+
RELAY_ERROR(bump_termhash(seg, success));
|
235
|
+
if(!*success) return NO_ERROR;
|
236
|
+
|
237
|
+
DEBUG("fit of %u postings bytes ensured", postings_bytes);
|
238
|
+
|
239
|
+
return NO_ERROR;
|
240
|
+
}
|
241
|
+
|
242
|
+
// Size in bytes of a posting record holding num_positions positions: the
// posting struct minus its positions pointer, plus one pos_t per position.
// The positions themselves are not inspected.
static uint32_t size_of(uint32_t num_positions, pos_t positions[]) {
  (void)positions;

  uint32_t fixed_part = sizeof(posting) - sizeof(pos_t*);
  uint32_t variable_part = sizeof(pos_t) * num_positions;

  return fixed_part + variable_part;
}
|
249
|
+
|
250
|
+
// Public wrapper around size_of(): stores in *size the byte size computed
// for a posting with num_positions positions. The segment argument is
// unused. Always returns NO_ERROR.
wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) {
  (void)seg;

  uint32_t nbytes = size_of(num_positions, positions);
  *size = nbytes;

  return NO_ERROR;
}
|
255
|
+
|
256
|
+
#define BITMASK 0x7f
|
257
|
+
|
258
|
+
// Write val at location using a variable-byte encoding: 7 bits per byte,
// least-significant group first, with the high bit (0x80) set on every byte
// except the last. The number of bytes written is stored in *size.
// Always returns NO_ERROR.
RAISING_STATIC(write_multibyte(uint8_t* location, uint32_t val, uint32_t* size)) {
  uint32_t written = 0;

  // emit continuation bytes while more than 7 bits remain
  for(; val > BITMASK; val >>= 7) {
    location[written++] = (uint8_t)((val & BITMASK) | 0x80);
  }

  // final byte: high bit clear
  location[written++] = (uint8_t)(val & BITMASK);

  *size = written;
  return NO_ERROR;
}
|
276
|
+
|
277
|
+
// Read a variable-byte encoded value from location (the format produced by
// write_multibyte: 7 bits per byte, least-significant group first, high bit
// set on every byte but the last). The decoded value is stored in *val and
// the number of bytes consumed in *size.
//
// Each 7-bit group is widened to uint32_t before shifting; the original
// shifted a promoted int, which is undefined once a group lands in the top
// bits. A 32-bit value needs at most 5 bytes, so a fifth continuation byte
// raises an error instead of shifting by 32 or more bits (also undefined)
// and running on through corrupt data. On error *val and *size are left
// untouched.
RAISING_STATIC(read_multibyte(uint8_t* location, uint32_t* val, uint32_t* size)) {
  uint8_t* start = location;
  uint32_t shift = 0;
  uint32_t result = 0;

  while(*location & 0x80) {
    // continuation bytes may only appear at shifts 0, 7, 14 and 21
    if(shift >= 28) RAISE_ERROR("invalid multibyte value: longer than 5 bytes");
    result |= (uint32_t)(*location & 0x7f) << shift;
    shift += 7;
    location++;
  }
  result |= (uint32_t)*location << shift; // final byte, high bit clear

  *val = result;
  *size = (uint32_t)(location + 1 - start);
  return NO_ERROR;
}
|
294
|
+
|
295
|
+
/* write posting entry using a variable-byte encoding
|
296
|
+
|
297
|
+
unfortunately we can't write doc_id deltas, which is what would really make
|
298
|
+
this encoding pay off, because we write the entries in increasing doc_id
|
299
|
+
order but read them in decreasing order. so we write doc_ids raw.
|
300
|
+
|
301
|
+
for next_offsets, we write the delta against the current offset. since the
|
302
|
+
next_offset is guaranteed to be less than the current offset, we subtract
|
303
|
+
next from current.
|
304
|
+
|
305
|
+
positions are written as deltas.
|
306
|
+
*/
|
307
|
+
|
308
|
+
RAISING_STATIC(write_posting(wp_segment* seg, posting* po, pos_t positions[])) {
|
309
|
+
postings_region* pr = MMAP_OBJ(seg->postings, postings_region);
|
310
|
+
|
311
|
+
uint32_t size;
|
312
|
+
uint32_t offset = pr->postings_head;
|
313
|
+
|
314
|
+
if(po->next_offset >= pr->postings_head) RAISE_ERROR("next_offset %u >= postings_head %u", po->next_offset, pr->postings_head);
|
315
|
+
if(po->num_positions == 0) RAISE_ERROR("num_positions == 0");
|
316
|
+
|
317
|
+
uint32_t doc_id = po->doc_id << 1;
|
318
|
+
if(po->num_positions == 1) doc_id |= 1; // marker for single postings
|
319
|
+
RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], doc_id, &size));
|
320
|
+
pr->postings_head += size;
|
321
|
+
//printf("wrote %u-byte doc_id %u (np1 == %d)\n", size, doc_id, po->num_positions == 1 ? 1 : 0);
|
322
|
+
|
323
|
+
RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], offset - po->next_offset, &size));
|
324
|
+
pr->postings_head += size;
|
325
|
+
//printf("wrote %u-byte offset %u\n", size, offset - po->next_offset);
|
326
|
+
|
327
|
+
if(po->num_positions > 1) {
|
328
|
+
RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], po->num_positions, &size));
|
329
|
+
pr->postings_head += size;
|
330
|
+
//printf("wrote %u-byte num positions %u\n", size, po->num_positions);
|
331
|
+
}
|
332
|
+
|
333
|
+
for(uint32_t i = 0; i < po->num_positions; i++) {
|
334
|
+
RELAY_ERROR(write_multibyte(&pr->postings[pr->postings_head], positions[i] - (i == 0 ? 0 : positions[i - 1]), &size));
|
335
|
+
pr->postings_head += size;
|
336
|
+
//printf("wrote %u-byte positions %u\n", size, positions[i] - (i == 0 ? 0 : positions[i - 1]));
|
337
|
+
}
|
338
|
+
|
339
|
+
//printf("done writing posting\n\n");
|
340
|
+
|
341
|
+
//printf(">>> done writing posting %d %d %d to %p\n\n", (prev_docid == 0 ? po->doc_id : prev_docid - po->doc_id), offset - po->next_offset, po->num_positions, &pr->postings[pl->postings_head]);
|
342
|
+
pr->num_postings++;
|
343
|
+
|
344
|
+
return NO_ERROR;
|
345
|
+
}
|
346
|
+
|
347
|
+
/* if include_positions is true, will malloc the positions array for you, and
|
348
|
+
* you must free it when done (assuming num_positions > 0)!
|
349
|
+
*/
|
350
|
+
|
351
|
+
wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) {
|
352
|
+
uint32_t size;
|
353
|
+
uint32_t orig_offset = offset;
|
354
|
+
postings_region* pr = MMAP_OBJ(s->postings, postings_region);
|
355
|
+
|
356
|
+
//DEBUG("reading posting from offset %u -> %p (pr %p base %p)", offset, &pr->postings[offset], pr, &pr->postings);
|
357
|
+
|
358
|
+
RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->doc_id, &size));
|
359
|
+
int is_single_posting = po->doc_id & 1;
|
360
|
+
po->doc_id = po->doc_id >> 1;
|
361
|
+
//DEBUG("read doc_id %u (%u bytes)", po->doc_id, size);
|
362
|
+
offset += size;
|
363
|
+
|
364
|
+
RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->next_offset, &size));
|
365
|
+
//DEBUG("read next_offset %u -> %u (%u bytes)", po->next_offset, orig_offset - po->next_offset, size);
|
366
|
+
if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and < %u", po->next_offset, orig_offset);
|
367
|
+
po->next_offset = orig_offset - po->next_offset;
|
368
|
+
offset += size;
|
369
|
+
|
370
|
+
if(include_positions) {
|
371
|
+
if(is_single_posting) po->num_positions = 1;
|
372
|
+
else {
|
373
|
+
RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->num_positions, &size));
|
374
|
+
//DEBUG("read num_positions: %u (%u bytes)", po->num_positions, size);
|
375
|
+
offset += size;
|
376
|
+
}
|
377
|
+
|
378
|
+
po->positions = malloc(po->num_positions * sizeof(pos_t));
|
379
|
+
|
380
|
+
for(uint32_t i = 0; i < po->num_positions; i++) {
|
381
|
+
RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->positions[i], &size));
|
382
|
+
offset += size;
|
383
|
+
po->positions[i] += (i == 0 ? 0 : po->positions[i - 1]);
|
384
|
+
//DEBUG("read position %u (%u bytes)", po->positions[i], size);
|
385
|
+
}
|
386
|
+
}
|
387
|
+
else {
|
388
|
+
po->num_positions = 0;
|
389
|
+
po->positions = NULL;
|
390
|
+
}
|
391
|
+
//DEBUG("total record took %u bytes", offset - orig_offset);
|
392
|
+
//printf("*** read posting %u %u %u from %u\n", po->doc_id, po->next_offset, po->num_positions, orig_offset);
|
393
|
+
|
394
|
+
return NO_ERROR;
|
395
|
+
}
|
396
|
+
|
397
|
+
// Add a posting for field:word on document doc_id, with the given positions,
// to the head of that term's postings list.
//
// Steps: give the stringmap, stringpool and termhash a chance to grow; map
// field and word to string ids to form the term key; look up the term's
// current head posting and reject a doc_id that is not greater than it;
// write the new posting at the postings head; and finally point the term at
// the new posting.
//
// NOTE(review): the success flag reported by the bump_* helpers is not
// examined here -- confirm callers always run wp_segment_ensure_fit first.
wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
  // TODO move this logic up to ensure_fit()
  int bumped;
  RELAY_ERROR(bump_stringmap(s, &bumped));
  RELAY_ERROR(bump_stringpool(s, &bumped));
  RELAY_ERROR(bump_termhash(s, &bumped));

  DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);

  // fetched only after the bumps above, which re-fetch these objects
  // themselves because a resize can move them
  termhash* terms = MMAP_OBJ(s->termhash, termhash);
  stringmap* strings = MMAP_OBJ(s->stringmap, stringmap);
  postings_region* region = MMAP_OBJ(s->postings, postings_region);

  // construct the term object
  term key;
  RELAY_ERROR(stringmap_add(strings, field, &key.field_s));
  RELAY_ERROR(stringmap_add(strings, word, &key.word_s));

  // the term's current head posting becomes the new posting's successor
  uint32_t head_offset = termhash_get_val(terms, key);
  if(head_offset == (uint32_t)-1) head_offset = OFFSET_NONE;

  posting entry;
  if(head_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
    RELAY_ERROR(wp_segment_read_posting(s, head_offset, &entry, 0));
    if(entry.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
  }

  // write the entry to the postings region
  uint32_t entry_offset = region->postings_head;
  entry.doc_id = doc_id;
  entry.next_offset = head_offset;
  entry.num_positions = num_positions;
  RELAY_ERROR(write_posting(s, &entry, positions));
  DEBUG("postings list head now at %u", region->postings_head);

  // really finally, update the tail pointer so that readers can access this posting
  RELAY_ERROR(termhash_put_val(terms, key, entry_offset));

  return NO_ERROR;
}
|
438
|
+
|
439
|
+
/*
|
440
|
+
* currently, labels are implemented as a separate postings space and separate
|
441
|
+
* postings structure, but with the same term hash (the offsets just are
|
442
|
+
* relative to the different space).
|
443
|
+
*
|
444
|
+
* we use the sentinel field value 0 to demarcate a label. since no strings have
|
445
|
+
* stringmap value 0, this is safe.
|
446
|
+
*
|
447
|
+
* we also maintain a free list of unused label postings. since all label
|
448
|
+
* postings are the same size, we can do this to reuse them and avoid losing
|
449
|
+
* space in this area; since label postings can be changed frequently, this is
|
450
|
+
* desirable. we use the sentinel postings value field=0 word=0 to keep track
|
451
|
+
* of this list.
|
452
|
+
*
|
453
|
+
*/
|
454
|
+
// Read the label posting stored at offset in the segment's labels region
// into po. Label postings carry no positions, so po->num_positions is set to
// 0 and po->positions to NULL. Always returns NO_ERROR.
wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
  postings_region* labels = MMAP_OBJ(s->labels, postings_region);
  label_posting* src = wp_segment_label_posting_at(labels, offset);

  po->positions = NULL;
  po->num_positions = 0;
  po->next_offset = src->next_offset;
  po->doc_id = src->doc_id;

  return NO_ERROR;
}
|
465
|
+
|
466
|
+
// Attach label to document doc_id.
//
// Label postings for a term form a linked list that this function keeps in
// decreasing doc_id order: it walks the list to find the insertion point and
// returns early (NO_ERROR) if the document already has the label. Storage
// for the new posting comes from the dead list (the list kept under the
// sentinel term field=0, word=0) when it is non-empty, and otherwise from
// the head of the labels region.
//
// The region head is only advanced when a brand-new slot is taken from it.
// Previously it was advanced even when a dead-list slot was reused, which
// threw away one fresh slot per reuse and defeated the purpose of the dead
// list.
//
// NOTE(review): the success flag reported by the bump_* helpers is not
// examined here -- confirm callers always run wp_segment_ensure_fit first.
wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
  // TODO move this logic up to ensure_fit()
  int success;
  RELAY_ERROR(bump_stringmap(s, &success));
  RELAY_ERROR(bump_stringpool(s, &success));
  RELAY_ERROR(bump_termhash(s, &success));

  DEBUG("adding label %s to doc %u", label, doc_id);

  postings_region* pr = MMAP_OBJ(s->labels, postings_region);
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
  termhash* th = MMAP_OBJ(s->termhash, termhash);

  // construct the term object. term objects for labels have the special
  // sentinel field value 0
  term t;
  t.field_s = 0; // label sentinel value
  RELAY_ERROR(stringmap_add(sh, label, &t.word_s)); // get word key

  // find the previous and next label postings, between which we'll insert this
  // posting
  uint32_t prev_offset = OFFSET_NONE;
  uint32_t next_offset = termhash_get_val(th, t);
  if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;

  while(next_offset != OFFSET_NONE) {
    label_posting* cur = wp_segment_label_posting_at(pr, next_offset);
    if(cur->doc_id == doc_id) {
      DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
      return NO_ERROR;
    }
    else if(cur->doc_id < doc_id) break; // insert in front of this one
    prev_offset = next_offset;
    next_offset = cur->next_offset;
  }

  // find a space for the posting by first checking for a free posting in the
  // dead list. the dead list is the list stored under the sentinel term
  // with field 0 and word 0.
  term dead_term = { .field_s = 0, .word_s = 0 };
  uint32_t entry_offset;
  uint32_t dead_offset = termhash_get_val(th, dead_term);
  if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;

  if(dead_offset == OFFSET_NONE) { // make a new posting at the region head
    entry_offset = pr->postings_head;
    pr->postings_head += sizeof(label_posting);
    DEBUG("label postings list head now at %u", pr->postings_head);
  }
  else { // we'll use this one; remove it from the linked list
    DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
    entry_offset = dead_offset;
    RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
  }

  // finally, write the entry to the label postings region
  DEBUG("label entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
  label_posting* po = wp_segment_label_posting_at(pr, entry_offset);
  po->doc_id = doc_id;
  po->next_offset = next_offset;

  // really finally, update either the previous offset or the tail pointer
  // for this label so that readers can access this posting
  if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;

  return NO_ERROR;
}
|
535
|
+
|
536
|
+
// Remove label from document doc_id, if it is present.
//
// The label's posting list is walked until a posting for doc_id is found. A
// posting with a smaller doc_id ends the search as "not found". When the
// label is absent the function returns NO_ERROR without changing anything.
// Otherwise the posting is unlinked from the label's list and pushed onto
// the head of the dead list (the list kept under the sentinel term field=0,
// word=0) so wp_segment_add_label can reuse its slot.
wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) {
  // TODO move this logic to ensure_fit
  int bumped;
  RELAY_ERROR(bump_termhash(s, &bumped)); // we might add an entry for the dead list

  termhash* terms = MMAP_OBJ(s->termhash, termhash);
  stringmap* strings = MMAP_OBJ(s->stringmap, stringmap);
  postings_region* labels = MMAP_OBJ(s->labels, postings_region);

  // construct the term object. term objects for labels have the special
  // sentinel field value 0
  term key;
  key.field_s = 0; // label sentinel value
  key.word_s = stringmap_string_to_int(strings, label); // will be -1 if not there

  // find the posting and the previous posting in the list, if any
  uint32_t prev_offset = OFFSET_NONE;
  uint32_t offset = termhash_get_val(terms, key);
  if(offset == (uint32_t)-1) offset = OFFSET_NONE;
  label_posting* victim = NULL;

  while(offset != OFFSET_NONE) {
    victim = wp_segment_label_posting_at(labels, offset);

    if(victim->doc_id == doc_id) break; // found it

    if(victim->doc_id < doc_id) { // this doc id is smaller: report not found
      offset = OFFSET_NONE;
      break;
    }

    prev_offset = offset;
    offset = victim->next_offset;
  }

  DEBUG("found label posting for doc %u at offset %u; prev_offset is %u", doc_id, offset, prev_offset);

  if(offset == OFFSET_NONE) {
    DEBUG("no label %s found for doc %u", label, doc_id);
    return NO_ERROR;
  }

  // we've found the posting; now remove it from the list
  if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(terms, key, victim->next_offset));
  else wp_segment_label_posting_at(labels, prev_offset)->next_offset = victim->next_offset;

  // now add it to the dead list for later reclamation
  term dead_term = { .field_s = 0, .word_s = 0 };
  uint32_t dead_offset = termhash_get_val(terms, dead_term);
  if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;

  victim->next_offset = dead_offset;
  DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, victim->next_offset);
  RELAY_ERROR(termhash_put_val(terms, dead_term, offset));

  return NO_ERROR;
}
|
587
|
+
|
588
|
+
// Allocate the next document id for this segment: increments the num_docs
// counter stored in the postings region and stores the new value in
// *doc_id. Always returns NO_ERROR.
wp_error* wp_segment_grab_docid(wp_segment* segment, docid_t* doc_id) {
  postings_region* region = MMAP_OBJ(segment->postings, postings_region);

  region->num_docs += 1;
  *doc_id = region->num_docs;

  return NO_ERROR;
}
|
593
|
+
|
594
|
+
// Print a human-readable summary of the segment to stream: the index type,
// the document and posting counts, and for the postings region, string hash,
// stringpool and term hash their mapped size in kb and how full they are.
// Always returns NO_ERROR.
//
// The percentage helper macro is fully parenthesized and undefined again at
// the end of the function, so it no longer leaks a one-letter macro into the
// rest of the file.
wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
  postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
  stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
  stringpool* sp = MMAP_OBJ(segment->stringpool, stringpool);
  termhash* th = MMAP_OBJ(segment->termhash, termhash);

  #define SATURATION_PCT(used, total) (100.0 * (float)(used) / (float)(total))

  fprintf(stream, "segment has type %u\n", pr->index_type_and_flags);
  fprintf(stream, "segment has %u docs and %u postings\n", pr->num_docs, pr->num_postings);
  fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.header->size / 1024, SATURATION_PCT(pr->postings_head, pr->postings_tail));
  fprintf(stream, "    string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.header->size / 1024, SATURATION_PCT(sh->n_occupied, sh->n_buckets));
  fprintf(stream, "     stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.header->size / 1024, SATURATION_PCT(sp->next, sp->size));
  fprintf(stream, "     term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.header->size / 1024, SATURATION_PCT(th->n_occupied, th->n_buckets));

  #undef SATURATION_PCT

  return NO_ERROR;
}
|
611
|
+
|
612
|
+
// Return the num_docs counter stored in the segment's postings region.
uint64_t wp_segment_num_docs(wp_segment* seg) {
  postings_region* region = MMAP_OBJ(seg->postings, postings_region);
  uint64_t count = region->num_docs;

  return count;
}
|