whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef WP_SEARCH_H_
|
2
|
+
#define WP_SEARCH_H_
|
3
|
+
|
4
|
+
// whistlepig search code
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// what you need to know about search:
|
8
|
+
// 1. it runs on a per-segment basis; and
|
9
|
+
// 2. query objects maintain search state internally.
|
10
|
+
//
|
11
|
+
// to run a query on a segment, you need to use this call sequence:
|
12
|
+
//
|
13
|
+
// 1. wp_search_init_search_state
|
14
|
+
// 2. wp_search_run_query_on_segment (zero or more times)
|
15
|
+
// 3. wp_search_release_search_state
|
16
|
+
//
|
17
|
+
// because the query objects maintain state, you can repeat step 2 as much as
|
18
|
+
// you'd like to get more results without doing any duplicate work. if you
|
19
|
+
// don't do step 3, you'll leak memory.
|
20
|
+
//
|
21
|
+
// the corollary is that if you want to do multithreaded search across segments
|
22
|
+
// in parallel, you will have to clone the query for each segment to avoid
|
23
|
+
// sharing state.
|
24
|
+
//
|
25
|
+
// (right now the index does a serial search across segments, so cloning is not
|
26
|
+
// required.)
|
27
|
+
|
28
|
+
#include <stdint.h>
|
29
|
+
|
30
|
+
#include "defaults.h"
|
31
|
+
#include "segment.h"
|
32
|
+
#include "query.h"
|
33
|
+
#include "error.h"
|
34
|
+
|
35
|
+
// a match of a particular fielded phrase on a particular document
|
36
|
+
typedef struct doc_match {
|
37
|
+
const char* field;
|
38
|
+
const char* word;
|
39
|
+
uint16_t num_positions;
|
40
|
+
pos_t* positions;
|
41
|
+
} doc_match;
|
42
|
+
|
43
|
+
// a generic match on a document of a search stream
|
44
|
+
typedef struct search_result {
|
45
|
+
docid_t doc_id;
|
46
|
+
uint16_t num_doc_matches;
|
47
|
+
doc_match* doc_matches;
|
48
|
+
} search_result;
|
49
|
+
|
50
|
+
struct wp_segment;
|
51
|
+
struct wp_query;
|
52
|
+
struct wp_error;
|
53
|
+
|
54
|
+
// API methods
|
55
|
+
|
56
|
+
// initialize the query search state for running on segment s. this must precede any call
|
57
|
+
// to wp_search_run_query_on_segment.
|
58
|
+
wp_error* wp_search_init_search_state(struct wp_query* q, struct wp_segment* s) RAISES_ERROR;
|
59
|
+
|
60
|
+
// release any query search state. this must follow any call to wp_search_run_query_on_segment.
|
61
|
+
wp_error* wp_search_release_search_state(struct wp_query* q) RAISES_ERROR;
|
62
|
+
|
63
|
+
// run a query on a segment, filling at most max_num_results slots in results.
|
64
|
+
// this is the main entry point into the actual search logic, and is called by
|
65
|
+
// index.c in various ways. this must be preceded by an init_search_state and
|
66
|
+
// followed by a release_search_state.
|
67
|
+
//
|
68
|
+
// if you get num_results > 0, you should call wp_search_result_free on each of the
|
69
|
+
// results when you're done with them.
|
70
|
+
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) RAISES_ERROR;
|
71
|
+
|
72
|
+
// if you got non-zero num_results from wp_search_run_query_on_segment, call
|
73
|
+
// this on each result when you're done with it.
|
74
|
+
void wp_search_result_free(search_result* result);
|
75
|
+
|
76
|
+
#endif
|
@@ -0,0 +1,615 @@
|
|
1
|
+
#include <sys/stat.h>
|
2
|
+
#include <fcntl.h>
|
3
|
+
#include <unistd.h>
|
4
|
+
#include "whistlepig.h"
|
5
|
+
|
6
|
+
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
|
7
|
+
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
|
8
|
+
|
9
|
+
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
|
10
|
+
|
11
|
+
// reset the bookkeeping fields of a freshly created postings region.
//
// pr: the region header to initialize
// initial_size: stored as the initial postings_tail
// index_type_and_flags: stored verbatim; postings_region_validate compares
//   against it when the region is loaded again
static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t index_type_and_flags) {
  // nothing has been indexed yet
  pr->num_postings = 0;
  pr->num_docs = 0;

  // byte 0 is never handed out: that offset is reserved as OFFSET_NONE
  pr->postings_head = 1;
  pr->postings_tail = initial_size;

  pr->index_type_and_flags = index_type_and_flags;
}
|
18
|
+
|
19
|
+
// check that a loaded postings region carries the index type the caller
// expects; raises an error describing both values when they differ.
RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t index_type_and_flags)) {
  if(pr->index_type_and_flags != index_type_and_flags) {
    RAISE_ERROR("segment has index type %u; expecting type %u", pr->index_type_and_flags, index_type_and_flags);
  }

  return NO_ERROR;
}
|
23
|
+
|
24
|
+
#define INITIAL_POSTINGS_SIZE 2048
|
25
|
+
#define FN_SIZE 1024
|
26
|
+
|
27
|
+
// load an existing on-disk segment into `segment`.
//
// pathname_base is the common prefix of the component files: each one lives at
// pathname_base plus a fixed suffix (.sp, .sh_, .th, the postings-region
// suffix and .lb). after loading, both postings regions are checked against
// the index type this code expects.
//
// raises an error if a component pathname does not fit in FN_SIZE bytes (it
// used to be silently truncated at 128 bytes, making every component resolve
// to the same wrong file), if a component fails to load, or if a postings
// region has an unexpected type.
//
// NOTE(review): components loaded before a failure are not unloaded here.
wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) {
  char fn[FN_SIZE];
  int len;

  // open the string pool
  len = snprintf(fn, sizeof(fn), "%s.sp", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->stringpool, "ti/stringpool", fn));

  // open the string hash
  len = snprintf(fn, sizeof(fn), "%s.sh_", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->stringmap, "ti/stringmap", fn));
  stringmap_setup(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));

  // open the term hash
  len = snprintf(fn, sizeof(fn), "%s.th", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->termhash, "ti/termhash", fn));
  termhash_setup(MMAP_OBJ(segment->termhash, termhash));

  // open the postings region
  len = snprintf(fn, sizeof(fn), "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->postings, "ti/postings", fn));
  RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->postings, postings_region), POSTINGS_REGION_TYPE_IMMUTABLE_VBE));

  // open the labels postings region
  len = snprintf(fn, sizeof(fn), "%s.lb", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_load(&segment->labels, "ti/labels", fn));
  RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->labels, postings_region), POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS));

  return NO_ERROR;
}
|
56
|
+
|
57
|
+
// create a brand-new, empty segment on disk and map it into `segment`.
//
// one file is created per component at pathname_base plus a fixed suffix
// (.sp, .sh_, .th, the postings-region suffix and .lb), and each component is
// initialized to its empty state. both postings regions are sized as a
// postings_region header followed by INITIAL_POSTINGS_SIZE bytes.
//
// raises an error if a component pathname does not fit in FN_SIZE bytes (it
// used to be silently truncated at 128 bytes, so the components of a
// long-named segment would all land on the same file), or if any component
// cannot be created.
//
// NOTE(review): components created before a failure are left in place.
wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) {
  char fn[FN_SIZE];
  int len;

  // create the string pool
  len = snprintf(fn, sizeof(fn), "%s.sp", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->stringpool, "ti/stringpool", fn, stringpool_initial_size()));
  stringpool_init(MMAP_OBJ(segment->stringpool, stringpool));

  // create the string hash
  len = snprintf(fn, sizeof(fn), "%s.sh_", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->stringmap, "ti/stringmap", fn, stringmap_initial_size()));
  stringmap_init(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));

  // create the term hash
  len = snprintf(fn, sizeof(fn), "%s.th", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->termhash, "ti/termhash", fn, termhash_initial_size()));
  termhash_init(MMAP_OBJ(segment->termhash, termhash));

  // create the postings region
  len = snprintf(fn, sizeof(fn), "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->postings, "ti/postings", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
  postings_region_init(MMAP_OBJ(segment->postings, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_IMMUTABLE_VBE);

  // create the labels postings region
  len = snprintf(fn, sizeof(fn), "%s.lb", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) RAISE_ERROR("segment pathname too long: %s", pathname_base);
  RELAY_ERROR(mmap_obj_create(&segment->labels, "ti/labels", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
  postings_region_init(MMAP_OBJ(segment->labels, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS);

  return NO_ERROR;
}
|
87
|
+
|
88
|
+
#ifndef FN_SIZE
#define FN_SIZE 1024 // same value as the file-level definition; only used if that one is not in scope
#endif

// report whether a segment exists at pathname_base, judged by whether its
// string pool file (pathname_base + ".sp") can be stat'ed.
//
// returns 1 if the file is there and 0 otherwise. a pathname too long for the
// FN_SIZE buffer yields 0: it used to be truncated at 128 bytes, and the
// truncated name could match an unrelated file.
int wp_segment_exists(const char* pathname_base) {
  struct stat st;
  char fn[FN_SIZE];

  int len = snprintf(fn, sizeof(fn), "%s.sp", pathname_base);
  if(len < 0 || (size_t)len >= sizeof(fn)) return 0;

  return stat(fn, &st) == 0;
}
|
95
|
+
|
96
|
+
wp_error* wp_segment_delete(const char* pathname_base) {
|
97
|
+
char fn[FN_SIZE];
|
98
|
+
|
99
|
+
snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
|
100
|
+
unlink(fn);
|
101
|
+
snprintf(fn, 128, "%s.sp", pathname_base);
|
102
|
+
unlink(fn);
|
103
|
+
snprintf(fn, 128, "%s.sh_", pathname_base);
|
104
|
+
unlink(fn);
|
105
|
+
snprintf(fn, 128, "%s.th", pathname_base);
|
106
|
+
unlink(fn);
|
107
|
+
snprintf(fn, 128, "%s.lb", pathname_base);
|
108
|
+
unlink(fn);
|
109
|
+
|
110
|
+
return NO_ERROR;
|
111
|
+
}
|
112
|
+
|
113
|
+
// unload every mapped component of the segment, in the order string pool,
// string hash, term hash, postings, labels. the first failure is relayed to
// the caller and the remaining components are not attempted.
wp_error* wp_segment_unload(wp_segment* s) {
  mmap_obj* const components[] = {
    &s->stringpool,
    &s->stringmap,
    &s->termhash,
    &s->postings,
    &s->labels,
  };
  const size_t num_components = sizeof(components) / sizeof(components[0]);

  for(size_t i = 0; i < num_components; i++) {
    RELAY_ERROR(mmap_obj_unload(components[i]));
  }

  return NO_ERROR;
}
|
121
|
+
|
122
|
+
// grow the string hash if it reports that it needs it.
//
// *success is set to 0 only when growth was needed but the next size is no
// larger than the current one; in every other non-error case it is 1.
RAISING_STATIC(bump_stringmap(wp_segment* s, int* success)) {
  stringmap* map = MMAP_OBJ(s->stringmap, stringmap);

  *success = 1;
  if(stringmap_needs_bump(map)) {
    DEBUG("bumping stringmap size");
    uint32_t bigger = stringmap_next_size(map);

    if(bigger > stringmap_size(map)) {
      RELAY_ERROR(mmap_obj_resize(&s->stringmap, bigger));
      // the resize may have moved the mapping, so fetch the object again and
      // re-attach it to the string pool before growing it
      map = MMAP_OBJ(s->stringmap, stringmap);
      stringmap_setup(map, MMAP_OBJ(s->stringpool, stringpool));
      RELAY_ERROR(stringmap_bump_size(map));
    }
    else {
      DEBUG("stringmap can't be bumped no more!");
      *success = 0;
    }
  }

  return NO_ERROR;
}
|
143
|
+
|
144
|
+
// grow the string pool if it reports that it needs it.
//
// *success is set to 0 only when growth was needed but the next size is no
// larger than the current one; in every other non-error case it is 1.
RAISING_STATIC(bump_stringpool(wp_segment* s, int* success)) {
  stringpool* pool = MMAP_OBJ(s->stringpool, stringpool);

  *success = 1;
  if(stringpool_needs_bump(pool)) {
    DEBUG("bumping stringpool size");
    uint32_t bigger = stringpool_next_size(pool);

    if(bigger > stringpool_size(pool)) {
      RELAY_ERROR(mmap_obj_resize(&s->stringpool, bigger));
      // the resize may have moved the mapping: fetch the pool again and point
      // the string hash at its new location
      pool = MMAP_OBJ(s->stringpool, stringpool);
      stringmap* map = MMAP_OBJ(s->stringmap, stringmap);
      map->pool = pool;
      stringpool_bump_size(pool);
    }
    else {
      DEBUG("stringpool can't be bumped no more!");
      *success = 0;
    }
  }

  return NO_ERROR;
}
|
166
|
+
|
167
|
+
// grow the term hash if it reports that it needs it.
//
// *success is set to 0 only when growth was needed but the next size is no
// larger than the current one; in every other non-error case it is 1.
RAISING_STATIC(bump_termhash(wp_segment* s, int* success)) {
  termhash* hash = MMAP_OBJ(s->termhash, termhash);

  *success = 1;
  if(termhash_needs_bump(hash)) {
    DEBUG("bumping termhash size");
    uint32_t bigger = termhash_next_size(hash);

    if(bigger > termhash_size(hash)) {
      RELAY_ERROR(mmap_obj_resize(&s->termhash, bigger));
      // the resize may have moved the mapping, so fetch the object again and
      // set it up before growing it
      hash = MMAP_OBJ(s->termhash, termhash);
      termhash_setup(hash);
      RELAY_ERROR(termhash_bump_size(hash));
    }
    else {
      DEBUG("termhash can't be bumped no more!");
      *success = 0;
    }
  }

  return NO_ERROR;
}
|
189
|
+
|
190
|
+
// make sure the postings region behind mmopr has room for postings_bytes more
// bytes past its current head, growing the mapping if necessary.
//
// the tail is doubled until it exceeds the required head position, capped at
// MAX_POSTINGS_REGION_SIZE. *success is set to 1 if the bytes fit (possibly
// after a resize) and to 0 if they cannot fit even at the maximum size; in
// both cases NO_ERROR is returned. an error is raised only if the resize
// itself fails.
RAISING_STATIC(postings_region_ensure_fit(mmap_obj* mmopr, uint32_t postings_bytes, int* success)) {
  postings_region* pr = MMAP_OBJ_PTR(mmopr, postings_region);

  DEBUG("ensuring fit for %u postings bytes", postings_bytes);

  // a request that would wrap the 32-bit head can never fit; without this
  // check the wrapped sum would look small and "fit"
  if(postings_bytes > UINT32_MAX - pr->postings_head) {
    *success = 0;
    return NO_ERROR;
  }
  uint32_t new_head = pr->postings_head + postings_bytes;

  // double the tail until it clears new_head. stop at the cap rather than
  // doubling past it: unbounded doubling can wrap to 0 and then never
  // terminates, as would a (corrupt) tail of 0.
  uint32_t new_tail = pr->postings_tail;
  while(new_tail <= new_head && new_tail < MAX_POSTINGS_REGION_SIZE) {
    if(new_tail == 0) new_tail = 1;
    else if(new_tail > MAX_POSTINGS_REGION_SIZE / 2) new_tail = MAX_POSTINGS_REGION_SIZE;
    else new_tail = new_tail * 2;
  }

  if(new_tail > MAX_POSTINGS_REGION_SIZE) new_tail = MAX_POSTINGS_REGION_SIZE;
  DEBUG("new tail will be %u, current is %u, max is %u", new_tail, pr->postings_tail, MAX_POSTINGS_REGION_SIZE);

  if(new_tail <= new_head) { // can't increase enough
    *success = 0;
    return NO_ERROR;
  }

  if(new_tail != pr->postings_tail) { // need to resize
    DEBUG("request for %u postings bytes, old tail is %u, new tail will be %u, max is %u\n", postings_bytes, pr->postings_tail, new_tail, MAX_POSTINGS_REGION_SIZE);
    // the mapped object is the postings_region header followed by
    // postings_tail bytes of postings (that is how wp_segment_create sizes
    // it), so the header has to be part of the requested size too.
    // NOTE(review): assumes mmap_obj_resize takes the same kind of size as
    // mmap_obj_create -- confirm against mmap-obj.c.
    RELAY_ERROR(mmap_obj_resize(mmopr, sizeof(postings_region) + new_tail));
    pr = MMAP_OBJ_PTR(mmopr, postings_region); // may have changed!
    pr->postings_tail = new_tail;
  }

  *success = 1;
  return NO_ERROR;
}
|
217
|
+
|
218
|
+
// TODO make this function take the number of stringpool entries, the number of
|
219
|
+
// terms, etc rather than just being a heuristic for everything except for the
|
220
|
+
// postings list
|
221
|
+
// make sure the segment can take postings_bytes more bytes of postings and
// label_bytes more bytes of label postings, and give the string hash, string
// pool and term hash a chance to grow.
//
// the steps run in a fixed order and stop at the first one that reports it
// cannot fit; *success then stays 0 and NO_ERROR is returned. *success is 1
// only if every step succeeded.
wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) {
  RELAY_ERROR(postings_region_ensure_fit(&seg->postings, postings_bytes, success));

  if(*success) {
    RELAY_ERROR(postings_region_ensure_fit(&seg->labels, label_bytes, success));
  }
  if(*success) {
    RELAY_ERROR(bump_stringmap(seg, success));
  }
  if(*success) {
    RELAY_ERROR(bump_stringpool(seg, success));
  }
  if(*success) {
    RELAY_ERROR(bump_termhash(seg, success));
  }

  if(*success) {
    DEBUG("fit of %u postings bytes ensured", postings_bytes);
  }

  return NO_ERROR;
}
|
241
|
+
|
242
|
+
// size in bytes of a posting record carrying num_positions positions: the
// posting struct without its positions pointer, plus one pos_t per position.
// the position values themselves are not looked at.
static uint32_t size_of(uint32_t num_positions, pos_t positions[]) {
  (void)positions; // unused

  return sizeof(posting) - sizeof(pos_t*) + sizeof(pos_t) * num_positions;
}
|
249
|
+
|
250
|
+
// public wrapper around size_of: stores in *size the number of bytes computed
// for a posting with num_positions positions. the segment is not consulted.
// always returns NO_ERROR.
wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) {
  (void)seg; // unused

  uint32_t bytes = size_of(num_positions, positions);
  *size = bytes;

  return NO_ERROR;
}
|
255
|
+
|
256
|
+
#define BITMASK 0x7f
|
257
|
+
|
258
|
+
// write val at location using a variable-byte encoding: seven payload bits
// per byte, least-significant group first, with the high bit set on every
// byte except the last. *size receives the number of bytes written. the
// caller must have made sure the bytes are available; always returns NO_ERROR.
RAISING_STATIC(write_multibyte(uint8_t* location, uint32_t val, uint32_t* size)) {
  uint32_t written = 0;

  // continuation bytes: low seven bits plus the "more follows" marker
  for(; val > BITMASK; val >>= 7) {
    location[written++] = (uint8_t)((val & BITMASK) | 0x80);
  }

  // final byte: whatever is left, marker bit clear
  location[written++] = (uint8_t)(val & BITMASK);

  *size = written;
  return NO_ERROR;
}
|
276
|
+
|
277
|
+
// read a variable-byte-encoded value (as produced by write_multibyte) from
// location: seven payload bits per byte, least-significant group first, high
// bit set on every byte but the last.
//
// *val receives the decoded value and *size the number of bytes consumed.
// raises an error if the encoding runs past the five bytes a 32-bit value can
// need, which means the data is corrupt; *val is 0 and *size is not set then.
RAISING_STATIC(read_multibyte(uint8_t* location, uint32_t* val, uint32_t* size)) {
  uint8_t* start = location;
  uint32_t shift = 0;
  uint32_t result = 0;

  *val = 0;
  while(*location & 0x80) {
    // four continuation bytes (shifts 0..21) plus a final byte is the most a
    // uint32_t can take; anything longer would shift by 32 or more, which is
    // undefined, and can only come from corrupt data
    if(shift >= 28) RAISE_ERROR("corrupt multibyte value: longer than 5 bytes");
    // widen before shifting: the byte would otherwise be promoted to a signed
    // int, and shifting into its sign bit is undefined behavior
    result |= (uint32_t)(*location & 0x7f) << shift;
    shift += 7;
    location++;
  }
  result |= (uint32_t)*location << shift;

  *val = result;
  *size = (uint32_t)(location + 1 - start);
  return NO_ERROR;
}
|
294
|
+
|
295
|
+
/* write posting entry using a variable-byte encoding
|
296
|
+
|
297
|
+
unfortunately we can't write doc_id deltas, which is what would really make
|
298
|
+
this encoding pay off, because we write the entries in increasing doc_id
|
299
|
+
order but read them in decreasing order. so we write doc_ids raw.
|
300
|
+
|
301
|
+
for next_offsets, we write the delta against the current offset. since the
|
302
|
+
next_offset is guaranteed to be less than the current offset, we subtract
|
303
|
+
next from current.
|
304
|
+
|
305
|
+
positions are written as deltas.
|
306
|
+
*/
|
307
|
+
|
308
|
+
// append one posting to the segment's postings region at the current head,
// in the variable-byte layout described above:
//
//   1. doc_id shifted left by one, with the low bit set when the posting has
//      exactly one position
//   2. the distance from this entry's offset back to po->next_offset
//   3. the number of positions (only when there is more than one)
//   4. each position, as a delta against the previous one
//
// postings_head is advanced past every field as it is written and
// num_postings is incremented at the end. raises an error if po->next_offset
// is not below the current head or if the posting has no positions.
RAISING_STATIC(write_posting(wp_segment* seg, posting* po, pos_t positions[])) {
  postings_region* region = MMAP_OBJ(seg->postings, postings_region);
  const uint32_t entry_offset = region->postings_head;
  uint32_t nbytes;

  if(po->next_offset >= region->postings_head) RAISE_ERROR("next_offset %u >= postings_head %u", po->next_offset, region->postings_head);
  if(po->num_positions == 0) RAISE_ERROR("num_positions == 0");

  // doc id, tagged in its low bit for single-position postings
  const int is_single = (po->num_positions == 1);
  uint32_t tagged_doc_id = po->doc_id << 1;
  if(is_single) tagged_doc_id |= 1;
  RELAY_ERROR(write_multibyte(&region->postings[region->postings_head], tagged_doc_id, &nbytes));
  region->postings_head += nbytes;

  // backwards distance to the next posting in the list
  RELAY_ERROR(write_multibyte(&region->postings[region->postings_head], entry_offset - po->next_offset, &nbytes));
  region->postings_head += nbytes;

  // the count is implied by the tag bit when there is only one position
  if(!is_single) {
    RELAY_ERROR(write_multibyte(&region->postings[region->postings_head], po->num_positions, &nbytes));
    region->postings_head += nbytes;
  }

  // positions, delta-encoded; the first one is a delta against 0
  pos_t previous = 0;
  for(uint32_t i = 0; i < po->num_positions; i++) {
    RELAY_ERROR(write_multibyte(&region->postings[region->postings_head], positions[i] - previous, &nbytes));
    region->postings_head += nbytes;
    previous = positions[i];
  }

  region->num_postings++;

  return NO_ERROR;
}
|
346
|
+
|
347
|
+
/* if include_positions is true, will malloc the positions array for you, and
|
348
|
+
* you must free it when done (assuming num_positions > 0)!
|
349
|
+
*/
|
350
|
+
|
351
|
+
// read the posting stored at `offset` in the segment's postings region into
// *po.
//
// po->doc_id and po->next_offset are always filled in; next_offset is
// converted from the stored backwards distance to an absolute offset.
//
// if include_positions is true, po->num_positions is set and po->positions is
// a freshly malloc'ed array the caller must free when done. if it is false, or
// if the stored count is 0, num_positions is 0 / positions is NULL and there
// is nothing to free.
//
// raises an error on an invalid stored next_offset, on allocation failure, or
// if a field cannot be decoded; po->positions is freed and reset to NULL
// before an error is returned from the position-reading loop.
wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) {
  uint32_t size;
  uint32_t orig_offset = offset;
  postings_region* pr = MMAP_OBJ(s->postings, postings_region);

  // doc id; its low bit marks a posting with exactly one position
  RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->doc_id, &size));
  int is_single_posting = po->doc_id & 1;
  po->doc_id = po->doc_id >> 1;
  offset += size;

  // stored as the distance back from this entry to the next one
  RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->next_offset, &size));
  if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and <= %u)", po->next_offset, orig_offset);
  po->next_offset = orig_offset - po->next_offset;
  offset += size;

  if(!include_positions) {
    po->num_positions = 0;
    po->positions = NULL;
    return NO_ERROR;
  }

  if(is_single_posting) po->num_positions = 1;
  else {
    RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->num_positions, &size));
    offset += size;
  }

  // nothing to allocate; avoids malloc(0), whose result may or may not be NULL
  if(po->num_positions == 0) {
    po->positions = NULL;
    return NO_ERROR;
  }

  po->positions = malloc(po->num_positions * sizeof(pos_t));
  if(po->positions == NULL) RAISE_ERROR("out of memory allocating %u positions", po->num_positions);

  // positions are stored as deltas against the previous position
  for(uint32_t i = 0; i < po->num_positions; i++) {
    wp_error* read_error = read_multibyte(&pr->postings[offset], &po->positions[i], &size);
    if(read_error != NO_ERROR) {
      // don't leak the array, and don't hand back a dangling pointer
      free(po->positions);
      po->positions = NULL;
      RELAY_ERROR(read_error);
    }
    offset += size;
    po->positions[i] += (i == 0 ? 0 : po->positions[i - 1]);
  }

  return NO_ERROR;
}
|
396
|
+
|
397
|
+
// add a posting for field:word on document doc_id, with the given positions,
// to the head of that term's postings list.
//
// postings for a term must be added in strictly increasing doc_id order; an
// error is raised if the term's most recent posting has a doc_id that is not
// below the new one. the term hash is updated only after the posting has been
// written, so the new entry becomes reachable last.
wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
  // TODO move this logic up to ensure_fit()
  // NOTE: whether the bumps actually succeeded is not acted upon here
  int bumped;
  RELAY_ERROR(bump_stringmap(s, &bumped));
  RELAY_ERROR(bump_stringpool(s, &bumped));
  RELAY_ERROR(bump_termhash(s, &bumped));

  DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);

  postings_region* region = MMAP_OBJ(s->postings, postings_region);
  stringmap* strings = MMAP_OBJ(s->stringmap, stringmap);
  termhash* terms = MMAP_OBJ(s->termhash, termhash);

  // intern both strings to build the term key
  term key;
  RELAY_ERROR(stringmap_add(strings, field, &key.field_s));
  RELAY_ERROR(stringmap_add(strings, word, &key.word_s));

  // the current head of this term's list becomes the new posting's successor
  posting entry;
  uint32_t successor = termhash_get_val(terms, key);
  if(successor == (uint32_t)-1) successor = OFFSET_NONE;

  if(successor != OFFSET_NONE) { // TODO remove this check for speed once happy
    RELAY_ERROR(wp_segment_read_posting(s, successor, &entry, 0));
    if(entry.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
  }

  // write the entry at the current head of the postings region
  uint32_t new_offset = region->postings_head;
  entry.doc_id = doc_id;
  entry.next_offset = successor;
  entry.num_positions = num_positions;
  RELAY_ERROR(write_posting(s, &entry, positions));
  DEBUG("postings list head now at %u", region->postings_head);

  // finally, publish the entry so that readers can reach it
  RELAY_ERROR(termhash_put_val(terms, key, new_offset));

  return NO_ERROR;
}
|
438
|
+
|
439
|
+
/*
|
440
|
+
* currently, labels are implemented as a separate postings space and separate
|
441
|
+
* postings structure, but with the same term hash (the offsets just are
|
442
|
+
* relative to the different space).
|
443
|
+
*
|
444
|
+
* we use the sentinel field value 0 to demarcate a label. since no strings have
|
445
|
+
* stringmap value 0, this is safe.
|
446
|
+
*
|
447
|
+
* we also maintain a free list of unused label postings. since all label
|
448
|
+
* postings are the same size, we can do this to reuse them and avoid losing
|
449
|
+
* space in this area; since label postings can be changed frequently, this is
|
450
|
+
* desirable. we use the sentinel postings value field=0 word=0 to keep track
|
451
|
+
* of this list.
|
452
|
+
*
|
453
|
+
*/
|
454
|
+
// read the label posting stored at `offset` in the labels region into *po.
// label postings carry no positions, so num_positions is 0 and positions is
// NULL. always returns NO_ERROR.
wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
  postings_region* labels = MMAP_OBJ(s->labels, postings_region);
  label_posting* stored = wp_segment_label_posting_at(labels, offset);

  po->positions = NULL;
  po->num_positions = 0;
  po->next_offset = stored->next_offset;
  po->doc_id = stored->doc_id;

  return NO_ERROR;
}
|
465
|
+
|
466
|
+
// attach `label` to document doc_id.
//
// a label's postings form a linked list ordered by decreasing doc_id. the new
// posting is spliced in at its sorted place; if the document already carries
// the label nothing is changed. storage comes from the dead list (the list
// under the sentinel term field=0 word=0) when it has an entry, and from the
// head of the labels region otherwise.
//
// the region's head is advanced only when a brand-new slot is taken from it.
// it used to be advanced on every add, including ones that reused a dead
// slot, which leaked one slot of space per reuse.
wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
  // TODO move this logic up to ensure_fit()
  int success;
  RELAY_ERROR(bump_stringmap(s, &success));
  RELAY_ERROR(bump_stringpool(s, &success));
  RELAY_ERROR(bump_termhash(s, &success));

  DEBUG("adding label %s to doc %u", label, doc_id);

  postings_region* pr = MMAP_OBJ(s->labels, postings_region);
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
  termhash* th = MMAP_OBJ(s->termhash, termhash);

  // construct the term object. term objects for labels have the special
  // sentinel field value 0
  term t;
  t.field_s = 0; // label sentinel value
  RELAY_ERROR(stringmap_add(sh, label, &t.word_s)); // get word key

  // find the previous and next label postings, between which we'll insert this
  // posting
  uint32_t prev_offset = OFFSET_NONE;
  uint32_t next_offset = termhash_get_val(th, t);
  if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;

  while(next_offset != OFFSET_NONE) {
    label_posting* po = wp_segment_label_posting_at(pr, next_offset);
    if(po->doc_id == doc_id) {
      DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
      return NO_ERROR;
    }
    else if(po->doc_id < doc_id) break; // the new posting goes in front of this one
    prev_offset = next_offset;
    next_offset = po->next_offset;
  }

  // find a space for the posting by first checking for a free posting in the
  // dead list. the dead list is the list stored under the sentinel term
  // with field 0 and word 0.
  term dead_term = { .field_s = 0, .word_s = 0 };
  uint32_t entry_offset;
  uint32_t dead_offset = termhash_get_val(th, dead_term);
  if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;

  int is_new_slot = (dead_offset == OFFSET_NONE);
  if(is_new_slot) { // make a new posting
    entry_offset = pr->postings_head;
  }
  else { // we'll use this one; remove it from the linked list
    DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
    entry_offset = dead_offset;
    RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
  }

  // finally, write the entry to the label postings region
  DEBUG("label entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
  label_posting* po = wp_segment_label_posting_at(pr, entry_offset);
  po->doc_id = doc_id;
  po->next_offset = next_offset;

  // only a slot taken from the head consumes fresh space; a recycled slot
  // lies below the head already
  if(is_new_slot) pr->postings_head += sizeof(label_posting);
  DEBUG("label postings list head now at %u", pr->postings_head);

  // really finally, update either the previous offset or the tail pointer
  // for this label so that readers can access this posting
  if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;

  return NO_ERROR;
}
|
535
|
+
|
536
|
+
// detach `label` from document doc_id.
//
// the label's postings list (ordered by decreasing doc_id) is walked until
// the document's posting is found. if the label or the posting is not there,
// nothing is changed. otherwise the posting is unlinked from the label's list
// and pushed onto the head of the dead list (sentinel term field=0 word=0) so
// that a later add can reuse its slot.
wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) {
  // TODO move this logic to ensure_fit
  int bumped;
  RELAY_ERROR(bump_termhash(s, &bumped)); // we might add an entry for the dead list

  postings_region* labels = MMAP_OBJ(s->labels, postings_region);
  stringmap* strings = MMAP_OBJ(s->stringmap, stringmap);
  termhash* terms = MMAP_OBJ(s->termhash, termhash);

  // label terms use the sentinel field value 0
  term key;
  key.field_s = 0;
  key.word_s = stringmap_string_to_int(strings, label); // will be -1 if not there

  // walk the list looking for the posting, remembering its predecessor
  uint32_t prev_offset = OFFSET_NONE;
  uint32_t offset = termhash_get_val(terms, key);
  if(offset == (uint32_t)-1) offset = OFFSET_NONE;
  label_posting* found = NULL;

  while(offset != OFFSET_NONE) {
    found = wp_segment_label_posting_at(labels, offset);
    if(found->doc_id == doc_id) break; // this is the one
    if(found->doc_id < doc_id) {
      // we are past where the posting would be, so it isn't in the list
      offset = OFFSET_NONE;
      break;
    }
    prev_offset = offset;
    offset = found->next_offset;
  }

  DEBUG("found label posting for doc %u at offset %u; prev_offset is %u", doc_id, offset, prev_offset);

  if(offset == OFFSET_NONE) {
    DEBUG("no label %s found for doc %u", label, doc_id);
    return NO_ERROR;
  }

  // unlink it: either the term hash or the predecessor pointed at it
  if(prev_offset != OFFSET_NONE) wp_segment_label_posting_at(labels, prev_offset)->next_offset = found->next_offset;
  else RELAY_ERROR(termhash_put_val(terms, key, found->next_offset));

  // push it onto the dead list for later reclamation
  term dead_term = { .field_s = 0, .word_s = 0 };
  uint32_t dead_head = termhash_get_val(terms, dead_term);
  if(dead_head == (uint32_t)-1) dead_head = OFFSET_NONE;

  found->next_offset = dead_head;
  DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, found->next_offset);
  RELAY_ERROR(termhash_put_val(terms, dead_term, offset));

  return NO_ERROR;
}
|
587
|
+
|
588
|
+
// hand out the next document id for this segment: the segment's document
// count is incremented and the new count is stored in *doc_id. always returns
// NO_ERROR.
wp_error* wp_segment_grab_docid(wp_segment* segment, docid_t* doc_id) {
  postings_region* region = MMAP_OBJ(segment->postings, postings_region);

  region->num_docs++;
  *doc_id = region->num_docs;

  return NO_ERROR;
}
|
593
|
+
|
594
|
+
// print a human-readable summary of the segment to stream: its index type,
// document and posting counts, and the size and fill level of the postings
// region, string hash, string pool and term hash. always returns NO_ERROR.
wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
  postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
  stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
  stringpool* sp = MMAP_OBJ(segment->stringpool, stringpool);
  termhash* th = MMAP_OBJ(segment->termhash, termhash);

  // `used` as a percentage of `total`. the arguments and the whole expansion
  // are parenthesized so the macro is safe with any expression, and it is
  // undefined again below so it cannot leak into the rest of the file.
#define SATURATION_PCT(used, total) (100.0 * (float)(used) / (float)(total))

  fprintf(stream, "segment has type %u\n", pr->index_type_and_flags);
  fprintf(stream, "segment has %u docs and %u postings\n", pr->num_docs, pr->num_postings);
  fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.header->size / 1024, SATURATION_PCT(pr->postings_head, pr->postings_tail));
  fprintf(stream, "    string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.header->size / 1024, SATURATION_PCT(sh->n_occupied, sh->n_buckets));
  fprintf(stream, "     stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.header->size / 1024, SATURATION_PCT(sp->next, sp->size));
  fprintf(stream, "     term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.header->size / 1024, SATURATION_PCT(th->n_occupied, th->n_buckets));

#undef SATURATION_PCT

  return NO_ERROR;
}
|
611
|
+
|
612
|
+
// number of documents in the segment, as recorded in its postings region.
uint64_t wp_segment_num_docs(wp_segment* seg) {
  postings_region* region = MMAP_OBJ(seg->postings, postings_region);
  uint64_t count = region->num_docs;

  return count;
}
|