whistlepig 0.11.2 → 0.12
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -2
- data/ext/whistlepig/error.c +2 -1
- data/ext/whistlepig/error.h +17 -7
- data/ext/whistlepig/index.c +58 -18
- data/ext/whistlepig/mmap-obj.c +4 -4
- data/ext/whistlepig/search.c +10 -5
- data/ext/whistlepig/segment.c +83 -34
- data/ext/whistlepig/segment.h +3 -0
- data/ext/whistlepig/snippeter.c +1 -1
- data/ext/whistlepig/termhash.c +16 -17
- data/ext/whistlepig/termhash.h +16 -4
- data/ext/whistlepig/whistlepig.c +13 -4
- metadata +2 -2
data/README
CHANGED
@@ -8,7 +8,7 @@ full-text search without the frills, Whistlepig may be for you.
|
|
8
8
|
Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
|
9
9
|
bindings.
|
10
10
|
|
11
|
-
Latest version: 0.11.2, released 2012-05-19.
|
11
|
+
Latest version: 0.12, released 2012-06-09.
|
12
12
|
Status: beta
|
13
13
|
News: http://all-thing.net/label/whistlepig/
|
14
14
|
Homepage: http://masanjin.net/whistlepig/
|
@@ -16,7 +16,7 @@ Latest version: 0.11.2, released 2012-05-19.
|
|
16
16
|
|
17
17
|
= Getting it
|
18
18
|
|
19
|
-
Tarball: http://masanjin.net/whistlepig/whistlepig-0.11.2.tar.gz
|
19
|
+
Tarball: http://masanjin.net/whistlepig/whistlepig-0.12.tar.gz
|
20
20
|
Rubygem: gem install whistlepig
|
21
21
|
Git: git clone git://github.com/wmorgan/whistlepig.git
|
22
22
|
|
data/ext/whistlepig/error.c
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
#include <stdlib.h>
|
2
2
|
#include "error.h"
|
3
3
|
|
4
|
-
wp_error* wp_error_new(const char* msg, const char* src) {
|
4
|
+
wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) {
|
5
5
|
wp_error* ret = malloc(sizeof(wp_error));
|
6
6
|
ret->msg = msg;
|
7
|
+
ret->type = type;
|
7
8
|
ret->size = 1;
|
8
9
|
ret->srcs = malloc(sizeof(const char*));
|
9
10
|
ret->srcs[0] = src;
|
data/ext/whistlepig/error.h
CHANGED
@@ -25,8 +25,13 @@
|
|
25
25
|
#include <stdio.h>
|
26
26
|
#include <string.h>
|
27
27
|
|
28
|
+
#define WP_ERROR_TYPE_BASIC 1
|
29
|
+
#define WP_ERROR_TYPE_SYSTEM 2
|
30
|
+
#define WP_ERROR_TYPE_VERSION 3
|
31
|
+
|
28
32
|
// pseudo-backtrace
|
29
33
|
typedef struct wp_error {
|
34
|
+
unsigned char type;
|
30
35
|
unsigned int size;
|
31
36
|
const char* msg;
|
32
37
|
const char** srcs;
|
@@ -39,25 +44,30 @@ typedef struct wp_error {
|
|
39
44
|
// API methods
|
40
45
|
|
41
46
|
// private: make a new error object with a message and source line
|
42
|
-
wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
|
47
|
+
wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) RAISES_ERROR;
|
43
48
|
// private: add a source line to a pre-existing error
|
44
49
|
wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
|
45
50
|
|
46
51
|
// public: free an error, once handled
|
47
52
|
void wp_error_free(wp_error* e);
|
48
53
|
|
49
|
-
//
|
50
|
-
#define
|
54
|
+
// private: internal mechanics for raising an error
|
55
|
+
#define RAISE_ERROR_OF_TYPE(type, fmt, ...) do { \
|
51
56
|
char* msg = malloc(1024); \
|
52
57
|
char* src = malloc(1024); \
|
53
58
|
snprintf(msg, 1024, fmt, ## __VA_ARGS__); \
|
54
59
|
snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
|
55
|
-
return wp_error_new(msg, src); \
|
60
|
+
return wp_error_new(msg, src, type); \
|
56
61
|
} while(0)
|
57
62
|
|
58
|
-
// public: raise
|
59
|
-
|
60
|
-
|
63
|
+
// public: raise a basic error
|
64
|
+
#define RAISE_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_BASIC, fmt, ## __VA_ARGS__)
|
65
|
+
|
66
|
+
// public: raise a version error
|
67
|
+
#define RAISE_VERSION_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_VERSION, fmt, ## __VA_ARGS__)
|
68
|
+
|
69
|
+
// public: raise a system error with strerror() automatically appended to the message
|
70
|
+
#define RAISE_SYSERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_SYSTEM, fmt ": %s", ## __VA_ARGS__, strerror(errno))
|
61
71
|
|
62
72
|
// public: relay an error up the stack if the called function returns one.
|
63
73
|
#define RELAY_ERROR(e) do { \
|
data/ext/whistlepig/index.c
CHANGED
@@ -32,7 +32,6 @@ RAISING_STATIC(release_lock(wp_index* index)) {
|
|
32
32
|
return NO_ERROR;
|
33
33
|
}
|
34
34
|
|
35
|
-
|
36
35
|
RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
|
37
36
|
ii->index_version = index_version;
|
38
37
|
ii->num_segments = 0;
|
@@ -42,7 +41,7 @@ RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
|
|
42
41
|
}
|
43
42
|
|
44
43
|
RAISING_STATIC(index_info_validate(index_info* ii, uint32_t index_version)) {
|
45
|
-
if(ii->index_version != index_version)
|
44
|
+
if(ii->index_version != index_version) RAISE_VERSION_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
|
46
45
|
return NO_ERROR;
|
47
46
|
}
|
48
47
|
|
@@ -149,6 +148,57 @@ wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
|
|
149
148
|
return NO_ERROR;
|
150
149
|
}
|
151
150
|
|
151
|
+
#define RESULT_BUF_SIZE 1024
|
152
|
+
// count the results by running the query until it stops. slow!
|
153
|
+
RAISING_STATIC(count_query_by_running_it(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
154
|
+
uint64_t results[RESULT_BUF_SIZE];
|
155
|
+
|
156
|
+
*num_results = 0;
|
157
|
+
while(1) {
|
158
|
+
uint32_t this_num_results;
|
159
|
+
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
|
160
|
+
*num_results += this_num_results;
|
161
|
+
if(this_num_results < RESULT_BUF_SIZE) break; // done
|
162
|
+
}
|
163
|
+
|
164
|
+
return NO_ERROR;
|
165
|
+
}
|
166
|
+
|
167
|
+
RAISING_STATIC(count_query_from_posting_list_header(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
168
|
+
// make sure we know about all segments (one could've been added by a writer)
|
169
|
+
RELAY_ERROR(grab_readlock(index));
|
170
|
+
RELAY_ERROR(ensure_all_segments(index));
|
171
|
+
RELAY_ERROR(release_lock(index));
|
172
|
+
|
173
|
+
*num_results = 0;
|
174
|
+
for(int i = 0; i < index->num_segments; i++) {
|
175
|
+
uint32_t this_num_results;
|
176
|
+
|
177
|
+
DEBUG("counting on segment %d", i);
|
178
|
+
wp_segment* seg = &index->segments[i];
|
179
|
+
RELAY_ERROR(wp_segment_grab_readlock(seg));
|
180
|
+
RELAY_ERROR(wp_segment_reload(seg));
|
181
|
+
RELAY_ERROR(wp_segment_count_term(seg, query->field, query->word, &this_num_results));
|
182
|
+
RELAY_ERROR(wp_segment_release_lock(seg));
|
183
|
+
*num_results += this_num_results;
|
184
|
+
DEBUG("got %d results from segment %d", this_num_results, i);
|
185
|
+
}
|
186
|
+
|
187
|
+
return NO_ERROR;
|
188
|
+
}
|
189
|
+
|
190
|
+
RAISING_STATIC(count_query(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
191
|
+
switch(query->type) {
|
192
|
+
case WP_QUERY_TERM:
|
193
|
+
case WP_QUERY_LABEL:
|
194
|
+
RELAY_ERROR(count_query_from_posting_list_header(index, query, num_results));
|
195
|
+
break;
|
196
|
+
case WP_QUERY_EVERY: // TODO -- special case this
|
197
|
+
default:
|
198
|
+
RELAY_ERROR(count_query_by_running_it(index, query, num_results));
|
199
|
+
}
|
200
|
+
return NO_ERROR;
|
201
|
+
}
|
152
202
|
// can be called multiple times to resume
|
153
203
|
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
|
154
204
|
*num_results = 0;
|
@@ -209,20 +259,10 @@ wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_
|
|
209
259
|
return NO_ERROR;
|
210
260
|
}
|
211
261
|
|
212
|
-
|
213
|
-
// count the results by just running the query until it stops. slow!
|
262
|
+
// just count the results, don't return them
|
214
263
|
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
|
215
|
-
uint64_t results[RESULT_BUF_SIZE];
|
216
|
-
|
217
|
-
*num_results = 0;
|
218
264
|
RELAY_ERROR(wp_index_setup_query(index, query));
|
219
|
-
|
220
|
-
uint32_t this_num_results;
|
221
|
-
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
|
222
|
-
*num_results += this_num_results;
|
223
|
-
if(this_num_results < RESULT_BUF_SIZE) break; // done
|
224
|
-
}
|
225
|
-
|
265
|
+
RELAY_ERROR(count_query(index, query, num_results));
|
226
266
|
RELAY_ERROR(wp_index_teardown_query(index, query));
|
227
267
|
|
228
268
|
return NO_ERROR;
|
@@ -369,7 +409,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
|
|
369
409
|
if(doc_id > index->docid_offsets[i - 1]) {
|
370
410
|
wp_segment* seg = &index->segments[i - 1];
|
371
411
|
|
372
|
-
DEBUG("found doc %
|
412
|
+
DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
|
373
413
|
RELAY_ERROR(wp_segment_grab_writelock(seg));
|
374
414
|
RELAY_ERROR(wp_segment_reload(seg));
|
375
415
|
RELAY_ERROR(wp_segment_add_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
|
@@ -377,7 +417,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
|
|
377
417
|
found = 1;
|
378
418
|
break;
|
379
419
|
}
|
380
|
-
else DEBUG("did not find doc %
|
420
|
+
else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
|
381
421
|
}
|
382
422
|
|
383
423
|
if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
|
@@ -396,7 +436,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
|
|
396
436
|
if(doc_id > index->docid_offsets[i - 1]) {
|
397
437
|
wp_segment* seg = &index->segments[i - 1];
|
398
438
|
|
399
|
-
DEBUG("found doc %
|
439
|
+
DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
|
400
440
|
RELAY_ERROR(wp_segment_grab_writelock(seg));
|
401
441
|
RELAY_ERROR(wp_segment_reload(seg));
|
402
442
|
RELAY_ERROR(wp_segment_remove_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
|
@@ -404,7 +444,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
|
|
404
444
|
found = 1;
|
405
445
|
break;
|
406
446
|
}
|
407
|
-
else DEBUG("did not find doc %
|
447
|
+
else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
|
408
448
|
}
|
409
449
|
|
410
450
|
if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
|
data/ext/whistlepig/mmap-obj.c
CHANGED
@@ -14,7 +14,7 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
|
|
14
14
|
if(o->fd == -1) RAISE_SYSERROR("cannot create %s", pathname);
|
15
15
|
|
16
16
|
uint32_t size = initial_size + (uint32_t)sizeof(mmap_obj_header);
|
17
|
-
DEBUG("creating %s with %u + %
|
17
|
+
DEBUG("creating %s with %u + %lu = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic);
|
18
18
|
lseek(o->fd, size - 1, SEEK_SET);
|
19
19
|
ssize_t num_bytes = write(o->fd, "", 1);
|
20
20
|
if(num_bytes == -1) RAISE_SYSERROR("write");
|
@@ -35,14 +35,14 @@ wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) {
|
|
35
35
|
// load header
|
36
36
|
o->content = mmap(NULL, sizeof(mmap_obj_header), PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
37
37
|
if(o->content == MAP_FAILED) RAISE_SYSERROR("header mmap");
|
38
|
-
DEBUG("loaded header of %
|
38
|
+
DEBUG("loaded header of %lu bytes for %s object", sizeof(mmap_obj_header), magic);
|
39
39
|
|
40
40
|
RELAY_ERROR(validate(o->content, magic));
|
41
41
|
|
42
42
|
o->loaded_size = o->content->size;
|
43
43
|
|
44
44
|
uint32_t size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
|
45
|
-
DEBUG("full size is %u bytes (including %
|
45
|
+
DEBUG("full size is %u bytes (including %lu-byte header)", size, sizeof(mmap_obj_header));
|
46
46
|
if(munmap(o->content, sizeof(mmap_obj_header)) == -1) RAISE_SYSERROR("munmap");
|
47
47
|
|
48
48
|
o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
@@ -85,7 +85,7 @@ wp_error* mmap_obj_resize(mmap_obj* o, uint32_t data_size) {
|
|
85
85
|
}
|
86
86
|
|
87
87
|
wp_error* mmap_obj_unload(mmap_obj* o) {
|
88
|
-
DEBUG("unloading %
|
88
|
+
DEBUG("unloading %lu bytes", sizeof(mmap_obj_header) + o->content->size);
|
89
89
|
if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
|
90
90
|
o->content = NULL;
|
91
91
|
return NO_ERROR;
|
data/ext/whistlepig/search.c
CHANGED
@@ -195,8 +195,14 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
|
|
195
195
|
|
196
196
|
t.word_s = stringmap_string_to_int(sh, sp, q->word);
|
197
197
|
|
198
|
-
uint32_t offset
|
199
|
-
|
198
|
+
uint32_t offset;
|
199
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
200
|
+
|
201
|
+
DEBUG("posting list header for %s:%s (-> %u:%u) is %p", q->field, q->word, t.field_s, t.word_s, plh);
|
202
|
+
if(plh == NULL) offset = OFFSET_NONE;
|
203
|
+
else offset = plh->next_offset;
|
204
|
+
|
205
|
+
if(plh) DEBUG("posting list header has count=%u next_offset=%u", plh->count, plh->next_offset);
|
200
206
|
|
201
207
|
if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
|
202
208
|
else {
|
@@ -475,8 +481,8 @@ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* resu
|
|
475
481
|
|
476
482
|
while(!found && !*done) {
|
477
483
|
RELAY_ERROR(query_next_doc(master, seg, result, done));
|
478
|
-
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
479
484
|
if(!*done) {
|
485
|
+
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
480
486
|
search_doc = result->doc_id;
|
481
487
|
wp_search_result_free(result); // sigh
|
482
488
|
RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
|
@@ -564,8 +570,8 @@ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* re
|
|
564
570
|
|
565
571
|
while(!found && !*done) {
|
566
572
|
RELAY_ERROR(query_next_doc(master, seg, result, done));
|
567
|
-
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
568
573
|
if(!*done) {
|
574
|
+
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
569
575
|
search_doc = result->doc_id;
|
570
576
|
wp_search_result_free(result); // sigh
|
571
577
|
RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
|
@@ -784,7 +790,6 @@ static wp_error* every_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_
|
|
784
790
|
|
785
791
|
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
|
786
792
|
int done;
|
787
|
-
|
788
793
|
*num_results = 0;
|
789
794
|
|
790
795
|
#ifdef DEBUG
|
data/ext/whistlepig/segment.c
CHANGED
@@ -3,13 +3,16 @@
|
|
3
3
|
#include <unistd.h>
|
4
4
|
#include "whistlepig.h"
|
5
5
|
|
6
|
-
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE
|
6
|
+
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
|
7
7
|
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
|
8
8
|
|
9
|
-
#define SEGMENT_VERSION
|
9
|
+
#define SEGMENT_VERSION 4
|
10
10
|
|
11
11
|
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
|
12
12
|
|
13
|
+
static posting_list_header blank_plh = { .count = 0, .next_offset = OFFSET_NONE };
|
14
|
+
static term dead_term = { .field_s = 0, .word_s = 0 };
|
15
|
+
|
13
16
|
wp_error* wp_segment_grab_readlock(wp_segment* seg) {
|
14
17
|
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
15
18
|
RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
|
@@ -28,6 +31,23 @@ wp_error* wp_segment_release_lock(wp_segment* seg) {
|
|
28
31
|
return NO_ERROR;
|
29
32
|
}
|
30
33
|
|
34
|
+
wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* word, uint32_t* num_results) {
|
35
|
+
stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
|
36
|
+
stringpool* sp = MMAP_OBJ(seg->stringpool, stringpool);
|
37
|
+
termhash* th = MMAP_OBJ(seg->termhash, termhash);
|
38
|
+
|
39
|
+
term t;
|
40
|
+
if(field == NULL) t.field_s = 0; // label sentinel
|
41
|
+
else t.field_s = stringmap_string_to_int(sh, sp, field);
|
42
|
+
t.word_s = stringmap_string_to_int(sh, sp, word);
|
43
|
+
|
44
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
45
|
+
if(plh == NULL) *num_results = 0;
|
46
|
+
else *num_results = plh->count;
|
47
|
+
|
48
|
+
return NO_ERROR;
|
49
|
+
}
|
50
|
+
|
31
51
|
static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t postings_type_and_flags) {
|
32
52
|
pr->postings_type_and_flags = postings_type_and_flags;
|
33
53
|
pr->num_postings = 0;
|
@@ -44,7 +64,7 @@ RAISING_STATIC(segment_info_init(segment_info* si, uint32_t segment_version)) {
|
|
44
64
|
}
|
45
65
|
|
46
66
|
RAISING_STATIC(segment_info_validate(segment_info* si, uint32_t segment_version)) {
|
47
|
-
if(si->segment_version != segment_version)
|
67
|
+
if(si->segment_version != segment_version) RAISE_VERSION_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
|
48
68
|
return NO_ERROR;
|
49
69
|
}
|
50
70
|
|
@@ -449,7 +469,7 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
|
|
449
469
|
RELAY_ERROR(bump_stringpool(s, &success));
|
450
470
|
RELAY_ERROR(bump_termhash(s, &success));
|
451
471
|
|
452
|
-
DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
|
472
|
+
DEBUG("adding posting for %s:%s and doc %u with %u positions", field, word, doc_id, num_positions);
|
453
473
|
|
454
474
|
postings_region* pr = MMAP_OBJ(s->postings, postings_region);
|
455
475
|
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
@@ -461,26 +481,38 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
|
|
461
481
|
RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
|
462
482
|
RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));
|
463
483
|
|
484
|
+
DEBUG("%s:%s maps to %u:%u", field, word, t.field_s, t.word_s);
|
485
|
+
|
464
486
|
// find the offset of the next posting
|
487
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
488
|
+
if(plh == NULL) {
|
489
|
+
RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
|
490
|
+
plh = termhash_get_val(th, t);
|
491
|
+
}
|
492
|
+
DEBUG("posting list header for %s:%s is at %p", field, word, plh);
|
493
|
+
|
465
494
|
posting po;
|
466
|
-
uint32_t next_offset =
|
467
|
-
|
468
|
-
if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
|
495
|
+
uint32_t next_offset = plh->next_offset;
|
496
|
+
|
497
|
+
if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy [PERFORMANCE]
|
469
498
|
RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
|
470
499
|
if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
|
471
500
|
}
|
472
501
|
|
473
502
|
// write the entry to the postings region
|
474
503
|
uint32_t entry_offset = pr->postings_head;
|
475
|
-
|
504
|
+
DEBUG("writing posting at offset %u. next offset is %u.", entry_offset, next_offset);
|
505
|
+
|
476
506
|
po.doc_id = doc_id;
|
477
507
|
po.next_offset = next_offset;
|
478
508
|
po.num_positions = num_positions;
|
479
509
|
RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
|
480
|
-
DEBUG("
|
510
|
+
DEBUG("posting list head now at %u", pr->postings_head);
|
481
511
|
|
482
512
|
// really finally, update the tail pointer so that readers can access this posting
|
483
|
-
|
513
|
+
plh->count++;
|
514
|
+
plh->next_offset = entry_offset;
|
515
|
+
DEBUG("posting list header for %s:%s now reads count=%u offset=%u", field, word, plh->count, plh->next_offset);
|
484
516
|
|
485
517
|
return NO_ERROR;
|
486
518
|
}
|
@@ -537,22 +569,25 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
537
569
|
|
538
570
|
// find the previous and next label postings, between which we'll insert this
|
539
571
|
// posting
|
540
|
-
|
541
|
-
|
572
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
573
|
+
if(plh == NULL) {
|
574
|
+
RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
|
575
|
+
plh = termhash_get_val(th, t);
|
576
|
+
}
|
577
|
+
|
578
|
+
uint32_t next_offset = plh->next_offset;
|
542
579
|
docid_t last_docid = DOCID_NONE;
|
580
|
+
uint32_t prev_offset = OFFSET_NONE;
|
543
581
|
|
544
|
-
if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
|
545
582
|
DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
|
546
583
|
|
547
584
|
while(next_offset != OFFSET_NONE) {
|
548
585
|
label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
|
549
586
|
|
550
|
-
if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
|
587
|
+
if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
|
551
588
|
RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
|
552
|
-
|
553
|
-
else {
|
589
|
+
else
|
554
590
|
last_docid = lp->doc_id;
|
555
|
-
}
|
556
591
|
|
557
592
|
DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
|
558
593
|
if(lp->doc_id == doc_id) {
|
@@ -567,18 +602,23 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
567
602
|
// find a space for the posting by first checking for a free postings in the
|
568
603
|
// dead list. the dead list is the list stored under the sentinel term with
|
569
604
|
// field 0 and word 0.
|
570
|
-
|
605
|
+
posting_list_header* dead_plh = termhash_get_val(th, dead_term);
|
606
|
+
if(dead_plh == NULL) {
|
607
|
+
RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
|
608
|
+
dead_plh = termhash_get_val(th, t);
|
609
|
+
}
|
610
|
+
|
571
611
|
uint32_t entry_offset;
|
572
|
-
uint32_t dead_offset =
|
573
|
-
if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
|
612
|
+
uint32_t dead_offset = dead_plh->next_offset;
|
574
613
|
|
575
614
|
if(dead_offset == OFFSET_NONE) { // make a new posting
|
576
615
|
entry_offset = pr->postings_head;
|
577
616
|
}
|
578
617
|
else { // we'll use this one; remove it from the linked list
|
579
618
|
DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
|
580
|
-
entry_offset =
|
581
|
-
|
619
|
+
entry_offset = dead_plh->next_offset;
|
620
|
+
dead_plh->next_offset = wp_segment_label_posting_at(pr, dead_offset)->next_offset;
|
621
|
+
dead_plh->count--;
|
582
622
|
}
|
583
623
|
|
584
624
|
// finally, write the entry to the label postings region
|
@@ -588,11 +628,12 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
588
628
|
po->next_offset = next_offset;
|
589
629
|
|
590
630
|
pr->postings_head += (uint32_t)sizeof(label_posting);
|
591
|
-
DEBUG("label
|
631
|
+
DEBUG("label posting list head now at %u", pr->postings_head);
|
592
632
|
|
593
633
|
// really finally, update either the previous offset or the tail pointer
|
594
634
|
// for this label so that readers can access this posting
|
595
|
-
|
635
|
+
plh->count++;
|
636
|
+
if(prev_offset == OFFSET_NONE) plh->next_offset = entry_offset;
|
596
637
|
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
|
597
638
|
|
598
639
|
return NO_ERROR;
|
@@ -615,13 +656,16 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
615
656
|
t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there
|
616
657
|
|
617
658
|
// find the posting and the previous posting in the list, if any
|
618
|
-
uint32_t prev_offset = OFFSET_NONE;
|
619
|
-
uint32_t offset = termhash_get_val(th, t);
|
620
659
|
docid_t last_docid = DOCID_NONE;
|
660
|
+
uint32_t prev_offset = OFFSET_NONE;
|
661
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
662
|
+
if(plh == NULL) {
|
663
|
+
DEBUG("no such label %s", label);
|
664
|
+
return NO_ERROR;
|
665
|
+
}
|
621
666
|
|
622
|
-
|
667
|
+
uint32_t offset = plh->next_offset;
|
623
668
|
label_posting* lp = NULL;
|
624
|
-
|
625
669
|
while(offset != OFFSET_NONE) {
|
626
670
|
lp = wp_segment_label_posting_at(pr, offset);
|
627
671
|
|
@@ -646,17 +690,22 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
646
690
|
}
|
647
691
|
|
648
692
|
// we've found the posting; now remove it from the list
|
649
|
-
if(prev_offset == OFFSET_NONE)
|
693
|
+
if(prev_offset == OFFSET_NONE) plh->next_offset = lp->next_offset;
|
650
694
|
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
|
695
|
+
plh->count--;
|
651
696
|
|
652
697
|
// now add it to the dead list for later reclamation
|
653
|
-
|
654
|
-
|
655
|
-
|
698
|
+
posting_list_header* dead_plh = termhash_get_val(th, dead_term);
|
699
|
+
if(dead_plh == NULL) {
|
700
|
+
RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
|
701
|
+
dead_plh = termhash_get_val(th, t);
|
702
|
+
}
|
656
703
|
|
657
|
-
lp->next_offset = dead_offset;
|
658
704
|
DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
|
659
|
-
|
705
|
+
|
706
|
+
uint32_t dead_offset = dead_plh->next_offset;
|
707
|
+
lp->next_offset = dead_offset;
|
708
|
+
dead_plh->next_offset = offset;
|
660
709
|
|
661
710
|
return NO_ERROR;
|
662
711
|
}
|
data/ext/whistlepig/segment.h
CHANGED
@@ -150,4 +150,7 @@ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32
|
|
150
150
|
// private: return the size on disk of a position array
|
151
151
|
wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;
|
152
152
|
|
153
|
+
// private: count the number of occurrences of a particular term
|
154
|
+
wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* term, uint32_t* num_results);
|
155
|
+
|
153
156
|
#endif
|
data/ext/whistlepig/snippeter.c
CHANGED
@@ -87,7 +87,7 @@ RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t
|
|
87
87
|
|
88
88
|
RARRAY_INIT(pword, words);
|
89
89
|
while(yylex(*scanner) != TOK_DONE) {
|
90
|
-
pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
|
90
|
+
pword pw = { .token = strdup(yyget_text(*scanner)), .start = charpos->start, .end = charpos->end };
|
91
91
|
RARRAY_ADD(pword, words, pw);
|
92
92
|
}
|
93
93
|
|
data/ext/whistlepig/termhash.c
CHANGED
@@ -106,15 +106,15 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
106
106
|
|
107
107
|
// get pointers to the old locations
|
108
108
|
term* oldkeys = TERMHASH_KEYS(h);
|
109
|
-
|
109
|
+
posting_list_header* oldvals = TERMHASH_VALS(h);
|
110
110
|
|
111
111
|
// set pointers to the new locations
|
112
112
|
uint32_t* newflags = (uint32_t*)h->boundary;
|
113
113
|
term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
|
114
|
-
|
114
|
+
posting_list_header* newvals = (posting_list_header*)(newkeys + new_n_buckets);
|
115
115
|
|
116
116
|
// move the vals and keys
|
117
|
-
memmove(newvals, oldvals, h->n_buckets * sizeof(
|
117
|
+
memmove(newvals, oldvals, h->n_buckets * sizeof(posting_list_header));
|
118
118
|
memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
|
119
119
|
|
120
120
|
// clear the new flags
|
@@ -124,8 +124,7 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
124
124
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
125
125
|
if (iseither(flagbaks, j) == 0) {
|
126
126
|
term key = newkeys[j];
|
127
|
-
|
128
|
-
val = newvals[j];
|
127
|
+
posting_list_header val = newvals[j];
|
129
128
|
set_isdel_true(flagbaks, j);
|
130
129
|
while (1) {
|
131
130
|
uint32_t inc, k, i;
|
@@ -139,7 +138,7 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
139
138
|
set_isempty_false(newflags, i);
|
140
139
|
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
141
140
|
{ term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
142
|
-
{
|
141
|
+
{ posting_list_header tmp = newvals[i]; newvals[i] = val; val = tmp; }
|
143
142
|
set_isdel_true(flagbaks, i);
|
144
143
|
} else {
|
145
144
|
newkeys[i] = key;
|
@@ -235,20 +234,20 @@ void termhash_del(termhash *h, uint32_t x) {
|
|
235
234
|
}
|
236
235
|
}
|
237
236
|
|
238
|
-
|
239
|
-
|
237
|
+
posting_list_header* termhash_get_val(termhash* h, term t) {
|
238
|
+
posting_list_header* vals = TERMHASH_VALS(h);
|
240
239
|
uint32_t idx = termhash_get(h, t);
|
241
|
-
if(idx == h->n_buckets) return
|
242
|
-
return vals[idx];
|
240
|
+
if(idx == h->n_buckets) return NULL;
|
241
|
+
return &vals[idx];
|
243
242
|
}
|
244
243
|
|
245
|
-
wp_error* termhash_put_val(termhash* h, term t,
|
244
|
+
wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) {
|
246
245
|
int status;
|
247
|
-
|
246
|
+
posting_list_header* vals = TERMHASH_VALS(h);
|
248
247
|
uint32_t loc = termhash_put(h, t, &status);
|
249
248
|
DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
|
250
249
|
if(status == -1) RAISE_ERROR("out of space in hash");
|
251
|
-
vals[loc]
|
250
|
+
memcpy(&vals[loc], val, sizeof(posting_list_header));
|
252
251
|
return NO_ERROR;
|
253
252
|
}
|
254
253
|
|
@@ -257,22 +256,22 @@ int termhash_needs_bump(termhash* h) {
|
|
257
256
|
}
|
258
257
|
|
259
258
|
// returns the total size in bytes
|
260
|
-
// memory layout: termhash, then:
|
259
|
+
// memory layout: termhash struct, then:
|
261
260
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
262
261
|
// n_buckets terms for the keys
|
263
|
-
// n_buckets
|
262
|
+
// n_buckets posting_list_header for the vals (offsets into postings lists)
|
264
263
|
static uint32_t size(uint32_t n_buckets) {
|
265
264
|
uint32_t size = (uint32_t)sizeof(termhash) +
|
266
265
|
(((n_buckets >> 4) + 1) * (uint32_t)sizeof(uint32_t)) +
|
267
266
|
(n_buckets * (uint32_t)sizeof(term)) +
|
268
|
-
(n_buckets * (uint32_t)sizeof(
|
267
|
+
(n_buckets * (uint32_t)sizeof(posting_list_header));
|
269
268
|
|
270
269
|
DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
|
271
270
|
n_buckets,
|
272
271
|
(long)sizeof(termhash),
|
273
272
|
(long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
|
274
273
|
(long)(n_buckets * sizeof(term)),
|
275
|
-
(long)(n_buckets * sizeof(
|
274
|
+
(long)(n_buckets * sizeof(posting_list_header)),
|
276
275
|
size);
|
277
276
|
|
278
277
|
return size;
|
data/ext/whistlepig/termhash.h
CHANGED
@@ -22,6 +22,18 @@ typedef struct term {
|
|
22
22
|
uint32_t word_s;
|
23
23
|
} term;
|
24
24
|
|
25
|
+
typedef struct posting_list_header {
|
26
|
+
uint32_t count;
|
27
|
+
uint32_t next_offset;
|
28
|
+
} posting_list_header;
|
29
|
+
|
30
|
+
typedef struct block_header {
|
31
|
+
uint32_t max_docid;
|
32
|
+
uint32_t next_offset;
|
33
|
+
uint32_t block_start;
|
34
|
+
uint8_t data[];
|
35
|
+
} block_header;
|
36
|
+
|
25
37
|
#define INITIAL_N_BUCKETS_IDX 1
|
26
38
|
|
27
39
|
typedef struct termhash {
|
@@ -31,12 +43,12 @@ typedef struct termhash {
|
|
31
43
|
// in memory at this point
|
32
44
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
33
45
|
// n_buckets terms for the keys
|
34
|
-
// n_buckets
|
46
|
+
// n_buckets posting_list_header for the vals
|
35
47
|
} termhash;
|
36
48
|
|
37
49
|
#define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
|
38
50
|
#define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
39
|
-
#define TERMHASH_VALS(h) ((
|
51
|
+
#define TERMHASH_VALS(h) ((posting_list_header*)(TERMHASH_KEYS(h) + (h)->n_buckets))
|
40
52
|
|
41
53
|
// API methods
|
42
54
|
|
@@ -50,14 +62,14 @@ uint32_t termhash_get(termhash *h, term t);
|
|
50
62
|
|
51
63
|
// public: get the posting list header for a term. returns NULL if the term is not in
|
52
64
|
// the hash.
|
53
|
-
|
65
|
+
posting_list_header* termhash_get_val(termhash* h, term t); // convenience
|
54
66
|
|
55
67
|
// private: khash-style setter: insert a term into the hash. see the code
|
56
68
|
// for details on what all the return values mean.
|
57
69
|
uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
|
58
70
|
|
59
71
|
// public: adds a term to the hash with the given value
|
60
|
-
wp_error* termhash_put_val(termhash* h, term t,
|
72
|
+
wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) RAISES_ERROR; // convenience
|
61
73
|
|
62
74
|
// public: returns the byte size of the termhash
|
63
75
|
uint32_t termhash_size(termhash* h);
|
data/ext/whistlepig/whistlepig.c
CHANGED
@@ -7,7 +7,9 @@ static VALUE c_index;
|
|
7
7
|
static VALUE c_entry;
|
8
8
|
static VALUE c_query;
|
9
9
|
static VALUE c_error;
|
10
|
-
static VALUE
|
10
|
+
static VALUE c_parse_error;
|
11
|
+
static VALUE c_sys_error;
|
12
|
+
static VALUE c_version_error;
|
11
13
|
|
12
14
|
static void index_free(wp_index* index) {
|
13
15
|
wp_error* e = wp_index_free(index);
|
@@ -20,7 +22,12 @@ static void index_free(wp_index* index) {
|
|
20
22
|
|
21
23
|
#define RAISE_IF_NECESSARY(e) do { \
|
22
24
|
if(e != NULL) { \
|
23
|
-
VALUE exc
|
25
|
+
VALUE exc; \
|
26
|
+
switch(e->type) { \
|
27
|
+
case WP_ERROR_TYPE_SYSTEM: exc = rb_exc_new2(c_sys_error, e->msg); break; \
|
28
|
+
case WP_ERROR_TYPE_VERSION: exc = rb_exc_new2(c_version_error, e->msg); break; \
|
29
|
+
default: exc = rb_exc_new2(c_error, e->msg); break; \
|
30
|
+
} \
|
24
31
|
wp_error_free(e); \
|
25
32
|
rb_exc_raise(exc); \
|
26
33
|
} \
|
@@ -331,7 +338,7 @@ static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
|
|
331
338
|
wp_query* query;
|
332
339
|
wp_error* e = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &query);
|
333
340
|
if(e != NULL) {
|
334
|
-
VALUE exc = rb_exc_new2(
|
341
|
+
VALUE exc = rb_exc_new2(c_parse_error, e->msg);
|
335
342
|
wp_error_free(e);
|
336
343
|
rb_exc_raise(exc);
|
337
344
|
}
|
@@ -614,5 +621,7 @@ void Init_whistlepig() {
|
|
614
621
|
rb_define_attr(c_query, "query", 1, 0);
|
615
622
|
|
616
623
|
c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
|
617
|
-
|
624
|
+
c_parse_error = rb_define_class_under(m_whistlepig, "ParseError", c_error);
|
625
|
+
c_sys_error = rb_define_class_under(m_whistlepig, "SystemError", c_error);
|
626
|
+
c_version_error = rb_define_class_under(m_whistlepig, "VersionError", c_error);
|
618
627
|
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whistlepig
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.2
|
4
|
+
version: '0.12'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-09 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Whistlepig is a minimalist realtime full-text search index. Its goal
|
15
15
|
is to be as small and minimally-featured as possible, while still remaining useful,
|