whistlepig 0.11.2 → 0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -2
- data/ext/whistlepig/error.c +2 -1
- data/ext/whistlepig/error.h +17 -7
- data/ext/whistlepig/index.c +58 -18
- data/ext/whistlepig/mmap-obj.c +4 -4
- data/ext/whistlepig/search.c +10 -5
- data/ext/whistlepig/segment.c +83 -34
- data/ext/whistlepig/segment.h +3 -0
- data/ext/whistlepig/snippeter.c +1 -1
- data/ext/whistlepig/termhash.c +16 -17
- data/ext/whistlepig/termhash.h +16 -4
- data/ext/whistlepig/whistlepig.c +13 -4
- metadata +2 -2
data/README
CHANGED
@@ -8,7 +8,7 @@ full-text search without the frills, Whistlepig may be for you.
|
|
8
8
|
Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
|
9
9
|
bindings.
|
10
10
|
|
11
|
-
Latest version: 0.
|
11
|
+
Latest version: 0.12, released 2012-06-09.
|
12
12
|
Status: beta
|
13
13
|
News: http://all-thing.net/label/whistlepig/
|
14
14
|
Homepage: http://masanjin.net/whistlepig/
|
@@ -16,7 +16,7 @@ Latest version: 0.11.2, released 2012-05-19.
|
|
16
16
|
|
17
17
|
= Getting it
|
18
18
|
|
19
|
-
Tarball: http://masanjin.net/whistlepig/whistlepig-0.
|
19
|
+
Tarball: http://masanjin.net/whistlepig/whistlepig-0.12.tar.gz
|
20
20
|
Rubygem: gem install whistlepig
|
21
21
|
Git: git clone git://github.com/wmorgan/whistlepig.git
|
22
22
|
|
data/ext/whistlepig/error.c
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
#include <stdlib.h>
|
2
2
|
#include "error.h"
|
3
3
|
|
4
|
-
wp_error* wp_error_new(const char* msg, const char* src) {
|
4
|
+
wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) {
|
5
5
|
wp_error* ret = malloc(sizeof(wp_error));
|
6
6
|
ret->msg = msg;
|
7
|
+
ret->type = type;
|
7
8
|
ret->size = 1;
|
8
9
|
ret->srcs = malloc(sizeof(const char*));
|
9
10
|
ret->srcs[0] = src;
|
data/ext/whistlepig/error.h
CHANGED
@@ -25,8 +25,13 @@
|
|
25
25
|
#include <stdio.h>
|
26
26
|
#include <string.h>
|
27
27
|
|
28
|
+
#define WP_ERROR_TYPE_BASIC 1
|
29
|
+
#define WP_ERROR_TYPE_SYSTEM 2
|
30
|
+
#define WP_ERROR_TYPE_VERSION 3
|
31
|
+
|
28
32
|
// pseudo-backtrace
|
29
33
|
typedef struct wp_error {
|
34
|
+
unsigned char type;
|
30
35
|
unsigned int size;
|
31
36
|
const char* msg;
|
32
37
|
const char** srcs;
|
@@ -39,25 +44,30 @@ typedef struct wp_error {
|
|
39
44
|
// API methods
|
40
45
|
|
41
46
|
// private: make a new error object with a message and source line
|
42
|
-
wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
|
47
|
+
wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) RAISES_ERROR;
|
43
48
|
// private: add a source line to a pre-existing error
|
44
49
|
wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
|
45
50
|
|
46
51
|
// public: free an error, once handled
|
47
52
|
void wp_error_free(wp_error* e);
|
48
53
|
|
49
|
-
//
|
50
|
-
#define
|
54
|
+
// private: internal mechanics for raising an error
|
55
|
+
#define RAISE_ERROR_OF_TYPE(type, fmt, ...) do { \
|
51
56
|
char* msg = malloc(1024); \
|
52
57
|
char* src = malloc(1024); \
|
53
58
|
snprintf(msg, 1024, fmt, ## __VA_ARGS__); \
|
54
59
|
snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
|
55
|
-
return wp_error_new(msg, src); \
|
60
|
+
return wp_error_new(msg, src, type); \
|
56
61
|
} while(0)
|
57
62
|
|
58
|
-
// public: raise
|
59
|
-
|
60
|
-
|
63
|
+
// public: raise a basic error
|
64
|
+
#define RAISE_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_BASIC, fmt, ## __VA_ARGS__)
|
65
|
+
|
66
|
+
// public: raise a version error
|
67
|
+
#define RAISE_VERSION_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_VERSION, fmt, ## __VA_ARGS__)
|
68
|
+
|
69
|
+
// public: raise a system error with strerror() automatically appended to the message
|
70
|
+
#define RAISE_SYSERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_SYSTEM, fmt ": %s", ## __VA_ARGS__, strerror(errno))
|
61
71
|
|
62
72
|
// public: relay an error up the stack if the called function returns one.
|
63
73
|
#define RELAY_ERROR(e) do { \
|
data/ext/whistlepig/index.c
CHANGED
@@ -32,7 +32,6 @@ RAISING_STATIC(release_lock(wp_index* index)) {
|
|
32
32
|
return NO_ERROR;
|
33
33
|
}
|
34
34
|
|
35
|
-
|
36
35
|
RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
|
37
36
|
ii->index_version = index_version;
|
38
37
|
ii->num_segments = 0;
|
@@ -42,7 +41,7 @@ RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
|
|
42
41
|
}
|
43
42
|
|
44
43
|
RAISING_STATIC(index_info_validate(index_info* ii, uint32_t index_version)) {
|
45
|
-
if(ii->index_version != index_version)
|
44
|
+
if(ii->index_version != index_version) RAISE_VERSION_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
|
46
45
|
return NO_ERROR;
|
47
46
|
}
|
48
47
|
|
@@ -149,6 +148,57 @@ wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
|
|
149
148
|
return NO_ERROR;
|
150
149
|
}
|
151
150
|
|
151
|
+
#define RESULT_BUF_SIZE 1024
|
152
|
+
// count the results by running the query until it stops. slow! matching doc ids are fetched RESULT_BUF_SIZE at a time into a scratch buffer and discarded; only the running total is written to *num_results. errors from wp_index_run_query are relayed to the caller.
|
153
|
+
RAISING_STATIC(count_query_by_running_it(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
154
|
+
uint64_t results[RESULT_BUF_SIZE]; // scratch space only; the ids written here are never read
|
155
|
+
|
156
|
+
*num_results = 0;
|
157
|
+
while(1) {
|
158
|
+
uint32_t this_num_results;
|
159
|
+
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results)); // each call resumes the same query and yields the next batch
|
160
|
+
*num_results += this_num_results;
|
161
|
+
if(this_num_results < RESULT_BUF_SIZE) break; // done: a short batch is treated as the end of the results
|
162
|
+
}
|
163
|
+
|
164
|
+
return NO_ERROR;
|
165
|
+
}
|
166
|
+
|
167
|
+
// count the results for a single-term query by summing, over every segment,
// the count that wp_segment_count_term reports for query->field / query->word.
// the total is written to *num_results; any error from a callee is relayed.
// NOTE(review): RELAY_ERROR appears to return as soon as a callee fails, so a
// failure between a lock grab and the matching release would leave that lock
// held -- confirm whether the callers/locks tolerate this.
RAISING_STATIC(count_query_from_posting_list_header(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
168
|
+
// make sure we know about all segments (one could've been added by a writer)
|
169
|
+
RELAY_ERROR(grab_readlock(index));
|
170
|
+
RELAY_ERROR(ensure_all_segments(index));
|
171
|
+
RELAY_ERROR(release_lock(index));
|
172
|
+
|
173
|
+
*num_results = 0;
|
174
|
+
for(int i = 0; i < index->num_segments; i++) {
|
175
|
+
uint32_t this_num_results;
|
176
|
+
|
177
|
+
DEBUG("counting on segment %d", i);
|
178
|
+
wp_segment* seg = &index->segments[i];
|
179
|
+
RELAY_ERROR(wp_segment_grab_readlock(seg)); // hold the segment read lock across reload + count
|
180
|
+
RELAY_ERROR(wp_segment_reload(seg));
|
181
|
+
RELAY_ERROR(wp_segment_count_term(seg, query->field, query->word, &this_num_results));
|
182
|
+
RELAY_ERROR(wp_segment_release_lock(seg));
|
183
|
+
*num_results += this_num_results;
|
184
|
+
DEBUG("got %d results from segment %d", this_num_results, i);
|
185
|
+
}
|
186
|
+
|
187
|
+
return NO_ERROR;
|
188
|
+
}
|
189
|
+
|
190
|
+
// count the results of a query, choosing the strategy by query type: term and
// label queries are answered via count_query_from_posting_list_header; every
// other type is counted by actually running the query. the total is written to
// *num_results and callee errors are relayed.
RAISING_STATIC(count_query(wp_index* index, wp_query* query, uint32_t* num_results)) {
|
191
|
+
switch(query->type) {
|
192
|
+
case WP_QUERY_TERM:
|
193
|
+
case WP_QUERY_LABEL:
|
194
|
+
RELAY_ERROR(count_query_from_posting_list_header(index, query, num_results));
|
195
|
+
break;
|
196
|
+
case WP_QUERY_EVERY: // TODO -- special case this (for now it falls through to the slow path below)
|
197
|
+
default:
|
198
|
+
RELAY_ERROR(count_query_by_running_it(index, query, num_results));
|
199
|
+
}
|
200
|
+
return NO_ERROR;
|
201
|
+
}
|
152
202
|
// can be called multiple times to resume
|
153
203
|
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
|
154
204
|
*num_results = 0;
|
@@ -209,20 +259,10 @@ wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_
|
|
209
259
|
return NO_ERROR;
|
210
260
|
}
|
211
261
|
|
212
|
-
|
213
|
-
// count the results by just running the query until it stops. slow!
|
262
|
+
// just count the results, don't return them
|
214
263
|
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
|
215
|
-
uint64_t results[RESULT_BUF_SIZE];
|
216
|
-
|
217
|
-
*num_results = 0;
|
218
264
|
RELAY_ERROR(wp_index_setup_query(index, query));
|
219
|
-
|
220
|
-
uint32_t this_num_results;
|
221
|
-
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
|
222
|
-
*num_results += this_num_results;
|
223
|
-
if(this_num_results < RESULT_BUF_SIZE) break; // done
|
224
|
-
}
|
225
|
-
|
265
|
+
RELAY_ERROR(count_query(index, query, num_results));
|
226
266
|
RELAY_ERROR(wp_index_teardown_query(index, query));
|
227
267
|
|
228
268
|
return NO_ERROR;
|
@@ -369,7 +409,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
|
|
369
409
|
if(doc_id > index->docid_offsets[i - 1]) {
|
370
410
|
wp_segment* seg = &index->segments[i - 1];
|
371
411
|
|
372
|
-
DEBUG("found doc %
|
412
|
+
DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
|
373
413
|
RELAY_ERROR(wp_segment_grab_writelock(seg));
|
374
414
|
RELAY_ERROR(wp_segment_reload(seg));
|
375
415
|
RELAY_ERROR(wp_segment_add_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
|
@@ -377,7 +417,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
|
|
377
417
|
found = 1;
|
378
418
|
break;
|
379
419
|
}
|
380
|
-
else DEBUG("did not find doc %
|
420
|
+
else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
|
381
421
|
}
|
382
422
|
|
383
423
|
if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
|
@@ -396,7 +436,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
|
|
396
436
|
if(doc_id > index->docid_offsets[i - 1]) {
|
397
437
|
wp_segment* seg = &index->segments[i - 1];
|
398
438
|
|
399
|
-
DEBUG("found doc %
|
439
|
+
DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
|
400
440
|
RELAY_ERROR(wp_segment_grab_writelock(seg));
|
401
441
|
RELAY_ERROR(wp_segment_reload(seg));
|
402
442
|
RELAY_ERROR(wp_segment_remove_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
|
@@ -404,7 +444,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
|
|
404
444
|
found = 1;
|
405
445
|
break;
|
406
446
|
}
|
407
|
-
else DEBUG("did not find doc %
|
447
|
+
else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
|
408
448
|
}
|
409
449
|
|
410
450
|
if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
|
data/ext/whistlepig/mmap-obj.c
CHANGED
@@ -14,7 +14,7 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
|
|
14
14
|
if(o->fd == -1) RAISE_SYSERROR("cannot create %s", pathname);
|
15
15
|
|
16
16
|
uint32_t size = initial_size + (uint32_t)sizeof(mmap_obj_header);
|
17
|
-
DEBUG("creating %s with %u + %
|
17
|
+
DEBUG("creating %s with %u + %lu = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic);
|
18
18
|
lseek(o->fd, size - 1, SEEK_SET);
|
19
19
|
ssize_t num_bytes = write(o->fd, "", 1);
|
20
20
|
if(num_bytes == -1) RAISE_SYSERROR("write");
|
@@ -35,14 +35,14 @@ wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) {
|
|
35
35
|
// load header
|
36
36
|
o->content = mmap(NULL, sizeof(mmap_obj_header), PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
37
37
|
if(o->content == MAP_FAILED) RAISE_SYSERROR("header mmap");
|
38
|
-
DEBUG("loaded header of %
|
38
|
+
DEBUG("loaded header of %lu bytes for %s object", sizeof(mmap_obj_header), magic);
|
39
39
|
|
40
40
|
RELAY_ERROR(validate(o->content, magic));
|
41
41
|
|
42
42
|
o->loaded_size = o->content->size;
|
43
43
|
|
44
44
|
uint32_t size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
|
45
|
-
DEBUG("full size is %u bytes (including %
|
45
|
+
DEBUG("full size is %u bytes (including %lu-byte header)", size, sizeof(mmap_obj_header));
|
46
46
|
if(munmap(o->content, sizeof(mmap_obj_header)) == -1) RAISE_SYSERROR("munmap");
|
47
47
|
|
48
48
|
o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
@@ -85,7 +85,7 @@ wp_error* mmap_obj_resize(mmap_obj* o, uint32_t data_size) {
|
|
85
85
|
}
|
86
86
|
|
87
87
|
wp_error* mmap_obj_unload(mmap_obj* o) {
|
88
|
-
DEBUG("unloading %
|
88
|
+
DEBUG("unloading %lu bytes", sizeof(mmap_obj_header) + o->content->size);
|
89
89
|
if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
|
90
90
|
o->content = NULL;
|
91
91
|
return NO_ERROR;
|
data/ext/whistlepig/search.c
CHANGED
@@ -195,8 +195,14 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
|
|
195
195
|
|
196
196
|
t.word_s = stringmap_string_to_int(sh, sp, q->word);
|
197
197
|
|
198
|
-
uint32_t offset
|
199
|
-
|
198
|
+
uint32_t offset;
|
199
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
200
|
+
|
201
|
+
DEBUG("posting list header for %s:%s (-> %u:%u) is %p", q->field, q->word, t.field_s, t.word_s, plh);
|
202
|
+
if(plh == NULL) offset = OFFSET_NONE;
|
203
|
+
else offset = plh->next_offset;
|
204
|
+
|
205
|
+
if(plh) DEBUG("posting list header has count=%u next_offset=%u", plh->count, plh->next_offset);
|
200
206
|
|
201
207
|
if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
|
202
208
|
else {
|
@@ -475,8 +481,8 @@ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* resu
|
|
475
481
|
|
476
482
|
while(!found && !*done) {
|
477
483
|
RELAY_ERROR(query_next_doc(master, seg, result, done));
|
478
|
-
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
479
484
|
if(!*done) {
|
485
|
+
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
480
486
|
search_doc = result->doc_id;
|
481
487
|
wp_search_result_free(result); // sigh
|
482
488
|
RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
|
@@ -564,8 +570,8 @@ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* re
|
|
564
570
|
|
565
571
|
while(!found && !*done) {
|
566
572
|
RELAY_ERROR(query_next_doc(master, seg, result, done));
|
567
|
-
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
568
573
|
if(!*done) {
|
574
|
+
DEBUG("master reports doc %u done %d", result->doc_id, *done);
|
569
575
|
search_doc = result->doc_id;
|
570
576
|
wp_search_result_free(result); // sigh
|
571
577
|
RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
|
@@ -784,7 +790,6 @@ static wp_error* every_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_
|
|
784
790
|
|
785
791
|
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
|
786
792
|
int done;
|
787
|
-
|
788
793
|
*num_results = 0;
|
789
794
|
|
790
795
|
#ifdef DEBUG
|
data/ext/whistlepig/segment.c
CHANGED
@@ -3,13 +3,16 @@
|
|
3
3
|
#include <unistd.h>
|
4
4
|
#include "whistlepig.h"
|
5
5
|
|
6
|
-
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE
|
6
|
+
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
|
7
7
|
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
|
8
8
|
|
9
|
-
#define SEGMENT_VERSION
|
9
|
+
#define SEGMENT_VERSION 4
|
10
10
|
|
11
11
|
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
|
12
12
|
|
13
|
+
static posting_list_header blank_plh = { .count = 0, .next_offset = OFFSET_NONE };
|
14
|
+
static term dead_term = { .field_s = 0, .word_s = 0 };
|
15
|
+
|
13
16
|
wp_error* wp_segment_grab_readlock(wp_segment* seg) {
|
14
17
|
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
15
18
|
RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
|
@@ -28,6 +31,23 @@ wp_error* wp_segment_release_lock(wp_segment* seg) {
|
|
28
31
|
return NO_ERROR;
|
29
32
|
}
|
30
33
|
|
34
|
+
// look up the posting list header for field:word in this segment's termhash
// and store its count in *num_results (0 when the termhash has no entry for
// the term). a NULL field selects field id 0, i.e. a label lookup. no postings
// are read. always returns NO_ERROR.
// NOTE(review): takes no lock itself; the caller visible in index.c holds the
// segment read lock around this call -- confirm other callers do the same.
wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* word, uint32_t* num_results) {
|
35
|
+
stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
|
36
|
+
stringpool* sp = MMAP_OBJ(seg->stringpool, stringpool);
|
37
|
+
termhash* th = MMAP_OBJ(seg->termhash, termhash);
|
38
|
+
|
39
|
+
term t;
|
40
|
+
if(field == NULL) t.field_s = 0; // label sentinel
|
41
|
+
else t.field_s = stringmap_string_to_int(sh, sp, field);
|
42
|
+
t.word_s = stringmap_string_to_int(sh, sp, word); // presumably (uint32_t)-1 for an unknown string, which then finds no entry below -- TODO confirm
|
43
|
+
|
44
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
45
|
+
if(plh == NULL) *num_results = 0; // term not present in this segment
|
46
|
+
else *num_results = plh->count;
|
47
|
+
|
48
|
+
return NO_ERROR;
|
49
|
+
}
|
50
|
+
|
31
51
|
static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t postings_type_and_flags) {
|
32
52
|
pr->postings_type_and_flags = postings_type_and_flags;
|
33
53
|
pr->num_postings = 0;
|
@@ -44,7 +64,7 @@ RAISING_STATIC(segment_info_init(segment_info* si, uint32_t segment_version)) {
|
|
44
64
|
}
|
45
65
|
|
46
66
|
RAISING_STATIC(segment_info_validate(segment_info* si, uint32_t segment_version)) {
|
47
|
-
if(si->segment_version != segment_version)
|
67
|
+
if(si->segment_version != segment_version) RAISE_VERSION_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
|
48
68
|
return NO_ERROR;
|
49
69
|
}
|
50
70
|
|
@@ -449,7 +469,7 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
|
|
449
469
|
RELAY_ERROR(bump_stringpool(s, &success));
|
450
470
|
RELAY_ERROR(bump_termhash(s, &success));
|
451
471
|
|
452
|
-
DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
|
472
|
+
DEBUG("adding posting for %s:%s and doc %u with %u positions", field, word, doc_id, num_positions);
|
453
473
|
|
454
474
|
postings_region* pr = MMAP_OBJ(s->postings, postings_region);
|
455
475
|
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
@@ -461,26 +481,38 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
|
|
461
481
|
RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
|
462
482
|
RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));
|
463
483
|
|
484
|
+
DEBUG("%s:%s maps to %u:%u", field, word, t.field_s, t.word_s);
|
485
|
+
|
464
486
|
// find the offset of the next posting
|
487
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
488
|
+
if(plh == NULL) {
|
489
|
+
RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
|
490
|
+
plh = termhash_get_val(th, t);
|
491
|
+
}
|
492
|
+
DEBUG("posting list header for %s:%s is at %p", field, word, plh);
|
493
|
+
|
465
494
|
posting po;
|
466
|
-
uint32_t next_offset =
|
467
|
-
|
468
|
-
if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
|
495
|
+
uint32_t next_offset = plh->next_offset;
|
496
|
+
|
497
|
+
if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy [PERFORMANCE]
|
469
498
|
RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
|
470
499
|
if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
|
471
500
|
}
|
472
501
|
|
473
502
|
// write the entry to the postings region
|
474
503
|
uint32_t entry_offset = pr->postings_head;
|
475
|
-
|
504
|
+
DEBUG("writing posting at offset %u. next offset is %u.", entry_offset, next_offset);
|
505
|
+
|
476
506
|
po.doc_id = doc_id;
|
477
507
|
po.next_offset = next_offset;
|
478
508
|
po.num_positions = num_positions;
|
479
509
|
RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
|
480
|
-
DEBUG("
|
510
|
+
DEBUG("posting list head now at %u", pr->postings_head);
|
481
511
|
|
482
512
|
// really finally, update the tail pointer so that readers can access this posting
|
483
|
-
|
513
|
+
plh->count++;
|
514
|
+
plh->next_offset = entry_offset;
|
515
|
+
DEBUG("posting list header for %s:%s now reads count=%u offset=%u", field, word, plh->count, plh->next_offset);
|
484
516
|
|
485
517
|
return NO_ERROR;
|
486
518
|
}
|
@@ -537,22 +569,25 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
537
569
|
|
538
570
|
// find the previous and next label postings, between which we'll insert this
|
539
571
|
// posting
|
540
|
-
|
541
|
-
|
572
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
573
|
+
if(plh == NULL) {
|
574
|
+
RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
|
575
|
+
plh = termhash_get_val(th, t);
|
576
|
+
}
|
577
|
+
|
578
|
+
uint32_t next_offset = plh->next_offset;
|
542
579
|
docid_t last_docid = DOCID_NONE;
|
580
|
+
uint32_t prev_offset = OFFSET_NONE;
|
543
581
|
|
544
|
-
if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
|
545
582
|
DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
|
546
583
|
|
547
584
|
while(next_offset != OFFSET_NONE) {
|
548
585
|
label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
|
549
586
|
|
550
|
-
if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
|
587
|
+
if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
|
551
588
|
RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
|
552
|
-
|
553
|
-
else {
|
589
|
+
else
|
554
590
|
last_docid = lp->doc_id;
|
555
|
-
}
|
556
591
|
|
557
592
|
DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
|
558
593
|
if(lp->doc_id == doc_id) {
|
@@ -567,18 +602,23 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
567
602
|
// find a space for the posting by first checking for a free postings in the
|
568
603
|
// dead list. the dead list is the list stored under the sentinel term with
|
569
604
|
// field 0 and word 0.
|
570
|
-
|
605
|
+
posting_list_header* dead_plh = termhash_get_val(th, dead_term);
|
606
|
+
if(dead_plh == NULL) {
|
607
|
+
RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
|
608
|
+
dead_plh = termhash_get_val(th, t);
|
609
|
+
}
|
610
|
+
|
571
611
|
uint32_t entry_offset;
|
572
|
-
uint32_t dead_offset =
|
573
|
-
if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
|
612
|
+
uint32_t dead_offset = dead_plh->next_offset;
|
574
613
|
|
575
614
|
if(dead_offset == OFFSET_NONE) { // make a new posting
|
576
615
|
entry_offset = pr->postings_head;
|
577
616
|
}
|
578
617
|
else { // we'll use this one; remove it from the linked list
|
579
618
|
DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
|
580
|
-
entry_offset =
|
581
|
-
|
619
|
+
entry_offset = dead_plh->next_offset;
|
620
|
+
dead_plh->next_offset = wp_segment_label_posting_at(pr, dead_offset)->next_offset;
|
621
|
+
dead_plh->count--;
|
582
622
|
}
|
583
623
|
|
584
624
|
// finally, write the entry to the label postings region
|
@@ -588,11 +628,12 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
588
628
|
po->next_offset = next_offset;
|
589
629
|
|
590
630
|
pr->postings_head += (uint32_t)sizeof(label_posting);
|
591
|
-
DEBUG("label
|
631
|
+
DEBUG("label posting list head now at %u", pr->postings_head);
|
592
632
|
|
593
633
|
// really finally, update either the previous offset or the tail pointer
|
594
634
|
// for this label so that readers can access this posting
|
595
|
-
|
635
|
+
plh->count++;
|
636
|
+
if(prev_offset == OFFSET_NONE) plh->next_offset = entry_offset;
|
596
637
|
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
|
597
638
|
|
598
639
|
return NO_ERROR;
|
@@ -615,13 +656,16 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
615
656
|
t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there
|
616
657
|
|
617
658
|
// find the posting and the previous posting in the list, if any
|
618
|
-
uint32_t prev_offset = OFFSET_NONE;
|
619
|
-
uint32_t offset = termhash_get_val(th, t);
|
620
659
|
docid_t last_docid = DOCID_NONE;
|
660
|
+
uint32_t prev_offset = OFFSET_NONE;
|
661
|
+
posting_list_header* plh = termhash_get_val(th, t);
|
662
|
+
if(plh == NULL) {
|
663
|
+
DEBUG("no such label %s", label);
|
664
|
+
return NO_ERROR;
|
665
|
+
}
|
621
666
|
|
622
|
-
|
667
|
+
uint32_t offset = plh->next_offset;
|
623
668
|
label_posting* lp = NULL;
|
624
|
-
|
625
669
|
while(offset != OFFSET_NONE) {
|
626
670
|
lp = wp_segment_label_posting_at(pr, offset);
|
627
671
|
|
@@ -646,17 +690,22 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
646
690
|
}
|
647
691
|
|
648
692
|
// we've found the posting; now remove it from the list
|
649
|
-
if(prev_offset == OFFSET_NONE)
|
693
|
+
if(prev_offset == OFFSET_NONE) plh->next_offset = lp->next_offset;
|
650
694
|
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
|
695
|
+
plh->count--;
|
651
696
|
|
652
697
|
// now add it to the dead list for later reclamation
|
653
|
-
|
654
|
-
|
655
|
-
|
698
|
+
posting_list_header* dead_plh = termhash_get_val(th, dead_term);
|
699
|
+
if(dead_plh == NULL) {
|
700
|
+
RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
|
701
|
+
dead_plh = termhash_get_val(th, t);
|
702
|
+
}
|
656
703
|
|
657
|
-
lp->next_offset = dead_offset;
|
658
704
|
DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
|
659
|
-
|
705
|
+
|
706
|
+
uint32_t dead_offset = dead_plh->next_offset;
|
707
|
+
lp->next_offset = dead_offset;
|
708
|
+
dead_plh->next_offset = offset;
dead_plh->count++; // a posting was pushed onto the dead list; wp_segment_add_label decrements this count when it pops one, so without the increment the unsigned count wraps
|
660
709
|
|
661
710
|
return NO_ERROR;
|
662
711
|
}
|
data/ext/whistlepig/segment.h
CHANGED
@@ -150,4 +150,7 @@ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32
|
|
150
150
|
// private: return the size on disk of a position array
|
151
151
|
wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;
|
152
152
|
|
153
|
+
// private: count the number of occurrences of a particular term
|
154
|
+
wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* term, uint32_t* num_results);
|
155
|
+
|
153
156
|
#endif
|
data/ext/whistlepig/snippeter.c
CHANGED
@@ -87,7 +87,7 @@ RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t
|
|
87
87
|
|
88
88
|
RARRAY_INIT(pword, words);
|
89
89
|
while(yylex(*scanner) != TOK_DONE) {
|
90
|
-
pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
|
90
|
+
pword pw = { .token = strdup(yyget_text(*scanner)), .start = charpos->start, .end = charpos->end };
|
91
91
|
RARRAY_ADD(pword, words, pw);
|
92
92
|
}
|
93
93
|
|
data/ext/whistlepig/termhash.c
CHANGED
@@ -106,15 +106,15 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
106
106
|
|
107
107
|
// get pointers to the old locations
|
108
108
|
term* oldkeys = TERMHASH_KEYS(h);
|
109
|
-
|
109
|
+
posting_list_header* oldvals = TERMHASH_VALS(h);
|
110
110
|
|
111
111
|
// set pointers to the new locations
|
112
112
|
uint32_t* newflags = (uint32_t*)h->boundary;
|
113
113
|
term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
|
114
|
-
|
114
|
+
posting_list_header* newvals = (posting_list_header*)(newkeys + new_n_buckets);
|
115
115
|
|
116
116
|
// move the vals and keys
|
117
|
-
memmove(newvals, oldvals, h->n_buckets * sizeof(
|
117
|
+
memmove(newvals, oldvals, h->n_buckets * sizeof(posting_list_header));
|
118
118
|
memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
|
119
119
|
|
120
120
|
// clear the new flags
|
@@ -124,8 +124,7 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
124
124
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
125
125
|
if (iseither(flagbaks, j) == 0) {
|
126
126
|
term key = newkeys[j];
|
127
|
-
|
128
|
-
val = newvals[j];
|
127
|
+
posting_list_header val = newvals[j];
|
129
128
|
set_isdel_true(flagbaks, j);
|
130
129
|
while (1) {
|
131
130
|
uint32_t inc, k, i;
|
@@ -139,7 +138,7 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
139
138
|
set_isempty_false(newflags, i);
|
140
139
|
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
141
140
|
{ term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
142
|
-
{
|
141
|
+
{ posting_list_header tmp = newvals[i]; newvals[i] = val; val = tmp; }
|
143
142
|
set_isdel_true(flagbaks, i);
|
144
143
|
} else {
|
145
144
|
newkeys[i] = key;
|
@@ -235,20 +234,20 @@ void termhash_del(termhash *h, uint32_t x) {
|
|
235
234
|
}
|
236
235
|
}
|
237
236
|
|
238
|
-
|
239
|
-
|
237
|
+
// convenience getter: returns a pointer to the posting list header stored for
// term t, or NULL when termhash_get reports the term as absent (index equal to
// h->n_buckets). the pointer refers to the hash's own value array, so writes
// through it update the stored header in place.
// NOTE(review): termhash_bump_size relocates the value array, so a pointer
// obtained before a resize presumably must not be used afterwards -- confirm.
posting_list_header* termhash_get_val(termhash* h, term t) {
|
238
|
+
posting_list_header* vals = TERMHASH_VALS(h);
|
240
239
|
uint32_t idx = termhash_get(h, t);
|
241
|
-
if(idx == h->n_buckets) return
|
242
|
-
return vals[idx];
|
240
|
+
if(idx == h->n_buckets) return NULL;
|
241
|
+
return &vals[idx];
|
243
242
|
}
|
244
243
|
|
245
|
-
wp_error* termhash_put_val(termhash* h, term t,
|
244
|
+
wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) {
|
246
245
|
int status;
|
247
|
-
|
246
|
+
posting_list_header* vals = TERMHASH_VALS(h);
|
248
247
|
uint32_t loc = termhash_put(h, t, &status);
|
249
248
|
DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
|
250
249
|
if(status == -1) RAISE_ERROR("out of space in hash");
|
251
|
-
vals[loc]
|
250
|
+
memcpy(&vals[loc], val, sizeof(posting_list_header));
|
252
251
|
return NO_ERROR;
|
253
252
|
}
|
254
253
|
|
@@ -257,22 +256,22 @@ int termhash_needs_bump(termhash* h) {
|
|
257
256
|
}
|
258
257
|
|
259
258
|
// returns the total size in bytes
|
260
|
-
// memory layout: termhash, then:
|
259
|
+
// memory layout: termhash struct, then:
|
261
260
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
262
261
|
// n_buckets terms for the keys
|
263
|
-
// n_buckets
|
262
|
+
// n_buckets posting_list_header for the vals (offsets into postings lists)
|
264
263
|
static uint32_t size(uint32_t n_buckets) {
|
265
264
|
uint32_t size = (uint32_t)sizeof(termhash) +
|
266
265
|
(((n_buckets >> 4) + 1) * (uint32_t)sizeof(uint32_t)) +
|
267
266
|
(n_buckets * (uint32_t)sizeof(term)) +
|
268
|
-
(n_buckets * (uint32_t)sizeof(
|
267
|
+
(n_buckets * (uint32_t)sizeof(posting_list_header));
|
269
268
|
|
270
269
|
DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
|
271
270
|
n_buckets,
|
272
271
|
(long)sizeof(termhash),
|
273
272
|
(long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
|
274
273
|
(long)(n_buckets * sizeof(term)),
|
275
|
-
(long)(n_buckets * sizeof(
|
274
|
+
(long)(n_buckets * sizeof(posting_list_header)),
|
276
275
|
size);
|
277
276
|
|
278
277
|
return size;
|
data/ext/whistlepig/termhash.h
CHANGED
@@ -22,6 +22,18 @@ typedef struct term {
|
|
22
22
|
uint32_t word_s;
|
23
23
|
} term;
|
24
24
|
|
25
|
+
typedef struct posting_list_header { // per-term value stored in the termhash
|
26
|
+
uint32_t count; // number of postings in the list: bumped when a posting or label is added, dropped when a label is removed
|
27
|
+
uint32_t next_offset; // offset of the posting at the head of the list, or OFFSET_NONE when the list is empty
|
28
|
+
} posting_list_header;
|
29
|
+
|
30
|
+
typedef struct block_header {
|
31
|
+
uint32_t max_docid;
|
32
|
+
uint32_t next_offset;
|
33
|
+
uint32_t block_start;
|
34
|
+
uint8_t data[];
|
35
|
+
} block_header;
|
36
|
+
|
25
37
|
#define INITIAL_N_BUCKETS_IDX 1
|
26
38
|
|
27
39
|
typedef struct termhash {
|
@@ -31,12 +43,12 @@ typedef struct termhash {
|
|
31
43
|
// in memory at this point
|
32
44
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
33
45
|
// n_buckets terms for the keys
|
34
|
-
// n_buckets
|
46
|
+
// n_buckets posting_list_header for the vals
|
35
47
|
} termhash;
|
36
48
|
|
37
49
|
#define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
|
38
50
|
#define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
39
|
-
#define TERMHASH_VALS(h) ((
|
51
|
+
#define TERMHASH_VALS(h) ((posting_list_header*)(TERMHASH_KEYS(h) + (h)->n_buckets))
|
40
52
|
|
41
53
|
// API methods
|
42
54
|
|
@@ -50,14 +62,14 @@ uint32_t termhash_get(termhash *h, term t);
|
|
50
62
|
|
51
63
|
// public: get the posting list header for a term. returns NULL if the term is
|
52
64
|
// not in the hash.
|
53
|
-
|
65
|
+
posting_list_header* termhash_get_val(termhash* h, term t); // convenience
|
54
66
|
|
55
67
|
// private: khash-style setter: insert a term into the hash. see the code
|
56
68
|
// for details on what all the return values mean.
|
57
69
|
uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
|
58
70
|
|
59
71
|
// public: adds a term to the hash with the given value
|
60
|
-
wp_error* termhash_put_val(termhash* h, term t,
|
72
|
+
wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) RAISES_ERROR; // convenience
|
61
73
|
|
62
74
|
// public: returns the byte size of the termhash
|
63
75
|
uint32_t termhash_size(termhash* h);
|
data/ext/whistlepig/whistlepig.c
CHANGED
@@ -7,7 +7,9 @@ static VALUE c_index;
|
|
7
7
|
static VALUE c_entry;
|
8
8
|
static VALUE c_query;
|
9
9
|
static VALUE c_error;
|
10
|
-
static VALUE
|
10
|
+
static VALUE c_parse_error;
|
11
|
+
static VALUE c_sys_error;
|
12
|
+
static VALUE c_version_error;
|
11
13
|
|
12
14
|
static void index_free(wp_index* index) {
|
13
15
|
wp_error* e = wp_index_free(index);
|
@@ -20,7 +22,12 @@ static void index_free(wp_index* index) {
|
|
20
22
|
|
21
23
|
#define RAISE_IF_NECESSARY(e) do { \
|
22
24
|
if(e != NULL) { \
|
23
|
-
VALUE exc
|
25
|
+
VALUE exc; \
|
26
|
+
switch(e->type) { \
|
27
|
+
case WP_ERROR_TYPE_SYSTEM: exc = rb_exc_new2(c_sys_error, e->msg); break; \
|
28
|
+
case WP_ERROR_TYPE_VERSION: exc = rb_exc_new2(c_version_error, e->msg); break; \
|
29
|
+
default: exc = rb_exc_new2(c_error, e->msg); break; \
|
30
|
+
} \
|
24
31
|
wp_error_free(e); \
|
25
32
|
rb_exc_raise(exc); \
|
26
33
|
} \
|
@@ -331,7 +338,7 @@ static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
|
|
331
338
|
wp_query* query;
|
332
339
|
wp_error* e = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &query);
|
333
340
|
if(e != NULL) {
|
334
|
-
VALUE exc = rb_exc_new2(
|
341
|
+
VALUE exc = rb_exc_new2(c_parse_error, e->msg);
|
335
342
|
wp_error_free(e);
|
336
343
|
rb_exc_raise(exc);
|
337
344
|
}
|
@@ -614,5 +621,7 @@ void Init_whistlepig() {
|
|
614
621
|
rb_define_attr(c_query, "query", 1, 0);
|
615
622
|
|
616
623
|
c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
|
617
|
-
|
624
|
+
c_parse_error = rb_define_class_under(m_whistlepig, "ParseError", c_error);
|
625
|
+
c_sys_error = rb_define_class_under(m_whistlepig, "SystemError", c_error);
|
626
|
+
c_version_error = rb_define_class_under(m_whistlepig, "VersionError", c_error);
|
618
627
|
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whistlepig
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.12'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-09 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Whistlepig is a minimalist realtime full-text search index. Its goal
|
15
15
|
is to be as small and minimally-featured as possible, while still remaining useful,
|