whistlepig 0.9.1 → 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +40 -12
- data/ext/whistlepig/extconf.rb +1 -1
- data/ext/whistlepig/index.c +201 -62
- data/ext/whistlepig/index.h +11 -2
- data/ext/whistlepig/lock.c +153 -0
- data/ext/whistlepig/lock.h +18 -0
- data/ext/whistlepig/mmap-obj.c +36 -20
- data/ext/whistlepig/mmap-obj.h +12 -7
- data/ext/whistlepig/search.c +7 -6
- data/ext/whistlepig/segment.c +97 -47
- data/ext/whistlepig/segment.h +19 -3
- data/ext/whistlepig/stringmap.c +61 -56
- data/ext/whistlepig/stringmap.h +7 -14
- data/ext/whistlepig/termhash.c +60 -62
- data/ext/whistlepig/termhash.h +4 -6
- data/ext/whistlepig/whistlepig.c +5 -1
- data/ext/whistlepig/whistlepig.h +1 -0
- metadata +29 -38
- data/ext/whistlepig/dump.c +0 -65
- data/ext/whistlepig/extconf.h +0 -3
- data/ext/whistlepig/test-segment.c +0 -404
- data/ext/whistlepig/test-stringmap.c +0 -82
- data/ext/whistlepig/test-stringpool.c +0 -67
- data/ext/whistlepig/test-termhash.c +0 -95
- data/ext/whistlepig/test-tokenizer.c +0 -55
- data/ext/whistlepig/test.h +0 -38
- data/ext/whistlepig/timer.h +0 -28
@@ -0,0 +1,18 @@
|
|
1
|
+
#ifndef wp_lock_h_
|
2
|
+
#define wp_lock_h_
|
3
|
+
|
4
|
+
// whistlepig locks
|
5
|
+
// (c) 2011 william morgan. see copying for license terms.
|
6
|
+
|
7
|
+
#include <pthread.h>
|
8
|
+
|
9
|
+
#include "error.h"
|
10
|
+
|
11
|
+
#define WP_LOCK_READLOCK 0
|
12
|
+
#define WP_LOCK_WRITELOCK 1
|
13
|
+
|
14
|
+
wp_error* wp_lock_setup(pthread_rwlock_t* lock) RAISES_ERROR;
|
15
|
+
wp_error* wp_lock_grab(pthread_rwlock_t* lock, int lock_type) RAISES_ERROR;
|
16
|
+
wp_error* wp_lock_release(pthread_rwlock_t* lock) RAISES_ERROR;
|
17
|
+
|
18
|
+
#endif
|
data/ext/whistlepig/mmap-obj.c
CHANGED
@@ -18,10 +18,10 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
|
|
18
18
|
lseek(o->fd, size - 1, SEEK_SET);
|
19
19
|
ssize_t num_bytes = write(o->fd, "", 1);
|
20
20
|
if(num_bytes == -1) RAISE_SYSERROR("write");
|
21
|
-
o->
|
22
|
-
if(o->
|
23
|
-
strncpy(o->
|
24
|
-
o->
|
21
|
+
o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
22
|
+
if(o->content == MAP_FAILED) RAISE_SYSERROR("mmap");
|
23
|
+
strncpy(o->content->magic, magic, MMAP_OBJ_MAGIC_SIZE);
|
24
|
+
o->content->size = o->loaded_size = initial_size;
|
25
25
|
DEBUG("created new %s object with %u bytes", magic, size);
|
26
26
|
|
27
27
|
return NO_ERROR;
|
@@ -33,44 +33,60 @@ wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) {
|
|
33
33
|
if(o->fd == -1) RAISE_SYSERROR("cannot open %s", pathname);
|
34
34
|
|
35
35
|
// load header
|
36
|
-
o->
|
37
|
-
if(o->
|
36
|
+
o->content = mmap(NULL, sizeof(mmap_obj_header), PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
37
|
+
if(o->content == MAP_FAILED) RAISE_SYSERROR("header mmap");
|
38
38
|
DEBUG("loaded header of %u bytes for %s object", sizeof(mmap_obj_header), magic);
|
39
39
|
|
40
|
-
RELAY_ERROR(validate(o->
|
40
|
+
RELAY_ERROR(validate(o->content, magic));
|
41
41
|
|
42
|
-
|
42
|
+
o->loaded_size = o->content->size;
|
43
|
+
|
44
|
+
uint32_t size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
|
43
45
|
DEBUG("full size is %u bytes (including %u-byte header)", size, sizeof(mmap_obj_header));
|
44
|
-
if(munmap(o->
|
46
|
+
if(munmap(o->content, sizeof(mmap_obj_header)) == -1) RAISE_SYSERROR("munmap");
|
45
47
|
|
46
|
-
o->
|
47
|
-
if(o->
|
48
|
+
o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
49
|
+
if(o->content == MAP_FAILED) RAISE_SYSERROR("full mmap");
|
48
50
|
DEBUG("loaded full %s object of %u bytes", magic, size);
|
49
51
|
|
50
52
|
return NO_ERROR;
|
51
53
|
}
|
52
54
|
|
55
|
+
wp_error* mmap_obj_reload(mmap_obj* o) {
|
56
|
+
if(o->loaded_size != o->content->size) {
|
57
|
+
DEBUG("need to reload %s because size of %u is now %u", o->content->magic, o->loaded_size, o->content->size);
|
58
|
+
uint32_t new_size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
|
59
|
+
if(munmap(o->content, sizeof(mmap_obj_header) + o->loaded_size) == -1) RAISE_SYSERROR("munmap");
|
60
|
+
o->content = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
61
|
+
if(o->content == MAP_FAILED) RAISE_SYSERROR("mmap of %uk", new_size / 1024);
|
62
|
+
o->loaded_size = o->content->size;
|
63
|
+
DEBUG("loaded %u bytes for %s. header is at %p", o->content->size, o->content->magic, o->content);
|
64
|
+
}
|
65
|
+
|
66
|
+
return NO_ERROR;
|
67
|
+
}
|
68
|
+
|
53
69
|
wp_error* mmap_obj_resize(mmap_obj* o, uint32_t data_size) {
|
54
|
-
DEBUG("going to expand from %u to %u bytes. current header is at %p", o->
|
70
|
+
DEBUG("going to expand from %u to %u bytes. current header is at %p", o->content->size, data_size, o->content);
|
55
71
|
|
56
|
-
if(munmap(o->
|
72
|
+
if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
|
57
73
|
uint32_t size = data_size + (uint32_t)sizeof(mmap_obj_header);
|
58
74
|
|
59
75
|
lseek(o->fd, size - 1, SEEK_SET);
|
60
76
|
ssize_t num_bytes = write(o->fd, "", 1);
|
61
77
|
if(num_bytes == -1) RAISE_SYSERROR("write");
|
62
78
|
//lseek(fd, 0, SEEK_SET); // not necessary!
|
63
|
-
o->
|
64
|
-
if(o->
|
65
|
-
o->
|
66
|
-
DEBUG("loaded %u bytes after resize. header is at %p", o->
|
79
|
+
o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
|
80
|
+
if(o->content == MAP_FAILED) RAISE_SYSERROR("mmap");
|
81
|
+
o->content->size = o->loaded_size = data_size;
|
82
|
+
DEBUG("loaded %u bytes after resize. header is at %p", o->content->size, o->content);
|
67
83
|
|
68
84
|
return NO_ERROR;
|
69
85
|
}
|
70
86
|
|
71
87
|
wp_error* mmap_obj_unload(mmap_obj* o) {
|
72
|
-
DEBUG("unloading %u bytes", sizeof(mmap_obj_header) + o->
|
73
|
-
if(munmap(o->
|
74
|
-
o->
|
88
|
+
DEBUG("unloading %u bytes", sizeof(mmap_obj_header) + o->content->size);
|
89
|
+
if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
|
90
|
+
o->content = NULL;
|
75
91
|
return NO_ERROR;
|
76
92
|
}
|
data/ext/whistlepig/mmap-obj.h
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
// wrappers around the logic of loading, unloading, and resizing
|
8
8
|
// arbitrary-sized objects using mmap.
|
9
9
|
//
|
10
|
-
// note that
|
10
|
+
// note that any of the mmap_obj_* functions may change the object pointer, so
|
11
11
|
// use MMAP_OBJ or MMAP_OBJ_PTR to dereference (again) after calling them.
|
12
12
|
|
13
13
|
#define MMAP_OBJ_MAGIC_SIZE 15
|
@@ -15,26 +15,27 @@
|
|
15
15
|
#include <stdint.h>
|
16
16
|
#include "error.h"
|
17
17
|
|
18
|
-
//
|
18
|
+
// what's actually mmap'd
|
19
19
|
typedef struct mmap_obj_header {
|
20
20
|
char magic[MMAP_OBJ_MAGIC_SIZE];
|
21
|
-
uint32_t size;
|
22
|
-
char obj[];
|
21
|
+
uint32_t size; // size of payload, not including this header
|
22
|
+
char obj[]; // the payload itself
|
23
23
|
} mmap_obj_header;
|
24
24
|
|
25
25
|
// what we pass around at runtime
|
26
26
|
typedef struct mmap_obj {
|
27
27
|
int fd;
|
28
|
-
|
28
|
+
uint32_t loaded_size; // compare against header->size
|
29
|
+
mmap_obj_header* content;
|
29
30
|
} mmap_obj;
|
30
31
|
|
31
32
|
// public API
|
32
33
|
|
33
34
|
// public: get the actual object from an mmap_obj
|
34
|
-
#define MMAP_OBJ(v, type) ((type*)&v.
|
35
|
+
#define MMAP_OBJ(v, type) ((type*)&v.content->obj)
|
35
36
|
|
36
37
|
// public: get the object from an mmap_obj*
|
37
|
-
#define MMAP_OBJ_PTR(v, type) (type*)v->
|
38
|
+
#define MMAP_OBJ_PTR(v, type) (type*)v->content->obj
|
38
39
|
|
39
40
|
// public: create an object with an initial size
|
40
41
|
wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname, uint32_t initial_size) RAISES_ERROR;
|
@@ -43,6 +44,10 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
|
|
43
44
|
// magic doesn't match)
|
44
45
|
wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) RAISES_ERROR;
|
45
46
|
|
47
|
+
// public: load an object, but only if the size has changed since the
|
48
|
+
// most recent load, reload, or resize.
|
49
|
+
wp_error* mmap_obj_reload(mmap_obj* o) RAISES_ERROR;
|
50
|
+
|
46
51
|
// public: resize an object. note that the obj pointer might change after this call.
|
47
52
|
wp_error* mmap_obj_resize(mmap_obj* o, uint32_t new_size) RAISES_ERROR;
|
48
53
|
|
data/ext/whistlepig/search.c
CHANGED
@@ -184,15 +184,16 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
|
|
184
184
|
term t;
|
185
185
|
stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
|
186
186
|
termhash* th = MMAP_OBJ(seg->termhash, termhash);
|
187
|
+
stringpool* sp = MMAP_OBJ(seg->stringpool, stringpool);
|
187
188
|
|
188
189
|
term_search_state* state = q->search_data = malloc(sizeof(term_search_state));
|
189
190
|
state->started = 0;
|
190
191
|
|
191
192
|
state->label = q->type == WP_QUERY_LABEL ? 1 : 0;
|
192
193
|
if(state->label) t.field_s = 0;
|
193
|
-
else t.field_s = stringmap_string_to_int(sh, q->field); // will be -1 if not found
|
194
|
+
else t.field_s = stringmap_string_to_int(sh, sp, q->field); // will be -1 if not found
|
194
195
|
|
195
|
-
t.word_s = stringmap_string_to_int(sh, q->word);
|
196
|
+
t.word_s = stringmap_string_to_int(sh, sp, q->word);
|
196
197
|
|
197
198
|
uint32_t offset = termhash_get_val(th, t);
|
198
199
|
if(offset == (uint32_t)-1) offset = OFFSET_NONE;
|
@@ -268,10 +269,10 @@ static wp_error* neg_init_search_state(wp_query* q, wp_segment* seg) {
|
|
268
269
|
|
269
270
|
RELAY_ERROR(wp_search_init_search_state(q->children, seg));
|
270
271
|
|
271
|
-
|
272
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
272
273
|
neg_search_state* state = q->search_data = malloc(sizeof(neg_search_state));
|
273
274
|
|
274
|
-
state->cur =
|
275
|
+
state->cur = si->num_docs + 1;
|
275
276
|
search_result result;
|
276
277
|
int done;
|
277
278
|
RELAY_ERROR(query_next_doc(q->children, seg, &result, &done));
|
@@ -294,8 +295,8 @@ static wp_error* neg_release_search_state(wp_query* q) {
|
|
294
295
|
static wp_error* every_init_search_state(wp_query* q, wp_segment* seg) {
|
295
296
|
q->search_data = malloc(sizeof(docid_t));
|
296
297
|
|
297
|
-
|
298
|
-
*(docid_t*)q->search_data =
|
298
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
299
|
+
*(docid_t*)q->search_data = si->num_docs;
|
299
300
|
|
300
301
|
return NO_ERROR;
|
301
302
|
}
|
data/ext/whistlepig/segment.c
CHANGED
@@ -6,18 +6,50 @@
|
|
6
6
|
#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
|
7
7
|
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
|
8
8
|
|
9
|
+
#define SEGMENT_VERSION 3
|
10
|
+
|
9
11
|
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
13
|
+
wp_error* wp_segment_grab_readlock(wp_segment* seg) {
|
14
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
15
|
+
RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
|
16
|
+
return NO_ERROR;
|
17
|
+
}
|
18
|
+
|
19
|
+
wp_error* wp_segment_grab_writelock(wp_segment* seg) {
|
20
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
21
|
+
RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_WRITELOCK));
|
22
|
+
return NO_ERROR;
|
23
|
+
}
|
24
|
+
|
25
|
+
wp_error* wp_segment_release_lock(wp_segment* seg) {
|
26
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
27
|
+
RELAY_ERROR(wp_lock_release(&si->lock));
|
28
|
+
return NO_ERROR;
|
29
|
+
}
|
30
|
+
|
31
|
+
static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t postings_type_and_flags) {
|
32
|
+
pr->postings_type_and_flags = postings_type_and_flags;
|
14
33
|
pr->num_postings = 0;
|
15
34
|
pr->postings_head = 1; // skip one byte, which is reserved as OFFSET_NONE
|
16
35
|
pr->postings_tail = initial_size;
|
17
36
|
}
|
18
37
|
|
19
|
-
RAISING_STATIC(
|
20
|
-
|
38
|
+
RAISING_STATIC(segment_info_init(segment_info* si, uint32_t segment_version)) {
|
39
|
+
si->segment_version = segment_version;
|
40
|
+
si->num_docs = 0;
|
41
|
+
|
42
|
+
RELAY_ERROR(wp_lock_setup(&si->lock));
|
43
|
+
return NO_ERROR;
|
44
|
+
}
|
45
|
+
|
46
|
+
RAISING_STATIC(segment_info_validate(segment_info* si, uint32_t segment_version)) {
|
47
|
+
if(si->segment_version != segment_version) RAISE_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
|
48
|
+
return NO_ERROR;
|
49
|
+
}
|
50
|
+
|
51
|
+
RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t postings_type_and_flags)) {
|
52
|
+
if(pr->postings_type_and_flags != postings_type_and_flags) RAISE_ERROR("postings region has type %u; expecting type %u", pr->postings_type_and_flags, postings_type_and_flags);
|
21
53
|
return NO_ERROR;
|
22
54
|
}
|
23
55
|
|
@@ -27,59 +59,78 @@ RAISING_STATIC(postings_region_validate(postings_region* pr, uint32_t index_type
|
|
27
59
|
wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) {
|
28
60
|
char fn[FN_SIZE];
|
29
61
|
|
62
|
+
// open the segment info
|
63
|
+
snprintf(fn, 128, "%s.si", pathname_base);
|
64
|
+
RELAY_ERROR(mmap_obj_load(&segment->seginfo, "wp/seginfo", fn));
|
65
|
+
RELAY_ERROR(segment_info_validate(MMAP_OBJ(segment->seginfo, segment_info), SEGMENT_VERSION));
|
66
|
+
|
30
67
|
// open the string pool
|
31
68
|
snprintf(fn, 128, "%s.sp", pathname_base);
|
32
|
-
RELAY_ERROR(mmap_obj_load(&segment->stringpool, "
|
69
|
+
RELAY_ERROR(mmap_obj_load(&segment->stringpool, "wp/stringpool", fn));
|
33
70
|
|
34
71
|
// open the string hash
|
35
|
-
snprintf(fn, 128, "%s.
|
36
|
-
RELAY_ERROR(mmap_obj_load(&segment->stringmap, "
|
37
|
-
stringmap_setup(MMAP_OBJ(segment->stringmap, stringmap), MMAP_OBJ(segment->stringpool, stringpool));
|
72
|
+
snprintf(fn, 128, "%s.sh", pathname_base);
|
73
|
+
RELAY_ERROR(mmap_obj_load(&segment->stringmap, "wp/stringmap", fn));
|
38
74
|
|
39
75
|
// open the term hash
|
40
76
|
snprintf(fn, 128, "%s.th", pathname_base);
|
41
|
-
RELAY_ERROR(mmap_obj_load(&segment->termhash, "
|
42
|
-
termhash_setup(MMAP_OBJ(segment->termhash, termhash));
|
77
|
+
RELAY_ERROR(mmap_obj_load(&segment->termhash, "wp/termhash", fn));
|
43
78
|
|
44
79
|
// open the postings region
|
45
80
|
snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
|
46
|
-
RELAY_ERROR(mmap_obj_load(&segment->postings, "
|
81
|
+
RELAY_ERROR(mmap_obj_load(&segment->postings, "wp/postings", fn));
|
47
82
|
RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->postings, postings_region), POSTINGS_REGION_TYPE_IMMUTABLE_VBE));
|
48
83
|
|
49
84
|
// open the labels postings region
|
50
85
|
snprintf(fn, 128, "%s.lb", pathname_base);
|
51
|
-
RELAY_ERROR(mmap_obj_load(&segment->labels, "
|
86
|
+
RELAY_ERROR(mmap_obj_load(&segment->labels, "wp/labels", fn));
|
52
87
|
RELAY_ERROR(postings_region_validate(MMAP_OBJ(segment->labels, postings_region), POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS));
|
53
88
|
|
54
89
|
return NO_ERROR;
|
55
90
|
}
|
56
91
|
|
92
|
+
wp_error* wp_segment_reload(wp_segment* segment) {
|
93
|
+
RELAY_ERROR(mmap_obj_reload(&segment->seginfo));
|
94
|
+
RELAY_ERROR(mmap_obj_reload(&segment->stringpool));
|
95
|
+
RELAY_ERROR(mmap_obj_reload(&segment->stringmap));
|
96
|
+
RELAY_ERROR(mmap_obj_reload(&segment->termhash));
|
97
|
+
RELAY_ERROR(mmap_obj_reload(&segment->postings));
|
98
|
+
RELAY_ERROR(mmap_obj_reload(&segment->labels));
|
99
|
+
|
100
|
+
return NO_ERROR;
|
101
|
+
}
|
102
|
+
|
57
103
|
wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) {
|
58
104
|
char fn[FN_SIZE];
|
59
105
|
|
106
|
+
// create the segment info
|
107
|
+
snprintf(fn, 128, "%s.si", pathname_base);
|
108
|
+
RELAY_ERROR(mmap_obj_create(&segment->seginfo, "wp/seginfo", fn, sizeof(segment_info)));
|
109
|
+
RELAY_ERROR(segment_info_init(MMAP_OBJ(segment->seginfo, segment_info), SEGMENT_VERSION));
|
110
|
+
|
60
111
|
// create the string pool
|
61
112
|
snprintf(fn, 128, "%s.sp", pathname_base);
|
62
|
-
RELAY_ERROR(mmap_obj_create(&segment->stringpool, "
|
113
|
+
RELAY_ERROR(mmap_obj_create(&segment->stringpool, "wp/stringpool", fn, stringpool_initial_size()));
|
63
114
|
stringpool_init(MMAP_OBJ(segment->stringpool, stringpool));
|
64
115
|
|
65
116
|
// create the string hash
|
66
|
-
snprintf(fn, 128, "%s.
|
67
|
-
RELAY_ERROR(mmap_obj_create(&segment->stringmap, "
|
68
|
-
stringmap_init(MMAP_OBJ(segment->stringmap, stringmap)
|
117
|
+
snprintf(fn, 128, "%s.sh", pathname_base);
|
118
|
+
RELAY_ERROR(mmap_obj_create(&segment->stringmap, "wp/stringmap", fn, stringmap_initial_size()));
|
119
|
+
stringmap_init(MMAP_OBJ(segment->stringmap, stringmap));
|
69
120
|
|
70
121
|
// create the term hash
|
71
122
|
snprintf(fn, 128, "%s.th", pathname_base);
|
72
|
-
RELAY_ERROR(mmap_obj_create(&segment->termhash, "
|
123
|
+
RELAY_ERROR(mmap_obj_create(&segment->termhash, "wp/termhash", fn, termhash_initial_size()));
|
73
124
|
termhash_init(MMAP_OBJ(segment->termhash, termhash));
|
74
125
|
|
75
126
|
// create the postings region
|
76
127
|
snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
|
77
|
-
RELAY_ERROR(mmap_obj_create(&segment->postings, "
|
128
|
+
RELAY_ERROR(mmap_obj_create(&segment->postings, "wp/postings", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
|
78
129
|
postings_region_init(MMAP_OBJ(segment->postings, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_IMMUTABLE_VBE);
|
79
130
|
|
80
131
|
// create the labels postings region
|
81
132
|
snprintf(fn, 128, "%s.lb", pathname_base);
|
82
|
-
RELAY_ERROR(mmap_obj_create(&segment->labels, "
|
133
|
+
RELAY_ERROR(mmap_obj_create(&segment->labels, "wp/labels", fn, sizeof(postings_region) + INITIAL_POSTINGS_SIZE));
|
83
134
|
postings_region_init(MMAP_OBJ(segment->labels, postings_region), INITIAL_POSTINGS_SIZE, POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS);
|
84
135
|
|
85
136
|
return NO_ERROR;
|
@@ -96,11 +147,13 @@ int wp_segment_exists(const char* pathname_base) {
|
|
96
147
|
wp_error* wp_segment_delete(const char* pathname_base) {
|
97
148
|
char fn[FN_SIZE];
|
98
149
|
|
150
|
+
snprintf(fn, 128, "%s.si", pathname_base);
|
151
|
+
unlink(fn);
|
99
152
|
snprintf(fn, 128, "%s." WP_SEGMENT_POSTING_REGION_PATH_SUFFIX, pathname_base);
|
100
153
|
unlink(fn);
|
101
154
|
snprintf(fn, 128, "%s.sp", pathname_base);
|
102
155
|
unlink(fn);
|
103
|
-
snprintf(fn, 128, "%s.
|
156
|
+
snprintf(fn, 128, "%s.sh", pathname_base);
|
104
157
|
unlink(fn);
|
105
158
|
snprintf(fn, 128, "%s.th", pathname_base);
|
106
159
|
unlink(fn);
|
@@ -132,9 +185,7 @@ RAISING_STATIC(bump_stringmap(wp_segment* s, int* success)) {
|
|
132
185
|
}
|
133
186
|
else {
|
134
187
|
RELAY_ERROR(mmap_obj_resize(&s->stringmap, next_size));
|
135
|
-
|
136
|
-
stringmap_setup(sh, MMAP_OBJ(s->stringpool, stringpool));
|
137
|
-
RELAY_ERROR(stringmap_bump_size(sh));
|
188
|
+
RELAY_ERROR(stringmap_bump_size(MMAP_OBJ(s->stringmap, stringmap), MMAP_OBJ(s->stringpool, stringpool)));
|
138
189
|
}
|
139
190
|
}
|
140
191
|
|
@@ -154,10 +205,7 @@ RAISING_STATIC(bump_stringpool(wp_segment* s, int* success)) {
|
|
154
205
|
}
|
155
206
|
else {
|
156
207
|
RELAY_ERROR(mmap_obj_resize(&s->stringpool, next_size));
|
157
|
-
|
158
|
-
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
159
|
-
sh->pool = sp; // need to update it here too
|
160
|
-
stringpool_bump_size(sp);
|
208
|
+
stringpool_bump_size(MMAP_OBJ(s->stringpool, stringpool));
|
161
209
|
}
|
162
210
|
}
|
163
211
|
|
@@ -177,9 +225,7 @@ RAISING_STATIC(bump_termhash(wp_segment* s, int* success)) {
|
|
177
225
|
}
|
178
226
|
else {
|
179
227
|
RELAY_ERROR(mmap_obj_resize(&s->termhash, next_size));
|
180
|
-
|
181
|
-
termhash_setup(th);
|
182
|
-
RELAY_ERROR(termhash_bump_size(th));
|
228
|
+
RELAY_ERROR(termhash_bump_size(MMAP_OBJ(s->termhash, termhash)));
|
183
229
|
*success = 1;
|
184
230
|
}
|
185
231
|
}
|
@@ -196,7 +242,7 @@ RAISING_STATIC(postings_region_ensure_fit(mmap_obj* mmopr, uint32_t postings_byt
|
|
196
242
|
uint32_t new_tail = pr->postings_tail;
|
197
243
|
while(new_tail <= new_head) new_tail = new_tail * 2;
|
198
244
|
|
199
|
-
if(new_tail > MAX_POSTINGS_REGION_SIZE) new_tail = MAX_POSTINGS_REGION_SIZE;
|
245
|
+
if(new_tail > MAX_POSTINGS_REGION_SIZE - sizeof(mmap_obj_header)) new_tail = MAX_POSTINGS_REGION_SIZE - sizeof(mmap_obj_header);
|
200
246
|
DEBUG("new tail will be %u, current is %u, max is %u", new_tail, pr->postings_tail, MAX_POSTINGS_REGION_SIZE);
|
201
247
|
|
202
248
|
if(new_tail <= new_head) { // can't increase enough
|
@@ -362,7 +408,7 @@ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, i
|
|
362
408
|
|
363
409
|
RELAY_ERROR(read_multibyte(&pr->postings[offset], &po->next_offset, &size));
|
364
410
|
//DEBUG("read next_offset %u -> %u (%u bytes)", po->next_offset, orig_offset - po->next_offset, size);
|
365
|
-
if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and < %u", po->next_offset, orig_offset);
|
411
|
+
if((po->next_offset == 0) || (po->next_offset > orig_offset)) RAISE_ERROR("read invalid next_offset %u (must be > 0 and < %u)", po->next_offset, orig_offset);
|
366
412
|
po->next_offset = orig_offset - po->next_offset;
|
367
413
|
offset += size;
|
368
414
|
|
@@ -408,11 +454,12 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
|
|
408
454
|
postings_region* pr = MMAP_OBJ(s->postings, postings_region);
|
409
455
|
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
410
456
|
termhash* th = MMAP_OBJ(s->termhash, termhash);
|
457
|
+
stringpool* sp = MMAP_OBJ(s->stringpool, stringpool);
|
411
458
|
|
412
459
|
// construct the term object
|
413
460
|
term t;
|
414
|
-
RELAY_ERROR(stringmap_add(sh, field, &t.field_s));
|
415
|
-
RELAY_ERROR(stringmap_add(sh, word, &t.word_s));
|
461
|
+
RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
|
462
|
+
RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));
|
416
463
|
|
417
464
|
// find the offset of the next posting
|
418
465
|
posting po;
|
@@ -480,12 +527,13 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
|
|
480
527
|
postings_region* pr = MMAP_OBJ(s->labels, postings_region);
|
481
528
|
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
482
529
|
termhash* th = MMAP_OBJ(s->termhash, termhash);
|
530
|
+
stringpool* sp = MMAP_OBJ(s->stringpool, stringpool);
|
483
531
|
|
484
532
|
// construct the term object. term objects for labels have the special
|
485
533
|
// sentinel field value 0
|
486
534
|
term t;
|
487
535
|
t.field_s = 0; // label sentinel value
|
488
|
-
RELAY_ERROR(stringmap_add(sh, label, &t.word_s)); // get word key
|
536
|
+
RELAY_ERROR(stringmap_add(sh, sp, label, &t.word_s)); // get word key
|
489
537
|
|
490
538
|
// find the previous and next label postings, between which we'll insert this
|
491
539
|
// posting
|
@@ -558,12 +606,13 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
558
606
|
postings_region* pr = MMAP_OBJ(s->labels, postings_region);
|
559
607
|
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
|
560
608
|
termhash* th = MMAP_OBJ(s->termhash, termhash);
|
609
|
+
stringpool* sp = MMAP_OBJ(s->stringpool, stringpool);
|
561
610
|
|
562
611
|
// construct the term object. term objects for labels have the special
|
563
612
|
// sentinel field value 0
|
564
613
|
term t;
|
565
614
|
t.field_s = 0; // label sentinel value
|
566
|
-
t.word_s = stringmap_string_to_int(sh, label); // will be -1 if not there
|
615
|
+
t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there
|
567
616
|
|
568
617
|
// find the posting and the previous posting in the list, if any
|
569
618
|
uint32_t prev_offset = OFFSET_NONE;
|
@@ -613,12 +662,13 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
|
|
613
662
|
}
|
614
663
|
|
615
664
|
wp_error* wp_segment_grab_docid(wp_segment* segment, docid_t* doc_id) {
|
616
|
-
|
617
|
-
*doc_id = ++
|
665
|
+
segment_info* si = MMAP_OBJ(segment->seginfo, segment_info);
|
666
|
+
*doc_id = ++si->num_docs;
|
618
667
|
return NO_ERROR;
|
619
668
|
}
|
620
669
|
|
621
670
|
wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
|
671
|
+
segment_info* si = MMAP_OBJ(segment->seginfo, segment_info);
|
622
672
|
postings_region* pr = MMAP_OBJ(segment->postings, postings_region);
|
623
673
|
stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
|
624
674
|
stringpool* sp = MMAP_OBJ(segment->stringpool, stringpool);
|
@@ -626,17 +676,17 @@ wp_error* wp_segment_dumpinfo(wp_segment* segment, FILE* stream) {
|
|
626
676
|
|
627
677
|
#define p(a, b) 100.0 * (float)a / (float)b
|
628
678
|
|
629
|
-
fprintf(stream, "segment has type %u\n", pr->
|
630
|
-
fprintf(stream, "segment has %u docs and %u postings\n",
|
631
|
-
fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.
|
632
|
-
fprintf(stream, " string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.
|
633
|
-
fprintf(stream, " stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.
|
634
|
-
fprintf(stream, " term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.
|
679
|
+
fprintf(stream, "segment has type %u\n", pr->postings_type_and_flags);
|
680
|
+
fprintf(stream, "segment has %u docs and %u postings\n", si->num_docs, pr->num_postings);
|
681
|
+
fprintf(stream, "postings region is %6ukb at %3.1f%% saturation\n", segment->postings.content->size / 1024, p(pr->postings_head, pr->postings_tail));
|
682
|
+
fprintf(stream, " string hash is %6ukb at %3.1f%% saturation\n", segment->stringmap.content->size / 1024, p(sh->n_occupied, sh->n_buckets));
|
683
|
+
fprintf(stream, " stringpool is %6ukb at %3.1f%% saturation\n", segment->stringpool.content->size / 1024, p(sp->next, sp->size));
|
684
|
+
fprintf(stream, " term hash has %6ukb at %3.1f%% saturation\n", segment->termhash.content->size / 1024, p(th->n_occupied, th->n_buckets));
|
635
685
|
|
636
686
|
return NO_ERROR;
|
637
687
|
}
|
638
688
|
|
639
689
|
uint64_t wp_segment_num_docs(wp_segment* seg) {
|
640
|
-
|
641
|
-
return
|
690
|
+
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
|
691
|
+
return si->num_docs;
|
642
692
|
}
|
data/ext/whistlepig/segment.h
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
// different, mutable format. regular text is stored in a compressed format
|
16
16
|
// that is not amenable to later changes.
|
17
17
|
|
18
|
+
#include <pthread.h>
|
19
|
+
|
18
20
|
#include "defaults.h"
|
19
21
|
#include "stringmap.h"
|
20
22
|
#include "termhash.h"
|
@@ -60,21 +62,27 @@ typedef struct label_posting {
|
|
60
62
|
// terms also; see termhash.h.)
|
61
63
|
|
62
64
|
#define MAX_LOGICAL_DOCID 2147483646 // don't tweak me
|
63
|
-
#define MAX_POSTINGS_REGION_SIZE (
|
65
|
+
#define MAX_POSTINGS_REGION_SIZE (256*1024*1024) // tweak me
|
64
66
|
|
65
67
|
#define WP_SEGMENT_POSTING_REGION_PATH_SUFFIX "pr"
|
66
68
|
|
67
69
|
// the header for the postings region
|
68
70
|
typedef struct postings_region {
|
69
|
-
uint32_t
|
70
|
-
uint32_t num_docs;
|
71
|
+
uint32_t postings_type_and_flags;
|
71
72
|
uint32_t num_postings;
|
72
73
|
uint32_t postings_head, postings_tail;
|
73
74
|
uint8_t postings[]; // where the postings go yo
|
74
75
|
} postings_region;
|
75
76
|
|
77
|
+
typedef struct segment_info {
|
78
|
+
uint32_t segment_version;
|
79
|
+
uint32_t num_docs;
|
80
|
+
pthread_rwlock_t lock;
|
81
|
+
} segment_info;
|
82
|
+
|
76
83
|
// a segment is a bunch of all these things
|
77
84
|
typedef struct wp_segment {
|
85
|
+
mmap_obj seginfo;
|
78
86
|
mmap_obj stringmap;
|
79
87
|
mmap_obj stringpool;
|
80
88
|
mmap_obj termhash;
|
@@ -93,6 +101,9 @@ wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) RAIS
|
|
93
101
|
// public: load a segment, raising an error unless it already exists
|
94
102
|
wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) RAISES_ERROR;
|
95
103
|
|
104
|
+
// public: reload a segment as necessary, in case an external writer has changed the mmap object sizes
|
105
|
+
wp_error* wp_segment_reload(wp_segment* segment) RAISES_ERROR;
|
106
|
+
|
96
107
|
// public: unload a segment
|
97
108
|
wp_error* wp_segment_unload(wp_segment* s) RAISES_ERROR;
|
98
109
|
|
@@ -102,6 +113,11 @@ uint64_t wp_segment_num_docs(wp_segment* s);
|
|
102
113
|
// public: delete a segment from disk
|
103
114
|
wp_error* wp_segment_delete(const char* pathname_base) RAISES_ERROR;
|
104
115
|
|
116
|
+
// public: lock grabbing and releasing
|
117
|
+
wp_error* wp_segment_grab_readlock(wp_segment* seg) RAISES_ERROR;
|
118
|
+
wp_error* wp_segment_grab_writelock(wp_segment* seg) RAISES_ERROR;
|
119
|
+
wp_error* wp_segment_release_lock(wp_segment* seg) RAISES_ERROR;
|
120
|
+
|
105
121
|
// private: read a posting from the postings region at a given offset
|
106
122
|
wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) RAISES_ERROR;
|
107
123
|
|