whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ #ifndef WP_SEGMENT_H_
2
+ #define WP_SEGMENT_H_
3
+
4
+ // whistlepig segments
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // a segment is the basic persistence mechanism for indexed documents.
8
+ // each segment contains a string hash and pool, a term hash, a
9
+ // postings region, and a separate labels posting region.
10
+ //
11
+ // segments store documents until MAX_DOCID or MAX_POSTINGS_REGION_SIZE
12
+ // are reached. then you have to make a new segment.
13
+ //
14
+ // labels are stored in a separate postings region because they're stored in a
15
+ // different, mutable format. regular text is stored in a compressed format
16
+ // that is not amenable to later changes.
17
+
18
+ #include "defaults.h"
19
+ #include "stringmap.h"
20
+ #include "termhash.h"
21
+ #include "query.h"
22
+ #include "error.h"
23
+ #include "search.h"
24
+ #include "mmap-obj.h"
25
+
26
+ // a posting entry. used to represent postings when actively working with them.
27
+ // the actual structure on disk/mmap memory region is delta- and variable-byte
28
+ // encoded.
29
+ typedef struct posting { // decoded, in-memory view of one posting (the stored form is delta/variable-byte encoded)
30
+ docid_t doc_id; // document this posting refers to
31
+ uint32_t num_positions; // number of entries in positions
32
+ uint32_t next_offset; // presumably the postings-region offset of the next posting in the chain (OFFSET_NONE at the end) -- confirm against the reader
33
+ pos_t* positions; // term positions within the document; NOTE(review): ownership/lifetime of this array is not visible here -- confirm
34
+ } posting;
35
+
36
+ // a label posting entry. currently this is also the actual representation of
37
+ // label postings on disk.
38
+ typedef struct label_posting { // one label posting; per the note above this is also the stored layout, so do not reorder fields
39
+ docid_t doc_id; // document carrying the label
40
+ uint32_t next_offset; // presumably the offset of the next label posting in the list (OFFSET_NONE at the end) -- confirm
41
+ } label_posting;
42
+
43
+ #define OFFSET_NONE (uint32_t)0
44
+ #define DOCID_NONE (docid_t)0
45
+
46
+ // docids:
47
+ //
48
+ // docid 0 is reserved as a sentinel value, so the doc ids returned from this
49
+ // segment will be between 1 and num_docs inclusive.
50
+ //
51
+ // docid num_docs + 1 is also a sentinel value for negative queries. also, we
52
+ // reserve one bit of each docid in the posting region as a marker for when
53
+ // there's only one occurrence in the document (this saves us a byte for this
54
+ // case). so the logical maximum number of docs per segment is 2^31 - 2 =
55
+ // 2,147,483,646.
56
+ //
57
+ // we make the segments smaller than that anyways, under the assumption that
58
+ // this will make automatic segment loading and unloading easier, once we have
59
+ // that implemented. (and there are limits to things like the number of unique
60
+ // terms also; see termhash.h.)
61
+
62
+ #define MAX_LOGICAL_DOCID 2147483646 // don't tweak me
63
+ #define MAX_POSTINGS_REGION_SIZE (512*1024*1024) // tweak me
64
+
65
+ #define WP_SEGMENT_POSTING_REGION_PATH_SUFFIX "pr"
66
+
67
+ // the header for the postings region
68
+ typedef struct postings_region { // fixed header followed by the raw postings bytes
69
+ uint32_t index_type_and_flags; // NOTE(review): encoding of type vs. flag bits is not visible in this header -- confirm
70
+ uint32_t num_docs; // number of documents recorded in this region
71
+ uint32_t num_postings; // number of postings recorded in this region
72
+ uint32_t postings_head, postings_tail; // presumably offsets into postings[] bounding the used area -- confirm
73
+ uint8_t postings[]; // where the postings go yo
74
+ } postings_region;
75
+
76
+ // a segment is a bunch of all these things
77
+ typedef struct wp_segment { // one mmap_obj per component of the segment listed in the header comment of this file
78
+ mmap_obj stringmap; // the string hash
79
+ mmap_obj stringpool; // the string pool backing the string hash
80
+ mmap_obj termhash; // the term hash
81
+ mmap_obj postings; // the postings region for regular text
82
+ mmap_obj labels; // the separate postings region for labels
83
+ } wp_segment;
84
+
85
+ // API methods
86
+
87
+ // public: does a segment exist with this base pathname?
88
+ int wp_segment_exists(const char* pathname_base);
89
+
90
+ // public: create a segment, raising an error if it already exists
91
+ wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) RAISES_ERROR;
92
+
93
+ // public: load a segment, raising an error unless it already exists
94
+ wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) RAISES_ERROR;
95
+
96
+ // public: unload a segment
97
+ wp_error* wp_segment_unload(wp_segment* s) RAISES_ERROR;
98
+
99
+ // public: number of docs in a segment
100
+ uint64_t wp_segment_num_docs(wp_segment* s);
101
+
102
+ // public: delete a segment from disk
103
+ wp_error* wp_segment_delete(const char* pathname_base) RAISES_ERROR;
104
+
105
+ // private: read a posting from the postings region at a given offset
106
+ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) RAISES_ERROR;
107
+
108
+ // private: read a label from the label postings region at a given offset
109
+ wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) RAISES_ERROR;
110
+
111
+ // public: add a posting. be sure you've called wp_segment_ensure_fit with the
112
+ // size of the postings list entry before doing this! (you can obtain the size
113
+ // by calling wp_entry_sizeof_postings_region()).
114
+ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) RAISES_ERROR;
115
+
116
+ // public: add a label to an existing document
117
+ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) RAISES_ERROR;
118
+
119
+ // public: remove a label from an existing document
120
+ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) RAISES_ERROR;
121
+
122
+ // public: get a new docid
123
+ wp_error* wp_segment_grab_docid(wp_segment* s, docid_t* docid) RAISES_ERROR;
124
+
125
+ // public: dump a lot of info about the segment to a stream
126
+ wp_error* wp_segment_dumpinfo(wp_segment* s, FILE* stream) RAISES_ERROR;
127
+
128
+ // public: ensure that adding a certain number of postings bytes and label
129
+ // postings bytes will still fit within the bounds of the segment. sets success
130
+ // to 1 if true or 0 if false. if false, you should put that stuff in a new
131
+ // segment.
132
+ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) RAISES_ERROR;
133
+
134
+ // private: return the size on disk of a position array
135
+ wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;
136
+
137
+ #endif
@@ -0,0 +1,278 @@
1
+ #include "whistlepig.h"
2
+
3
// number of entries in prime_list
static const int HASH_PRIME_SIZE = 32;

// the bucket counts a stringmap steps through as it grows; n_buckets_idx
// indexes into this table.
static const uint32_t prime_list[] = {
  0ul,          3ul,          11ul,         23ul,         53ul,
  97ul,         193ul,        389ul,        769ul,        1543ul,
  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
  3221225473ul, 4294967291ul
};

// per-bucket state lives in a packed array: two bits per bucket, sixteen
// buckets per uint32_t word. within a pair, the high bit (value 2) means
// "empty" and the low bit (value 1) means "deleted". a word filled with 0xaa
// bytes therefore marks all sixteen of its buckets as empty and not deleted.
//
// every argument and every expansion is parenthesized so that the macros are
// safe to use with expressions (e.g. isempty(flags, base + 1)) and inside
// larger expressions. note that i is still evaluated more than once, so don't
// pass anything with side effects.
#define isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

// load factor: the map asks to grow once n_occupied reaches this fraction of
// n_buckets (see upper_bound).
static const double HASH_UPPER = 0.77;
24
+
25
// hashes a NUL-terminated string: starts from the first character and, for
// each following character c, computes h = h * 31 + c (written as a shift and
// a subtract). the empty string hashes to 0.
static inline uint32_t string_hash(const char *s) {
  uint32_t h = *s;
  if (h == 0) return h;

  while (*++s) {
    h = (h << 5) - h + *s;
  }
  return h;
}
30
+
31
// returns 1 if the two NUL-terminated strings have identical contents and 0
// otherwise.
static inline int string_equals(const char* a, const char* b) {
  int differ = strcmp(a, b);
  return differ == 0;
}
35
+
36
+ // set flags, keys and vals to correct locations based on h->n_buckets
37
+ void stringmap_setup(stringmap* h, stringpool* p) {
38
+ h->pool = p;
39
+ h->flags = (uint32_t*)h->boundary;
40
+ h->keys = (uint32_t*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
41
+ }
42
+
43
+ void stringmap_init(stringmap* h, stringpool* p) {
44
+ h->n_buckets_idx = INITIAL_N_BUCKETS_IDX;
45
+ h->n_buckets = prime_list[h->n_buckets_idx];
46
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
47
+ h->size = h->n_occupied = 0;
48
+ stringmap_setup(h, p);
49
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
50
+ }
51
+
52
+ /*
53
+ static void kh_destroy_##name(kh_##name##_t *h) {
54
+ if (h) {
55
+ free(h->keys); free(h->flags);
56
+ free(h->vals);
57
+ free(h);
58
+ }
59
+ }
60
+
61
+ static void kh_clear_##name(kh_##name##_t *h) {
62
+ if (h && h->flags) {
63
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
64
+ h->size = h->n_occupied = 0;
65
+ }
66
+ }
67
+ */
68
+
69
+ uint32_t stringmap_get(stringmap *h, const char* key) {
70
+ if(h->n_buckets) {
71
+ uint32_t inc, k, i, last;
72
+ k = string_hash(key); i = k % h->n_buckets;
73
+ inc = 1 + k % (h->n_buckets - 1); last = i;
74
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !string_equals(stringpool_lookup(h->pool, h->keys[i]), key))) {
75
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
76
+ else i += inc;
77
+ if (i == last) return h->n_buckets;
78
+ }
79
+ return iseither(h->flags, i)? h->n_buckets : i;
80
+ }
81
+ else return 0;
82
+ }
83
+
84
+ wp_error* stringmap_bump_size(stringmap *h) {
85
+ DEBUG("bumping size for string hash at %p with size %u and boundary %p", h, stringmap_size(h), h->boundary);
86
+
87
+ if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("stringmap can't be this big");
88
+
89
+ h->n_buckets_idx++;
90
+ uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
91
+
92
+ // first make a backup of the oldflags
93
+ size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
94
+ uint32_t* oldflags = malloc(oldflagsize);
95
+ memcpy(oldflags, h->flags, oldflagsize);
96
+
97
+ // keep pointers to the old locations
98
+ uint32_t* oldkeys = h->keys;
99
+
100
+ // set pointers to the new locations
101
+ h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
102
+
103
+ // move the keys
104
+ memmove(h->keys, oldkeys, h->n_buckets * sizeof(uint32_t));
105
+
106
+ // clear the new flags
107
+ memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
108
+
109
+ // do the complicated stuff from khash.h
110
+ for (unsigned int j = 0; j != h->n_buckets; ++j) {
111
+ if (iseither(oldflags, j) == 0) {
112
+ uint32_t key = h->keys[j];
113
+ set_isdel_true(oldflags, j);
114
+ while (1) {
115
+ uint32_t inc, k, i;
116
+ k = string_hash(stringpool_lookup(h->pool, key));
117
+ i = k % new_n_buckets;
118
+ inc = 1 + k % (new_n_buckets - 1);
119
+ while (!isempty(h->flags, i)) {
120
+ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
121
+ else i += inc;
122
+ }
123
+ set_isempty_false(h->flags, i);
124
+ if (i < h->n_buckets && iseither(oldflags, i) == 0) {
125
+ { uint32_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
126
+ set_isdel_true(oldflags, i);
127
+ } else {
128
+ h->keys[i] = key;
129
+ break;
130
+ }
131
+ }
132
+ }
133
+ }
134
+
135
+ free(oldflags);
136
+ h->n_buckets = new_n_buckets;
137
+ h->n_occupied = h->size;
138
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
139
+
140
+ #ifdef DEBUGOUTPUT
141
+ DEBUG("after bump, string hash at %p has size %u and boundary %p", h, stringmap_size(h), h->boundary);
142
+ #endif
143
+
144
+ return NO_ERROR;
145
+ }
146
+
147
+ uint32_t stringmap_put(stringmap *h, const char* key, int *ret) { // find-or-insert key; returns its bucket index. *ret: 1 = inserted, 0 = already present, -1 = hash full, -2 = string pool full
148
+ uint32_t x; // the bucket we end up using
149
+
150
+ {
151
+ #ifdef DEBUGOUTPUT
152
+ int num_loops = 0;
153
+ #endif
154
+ uint32_t inc, k, i, site, last;
155
+ x = site = h->n_buckets; k = string_hash(key); i = k % h->n_buckets; // n_buckets doubles as the "not chosen yet" marker for x and site; i is the key's home bucket
156
+ //DEBUG("asked to hash '%s'. initial hash is %u => %u and n_occupied is %u", key, k, i, h->n_occupied);
157
+ if (isempty(h->flags, i)) x = i; // home bucket was never used: take it
158
+ else {
159
+ inc = 1 + k % (h->n_buckets - 1); last = i; // probe step derived from the hash; last remembers where probing started
160
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !string_equals(stringpool_lookup(h->pool, h->keys[i]), key))) { // keep going past deleted buckets and live buckets holding other keys
161
+ #ifdef DEBUGOUTPUT
162
+ num_loops++;
163
+ #endif
164
+ if (isdel(h->flags, i)) site = i; // remember a deleted bucket as a candidate insertion site
165
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; // advance by inc, wrapping around the table
166
+ else i += inc;
167
+ if (i == last) { x = site; break; } // came full circle: fall back to a deleted bucket if one was seen
168
+ }
169
+ if ((x == h->n_buckets) && (i == last)) { // out of space
170
+ if(!string_equals(stringpool_lookup(h->pool, h->keys[i]), key)) {
171
+ DEBUG("out of space!");
172
+ *ret = -1;
173
+ return x; // x is h->n_buckets here
174
+ }
175
+ }
176
+ if (x == h->n_buckets) { // didn't find it on the first try
177
+ if (isempty(h->flags, i) && site != h->n_buckets) x = site; // key is absent: prefer reusing a deleted bucket passed on the way
178
+ else x = i; // otherwise use where probing stopped (an empty bucket, or the bucket already holding key)
179
+ }
180
+ }
181
+ DEBUG("looped %u times to put", num_loops); // NOTE(review): num_loops only exists under DEBUGOUTPUT; presumably DEBUG() discards its arguments otherwise -- confirm
182
+ //DEBUG("x is %u, site is %u, n_buckets is %u", x, site, h->n_buckets);
183
+ }
184
+
185
+ //DEBUG("for pos %u, isempty? %d and isdel %d", x, isempty(h->flags, x), isdel(h->flags, x));
186
+
187
+ uint32_t idx;
188
+ if(isempty(h->flags, x) || isdel(h->flags, x)) { // bucket is free, so key is new: store the string and claim the bucket
189
+ idx = stringpool_add(h->pool, key);
190
+ if(idx == (uint32_t)-1) { // the pool had no room; the hash itself has not been modified
191
+ *ret = -2;
192
+ return x;
193
+ }
194
+ if (isempty(h->flags, x)) ++h->n_occupied; // reusing a deleted bucket does not add to n_occupied
195
+ h->keys[x] = idx; // the bucket stores the pool id of the string
196
+ set_isboth_false(h->flags, x); // mark the bucket live
197
+ ++h->size;
198
+ *ret = 1;
199
+ }
200
+ else *ret = 0; // key was already present at x
201
+
202
+ return x;
203
+ }
204
+
205
+ void stringmap_del(stringmap *h, uint32_t x) {
206
+ if (x != h->n_buckets && !iseither(h->flags, x)) {
207
+ set_isdel_true(h->flags, x);
208
+ --h->size;
209
+ }
210
+ }
211
+
212
+ /*
213
+ uint32_t stringmap_get_val(stringmap* h, string t) {
214
+ uint32_t idx = termhash_get(h, t);
215
+ if(idx == h->n_buckets) return (uint32_t)-1;
216
+ return h->vals[idx];
217
+ }
218
+
219
+ wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
220
+ int status;
221
+ uint32_t loc = termhash_put(h, t, &status);
222
+ DEBUG("put(%u,%u) has status %d and ret %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
223
+ if(status == -1) RAISE_ERROR("out of space in hash");
224
+ h->vals[loc] = val;
225
+ return NO_ERROR;
226
+ }
227
+ */
228
+
229
+ int stringmap_needs_bump(stringmap* h) {
230
+ return (h->n_occupied >= h->upper_bound);
231
+ }
232
+
233
+ // memory layout: stringmap, then:
234
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
235
+ // n_buckets uint32_t for the keys
236
+ static uint32_t size(uint32_t n_buckets) {
237
+ uint32_t size = sizeof(stringmap) +
238
+ (((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
239
+ (n_buckets * sizeof(uint32_t));
240
+ return size;
241
+ }
242
+
243
+ // returns the total size in bytes
244
+ uint32_t stringmap_size(stringmap* h) {
245
+ return size(h->n_buckets);
246
+ }
247
+
248
+ uint32_t stringmap_initial_size() {
249
+ return size(prime_list[INITIAL_N_BUCKETS_IDX]);
250
+ }
251
+
252
+ // the size if we embiggen by one notch
253
+ uint32_t stringmap_next_size(stringmap* h) {
254
+ int next_idx = (h->n_buckets_idx < (HASH_PRIME_SIZE - 1)) ? h->n_buckets_idx + 1 : h->n_buckets_idx;
255
+ return size(prime_list[next_idx]);
256
+ }
257
+
258
+ const char* stringmap_int_to_string(stringmap* h, uint32_t i) {
259
+ return stringpool_lookup(h->pool, i);
260
+ }
261
+
262
+ // returns -1 if not found
263
+ uint32_t stringmap_string_to_int(stringmap* h, const char* s) {
264
+ uint32_t idx = stringmap_get(h, s);
265
+ if(idx == h->n_buckets) return (uint32_t)-1; // not there
266
+ return h->keys[idx];
267
+ }
268
+
269
+ wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) {
270
+ int status;
271
+ uint32_t idx = stringmap_put(h, s, &status);
272
+ if(status == -1) RAISE_ERROR("out of space in hash put");
273
+ if(status == -2) RAISE_ERROR("out of space in pool put");
274
+
275
+ *id = h->keys[idx];
276
+
277
+ return NO_ERROR;
278
+ }
@@ -0,0 +1,82 @@
1
+ #ifndef WP_STRINGHASH_H_
2
+ #define WP_STRINGHASH_H_
3
+
4
+ // whistlepig string map
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // based on a heavily modified khash.h
8
+ //
9
+ // a stringmap is a bidirectional map from strings to int values. like termhash
10
+ // and stringpool, it uses a slightly funny API that never allocates memory,
11
+ // but instead operates on pointers to preallocated blocks of memory.
12
+ //
13
+ // uses a stringpool internally to do the int->string mapping, so
14
+ // you shouldn't have to interact with the stringpool directly; you can just
15
+ // use this object.
16
+ //
17
+ // like termhash and pool, has a slightly funny API that is designed to work on
18
+ // a pre-allocated chunk of memory rather than allocate any of its own.
19
+
20
+ #include <stdint.h>
21
+ #include "stringpool.h"
22
+ #include "error.h"
23
+
24
+ /* list of primes from khash.h:
25
+ 0ul, 3ul, 11ul, 23ul, 53ul,
26
+ 97ul, 193ul, 389ul, 769ul, 1543ul,
27
+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
28
+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
29
+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
30
+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
31
+ 3221225473ul, 4294967291ul
32
+ */
33
+
34
+ #define INITIAL_N_BUCKETS_IDX 1
35
+
36
+ typedef struct stringmap { // header of the string hash; the flag and key arrays follow it in the same memory block
37
+ uint8_t n_buckets_idx; // index into the list of primes that gives n_buckets
38
+ uint32_t n_buckets, size, n_occupied, upper_bound; // bucket count; live keys; buckets ever claimed since the last rehash (deleted ones included); n_occupied level at which a bump is needed
39
+ uint32_t *flags; // two state bits per bucket (empty / deleted); set by stringmap_setup to point at boundary
40
+ uint32_t *keys; // per-bucket stringpool id of the stored string; set by stringmap_setup to point just past the flags
41
+ stringpool* pool; // pool holding the actual string bytes; supplied by the caller of stringmap_init/stringmap_setup
42
+ uint8_t boundary[]; // start of the trailing memory described below
43
+ // in memory at this point
44
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
45
+ // n_buckets uint32_t's for the keys
46
+ } stringmap;
47
+
48
+ // API methods
49
+
50
+ // public: write a new stringmap to memory
51
+ void stringmap_init(stringmap* h, stringpool* p);
52
+
53
+ // public: set up an existing stringmap in memory
54
+ void stringmap_setup(stringmap* h, stringpool* p);
55
+
56
+ // public: add a string. sets id to its id. dupes are fine; will just set the
57
+ // id correctly.
58
+ wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) RAISES_ERROR;
59
+
60
+ // public: get the int value given a string. returns (uint32_t)-1 if not found.
61
+ uint32_t stringmap_string_to_int(stringmap* h, const char* s);
62
+
63
+ // public: get the string value given an int. returns NULL if the int is 0 or
64
+ // past the end of the pool; any other invalid int yields garbage data.
65
+ const char* stringmap_int_to_string(stringmap* h, uint32_t i);
66
+
67
+ // public: returns the byte size of the stringmap
68
+ uint32_t stringmap_size(stringmap* h);
69
+
70
+ // public: returns the initial byte size for an empty stringmap
71
+ uint32_t stringmap_initial_size();
72
+
73
+ // public: returns the byte size for the next larger version of a stringmap
74
+ uint32_t stringmap_next_size(stringmap* h);
75
+
76
+ // public: does the stringmap need a size increase?
77
+ int stringmap_needs_bump(stringmap* h);
78
+
79
+ // public: increases the size of the stringmap
80
+ wp_error* stringmap_bump_size(stringmap *h) RAISES_ERROR;
81
+
82
+ #endif
@@ -0,0 +1,44 @@
1
+ #include "whistlepig.h"
2
+
3
+ void stringpool_init(stringpool* p) {
4
+ p->next = 1;
5
+ p->size = INITIAL_POOL_SIZE;
6
+ }
7
+
8
+ uint32_t stringpool_size(stringpool* p) {
9
+ return sizeof(stringpool) + (p->size * sizeof(char));
10
+ }
11
+
12
+ uint32_t stringpool_add(stringpool* p, const char* s) {
13
+ int len = strlen(s) + 1;
14
+ if((p->next + len) >= p->size) {
15
+ DEBUG("out of space in string pool for %s (len %d, next %d, size %d)", s, len, p->next, p->size);
16
+ return (uint32_t)-1;
17
+ }
18
+ uint32_t ret = p->next;
19
+ p->next += len;
20
+ DEBUG("writing %d bytes to %p -- %p", len, &(p->pool[ret]), &(p->pool[ret]) + len);
21
+ strncpy(&(p->pool[ret]), s, len);
22
+ return ret;
23
+ }
24
+
25
+ int stringpool_needs_bump(stringpool* p) {
26
+ return (p->next >= (int)((float)p->size * 0.9) ? 1 : 0);
27
+ }
28
+
29
+ uint32_t stringpool_next_size(stringpool* p) {
30
+ return sizeof(stringpool) + (2 * (p->size == 0 ? 1 : p->size) * sizeof(char));
31
+ }
32
+
33
+ uint32_t stringpool_initial_size() {
34
+ return sizeof(stringpool) + INITIAL_POOL_SIZE;
35
+ }
36
+
37
+ void stringpool_bump_size(stringpool* p) {
38
+ p->size = stringpool_next_size(p);
39
+ }
40
+
41
+ char* stringpool_lookup(stringpool* p, uint32_t id) {
42
+ if((id == 0) || (id >= p->next)) return NULL;
43
+ return &p->pool[id];
44
+ }
@@ -0,0 +1,58 @@
1
+ #ifndef WP_STRINGPOOL_H_
2
+ #define WP_STRINGPOOL_H_
3
+
4
+ // whistlepig string pool
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // a string pool. adds strings to a big blob and returns an int which can be
8
+ // used to look them up later. in other words, an int->string mapping, where
9
+ // you provide the string and we'll give you an int.
10
+ //
11
+ // does no duplicate detection, if you add the same string twice, you will
12
+ // get two different ints and you will have wasted memory.
13
+ //
14
+ // this is used by stringmap to maintain a bidirectional string<->int mapping
15
+ // and is not really used directly.
16
+ //
17
+ // int 0 is a special case for the null string. passing in invalid ints (i.e.
18
+ // ints i didn't return) will result in garbage data.
19
+ //
20
+ // like termhash and stringmap, has a slightly funny API that is designed to
21
+ // work on a pre-allocated chunk of memory rather than allocate any of its own.
22
+
23
+ #include <stdint.h>
24
+
25
+ #define INITIAL_POOL_SIZE 2048
26
+
27
+ typedef struct stringpool { // header of the string pool; the string bytes follow it in the same memory block
28
+ uint32_t size, next; // size: capacity of pool[] in chars (header not included); next: offset in pool[] where the next string will be written, which is also the id it will get
29
+ char pool[]; // NUL-terminated strings stored back to back; offset 0 is never handed out as an id
30
+ } stringpool;
31
+
32
+ // API methods
33
+
34
+ // public: create a stringpool
35
+ void stringpool_init(stringpool* p);
36
+
37
+ // public: add a string, returning an int
38
+ uint32_t stringpool_add(stringpool* p, const char* s);
39
+
40
+ // public: does this stringpool need to be increased?
41
+ int stringpool_needs_bump(stringpool* p);
42
+
43
+ // public: increase the size of the stringpool
44
+ void stringpool_bump_size(stringpool* p);
45
+
46
+ // public: given an id, return the string
47
+ char* stringpool_lookup(stringpool* p, uint32_t id);
48
+
49
+ // public: returns the byte size of the pool
50
+ uint32_t stringpool_size(stringpool* p);
51
+
52
+ // public: returns the initial byte size for an empty pool
53
+ uint32_t stringpool_initial_size();
54
+
55
+ // public: returns the byte size for the next larger version of a pool
56
+ uint32_t stringpool_next_size(stringpool* p);
57
+
58
+ #endif