whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
#ifndef WP_SEGMENT_H_
|
2
|
+
#define WP_SEGMENT_H_
|
3
|
+
|
4
|
+
// whistlepig segments
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// a segment is the basic persistence mechanism for indexed documents.
|
8
|
+
// each segment contains a string hash and pool, a term hash, a
|
9
|
+
// postings region, and a separate labels posting region.
|
10
|
+
//
|
11
|
+
// segments store documents until MAX_DOCID or MAX_POSTINGS_REGION_SIZE
|
12
|
+
// are reached. then you have to make a new segment.
|
13
|
+
//
|
14
|
+
// labels are stored in a separate postings region because they're stored in a
|
15
|
+
// different, mutable format. regular text is stored in a compressed format
|
16
|
+
// that is not amenable to later changes.
|
17
|
+
|
18
|
+
#include "defaults.h"
|
19
|
+
#include "stringmap.h"
|
20
|
+
#include "termhash.h"
|
21
|
+
#include "query.h"
|
22
|
+
#include "error.h"
|
23
|
+
#include "search.h"
|
24
|
+
#include "mmap-obj.h"
|
25
|
+
|
26
|
+
// a posting entry. used to represent postings when actively working with them.
|
27
|
+
// the actual structure on disk/mmap memory region is delta- and variable-byte
|
28
|
+
// encoded.
|
29
|
+
typedef struct posting {
|
30
|
+
docid_t doc_id;
|
31
|
+
uint32_t num_positions;
|
32
|
+
uint32_t next_offset;
|
33
|
+
pos_t* positions;
|
34
|
+
} posting;
|
35
|
+
|
36
|
+
// a label posting entry. currently this is also the actual representation of
|
37
|
+
// label postings on disk.
|
38
|
+
typedef struct label_posting {
|
39
|
+
docid_t doc_id;
|
40
|
+
uint32_t next_offset;
|
41
|
+
} label_posting;
|
42
|
+
|
43
|
+
// sentinel values: "no offset" and "no document". the expansions are fully
// parenthesized so that they behave as a single operand wherever they are
// used (e.g. `sizeof OFFSET_NONE`).
#define OFFSET_NONE ((uint32_t)0)
#define DOCID_NONE ((docid_t)0)
|
45
|
+
|
46
|
+
// docids:
|
47
|
+
//
|
48
|
+
// docid 0 is reserved as a sentinel value, so the doc ids returned from this
|
49
|
+
// segment will be between 1 and num_docs inclusive.
|
50
|
+
//
|
51
|
+
// docid num_docs + 1 is also a sentinel value for negative queries. also, we
|
52
|
+
// reserve one bit of each docid in the posting region as a marker for when
|
53
|
+
// there's only one occurrence in the document (this saves us a byte for this
|
54
|
+
// case). so the logical maximum number of docs per segment is 2^31 - 2 =
|
55
|
+
// 2,147,483,646.
|
56
|
+
//
|
57
|
+
// we make the segments smaller than that anyways, under the assumption that
|
58
|
+
// this will make automatic segment loading and unloading easier, once we have
|
59
|
+
// that implemented. (and there are limits to things like the number of unique
|
60
|
+
// terms also; see termhash.h.)
|
61
|
+
|
62
|
+
// largest docid a segment could logically hold (2^31 - 2). not tunable.
#define MAX_LOGICAL_DOCID 2147483646
// upper bound, in bytes, on a postings region (512mb). tunable.
#define MAX_POSTINGS_REGION_SIZE (512*1024*1024)

// suffix used when building the postings region path
#define WP_SEGMENT_POSTING_REGION_PATH_SUFFIX "pr"
|
66
|
+
|
67
|
+
// the header for the postings region
|
68
|
+
typedef struct postings_region {
  uint32_t index_type_and_flags;
  uint32_t num_docs;
  uint32_t num_postings;
  uint32_t postings_head;
  uint32_t postings_tail;
  uint8_t postings[];  // the posting data itself follows the header
} postings_region;
|
75
|
+
|
76
|
+
// a segment is a bunch of all these things
|
77
|
+
typedef struct wp_segment {
|
78
|
+
mmap_obj stringmap;
|
79
|
+
mmap_obj stringpool;
|
80
|
+
mmap_obj termhash;
|
81
|
+
mmap_obj postings;
|
82
|
+
mmap_obj labels;
|
83
|
+
} wp_segment;
|
84
|
+
|
85
|
+
// API methods
|
86
|
+
|
87
|
+
// public: does a segment exist with this base pathname?
|
88
|
+
int wp_segment_exists(const char* pathname_base);
|
89
|
+
|
90
|
+
// public: create a segment, raising an error if it already exists
|
91
|
+
wp_error* wp_segment_create(wp_segment* segment, const char* pathname_base) RAISES_ERROR;
|
92
|
+
|
93
|
+
// public: load a segment, raising an error unless it already exists
|
94
|
+
wp_error* wp_segment_load(wp_segment* segment, const char* pathname_base) RAISES_ERROR;
|
95
|
+
|
96
|
+
// public: unload a segment
|
97
|
+
wp_error* wp_segment_unload(wp_segment* s) RAISES_ERROR;
|
98
|
+
|
99
|
+
// public: number of docs in a segment
|
100
|
+
uint64_t wp_segment_num_docs(wp_segment* s);
|
101
|
+
|
102
|
+
// public: delete a segment from disk
|
103
|
+
wp_error* wp_segment_delete(const char* pathname_base) RAISES_ERROR;
|
104
|
+
|
105
|
+
// private: read a posting from the postings region at a given offset
|
106
|
+
wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, int include_positions) RAISES_ERROR;
|
107
|
+
|
108
|
+
// private: read a label from the label postings region at a given offset
|
109
|
+
wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) RAISES_ERROR;
|
110
|
+
|
111
|
+
// public: add a posting. be sure you've called wp_segment_ensure_fit with the
|
112
|
+
// size of the postings list entry before doing this! (you can obtain the size
|
113
|
+
// by calling wp_entry_sizeof_postings_region()).
|
114
|
+
wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) RAISES_ERROR;
|
115
|
+
|
116
|
+
// public: add a label to an existing document
|
117
|
+
wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) RAISES_ERROR;
|
118
|
+
|
119
|
+
// public: remove a label from an existing document
|
120
|
+
wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_id) RAISES_ERROR;
|
121
|
+
|
122
|
+
// public: get a new docid
|
123
|
+
wp_error* wp_segment_grab_docid(wp_segment* s, docid_t* docid) RAISES_ERROR;
|
124
|
+
|
125
|
+
// public: dump a lot of info about the segment to a stream
|
126
|
+
wp_error* wp_segment_dumpinfo(wp_segment* s, FILE* stream) RAISES_ERROR;
|
127
|
+
|
128
|
+
// public: ensure that adding a certain number of postings bytes and label
|
129
|
+
// postings bytes will still fit within the bounds of the segment. sets success
|
130
|
+
// to 1 if true or 0 if false. if false, you should put that stuff in a new
|
131
|
+
// segment.
|
132
|
+
wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32_t label_bytes, int* success) RAISES_ERROR;
|
133
|
+
|
134
|
+
// private: return the size on disk of a position array
|
135
|
+
wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;
|
136
|
+
|
137
|
+
#endif
|
@@ -0,0 +1,278 @@
|
|
1
|
+
#include "whistlepig.h"
|
2
|
+
|
3
|
+
// number of entries in prime_list
static const int HASH_PRIME_SIZE = 32;

// the bucket counts a stringmap can have, smallest first (from khash.h).
// a stringmap's n_buckets_idx is an index into this table.
static const uint32_t prime_list[] = {
  0ul,          3ul,          11ul,         23ul,
  53ul,         97ul,         193ul,        389ul,
  769ul,        1543ul,       3079ul,       6151ul,
  12289ul,      24593ul,      49157ul,      98317ul,
  196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,
  50331653ul,   100663319ul,  201326611ul,  402653189ul,
  805306457ul,  1610612741ul, 3221225473ul, 4294967291ul
};
|
14
|
+
|
15
|
+
// per-bucket flag accessors. every bucket owns two bits in the flag array
// (so each uint32_t word covers 16 buckets): bit 1 of the pair means "empty"
// and bit 0 means "deleted". all macro arguments are parenthesized so that
// arbitrary expressions can be passed as the flag array or the bucket index.
#define isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

// fraction of the buckets that may be occupied before the map wants a bump
static const double HASH_UPPER = 0.77;
|
24
|
+
|
25
|
+
// hash a NUL-terminated string: start from the first character and fold in
// each following one as hash * 31 + c. the empty string hashes to 0.
static inline uint32_t string_hash(const char *s) {
  uint32_t hash = *s;
  if (hash == 0) return hash;

  while (*++s) hash = (hash << 5) - hash + *s;
  return hash;
}
|
30
|
+
|
31
|
+
// returns 1 if the two NUL-terminated strings have the same contents, else 0
static inline int string_equals(const char* a, const char* b) {
  return !strcmp(a, b);
}
|
35
|
+
|
36
|
+
// set flags, keys and vals to correct locations based on h->n_buckets
|
37
|
+
void stringmap_setup(stringmap* h, stringpool* p) {
  // the flag words start right at the boundary; the keys come directly after
  // the ((n_buckets >> 4) + 1) flag words.
  uint32_t* base = (uint32_t*)h->boundary;
  uint32_t num_flag_words = (h->n_buckets >> 4) + 1;

  h->pool = p;
  h->flags = base;
  h->keys = base + num_flag_words;
}
|
42
|
+
|
43
|
+
// write a fresh, empty stringmap into the memory at h, backed by pool p.
void stringmap_init(stringmap* h, stringpool* p) {
  // start at the initial bucket count and derive the occupancy limit from it
  h->n_buckets_idx = INITIAL_N_BUCKETS_IDX;
  h->n_buckets = prime_list[h->n_buckets_idx];
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
  h->n_occupied = 0;
  h->size = 0;

  // point flags/keys into the trailing memory (needs n_buckets to be set)
  stringmap_setup(h, p);

  // 0xaa sets the "empty" bit and clears the "deleted" bit of every bucket
  size_t flag_bytes = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
  memset(h->flags, 0xaa, flag_bytes);
}
|
51
|
+
|
52
|
+
/*
|
53
|
+
static void kh_destroy_##name(kh_##name##_t *h) {
|
54
|
+
if (h) {
|
55
|
+
free(h->keys); free(h->flags);
|
56
|
+
free(h->vals);
|
57
|
+
free(h);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
static void kh_clear_##name(kh_##name##_t *h) {
|
62
|
+
if (h && h->flags) {
|
63
|
+
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
64
|
+
h->size = h->n_occupied = 0;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
*/
|
68
|
+
|
69
|
+
// find the bucket holding key. returns the bucket index, or h->n_buckets if
// the key is not in the map. (a map with zero buckets yields 0, which is
// again equal to h->n_buckets.)
uint32_t stringmap_get(stringmap *h, const char* key) {
  if (!h->n_buckets) return 0;

  const uint32_t hash = string_hash(key);
  const uint32_t start = hash % h->n_buckets;
  const uint32_t step = 1 + hash % (h->n_buckets - 1);  // double hashing
  uint32_t slot = start;

  for (;;) {
    // an empty bucket ends the probe sequence
    if (isempty(h->flags, slot)) break;
    // so does a live bucket whose string matches
    if (!isdel(h->flags, slot) && string_equals(stringpool_lookup(h->pool, h->keys[slot]), key)) break;

    // advance, wrapping around the end of the table
    slot = (slot + step >= h->n_buckets) ? slot + step - h->n_buckets : slot + step;
    if (slot == start) return h->n_buckets;  // went all the way around
  }

  return iseither(h->flags, slot) ? h->n_buckets : slot;
}
|
83
|
+
|
84
|
+
// grow the stringmap in place to the next bucket count in prime_list and
// rehash every live key. the memory behind h must already be large enough to
// hold the bigger table (see stringmap_next_size).
//
// raises an error if the map is already at the largest bucket count or if the
// temporary copy of the old flags cannot be allocated; in both cases the map
// is left untouched.
wp_error* stringmap_bump_size(stringmap *h) {
  DEBUG("bumping size for string hash at %p with size %u and boundary %p", h, stringmap_size(h), h->boundary);

  if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("stringmap can't be this big");

  uint32_t new_n_buckets = prime_list[h->n_buckets_idx + 1];

  // first make a backup of the oldflags. the new, larger flag array is about
  // to overwrite them.
  size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
  uint32_t* oldflags = malloc(oldflagsize);
  if(oldflags == NULL) RAISE_ERROR("out of memory while bumping stringmap");
  memcpy(oldflags, h->flags, oldflagsize);

  // nothing can fail from here on, so it's now safe to commit to the new size
  h->n_buckets_idx++;

  // keep pointers to the old locations
  uint32_t* oldkeys = h->keys;

  // set pointers to the new locations
  h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));

  // move the keys (the regions may overlap, hence memmove)
  memmove(h->keys, oldkeys, h->n_buckets * sizeof(uint32_t));

  // clear the new flags
  memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));

  // do the complicated stuff from khash.h: walk the old buckets, and for each
  // live key find its place in the new table. if that place still holds a
  // not-yet-moved old key, swap it out and keep going with the evicted key.
  for (unsigned int j = 0; j != h->n_buckets; ++j) {
    if (iseither(oldflags, j) == 0) {
      uint32_t key = h->keys[j];
      set_isdel_true(oldflags, j); // mark as moved
      while (1) {
        uint32_t inc, k, i;
        k = string_hash(stringpool_lookup(h->pool, key));
        i = k % new_n_buckets;
        inc = 1 + k % (new_n_buckets - 1);
        while (!isempty(h->flags, i)) {
          if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
          else i += inc;
        }
        set_isempty_false(h->flags, i);
        if (i < h->n_buckets && iseither(oldflags, i) == 0) {
          { uint32_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
          set_isdel_true(oldflags, i);
        } else {
          h->keys[i] = key;
          break;
        }
      }
    }
  }

  free(oldflags);
  h->n_buckets = new_n_buckets;
  h->n_occupied = h->size; // deleted entries were dropped by the rehash
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);

#ifdef DEBUGOUTPUT
  DEBUG("after bump, string hash at %p has size %u and boundary %p", h, stringmap_size(h), h->boundary);
#endif

  return NO_ERROR;
}
|
146
|
+
|
147
|
+
// insert key into the map if it isn't there yet. returns the bucket index for
// the key and reports what happened through *ret:
//    1: the key was newly added
//    0: the key was already present
//   -1: the hash table is full (the return value is h->n_buckets)
//   -2: the string pool has no room for the key
uint32_t stringmap_put(stringmap *h, const char* key, int *ret) {
#ifdef DEBUGOUTPUT
  int num_loops = 0;
#endif
  const uint32_t none = h->n_buckets;  // "no bucket" marker
  const uint32_t hash = string_hash(key);
  const uint32_t start = hash % h->n_buckets;
  uint32_t slot = start;
  uint32_t tomb = none;    // most recent deleted bucket seen while probing
  uint32_t target = none;  // bucket we end up using

  if (isempty(h->flags, slot)) {
    target = slot;
  }
  else {
    const uint32_t step = 1 + hash % (h->n_buckets - 1);

    // probe until we hit an empty bucket or a live bucket holding key
    while (!isempty(h->flags, slot) && (isdel(h->flags, slot) || !string_equals(stringpool_lookup(h->pool, h->keys[slot]), key))) {
#ifdef DEBUGOUTPUT
      num_loops++;
#endif
      if (isdel(h->flags, slot)) tomb = slot;
      slot = (slot + step >= h->n_buckets) ? slot + step - h->n_buckets : slot + step;
      if (slot == start) { target = tomb; break; }  // full circle
    }

    // back at the start with no reusable bucket: unless the start bucket
    // itself holds key, there's nowhere to put it.
    if ((target == none) && (slot == start) && !string_equals(stringpool_lookup(h->pool, h->keys[slot]), key)) {
      DEBUG("out of space!");
      *ret = -1;
      return target;
    }

    // prefer recycling a deleted bucket over claiming a fresh empty one
    if (target == none) target = (isempty(h->flags, slot) && tomb != none) ? tomb : slot;
  }
  DEBUG("looped %u times to put", num_loops);

  // a live bucket means the key is already present
  if (!isempty(h->flags, target) && !isdel(h->flags, target)) {
    *ret = 0;
    return target;
  }

  uint32_t idx = stringpool_add(h->pool, key);
  if (idx == (uint32_t)-1) {
    *ret = -2;
    return target;
  }

  if (isempty(h->flags, target)) ++h->n_occupied;
  h->keys[target] = idx;
  set_isboth_false(h->flags, target);
  ++h->size;
  *ret = 1;

  return target;
}
|
204
|
+
|
205
|
+
// mark bucket x as deleted. does nothing if x is h->n_buckets (the "not
// found" value) or if the bucket is already empty or deleted.
void stringmap_del(stringmap *h, uint32_t x) {
  if (x == h->n_buckets) return;
  if (iseither(h->flags, x)) return;

  set_isdel_true(h->flags, x);
  --h->size;
}
|
211
|
+
|
212
|
+
/*
|
213
|
+
uint32_t stringmap_get_val(stringmap* h, string t) {
|
214
|
+
uint32_t idx = termhash_get(h, t);
|
215
|
+
if(idx == h->n_buckets) return (uint32_t)-1;
|
216
|
+
return h->vals[idx];
|
217
|
+
}
|
218
|
+
|
219
|
+
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
|
220
|
+
int status;
|
221
|
+
uint32_t loc = termhash_put(h, t, &status);
|
222
|
+
DEBUG("put(%u,%u) has status %d and ret %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
|
223
|
+
if(status == -1) RAISE_ERROR("out of space in hash");
|
224
|
+
h->vals[loc] = val;
|
225
|
+
return NO_ERROR;
|
226
|
+
}
|
227
|
+
*/
|
228
|
+
|
229
|
+
// true (nonzero) once the number of occupied buckets has reached upper_bound
int stringmap_needs_bump(stringmap* h) {
  return h->upper_bound <= h->n_occupied;
}
|
232
|
+
|
233
|
+
// memory layout: stringmap, then:
|
234
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
235
|
+
// n_buckets uint32_t for the keys
|
236
|
+
static uint32_t size(uint32_t n_buckets) {
|
237
|
+
uint32_t size = sizeof(stringmap) +
|
238
|
+
(((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
|
239
|
+
(n_buckets * sizeof(uint32_t));
|
240
|
+
return size;
|
241
|
+
}
|
242
|
+
|
243
|
+
// returns the total size in bytes
|
244
|
+
uint32_t stringmap_size(stringmap* h) {
  // header plus flags plus keys at the current bucket count
  uint32_t total_bytes = size(h->n_buckets);
  return total_bytes;
}
|
247
|
+
|
248
|
+
uint32_t stringmap_initial_size() {
|
249
|
+
return size(prime_list[INITIAL_N_BUCKETS_IDX]);
|
250
|
+
}
|
251
|
+
|
252
|
+
// the size if we embiggen by one notch
|
253
|
+
uint32_t stringmap_next_size(stringmap* h) {
  int next_idx = h->n_buckets_idx;
  // stay put if we're already at the last entry of prime_list
  if (h->n_buckets_idx < (HASH_PRIME_SIZE - 1)) next_idx = h->n_buckets_idx + 1;

  return size(prime_list[next_idx]);
}
|
257
|
+
|
258
|
+
// map an id back to its string by looking it up in the underlying stringpool
const char* stringmap_int_to_string(stringmap* h, uint32_t i) {
  const char* s = stringpool_lookup(h->pool, i);
  return s;
}
|
261
|
+
|
262
|
+
// returns -1 if not found
|
263
|
+
uint32_t stringmap_string_to_int(stringmap* h, const char* s) {
  uint32_t bucket = stringmap_get(h, s);

  // stringmap_get signals "not there" by returning n_buckets
  return (bucket == h->n_buckets) ? (uint32_t)-1 : h->keys[bucket];
}
|
268
|
+
|
269
|
+
// add s to the map (a no-op if it's already there) and store its id in *id.
// raises an error if either the hash table or the string pool is out of room.
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) {
  int status;
  uint32_t bucket = stringmap_put(h, s, &status);

  if(status == -1) RAISE_ERROR("out of space in hash put");
  if(status == -2) RAISE_ERROR("out of space in pool put");

  // the key stored in the bucket is the string's id in the pool
  *id = h->keys[bucket];
  return NO_ERROR;
}
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#ifndef WP_STRINGHASH_H_
|
2
|
+
#define WP_STRINGHASH_H_
|
3
|
+
|
4
|
+
// whistlepig string map
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// based on a heavily modified khash.h
|
8
|
+
//
|
9
|
+
// a stringmap is a bidirectional map from strings to int values. like termhash
|
10
|
+
// and stringpool, it uses a slightly funny API that never allocates memory,
|
11
|
+
// but instead operates on pointers to preallocated blocks of memory.
|
12
|
+
//
|
13
|
+
// uses a stringpool internally to do the int->string mapping, so
|
14
|
+
// you shouldn't have to interact with the stringpool directly; you can just
|
15
|
+
// use this object.
|
16
|
+
//
|
17
|
+
// like termhash and pool, has a slightly funny API that is designed to work on
|
18
|
+
// a pre-allocated chunk of memory rather than allocate any of its own.
|
19
|
+
|
20
|
+
#include <stdint.h>
|
21
|
+
#include "stringpool.h"
|
22
|
+
#include "error.h"
|
23
|
+
|
24
|
+
/* list of primes from khash.h:
|
25
|
+
0ul, 3ul, 11ul, 23ul, 53ul,
|
26
|
+
97ul, 193ul, 389ul, 769ul, 1543ul,
|
27
|
+
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
|
28
|
+
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
|
29
|
+
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
|
30
|
+
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
|
31
|
+
3221225473ul, 4294967291ul
|
32
|
+
*/
|
33
|
+
|
34
|
+
#define INITIAL_N_BUCKETS_IDX 1
|
35
|
+
|
36
|
+
typedef struct stringmap {
|
37
|
+
uint8_t n_buckets_idx;
|
38
|
+
uint32_t n_buckets, size, n_occupied, upper_bound;
|
39
|
+
uint32_t *flags;
|
40
|
+
uint32_t *keys;
|
41
|
+
stringpool* pool;
|
42
|
+
uint8_t boundary[];
|
43
|
+
// in memory at this point
|
44
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
45
|
+
// n_buckets uint32_t's for the keys
|
46
|
+
} stringmap;
|
47
|
+
|
48
|
+
// API methods
|
49
|
+
|
50
|
+
// public: write a new stringmap to memory
|
51
|
+
void stringmap_init(stringmap* h, stringpool* p);
|
52
|
+
|
53
|
+
// public: set up an existing stringmap in memory
|
54
|
+
void stringmap_setup(stringmap* h, stringpool* p);
|
55
|
+
|
56
|
+
// public: add a string. sets id to its id. dupes are fine; will just set the
|
57
|
+
// id correctly.
|
58
|
+
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) RAISES_ERROR;
|
59
|
+
|
60
|
+
// public: get the int value given a string. returns (uint32_t)-1 if not found.
|
61
|
+
uint32_t stringmap_string_to_int(stringmap* h, const char* s);
|
62
|
+
|
63
|
+
// public: get the string value given an int. returns corrupt data if the int
|
64
|
+
// is invalid.
|
65
|
+
const char* stringmap_int_to_string(stringmap* h, uint32_t i);
|
66
|
+
|
67
|
+
// public: returns the byte size of the stringmap
|
68
|
+
uint32_t stringmap_size(stringmap* h);
|
69
|
+
|
70
|
+
// public: returns the initial byte size for an empty stringmap
|
71
|
+
uint32_t stringmap_initial_size();
|
72
|
+
|
73
|
+
// public: returns the byte size for the next larger version of a stringmap
|
74
|
+
uint32_t stringmap_next_size(stringmap* h);
|
75
|
+
|
76
|
+
// public: does the stringmap need a size increase?
|
77
|
+
int stringmap_needs_bump(stringmap* h);
|
78
|
+
|
79
|
+
// public: increases the size of the stringmap
|
80
|
+
wp_error* stringmap_bump_size(stringmap *h) RAISES_ERROR;
|
81
|
+
|
82
|
+
#endif
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#include "whistlepig.h"
|
2
|
+
|
3
|
+
// set up an empty pool of INITIAL_POOL_SIZE chars
void stringpool_init(stringpool* p) {
  p->size = INITIAL_POOL_SIZE;
  p->next = 1;  // the first string goes at index 1; id 0 is never handed out
}
|
7
|
+
|
8
|
+
// total bytes taken up by the pool: the header plus p->size chars
uint32_t stringpool_size(stringpool* p) {
  size_t pool_bytes = p->size * sizeof(char);
  return sizeof(stringpool) + pool_bytes;
}
|
11
|
+
|
12
|
+
// copy s (including its terminating NUL) into the pool and return the id
// under which it can be looked up again. returns (uint32_t)-1, leaving the
// pool untouched, if the string doesn't fit.
uint32_t stringpool_add(stringpool* p, const char* s) {
  size_t len = strlen(s) + 1;  // bytes to store, NUL included

  // same condition as (p->next + len >= p->size), but phrased so that a very
  // long string can't wrap the 32-bit sum around and slip past the check.
  if((p->next >= p->size) || (len >= (size_t)(p->size - p->next))) {
    DEBUG("out of space in string pool for %s (len %zu, next %u, size %u)", s, len, p->next, p->size);
    return (uint32_t)-1;
  }

  uint32_t ret = p->next;
  p->next += (uint32_t)len;  // len < p->size here, so it fits in 32 bits
  DEBUG("writing %zu bytes to %p -- %p", len, (void*)&(p->pool[ret]), (void*)(&(p->pool[ret]) + len));
  memcpy(&(p->pool[ret]), s, len);  // exactly len bytes, ending with the NUL
  return ret;
}
|
24
|
+
|
25
|
+
// returns 1 once at least 90% (rounded down) of the pool is in use, else 0.
//
// the threshold is computed with 64-bit integer arithmetic: a float-to-int
// conversion would be undefined for pool sizes whose 90% mark exceeds INT_MAX.
int stringpool_needs_bump(stringpool* p) {
  uint32_t threshold = (uint32_t)(((uint64_t)p->size * 9) / 10);
  return (p->next >= threshold) ? 1 : 0;
}
|
28
|
+
|
29
|
+
// total bytes (header included) of a pool with twice the current capacity.
// a capacity of 0 counts as 1.
uint32_t stringpool_next_size(stringpool* p) {
  uint32_t current = (p->size == 0) ? 1 : p->size;
  return sizeof(stringpool) + (2 * current * sizeof(char));
}
|
32
|
+
|
33
|
+
uint32_t stringpool_initial_size() {
|
34
|
+
return sizeof(stringpool) + INITIAL_POOL_SIZE;
|
35
|
+
}
|
36
|
+
|
37
|
+
// record that the pool has grown to its next size. the memory behind p must
// already have been enlarged to stringpool_next_size(p) bytes.
//
// p->size counts pool chars only (stringpool_size adds the header on top), so
// the header has to be subtracted from the byte size that
// stringpool_next_size reports. otherwise the pool would claim
// sizeof(stringpool) more chars than the enlarged memory actually holds.
void stringpool_bump_size(stringpool* p) {
  p->size = stringpool_next_size(p) - (uint32_t)sizeof(stringpool);
}
|
40
|
+
|
41
|
+
// return the string stored under id, or NULL if id is 0 or lies at or beyond
// the next free position.
char* stringpool_lookup(stringpool* p, uint32_t id) {
  int in_range = (id != 0) && (id < p->next);
  return in_range ? &p->pool[id] : NULL;
}
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#ifndef WP_STRINGPOOL_H_
|
2
|
+
#define WP_STRINGPOOL_H_
|
3
|
+
|
4
|
+
// whistlepig string pool
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// a string pool. adds strings to a big blob and returns an int which can be
|
8
|
+
// used to look them up later. in other words, an int->string mapping, where
|
9
|
+
// you provide the string and we'll give you an int.
|
10
|
+
//
|
11
|
+
// does no duplicate detection, if you add the same string twice, you will
|
12
|
+
// get two different ints and you will have wasted memory.
|
13
|
+
//
|
14
|
+
// this is used by stringmap to maintain a bidirectional string<->int mapping
|
15
|
+
// and is not really used directly.
|
16
|
+
//
|
17
|
+
// int 0 is a special case for the null string. passing in invalid ints (i.e.
|
18
|
+
// ints i didn't return) will result in garbage data.
|
19
|
+
//
|
20
|
+
// like termhash and stringmap, has a slightly funny API that is designed to
|
21
|
+
// work on a pre-allocated chunk of memory rather than allocate any of its own.
|
22
|
+
|
23
|
+
#include <stdint.h>
|
24
|
+
|
25
|
+
#define INITIAL_POOL_SIZE 2048
|
26
|
+
|
27
|
+
typedef struct stringpool {
  uint32_t size;  // capacity of pool, in chars
  uint32_t next;  // index in pool where the next string will be written
  char pool[];    // the strings, each NUL-terminated
} stringpool;
|
31
|
+
|
32
|
+
// API methods
|
33
|
+
|
34
|
+
// public: create a stringpool
|
35
|
+
void stringpool_init(stringpool* p);
|
36
|
+
|
37
|
+
// public: add a string, returning an int
|
38
|
+
uint32_t stringpool_add(stringpool* p, const char* s);
|
39
|
+
|
40
|
+
// public: does this stringpool need to be increased?
|
41
|
+
int stringpool_needs_bump(stringpool* p);
|
42
|
+
|
43
|
+
// public: increase the size of the stringpool
|
44
|
+
void stringpool_bump_size(stringpool* p);
|
45
|
+
|
46
|
+
// public: given an id, return the string
|
47
|
+
char* stringpool_lookup(stringpool* p, uint32_t id);
|
48
|
+
|
49
|
+
// public: returns the byte size of the pool
|
50
|
+
uint32_t stringpool_size(stringpool* p);
|
51
|
+
|
52
|
+
// public: returns the initial byte size for an empty pool
|
53
|
+
uint32_t stringpool_initial_size();
|
54
|
+
|
55
|
+
// public: returns the byte size for the next larger version of a pool
|
56
|
+
uint32_t stringpool_next_size(stringpool* p);
|
57
|
+
|
58
|
+
#endif
|