whistlepig 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,294 @@
1
+ #include "whistlepig.h"
2
+
3
/*
 * Bucket counts the hash steps through as it grows, in increasing order.
 * h->n_buckets_idx is an index into this table and h->n_buckets is the entry
 * at that index. Entry 0 is a zero-bucket placeholder; termhash_init() starts
 * at a later index.
 */
static const uint32_t prime_list[] = {
  0u,          3u,          11u,         23u,         53u,
  97u,         193u,        389u,        769u,        1543u,
  3079u,       6151u,       12289u,      24593u,      49157u,
  98317u,      196613u,     393241u,     786433u,     1572869u,
  3145739u,    6291469u,    12582917u,   25165843u,   50331653u,
  100663319u,  201326611u,  402653189u,  805306457u,  1610612741u,
  3221225473u, 4294967291u
};

/*
 * Number of entries in prime_list. Derived from the table itself so the two
 * can never drift apart when the table is edited.
 */
static const int HASH_PRIME_SIZE = (int)(sizeof(prime_list) / sizeof(prime_list[0]));
14
+
15
/*
 * Per-slot state flags, packed two bits per slot, sixteen slots per uint32_t
 * word. Within a slot's pair, the high bit (value 2) means "empty" and the
 * low bit (value 1) means "deleted"; both clear means the slot holds a live
 * entry. A word filled with 0xaa bytes therefore marks all sixteen of its
 * slots as empty.
 *
 * `flag` is the flag word array and `i` the slot index. Both arguments are
 * fully parenthesized so compound expressions are safe, but `i` is still
 * evaluated more than once: do not pass expressions with side effects.
 */
#define isempty(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 2)
#define isdel(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 1)
#define iseither(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 3)
#define set_isdel_false(flag, i) ((flag)[(i) >> 4] &= ~(1ul << (((i) & 0xfU) << 1)))
#define set_isempty_false(flag, i) ((flag)[(i) >> 4] &= ~(2ul << (((i) & 0xfU) << 1)))
#define set_isboth_false(flag, i) ((flag)[(i) >> 4] &= ~(3ul << (((i) & 0xfU) << 1)))
#define set_isdel_true(flag, i) ((flag)[(i) >> 4] |= 1ul << (((i) & 0xfU) << 1))
22
+
23
/*
 * Load-factor threshold. h->upper_bound is computed as n_buckets * HASH_UPPER
 * (rounded), and termhash_needs_bump() reports true once n_occupied reaches it.
 */
static const double HASH_UPPER = 0.77;
24
+
25
+ static uint32_t hash_term(term t) {
26
+ return t.word_s ^ t.field_s;
27
+ }
28
+
29
+ static int term_equals(term a, term b) {
30
+ return a.word_s == b.word_s && a.field_s == b.field_s;
31
+ }
32
+
33
+ void termhash_init(termhash* h) {
34
+ h->n_buckets_idx = 1;
35
+ h->n_buckets = prime_list[h->n_buckets_idx];
36
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
37
+ h->size = h->n_occupied = 0;
38
+ termhash_setup(h);
39
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
40
+ }
41
+
42
/*
 * Distance in bytes from address b to address a, as a long. Arguments and the
 * whole expansion are parenthesized so pointer expressions such as `p + 1`
 * are measured correctly.
 */
#define OFFSET(a, b) ((long)((uint8_t*)(a) - (uint8_t*)(b)))
43
+ // set flags, keys and vals to correct locations based on h->n_buckets
44
+ void termhash_setup(termhash* h) {
45
+ DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
46
+ DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
47
+ h->flags = (uint32_t*)h->boundary;
48
+ h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
49
+ h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
50
+ DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
51
+ DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
52
+ DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
53
+ }
54
+
55
+ /*
56
+ static void termhash_dump(termhash* h) {
57
+ for(uint32_t i = 0; i < h->n_buckets; i++) {
58
+ if(isempty(h->flags, i)) printf("%u:\n", i);
59
+ else if(isdel(h->flags, i)) printf("%u: [deleted]", i);
60
+ else {
61
+ term t = h->keys[i];
62
+ printf("%u: (%u,%u)\n", i, t.field_s, t.word_s);
63
+ }
64
+ }
65
+ }
66
+ */
67
+
68
+ /*
69
+ static void kh_destroy_##name(kh_##name##_t *h) {
70
+ if (h) {
71
+ free(h->keys); free(h->flags);
72
+ free(h->vals);
73
+ free(h);
74
+ }
75
+ }
76
+
77
+ static void kh_clear_##name(kh_##name##_t *h) {
78
+ if (h && h->flags) {
79
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
80
+ h->size = h->n_occupied = 0;
81
+ }
82
+ }
83
+ */
84
+
85
+ uint32_t termhash_get(termhash *h, term key) {
86
+ if(h->n_buckets) {
87
+ uint32_t inc, k, i, last;
88
+ k = hash_term(key); i = k % h->n_buckets;
89
+ inc = 1 + k % (h->n_buckets - 1); last = i;
90
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !term_equals(h->keys[i], key))) {
91
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
92
+ else i += inc;
93
+ if (i == last) return h->n_buckets;
94
+ }
95
+ return iseither(h->flags, i)? h->n_buckets : i;
96
+ }
97
+ else return 0;
98
+ }
99
+
100
+ wp_error* termhash_bump_size(termhash *h) {
101
+ DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
102
+ DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
103
+ DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
104
+ DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
105
+
106
+ h->n_buckets_idx++;
107
+ if(h->n_buckets_idx > HASH_PRIME_SIZE) exit(1); // die horribly TODO fixme
108
+ uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
109
+
110
+ // first make a backup of the oldflags
111
+ size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
112
+ uint32_t* oldflags = malloc(oldflagsize);
113
+ memcpy(oldflags, h->flags, oldflagsize);
114
+
115
+ // keep pointers to the old locations
116
+ term* oldkeys = h->keys;
117
+ uint32_t* oldvals = h->vals;
118
+
119
+ // set pointers to the new locations
120
+ h->keys = (term*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
121
+ h->vals = (uint32_t*)((term*)h->keys + new_n_buckets);
122
+
123
+ // move the vals and keys
124
+ memmove(h->vals, oldvals, h->n_buckets * sizeof(uint32_t));
125
+ memmove(h->keys, oldkeys, h->n_buckets * sizeof(term));
126
+
127
+ // clear the new flags
128
+ memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
129
+
130
+ // do the complicated stuff from khash.h
131
+ for (unsigned int j = 0; j != h->n_buckets; ++j) {
132
+ if (iseither(oldflags, j) == 0) {
133
+ term key = h->keys[j];
134
+ uint32_t val;
135
+ val = h->vals[j];
136
+ set_isdel_true(oldflags, j);
137
+ while (1) {
138
+ uint32_t inc, k, i;
139
+ k = hash_term(key);
140
+ i = k % new_n_buckets;
141
+ inc = 1 + k % (new_n_buckets - 1);
142
+ while (!isempty(h->flags, i)) {
143
+ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
144
+ else i += inc;
145
+ }
146
+ set_isempty_false(h->flags, i);
147
+ if (i < h->n_buckets && iseither(oldflags, i) == 0) {
148
+ { term tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
149
+ { uint32_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; }
150
+ set_isdel_true(oldflags, i);
151
+ } else {
152
+ h->keys[i] = key;
153
+ h->vals[i] = val;
154
+ break;
155
+ }
156
+ }
157
+ }
158
+ }
159
+
160
+ free(oldflags);
161
+ h->n_buckets = new_n_buckets;
162
+ h->n_occupied = h->size;
163
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
164
+
165
+ DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
166
+ DEBUG("flags are at %p (+%ld)", h->flags, (long)((uint8_t*)h->flags - (uint8_t*)h->boundary));
167
+ DEBUG(" keys are at %p (+%ld)", h->keys, (long)((uint8_t*)h->keys - (uint8_t*)h->boundary));
168
+ DEBUG(" vals are at %p (+%ld)", h->vals, (long)((uint8_t*)h->vals - (uint8_t*)h->boundary));
169
+
170
+ #ifdef DEBUGOUTPUT
171
+ //DEBUG("and now i look like this:");
172
+ //termhash_dump(h);
173
+ #endif
174
+
175
+ return NO_ERROR;
176
+ }
177
+
178
+ uint32_t termhash_put(termhash *h, term key, int *ret) {
179
+ uint32_t x;
180
+
181
+ {
182
+ #ifdef DEBUGOUTPUT
183
+ int num_loops = 0;
184
+ #endif
185
+ uint32_t inc, k, i, site, last;
186
+ x = site = h->n_buckets; k = hash_term(key); i = k % h->n_buckets;
187
+ DEBUG("initial hash is %u", k);
188
+ if (isempty(h->flags, i)) x = i;
189
+ else {
190
+ inc = 1 + k % (h->n_buckets - 1); last = i;
191
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !term_equals(h->keys[i], key))) {
192
+ #ifdef DEBUGOUTPUT
193
+ num_loops++;
194
+ #endif
195
+ if (isdel(h->flags, i)) site = i;
196
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
197
+ else i += inc;
198
+ if (i == last) { x = site; break; }
199
+ }
200
+ if ((x == h->n_buckets) && (i == last)) { // out of space
201
+ if(!term_equals(h->keys[i], key)) {
202
+ *ret = -1;
203
+ return x;
204
+ }
205
+ }
206
+ if (x == h->n_buckets) { // didn't find it on the first try
207
+ if (isempty(h->flags, i) && site != h->n_buckets) x = site;
208
+ else x = i;
209
+ }
210
+ }
211
+ DEBUG("looped %u times to put", num_loops);
212
+ //DEBUG("x is %u, site is %u, n_buckets is %u", x, site, h->n_buckets);
213
+ }
214
+ if (isempty(h->flags, x)) {
215
+ h->keys[x] = key;
216
+ set_isboth_false(h->flags, x);
217
+ ++h->size; ++h->n_occupied;
218
+ *ret = 1;
219
+ } else if (isdel(h->flags, x)) {
220
+ h->keys[x] = key;
221
+ set_isboth_false(h->flags, x);
222
+ ++h->size;
223
+ *ret = 2;
224
+ }
225
+ else *ret = 0;
226
+
227
+ #ifdef DEBUGOUTPUT
228
+ //DEBUG("after put:");
229
+ //termhash_dump(h);
230
+ #endif
231
+
232
+ return x;
233
+ }
234
+
235
+ void termhash_del(termhash *h, uint32_t x) {
236
+ if (x != h->n_buckets && !iseither(h->flags, x)) {
237
+ set_isdel_true(h->flags, x);
238
+ --h->size;
239
+ }
240
+ }
241
+
242
+ uint32_t termhash_get_val(termhash* h, term t) {
243
+ uint32_t idx = termhash_get(h, t);
244
+ if(idx == h->n_buckets) return (uint32_t)-1;
245
+ return h->vals[idx];
246
+ }
247
+
248
+ wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
249
+ int status;
250
+ uint32_t loc = termhash_put(h, t, &status);
251
+ DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
252
+ if(status == -1) RAISE_ERROR("out of space in hash");
253
+ h->vals[loc] = val;
254
+ return NO_ERROR;
255
+ }
256
+
257
+ int termhash_needs_bump(termhash* h) {
258
+ return (h->n_occupied >= h->upper_bound);
259
+ }
260
+
261
+ // returns the total size in bytes
262
+ // memory layout: termhash, then:
263
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
264
+ // n_buckets terms for the keys
265
+ // n_buckets uint32_t's for the vals (offsets into postings lists)
266
+ static uint32_t size(uint32_t n_buckets) {
267
+ uint32_t size = sizeof(termhash) +
268
+ (((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
269
+ (n_buckets * sizeof(term)) +
270
+ (n_buckets * sizeof(uint32_t));
271
+
272
+ DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
273
+ n_buckets,
274
+ (long)sizeof(termhash),
275
+ (long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
276
+ (long)(n_buckets * sizeof(term)),
277
+ (long)(n_buckets * sizeof(uint32_t)),
278
+ size);
279
+
280
+ return size;
281
+ }
282
+
283
+ uint32_t termhash_size(termhash* h) {
284
+ return size(h->n_buckets);
285
+ }
286
+
287
+ uint32_t termhash_initial_size() {
288
+ return size(prime_list[INITIAL_N_BUCKETS_IDX]);
289
+ }
290
+
291
+ uint32_t termhash_next_size(termhash* h) {
292
+ int next_idx = (h->n_buckets_idx < (HASH_PRIME_SIZE - 1)) ? h->n_buckets_idx + 1 : h->n_buckets_idx;
293
+ return size(prime_list[next_idx]);
294
+ }
@@ -0,0 +1,79 @@
1
+ #ifndef WP_TERMHASH_H_
2
+ #define WP_TERMHASH_H_
3
+
4
+ // whistlepig term hash
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // based on a heavily modified khash.h
8
+ //
9
+ // a term, in this file, is a pair of ints. the intention is that these are the
10
+ // results of adding strings to the stringmap. the termhash is then a map from
11
+ // such pairs to ints.
12
+ //
13
+ // like stringmap and stringpool, it uses a slightly funny API that never
14
+ // allocates memory, but instead operates on pointers to preallocated blocks of
15
+ // memory.
16
+
17
+ #include <stdint.h>
18
+ #include "error.h"
19
+
20
// a term: a (field, word) pair of ints, intended to be the ids obtained by
// adding the field and word strings to the stringmap. used as the hash key.
typedef struct term {
  uint32_t field_s;  // id of the field
  uint32_t word_s;   // id of the word
} term;
24
+
25
// index into the bucket-count table used for a freshly created termhash;
// termhash_initial_size() sizes the initial memory block from it.
#define INITIAL_N_BUCKETS_IDX 1
26
+
27
+ typedef struct termhash {
28
+ uint8_t n_buckets_idx;
29
+ uint32_t n_buckets, size, n_occupied, upper_bound;
30
+ uint32_t *flags;
31
+ term *keys;
32
+ uint32_t *vals;
33
+ uint8_t boundary[];
34
+ // in memory at this point
35
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
36
+ // n_buckets terms for the keys
37
+ // n_buckets uint32_t's for the vals (offsets into postings lists)
38
+ } termhash;
39
+
40
+ // API methods
41
+
42
+ // public: make a new termhash
43
+ void termhash_init(termhash* h); // makes a new one
44
+
45
+ // public: set up an existing termhash
46
+ void termhash_setup(termhash* h); // inits one from disk
47
+
48
+ // private: khash-style getter: returns the slot id, if any, given a term key.
49
+ // you can then look this up within the vals array yourself. returns
50
+ // h->n_buckets if the term is not in the hash.
51
+ uint32_t termhash_get(termhash *h, term t);
52
+
53
+ // public: get an int given a term. returns (uint32_t)-1 if the term is not in
54
+ // the hash.
55
+ uint32_t termhash_get_val(termhash* h, term t); // convenience
56
+
57
+ // private: khash-style setter: insert a term into the hash. see the code
58
+ // for details on what all the return values mean.
59
+ uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
60
+
61
+ // public: adds a term to the hash with the given value
62
+ wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
63
+
64
+ // public: returns the byte size of the termhash
65
+ uint32_t termhash_size(termhash* h);
66
+
67
+ // public: returns the byte size for the next larger version of the termhash
68
+ uint32_t termhash_next_size(termhash* h);
69
+
70
+ // public: does the termhash need a size increase?
71
+ int termhash_needs_bump(termhash* h);
72
+
73
+ // public: increases the size of the termhash
74
+ wp_error* termhash_bump_size(termhash* h) RAISES_ERROR;
75
+
76
+ // public: returns the initial byte size for an empty termhash
77
+ uint32_t termhash_initial_size();
78
+
79
+ #endif