whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ #include "whistlepig.h"
2
+
3
// number of entries in prime_list below; valid indexes run from 0 to
// HASH_PRIME_SIZE - 1
static const int HASH_PRIME_SIZE = 32;

// the bucket counts a termhash may have, indexed by n_buckets_idx. entry 0 is
// zero; after that each entry is roughly double the one before it.
static const uint32_t prime_list[] = {
  0ul,          3ul,          11ul,         23ul,
  53ul,         97ul,         193ul,        389ul,
  769ul,        1543ul,       3079ul,       6151ul,
  12289ul,      24593ul,      49157ul,      98317ul,
  196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,
  50331653ul,   100663319ul,  201326611ul,  402653189ul,
  805306457ul,  1610612741ul, 3221225473ul, 4294967291ul
};
14
+
15
// Per-bucket state is kept in a packed bitmap: two bits per bucket, sixteen
// buckets per uint32_t word. Within a bucket's pair, the low bit (value 1)
// means "deleted" and the high bit (value 2) means "empty"; a live entry has
// both bits clear. Filling the bitmap with 0xaa bytes therefore marks every
// bucket as empty and not deleted.
//
// The query macros yield a non-zero value (not necessarily 1) when the bit in
// question is set. Both arguments are parenthesized so that arbitrary
// expressions can be passed, but `i` is still evaluated twice, so it must not
// have side effects.
#define isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

// load factor: upper_bound is computed as n_buckets * HASH_UPPER, and the hash
// reports that it needs a bump once n_occupied reaches upper_bound
static const double HASH_UPPER = 0.77;
24
+
25
+ static uint32_t hash_term(term t) {
26
+ return t.word_s ^ t.field_s;
27
+ }
28
+
29
+ static int term_equals(term a, term b) {
30
+ return a.word_s == b.word_s && a.field_s == b.field_s;
31
+ }
32
+
33
+ void termhash_init(termhash* h) {
34
+ h->n_buckets_idx = 1;
35
+ h->n_buckets = prime_list[h->n_buckets_idx];
36
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
37
+ h->size = h->n_occupied = 0;
38
+ termhash_setup(h);
39
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
40
+ }
41
+
42
+ #define OFFSET(a, b) (long)((uint8_t*)a - (uint8_t*)b)
43
+ // set flags, keys and vals to correct locations based on h->n_buckets
44
+ void termhash_setup(termhash* h) {
45
+ DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
46
+ DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
47
+ h->flags = (uint32_t*)h->boundary;
48
+ h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
49
+ h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
50
+ DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
51
+ DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
52
+ DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
53
+ }
54
+
55
+ /*
56
+ static void termhash_dump(termhash* h) {
57
+ for(uint32_t i = 0; i < h->n_buckets; i++) {
58
+ if(isempty(h->flags, i)) printf("%u:\n", i);
59
+ else if(isdel(h->flags, i)) printf("%u: [deleted]", i);
60
+ else {
61
+ term t = h->keys[i];
62
+ printf("%u: (%u,%u)\n", i, t.field_s, t.word_s);
63
+ }
64
+ }
65
+ }
66
+ */
67
+
68
+ /*
69
+ static void kh_destroy_##name(kh_##name##_t *h) {
70
+ if (h) {
71
+ free(h->keys); free(h->flags);
72
+ free(h->vals);
73
+ free(h);
74
+ }
75
+ }
76
+
77
+ static void kh_clear_##name(kh_##name##_t *h) {
78
+ if (h && h->flags) {
79
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
80
+ h->size = h->n_occupied = 0;
81
+ }
82
+ }
83
+ */
84
+
85
+ uint32_t termhash_get(termhash *h, term key) {
86
+ if(h->n_buckets) {
87
+ uint32_t inc, k, i, last;
88
+ k = hash_term(key); i = k % h->n_buckets;
89
+ inc = 1 + k % (h->n_buckets - 1); last = i;
90
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !term_equals(h->keys[i], key))) {
91
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
92
+ else i += inc;
93
+ if (i == last) return h->n_buckets;
94
+ }
95
+ return iseither(h->flags, i)? h->n_buckets : i;
96
+ }
97
+ else return 0;
98
+ }
99
+
100
+ wp_error* termhash_bump_size(termhash *h) {
101
+ DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
102
+ DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
103
+ DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
104
+ DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
105
+
106
+ h->n_buckets_idx++;
107
+ if(h->n_buckets_idx > HASH_PRIME_SIZE) exit(1); // die horribly TODO fixme
108
+ uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
109
+
110
+ // first make a backup of the oldflags
111
+ size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
112
+ uint32_t* oldflags = malloc(oldflagsize);
113
+ memcpy(oldflags, h->flags, oldflagsize);
114
+
115
+ // keep pointers to the old locations
116
+ term* oldkeys = h->keys;
117
+ uint32_t* oldvals = h->vals;
118
+
119
+ // set pointers to the new locations
120
+ h->keys = (term*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
121
+ h->vals = (uint32_t*)((term*)h->keys + new_n_buckets);
122
+
123
+ // move the vals and keys
124
+ memmove(h->vals, oldvals, h->n_buckets * sizeof(uint32_t));
125
+ memmove(h->keys, oldkeys, h->n_buckets * sizeof(term));
126
+
127
+ // clear the new flags
128
+ memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
129
+
130
+ // do the complicated stuff from khash.h
131
+ for (unsigned int j = 0; j != h->n_buckets; ++j) {
132
+ if (iseither(oldflags, j) == 0) {
133
+ term key = h->keys[j];
134
+ uint32_t val;
135
+ val = h->vals[j];
136
+ set_isdel_true(oldflags, j);
137
+ while (1) {
138
+ uint32_t inc, k, i;
139
+ k = hash_term(key);
140
+ i = k % new_n_buckets;
141
+ inc = 1 + k % (new_n_buckets - 1);
142
+ while (!isempty(h->flags, i)) {
143
+ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
144
+ else i += inc;
145
+ }
146
+ set_isempty_false(h->flags, i);
147
+ if (i < h->n_buckets && iseither(oldflags, i) == 0) {
148
+ { term tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
149
+ { uint32_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; }
150
+ set_isdel_true(oldflags, i);
151
+ } else {
152
+ h->keys[i] = key;
153
+ h->vals[i] = val;
154
+ break;
155
+ }
156
+ }
157
+ }
158
+ }
159
+
160
+ free(oldflags);
161
+ h->n_buckets = new_n_buckets;
162
+ h->n_occupied = h->size;
163
+ h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
164
+
165
+ DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
166
+ DEBUG("flags are at %p (+%ld)", h->flags, (long)((uint8_t*)h->flags - (uint8_t*)h->boundary));
167
+ DEBUG(" keys are at %p (+%ld)", h->keys, (long)((uint8_t*)h->keys - (uint8_t*)h->boundary));
168
+ DEBUG(" vals are at %p (+%ld)", h->vals, (long)((uint8_t*)h->vals - (uint8_t*)h->boundary));
169
+
170
+ #ifdef DEBUGOUTPUT
171
+ //DEBUG("and now i look like this:");
172
+ //termhash_dump(h);
173
+ #endif
174
+
175
+ return NO_ERROR;
176
+ }
177
+
178
+ uint32_t termhash_put(termhash *h, term key, int *ret) {
179
+ uint32_t x;
180
+
181
+ {
182
+ #ifdef DEBUGOUTPUT
183
+ int num_loops = 0;
184
+ #endif
185
+ uint32_t inc, k, i, site, last;
186
+ x = site = h->n_buckets; k = hash_term(key); i = k % h->n_buckets;
187
+ DEBUG("initial hash is %u", k);
188
+ if (isempty(h->flags, i)) x = i;
189
+ else {
190
+ inc = 1 + k % (h->n_buckets - 1); last = i;
191
+ while (!isempty(h->flags, i) && (isdel(h->flags, i) || !term_equals(h->keys[i], key))) {
192
+ #ifdef DEBUGOUTPUT
193
+ num_loops++;
194
+ #endif
195
+ if (isdel(h->flags, i)) site = i;
196
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
197
+ else i += inc;
198
+ if (i == last) { x = site; break; }
199
+ }
200
+ if ((x == h->n_buckets) && (i == last)) { // out of space
201
+ if(!term_equals(h->keys[i], key)) {
202
+ *ret = -1;
203
+ return x;
204
+ }
205
+ }
206
+ if (x == h->n_buckets) { // didn't find it on the first try
207
+ if (isempty(h->flags, i) && site != h->n_buckets) x = site;
208
+ else x = i;
209
+ }
210
+ }
211
+ DEBUG("looped %u times to put", num_loops);
212
+ //DEBUG("x is %u, site is %u, n_buckets is %u", x, site, h->n_buckets);
213
+ }
214
+ if (isempty(h->flags, x)) {
215
+ h->keys[x] = key;
216
+ set_isboth_false(h->flags, x);
217
+ ++h->size; ++h->n_occupied;
218
+ *ret = 1;
219
+ } else if (isdel(h->flags, x)) {
220
+ h->keys[x] = key;
221
+ set_isboth_false(h->flags, x);
222
+ ++h->size;
223
+ *ret = 2;
224
+ }
225
+ else *ret = 0;
226
+
227
+ #ifdef DEBUGOUTPUT
228
+ //DEBUG("after put:");
229
+ //termhash_dump(h);
230
+ #endif
231
+
232
+ return x;
233
+ }
234
+
235
+ void termhash_del(termhash *h, uint32_t x) {
236
+ if (x != h->n_buckets && !iseither(h->flags, x)) {
237
+ set_isdel_true(h->flags, x);
238
+ --h->size;
239
+ }
240
+ }
241
+
242
+ uint32_t termhash_get_val(termhash* h, term t) {
243
+ uint32_t idx = termhash_get(h, t);
244
+ if(idx == h->n_buckets) return (uint32_t)-1;
245
+ return h->vals[idx];
246
+ }
247
+
248
+ wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
249
+ int status;
250
+ uint32_t loc = termhash_put(h, t, &status);
251
+ DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
252
+ if(status == -1) RAISE_ERROR("out of space in hash");
253
+ h->vals[loc] = val;
254
+ return NO_ERROR;
255
+ }
256
+
257
+ int termhash_needs_bump(termhash* h) {
258
+ return (h->n_occupied >= h->upper_bound);
259
+ }
260
+
261
+ // returns the total size in bytes
262
+ // memory layout: termhash, then:
263
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
264
+ // n_buckets terms for the keys
265
+ // n_buckets uint32_t's for the vals (offsets into postings lists)
266
+ static uint32_t size(uint32_t n_buckets) {
267
+ uint32_t size = sizeof(termhash) +
268
+ (((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
269
+ (n_buckets * sizeof(term)) +
270
+ (n_buckets * sizeof(uint32_t));
271
+
272
+ DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
273
+ n_buckets,
274
+ (long)sizeof(termhash),
275
+ (long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
276
+ (long)(n_buckets * sizeof(term)),
277
+ (long)(n_buckets * sizeof(uint32_t)),
278
+ size);
279
+
280
+ return size;
281
+ }
282
+
283
+ uint32_t termhash_size(termhash* h) {
284
+ return size(h->n_buckets);
285
+ }
286
+
287
+ uint32_t termhash_initial_size() {
288
+ return size(prime_list[INITIAL_N_BUCKETS_IDX]);
289
+ }
290
+
291
+ uint32_t termhash_next_size(termhash* h) {
292
+ int next_idx = (h->n_buckets_idx < (HASH_PRIME_SIZE - 1)) ? h->n_buckets_idx + 1 : h->n_buckets_idx;
293
+ return size(prime_list[next_idx]);
294
+ }
@@ -0,0 +1,79 @@
1
+ #ifndef WP_TERMHASH_H_
2
+ #define WP_TERMHASH_H_
3
+
4
+ // whistlepig term hash
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // based on a heavily modified khash.h
8
+ //
9
+ // a term, in this file, is a pair of ints. the intention is that these are the
10
+ // results of adding strings to the stringmap. the termhash is then a map from
11
+ // such pairs to ints.
12
+ //
13
+ // like stringmap and stringpool, it uses a slightly funny API that never
14
+ // allocates memory, but instead operates on pointers to preallocated blocks of
15
+ // memory.
16
+
17
+ #include <stdint.h>
18
+ #include "error.h"
19
+
20
// a term: a (field, word) pair of ints, the key type of the termhash
struct term {
  uint32_t field_s; // the field half of the pair
  uint32_t word_s;  // the word half of the pair
};
typedef struct term term;
24
+
25
+ #define INITIAL_N_BUCKETS_IDX 1
26
+
27
+ typedef struct termhash {
28
+ uint8_t n_buckets_idx;
29
+ uint32_t n_buckets, size, n_occupied, upper_bound;
30
+ uint32_t *flags;
31
+ term *keys;
32
+ uint32_t *vals;
33
+ uint8_t boundary[];
34
+ // in memory at this point
35
+ // ((n_buckets >> 4) + 1) uint32_t's for the flags
36
+ // n_buckets terms for the keys
37
+ // n_buckets uint32_t's for the vals (offsets into postings lists)
38
+ } termhash;
39
+
40
+ // API methods
41
+
42
+ // public: make a new termhash
43
+ void termhash_init(termhash* h); // makes a new one
44
+
45
+ // public: set up an existing termhash
46
+ void termhash_setup(termhash* h); // inits one from disk
47
+
48
+ // private: khash-style getter: returns the slot id, if any, given a term key.
49
+ // you can then look this up within the vals array yourself. returns
50
+ // h->n_buckets if the term is not in the hash.
51
+ uint32_t termhash_get(termhash *h, term t);
52
+
53
+ // public: get an int given a term. returns (uint32_t)-1 if the term is not in
54
+ // the hash.
55
+ uint32_t termhash_get_val(termhash* h, term t); // convenience
56
+
57
+ // private: khash-style setter: insert a term into the hash. see the code
58
+ // for details on what all the return values mean.
59
+ uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
60
+
61
+ // public: adds a term to the hash with the given value
62
+ wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
63
+
64
+ // public: returns the byte size of the termhash
65
+ uint32_t termhash_size(termhash* h);
66
+
67
+ // public: returns the byte size for the next larger version of the termhash
68
+ uint32_t termhash_next_size(termhash* h);
69
+
70
+ // public: does the termhash need a size increase?
71
+ int termhash_needs_bump(termhash* h);
72
+
73
+ // public: increases the size of the termhash
74
+ wp_error* termhash_bump_size(termhash* h) RAISES_ERROR;
75
+
76
+ // public: returns the initial byte size for an empty termhash
77
+ uint32_t termhash_initial_size();
78
+
79
+ #endif