whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,294 @@
|
|
1
|
+
#include "whistlepig.h"
|
2
|
+
|
3
|
+
// Bucket-count schedule for the hash. h->n_buckets_idx indexes into this
// table, and growing the hash moves to the next entry. Entry 0 is a
// zero-bucket placeholder.
static const uint32_t prime_list[] = {
  0ul,          3ul,          11ul,         23ul,         53ul,
  97ul,         193ul,        389ul,        769ul,        1543ul,
  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
  3221225473ul, 4294967291ul
};

// Number of entries in prime_list (currently 32). Derived from the array
// itself so that the bounds checks which use it can never disagree with the
// table's real length.
static const int HASH_PRIME_SIZE = (int)(sizeof(prime_list) / sizeof(prime_list[0]));
|
14
|
+
|
15
|
+
// Slot state flags. Every slot owns two adjacent bits in the flags array,
// 16 slots per uint32_t word:
//   bit 0 (value 1) = slot is deleted
//   bit 1 (value 2) = slot is empty
// A flags array memset to 0xaa therefore marks every slot "empty, not deleted".
//
// All macro arguments are parenthesized so that compound expressions can be
// passed safely as the flags pointer or the slot index.
#define isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

// Fill ratio used to derive h->upper_bound from h->n_buckets; once
// n_occupied reaches that bound the hash reports that it needs a bump.
static const double HASH_UPPER = 0.77;
|
24
|
+
|
25
|
+
// Hash a term by XOR-ing its two component ids together.
static uint32_t hash_term(term t) {
  const uint32_t mixed = t.word_s ^ t.field_s;
  return mixed;
}
|
28
|
+
|
29
|
+
// Two terms are equal when both their word and field ids match.
// Returns 1 if equal, 0 otherwise.
static int term_equals(term a, term b) {
  if (a.word_s != b.word_s) return 0;
  return a.field_s == b.field_s;
}
|
32
|
+
|
33
|
+
// Initialise a brand-new, empty termhash in the memory block that h points at.
// The block must be big enough for the header plus the flags/keys/vals arrays
// of the initial bucket count; nothing is allocated here.
void termhash_init(termhash* h) {
  // no entries yet
  h->size = 0;
  h->n_occupied = 0;

  // start at entry 1 of prime_list (the value INITIAL_N_BUCKETS_IDX is
  // defined as, which termhash_initial_size() uses to size the block)
  h->n_buckets_idx = 1;
  h->n_buckets = prime_list[h->n_buckets_idx];
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);

  // point flags/keys/vals into the trailing memory
  termhash_setup(h);

  // 0xaa sets the "empty" bit of every 2-bit slot flag
  const size_t flag_bytes = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
  memset(h->flags, 0xaa, flag_bytes);
}
|
41
|
+
|
42
|
+
// Byte distance from pointer b to pointer a, as a long (for debug output).
// Arguments are parenthesized so the cast applies to the whole expression,
// e.g. OFFSET(p + 1, p) is sizeof(*p), not 1.
#define OFFSET(a, b) ((long)((uint8_t*)(a) - (uint8_t*)(b)))
|
43
|
+
// set flags, keys and vals to correct locations based on h->n_buckets
|
44
|
+
void termhash_setup(termhash* h) {
|
45
|
+
DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
|
46
|
+
DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
|
47
|
+
h->flags = (uint32_t*)h->boundary;
|
48
|
+
h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
49
|
+
h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
|
50
|
+
DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
|
51
|
+
DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
|
52
|
+
DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
|
53
|
+
}
|
54
|
+
|
55
|
+
/*
|
56
|
+
static void termhash_dump(termhash* h) {
|
57
|
+
for(uint32_t i = 0; i < h->n_buckets; i++) {
|
58
|
+
if(isempty(h->flags, i)) printf("%u:\n", i);
|
59
|
+
else if(isdel(h->flags, i)) printf("%u: [deleted]", i);
|
60
|
+
else {
|
61
|
+
term t = h->keys[i];
|
62
|
+
printf("%u: (%u,%u)\n", i, t.field_s, t.word_s);
|
63
|
+
}
|
64
|
+
}
|
65
|
+
}
|
66
|
+
*/
|
67
|
+
|
68
|
+
/*
|
69
|
+
static void kh_destroy_##name(kh_##name##_t *h) {
|
70
|
+
if (h) {
|
71
|
+
free(h->keys); free(h->flags);
|
72
|
+
free(h->vals);
|
73
|
+
free(h);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
static void kh_clear_##name(kh_##name##_t *h) {
|
78
|
+
if (h && h->flags) {
|
79
|
+
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
80
|
+
h->size = h->n_occupied = 0;
|
81
|
+
}
|
82
|
+
}
|
83
|
+
*/
|
84
|
+
|
85
|
+
// Find the slot holding `key`.
// Returns the slot index, or h->n_buckets when the key is not in the hash
// (for a zero-bucket hash that is 0).
uint32_t termhash_get(termhash *h, term key) {
  if (!h->n_buckets) return 0;

  const uint32_t hash = hash_term(key);
  const uint32_t start = hash % h->n_buckets;
  const uint32_t step = 1 + hash % (h->n_buckets - 1);  // probe stride, derived from the hash
  uint32_t slot = start;

  // follow the probe sequence until we reach an empty slot or a live slot
  // that holds the key; deleted slots are skipped over
  for (;;) {
    if (isempty(h->flags, slot)) break;
    if (!isdel(h->flags, slot) && term_equals(h->keys[slot], key)) break;

    if (slot + step >= h->n_buckets) slot = slot + step - h->n_buckets;
    else slot += step;

    if (slot == start) return h->n_buckets;  // came full circle: not present
  }

  // stopping on an empty (or deleted) slot means the key was not found
  if (iseither(h->flags, slot)) return h->n_buckets;
  return slot;
}
|
99
|
+
|
100
|
+
// Grow the hash to the next bucket count in prime_list and rehash every live
// entry in place.
//
// No memory for the table itself is allocated here: the keys and vals arrays
// are moved to their new offsets after h->boundary, so the block h lives in
// must already be large enough for the new layout (termhash_next_size gives
// that size). A temporary heap copy of the old flags is used during the
// rehash and freed before returning.
//
// Returns NO_ERROR on success. If there is no larger bucket count left, or
// the temporary flag copy cannot be allocated, the process exits with
// status 1.
wp_error* termhash_bump_size(termhash *h) {
  DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
  DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
  DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
  DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));

  h->n_buckets_idx++;
  // prime_list has HASH_PRIME_SIZE entries, so the last valid index is
  // HASH_PRIME_SIZE - 1. Using ">" here would let index HASH_PRIME_SIZE
  // through and read one element past the end of the table.
  if(h->n_buckets_idx >= HASH_PRIME_SIZE) exit(1); // die horribly TODO fixme
  uint32_t new_n_buckets = prime_list[h->n_buckets_idx];

  // first make a backup of the oldflags
  size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
  uint32_t* oldflags = malloc(oldflagsize);
  if(!oldflags) exit(1); // out of memory: same die-horribly policy as above, TODO fixme
  memcpy(oldflags, h->flags, oldflagsize);

  // keep pointers to the old locations
  term* oldkeys = h->keys;
  uint32_t* oldvals = h->vals;

  // set pointers to the new locations
  h->keys = (term*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
  h->vals = (uint32_t*)((term*)h->keys + new_n_buckets);

  // move the vals and keys. vals go first: their new home lies beyond the
  // end of the old keys, so the keys are still intact when they are moved.
  memmove(h->vals, oldvals, h->n_buckets * sizeof(uint32_t));
  memmove(h->keys, oldkeys, h->n_buckets * sizeof(term));

  // clear the new flags (every slot empty, not deleted)
  memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));

  // do the complicated stuff from khash.h: walk the old slots and re-insert
  // each live entry at its position under the new bucket count. oldflags
  // tracks which old slots still hold an entry that has not been re-placed
  // (they are marked deleted once picked up). If the target slot still holds
  // such an entry, swap it out and keep going with the evicted one.
  for (unsigned int j = 0; j != h->n_buckets; ++j) {
    if (iseither(oldflags, j) == 0) {
      term key = h->keys[j];
      uint32_t val;
      val = h->vals[j];
      set_isdel_true(oldflags, j);
      while (1) {
        uint32_t inc, k, i;
        k = hash_term(key);
        i = k % new_n_buckets;
        inc = 1 + k % (new_n_buckets - 1);
        while (!isempty(h->flags, i)) {
          if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
          else i += inc;
        }
        set_isempty_false(h->flags, i);
        if (i < h->n_buckets && iseither(oldflags, i) == 0) {
          // target holds an entry that has not been rehashed yet: evict it
          { term tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
          { uint32_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; }
          set_isdel_true(oldflags, i);
        } else {
          h->keys[i] = key;
          h->vals[i] = val;
          break;
        }
      }
    }
  }

  free(oldflags);
  h->n_buckets = new_n_buckets;
  h->n_occupied = h->size; // deleted slots were dropped by the rehash
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);

  DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
  DEBUG("flags are at %p (+%ld)", h->flags, (long)((uint8_t*)h->flags - (uint8_t*)h->boundary));
  DEBUG(" keys are at %p (+%ld)", h->keys, (long)((uint8_t*)h->keys - (uint8_t*)h->boundary));
  DEBUG(" vals are at %p (+%ld)", h->vals, (long)((uint8_t*)h->vals - (uint8_t*)h->boundary));

#ifdef DEBUGOUTPUT
  //DEBUG("and now i look like this:");
  //termhash_dump(h);
#endif

  return NO_ERROR;
}
|
177
|
+
|
178
|
+
// khash-style insert: find (or claim) the slot for `key`.
//
// Returns the slot index and reports what happened through *ret:
//   -1  no slot available (the return value is then h->n_buckets)
//    0  the key was already present; nothing changed
//    1  the key was stored in a previously empty slot
//    2  the key was stored in a previously deleted slot
// The value array is not touched; the caller stores into h->vals itself.
uint32_t termhash_put(termhash *h, term key, int *ret) {
#ifdef DEBUGOUTPUT
  int num_loops = 0;
#endif
  const uint32_t n = h->n_buckets;
  const uint32_t hash = hash_term(key);
  uint32_t slot = n;            // chosen slot; n means "not decided yet"
  uint32_t tomb = n;            // last deleted slot seen while probing, if any
  uint32_t probe = hash % n;

  DEBUG("initial hash is %u", hash);

  if (isempty(h->flags, probe)) {
    slot = probe;
  } else {
    const uint32_t step = 1 + hash % (n - 1);
    const uint32_t start = probe;

    // probe until we hit an empty slot or a live slot that holds the key
    while (!isempty(h->flags, probe) &&
           (isdel(h->flags, probe) || !term_equals(h->keys[probe], key))) {
#ifdef DEBUGOUTPUT
      num_loops++;
#endif
      if (isdel(h->flags, probe)) tomb = probe;
      probe = (probe + step >= n) ? probe + step - n : probe + step;
      if (probe == start) { slot = tomb; break; }  // went all the way round
    }

    // out of space: still undecided, back at the start, and the start slot
    // is not our key
    if (slot == n && probe == start && !term_equals(h->keys[probe], key)) {
      *ret = -1;
      return slot;
    }

    // didn't find it on the first try: prefer reusing a deleted slot when
    // the probe ended on an empty one
    if (slot == n) {
      slot = (isempty(h->flags, probe) && tomb != n) ? tomb : probe;
    }
  }
  DEBUG("looped %u times to put", num_loops);

  const int was_empty = isempty(h->flags, slot) != 0;
  const int was_deleted = isdel(h->flags, slot) != 0;

  if (was_empty || was_deleted) {
    // claim the slot for the new key
    h->keys[slot] = key;
    set_isboth_false(h->flags, slot);
    ++h->size;
    if (was_empty) {
      ++h->n_occupied;  // only a fresh slot adds to the occupied count
      *ret = 1;
    } else {
      *ret = 2;
    }
  } else {
    *ret = 0;  // key already present
  }

  return slot;
}
|
234
|
+
|
235
|
+
// Mark slot x as deleted. Does nothing when x is h->n_buckets (the
// "not found" value) or when the slot is already empty or deleted.
void termhash_del(termhash *h, uint32_t x) {
  if (x == h->n_buckets) return;
  if (iseither(h->flags, x)) return;

  set_isdel_true(h->flags, x);
  --h->size;
}
|
241
|
+
|
242
|
+
// Convenience lookup: returns the value stored for term t, or (uint32_t)-1
// when t is not in the hash.
uint32_t termhash_get_val(termhash* h, term t) {
  const uint32_t slot = termhash_get(h, t);
  return (slot == h->n_buckets) ? (uint32_t)-1 : h->vals[slot];
}
|
247
|
+
|
248
|
+
// Convenience insert: store val under term t, overwriting any value already
// stored for t. Raises an error when termhash_put reports no free slot.
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
  int status;
  const uint32_t loc = termhash_put(h, t, &status);
  DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);

  if (status == -1) {
    RAISE_ERROR("out of space in hash");
  }

  h->vals[loc] = val;
  return NO_ERROR;
}
|
256
|
+
|
257
|
+
// Returns nonzero once the number of occupied slots has reached the
// upper bound, i.e. the hash should be grown before further inserts.
int termhash_needs_bump(termhash* h) {
  return h->upper_bound <= h->n_occupied;
}
|
260
|
+
|
261
|
+
// returns the total size in bytes
|
262
|
+
// memory layout: termhash, then:
|
263
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
264
|
+
// n_buckets terms for the keys
|
265
|
+
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
266
|
+
static uint32_t size(uint32_t n_buckets) {
|
267
|
+
uint32_t size = sizeof(termhash) +
|
268
|
+
(((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
|
269
|
+
(n_buckets * sizeof(term)) +
|
270
|
+
(n_buckets * sizeof(uint32_t));
|
271
|
+
|
272
|
+
DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
|
273
|
+
n_buckets,
|
274
|
+
(long)sizeof(termhash),
|
275
|
+
(long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
|
276
|
+
(long)(n_buckets * sizeof(term)),
|
277
|
+
(long)(n_buckets * sizeof(uint32_t)),
|
278
|
+
size);
|
279
|
+
|
280
|
+
return size;
|
281
|
+
}
|
282
|
+
|
283
|
+
// Byte size of h at its current bucket count (header plus trailing arrays).
uint32_t termhash_size(termhash* h) {
  const uint32_t buckets = h->n_buckets;
  return size(buckets);
}
|
286
|
+
|
287
|
+
uint32_t termhash_initial_size() {
|
288
|
+
return size(prime_list[INITIAL_N_BUCKETS_IDX]);
|
289
|
+
}
|
290
|
+
|
291
|
+
// Byte size h would have after one size bump. When h is already at the last
// entry of prime_list, this is simply its current size.
uint32_t termhash_next_size(termhash* h) {
  int next_idx = h->n_buckets_idx;
  if (next_idx < HASH_PRIME_SIZE - 1) next_idx++;
  return size(prime_list[next_idx]);
}
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#ifndef WP_TERMHASH_H_
|
2
|
+
#define WP_TERMHASH_H_
|
3
|
+
|
4
|
+
// whistlepig term hash
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// based on a heavily modified khash.h
|
8
|
+
//
|
9
|
+
// a term, in this file, is a pair of ints. the intention is that these are the
|
10
|
+
// results of adding strings to the stringmap. the termhash is then a map from
|
11
|
+
// such pairs to ints.
|
12
|
+
//
|
13
|
+
// like stringmap and stringpool, it uses a slightly funny API that never
|
14
|
+
// allocates memory, but instead operates on pointers to preallocated blocks of
|
15
|
+
// memory.
|
16
|
+
|
17
|
+
#include <stdint.h>
|
18
|
+
#include "error.h"
|
19
|
+
|
20
|
+
// A term: a pair of integer ids, one for the field and one for the word.
// (Per the header notes, these are intended to be the results of adding the
// corresponding strings to the stringmap.)
struct term {
  uint32_t field_s;  // id of the field
  uint32_t word_s;   // id of the word
};
typedef struct term term;
|
24
|
+
|
25
|
+
#define INITIAL_N_BUCKETS_IDX 1
|
26
|
+
|
27
|
+
typedef struct termhash {
|
28
|
+
uint8_t n_buckets_idx;
|
29
|
+
uint32_t n_buckets, size, n_occupied, upper_bound;
|
30
|
+
uint32_t *flags;
|
31
|
+
term *keys;
|
32
|
+
uint32_t *vals;
|
33
|
+
uint8_t boundary[];
|
34
|
+
// in memory at this point
|
35
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
36
|
+
// n_buckets terms for the keys
|
37
|
+
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
38
|
+
} termhash;
|
39
|
+
|
40
|
+
// API methods
|
41
|
+
|
42
|
+
// public: make a new termhash
|
43
|
+
void termhash_init(termhash* h); // makes a new one
|
44
|
+
|
45
|
+
// public: set up an existing termhash
|
46
|
+
void termhash_setup(termhash* h); // inits one from disk
|
47
|
+
|
48
|
+
// private: khash-style getter: returns the slot id, if any, given a term key.
|
49
|
+
// you can then look this up within the vals array yourself. returns
|
50
|
+
// h->n_buckets if the term is not in the hash.
|
51
|
+
uint32_t termhash_get(termhash *h, term t);
|
52
|
+
|
53
|
+
// public: get an int given a term. returns (uint32_t)-1 if the term is not in
|
54
|
+
// the hash.
|
55
|
+
uint32_t termhash_get_val(termhash* h, term t); // convenience
|
56
|
+
|
57
|
+
// private: khash-style setter: insert a term into the hash. see the code
|
58
|
+
// for details on what all the return values mean.
|
59
|
+
uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
|
60
|
+
|
61
|
+
// public: adds a term to the hash with the given value
|
62
|
+
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
|
63
|
+
|
64
|
+
// public: returns the byte size of the termhash
|
65
|
+
uint32_t termhash_size(termhash* h);
|
66
|
+
|
67
|
+
// public: returns the byte size for the next larger version of the termhash
|
68
|
+
uint32_t termhash_next_size(termhash* h);
|
69
|
+
|
70
|
+
// public: does the termhash need a size increase?
|
71
|
+
int termhash_needs_bump(termhash* h);
|
72
|
+
|
73
|
+
// public: increases the size of the termhash
|
74
|
+
wp_error* termhash_bump_size(termhash* h) RAISES_ERROR;
|
75
|
+
|
76
|
+
// public: returns the initial byte size for an empty termhash
|
77
|
+
uint32_t termhash_initial_size();
|
78
|
+
|
79
|
+
#endif
|