whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,294 @@
|
|
1
|
+
#include "whistlepig.h"
|
2
|
+
|
3
|
+
// number of entries in prime_list
static const int HASH_PRIME_SIZE = 32;

// bucket counts a termhash can have, in increasing order. a hash records its
// position in this table in n_buckets_idx.
static const uint32_t prime_list[] = {
  0u,          3u,          11u,         23u,
  53u,         97u,         193u,        389u,
  769u,        1543u,       3079u,       6151u,
  12289u,      24593u,      49157u,      98317u,
  196613u,     393241u,     786433u,     1572869u,
  3145739u,    6291469u,    12582917u,   25165843u,
  50331653u,   100663319u,  201326611u,  402653189u,
  805306457u,  1610612741u, 3221225473u, 4294967291u
};
|
14
|
+
|
15
|
+
// per-bucket state lives in a packed array of uint32_t, two bits per bucket
// (16 buckets per word): bit 1 of the pair means "empty", bit 0 means
// "deleted". every macro argument is parenthesized so that expressions such
// as `a | b` or `cond ? x : y` can be passed as the bucket index safely.
#define isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))
|
22
|
+
|
23
|
+
// maximum load factor: upper_bound is computed as n_buckets * HASH_UPPER
// (rounded to nearest), and termhash_needs_bump() reports true once
// n_occupied reaches that bound.
static const double HASH_UPPER = 0.77;
|
24
|
+
|
25
|
+
// hash of a term: the xor of its two ids.
static uint32_t hash_term(term t) {
  const uint32_t mixed = t.word_s ^ t.field_s;
  return mixed;
}
|
28
|
+
|
29
|
+
// returns 1 when both the word id and the field id of a and b match, else 0.
static int term_equals(term a, term b) {
  if (a.word_s != b.word_s) return 0;
  return a.field_s == b.field_s;
}
|
32
|
+
|
33
|
+
// initializes a brand-new, empty termhash in the memory at h.
//
// the starting bucket count is taken from INITIAL_N_BUCKETS_IDX, the same
// constant termhash_initial_size() uses, so the two cannot drift apart.
// NOTE(review): assumes the caller provides at least termhash_initial_size()
// bytes at h -- confirm against callers.
void termhash_init(termhash* h) {
  h->n_buckets_idx = INITIAL_N_BUCKETS_IDX;
  h->n_buckets = prime_list[h->n_buckets_idx];
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
  h->size = 0;
  h->n_occupied = 0;

  // point flags/keys/vals at their regions before touching the flags
  termhash_setup(h);

  // 0xaa is binary 10101010: every two-bit flag pair becomes "empty, not
  // deleted"
  memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
}
|
41
|
+
|
42
|
+
// byte distance from pointer b to pointer a, as a long. arguments and the
// whole expansion are parenthesized so pointer expressions such as `p + 1`
// are measured correctly.
#define OFFSET(a, b) ((long)((uint8_t*)(a) - (uint8_t*)(b)))
|
43
|
+
// set flags, keys and vals to correct locations based on h->n_buckets
|
44
|
+
void termhash_setup(termhash* h) {
|
45
|
+
DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
|
46
|
+
DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
|
47
|
+
h->flags = (uint32_t*)h->boundary;
|
48
|
+
h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
49
|
+
h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
|
50
|
+
DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
|
51
|
+
DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
|
52
|
+
DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
|
53
|
+
}
|
54
|
+
|
55
|
+
/*
|
56
|
+
static void termhash_dump(termhash* h) {
|
57
|
+
for(uint32_t i = 0; i < h->n_buckets; i++) {
|
58
|
+
if(isempty(h->flags, i)) printf("%u:\n", i);
|
59
|
+
else if(isdel(h->flags, i)) printf("%u: [deleted]", i);
|
60
|
+
else {
|
61
|
+
term t = h->keys[i];
|
62
|
+
printf("%u: (%u,%u)\n", i, t.field_s, t.word_s);
|
63
|
+
}
|
64
|
+
}
|
65
|
+
}
|
66
|
+
*/
|
67
|
+
|
68
|
+
/*
|
69
|
+
static void kh_destroy_##name(kh_##name##_t *h) {
|
70
|
+
if (h) {
|
71
|
+
free(h->keys); free(h->flags);
|
72
|
+
free(h->vals);
|
73
|
+
free(h);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
static void kh_clear_##name(kh_##name##_t *h) {
|
78
|
+
if (h && h->flags) {
|
79
|
+
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
80
|
+
h->size = h->n_occupied = 0;
|
81
|
+
}
|
82
|
+
}
|
83
|
+
*/
|
84
|
+
|
85
|
+
// khash-style lookup. returns the slot index holding key, or h->n_buckets if
// the key is not present (which is 0 when the hash has no buckets at all).
uint32_t termhash_get(termhash *h, term key) {
  const uint32_t nb = h->n_buckets;
  if (nb == 0) return 0;

  const uint32_t hash = hash_term(key);
  const uint32_t start = hash % nb;
  const uint32_t step = 1 + hash % (nb - 1);
  uint32_t slot = start;

  for (;;) {
    // an empty slot ends the probe sequence: the key is absent
    if (isempty(h->flags, slot)) break;
    // a live slot holding our key ends it too
    if (!isdel(h->flags, slot) && term_equals(h->keys[slot], key)) break;

    slot = (slot + step >= nb) ? slot + step - nb : slot + step;
    if (slot == start) return nb; // probed every slot without success
  }

  return iseither(h->flags, slot) ? nb : slot;
}
|
99
|
+
|
100
|
+
// grows the hash in place to the next bucket count in prime_list and rehashes
// every live entry into the enlarged table (the rehash loop follows khash.h).
//
// returns NO_ERROR on success. raises an error, leaving h untouched, when the
// hash is already at the last entry of prime_list or when the temporary copy
// of the old flags cannot be allocated.
//
// NOTE(review): the new keys/vals regions extend past the current end of the
// hash, so this presumably requires the caller to have already made
// termhash_next_size(h) bytes available at h -- confirm against callers.
wp_error* termhash_bump_size(termhash *h) {
  DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
  DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
  DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
  DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));

  // valid indexes into prime_list are 0 .. HASH_PRIME_SIZE - 1. check before
  // modifying anything so a failed bump leaves the hash usable.
  unsigned int next_idx = (unsigned int)h->n_buckets_idx + 1;
  if(next_idx >= (unsigned int)HASH_PRIME_SIZE) RAISE_ERROR("term hash is already at its maximum size");
  uint32_t new_n_buckets = prime_list[next_idx];

  // first make a backup of the old flags: the flag region is about to be
  // overwritten, but the old flags say which old slots hold live entries
  size_t oldflagsize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
  uint32_t* oldflags = malloc(oldflagsize);
  if(oldflags == NULL) RAISE_ERROR("out of memory while growing term hash");
  memcpy(oldflags, h->flags, oldflagsize);

  h->n_buckets_idx = (uint8_t)next_idx;

  // keep pointers to the old locations
  term* oldkeys = h->keys;
  uint32_t* oldvals = h->vals;

  // set pointers to the new locations
  h->keys = (term*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
  h->vals = (uint32_t*)((term*)h->keys + new_n_buckets);

  // move the vals and keys. vals go first: they move furthest, and their new
  // home lies beyond the old keys, so neither move clobbers unread data.
  memmove(h->vals, oldvals, h->n_buckets * sizeof(uint32_t));
  memmove(h->keys, oldkeys, h->n_buckets * sizeof(term));

  // mark every slot of the enlarged table as empty
  memset(h->flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));

  // rehash in place: take each live old entry out of its slot and reinsert it
  // at its new position; if that position still holds an unprocessed old
  // entry, swap it out and continue with the evicted entry.
  for (unsigned int j = 0; j != h->n_buckets; ++j) {
    if (iseither(oldflags, j) != 0) continue; // old slot j was empty or deleted

    term key = h->keys[j];
    uint32_t val = h->vals[j];
    set_isdel_true(oldflags, j); // old slot j has been picked up

    while (1) {
      uint32_t k = hash_term(key);
      uint32_t i = k % new_n_buckets;
      uint32_t inc = 1 + k % (new_n_buckets - 1);

      // probe for a slot that is free in the new table
      while (!isempty(h->flags, i)) {
        if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
        else i += inc;
      }
      set_isempty_false(h->flags, i);

      if (i < h->n_buckets && iseither(oldflags, i) == 0) {
        // slot i still holds a not-yet-rehashed old entry: kick it out
        { term tmp = h->keys[i]; h->keys[i] = key; key = tmp; }
        { uint32_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; }
        set_isdel_true(oldflags, i);
      } else {
        h->keys[i] = key;
        h->vals[i] = val;
        break;
      }
    }
  }

  free(oldflags);
  h->n_buckets = new_n_buckets;
  h->n_occupied = h->size; // deleted slots were dropped by the rehash
  h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);

  DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
  DEBUG("flags are at %p (+%ld)", h->flags, (long)((uint8_t*)h->flags - (uint8_t*)h->boundary));
  DEBUG(" keys are at %p (+%ld)", h->keys, (long)((uint8_t*)h->keys - (uint8_t*)h->boundary));
  DEBUG(" vals are at %p (+%ld)", h->vals, (long)((uint8_t*)h->vals - (uint8_t*)h->boundary));

  return NO_ERROR;
}
|
177
|
+
|
178
|
+
// khash-style insert. finds the slot for key and claims it if the key is not
// already stored. returns the slot index; *ret reports what happened:
//    1  key stored in a previously empty slot
//    2  key stored in a previously deleted slot
//    0  key was already present (nothing changed)
//   -1  every slot was probed without finding the key or a reusable slot;
//       the returned index is h->n_buckets and must not be used
uint32_t termhash_put(termhash *h, term key, int *ret) {
  const uint32_t nb = h->n_buckets;
  const uint32_t hash = hash_term(key);
  uint32_t probe = hash % nb;
  uint32_t tombstone = nb; // most recent deleted slot seen; nb if none
  uint32_t x = nb;         // chosen slot; nb while still undecided
#ifdef DEBUGOUTPUT
  int num_loops = 0;
#endif

  DEBUG("initial hash is %u", hash);

  if (isempty(h->flags, probe)) {
    x = probe;
  } else {
    const uint32_t step = 1 + hash % (nb - 1);
    const uint32_t start = probe;

    // walk the probe sequence until an empty slot or a live copy of key
    while (!isempty(h->flags, probe) && (isdel(h->flags, probe) || !term_equals(h->keys[probe], key))) {
#ifdef DEBUGOUTPUT
      num_loops++;
#endif
      if (isdel(h->flags, probe)) tombstone = probe;
      probe = (probe + step >= nb) ? probe + step - nb : probe + step;
      if (probe == start) { x = tombstone; break; } // wrapped all the way round
    }

    // back at the start with no slot chosen and no match there: out of space
    if (x == nb && probe == start && !term_equals(h->keys[probe], key)) {
      *ret = -1;
      return x;
    }

    // didn't find it on the first try: prefer reusing a deleted slot when the
    // walk ended on an empty one
    if (x == nb) {
      x = (isempty(h->flags, probe) && tombstone != nb) ? tombstone : probe;
    }
  }
  DEBUG("looped %u times to put", num_loops);

  // slot is live, so the key is already stored there
  if (!iseither(h->flags, x)) {
    *ret = 0;
    return x;
  }

  // sample the state before the flags are cleared below
  const int was_empty = isempty(h->flags, x) != 0;
  h->keys[x] = key;
  set_isboth_false(h->flags, x);
  ++h->size;
  if (was_empty) {
    ++h->n_occupied;
    *ret = 1;
  } else {
    *ret = 2;
  }

  return x;
}
|
234
|
+
|
235
|
+
// marks slot x as deleted and decrements the entry count. does nothing when x
// is h->n_buckets (the "not found" index) or when the slot is already empty
// or deleted.
void termhash_del(termhash *h, uint32_t x) {
  if (x == h->n_buckets) return;
  if (iseither(h->flags, x)) return;

  set_isdel_true(h->flags, x);
  --h->size;
}
|
241
|
+
|
242
|
+
// returns the value stored for term t, or (uint32_t)-1 if t is not in the
// hash.
uint32_t termhash_get_val(termhash* h, term t) {
  const uint32_t slot = termhash_get(h, t);
  return (slot == h->n_buckets) ? (uint32_t)-1 : h->vals[slot];
}
|
247
|
+
|
248
|
+
// stores val under term t, inserting t if needed and overwriting the value if
// t is already present. raises an error when termhash_put reports that the
// hash is out of space.
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
  int put_status;
  const uint32_t slot = termhash_put(h, t, &put_status);

  DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, put_status, slot, h->n_buckets);
  if(put_status == -1) RAISE_ERROR("out of space in hash");

  h->vals[slot] = val;
  return NO_ERROR;
}
|
256
|
+
|
257
|
+
// returns 1 once the number of occupied slots has reached upper_bound, else 0.
int termhash_needs_bump(termhash* h) {
  if (h->n_occupied < h->upper_bound) return 0;
  return 1;
}
|
260
|
+
|
261
|
+
// returns the total size in bytes
|
262
|
+
// memory layout: termhash, then:
|
263
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
264
|
+
// n_buckets terms for the keys
|
265
|
+
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
266
|
+
static uint32_t size(uint32_t n_buckets) {
|
267
|
+
uint32_t size = sizeof(termhash) +
|
268
|
+
(((n_buckets >> 4) + 1) * sizeof(uint32_t)) +
|
269
|
+
(n_buckets * sizeof(term)) +
|
270
|
+
(n_buckets * sizeof(uint32_t));
|
271
|
+
|
272
|
+
DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
|
273
|
+
n_buckets,
|
274
|
+
(long)sizeof(termhash),
|
275
|
+
(long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
|
276
|
+
(long)(n_buckets * sizeof(term)),
|
277
|
+
(long)(n_buckets * sizeof(uint32_t)),
|
278
|
+
size);
|
279
|
+
|
280
|
+
return size;
|
281
|
+
}
|
282
|
+
|
283
|
+
// returns the byte size of h at its current bucket count.
uint32_t termhash_size(termhash* h) {
  const uint32_t bucket_count = h->n_buckets;
  return size(bucket_count);
}
|
286
|
+
|
287
|
+
uint32_t termhash_initial_size() {
|
288
|
+
return size(prime_list[INITIAL_N_BUCKETS_IDX]);
|
289
|
+
}
|
290
|
+
|
291
|
+
// returns the byte size h would have at the next bucket count in prime_list.
// when h is already at the last entry, this is the size at the current entry.
uint32_t termhash_next_size(termhash* h) {
  int next_idx = h->n_buckets_idx;
  if (next_idx < HASH_PRIME_SIZE - 1) next_idx++;
  return size(prime_list[next_idx]);
}
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#ifndef WP_TERMHASH_H_
|
2
|
+
#define WP_TERMHASH_H_
|
3
|
+
|
4
|
+
// whistlepig term hash
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// based on a heavily modified khash.h
|
8
|
+
//
|
9
|
+
// a term, in this file, is a pair of ints. the intention is that these are the
|
10
|
+
// results of adding strings to the stringmap. the termhash is then a map from
|
11
|
+
// such pairs to ints.
|
12
|
+
//
|
13
|
+
// like stringmap and stringpool, it uses a slightly funny API that never
|
14
|
+
// allocates memory, but instead operates on pointers to preallocated blocks of
|
15
|
+
// memory.
|
16
|
+
|
17
|
+
#include <stdint.h>
|
18
|
+
#include "error.h"
|
19
|
+
|
20
|
+
// a term: a (field, word) pair of ints, used as the key type of the termhash.
typedef struct term {
  uint32_t field_s; // id of the field
  uint32_t word_s;  // id of the word
} term;
|
24
|
+
|
25
|
+
// index into the table of prime bucket counts that termhash_initial_size()
// uses to compute the size of a brand-new termhash.
#define INITIAL_N_BUCKETS_IDX 1
|
26
|
+
|
27
|
+
typedef struct termhash {
|
28
|
+
uint8_t n_buckets_idx;
|
29
|
+
uint32_t n_buckets, size, n_occupied, upper_bound;
|
30
|
+
uint32_t *flags;
|
31
|
+
term *keys;
|
32
|
+
uint32_t *vals;
|
33
|
+
uint8_t boundary[];
|
34
|
+
// in memory at this point
|
35
|
+
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
36
|
+
// n_buckets terms for the keys
|
37
|
+
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
38
|
+
} termhash;
|
39
|
+
|
40
|
+
// API methods
|
41
|
+
|
42
|
+
// public: make a new termhash
|
43
|
+
void termhash_init(termhash* h); // makes a new one
|
44
|
+
|
45
|
+
// public: set up an existing termhash
|
46
|
+
void termhash_setup(termhash* h); // inits one from disk
|
47
|
+
|
48
|
+
// private: khash-style getter: returns the slot id, if any, given a term key.
|
49
|
+
// you can then look this up within the vals array yourself. returns
|
50
|
+
// h->n_buckets if the term is not in the hash.
|
51
|
+
uint32_t termhash_get(termhash *h, term t);
|
52
|
+
|
53
|
+
// public: get an int given a term. returns (uint32_t)-1 if the term is not in
|
54
|
+
// the hash.
|
55
|
+
uint32_t termhash_get_val(termhash* h, term t); // convenience
|
56
|
+
|
57
|
+
// private: khash-style setter: insert a term into the hash. see the code
|
58
|
+
// for details on what all the return values mean.
|
59
|
+
uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
|
60
|
+
|
61
|
+
// public: adds a term to the hash with the given value
|
62
|
+
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
|
63
|
+
|
64
|
+
// public: returns the byte size of the termhash
|
65
|
+
uint32_t termhash_size(termhash* h);
|
66
|
+
|
67
|
+
// public: returns the byte size for the next larger version of the termhash
|
68
|
+
uint32_t termhash_next_size(termhash* h);
|
69
|
+
|
70
|
+
// public: does the termhash need a size increase?
|
71
|
+
int termhash_needs_bump(termhash* h);
|
72
|
+
|
73
|
+
// public: increases the size of the termhash
|
74
|
+
wp_error* termhash_bump_size(termhash* h) RAISES_ERROR;
|
75
|
+
|
76
|
+
// public: returns the initial byte size for an empty termhash
|
77
|
+
uint32_t termhash_initial_size();
|
78
|
+
|
79
|
+
#endif
|