whistlepig 0.9.1 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/README +40 -12
- data/ext/whistlepig/extconf.rb +1 -1
- data/ext/whistlepig/index.c +201 -62
- data/ext/whistlepig/index.h +11 -2
- data/ext/whistlepig/lock.c +153 -0
- data/ext/whistlepig/lock.h +18 -0
- data/ext/whistlepig/mmap-obj.c +36 -20
- data/ext/whistlepig/mmap-obj.h +12 -7
- data/ext/whistlepig/search.c +7 -6
- data/ext/whistlepig/segment.c +97 -47
- data/ext/whistlepig/segment.h +19 -3
- data/ext/whistlepig/stringmap.c +61 -56
- data/ext/whistlepig/stringmap.h +7 -14
- data/ext/whistlepig/termhash.c +60 -62
- data/ext/whistlepig/termhash.h +4 -6
- data/ext/whistlepig/whistlepig.c +5 -1
- data/ext/whistlepig/whistlepig.h +1 -0
- metadata +29 -38
- data/ext/whistlepig/dump.c +0 -65
- data/ext/whistlepig/extconf.h +0 -3
- data/ext/whistlepig/test-segment.c +0 -404
- data/ext/whistlepig/test-stringmap.c +0 -82
- data/ext/whistlepig/test-stringpool.c +0 -67
- data/ext/whistlepig/test-termhash.c +0 -95
- data/ext/whistlepig/test-tokenizer.c +0 -55
- data/ext/whistlepig/test.h +0 -38
- data/ext/whistlepig/timer.h +0 -28
data/ext/whistlepig/stringmap.c
CHANGED
@@ -33,20 +33,15 @@ static inline int string_equals(const char* a, const char* b) {
|
|
33
33
|
return strcmp(a, b) == 0;
|
34
34
|
}
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
h->pool = p;
|
39
|
-
h->flags = (uint32_t*)h->boundary;
|
40
|
-
h->keys = (uint32_t*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
41
|
-
}
|
36
|
+
#define STRINGMAP_FLAGS(h) ((uint32_t*)(h)->boundary)
|
37
|
+
#define STRINGMAP_KEYS(h) ((uint32_t*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
42
38
|
|
43
|
-
void stringmap_init(stringmap* h
|
39
|
+
void stringmap_init(stringmap* h) {
|
44
40
|
h->n_buckets_idx = INITIAL_N_BUCKETS_IDX;
|
45
41
|
h->n_buckets = prime_list[h->n_buckets_idx];
|
46
42
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
47
43
|
h->size = h->n_occupied = 0;
|
48
|
-
|
49
|
-
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
44
|
+
memset(STRINGMAP_FLAGS(h), 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
50
45
|
}
|
51
46
|
|
52
47
|
/*
|
@@ -66,22 +61,25 @@ static void kh_clear_##name(kh_##name##_t *h) {
|
|
66
61
|
}
|
67
62
|
*/
|
68
63
|
|
69
|
-
uint32_t stringmap_get(stringmap *h, const char* key) {
|
64
|
+
uint32_t stringmap_get(stringmap *h, stringpool* pool, const char* key) {
|
65
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
66
|
+
uint32_t* keys = STRINGMAP_KEYS(h);
|
67
|
+
|
70
68
|
if(h->n_buckets) {
|
71
69
|
uint32_t inc, k, i, last;
|
72
70
|
k = string_hash(key); i = k % h->n_buckets;
|
73
71
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
74
|
-
while (!isempty(
|
72
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !string_equals(stringpool_lookup(pool, keys[i]), key))) {
|
75
73
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
76
74
|
else i += inc;
|
77
75
|
if (i == last) return h->n_buckets;
|
78
76
|
}
|
79
|
-
return iseither(
|
77
|
+
return iseither(flags, i)? h->n_buckets : i;
|
80
78
|
}
|
81
79
|
else return 0;
|
82
80
|
}
|
83
81
|
|
84
|
-
wp_error* stringmap_bump_size(stringmap *h) {
|
82
|
+
wp_error* stringmap_bump_size(stringmap *h, stringpool* pool) {
|
85
83
|
DEBUG("bumping size for string hash at %p with size %u and boundary %p", h, stringmap_size(h), h->boundary);
|
86
84
|
|
87
85
|
if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("stringmap can't be this big");
|
@@ -89,51 +87,54 @@ wp_error* stringmap_bump_size(stringmap *h) {
|
|
89
87
|
h->n_buckets_idx++;
|
90
88
|
uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
|
91
89
|
|
92
|
-
//
|
93
|
-
|
94
|
-
uint32_t* oldflags =
|
95
|
-
memcpy(oldflags, h->flags, oldflagsize);
|
90
|
+
// get pointers to the old locations
|
91
|
+
uint32_t* oldkeys = STRINGMAP_KEYS(h);
|
92
|
+
uint32_t* oldflags = STRINGMAP_FLAGS(h);
|
96
93
|
|
97
|
-
//
|
98
|
-
|
94
|
+
// make a backup of the old flags in a separate memory region
|
95
|
+
size_t flagbaksize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
|
96
|
+
uint32_t* flagbaks = malloc(flagbaksize);
|
97
|
+
memcpy(flagbaks, oldflags, flagbaksize);
|
99
98
|
|
100
|
-
//
|
101
|
-
h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
99
|
+
// get a pointer pointers to the new locations
|
100
|
+
//h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
101
|
+
uint32_t* newflags = (uint32_t*)h->boundary; // unchanged, actually
|
102
|
+
uint32_t* newkeys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
102
103
|
|
103
104
|
// move the keys
|
104
|
-
memmove(
|
105
|
+
memmove(newkeys, oldkeys, h->n_buckets * sizeof(uint32_t));
|
105
106
|
|
106
107
|
// clear the new flags
|
107
|
-
memset(h
|
108
|
+
memset(STRINGMAP_FLAGS(h), 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
|
108
109
|
|
109
110
|
// do the complicated stuff from khash.h
|
110
111
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
111
|
-
if (iseither(
|
112
|
-
uint32_t key =
|
113
|
-
set_isdel_true(
|
112
|
+
if (iseither(flagbaks, j) == 0) {
|
113
|
+
uint32_t key = newkeys[j];
|
114
|
+
set_isdel_true(flagbaks, j);
|
114
115
|
while (1) {
|
115
116
|
uint32_t inc, k, i;
|
116
|
-
k = string_hash(stringpool_lookup(
|
117
|
+
k = string_hash(stringpool_lookup(pool, key));
|
117
118
|
i = k % new_n_buckets;
|
118
119
|
inc = 1 + k % (new_n_buckets - 1);
|
119
|
-
while (!isempty(
|
120
|
+
while (!isempty(newflags, i)) {
|
120
121
|
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
|
121
122
|
else i += inc;
|
122
123
|
}
|
123
|
-
set_isempty_false(
|
124
|
-
if (i < h->n_buckets && iseither(
|
125
|
-
{ uint32_t tmp =
|
126
|
-
set_isdel_true(
|
124
|
+
set_isempty_false(newflags, i);
|
125
|
+
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
126
|
+
{ uint32_t tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
127
|
+
set_isdel_true(flagbaks, i);
|
127
128
|
} else {
|
128
|
-
|
129
|
+
newkeys[i] = key;
|
129
130
|
break;
|
130
131
|
}
|
131
132
|
}
|
132
133
|
}
|
133
134
|
}
|
134
135
|
|
135
|
-
free(
|
136
|
-
h->n_buckets = new_n_buckets;
|
136
|
+
free(flagbaks);
|
137
|
+
h->n_buckets = new_n_buckets; // STRINGMAP_KEYS now works
|
137
138
|
h->n_occupied = h->size;
|
138
139
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
139
140
|
|
@@ -144,8 +145,10 @@ wp_error* stringmap_bump_size(stringmap *h) {
|
|
144
145
|
return NO_ERROR;
|
145
146
|
}
|
146
147
|
|
147
|
-
uint32_t stringmap_put(stringmap *h, const char* key, int *ret) {
|
148
|
+
uint32_t stringmap_put(stringmap *h, stringpool* pool, const char* key, int *ret) {
|
148
149
|
uint32_t x;
|
150
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
151
|
+
uint32_t* keys = STRINGMAP_KEYS(h);
|
149
152
|
|
150
153
|
{
|
151
154
|
#ifdef DEBUGOUTPUT
|
@@ -154,27 +157,27 @@ int num_loops = 0;
|
|
154
157
|
uint32_t inc, k, i, site, last;
|
155
158
|
x = site = h->n_buckets; k = string_hash(key); i = k % h->n_buckets;
|
156
159
|
//DEBUG("asked to hash '%s'. initial hash is %u => %u and n_occupied is %u", key, k, i, h->n_occupied);
|
157
|
-
if (isempty(
|
160
|
+
if (isempty(flags, i)) x = i;
|
158
161
|
else {
|
159
162
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
160
|
-
while (!isempty(
|
163
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !string_equals(stringpool_lookup(pool, keys[i]), key))) {
|
161
164
|
#ifdef DEBUGOUTPUT
|
162
165
|
num_loops++;
|
163
166
|
#endif
|
164
|
-
if (isdel(
|
167
|
+
if (isdel(flags, i)) site = i;
|
165
168
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
166
169
|
else i += inc;
|
167
170
|
if (i == last) { x = site; break; }
|
168
171
|
}
|
169
172
|
if ((x == h->n_buckets) && (i == last)) { // out of space
|
170
|
-
if(!string_equals(stringpool_lookup(
|
173
|
+
if(!string_equals(stringpool_lookup(pool, keys[i]), key)) {
|
171
174
|
DEBUG("out of space!");
|
172
175
|
*ret = -1;
|
173
176
|
return x;
|
174
177
|
}
|
175
178
|
}
|
176
179
|
if (x == h->n_buckets) { // didn't find it on the first try
|
177
|
-
if (isempty(
|
180
|
+
if (isempty(flags, i) && site != h->n_buckets) x = site;
|
178
181
|
else x = i;
|
179
182
|
}
|
180
183
|
}
|
@@ -185,15 +188,15 @@ num_loops++;
|
|
185
188
|
//DEBUG("for pos %u, isempty? %d and isdel %d", x, isempty(h->flags, x), isdel(h->flags, x));
|
186
189
|
|
187
190
|
uint32_t idx;
|
188
|
-
if(isempty(
|
189
|
-
idx = stringpool_add(
|
191
|
+
if(isempty(flags, x) || isdel(flags, x)) {
|
192
|
+
idx = stringpool_add(pool, key);
|
190
193
|
if(idx == (uint32_t)-1) {
|
191
194
|
*ret = -2;
|
192
195
|
return x;
|
193
196
|
}
|
194
|
-
if (isempty(
|
195
|
-
|
196
|
-
set_isboth_false(
|
197
|
+
if (isempty(flags, x)) ++h->n_occupied;
|
198
|
+
keys[x] = idx;
|
199
|
+
set_isboth_false(flags, x);
|
197
200
|
++h->size;
|
198
201
|
*ret = 1;
|
199
202
|
}
|
@@ -203,8 +206,9 @@ num_loops++;
|
|
203
206
|
}
|
204
207
|
|
205
208
|
void stringmap_del(stringmap *h, uint32_t x) {
|
206
|
-
|
207
|
-
|
209
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
210
|
+
if (x != h->n_buckets && !iseither(flags, x)) {
|
211
|
+
set_isdel_true(flags, x);
|
208
212
|
--h->size;
|
209
213
|
}
|
210
214
|
}
|
@@ -255,24 +259,25 @@ uint32_t stringmap_next_size(stringmap* h) {
|
|
255
259
|
return size(prime_list[next_idx]);
|
256
260
|
}
|
257
261
|
|
258
|
-
const char* stringmap_int_to_string(stringmap* h, uint32_t i) {
|
259
|
-
|
262
|
+
const char* stringmap_int_to_string(stringmap* h, stringpool* p, uint32_t i) {
|
263
|
+
(void)h;
|
264
|
+
return stringpool_lookup(p, i);
|
260
265
|
}
|
261
266
|
|
262
267
|
// returns -1 if not found
|
263
|
-
uint32_t stringmap_string_to_int(stringmap* h, const char* s) {
|
264
|
-
uint32_t idx = stringmap_get(h, s);
|
268
|
+
uint32_t stringmap_string_to_int(stringmap* h, stringpool* pool, const char* s) {
|
269
|
+
uint32_t idx = stringmap_get(h, pool, s);
|
265
270
|
if(idx == h->n_buckets) return (uint32_t)-1; // not there
|
266
|
-
return h
|
271
|
+
return STRINGMAP_KEYS(h)[idx];
|
267
272
|
}
|
268
273
|
|
269
|
-
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) {
|
274
|
+
wp_error* stringmap_add(stringmap *h, stringpool* pool, const char* s, uint32_t* id) {
|
270
275
|
int status;
|
271
|
-
uint32_t idx = stringmap_put(h, s, &status);
|
276
|
+
uint32_t idx = stringmap_put(h, pool, s, &status);
|
272
277
|
if(status == -1) RAISE_ERROR("out of space in hash put");
|
273
278
|
if(status == -2) RAISE_ERROR("out of space in pool put");
|
274
279
|
|
275
|
-
*id = h
|
280
|
+
*id = STRINGMAP_KEYS(h)[idx];
|
276
281
|
|
277
282
|
return NO_ERROR;
|
278
283
|
}
|
data/ext/whistlepig/stringmap.h
CHANGED
@@ -10,9 +10,8 @@
|
|
10
10
|
// and stringpool, it uses a slightly funny API that never allocates memory,
|
11
11
|
// but instead operates on pointers to preallocated blocks of memory.
|
12
12
|
//
|
13
|
-
// uses a stringpool internally to do the int->string mapping.
|
14
|
-
//
|
15
|
-
// use this object.
|
13
|
+
// uses a stringpool internally to do the int->string mapping. you shouldn't
|
14
|
+
// have to interact with the stringpool directly; you can just use this object.
|
16
15
|
//
|
17
16
|
// like termhash and pool, has a slightly funny API that is designed to work on
|
18
17
|
// a pre-allocated chunk of memory rather than allocate any of its own.
|
@@ -36,9 +35,6 @@
|
|
36
35
|
typedef struct stringmap {
|
37
36
|
uint8_t n_buckets_idx;
|
38
37
|
uint32_t n_buckets, size, n_occupied, upper_bound;
|
39
|
-
uint32_t *flags;
|
40
|
-
uint32_t *keys;
|
41
|
-
stringpool* pool;
|
42
38
|
uint8_t boundary[];
|
43
39
|
// in memory at this point
|
44
40
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
@@ -48,21 +44,18 @@ typedef struct stringmap {
|
|
48
44
|
// API methods
|
49
45
|
|
50
46
|
// public: write a new stringmap to memory
|
51
|
-
void stringmap_init(stringmap* h
|
52
|
-
|
53
|
-
// public: set up an existing stringmap in memory
|
54
|
-
void stringmap_setup(stringmap* h, stringpool* p);
|
47
|
+
void stringmap_init(stringmap* h);
|
55
48
|
|
56
49
|
// public: add a string. sets id to its id. dupes are fine; will just set the
|
57
50
|
// id correctly.
|
58
|
-
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) RAISES_ERROR;
|
51
|
+
wp_error* stringmap_add(stringmap *h, stringpool* p, const char* s, uint32_t* id) RAISES_ERROR;
|
59
52
|
|
60
53
|
// public: get the int value given a string. returns (uint32_t)-1 if not found.
|
61
|
-
uint32_t stringmap_string_to_int(stringmap* h, const char* s);
|
54
|
+
uint32_t stringmap_string_to_int(stringmap* h, stringpool* pool, const char* s);
|
62
55
|
|
63
56
|
// public: get the string value given an int. returns corrupt data if the int
|
64
57
|
// is invalid.
|
65
|
-
const char* stringmap_int_to_string(stringmap* h, uint32_t i);
|
58
|
+
const char* stringmap_int_to_string(stringmap* h, stringpool* p, uint32_t i);
|
66
59
|
|
67
60
|
// public: returns the byte size of the stringmap
|
68
61
|
uint32_t stringmap_size(stringmap* h);
|
@@ -77,6 +70,6 @@ uint32_t stringmap_next_size(stringmap* h);
|
|
77
70
|
int stringmap_needs_bump(stringmap* h);
|
78
71
|
|
79
72
|
// public: increases the size of the stringmap
|
80
|
-
wp_error* stringmap_bump_size(stringmap *h) RAISES_ERROR;
|
73
|
+
wp_error* stringmap_bump_size(stringmap *h, stringpool* pool) RAISES_ERROR;
|
81
74
|
|
82
75
|
#endif
|
data/ext/whistlepig/termhash.c
CHANGED
@@ -35,23 +35,11 @@ void termhash_init(termhash* h) {
|
|
35
35
|
h->n_buckets = prime_list[h->n_buckets_idx];
|
36
36
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
37
37
|
h->size = h->n_occupied = 0;
|
38
|
-
|
39
|
-
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
38
|
+
memset(TERMHASH_FLAGS(h), 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
40
39
|
}
|
41
40
|
|
42
41
|
#define OFFSET(a, b) (long)((uint8_t*)a - (uint8_t*)b)
|
43
42
|
// set flags, keys and vals to correct locations based on h->n_buckets
|
44
|
-
void termhash_setup(termhash* h) {
|
45
|
-
DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
|
46
|
-
DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
|
47
|
-
h->flags = (uint32_t*)h->boundary;
|
48
|
-
h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
49
|
-
h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
|
50
|
-
DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
|
51
|
-
DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
|
52
|
-
DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
|
53
|
-
}
|
54
|
-
|
55
43
|
/*
|
56
44
|
static void termhash_dump(termhash* h) {
|
57
45
|
for(uint32_t i = 0; i < h->n_buckets; i++) {
|
@@ -83,89 +71,94 @@ static void kh_clear_##name(kh_##name##_t *h) {
|
|
83
71
|
*/
|
84
72
|
|
85
73
|
uint32_t termhash_get(termhash *h, term key) {
|
74
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
75
|
+
term* keys = TERMHASH_KEYS(h);
|
76
|
+
|
86
77
|
if(h->n_buckets) {
|
87
78
|
uint32_t inc, k, i, last;
|
88
79
|
k = hash_term(key); i = k % h->n_buckets;
|
89
80
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
90
|
-
while (!isempty(
|
81
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !term_equals(keys[i], key))) {
|
91
82
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
92
83
|
else i += inc;
|
93
84
|
if (i == last) return h->n_buckets;
|
94
85
|
}
|
95
|
-
return iseither(
|
86
|
+
return iseither(flags, i)? h->n_buckets : i;
|
96
87
|
}
|
97
88
|
else return 0;
|
98
89
|
}
|
99
90
|
|
100
91
|
wp_error* termhash_bump_size(termhash *h) {
|
101
92
|
DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
|
102
|
-
DEBUG("flags are at %p (+%ld)", h
|
103
|
-
DEBUG(" keys are at %p (+%ld)", h
|
104
|
-
DEBUG(" vals are at %p (+%ld)", h
|
93
|
+
DEBUG("flags are at %p (+%ld)", TERMHASH_FLAGS(h), OFFSET(TERMHASH_FLAGS(h), h->boundary));
|
94
|
+
DEBUG(" keys are at %p (+%ld)", TERMHASH_KEYS(h), OFFSET(TERMHASH_KEYS(h), h->boundary));
|
95
|
+
DEBUG(" vals are at %p (+%ld)", TERMHASH_VALS(h), OFFSET(TERMHASH_VALS(h), h->boundary));
|
96
|
+
|
97
|
+
if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("termhash can't be this big");
|
105
98
|
|
106
99
|
h->n_buckets_idx++;
|
107
|
-
if(h->n_buckets_idx > HASH_PRIME_SIZE) exit(1); // die horribly TODO fixme
|
108
100
|
uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
|
109
101
|
|
110
|
-
// first make a backup of the
|
111
|
-
size_t
|
112
|
-
uint32_t*
|
113
|
-
memcpy(
|
102
|
+
// first make a backup of the old flags in a separate memory region
|
103
|
+
size_t flagbaksize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
|
104
|
+
uint32_t* flagbaks = malloc(flagbaksize);
|
105
|
+
memcpy(flagbaks, TERMHASH_FLAGS(h), flagbaksize);
|
114
106
|
|
115
|
-
//
|
116
|
-
term* oldkeys = h
|
117
|
-
uint32_t* oldvals = h
|
107
|
+
// get pointers to the old locations
|
108
|
+
term* oldkeys = TERMHASH_KEYS(h);
|
109
|
+
uint32_t* oldvals = TERMHASH_VALS(h);
|
118
110
|
|
119
111
|
// set pointers to the new locations
|
120
|
-
|
121
|
-
|
112
|
+
uint32_t* newflags = (uint32_t*)h->boundary;
|
113
|
+
term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
|
114
|
+
uint32_t* newvals = (uint32_t*)(newkeys + new_n_buckets);
|
122
115
|
|
123
116
|
// move the vals and keys
|
124
|
-
memmove(
|
125
|
-
memmove(
|
117
|
+
memmove(newvals, oldvals, h->n_buckets * sizeof(uint32_t));
|
118
|
+
memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
|
126
119
|
|
127
120
|
// clear the new flags
|
128
|
-
memset(
|
121
|
+
memset(newflags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
|
129
122
|
|
130
123
|
// do the complicated stuff from khash.h
|
131
124
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
132
|
-
if (iseither(
|
133
|
-
term key =
|
125
|
+
if (iseither(flagbaks, j) == 0) {
|
126
|
+
term key = newkeys[j];
|
134
127
|
uint32_t val;
|
135
|
-
val =
|
136
|
-
set_isdel_true(
|
128
|
+
val = newvals[j];
|
129
|
+
set_isdel_true(flagbaks, j);
|
137
130
|
while (1) {
|
138
131
|
uint32_t inc, k, i;
|
139
132
|
k = hash_term(key);
|
140
133
|
i = k % new_n_buckets;
|
141
134
|
inc = 1 + k % (new_n_buckets - 1);
|
142
|
-
while (!isempty(
|
135
|
+
while (!isempty(newflags, i)) {
|
143
136
|
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
|
144
137
|
else i += inc;
|
145
138
|
}
|
146
|
-
set_isempty_false(
|
147
|
-
if (i < h->n_buckets && iseither(
|
148
|
-
{ term tmp =
|
149
|
-
{ uint32_t tmp =
|
150
|
-
set_isdel_true(
|
139
|
+
set_isempty_false(newflags, i);
|
140
|
+
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
141
|
+
{ term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
142
|
+
{ uint32_t tmp = newvals[i]; newvals[i] = val; val = tmp; }
|
143
|
+
set_isdel_true(flagbaks, i);
|
151
144
|
} else {
|
152
|
-
|
153
|
-
|
145
|
+
newkeys[i] = key;
|
146
|
+
newvals[i] = val;
|
154
147
|
break;
|
155
148
|
}
|
156
149
|
}
|
157
150
|
}
|
158
151
|
}
|
159
152
|
|
160
|
-
free(
|
153
|
+
free(flagbaks);
|
161
154
|
h->n_buckets = new_n_buckets;
|
162
155
|
h->n_occupied = h->size;
|
163
156
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
164
157
|
|
165
158
|
DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
|
166
|
-
DEBUG("flags are at %p (+%ld)", h
|
167
|
-
DEBUG(" keys are at %p (+%ld)", h
|
168
|
-
DEBUG(" vals are at %p (+%ld)", h
|
159
|
+
DEBUG("flags are at %p (+%ld)", TERMHASH_FLAGS(h), OFFSET(TERMHASH_FLAGS(h), h->boundary));
|
160
|
+
DEBUG(" keys are at %p (+%ld)", TERMHASH_KEYS(h), OFFSET(TERMHASH_KEYS(h), h->boundary));
|
161
|
+
DEBUG(" vals are at %p (+%ld)", TERMHASH_VALS(h), OFFSET(TERMHASH_VALS(h), h->boundary));
|
169
162
|
|
170
163
|
#ifdef DEBUGOUTPUT
|
171
164
|
//DEBUG("and now i look like this:");
|
@@ -177,6 +170,8 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
177
170
|
|
178
171
|
uint32_t termhash_put(termhash *h, term key, int *ret) {
|
179
172
|
uint32_t x;
|
173
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
174
|
+
term* keys = TERMHASH_KEYS(h);
|
180
175
|
|
181
176
|
{
|
182
177
|
#ifdef DEBUGOUTPUT
|
@@ -185,40 +180,40 @@ int num_loops = 0;
|
|
185
180
|
uint32_t inc, k, i, site, last;
|
186
181
|
x = site = h->n_buckets; k = hash_term(key); i = k % h->n_buckets;
|
187
182
|
DEBUG("initial hash is %u", k);
|
188
|
-
if (isempty(
|
183
|
+
if (isempty(flags, i)) x = i;
|
189
184
|
else {
|
190
185
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
191
|
-
while (!isempty(
|
186
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !term_equals(keys[i], key))) {
|
192
187
|
#ifdef DEBUGOUTPUT
|
193
188
|
num_loops++;
|
194
189
|
#endif
|
195
|
-
if (isdel(
|
190
|
+
if (isdel(flags, i)) site = i;
|
196
191
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
197
192
|
else i += inc;
|
198
193
|
if (i == last) { x = site; break; }
|
199
194
|
}
|
200
195
|
if ((x == h->n_buckets) && (i == last)) { // out of space
|
201
|
-
if(!term_equals(
|
196
|
+
if(!term_equals(keys[i], key)) {
|
202
197
|
*ret = -1;
|
203
198
|
return x;
|
204
199
|
}
|
205
200
|
}
|
206
201
|
if (x == h->n_buckets) { // didn't find it on the first try
|
207
|
-
if (isempty(
|
202
|
+
if (isempty(flags, i) && site != h->n_buckets) x = site;
|
208
203
|
else x = i;
|
209
204
|
}
|
210
205
|
}
|
211
206
|
DEBUG("looped %u times to put", num_loops);
|
212
207
|
//DEBUG("x is %u, site is %u, n_buckets is %u", x, site, h->n_buckets);
|
213
208
|
}
|
214
|
-
if (isempty(
|
215
|
-
|
216
|
-
set_isboth_false(
|
209
|
+
if (isempty(flags, x)) {
|
210
|
+
keys[x] = key;
|
211
|
+
set_isboth_false(flags, x);
|
217
212
|
++h->size; ++h->n_occupied;
|
218
213
|
*ret = 1;
|
219
|
-
} else if (isdel(
|
220
|
-
|
221
|
-
set_isboth_false(
|
214
|
+
} else if (isdel(flags, x)) {
|
215
|
+
keys[x] = key;
|
216
|
+
set_isboth_false(flags, x);
|
222
217
|
++h->size;
|
223
218
|
*ret = 2;
|
224
219
|
}
|
@@ -233,24 +228,27 @@ num_loops++;
|
|
233
228
|
}
|
234
229
|
|
235
230
|
void termhash_del(termhash *h, uint32_t x) {
|
236
|
-
|
237
|
-
|
231
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
232
|
+
if (x != h->n_buckets && !iseither(flags, x)) {
|
233
|
+
set_isdel_true(flags, x);
|
238
234
|
--h->size;
|
239
235
|
}
|
240
236
|
}
|
241
237
|
|
242
238
|
uint32_t termhash_get_val(termhash* h, term t) {
|
239
|
+
uint32_t* vals = TERMHASH_VALS(h);
|
243
240
|
uint32_t idx = termhash_get(h, t);
|
244
241
|
if(idx == h->n_buckets) return (uint32_t)-1;
|
245
|
-
return
|
242
|
+
return vals[idx];
|
246
243
|
}
|
247
244
|
|
248
245
|
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
|
249
246
|
int status;
|
247
|
+
uint32_t* vals = TERMHASH_VALS(h);
|
250
248
|
uint32_t loc = termhash_put(h, t, &status);
|
251
249
|
DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
|
252
250
|
if(status == -1) RAISE_ERROR("out of space in hash");
|
253
|
-
|
251
|
+
vals[loc] = val;
|
254
252
|
return NO_ERROR;
|
255
253
|
}
|
256
254
|
|
data/ext/whistlepig/termhash.h
CHANGED
@@ -27,9 +27,6 @@ typedef struct term {
|
|
27
27
|
typedef struct termhash {
|
28
28
|
uint8_t n_buckets_idx;
|
29
29
|
uint32_t n_buckets, size, n_occupied, upper_bound;
|
30
|
-
uint32_t *flags;
|
31
|
-
term *keys;
|
32
|
-
uint32_t *vals;
|
33
30
|
uint8_t boundary[];
|
34
31
|
// in memory at this point
|
35
32
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
@@ -37,14 +34,15 @@ typedef struct termhash {
|
|
37
34
|
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
38
35
|
} termhash;
|
39
36
|
|
37
|
+
#define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
|
38
|
+
#define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
39
|
+
#define TERMHASH_VALS(h) ((uint32_t*)(TERMHASH_KEYS(h) + (h)->n_buckets))
|
40
|
+
|
40
41
|
// API methods
|
41
42
|
|
42
43
|
// public: make a new termhash
|
43
44
|
void termhash_init(termhash* h); // makes a new one
|
44
45
|
|
45
|
-
// public: set up an existing termhash
|
46
|
-
void termhash_setup(termhash* h); // inits one from disk
|
47
|
-
|
48
46
|
// private: khash-style getter: returns the slot id, if any, given a term key.
|
49
47
|
// you can then look this up within the vals array yourself. returns
|
50
48
|
// h->n_buckets if the term is not in the hash.
|
data/ext/whistlepig/whistlepig.c
CHANGED
@@ -143,7 +143,11 @@ static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
|
|
143
143
|
static VALUE index_size(VALUE self) {
|
144
144
|
wp_index* index;
|
145
145
|
Data_Get_Struct(self, wp_index, index);
|
146
|
-
|
146
|
+
|
147
|
+
uint64_t num_docs;
|
148
|
+
wp_error* e = wp_index_num_docs(index, &num_docs);
|
149
|
+
RAISE_IF_NECESSARY(e);
|
150
|
+
return INT2NUM(num_docs);
|
147
151
|
}
|
148
152
|
|
149
153
|
static VALUE index_init(VALUE self, VALUE v_pathname_base) {
|