whistlepig 0.9.1 → 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +40 -12
- data/ext/whistlepig/extconf.rb +1 -1
- data/ext/whistlepig/index.c +201 -62
- data/ext/whistlepig/index.h +11 -2
- data/ext/whistlepig/lock.c +153 -0
- data/ext/whistlepig/lock.h +18 -0
- data/ext/whistlepig/mmap-obj.c +36 -20
- data/ext/whistlepig/mmap-obj.h +12 -7
- data/ext/whistlepig/search.c +7 -6
- data/ext/whistlepig/segment.c +97 -47
- data/ext/whistlepig/segment.h +19 -3
- data/ext/whistlepig/stringmap.c +61 -56
- data/ext/whistlepig/stringmap.h +7 -14
- data/ext/whistlepig/termhash.c +60 -62
- data/ext/whistlepig/termhash.h +4 -6
- data/ext/whistlepig/whistlepig.c +5 -1
- data/ext/whistlepig/whistlepig.h +1 -0
- metadata +29 -38
- data/ext/whistlepig/dump.c +0 -65
- data/ext/whistlepig/extconf.h +0 -3
- data/ext/whistlepig/test-segment.c +0 -404
- data/ext/whistlepig/test-stringmap.c +0 -82
- data/ext/whistlepig/test-stringpool.c +0 -67
- data/ext/whistlepig/test-termhash.c +0 -95
- data/ext/whistlepig/test-tokenizer.c +0 -55
- data/ext/whistlepig/test.h +0 -38
- data/ext/whistlepig/timer.h +0 -28
data/ext/whistlepig/stringmap.c
CHANGED
@@ -33,20 +33,15 @@ static inline int string_equals(const char* a, const char* b) {
|
|
33
33
|
return strcmp(a, b) == 0;
|
34
34
|
}
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
h->pool = p;
|
39
|
-
h->flags = (uint32_t*)h->boundary;
|
40
|
-
h->keys = (uint32_t*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
41
|
-
}
|
36
|
+
#define STRINGMAP_FLAGS(h) ((uint32_t*)(h)->boundary)
|
37
|
+
#define STRINGMAP_KEYS(h) ((uint32_t*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
42
38
|
|
43
|
-
void stringmap_init(stringmap* h
|
39
|
+
void stringmap_init(stringmap* h) {
|
44
40
|
h->n_buckets_idx = INITIAL_N_BUCKETS_IDX;
|
45
41
|
h->n_buckets = prime_list[h->n_buckets_idx];
|
46
42
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
47
43
|
h->size = h->n_occupied = 0;
|
48
|
-
|
49
|
-
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
44
|
+
memset(STRINGMAP_FLAGS(h), 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
50
45
|
}
|
51
46
|
|
52
47
|
/*
|
@@ -66,22 +61,25 @@ static void kh_clear_##name(kh_##name##_t *h) {
|
|
66
61
|
}
|
67
62
|
*/
|
68
63
|
|
69
|
-
uint32_t stringmap_get(stringmap *h, const char* key) {
|
64
|
+
uint32_t stringmap_get(stringmap *h, stringpool* pool, const char* key) {
|
65
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
66
|
+
uint32_t* keys = STRINGMAP_KEYS(h);
|
67
|
+
|
70
68
|
if(h->n_buckets) {
|
71
69
|
uint32_t inc, k, i, last;
|
72
70
|
k = string_hash(key); i = k % h->n_buckets;
|
73
71
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
74
|
-
while (!isempty(
|
72
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !string_equals(stringpool_lookup(pool, keys[i]), key))) {
|
75
73
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
76
74
|
else i += inc;
|
77
75
|
if (i == last) return h->n_buckets;
|
78
76
|
}
|
79
|
-
return iseither(
|
77
|
+
return iseither(flags, i)? h->n_buckets : i;
|
80
78
|
}
|
81
79
|
else return 0;
|
82
80
|
}
|
83
81
|
|
84
|
-
wp_error* stringmap_bump_size(stringmap *h) {
|
82
|
+
wp_error* stringmap_bump_size(stringmap *h, stringpool* pool) {
|
85
83
|
DEBUG("bumping size for string hash at %p with size %u and boundary %p", h, stringmap_size(h), h->boundary);
|
86
84
|
|
87
85
|
if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("stringmap can't be this big");
|
@@ -89,51 +87,54 @@ wp_error* stringmap_bump_size(stringmap *h) {
|
|
89
87
|
h->n_buckets_idx++;
|
90
88
|
uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
|
91
89
|
|
92
|
-
//
|
93
|
-
|
94
|
-
uint32_t* oldflags =
|
95
|
-
memcpy(oldflags, h->flags, oldflagsize);
|
90
|
+
// get pointers to the old locations
|
91
|
+
uint32_t* oldkeys = STRINGMAP_KEYS(h);
|
92
|
+
uint32_t* oldflags = STRINGMAP_FLAGS(h);
|
96
93
|
|
97
|
-
//
|
98
|
-
|
94
|
+
// make a backup of the old flags in a separate memory region
|
95
|
+
size_t flagbaksize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
|
96
|
+
uint32_t* flagbaks = malloc(flagbaksize);
|
97
|
+
memcpy(flagbaks, oldflags, flagbaksize);
|
99
98
|
|
100
|
-
//
|
101
|
-
h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
99
|
+
// get pointers to the new locations
|
100
|
+
//h->keys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
101
|
+
uint32_t* newflags = (uint32_t*)h->boundary; // unchanged, actually
|
102
|
+
uint32_t* newkeys = (uint32_t*)((uint32_t*)h->boundary + ((new_n_buckets >> 4) + 1));
|
102
103
|
|
103
104
|
// move the keys
|
104
|
-
memmove(
|
105
|
+
memmove(newkeys, oldkeys, h->n_buckets * sizeof(uint32_t));
|
105
106
|
|
106
107
|
// clear the new flags
|
107
|
-
memset(h
|
108
|
+
memset(STRINGMAP_FLAGS(h), 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
|
108
109
|
|
109
110
|
// do the complicated stuff from khash.h
|
110
111
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
111
|
-
if (iseither(
|
112
|
-
uint32_t key =
|
113
|
-
set_isdel_true(
|
112
|
+
if (iseither(flagbaks, j) == 0) {
|
113
|
+
uint32_t key = newkeys[j];
|
114
|
+
set_isdel_true(flagbaks, j);
|
114
115
|
while (1) {
|
115
116
|
uint32_t inc, k, i;
|
116
|
-
k = string_hash(stringpool_lookup(
|
117
|
+
k = string_hash(stringpool_lookup(pool, key));
|
117
118
|
i = k % new_n_buckets;
|
118
119
|
inc = 1 + k % (new_n_buckets - 1);
|
119
|
-
while (!isempty(
|
120
|
+
while (!isempty(newflags, i)) {
|
120
121
|
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
|
121
122
|
else i += inc;
|
122
123
|
}
|
123
|
-
set_isempty_false(
|
124
|
-
if (i < h->n_buckets && iseither(
|
125
|
-
{ uint32_t tmp =
|
126
|
-
set_isdel_true(
|
124
|
+
set_isempty_false(newflags, i);
|
125
|
+
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
126
|
+
{ uint32_t tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
127
|
+
set_isdel_true(flagbaks, i);
|
127
128
|
} else {
|
128
|
-
|
129
|
+
newkeys[i] = key;
|
129
130
|
break;
|
130
131
|
}
|
131
132
|
}
|
132
133
|
}
|
133
134
|
}
|
134
135
|
|
135
|
-
free(
|
136
|
-
h->n_buckets = new_n_buckets;
|
136
|
+
free(flagbaks);
|
137
|
+
h->n_buckets = new_n_buckets; // STRINGMAP_KEYS now works
|
137
138
|
h->n_occupied = h->size;
|
138
139
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
139
140
|
|
@@ -144,8 +145,10 @@ wp_error* stringmap_bump_size(stringmap *h) {
|
|
144
145
|
return NO_ERROR;
|
145
146
|
}
|
146
147
|
|
147
|
-
uint32_t stringmap_put(stringmap *h, const char* key, int *ret) {
|
148
|
+
uint32_t stringmap_put(stringmap *h, stringpool* pool, const char* key, int *ret) {
|
148
149
|
uint32_t x;
|
150
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
151
|
+
uint32_t* keys = STRINGMAP_KEYS(h);
|
149
152
|
|
150
153
|
{
|
151
154
|
#ifdef DEBUGOUTPUT
|
@@ -154,27 +157,27 @@ int num_loops = 0;
|
|
154
157
|
uint32_t inc, k, i, site, last;
|
155
158
|
x = site = h->n_buckets; k = string_hash(key); i = k % h->n_buckets;
|
156
159
|
//DEBUG("asked to hash '%s'. initial hash is %u => %u and n_occupied is %u", key, k, i, h->n_occupied);
|
157
|
-
if (isempty(
|
160
|
+
if (isempty(flags, i)) x = i;
|
158
161
|
else {
|
159
162
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
160
|
-
while (!isempty(
|
163
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !string_equals(stringpool_lookup(pool, keys[i]), key))) {
|
161
164
|
#ifdef DEBUGOUTPUT
|
162
165
|
num_loops++;
|
163
166
|
#endif
|
164
|
-
if (isdel(
|
167
|
+
if (isdel(flags, i)) site = i;
|
165
168
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
166
169
|
else i += inc;
|
167
170
|
if (i == last) { x = site; break; }
|
168
171
|
}
|
169
172
|
if ((x == h->n_buckets) && (i == last)) { // out of space
|
170
|
-
if(!string_equals(stringpool_lookup(
|
173
|
+
if(!string_equals(stringpool_lookup(pool, keys[i]), key)) {
|
171
174
|
DEBUG("out of space!");
|
172
175
|
*ret = -1;
|
173
176
|
return x;
|
174
177
|
}
|
175
178
|
}
|
176
179
|
if (x == h->n_buckets) { // didn't find it on the first try
|
177
|
-
if (isempty(
|
180
|
+
if (isempty(flags, i) && site != h->n_buckets) x = site;
|
178
181
|
else x = i;
|
179
182
|
}
|
180
183
|
}
|
@@ -185,15 +188,15 @@ num_loops++;
|
|
185
188
|
//DEBUG("for pos %u, isempty? %d and isdel %d", x, isempty(h->flags, x), isdel(h->flags, x));
|
186
189
|
|
187
190
|
uint32_t idx;
|
188
|
-
if(isempty(
|
189
|
-
idx = stringpool_add(
|
191
|
+
if(isempty(flags, x) || isdel(flags, x)) {
|
192
|
+
idx = stringpool_add(pool, key);
|
190
193
|
if(idx == (uint32_t)-1) {
|
191
194
|
*ret = -2;
|
192
195
|
return x;
|
193
196
|
}
|
194
|
-
if (isempty(
|
195
|
-
|
196
|
-
set_isboth_false(
|
197
|
+
if (isempty(flags, x)) ++h->n_occupied;
|
198
|
+
keys[x] = idx;
|
199
|
+
set_isboth_false(flags, x);
|
197
200
|
++h->size;
|
198
201
|
*ret = 1;
|
199
202
|
}
|
@@ -203,8 +206,9 @@ num_loops++;
|
|
203
206
|
}
|
204
207
|
|
205
208
|
void stringmap_del(stringmap *h, uint32_t x) {
|
206
|
-
|
207
|
-
|
209
|
+
uint32_t* flags = STRINGMAP_FLAGS(h);
|
210
|
+
if (x != h->n_buckets && !iseither(flags, x)) {
|
211
|
+
set_isdel_true(flags, x);
|
208
212
|
--h->size;
|
209
213
|
}
|
210
214
|
}
|
@@ -255,24 +259,25 @@ uint32_t stringmap_next_size(stringmap* h) {
|
|
255
259
|
return size(prime_list[next_idx]);
|
256
260
|
}
|
257
261
|
|
258
|
-
const char* stringmap_int_to_string(stringmap* h, uint32_t i) {
|
259
|
-
|
262
|
+
const char* stringmap_int_to_string(stringmap* h, stringpool* p, uint32_t i) {
|
263
|
+
(void)h;
|
264
|
+
return stringpool_lookup(p, i);
|
260
265
|
}
|
261
266
|
|
262
267
|
// returns -1 if not found
|
263
|
-
uint32_t stringmap_string_to_int(stringmap* h, const char* s) {
|
264
|
-
uint32_t idx = stringmap_get(h, s);
|
268
|
+
uint32_t stringmap_string_to_int(stringmap* h, stringpool* pool, const char* s) {
|
269
|
+
uint32_t idx = stringmap_get(h, pool, s);
|
265
270
|
if(idx == h->n_buckets) return (uint32_t)-1; // not there
|
266
|
-
return h
|
271
|
+
return STRINGMAP_KEYS(h)[idx];
|
267
272
|
}
|
268
273
|
|
269
|
-
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) {
|
274
|
+
wp_error* stringmap_add(stringmap *h, stringpool* pool, const char* s, uint32_t* id) {
|
270
275
|
int status;
|
271
|
-
uint32_t idx = stringmap_put(h, s, &status);
|
276
|
+
uint32_t idx = stringmap_put(h, pool, s, &status);
|
272
277
|
if(status == -1) RAISE_ERROR("out of space in hash put");
|
273
278
|
if(status == -2) RAISE_ERROR("out of space in pool put");
|
274
279
|
|
275
|
-
*id = h
|
280
|
+
*id = STRINGMAP_KEYS(h)[idx];
|
276
281
|
|
277
282
|
return NO_ERROR;
|
278
283
|
}
|
data/ext/whistlepig/stringmap.h
CHANGED
@@ -10,9 +10,8 @@
|
|
10
10
|
// and stringpool, it uses a slightly funny API that never allocates memory,
|
11
11
|
// but instead operates on pointers to preallocated blocks of memory.
|
12
12
|
//
|
13
|
-
// uses a stringpool internally to do the int->string mapping.
|
14
|
-
//
|
15
|
-
// use this object.
|
13
|
+
// uses a stringpool internally to do the int->string mapping. you shouldn't
|
14
|
+
// have to interact with the stringpool directly; you can just use this object.
|
16
15
|
//
|
17
16
|
// like termhash and pool, has a slightly funny API that is designed to work on
|
18
17
|
// a pre-allocated chunk of memory rather than allocate any of its own.
|
@@ -36,9 +35,6 @@
|
|
36
35
|
typedef struct stringmap {
|
37
36
|
uint8_t n_buckets_idx;
|
38
37
|
uint32_t n_buckets, size, n_occupied, upper_bound;
|
39
|
-
uint32_t *flags;
|
40
|
-
uint32_t *keys;
|
41
|
-
stringpool* pool;
|
42
38
|
uint8_t boundary[];
|
43
39
|
// in memory at this point
|
44
40
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
@@ -48,21 +44,18 @@ typedef struct stringmap {
|
|
48
44
|
// API methods
|
49
45
|
|
50
46
|
// public: write a new stringmap to memory
|
51
|
-
void stringmap_init(stringmap* h
|
52
|
-
|
53
|
-
// public: set up an existing stringmap in memory
|
54
|
-
void stringmap_setup(stringmap* h, stringpool* p);
|
47
|
+
void stringmap_init(stringmap* h);
|
55
48
|
|
56
49
|
// public: add a string. sets id to its id. dupes are fine; will just set the
|
57
50
|
// id correctly.
|
58
|
-
wp_error* stringmap_add(stringmap *h, const char* s, uint32_t* id) RAISES_ERROR;
|
51
|
+
wp_error* stringmap_add(stringmap *h, stringpool* p, const char* s, uint32_t* id) RAISES_ERROR;
|
59
52
|
|
60
53
|
// public: get the int value given a string. returns (uint32_t)-1 if not found.
|
61
|
-
uint32_t stringmap_string_to_int(stringmap* h, const char* s);
|
54
|
+
uint32_t stringmap_string_to_int(stringmap* h, stringpool* pool, const char* s);
|
62
55
|
|
63
56
|
// public: get the string value given an int. returns corrupt data if the int
|
64
57
|
// is invalid.
|
65
|
-
const char* stringmap_int_to_string(stringmap* h, uint32_t i);
|
58
|
+
const char* stringmap_int_to_string(stringmap* h, stringpool* p, uint32_t i);
|
66
59
|
|
67
60
|
// public: returns the byte size of the stringmap
|
68
61
|
uint32_t stringmap_size(stringmap* h);
|
@@ -77,6 +70,6 @@ uint32_t stringmap_next_size(stringmap* h);
|
|
77
70
|
int stringmap_needs_bump(stringmap* h);
|
78
71
|
|
79
72
|
// public: increases the size of the stringmap
|
80
|
-
wp_error* stringmap_bump_size(stringmap *h) RAISES_ERROR;
|
73
|
+
wp_error* stringmap_bump_size(stringmap *h, stringpool* pool) RAISES_ERROR;
|
81
74
|
|
82
75
|
#endif
|
data/ext/whistlepig/termhash.c
CHANGED
@@ -35,23 +35,11 @@ void termhash_init(termhash* h) {
|
|
35
35
|
h->n_buckets = prime_list[h->n_buckets_idx];
|
36
36
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
37
37
|
h->size = h->n_occupied = 0;
|
38
|
-
|
39
|
-
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
38
|
+
memset(TERMHASH_FLAGS(h), 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t));
|
40
39
|
}
|
41
40
|
|
42
41
|
#define OFFSET(a, b) (long)((uint8_t*)a - (uint8_t*)b)
|
43
42
|
// set flags, keys and vals to correct locations based on h->n_buckets
|
44
|
-
void termhash_setup(termhash* h) {
|
45
|
-
DEBUG("term hash ranges from %p to %p (size %u)", h, (char*)h + termhash_size(h), termhash_size(h));
|
46
|
-
DEBUG("boundary is at %p (+%ld)", h->boundary, OFFSET(h->boundary, h));
|
47
|
-
h->flags = (uint32_t*)h->boundary;
|
48
|
-
h->keys = (term*)((uint32_t*)h->boundary + ((h->n_buckets >> 4) + 1));
|
49
|
-
h->vals = (uint32_t*)((term*)h->keys + h->n_buckets);
|
50
|
-
DEBUG("flags are at %p (+%ld)", h->flags, OFFSET(h->flags, h->boundary));
|
51
|
-
DEBUG(" keys are at %p (+%ld)", h->keys, OFFSET(h->keys, h->boundary));
|
52
|
-
DEBUG(" vals are at %p (+%ld)", h->vals, OFFSET(h->vals, h->boundary));
|
53
|
-
}
|
54
|
-
|
55
43
|
/*
|
56
44
|
static void termhash_dump(termhash* h) {
|
57
45
|
for(uint32_t i = 0; i < h->n_buckets; i++) {
|
@@ -83,89 +71,94 @@ static void kh_clear_##name(kh_##name##_t *h) {
|
|
83
71
|
*/
|
84
72
|
|
85
73
|
uint32_t termhash_get(termhash *h, term key) {
|
74
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
75
|
+
term* keys = TERMHASH_KEYS(h);
|
76
|
+
|
86
77
|
if(h->n_buckets) {
|
87
78
|
uint32_t inc, k, i, last;
|
88
79
|
k = hash_term(key); i = k % h->n_buckets;
|
89
80
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
90
|
-
while (!isempty(
|
81
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !term_equals(keys[i], key))) {
|
91
82
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
92
83
|
else i += inc;
|
93
84
|
if (i == last) return h->n_buckets;
|
94
85
|
}
|
95
|
-
return iseither(
|
86
|
+
return iseither(flags, i)? h->n_buckets : i;
|
96
87
|
}
|
97
88
|
else return 0;
|
98
89
|
}
|
99
90
|
|
100
91
|
wp_error* termhash_bump_size(termhash *h) {
|
101
92
|
DEBUG("bumping size for term hash at %p with size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
|
102
|
-
DEBUG("flags are at %p (+%ld)", h
|
103
|
-
DEBUG(" keys are at %p (+%ld)", h
|
104
|
-
DEBUG(" vals are at %p (+%ld)", h
|
93
|
+
DEBUG("flags are at %p (+%ld)", TERMHASH_FLAGS(h), OFFSET(TERMHASH_FLAGS(h), h->boundary));
|
94
|
+
DEBUG(" keys are at %p (+%ld)", TERMHASH_KEYS(h), OFFSET(TERMHASH_KEYS(h), h->boundary));
|
95
|
+
DEBUG(" vals are at %p (+%ld)", TERMHASH_VALS(h), OFFSET(TERMHASH_VALS(h), h->boundary));
|
96
|
+
|
97
|
+
if(h->n_buckets_idx >= (HASH_PRIME_SIZE - 1)) RAISE_ERROR("termhash can't be this big");
|
105
98
|
|
106
99
|
h->n_buckets_idx++;
|
107
|
-
if(h->n_buckets_idx > HASH_PRIME_SIZE) exit(1); // die horribly TODO fixme
|
108
100
|
uint32_t new_n_buckets = prime_list[h->n_buckets_idx];
|
109
101
|
|
110
|
-
// first make a backup of the
|
111
|
-
size_t
|
112
|
-
uint32_t*
|
113
|
-
memcpy(
|
102
|
+
// first make a backup of the old flags in a separate memory region
|
103
|
+
size_t flagbaksize = ((h->n_buckets >> 4) + 1) * sizeof(uint32_t);
|
104
|
+
uint32_t* flagbaks = malloc(flagbaksize);
|
105
|
+
memcpy(flagbaks, TERMHASH_FLAGS(h), flagbaksize);
|
114
106
|
|
115
|
-
//
|
116
|
-
term* oldkeys = h
|
117
|
-
uint32_t* oldvals = h
|
107
|
+
// get pointers to the old locations
|
108
|
+
term* oldkeys = TERMHASH_KEYS(h);
|
109
|
+
uint32_t* oldvals = TERMHASH_VALS(h);
|
118
110
|
|
119
111
|
// set pointers to the new locations
|
120
|
-
|
121
|
-
|
112
|
+
uint32_t* newflags = (uint32_t*)h->boundary;
|
113
|
+
term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
|
114
|
+
uint32_t* newvals = (uint32_t*)(newkeys + new_n_buckets);
|
122
115
|
|
123
116
|
// move the vals and keys
|
124
|
-
memmove(
|
125
|
-
memmove(
|
117
|
+
memmove(newvals, oldvals, h->n_buckets * sizeof(uint32_t));
|
118
|
+
memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
|
126
119
|
|
127
120
|
// clear the new flags
|
128
|
-
memset(
|
121
|
+
memset(newflags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t));
|
129
122
|
|
130
123
|
// do the complicated stuff from khash.h
|
131
124
|
for (unsigned int j = 0; j != h->n_buckets; ++j) {
|
132
|
-
if (iseither(
|
133
|
-
term key =
|
125
|
+
if (iseither(flagbaks, j) == 0) {
|
126
|
+
term key = newkeys[j];
|
134
127
|
uint32_t val;
|
135
|
-
val =
|
136
|
-
set_isdel_true(
|
128
|
+
val = newvals[j];
|
129
|
+
set_isdel_true(flagbaks, j);
|
137
130
|
while (1) {
|
138
131
|
uint32_t inc, k, i;
|
139
132
|
k = hash_term(key);
|
140
133
|
i = k % new_n_buckets;
|
141
134
|
inc = 1 + k % (new_n_buckets - 1);
|
142
|
-
while (!isempty(
|
135
|
+
while (!isempty(newflags, i)) {
|
143
136
|
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets;
|
144
137
|
else i += inc;
|
145
138
|
}
|
146
|
-
set_isempty_false(
|
147
|
-
if (i < h->n_buckets && iseither(
|
148
|
-
{ term tmp =
|
149
|
-
{ uint32_t tmp =
|
150
|
-
set_isdel_true(
|
139
|
+
set_isempty_false(newflags, i);
|
140
|
+
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
|
141
|
+
{ term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
|
142
|
+
{ uint32_t tmp = newvals[i]; newvals[i] = val; val = tmp; }
|
143
|
+
set_isdel_true(flagbaks, i);
|
151
144
|
} else {
|
152
|
-
|
153
|
-
|
145
|
+
newkeys[i] = key;
|
146
|
+
newvals[i] = val;
|
154
147
|
break;
|
155
148
|
}
|
156
149
|
}
|
157
150
|
}
|
158
151
|
}
|
159
152
|
|
160
|
-
free(
|
153
|
+
free(flagbaks);
|
161
154
|
h->n_buckets = new_n_buckets;
|
162
155
|
h->n_occupied = h->size;
|
163
156
|
h->upper_bound = (uint32_t)(h->n_buckets * HASH_UPPER + 0.5);
|
164
157
|
|
165
158
|
DEBUG("after bump, term hash at %p has size %u and boundary %p (+%ld)", h, termhash_size(h), h->boundary, (long)((uint8_t*)h->boundary - (uint8_t*)h));
|
166
|
-
DEBUG("flags are at %p (+%ld)", h
|
167
|
-
DEBUG(" keys are at %p (+%ld)", h
|
168
|
-
DEBUG(" vals are at %p (+%ld)", h
|
159
|
+
DEBUG("flags are at %p (+%ld)", TERMHASH_FLAGS(h), OFFSET(TERMHASH_FLAGS(h), h->boundary));
|
160
|
+
DEBUG(" keys are at %p (+%ld)", TERMHASH_KEYS(h), OFFSET(TERMHASH_KEYS(h), h->boundary));
|
161
|
+
DEBUG(" vals are at %p (+%ld)", TERMHASH_VALS(h), OFFSET(TERMHASH_VALS(h), h->boundary));
|
169
162
|
|
170
163
|
#ifdef DEBUGOUTPUT
|
171
164
|
//DEBUG("and now i look like this:");
|
@@ -177,6 +170,8 @@ wp_error* termhash_bump_size(termhash *h) {
|
|
177
170
|
|
178
171
|
uint32_t termhash_put(termhash *h, term key, int *ret) {
|
179
172
|
uint32_t x;
|
173
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
174
|
+
term* keys = TERMHASH_KEYS(h);
|
180
175
|
|
181
176
|
{
|
182
177
|
#ifdef DEBUGOUTPUT
|
@@ -185,40 +180,40 @@ int num_loops = 0;
|
|
185
180
|
uint32_t inc, k, i, site, last;
|
186
181
|
x = site = h->n_buckets; k = hash_term(key); i = k % h->n_buckets;
|
187
182
|
DEBUG("initial hash is %u", k);
|
188
|
-
if (isempty(
|
183
|
+
if (isempty(flags, i)) x = i;
|
189
184
|
else {
|
190
185
|
inc = 1 + k % (h->n_buckets - 1); last = i;
|
191
|
-
while (!isempty(
|
186
|
+
while (!isempty(flags, i) && (isdel(flags, i) || !term_equals(keys[i], key))) {
|
192
187
|
#ifdef DEBUGOUTPUT
|
193
188
|
num_loops++;
|
194
189
|
#endif
|
195
|
-
if (isdel(
|
190
|
+
if (isdel(flags, i)) site = i;
|
196
191
|
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets;
|
197
192
|
else i += inc;
|
198
193
|
if (i == last) { x = site; break; }
|
199
194
|
}
|
200
195
|
if ((x == h->n_buckets) && (i == last)) { // out of space
|
201
|
-
if(!term_equals(
|
196
|
+
if(!term_equals(keys[i], key)) {
|
202
197
|
*ret = -1;
|
203
198
|
return x;
|
204
199
|
}
|
205
200
|
}
|
206
201
|
if (x == h->n_buckets) { // didn't find it on the first try
|
207
|
-
if (isempty(
|
202
|
+
if (isempty(flags, i) && site != h->n_buckets) x = site;
|
208
203
|
else x = i;
|
209
204
|
}
|
210
205
|
}
|
211
206
|
DEBUG("looped %u times to put", num_loops);
|
212
207
|
//DEBUG("x is %u, site is %u, n_buckets is %u", x, site, h->n_buckets);
|
213
208
|
}
|
214
|
-
if (isempty(
|
215
|
-
|
216
|
-
set_isboth_false(
|
209
|
+
if (isempty(flags, x)) {
|
210
|
+
keys[x] = key;
|
211
|
+
set_isboth_false(flags, x);
|
217
212
|
++h->size; ++h->n_occupied;
|
218
213
|
*ret = 1;
|
219
|
-
} else if (isdel(
|
220
|
-
|
221
|
-
set_isboth_false(
|
214
|
+
} else if (isdel(flags, x)) {
|
215
|
+
keys[x] = key;
|
216
|
+
set_isboth_false(flags, x);
|
222
217
|
++h->size;
|
223
218
|
*ret = 2;
|
224
219
|
}
|
@@ -233,24 +228,27 @@ num_loops++;
|
|
233
228
|
}
|
234
229
|
|
235
230
|
void termhash_del(termhash *h, uint32_t x) {
|
236
|
-
|
237
|
-
|
231
|
+
uint32_t* flags = TERMHASH_FLAGS(h);
|
232
|
+
if (x != h->n_buckets && !iseither(flags, x)) {
|
233
|
+
set_isdel_true(flags, x);
|
238
234
|
--h->size;
|
239
235
|
}
|
240
236
|
}
|
241
237
|
|
242
238
|
uint32_t termhash_get_val(termhash* h, term t) {
|
239
|
+
uint32_t* vals = TERMHASH_VALS(h);
|
243
240
|
uint32_t idx = termhash_get(h, t);
|
244
241
|
if(idx == h->n_buckets) return (uint32_t)-1;
|
245
|
-
return
|
242
|
+
return vals[idx];
|
246
243
|
}
|
247
244
|
|
248
245
|
wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
|
249
246
|
int status;
|
247
|
+
uint32_t* vals = TERMHASH_VALS(h);
|
250
248
|
uint32_t loc = termhash_put(h, t, &status);
|
251
249
|
DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
|
252
250
|
if(status == -1) RAISE_ERROR("out of space in hash");
|
253
|
-
|
251
|
+
vals[loc] = val;
|
254
252
|
return NO_ERROR;
|
255
253
|
}
|
256
254
|
|
data/ext/whistlepig/termhash.h
CHANGED
@@ -27,9 +27,6 @@ typedef struct term {
|
|
27
27
|
typedef struct termhash {
|
28
28
|
uint8_t n_buckets_idx;
|
29
29
|
uint32_t n_buckets, size, n_occupied, upper_bound;
|
30
|
-
uint32_t *flags;
|
31
|
-
term *keys;
|
32
|
-
uint32_t *vals;
|
33
30
|
uint8_t boundary[];
|
34
31
|
// in memory at this point
|
35
32
|
// ((n_buckets >> 4) + 1) uint32_t's for the flags
|
@@ -37,14 +34,15 @@ typedef struct termhash {
|
|
37
34
|
// n_buckets uint32_t's for the vals (offsets into postings lists)
|
38
35
|
} termhash;
|
39
36
|
|
37
|
+
#define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
|
38
|
+
#define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
|
39
|
+
#define TERMHASH_VALS(h) ((uint32_t*)(TERMHASH_KEYS(h) + (h)->n_buckets))
|
40
|
+
|
40
41
|
// API methods
|
41
42
|
|
42
43
|
// public: make a new termhash
|
43
44
|
void termhash_init(termhash* h); // makes a new one
|
44
45
|
|
45
|
-
// public: set up an existing termhash
|
46
|
-
void termhash_setup(termhash* h); // inits one from disk
|
47
|
-
|
48
46
|
// private: khash-style getter: returns the slot id, if any, given a term key.
|
49
47
|
// you can then look this up within the vals array yourself. returns
|
50
48
|
// h->n_buckets if the term is not in the hash.
|
data/ext/whistlepig/whistlepig.c
CHANGED
@@ -143,7 +143,11 @@ static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
|
|
143
143
|
static VALUE index_size(VALUE self) {
|
144
144
|
wp_index* index;
|
145
145
|
Data_Get_Struct(self, wp_index, index);
|
146
|
-
|
146
|
+
|
147
|
+
uint64_t num_docs;
|
148
|
+
wp_error* e = wp_index_num_docs(index, &num_docs);
|
149
|
+
RAISE_IF_NECESSARY(e);
|
150
|
+
return INT2NUM(num_docs);
|
147
151
|
}
|
148
152
|
|
149
153
|
static VALUE index_init(VALUE self, VALUE v_pathname_base) {
|