whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,294 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <sys/types.h>
|
3
|
+
#include <sys/stat.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include "whistlepig.h"
|
6
|
+
|
7
|
+
#define PATH_BUF_SIZE 4096
|
8
|
+
|
9
|
+
int wp_index_exists(const char* pathname_base) {
|
10
|
+
char buf[PATH_BUF_SIZE];
|
11
|
+
snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
|
12
|
+
return wp_segment_exists(buf);
|
13
|
+
}
|
14
|
+
|
15
|
+
// Creates a new index rooted at pathname_base, made of a single freshly
// created segment ("<pathname_base>0"), and stores it in *indexptr.
//
// Raises an error if segment 0 already exists, if memory cannot be allocated,
// or if the segment cannot be created. When allocation or segment creation
// fails, everything allocated here is released and *indexptr is set to NULL.
//
// NOTE: pathname_base is stored by reference, not copied; the caller's string
// must stay valid for as long as the index is in use.
wp_error* wp_index_create(wp_index** indexptr, const char* pathname_base) {
  char buf[PATH_BUF_SIZE];

  snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
  if(wp_segment_exists(buf)) RAISE_ERROR("index with base path '%s' already exists", pathname_base);

  wp_index* index = malloc(sizeof(wp_index));
  if(index == NULL) {
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  index->pathname_base = pathname_base;
  index->num_segments = 1;
  index->sizeof_segments = 1;
  index->open = 1;
  index->segments = malloc(sizeof(wp_segment));
  index->docid_offsets = malloc(sizeof(uint64_t));
  if((index->segments == NULL) || (index->docid_offsets == NULL)) {
    free(index->segments);
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  // don't hand a half-built index to the caller: if the segment can't be
  // created, release everything before relaying the error
  wp_error* err = wp_segment_create(&index->segments[0], buf);
  if(err != NO_ERROR) {
    free(index->segments);
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RELAY_ERROR(err);
  }

  index->docid_offsets[0] = 0; // the first segment's docids are already global
  *indexptr = index;

  return NO_ERROR;
}
|
34
|
+
|
35
|
+
// Makes sure segments[] and docid_offsets[] have room for at least one more
// element beyond num_segments, growing both arrays (roughly doubling) when
// they are full.
//
// Raises an error when the arrays are already at WP_MAX_SEGMENTS elements or
// when memory cannot be allocated. On failure the existing arrays are left
// intact and sizeof_segments is unchanged.
RAISING_STATIC(ensure_num_segments(wp_index* index)) {
  if(index->num_segments < index->sizeof_segments) return NO_ERROR;

  // sizeof_segments is a uint16_t: do the doubling in a wider type so that
  // 32768 * 2 doesn't wrap around to 0, and never grow past the segment limit
  uint32_t new_size = (uint32_t)index->sizeof_segments * 2;
  if(new_size == 0) new_size = 1;
  if(new_size > WP_MAX_SEGMENTS) new_size = WP_MAX_SEGMENTS;
  if(new_size <= index->num_segments) RAISE_ERROR("too many segments");

  // realloc into temporaries so the old arrays aren't lost if it fails
  wp_segment* new_segments = realloc(index->segments, sizeof(wp_segment) * new_size);
  if(new_segments == NULL) RAISE_ERROR("oom");
  index->segments = new_segments;

  uint64_t* new_offsets = realloc(index->docid_offsets, sizeof(uint64_t) * new_size);
  if(new_offsets == NULL) RAISE_ERROR("oom");
  index->docid_offsets = new_offsets;

  // only publish the new capacity once both arrays have actually grown
  index->sizeof_segments = (uint16_t)new_size;

  return NO_ERROR;
}
|
45
|
+
|
46
|
+
// Loads the existing index rooted at pathname_base into *indexptr.
//
// Segments "<pathname_base>0", "<pathname_base>1", ... are loaded in order
// until one is missing or WP_MAX_SEGMENTS have been loaded. Raises an error
// if segment 0 does not exist or if memory cannot be allocated (in which case
// *indexptr is set to NULL). If loading a later segment fails, the error is
// relayed and *indexptr refers to the index holding the num_segments segments
// loaded so far.
//
// NOTE: pathname_base is stored by reference, not copied; the caller's string
// must stay valid for as long as the index is in use.
wp_error* wp_index_load(wp_index** indexptr, const char* pathname_base) {
  char buf[PATH_BUF_SIZE];
  snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
  if(!wp_segment_exists(buf)) RAISE_ERROR("index with base path '%s' does not exist", pathname_base);

  wp_index* index = malloc(sizeof(wp_index));
  if(index == NULL) {
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  index->pathname_base = pathname_base;
  index->num_segments = 0;
  index->sizeof_segments = 1;
  index->open = 1;
  index->segments = malloc(sizeof(wp_segment));
  index->docid_offsets = malloc(sizeof(uint64_t));
  if((index->segments == NULL) || (index->docid_offsets == NULL)) {
    free(index->segments);
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  *indexptr = index;

  // load all the segments we can
  while(index->num_segments < WP_MAX_SEGMENTS) {
    snprintf(buf, PATH_BUF_SIZE, "%s%d", pathname_base, index->num_segments);
    if(!wp_segment_exists(buf)) break;

    RELAY_ERROR(ensure_num_segments(index));
    DEBUG("loading segment %s", buf);
    RELAY_ERROR(wp_segment_load(&index->segments[index->num_segments], buf));

    if(index->num_segments == 0) {
      index->docid_offsets[index->num_segments] = 0;
    }
    else {
      // segments return docids 1 through N, so the num_docs in a segment is
      // also the max document id
      postings_region* prevpr = MMAP_OBJ(index->segments[index->num_segments - 1].postings, postings_region);
      index->docid_offsets[index->num_segments] = prevpr->num_docs + index->docid_offsets[index->num_segments - 1];
    }

    index->num_segments++;
  }

  return NO_ERROR;
}
|
82
|
+
|
83
|
+
// we have two special values at our disposal to mark where we are in
|
84
|
+
// the sequence of segments
|
85
|
+
#define SEGMENT_UNINITIALIZED WP_MAX_SEGMENTS
|
86
|
+
#define SEGMENT_DONE (WP_MAX_SEGMENTS + 1)
|
87
|
+
|
88
|
+
wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
|
89
|
+
(void)index;
|
90
|
+
query->segment_idx = SEGMENT_UNINITIALIZED;
|
91
|
+
|
92
|
+
return NO_ERROR;
|
93
|
+
}
|
94
|
+
|
95
|
+
// can be called multiple times to resume
|
96
|
+
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
|
97
|
+
*num_results = 0;
|
98
|
+
if(index->num_segments == 0) return NO_ERROR;
|
99
|
+
|
100
|
+
if(query->segment_idx == SEGMENT_UNINITIALIZED) {
|
101
|
+
query->segment_idx = index->num_segments - 1;
|
102
|
+
DEBUG("setting up segment %u", query->segment_idx);
|
103
|
+
RELAY_ERROR(wp_search_init_search_state(query, &index->segments[query->segment_idx]));
|
104
|
+
}
|
105
|
+
|
106
|
+
// at this point, we assume we're initialized and query->segment_idx is the index
|
107
|
+
// of the segment we're searching against
|
108
|
+
while((*num_results < max_num_results) && (query->segment_idx != SEGMENT_DONE)) {
|
109
|
+
uint32_t want_num_results = max_num_results - *num_results;
|
110
|
+
uint32_t got_num_results = 0;
|
111
|
+
search_result* segment_results = malloc(sizeof(search_result) * want_num_results);
|
112
|
+
|
113
|
+
DEBUG("searching segment %d", query->segment_idx);
|
114
|
+
RELAY_ERROR(wp_search_run_query_on_segment(query, &index->segments[query->segment_idx], want_num_results, &got_num_results, segment_results));
|
115
|
+
DEBUG("asked segment %d for %d results, got %d", query->segment_idx, want_num_results, got_num_results);
|
116
|
+
|
117
|
+
// extract the per-segment docids from the search results and adjust by
|
118
|
+
// each segment's docid offset to form global docids
|
119
|
+
for(uint32_t i = 0; i < got_num_results; i++) {
|
120
|
+
results[*num_results + i] = index->docid_offsets[query->segment_idx] + segment_results[i].doc_id;
|
121
|
+
wp_search_result_free(&segment_results[i]);
|
122
|
+
}
|
123
|
+
free(segment_results);
|
124
|
+
*num_results += got_num_results;
|
125
|
+
|
126
|
+
if(got_num_results < want_num_results) { // this segment is finished; move to the next one
|
127
|
+
DEBUG("releasing index %d", query->segment_idx);
|
128
|
+
RELAY_ERROR(wp_search_release_search_state(query));
|
129
|
+
if(query->segment_idx > 0) {
|
130
|
+
query->segment_idx--;
|
131
|
+
DEBUG("setting up index %d", query->segment_idx);
|
132
|
+
RELAY_ERROR(wp_search_init_search_state(query, &index->segments[query->segment_idx]));
|
133
|
+
}
|
134
|
+
else query->segment_idx = SEGMENT_DONE;
|
135
|
+
}
|
136
|
+
}
|
137
|
+
|
138
|
+
return NO_ERROR;
|
139
|
+
}
|
140
|
+
|
141
|
+
#define RESULT_BUF_SIZE 1024
|
142
|
+
// count the results by just running the query until it stops. slow!
|
143
|
+
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
|
144
|
+
uint64_t results[RESULT_BUF_SIZE];
|
145
|
+
|
146
|
+
*num_results = 0;
|
147
|
+
RELAY_ERROR(wp_index_setup_query(index, query));
|
148
|
+
while(1) {
|
149
|
+
uint32_t this_num_results;
|
150
|
+
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
|
151
|
+
*num_results += this_num_results;
|
152
|
+
if(this_num_results < RESULT_BUF_SIZE) break; // done
|
153
|
+
}
|
154
|
+
|
155
|
+
RELAY_ERROR(wp_index_teardown_query(index, query));
|
156
|
+
|
157
|
+
return NO_ERROR;
|
158
|
+
}
|
159
|
+
|
160
|
+
// Detaches a query from the index: releases the search state if the query is
// currently positioned on a segment, then resets it to the uninitialized
// marker so it can be set up and run again. The index argument is not used.
wp_error* wp_index_teardown_query(wp_index* index, wp_query* query) {
  (void)index;

  // search state only exists while segment_idx names a real segment
  int is_marker = (query->segment_idx == SEGMENT_UNINITIALIZED) || (query->segment_idx == SEGMENT_DONE);
  if(!is_marker) {
    RELAY_ERROR(wp_search_release_search_state(query));
  }

  query->segment_idx = SEGMENT_UNINITIALIZED;
  return NO_ERROR;
}
|
169
|
+
|
170
|
+
// Adds an entry to the index and stores its global docid in *doc_id.
//
// The entry goes into the last segment. If that segment cannot fit it, a new
// segment "<pathname_base><num_segments>" is created, its docid offset is
// derived from the previous segment, and the entry is written there instead.
//
// Raises an error if the index already holds WP_MAX_SEGMENTS segments, if the
// new segment's pathname does not fit in PATH_BUF_SIZE, if the entry does not
// fit even in a fresh segment, or if any underlying segment/entry call fails.
wp_error* wp_index_add_entry(wp_index* index, wp_entry* entry, uint64_t* doc_id) {
  int success = 0;
  wp_segment* seg = &index->segments[index->num_segments - 1];

  // first, ensure we have enough space in the current segment
  uint32_t postings_bytes;
  RELAY_ERROR(wp_entry_sizeof_postings_region(entry, seg, &postings_bytes));
  RELAY_ERROR(wp_segment_ensure_fit(seg, postings_bytes, 0, &success));

  // if not, we need to open a new one
  if(!success) {
    DEBUG("segment %d is full, loading a new one", index->num_segments - 1);

    // segment indexes at or beyond WP_MAX_SEGMENTS are reserved as query
    // markers, so the index cannot grow past that many segments
    if(index->num_segments >= WP_MAX_SEGMENTS) RAISE_ERROR("index already has the maximum number of segments (%d)", WP_MAX_SEGMENTS);

    char buf[PATH_BUF_SIZE];
    int len = snprintf(buf, PATH_BUF_SIZE, "%s%d", index->pathname_base, index->num_segments);
    if((len < 0) || (len >= PATH_BUF_SIZE)) RAISE_ERROR("pathname for segment %d is too long", index->num_segments);

    RELAY_ERROR(ensure_num_segments(index));
    RELAY_ERROR(wp_segment_create(&index->segments[index->num_segments], buf));
    index->num_segments++;

    // set the docid_offset
    postings_region* prevpr = MMAP_OBJ(index->segments[index->num_segments - 2].postings, postings_region);
    index->docid_offsets[index->num_segments - 1] = prevpr->num_docs + index->docid_offsets[index->num_segments - 2];

    // ensure_num_segments may have moved the array, so re-fetch the pointer
    seg = &index->segments[index->num_segments - 1];
    DEBUG("loaded new segment %d at %p", index->num_segments - 1, &index->segments[index->num_segments - 1]);

    RELAY_ERROR(wp_entry_sizeof_postings_region(entry, seg, &postings_bytes));
    RELAY_ERROR(wp_segment_ensure_fit(seg, postings_bytes, 0, &success));
    if(!success) RAISE_ERROR("can't fit new entry into fresh segment. that's crazy");
  }

  docid_t seg_doc_id;
  RELAY_ERROR(wp_segment_grab_docid(seg, &seg_doc_id));
  RELAY_ERROR(wp_entry_write_to_segment(entry, seg, seg_doc_id));
  *doc_id = seg_doc_id + index->docid_offsets[index->num_segments - 1];

  return NO_ERROR;
}
|
207
|
+
|
208
|
+
// Unloads every segment of the index, in order, and then marks the index as
// closed. If unloading a segment fails, the error is relayed immediately and
// the index stays marked open.
wp_error* wp_index_unload(wp_index* index) {
  uint16_t seg_idx = 0;

  while(seg_idx < index->num_segments) {
    RELAY_ERROR(wp_segment_unload(&index->segments[seg_idx]));
    seg_idx++;
  }

  index->open = 0;
  return NO_ERROR;
}
|
214
|
+
|
215
|
+
// Releases all memory owned by the index, unloading it first if it is still
// open. The index pointer is invalid after this returns successfully. If the
// unload fails, the error is relayed and nothing is freed.
wp_error* wp_index_free(wp_index* index) {
  if(index->open) {
    RELAY_ERROR(wp_index_unload(index));
  }

  free(index->segments);
  free(index->docid_offsets);
  free(index);

  return NO_ERROR;
}
|
223
|
+
|
224
|
+
// Writes a description of the index to stream: first the segment count, then,
// for each segment in order, a header line followed by that segment's own
// wp_segment_dumpinfo output.
wp_error* wp_index_dumpinfo(wp_index* index, FILE* stream) {
  fprintf(stream, "index has %d segments\n", index->num_segments);

  int seg_idx = 0;
  while(seg_idx < index->num_segments) {
    fprintf(stream, "\nsegment %d:\n", seg_idx);
    RELAY_ERROR(wp_segment_dumpinfo(&index->segments[seg_idx], stream));
    seg_idx++;
  }

  return NO_ERROR;
}
|
233
|
+
|
234
|
+
wp_error* wp_index_delete(const char* pathname_base) {
|
235
|
+
char buf[PATH_BUF_SIZE];
|
236
|
+
|
237
|
+
int i = 0;
|
238
|
+
while(1) {
|
239
|
+
snprintf(buf, PATH_BUF_SIZE, "%s%d", pathname_base, i);
|
240
|
+
if(wp_segment_exists(buf)) {
|
241
|
+
DEBUG("deleting segment %s", buf);
|
242
|
+
RELAY_ERROR(wp_segment_delete(buf));
|
243
|
+
i++;
|
244
|
+
}
|
245
|
+
else break;
|
246
|
+
}
|
247
|
+
|
248
|
+
return NO_ERROR;
|
249
|
+
}
|
250
|
+
|
251
|
+
// Adds a label to the document with global docid doc_id.
//
// The owning segment is found by scanning from the last segment backwards for
// the first one whose docid offset is below doc_id; the label is then added
// under the segment-local docid (doc_id minus that offset). Raises an error
// if no segment qualifies (e.g. doc_id is 0) or if the segment call fails.
wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id) {
  int found = 0;

  for(uint32_t i = index->num_segments; i > 0; i--) {
    if(doc_id > index->docid_offsets[i - 1]) {
      // uint64_t is not necessarily unsigned long long: cast to match %llu
      DEBUG("found doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
      RELAY_ERROR(wp_segment_add_label(&index->segments[i - 1], label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
      found = 1;
      break;
    }
    else DEBUG("did not find doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %llu", (unsigned long long)doc_id);

  return NO_ERROR;
}
|
268
|
+
|
269
|
+
// Removes a label from the document with global docid doc_id.
//
// The owning segment is found by scanning from the last segment backwards for
// the first one whose docid offset is below doc_id; the label is then removed
// under the segment-local docid (doc_id minus that offset). Raises an error
// if no segment qualifies (e.g. doc_id is 0) or if the segment call fails.
wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc_id) {
  int found = 0;

  for(uint32_t i = index->num_segments; i > 0; i--) {
    if(doc_id > index->docid_offsets[i - 1]) {
      // uint64_t is not necessarily unsigned long long: cast to match %llu
      DEBUG("found doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
      RELAY_ERROR(wp_segment_remove_label(&index->segments[i - 1], label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
      found = 1;
      break;
    }
    else DEBUG("did not find doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %llu", (unsigned long long)doc_id);

  return NO_ERROR;
}
|
286
|
+
|
287
|
+
// Returns the total number of documents in the index: the sum of
// wp_segment_num_docs() over every segment, visited last to first.
uint64_t wp_index_num_docs(wp_index* index) {
  uint64_t total = 0;
  uint32_t remaining = index->num_segments;

  // TODO: check for overflow
  while(remaining > 0) {
    remaining--;
    total += wp_segment_num_docs(&index->segments[remaining]);
  }

  return total;
}
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#ifndef WP_INDEX_H_
|
2
|
+
#define WP_INDEX_H_
|
3
|
+
|
4
|
+
// whistlepig index
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// the main public interaction point with whistlepig, in addition to the
|
8
|
+
// supporting entry and query objects. it holds a collection of segments and
|
9
|
+
// essentially relays commands to the appropriate ones, creating new segments
|
10
|
+
// as needed.
|
11
|
+
|
12
|
+
#include "defaults.h"
|
13
|
+
#include "segment.h"
|
14
|
+
#include "error.h"
|
15
|
+
#include "entry.h"
|
16
|
+
|
17
|
+
#define WP_MAX_SEGMENTS 65534 // max value of wp_search_query->segment_idx - 2 because we need two special numbers
|
18
|
+
|
19
|
+
// An index: a growable array of segments plus, for each segment, the offset
// that turns its segment-local docids into index-wide docids.
typedef struct wp_index {
  const char *pathname_base;   // segment N lives at "<pathname_base>N"; stored by reference, not copied
  uint16_t num_segments;       // number of segments currently in use
  uint16_t sizeof_segments;    // allocated capacity, in elements, of segments[] and docid_offsets[]
  uint64_t *docid_offsets;     // docid_offsets[i] + a docid local to segment i = global docid
  struct wp_segment *segments; // the segments themselves, oldest first
  uint8_t open;                // non-zero until wp_index_unload has run
} wp_index;
|
27
|
+
|
28
|
+
// API methods
|
29
|
+
|
30
|
+
// public: returns non-zero if an index with base pathname pathname_base
|
31
|
+
// exists, zero otherwise
|
32
|
+
int wp_index_exists(const char* pathname_base);
|
33
|
+
|
34
|
+
// public: creates an index, raising an exception if it already exists
|
35
|
+
wp_error* wp_index_create(wp_index** index, const char* pathname_base) RAISES_ERROR;
|
36
|
+
|
37
|
+
// public: loads an existing index, raising an exception if it doesn't exist
|
38
|
+
wp_error* wp_index_load(wp_index** index, const char* pathname_base) RAISES_ERROR;
|
39
|
+
|
40
|
+
// public: releases an index
|
41
|
+
wp_error* wp_index_unload(wp_index* index) RAISES_ERROR;
|
42
|
+
|
43
|
+
// public: frees all memory. can be called after unload, or not. don't call
|
44
|
+
// anything on the index after calling this, though...
|
45
|
+
wp_error* wp_index_free(wp_index* index) RAISES_ERROR;
|
46
|
+
|
47
|
+
// public: returns the number of documents in the index.
|
48
|
+
uint64_t wp_index_num_docs(wp_index* index);
|
49
|
+
|
50
|
+
// public: initializes a query for use on the index. must be called before
|
51
|
+
// run_query
|
52
|
+
wp_error* wp_index_setup_query(wp_index* index, wp_query* query) RAISES_ERROR;
|
53
|
+
|
54
|
+
// public: tears down a query from use on the index. must be called after
|
55
|
+
// run_query, or memory will leak.
|
56
|
+
wp_error* wp_index_teardown_query(wp_index* index, wp_query* query) RAISES_ERROR;
|
57
|
+
|
58
|
+
// public: runs a query on an index. must be called in between setup_query and
|
59
|
+
// teardown_query. can be called multiple times and the query will be resumed.
|
60
|
+
// when the number of documents returned is < max_num_results, then you're at the
|
61
|
+
// end!
|
62
|
+
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) RAISES_ERROR;
|
63
|
+
|
64
|
+
// public: returns the number of results that match a query. note that this is
|
65
|
+
// roughly as expensive as just running the query completely, modulo some memory
|
66
|
+
// allocations here and there...
|
67
|
+
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) RAISES_ERROR;
|
68
|
+
|
69
|
+
// public: adds an entry to the index. sets doc_id to the new docid.
|
70
|
+
wp_error* wp_index_add_entry(wp_index* index, wp_entry* entry, uint64_t* doc_id) RAISES_ERROR;
|
71
|
+
|
72
|
+
// public: adds a label to a doc_id. throws an exception if the document
|
73
|
+
// doesn't exist. does nothing if the label has already been added to the
|
74
|
+
// document.
|
75
|
+
wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id);
|
76
|
+
|
77
|
+
// public: removes a label from a doc_id. throws an exception if the document
|
78
|
+
// doesn't exist. does nothing if the label has not been added to the
|
79
|
+
// document.
|
80
|
+
wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc_id);
|
81
|
+
|
82
|
+
// dumps information about the index and each of its segments to the stream.
|
83
|
+
wp_error* wp_index_dumpinfo(wp_index* index, FILE* stream) RAISES_ERROR;
|
84
|
+
|
85
|
+
// public: deletes an index (all of its segment files) from disk.
|
86
|
+
wp_error* wp_index_delete(const char* path) RAISES_ERROR;
|
87
|
+
|
88
|
+
#endif
|
@@ -0,0 +1,316 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/*
|
27
|
+
An example:
|
28
|
+
|
29
|
+
#include "khash.h"
|
30
|
+
KHASH_MAP_INIT_INT(32, char)
|
31
|
+
int main() {
|
32
|
+
int ret, is_missing;
|
33
|
+
khiter_t k;
|
34
|
+
khash_t(32) *h = kh_init(32);
|
35
|
+
k = kh_put(32, h, 5, &ret);
|
36
|
+
if (!ret) kh_del(32, h, k);
|
37
|
+
kh_value(h, k) = 10;
|
38
|
+
k = kh_get(32, h, 10);
|
39
|
+
is_missing = (k == kh_end(h));
|
40
|
+
k = kh_get(32, h, 5);
|
41
|
+
kh_del(32, h, k);
|
42
|
+
for (k = kh_begin(h); k != kh_end(h); ++k)
|
43
|
+
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
44
|
+
kh_destroy(32, h);
|
45
|
+
return 0;
|
46
|
+
}
|
47
|
+
*/
|
48
|
+
|
49
|
+
/*
|
50
|
+
2008-09-19 (0.2.3):
|
51
|
+
|
52
|
+
* Corrected the example
|
53
|
+
* Improved interfaces
|
54
|
+
|
55
|
+
2008-09-11 (0.2.2):
|
56
|
+
|
57
|
+
* Improved speed a little in kh_put()
|
58
|
+
|
59
|
+
2008-09-10 (0.2.1):
|
60
|
+
|
61
|
+
* Added kh_clear()
|
62
|
+
* Fixed a compiling error
|
63
|
+
|
64
|
+
2008-09-02 (0.2.0):
|
65
|
+
|
66
|
+
* Changed to token concatenation which increases flexibility.
|
67
|
+
|
68
|
+
2008-08-31 (0.1.2):
|
69
|
+
|
70
|
+
* Fixed a bug in kh_get(), which has not been tested previously.
|
71
|
+
|
72
|
+
2008-08-31 (0.1.1):
|
73
|
+
|
74
|
+
* Added destructor
|
75
|
+
*/
|
76
|
+
|
77
|
+
|
78
|
+
#ifndef __AC_KHASH_H
|
79
|
+
#define __AC_KHASH_H
|
80
|
+
|
81
|
+
#define AC_VERSION_KHASH_H "0.2.2"
|
82
|
+
|
83
|
+
#include <stdint.h>
|
84
|
+
#include <stdlib.h>
|
85
|
+
#include <string.h>
|
86
|
+
|
87
|
+
typedef uint32_t khint_t;
|
88
|
+
typedef khint_t khiter_t;
|
89
|
+
|
90
|
+
/* Table sizes used by the hash: entry 0 is a zero placeholder, and each later
   entry is roughly double the one before it. */
#define __ac_HASH_PRIME_SIZE 32
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = {
  UINT32_C(0),          UINT32_C(3),          UINT32_C(11),         UINT32_C(23),
  UINT32_C(53),         UINT32_C(97),         UINT32_C(193),        UINT32_C(389),
  UINT32_C(769),        UINT32_C(1543),       UINT32_C(3079),       UINT32_C(6151),
  UINT32_C(12289),      UINT32_C(24593),      UINT32_C(49157),      UINT32_C(98317),
  UINT32_C(196613),     UINT32_C(393241),     UINT32_C(786433),     UINT32_C(1572869),
  UINT32_C(3145739),    UINT32_C(6291469),    UINT32_C(12582917),   UINT32_C(25165843),
  UINT32_C(50331653),   UINT32_C(100663319),  UINT32_C(201326611),  UINT32_C(402653189),
  UINT32_C(805306457),  UINT32_C(1610612741), UINT32_C(3221225473), UINT32_C(4294967291)
};
|
101
|
+
|
102
|
+
/* Bucket state flags. Each bucket owns two bits of a uint32_t word, sixteen
   buckets per word: the bit with value 2 means "empty" and the bit with value
   1 means "deleted". Both macro arguments are parenthesized so that
   expression arguments (e.g. `base + 1`) index the right bucket. */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

/* Fraction of the buckets that may be occupied before the table is resized. */
static const double __ac_HASH_UPPER = 0.77;
|
111
|
+
|
112
|
+
/*
 * KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
 *
 * Instantiates an open-addressing hash table type kh_<name>_t with keys of
 * type khkey_t and (when kh_is_map is non-zero) values of type khval_t, plus
 * the functions kh_init_/kh_destroy_/kh_clear_/kh_get_/kh_resize_/kh_put_/
 * kh_del_<name> that operate on it. Probing uses double hashing: the start
 * slot is hash % n_buckets and the step is 1 + hash % (n_buckets - 1).
 *
 * kh_get returns the bucket index of the key, or n_buckets (kh_end) if absent.
 * kh_put returns the bucket index for the key and sets *ret to 0 if the key
 * was already present, 1 if it went into an empty bucket, 2 if it reused a
 * deleted bucket, or -1 if the table needed to grow but could not (out of
 * memory or already at the largest table size); in that last case n_buckets
 * is returned and nothing is inserted.
 * kh_resize leaves the table untouched when it cannot allocate memory.
 */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	typedef struct { \
		khint_t n_buckets, size, n_occupied, upper_bound; \
		uint32_t *flags; \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t; \
	static inline kh_##name##_t *kh_init_##name(void) { \
		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
	} \
	static inline void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			free(h->keys); free(h->flags); \
			free(h->vals); \
			free(h); \
		} \
	} \
	static inline void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			/* 0xaa marks every bucket "empty, not deleted" */ \
			memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	static inline khint_t kh_get_##name(kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t inc, k, i, last; \
			k = __hash_func(key); i = k % h->n_buckets; \
			inc = 1 + k % (h->n_buckets - 1); last = i; \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
				else i += inc; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; \
	} \
	static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ \
		uint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			khint_t t = __ac_HASH_PRIME_SIZE - 1; \
			while (__ac_prime_list[t] > new_n_buckets) --t; \
			/* there is no larger table size to move to: leave the table alone */ \
			if (t + 1 >= __ac_HASH_PRIME_SIZE) return; \
			new_n_buckets = __ac_prime_list[t+1]; \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
			else { \
				new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
				if (!new_flags) return; \
				memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
				if (h->n_buckets < new_n_buckets) { \
					/* grow through temporaries so a failed realloc loses nothing */ \
					khkey_t *new_keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { free(new_flags); return; } \
					h->keys = new_keys; \
					if (kh_is_map) { \
						khval_t *new_vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { free(new_flags); return; } \
						h->vals = new_vals; \
					} \
				} \
			} \
		} \
		if (j) { \
			/* rehash in place: evict each live entry and re-insert it under new_flags */ \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { \
						khint_t inc, k, i; \
						k = __hash_func(key); \
						i = k % new_n_buckets; \
						inc = 1 + k % (new_n_buckets - 1); \
						while (!__ac_isempty(new_flags, i)) { \
							if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
							else i += inc; \
						} \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
							/* the target slot holds a not-yet-moved entry: swap and keep going */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); \
						} else { \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { \
				/* shrinking: if realloc fails the old, larger arrays are still valid */ \
				khkey_t *small_keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (small_keys) h->keys = small_keys; \
				if (kh_is_map) { \
					khval_t *small_vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
					if (small_vals) h->vals = small_vals; \
				} \
			} \
			free(h->flags); \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		} \
	} \
	static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= h->upper_bound) { \
			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
			else kh_resize_##name(h, h->n_buckets + 1); \
			/* a successful resize always brings n_occupied below upper_bound; \
			   if it is still at the limit the resize failed, so don't insert */ \
			if (h->n_occupied >= h->upper_bound) { *ret = -1; return h->n_buckets; } \
		} \
		{ \
			khint_t inc, k, i, site, last; \
			x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
			if (__ac_isempty(h->flags, i)) x = i; \
			else { \
				inc = 1 + k % (h->n_buckets - 1); last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; \
					if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
					else i += inc; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; \
		return x; \
	} \
	static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}
|
256
|
+
|
257
|
+
/* --- BEGIN OF HASH FUNCTIONS --- */
|
258
|
+
|
259
|
+
/* Hash and equality for 32-bit integer keys: the key is its own hash. */
#define kh_int_hash_func(key) (uint32_t)(key)
#define kh_int_hash_equal(a, b) ((a) == (b))
/* Hash and equality for 64-bit integer keys: the key is folded to 32 bits by
   xor-ing it with itself shifted right by 33 and left by 11. */
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_equal(a, b) ((a) == (b))
|
263
|
+
/* Multiply-by-31 string hash: starts from the first character and, for each
   following character c, computes hash = hash * 31 + c. The empty string
   hashes to 0. */
static inline khint_t __ac_X31_hash_string(const char *s)
{
	khint_t hash = *s;
	if (hash == 0) return hash;
	for (const char *p = s + 1; *p; ++p) hash = hash * 31u + *p;
	return hash;
}
/* Hash and equality for NUL-terminated string keys (compared by content). */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
271
|
+
|
272
|
+
/* --- END OF HASH FUNCTIONS --- */
|
273
|
+
|
274
|
+
/* Other necessary macros... */
|
275
|
+
|
276
|
+
/* Name of the table type generated by KHASH_INIT(name, ...). */
#define khash_t(name) kh_##name##_t

/* Front ends that forward to the functions generated for `name`. */
#define kh_init(name)          kh_init_##name()
#define kh_destroy(name, h)    kh_destroy_##name(h)
#define kh_clear(name, h)      kh_clear_##name(h)
#define kh_resize(name, h, s)  kh_resize_##name(h, s)
#define kh_put(name, h, k, r)  kh_put_##name(h, k, r)
#define kh_get(name, h, k)     kh_get_##name(h, k)
#define kh_del(name, h, k)     kh_del_##name(h, k)

/* Accessors. x is a bucket index in [kh_begin(h), kh_end(h)); kh_exist(h, x)
   is true when that bucket is neither empty nor deleted. */
#define kh_exist(h, x)    (!__ac_iseither((h)->flags, (x)))
#define kh_key(h, x)      ((h)->keys[x])
#define kh_val(h, x)      ((h)->vals[x])
#define kh_value(h, x)    ((h)->vals[x])
#define kh_begin(h)       (khint_t)(0)
#define kh_end(h)         ((h)->n_buckets)
#define kh_size(h)        ((h)->size)
#define kh_n_buckets(h)   ((h)->n_buckets)
|
294
|
+
|
295
|
+
/* More convenient interfaces */
|
296
|
+
|
297
|
+
/* Ready-made instantiations of KHASH_INIT. The SET variants pass 0 for
   kh_is_map (with a placeholder `char` value type); the MAP variants pass 1
   and take the value type as khval_t. */

/* uint32_t keys */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/* uint64_t keys */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* NUL-terminated string keys; only the pointer is stored in the table */
typedef const char *kh_cstr_t;
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
315
|
+
|
316
|
+
#endif /* __AC_KHASH_H */
|