whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,294 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <sys/types.h>
|
3
|
+
#include <sys/stat.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include "whistlepig.h"
|
6
|
+
|
7
|
+
#define PATH_BUF_SIZE 4096
|
8
|
+
|
9
|
+
int wp_index_exists(const char* pathname_base) {
|
10
|
+
char buf[PATH_BUF_SIZE];
|
11
|
+
snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
|
12
|
+
return wp_segment_exists(buf);
|
13
|
+
}
|
14
|
+
|
15
|
+
// Creates a new index rooted at pathname_base and returns it through
// *indexptr. The new index holds a single freshly created segment (path
// "<pathname_base>0") whose docid offset is 0.
//
// Raises an error if segment 0 already exists, if memory cannot be
// allocated, or if the segment cannot be created. When a failure happens
// after the existence check, everything allocated here is released and
// *indexptr is set to NULL, so the caller never sees a half-built index.
//
// NOTE: pathname_base is stored by pointer, not copied, so the caller must
// keep that string alive for as long as the index is used.
wp_error* wp_index_create(wp_index** indexptr, const char* pathname_base) {
  char buf[PATH_BUF_SIZE];

  snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
  if(wp_segment_exists(buf)) RAISE_ERROR("index with base path '%s' already exists", pathname_base);

  wp_index* index = malloc(sizeof(wp_index));
  if(index == NULL) {
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  index->pathname_base = pathname_base;
  index->num_segments = 1;
  index->sizeof_segments = 1;
  index->open = 1;
  index->segments = malloc(sizeof(wp_segment));
  index->docid_offsets = malloc(sizeof(uint64_t));
  if((index->segments == NULL) || (index->docid_offsets == NULL)) {
    free(index->segments); // free(NULL) is a no-op, so no need to test each one
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  // capture the error instead of relaying it directly so that we can release
  // our allocations before returning
  wp_error* create_error = wp_segment_create(&index->segments[0], buf);
  if(create_error != NULL) {
    free(index->segments);
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RELAY_ERROR(create_error);
  }

  index->docid_offsets[0] = 0;
  *indexptr = index;

  return NO_ERROR;
}
|
34
|
+
|
35
|
+
// Guarantees that the segments and docid_offsets arrays have room for at
// least one more segment than index->num_segments, growing both (by doubling)
// when they are full.
//
// Raises an error if the index already holds WP_MAX_SEGMENTS segments or if
// memory cannot be allocated. On failure the existing arrays and
// sizeof_segments remain valid and unchanged in meaning.
RAISING_STATIC(ensure_num_segments(wp_index* index)) {
  if(index->num_segments < index->sizeof_segments) return NO_ERROR; // still room

  if(index->num_segments >= WP_MAX_SEGMENTS) RAISE_ERROR("too many segments");

  // do the arithmetic in 32 bits: sizeof_segments is a uint16_t, so doubling
  // 32768 in place would wrap around to 0
  uint32_t new_size = (uint32_t)index->sizeof_segments * 2;
  if(new_size == 0) new_size = 1;
  if(new_size > WP_MAX_SEGMENTS) new_size = WP_MAX_SEGMENTS;

  // realloc into temporaries so that the original arrays are not lost if an
  // allocation fails
  wp_segment* new_segments = realloc(index->segments, sizeof(wp_segment) * new_size);
  if(new_segments == NULL) RAISE_ERROR("oom");
  index->segments = new_segments;

  uint64_t* new_offsets = realloc(index->docid_offsets, sizeof(uint64_t) * new_size);
  if(new_offsets == NULL) RAISE_ERROR("oom");
  index->docid_offsets = new_offsets;

  // only record the new capacity once both arrays really have it
  index->sizeof_segments = (uint16_t)new_size;

  return NO_ERROR;
}
|
45
|
+
|
46
|
+
// Loads an existing index rooted at pathname_base and returns it through
// *indexptr. Segments "<pathname_base>0", "<pathname_base>1", ... are loaded
// in order until one is missing or WP_MAX_SEGMENTS have been loaded, and the
// docid offset of each segment is computed as it is loaded.
//
// Raises an error if segment 0 does not exist, if memory cannot be allocated,
// or if a segment fails to load. If the initial allocations fail, *indexptr
// is set to NULL. If a later step fails, *indexptr still refers to the index
// with num_segments counting only the segments loaded so far.
//
// NOTE: pathname_base is stored by pointer, not copied, so the caller must
// keep that string alive for as long as the index is used.
wp_error* wp_index_load(wp_index** indexptr, const char* pathname_base) {
  char buf[PATH_BUF_SIZE];
  snprintf(buf, PATH_BUF_SIZE, "%s0", pathname_base);
  if(!wp_segment_exists(buf)) RAISE_ERROR("index with base path '%s' does not exist", pathname_base);

  wp_index* index = malloc(sizeof(wp_index));
  if(index == NULL) {
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }

  index->pathname_base = pathname_base;
  index->num_segments = 0;
  index->sizeof_segments = 1;
  index->open = 1;
  index->segments = malloc(sizeof(wp_segment));
  index->docid_offsets = malloc(sizeof(uint64_t));
  if((index->segments == NULL) || (index->docid_offsets == NULL)) {
    free(index->segments); // free(NULL) is a no-op, so no need to test each one
    free(index->docid_offsets);
    free(index);
    *indexptr = NULL;
    RAISE_ERROR("oom");
  }
  *indexptr = index;

  // load all the segments we can
  while(index->num_segments < WP_MAX_SEGMENTS) {
    snprintf(buf, PATH_BUF_SIZE, "%s%d", pathname_base, index->num_segments);
    if(!wp_segment_exists(buf)) break;

    RELAY_ERROR(ensure_num_segments(index));
    DEBUG("loading segment %s", buf);
    RELAY_ERROR(wp_segment_load(&index->segments[index->num_segments], buf));
    if(index->num_segments == 0)
      index->docid_offsets[index->num_segments] = 0;
    else {
      // segments return docids 1 through N, so the num_docs in a segment is
      // also the max document id
      postings_region* prevpr = MMAP_OBJ(index->segments[index->num_segments - 1].postings, postings_region);
      index->docid_offsets[index->num_segments] = prevpr->num_docs + index->docid_offsets[index->num_segments - 1];
    }

    index->num_segments++;
  }

  return NO_ERROR;
}
|
82
|
+
|
83
|
+
// we have two special values at our disposal to mark where we are in
|
84
|
+
// the sequence of segments
|
85
|
+
#define SEGMENT_UNINITIALIZED WP_MAX_SEGMENTS
|
86
|
+
#define SEGMENT_DONE (WP_MAX_SEGMENTS + 1)
|
87
|
+
|
88
|
+
// Prepares a query for use with wp_index_run_query by marking it as not yet
// attached to any segment. The index argument is accepted for API symmetry
// but is not consulted. Always returns NO_ERROR.
wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
  query->segment_idx = SEGMENT_UNINITIALIZED;

  (void)index; // unused

  return NO_ERROR;
}
|
94
|
+
|
95
|
+
// can be called multiple times to resume
|
96
|
+
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
|
97
|
+
*num_results = 0;
|
98
|
+
if(index->num_segments == 0) return NO_ERROR;
|
99
|
+
|
100
|
+
if(query->segment_idx == SEGMENT_UNINITIALIZED) {
|
101
|
+
query->segment_idx = index->num_segments - 1;
|
102
|
+
DEBUG("setting up segment %u", query->segment_idx);
|
103
|
+
RELAY_ERROR(wp_search_init_search_state(query, &index->segments[query->segment_idx]));
|
104
|
+
}
|
105
|
+
|
106
|
+
// at this point, we assume we're initialized and query->segment_idx is the index
|
107
|
+
// of the segment we're searching against
|
108
|
+
while((*num_results < max_num_results) && (query->segment_idx != SEGMENT_DONE)) {
|
109
|
+
uint32_t want_num_results = max_num_results - *num_results;
|
110
|
+
uint32_t got_num_results = 0;
|
111
|
+
search_result* segment_results = malloc(sizeof(search_result) * want_num_results);
|
112
|
+
|
113
|
+
DEBUG("searching segment %d", query->segment_idx);
|
114
|
+
RELAY_ERROR(wp_search_run_query_on_segment(query, &index->segments[query->segment_idx], want_num_results, &got_num_results, segment_results));
|
115
|
+
DEBUG("asked segment %d for %d results, got %d", query->segment_idx, want_num_results, got_num_results);
|
116
|
+
|
117
|
+
// extract the per-segment docids from the search results and adjust by
|
118
|
+
// each segment's docid offset to form global docids
|
119
|
+
for(uint32_t i = 0; i < got_num_results; i++) {
|
120
|
+
results[*num_results + i] = index->docid_offsets[query->segment_idx] + segment_results[i].doc_id;
|
121
|
+
wp_search_result_free(&segment_results[i]);
|
122
|
+
}
|
123
|
+
free(segment_results);
|
124
|
+
*num_results += got_num_results;
|
125
|
+
|
126
|
+
if(got_num_results < want_num_results) { // this segment is finished; move to the next one
|
127
|
+
DEBUG("releasing index %d", query->segment_idx);
|
128
|
+
RELAY_ERROR(wp_search_release_search_state(query));
|
129
|
+
if(query->segment_idx > 0) {
|
130
|
+
query->segment_idx--;
|
131
|
+
DEBUG("setting up index %d", query->segment_idx);
|
132
|
+
RELAY_ERROR(wp_search_init_search_state(query, &index->segments[query->segment_idx]));
|
133
|
+
}
|
134
|
+
else query->segment_idx = SEGMENT_DONE;
|
135
|
+
}
|
136
|
+
}
|
137
|
+
|
138
|
+
return NO_ERROR;
|
139
|
+
}
|
140
|
+
|
141
|
+
#define RESULT_BUF_SIZE 1024
|
142
|
+
// count the results by just running the query until it stops. slow!
|
143
|
+
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
|
144
|
+
uint64_t results[RESULT_BUF_SIZE];
|
145
|
+
|
146
|
+
*num_results = 0;
|
147
|
+
RELAY_ERROR(wp_index_setup_query(index, query));
|
148
|
+
while(1) {
|
149
|
+
uint32_t this_num_results;
|
150
|
+
RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
|
151
|
+
*num_results += this_num_results;
|
152
|
+
if(this_num_results < RESULT_BUF_SIZE) break; // done
|
153
|
+
}
|
154
|
+
|
155
|
+
RELAY_ERROR(wp_index_teardown_query(index, query));
|
156
|
+
|
157
|
+
return NO_ERROR;
|
158
|
+
}
|
159
|
+
|
160
|
+
// Detaches a query from the index. If the query is currently positioned on a
// real segment (neither of the two sentinel values), its search state is
// released first. In all cases the query is reset to SEGMENT_UNINITIALIZED.
// The index argument is not consulted.
wp_error* wp_index_teardown_query(wp_index* index, wp_query* query) {
  (void)index;

  int is_unstarted = (query->segment_idx == SEGMENT_UNINITIALIZED);
  int is_finished = (query->segment_idx == SEGMENT_DONE);

  if(!(is_unstarted || is_finished)) {
    RELAY_ERROR(wp_search_release_search_state(query));
  }

  query->segment_idx = SEGMENT_UNINITIALIZED;
  return NO_ERROR;
}
|
169
|
+
|
170
|
+
// Adds an entry to the index and stores its global docid in *doc_id.
//
// The entry goes into the last segment if it fits. Otherwise a new segment is
// created at "<pathname_base><num_segments>", its docid offset is derived
// from the previous segment, and the entry is written there.
//
// Raises an error if the index has no segments, if a new segment is needed
// but the index already holds WP_MAX_SEGMENTS, if the new segment's path does
// not fit in PATH_BUF_SIZE, if the entry does not fit even in a fresh
// segment, or if any segment operation fails.
wp_error* wp_index_add_entry(wp_index* index, wp_entry* entry, uint64_t* doc_id) {
  int success;

  // without this guard the expression below would index segments[-1]
  if(index->num_segments == 0) RAISE_ERROR("index has no segments");
  wp_segment* seg = &index->segments[index->num_segments - 1];

  // first, ensure we have enough space in the current segment
  uint32_t postings_bytes;
  RELAY_ERROR(wp_entry_sizeof_postings_region(entry, seg, &postings_bytes));
  RELAY_ERROR(wp_segment_ensure_fit(seg, postings_bytes, 0, &success));

  // if not, we need to open a new one
  if(!success) {
    DEBUG("segment %d is full, loading a new one", index->num_segments - 1);

    // segment indexes at or above WP_MAX_SEGMENTS are reserved as query
    // sentinel values, so the segment count must stay at or below that limit
    if(index->num_segments >= WP_MAX_SEGMENTS) RAISE_ERROR("index already has the maximum number of segments (%d)", WP_MAX_SEGMENTS);

    char buf[PATH_BUF_SIZE];
    int path_len = snprintf(buf, PATH_BUF_SIZE, "%s%d", index->pathname_base, index->num_segments);
    // a truncated path would name the wrong file
    if((path_len < 0) || (path_len >= PATH_BUF_SIZE)) RAISE_ERROR("segment path for base path '%s' is too long", index->pathname_base);

    RELAY_ERROR(ensure_num_segments(index));
    RELAY_ERROR(wp_segment_create(&index->segments[index->num_segments], buf));
    index->num_segments++;

    // set the docid_offset
    postings_region* prevpr = MMAP_OBJ(index->segments[index->num_segments - 2].postings, postings_region);
    index->docid_offsets[index->num_segments - 1] = prevpr->num_docs + index->docid_offsets[index->num_segments - 2];

    seg = &index->segments[index->num_segments - 1];
    DEBUG("loaded new segment %d at %p", index->num_segments - 1, &index->segments[index->num_segments - 1]);

    RELAY_ERROR(wp_entry_sizeof_postings_region(entry, seg, &postings_bytes));
    RELAY_ERROR(wp_segment_ensure_fit(seg, postings_bytes, 0, &success));
    if(!success) RAISE_ERROR("can't fit new entry into fresh segment. that's crazy");
  }

  docid_t seg_doc_id;
  RELAY_ERROR(wp_segment_grab_docid(seg, &seg_doc_id));
  RELAY_ERROR(wp_entry_write_to_segment(entry, seg, seg_doc_id));
  // global docid = segment-local docid + this segment's offset
  *doc_id = seg_doc_id + index->docid_offsets[index->num_segments - 1];

  return NO_ERROR;
}
|
207
|
+
|
208
|
+
// Unloads every segment of the index, in order, and then marks the index as
// closed. Stops and relays the error as soon as a segment fails to unload, in
// which case the index stays marked as open.
wp_error* wp_index_unload(wp_index* index) {
  uint16_t seg_idx = 0;

  while(seg_idx < index->num_segments) {
    RELAY_ERROR(wp_segment_unload(&index->segments[seg_idx]));
    seg_idx++;
  }

  index->open = 0;
  return NO_ERROR;
}
|
214
|
+
|
215
|
+
// Releases all memory owned by the index, unloading it first if it is still
// open. If unloading fails the error is relayed and nothing is freed. The
// index pointer must not be used after a successful call.
wp_error* wp_index_free(wp_index* index) {
  if(index->open) {
    RELAY_ERROR(wp_index_unload(index));
  }

  free(index->docid_offsets);
  free(index->segments);
  free(index);

  return NO_ERROR;
}
|
223
|
+
|
224
|
+
// Writes a human-readable description of the index to stream: the segment
// count, followed by a heading and wp_segment_dumpinfo output for each
// segment in order.
wp_error* wp_index_dumpinfo(wp_index* index, FILE* stream) {
  fprintf(stream, "index has %d segments\n", index->num_segments);

  int seg_no = 0;
  while(seg_no < index->num_segments) {
    fprintf(stream, "\nsegment %d:\n", seg_no);
    RELAY_ERROR(wp_segment_dumpinfo(&index->segments[seg_no], stream));
    seg_no++;
  }

  return NO_ERROR;
}
|
233
|
+
|
234
|
+
wp_error* wp_index_delete(const char* pathname_base) {
|
235
|
+
char buf[PATH_BUF_SIZE];
|
236
|
+
|
237
|
+
int i = 0;
|
238
|
+
while(1) {
|
239
|
+
snprintf(buf, PATH_BUF_SIZE, "%s%d", pathname_base, i);
|
240
|
+
if(wp_segment_exists(buf)) {
|
241
|
+
DEBUG("deleting segment %s", buf);
|
242
|
+
RELAY_ERROR(wp_segment_delete(buf));
|
243
|
+
i++;
|
244
|
+
}
|
245
|
+
else break;
|
246
|
+
}
|
247
|
+
|
248
|
+
return NO_ERROR;
|
249
|
+
}
|
250
|
+
|
251
|
+
// Adds a label to the document with global docid doc_id.
//
// The owning segment is found by walking the segments from last to first and
// picking the first one whose docid offset is strictly below doc_id; the
// label is then added there under the segment-local docid (doc_id minus that
// offset). Raises an error if no segment qualifies (e.g. doc_id is 0) or if
// the segment call fails.
wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id) {
  int found = 0;

  for(uint32_t i = index->num_segments; i > 0; i--) {
    if(doc_id > index->docid_offsets[i - 1]) {
      // %llu requires an unsigned long long argument; uint64_t may be a different type
      DEBUG("found doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
      RELAY_ERROR(wp_segment_add_label(&index->segments[i - 1], label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
      found = 1;
      break;
    }
    else DEBUG("did not find doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %llu", (unsigned long long)doc_id);

  return NO_ERROR;
}
|
268
|
+
|
269
|
+
// Removes a label from the document with global docid doc_id.
//
// The owning segment is found by walking the segments from last to first and
// picking the first one whose docid offset is strictly below doc_id; the
// label is then removed there under the segment-local docid (doc_id minus
// that offset). Raises an error if no segment qualifies (e.g. doc_id is 0) or
// if the segment call fails.
wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc_id) {
  int found = 0;

  for(uint32_t i = index->num_segments; i > 0; i--) {
    if(doc_id > index->docid_offsets[i - 1]) {
      // %llu requires an unsigned long long argument; uint64_t may be a different type
      DEBUG("found doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
      RELAY_ERROR(wp_segment_remove_label(&index->segments[i - 1], label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
      found = 1;
      break;
    }
    else DEBUG("did not find doc %llu in segment %u", (unsigned long long)doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %llu", (unsigned long long)doc_id);

  return NO_ERROR;
}
|
286
|
+
|
287
|
+
// Returns the total number of documents in the index, i.e. the sum of
// wp_segment_num_docs over all of its segments. No overflow check is made on
// the sum.
uint64_t wp_index_num_docs(wp_index* index) {
  uint64_t total = 0;
  uint32_t remaining = index->num_segments;

  // TODO check for overflow or some shit
  while(remaining > 0) {
    total += wp_segment_num_docs(&index->segments[remaining - 1]);
    remaining--;
  }

  return total;
}
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#ifndef WP_INDEX_H_
|
2
|
+
#define WP_INDEX_H_
|
3
|
+
|
4
|
+
// whistlepig index
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// the main public interaction point with whistlepig, in addition to the
|
8
|
+
// supporting entry and query objects. it holds a collection of segments and
|
9
|
+
// essentially relays commands to the appropriate ones, creating new segments
|
10
|
+
// as needed.
|
11
|
+
|
12
|
+
#include "defaults.h"
|
13
|
+
#include "segment.h"
|
14
|
+
#include "error.h"
|
15
|
+
#include "entry.h"
|
16
|
+
|
17
|
+
#define WP_MAX_SEGMENTS 65534 // max value of wp_search_query->segment_idx - 2 because we need two special numbers
|
18
|
+
|
19
|
+
// An index: a growable array of segments plus, for each segment, the offset
// that converts its segment-local docids into index-wide (global) docids.
typedef struct wp_index {
|
20
|
+
const char* pathname_base; // base path; segment N lives at this path with N (decimal) appended. stored by pointer, not copied
|
21
|
+
uint16_t num_segments; // number of segments currently in use
|
22
|
+
uint16_t sizeof_segments; // allocated capacity, in elements, of both segments and docid_offsets
|
23
|
+
uint64_t* docid_offsets; // docid_offsets[i] is added to segment i's local docids to form global docids
|
24
|
+
struct wp_segment* segments; // the segments; new entries are written to the last one
|
25
|
+
uint8_t open; // set to 1 on create/load, cleared to 0 by wp_index_unload
|
26
|
+
} wp_index;
|
27
|
+
|
28
|
+
// API methods
|
29
|
+
|
30
|
+
// public: returns non-zero if an index with base pathname pathname_base
|
31
|
+
// exists, zero otherwise
|
32
|
+
int wp_index_exists(const char* pathname_base);
|
33
|
+
|
34
|
+
// public: creates an index, raising an exception if it already exists
|
35
|
+
wp_error* wp_index_create(wp_index** index, const char* pathname_base) RAISES_ERROR;
|
36
|
+
|
37
|
+
// public: loads an existing index, raising an exception if it doesn't exist
|
38
|
+
wp_error* wp_index_load(wp_index** index, const char* pathname_base) RAISES_ERROR;
|
39
|
+
|
40
|
+
// public: releases an index
|
41
|
+
wp_error* wp_index_unload(wp_index* index) RAISES_ERROR;
|
42
|
+
|
43
|
+
// public: frees all memory. can be called after unload, or not. don't call
|
44
|
+
// anything on the index after calling this, though...
|
45
|
+
wp_error* wp_index_free(wp_index* index) RAISES_ERROR;
|
46
|
+
|
47
|
+
// public: returns the number of documents in the index.
|
48
|
+
uint64_t wp_index_num_docs(wp_index* index);
|
49
|
+
|
50
|
+
// public: initializes a query for use on the index. must be called before
|
51
|
+
// run_query
|
52
|
+
wp_error* wp_index_setup_query(wp_index* index, wp_query* query) RAISES_ERROR;
|
53
|
+
|
54
|
+
// public: tears down a query from use on the index. must be called after
|
55
|
+
// run_query, or memory will leak.
|
56
|
+
wp_error* wp_index_teardown_query(wp_index* index, wp_query* query) RAISES_ERROR;
|
57
|
+
|
58
|
+
// public: runs a query on an index. must be called in between setup_query and
|
59
|
+
// teardown_query. can be called multiple times and the query will be resumed.
|
60
|
+
// when the number of documents returned is < max_num_results, then you're at the
|
61
|
+
// end!
|
62
|
+
wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) RAISES_ERROR;
|
63
|
+
|
64
|
+
// public: returns the number of results that match a query. note that this is
|
65
|
+
// roughly as expensive as just running the query completely, modulo some memory
|
66
|
+
// allocations here and there...
|
67
|
+
wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) RAISES_ERROR;
|
68
|
+
|
69
|
+
// public: adds an entry to the index. sets doc_id to the new docid.
|
70
|
+
wp_error* wp_index_add_entry(wp_index* index, wp_entry* entry, uint64_t* doc_id) RAISES_ERROR;
|
71
|
+
|
72
|
+
// public: adds a label to a doc_id. throws an exception if the document
|
73
|
+
// doesn't exist. does nothing if the label has already been added to the
|
74
|
+
// document.
|
75
|
+
wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id);
|
76
|
+
|
77
|
+
// public: removes a label from a doc_id. throws an exception if the document
|
78
|
+
// doesn't exist. does nothing if the label has not been added to the
|
79
|
+
// document.
|
80
|
+
wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc_id);
|
81
|
+
|
82
|
+
// dumps some information about the index to the stream.
|
83
|
+
wp_error* wp_index_dumpinfo(wp_index* index, FILE* stream) RAISES_ERROR;
|
84
|
+
|
85
|
+
// public: deletes an index (all of its segments) from disk.
|
86
|
+
wp_error* wp_index_delete(const char* path) RAISES_ERROR;
|
87
|
+
|
88
|
+
#endif
|
@@ -0,0 +1,316 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/*
|
27
|
+
An example:
|
28
|
+
|
29
|
+
#include "khash.h"
|
30
|
+
KHASH_MAP_INIT_INT(32, char)
|
31
|
+
int main() {
|
32
|
+
int ret, is_missing;
|
33
|
+
khiter_t k;
|
34
|
+
khash_t(32) *h = kh_init(32);
|
35
|
+
k = kh_put(32, h, 5, &ret);
|
36
|
+
if (!ret) kh_del(32, h, k);
|
37
|
+
kh_value(h, k) = 10;
|
38
|
+
k = kh_get(32, h, 10);
|
39
|
+
is_missing = (k == kh_end(h));
|
40
|
+
k = kh_get(32, h, 5);
|
41
|
+
kh_del(32, h, k);
|
42
|
+
for (k = kh_begin(h); k != kh_end(h); ++k)
|
43
|
+
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
44
|
+
kh_destroy(32, h);
|
45
|
+
return 0;
|
46
|
+
}
|
47
|
+
*/
|
48
|
+
|
49
|
+
/*
|
50
|
+
2008-09-19 (0.2.3):
|
51
|
+
|
52
|
+
* Corrected the example
|
53
|
+
* Improved interfaces
|
54
|
+
|
55
|
+
2008-09-11 (0.2.2):
|
56
|
+
|
57
|
+
* Improved speed a little in kh_put()
|
58
|
+
|
59
|
+
2008-09-10 (0.2.1):
|
60
|
+
|
61
|
+
* Added kh_clear()
|
62
|
+
* Fixed a compiling error
|
63
|
+
|
64
|
+
2008-09-02 (0.2.0):
|
65
|
+
|
66
|
+
* Changed to token concatenation which increases flexibility.
|
67
|
+
|
68
|
+
2008-08-31 (0.1.2):
|
69
|
+
|
70
|
+
* Fixed a bug in kh_get(), which has not been tested previously.
|
71
|
+
|
72
|
+
2008-08-31 (0.1.1):
|
73
|
+
|
74
|
+
* Added destructor
|
75
|
+
*/
|
76
|
+
|
77
|
+
|
78
|
+
#ifndef __AC_KHASH_H
|
79
|
+
#define __AC_KHASH_H
|
80
|
+
|
81
|
+
#define AC_VERSION_KHASH_H "0.2.2"
|
82
|
+
|
83
|
+
#include <stdint.h>
|
84
|
+
#include <stdlib.h>
|
85
|
+
#include <string.h>
|
86
|
+
|
87
|
+
typedef uint32_t khint_t;
|
88
|
+
typedef khint_t khiter_t;
|
89
|
+
|
90
|
+
/* Table of candidate bucket counts, in increasing order. kh_resize rounds a
   requested size to an entry of this table: it scans down from the last
   entry to the largest one that does not exceed the request and then uses
   the entry after it. The leading 0 entry stops that downward scan. */
#define __ac_HASH_PRIME_SIZE 32
|
91
|
+
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
|
92
|
+
{
|
93
|
+
0ul, 3ul, 11ul, 23ul, 53ul,
|
94
|
+
97ul, 193ul, 389ul, 769ul, 1543ul,
|
95
|
+
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
|
96
|
+
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
|
97
|
+
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
|
98
|
+
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
|
99
|
+
3221225473ul, 4294967291ul
|
100
|
+
};
|
101
|
+
|
102
|
+
/* Per-bucket status flags. Each bucket i owns two bits of the flag array:
   word flag[i>>4] (16 buckets per 32-bit word), shifted by (i&0xf)<<1.
   The higher bit of the pair (value 2) means "empty" and the lower bit
   (value 1) means "deleted"; a bucket holding a live key has both clear.
   The isempty/isdel/iseither macros test those bits, and the set_* macros
   clear or set them. The macro arguments are not parenthesized, so pass
   only simple expressions. */
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
|
103
|
+
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
|
104
|
+
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
|
105
|
+
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
|
106
|
+
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
|
107
|
+
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
|
108
|
+
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
|
109
|
+
|
110
|
+
/* Load-factor threshold: kh_resize sets upper_bound to n_buckets times this
   value (rounded), and kh_put resizes once n_occupied reaches upper_bound. */
static const double __ac_HASH_UPPER = 0.77;
|
111
|
+
|
112
|
+
/* KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

   Expands to the definition of a hash table type kh_<name>_t and the static
   inline functions that operate on it:

     kh_init_<name>()           allocate a zeroed (empty, 0-bucket) table
     kh_destroy_<name>(h)       free keys, flags, vals and the table; NULL is ok
     kh_clear_<name>(h)         mark every bucket empty without freeing memory
     kh_get_<name>(h, key)      bucket index of key, or h->n_buckets if absent
                                (returns 0 for a 0-bucket table, which is also
                                its n_buckets)
     kh_resize_<name>(h, n)     rehash into a bucket count taken from
                                __ac_prime_list; does nothing if the current
                                size would exceed the new table's load limit
     kh_put_<name>(h, key, ret) bucket index for key, inserting it if absent;
                                *ret is 1 if an empty bucket was used, 2 if a
                                deleted bucket was reused, 0 if key was
                                already present
     kh_del_<name>(h, x)        mark bucket x deleted (no memory is released)

   Collisions are resolved by open addressing: probing starts at
   hash % n_buckets and advances by 1 + hash % (n_buckets - 1), wrapping
   around. When kh_is_map is 0 the vals array is never allocated.

   NOTE(review): the malloc/realloc results in kh_resize are not checked, and
   a request above the last __ac_prime_list entry would read past the end of
   that table. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
113
|
+
typedef struct { \
|
114
|
+
khint_t n_buckets, size, n_occupied, upper_bound; \
|
115
|
+
uint32_t *flags; \
|
116
|
+
khkey_t *keys; \
|
117
|
+
khval_t *vals; \
|
118
|
+
} kh_##name##_t; \
|
119
|
+
static inline kh_##name##_t *kh_init_##name() { \
|
120
|
+
return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
|
121
|
+
} \
|
122
|
+
static inline void kh_destroy_##name(kh_##name##_t *h) \
|
123
|
+
{ \
|
124
|
+
if (h) { \
|
125
|
+
free(h->keys); free(h->flags); \
|
126
|
+
free(h->vals); \
|
127
|
+
free(h); \
|
128
|
+
} \
|
129
|
+
} \
|
130
|
+
static inline void kh_clear_##name(kh_##name##_t *h) \
|
131
|
+
{ \
|
132
|
+
if (h && h->flags) { \
|
133
|
+
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
134
|
+
h->size = h->n_occupied = 0; \
|
135
|
+
} \
|
136
|
+
} \
|
137
|
+
static inline khint_t kh_get_##name(kh_##name##_t *h, khkey_t key) \
|
138
|
+
{ \
|
139
|
+
if (h->n_buckets) { \
|
140
|
+
khint_t inc, k, i, last; \
|
141
|
+
k = __hash_func(key); i = k % h->n_buckets; \
|
142
|
+
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
143
|
+
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
144
|
+
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
145
|
+
else i += inc; \
|
146
|
+
if (i == last) return h->n_buckets; \
|
147
|
+
} \
|
148
|
+
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
|
149
|
+
} else return 0; \
|
150
|
+
} \
|
151
|
+
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
|
152
|
+
{ \
|
153
|
+
uint32_t *new_flags = 0; \
|
154
|
+
khint_t j = 1; \
|
155
|
+
{ \
|
156
|
+
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
|
157
|
+
while (__ac_prime_list[t] > new_n_buckets) --t; \
|
158
|
+
new_n_buckets = __ac_prime_list[t+1]; \
|
159
|
+
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
|
160
|
+
else { \
|
161
|
+
new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
162
|
+
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
163
|
+
if (h->n_buckets < new_n_buckets) { \
|
164
|
+
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
165
|
+
if (kh_is_map) \
|
166
|
+
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
167
|
+
} \
|
168
|
+
} \
|
169
|
+
} \
|
170
|
+
if (j) { \
|
171
|
+
for (j = 0; j != h->n_buckets; ++j) { \
|
172
|
+
if (__ac_iseither(h->flags, j) == 0) { \
|
173
|
+
khkey_t key = h->keys[j]; \
|
174
|
+
khval_t val; \
|
175
|
+
if (kh_is_map) val = h->vals[j]; \
|
176
|
+
__ac_set_isdel_true(h->flags, j); \
|
177
|
+
while (1) { \
|
178
|
+
khint_t inc, k, i; \
|
179
|
+
k = __hash_func(key); \
|
180
|
+
i = k % new_n_buckets; \
|
181
|
+
inc = 1 + k % (new_n_buckets - 1); \
|
182
|
+
while (!__ac_isempty(new_flags, i)) { \
|
183
|
+
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
|
184
|
+
else i += inc; \
|
185
|
+
} \
|
186
|
+
__ac_set_isempty_false(new_flags, i); \
|
187
|
+
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
|
188
|
+
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
|
189
|
+
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
|
190
|
+
__ac_set_isdel_true(h->flags, i); \
|
191
|
+
} else { \
|
192
|
+
h->keys[i] = key; \
|
193
|
+
if (kh_is_map) h->vals[i] = val; \
|
194
|
+
break; \
|
195
|
+
} \
|
196
|
+
} \
|
197
|
+
} \
|
198
|
+
} \
|
199
|
+
if (h->n_buckets > new_n_buckets) { \
|
200
|
+
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
201
|
+
if (kh_is_map) \
|
202
|
+
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
203
|
+
} \
|
204
|
+
free(h->flags); \
|
205
|
+
h->flags = new_flags; \
|
206
|
+
h->n_buckets = new_n_buckets; \
|
207
|
+
h->n_occupied = h->size; \
|
208
|
+
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
|
209
|
+
} \
|
210
|
+
} \
|
211
|
+
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
|
212
|
+
{ \
|
213
|
+
khint_t x; \
|
214
|
+
if (h->n_occupied >= h->upper_bound) { \
|
215
|
+
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
|
216
|
+
else kh_resize_##name(h, h->n_buckets + 1); \
|
217
|
+
} \
|
218
|
+
{ \
|
219
|
+
khint_t inc, k, i, site, last; \
|
220
|
+
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
|
221
|
+
if (__ac_isempty(h->flags, i)) x = i; \
|
222
|
+
else { \
|
223
|
+
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
224
|
+
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
225
|
+
if (__ac_isdel(h->flags, i)) site = i; \
|
226
|
+
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
227
|
+
else i += inc; \
|
228
|
+
if (i == last) { x = site; break; } \
|
229
|
+
} \
|
230
|
+
if (x == h->n_buckets) { \
|
231
|
+
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
|
232
|
+
else x = i; \
|
233
|
+
} \
|
234
|
+
} \
|
235
|
+
} \
|
236
|
+
if (__ac_isempty(h->flags, x)) { \
|
237
|
+
h->keys[x] = key; \
|
238
|
+
__ac_set_isboth_false(h->flags, x); \
|
239
|
+
++h->size; ++h->n_occupied; \
|
240
|
+
*ret = 1; \
|
241
|
+
} else if (__ac_isdel(h->flags, x)) { \
|
242
|
+
h->keys[x] = key; \
|
243
|
+
__ac_set_isboth_false(h->flags, x); \
|
244
|
+
++h->size; \
|
245
|
+
*ret = 2; \
|
246
|
+
} else *ret = 0; \
|
247
|
+
return x; \
|
248
|
+
} \
|
249
|
+
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
|
250
|
+
{ \
|
251
|
+
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
|
252
|
+
__ac_set_isdel_true(h->flags, x); \
|
253
|
+
--h->size; \
|
254
|
+
} \
|
255
|
+
}
|
256
|
+
|
257
|
+
/* --- BEGIN OF HASH FUNCTIONS --- */
|
258
|
+
|
259
|
+
/* Hash and equality for integer keys. A 32-bit key is its own hash (the key
   cast to uint32_t); a 64-bit key is folded to 32 bits by XOR-ing it with
   shifted copies of itself. Equality is plain ==. */
#define kh_int_hash_func(key) (uint32_t)(key)
|
260
|
+
#define kh_int_hash_equal(a, b) (a == b)
|
261
|
+
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
|
262
|
+
#define kh_int64_hash_equal(a, b) (a == b)
|
263
|
+
/* Hashes a NUL-terminated string: the hash starts as the first character and,
   for every following character c, becomes hash * 31 + c (written as a shift
   and subtract), wrapping in khint_t arithmetic. The empty string hashes to
   0. s must not be NULL. */
static inline khint_t __ac_X31_hash_string(const char *s)
{
	khint_t hash = *s;
	const char *p;

	if (hash == 0) return hash; /* empty string */

	p = s + 1;
	while (*p) {
		hash = (hash << 5) - hash + *p;
		++p;
	}
	return hash;
}
|
269
|
+
/* Hash and equality for C-string keys: hash with __ac_X31_hash_string and
   compare contents with strcmp (not by pointer). */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
|
270
|
+
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
271
|
+
|
272
|
+
/* --- END OF HASH FUNCTIONS --- */
|
273
|
+
|
274
|
+
/* Other necessary macros... */
|
275
|
+
|
276
|
+
/* Name-dispatching wrappers: each expands to the type or function that
   KHASH_INIT generated for the given name. */
#define khash_t(name) kh_##name##_t
|
277
|
+
|
278
|
+
#define kh_init(name) kh_init_##name()
|
279
|
+
#define kh_destroy(name, h) kh_destroy_##name(h)
|
280
|
+
#define kh_clear(name, h) kh_clear_##name(h)
|
281
|
+
#define kh_resize(name, h, s) kh_resize_##name(h, s)
|
282
|
+
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
|
283
|
+
#define kh_get(name, h, k) kh_get_##name(h, k)
|
284
|
+
#define kh_del(name, h, k) kh_del_##name(h, k)
|
285
|
+
|
286
|
+
/* Accessors on a table h and bucket index x. kh_exist is true when the bucket
   is neither empty nor deleted. Buckets are numbered kh_begin (0) up to, but
   not including, kh_end (n_buckets); kh_size is the number of live keys. */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
|
287
|
+
#define kh_key(h, x) ((h)->keys[x])
|
288
|
+
#define kh_val(h, x) ((h)->vals[x])
|
289
|
+
#define kh_value(h, x) ((h)->vals[x])
|
290
|
+
#define kh_begin(h) (khint_t)(0)
|
291
|
+
#define kh_end(h) ((h)->n_buckets)
|
292
|
+
#define kh_size(h) ((h)->size)
|
293
|
+
#define kh_n_buckets(h) ((h)->n_buckets)
|
294
|
+
|
295
|
+
/* More convenient interfaces */
|
296
|
+
|
297
|
+
/* Ready-made KHASH_INIT instantiations for uint32_t, uint64_t and C-string
   keys. The SET variants pass kh_is_map = 0 with a placeholder char value
   type, so no value array is allocated; the MAP variants pass kh_is_map = 1
   and take the value type as khval_t. */
#define KHASH_SET_INIT_INT(name) \
|
298
|
+
KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
299
|
+
|
300
|
+
#define KHASH_MAP_INIT_INT(name, khval_t) \
|
301
|
+
KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
302
|
+
|
303
|
+
#define KHASH_SET_INIT_INT64(name) \
|
304
|
+
KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
305
|
+
|
306
|
+
#define KHASH_MAP_INIT_INT64(name, khval_t) \
|
307
|
+
KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
308
|
+
|
309
|
+
/* Key type for the string-keyed tables; keys are stored as pointers. */
typedef const char *kh_cstr_t;
|
310
|
+
#define KHASH_SET_INIT_STR(name) \
|
311
|
+
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
312
|
+
|
313
|
+
#define KHASH_MAP_INIT_STR(name, khval_t) \
|
314
|
+
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
315
|
+
|
316
|
+
#endif /* __AC_KHASH_H */
|