filedictrb 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/setup +1 -0
- data/ext/filedict/filedict.h +417 -0
- data/filedictrb.gemspec +1 -0
- data/lib/filedict/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64d09435d5e09d917e12a06932a75ddf365b081a9caa1253a3b11a53a235196c
|
4
|
+
data.tar.gz: 4cd1fedaa0035da30c2d9572eb28dd190cab901803089b164c36ba4d5776d36c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d232d34e3b99221baf055c56ca672b2536d910103a3a4dd6bab1be9005270db86cda72690f1dd56ed230fb0214ef4cecae45a9f8ae5931b7ebcb3288df8492e2
|
7
|
+
data.tar.gz: de03f2dc36efad352a54f647f3275cc15bae641799e3b2ca258cdbb08100103493ac35e2c37e327f7f39feb2dc92eb80af6fb3c4cd645fad8794612ee0cffcfc
|
data/bin/setup
CHANGED
@@ -0,0 +1,417 @@
|
|
1
|
+
#ifndef FILEDICT_H
|
2
|
+
#define FILEDICT_H 1
|
3
|
+
|
4
|
+
#ifndef FILEDICT_KEY_SIZE
|
5
|
+
#define FILEDICT_KEY_SIZE 256
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#ifndef FILEDICT_VALUE_SIZE
|
9
|
+
#define FILEDICT_VALUE_SIZE 256
|
10
|
+
#endif
|
11
|
+
|
12
|
+
typedef struct filedict_bucket_entry_t { /* One key slot inside a bucket: a fixed-size key plus a fixed-size value buffer. */
|
13
|
+
char key[FILEDICT_KEY_SIZE]; /* Key bytes; the insert code treats key[0] == 0 as "slot unused" and compares keys with strncmp over FILEDICT_KEY_SIZE. */
|
14
|
+
char value[FILEDICT_VALUE_SIZE]; /* Values stored back to back, each NUL-terminated; the insert code appends at the first pair of consecutive NUL bytes. */
|
15
|
+
} filedict_bucket_entry_t;
|
16
|
+
|
17
|
+
#ifndef FILEDICT_BUCKET_ENTRY_COUNT
|
18
|
+
#define FILEDICT_BUCKET_ENTRY_COUNT 4
|
19
|
+
#endif
|
20
|
+
|
21
|
+
typedef struct filedict_bucket_t { /* One hash bucket: a fixed array of entry slots that are probed in order. */
|
22
|
+
filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT]; /* Slots scanned from index 0 upward by both insert and read. */
|
23
|
+
} filedict_bucket_t;
|
24
|
+
|
25
|
+
typedef size_t (*filedict_hash_function_t)(const char *);
|
26
|
+
|
27
|
+
typedef struct filedict_t { /* Handle for one open dictionary file and its memory mapping. */
|
28
|
+
const char *error; /* NULL after filedict_init; set to a message (strerror text or a literal) when an operation fails. */
|
29
|
+
int fd; /* Descriptor returned by open(); 0 is used as the "not open" marker by init/deinit. */
|
30
|
+
void *data; /* Start of the mmap'd file (header followed by the hashmaps); NULL when nothing is mapped. */
|
31
|
+
size_t data_len; /* Length in bytes of the mapping at `data`. */
|
32
|
+
filedict_hash_function_t hash_function; /* Key hash; filedict_init installs filedict_default_hash_function. */
|
33
|
+
} filedict_t;
|
34
|
+
|
35
|
+
typedef struct filedict_header_t { /* Header stored at offset 0 of the mapped file, directly before the first hashmap. */
|
36
|
+
unsigned long long initial_bucket_count : 32; /* Bucket count of hashmap 0; hashmap i has initial_bucket_count << i buckets. */
|
37
|
+
unsigned long long hashmap_count : 32; /* Number of hashmaps currently present in the file. */
|
38
|
+
} __attribute__ ((__packed__)) filedict_header_t;
|
39
|
+
|
40
|
+
typedef struct filedict_read_t { /* Cursor for iterating over the values stored under one key. */
|
41
|
+
const filedict_t *filedict; /* Dictionary being read. */
|
42
|
+
const char *key; /* Key being looked up (pointer supplied by the caller, not copied). */
|
43
|
+
const char *value; /* Current value, pointing into the mapped entry; NULL until a match is found. */
|
44
|
+
filedict_bucket_t *bucket; /* Bucket selected for key_hash in the current hashmap. */
|
45
|
+
filedict_bucket_entry_t *entry; /* Entry most recently examined inside `bucket`. */
|
46
|
+
size_t entry_i; /* Index of `entry` within bucket->entries. */
|
47
|
+
size_t hashmap_i; /* Index of the hashmap currently being searched. */
|
48
|
+
size_t bucket_count; /* Number of buckets in hashmap `hashmap_i`. */
|
49
|
+
size_t key_hash; /* Hash of `key`, computed once by filedict_get. */
|
50
|
+
} filedict_read_t;
|
51
|
+
|
52
|
+
#endif
|
53
|
+
|
54
|
+
/*
|
55
|
+
* Above is the header, below is the implementation
|
56
|
+
*/
|
57
|
+
|
58
|
+
#ifndef FILEDICT_IMPL
|
59
|
+
#define FILEDICT_IMPL
|
60
|
+
#include <sys/mman.h>
#include <sys/stat.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>
|
67
|
+
|
68
|
+
/*
 * Default key hash: the "djb2" string hash from
 * http://www.cse.yorku.ca/~oz/hash.html
 *
 * Starts at 5381 and folds in every byte of the NUL-terminated input as
 * hash = hash * 33 + byte. The accumulated value is returned as a size_t.
 */
static size_t filedict_default_hash_function(const char *input) {
    unsigned long acc = 5381;
    const char *cursor;

    for (cursor = input; *cursor != 0; ++cursor) {
        /* Plain char is widened to int first, exactly as the stored hashes expect. */
        const int byte = *cursor;
        acc = acc * 33 + byte;
    }

    return acc;
}
|
79
|
+
|
80
|
+
/*
 * Bounded string copy.
 *
 * Copies bytes from src (including its terminating NUL) into dest, but never
 * writes more than max_len bytes. When src needs max_len bytes or more, dest
 * is left WITHOUT a terminating NUL.
 *
 * Returns strlen(src), regardless of how much was actually written, so the
 * caller can detect truncation.
 */
static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
    size_t pos;

    for (pos = 0; ; ++pos) {
        const char ch = src[pos];

        if (pos < max_len) {
            dest[pos] = ch;
        }
        if (ch == 0) {
            break;
        }
    }

    return pos;
}
|
95
|
+
|
96
|
+
/*
 * Resets a filedict_t to its "nothing opened yet" state: no error, fd 0,
 * no mapping, and the default (djb2) hash function installed.
 */
static void filedict_init(filedict_t *filedict) {
    const filedict_t blank = {
        .error = NULL,
        .fd = 0,
        .data = NULL,
        .data_len = 0,
        .hash_function = filedict_default_hash_function,
    };

    *filedict = blank;
}
|
103
|
+
|
104
|
+
/*
 * Releases everything a filedict_t holds: unmaps the file and closes the
 * descriptor, then resets the fields so the call is safe to repeat.
 *
 * Safe to call after a failed open: a descriptor of -1 (open() failure) and a
 * mapping of MAP_FAILED (mmap() failure) are recognised as "nothing to
 * release" instead of being handed to close()/munmap().
 */
static void filedict_deinit(filedict_t *filedict) {
    if (filedict->data != NULL && filedict->data != MAP_FAILED) {
        munmap(filedict->data, filedict->data_len);
    }
    filedict->data = NULL;
    filedict->data_len = 0;

    /* 0 is this library's "not open" marker; -1 is what a failed open() leaves behind. */
    if (filedict->fd > 0) {
        close(filedict->fd);
    }
    filedict->fd = 0;
}
|
115
|
+
|
116
|
+
/*
|
117
|
+
* This computes the size of the entire filedict file given an initial bucket count and hashmap count.
|
118
|
+
*/
|
119
|
+
static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
|
120
|
+
size_t result = sizeof(filedict_header_t);
|
121
|
+
size_t i;
|
122
|
+
|
123
|
+
for (i = 0; i < hashmap_count; ++i) {
|
124
|
+
/* Bucket count is multiplied by 2 for each additional hashmap. */
|
125
|
+
result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
|
126
|
+
}
|
127
|
+
|
128
|
+
return result;
|
129
|
+
}
|
130
|
+
|
131
|
+
/*
|
132
|
+
* This opens a new file for reading and writing, optionally letting you specify the initial bucket count.
|
133
|
+
*/
|
134
|
+
#define filedict_open_new(filedict, filename) \
|
135
|
+
filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
|
136
|
+
|
137
|
+
#define filedict_open_readonly(filedict, filename) \
|
138
|
+
filedict_open_f(filedict, filename, O_RDONLY, 4096)
|
139
|
+
|
140
|
+
#define filedict_open(filedict, filename) \
|
141
|
+
filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
|
142
|
+
|
143
|
+
static void filedict_open_f(
|
144
|
+
filedict_t *filedict,
|
145
|
+
const char *filename,
|
146
|
+
int flags,
|
147
|
+
unsigned int initial_bucket_count
|
148
|
+
) {
|
149
|
+
filedict->fd = open(filename, flags, 0666);
|
150
|
+
if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
|
151
|
+
|
152
|
+
filedict->data_len = filedict_file_size(initial_bucket_count, 1);
|
153
|
+
ftruncate(filedict->fd, filedict->data_len);
|
154
|
+
filedict->data = mmap(
|
155
|
+
NULL,
|
156
|
+
filedict->data_len,
|
157
|
+
PROT_READ | PROT_WRITE,
|
158
|
+
MAP_SHARED,
|
159
|
+
filedict->fd,
|
160
|
+
0
|
161
|
+
);
|
162
|
+
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
163
|
+
|
164
|
+
filedict_header_t *data = (filedict_header_t *)filedict->data;
|
165
|
+
assert(initial_bucket_count <= UINT_MAX);
|
166
|
+
data->initial_bucket_count = initial_bucket_count;
|
167
|
+
data->hashmap_count = 1;
|
168
|
+
}
|
169
|
+
|
170
|
+
/*
|
171
|
+
* Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
|
172
|
+
* value onto the end of the entry.
|
173
|
+
*/
|
174
|
+
#define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
|
175
|
+
#define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
|
176
|
+
|
177
|
+
static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
|
178
|
+
assert(filedict->fd != 0);
|
179
|
+
assert(filedict->data != NULL);
|
180
|
+
|
181
|
+
size_t i, hashmap_i = 0, bucket_count, key_hash;
|
182
|
+
filedict_header_t *header = (filedict_header_t *)filedict->data;
|
183
|
+
filedict_bucket_t *hashmap = filedict->data + sizeof(filedict_header_t);
|
184
|
+
filedict_bucket_t *bucket;
|
185
|
+
|
186
|
+
bucket_count = header->initial_bucket_count;
|
187
|
+
|
188
|
+
key_hash = filedict->hash_function(key);
|
189
|
+
|
190
|
+
/*
|
191
|
+
* Here we loop through each hashmap.
|
192
|
+
*/
|
193
|
+
while (hashmap_i < header->hashmap_count) {
|
194
|
+
try_again:
|
195
|
+
/* TODO: can we truncate instead of modulo, like in Ruby? */
|
196
|
+
bucket = &hashmap[key_hash % bucket_count];
|
197
|
+
|
198
|
+
for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
|
199
|
+
filedict_bucket_entry_t *entry = &bucket->entries[i];
|
200
|
+
|
201
|
+
/* Easy case: fresh entry. We can just insert here and call it quits. */
|
202
|
+
if (entry->key[0] == 0) {
|
203
|
+
strncpy(entry->key, key, FILEDICT_KEY_SIZE);
|
204
|
+
size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
|
205
|
+
|
206
|
+
if (value_len > FILEDICT_VALUE_SIZE) {
|
207
|
+
filedict->error = "Value too big";
|
208
|
+
}
|
209
|
+
return;
|
210
|
+
}
|
211
|
+
/*
|
212
|
+
* We need to check for room in the value, then append value.
|
213
|
+
* This is also where we might run into a duplicate and duck out.existing
|
214
|
+
*/
|
215
|
+
else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
|
216
|
+
long long first_nonzero = -1;
|
217
|
+
char *candidate = NULL;
|
218
|
+
size_t value_i, candidate_len;
|
219
|
+
|
220
|
+
for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
|
221
|
+
if (unique) {
|
222
|
+
if (first_nonzero == -1 && entry->value[value_i] != 0) {
|
223
|
+
first_nonzero = value_i;
|
224
|
+
}
|
225
|
+
|
226
|
+
if (entry->value[value_i] == 0) {
|
227
|
+
int cmp = strncmp(
|
228
|
+
&entry->value[first_nonzero],
|
229
|
+
value,
|
230
|
+
FILEDICT_VALUE_SIZE - first_nonzero
|
231
|
+
);
|
232
|
+
if (cmp == 0) {
|
233
|
+
/* Looks like this value already exists! */
|
234
|
+
return;
|
235
|
+
}
|
236
|
+
first_nonzero = -1;
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
|
241
|
+
candidate = &entry->value[value_i + 1];
|
242
|
+
candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
|
243
|
+
|
244
|
+
if (strlen(value) >= candidate_len) break;
|
245
|
+
|
246
|
+
strncpy(candidate, value, candidate_len);
|
247
|
+
return;
|
248
|
+
}
|
249
|
+
}
|
250
|
+
}
|
251
|
+
}
|
252
|
+
|
253
|
+
++hashmap_i;
|
254
|
+
hashmap += bucket_count;
|
255
|
+
bucket_count = (bucket_count << 1);
|
256
|
+
}
|
257
|
+
|
258
|
+
/*
|
259
|
+
* If we fell through to here, that means we need to allocate a new hashmap.
|
260
|
+
*/
|
261
|
+
size_t new_hashmap_count = header->hashmap_count + 1;
|
262
|
+
size_t old_data_len = filedict->data_len;
|
263
|
+
size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
|
264
|
+
|
265
|
+
assert(new_data_len > old_data_len);
|
266
|
+
assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
|
267
|
+
|
268
|
+
munmap(filedict->data, filedict->data_len);
|
269
|
+
int truncate_result = ftruncate(filedict->fd, new_data_len);
|
270
|
+
if (truncate_result != 0) { filedict->error = strerror(errno); return; }
|
271
|
+
|
272
|
+
filedict->data = mmap(
|
273
|
+
filedict->data,
|
274
|
+
new_data_len,
|
275
|
+
PROT_READ | PROT_WRITE,
|
276
|
+
MAP_SHARED,
|
277
|
+
filedict->fd,
|
278
|
+
0
|
279
|
+
);
|
280
|
+
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
281
|
+
header = (filedict_header_t *)filedict->data;
|
282
|
+
hashmap = filedict->data + old_data_len;
|
283
|
+
|
284
|
+
filedict->data_len = new_data_len;
|
285
|
+
header->hashmap_count = new_hashmap_count;
|
286
|
+
goto try_again;
|
287
|
+
}
|
288
|
+
|
289
|
+
/*
|
290
|
+
* There are 3 "levels" to a filedict. From top to bottom:
|
291
|
+
* 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
|
292
|
+
* 2. Entry - which entry in our hashmap bucket are we looking at?
|
293
|
+
* 3. Value - where in the value buffer are we looking? There's 256 bytes, so can be many strings.
|
294
|
+
*/
|
295
|
+
|
296
|
+
/* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
|
297
|
+
#define log_return(val) return val
|
298
|
+
|
299
|
+
/*
|
300
|
+
* Returns 1 when we successfully advanced to the next value
|
301
|
+
* Returns 0 when there is no next value
|
302
|
+
*/
|
303
|
+
static int filedict_read_advance_value(filedict_read_t *read) {
|
304
|
+
assert(read->entry != NULL);
|
305
|
+
|
306
|
+
const char *buffer_begin = read->entry->value;
|
307
|
+
const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
|
308
|
+
|
309
|
+
const char *c;
|
310
|
+
for (c = read->value; c < buffer_end; ++c) {
|
311
|
+
if (*c == 0) {
|
312
|
+
c += 1;
|
313
|
+
break;
|
314
|
+
}
|
315
|
+
}
|
316
|
+
|
317
|
+
if (c >= buffer_end) log_return(0);
|
318
|
+
if (*c == 0) log_return(0);
|
319
|
+
|
320
|
+
read->value = c;
|
321
|
+
log_return(1);
|
322
|
+
}
|
323
|
+
|
324
|
+
/*
|
325
|
+
* Returns 1 when we successfully find a new entry that matches read->key.
|
326
|
+
* advances read->entry_i and read->entry to the new entry.
|
327
|
+
*
|
328
|
+
* Returns 0 when we exhausted all remaining entries and didn't find a match.
|
329
|
+
*/
|
330
|
+
static int filedict_read_advance_entry(filedict_read_t *read) {
|
331
|
+
assert(read->key != NULL);
|
332
|
+
assert(strlen(read->key) > 0);
|
333
|
+
assert(read->bucket != NULL);
|
334
|
+
|
335
|
+
while (1) {
|
336
|
+
if (read->entry_i >= FILEDICT_BUCKET_ENTRY_COUNT) log_return(0);
|
337
|
+
|
338
|
+
read->entry = &read->bucket->entries[read->entry_i];
|
339
|
+
|
340
|
+
if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
|
341
|
+
read->value = read->entry->value;
|
342
|
+
log_return(1);
|
343
|
+
}
|
344
|
+
|
345
|
+
read->entry_i += 1;
|
346
|
+
}
|
347
|
+
}
|
348
|
+
|
349
|
+
/*
|
350
|
+
* Returns 1 when we successfully advanced to the next hashmap.
|
351
|
+
* read->bucket, read->entry, and read->value will be populated.
|
352
|
+
*
|
353
|
+
* Returns 0 when there are no more hashmaps, or the latest hashmap has no matching entries.
|
354
|
+
*/
|
355
|
+
static int filedict_read_advance_hashmap(filedict_read_t *read) {
|
356
|
+
const filedict_t *filedict = read->filedict;
|
357
|
+
|
358
|
+
assert(filedict);
|
359
|
+
assert(filedict->data);
|
360
|
+
|
361
|
+
filedict_header_t *header = (filedict_header_t*)filedict->data;
|
362
|
+
|
363
|
+
if (read->hashmap_i >= header->hashmap_count) log_return(0);
|
364
|
+
|
365
|
+
size_t offset = filedict_file_size(header->initial_bucket_count, read->hashmap_i);
|
366
|
+
filedict_bucket_t *hashmap = filedict->data + offset;
|
367
|
+
|
368
|
+
read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
|
369
|
+
read->bucket = &hashmap[read->key_hash % read->bucket_count];
|
370
|
+
read->entry = &read->bucket->entries[0];
|
371
|
+
|
372
|
+
read->entry_i = 0;
|
373
|
+
|
374
|
+
log_return(filedict_read_advance_entry(read));
|
375
|
+
}
|
376
|
+
|
377
|
+
/*
|
378
|
+
* Returns a "read" at the given key. If there's a hit, <return>.value will have the value.
|
379
|
+
*/
|
380
|
+
static filedict_read_t filedict_get(const filedict_t *filedict, const char *key) {
|
381
|
+
filedict_read_t read;
|
382
|
+
read.filedict = filedict;
|
383
|
+
read.key = key;
|
384
|
+
read.value = NULL;
|
385
|
+
read.bucket = NULL;
|
386
|
+
read.entry = NULL;
|
387
|
+
read.entry_i = 0;
|
388
|
+
read.hashmap_i = 0;
|
389
|
+
read.bucket_count = 0;
|
390
|
+
read.key_hash = filedict->hash_function(key);
|
391
|
+
|
392
|
+
filedict_read_advance_hashmap(&read);
|
393
|
+
return read;
|
394
|
+
}
|
395
|
+
|
396
|
+
/*
|
397
|
+
* Lets you find the next value. Pass the return value of filedict_get.
|
398
|
+
*
|
399
|
+
* Returns 1 when a next value was found, 0 otherwise.
|
400
|
+
*
|
401
|
+
* If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
|
402
|
+
*/
|
403
|
+
static int filedict_get_next(filedict_read_t *read) {
|
404
|
+
int found = -1;
|
405
|
+
|
406
|
+
found = filedict_read_advance_value(read);
|
407
|
+
if (found == 1) return found;
|
408
|
+
|
409
|
+
read->entry_i += 1;
|
410
|
+
found = filedict_read_advance_entry(read);
|
411
|
+
if (found == 1) return found;
|
412
|
+
|
413
|
+
read->hashmap_i += 1;
|
414
|
+
return filedict_read_advance_hashmap(read);
|
415
|
+
}
|
416
|
+
|
417
|
+
#endif
|
data/filedictrb.gemspec
CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
(f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
26
26
|
end
|
27
27
|
end
|
28
|
+
spec.files << 'ext/filedict/filedict.h'
|
28
29
|
spec.bindir = "exe"
|
29
30
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
30
31
|
spec.require_paths = ["lib"]
|
data/lib/filedict/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: filedictrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nigel Baillie
|
@@ -26,6 +26,7 @@ files:
|
|
26
26
|
- Rakefile
|
27
27
|
- bin/console
|
28
28
|
- bin/setup
|
29
|
+
- ext/filedict/filedict.h
|
29
30
|
- ext/filedictrb/extconf.rb
|
30
31
|
- ext/filedictrb/filedictrb.c
|
31
32
|
- ext/filedictrb/filedictrb.h
|