filedictrb 0.1.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/filedict/filedict.h +145 -66
- data/lib/filedict/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54edb33f9c980c2815d486ba98bcc317209325a2f5a8303114ba2912675ebebf
|
4
|
+
data.tar.gz: f5d8eb13dca465d500621c434440ccb648c4e2b52b4f72a70f2e803ac70c1607
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6fa2c3bc8d94db20229ce1f152ef67dfd2bdc503e090ba46420101a958891b5aa8039bded2a23bf0ee6ccec9ff25e602e5c25ba04faa8ac2ee62195f68960111
|
7
|
+
data.tar.gz: 5b2974454d61502919d45dde3b90b400eb77ab3fea4d87c44b9cae0b3c31c24ace549dc9ee314670ee114d0c7e751566c1b5181569e9120ddbd7bd214d702749
|
data/ext/filedict/filedict.h
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
#ifndef FILEDICT_H
|
2
2
|
#define FILEDICT_H 1
|
3
3
|
|
4
|
-
#
|
5
|
-
#define FILEDICT_KEY_SIZE 256
|
6
|
-
#endif
|
4
|
+
#include <stddef.h>
|
7
5
|
|
8
|
-
#ifndef
|
9
|
-
#define
|
6
|
+
#ifndef FILEDICT_BUCKET_ENTRY_BYTES
|
7
|
+
#define FILEDICT_BUCKET_ENTRY_BYTES 512
|
10
8
|
#endif
|
11
9
|
|
12
10
|
typedef struct filedict_bucket_entry_t {
|
13
|
-
char
|
14
|
-
char value[FILEDICT_VALUE_SIZE];
|
11
|
+
char bytes[FILEDICT_BUCKET_ENTRY_BYTES];
|
15
12
|
} filedict_bucket_entry_t;
|
16
13
|
|
17
14
|
#ifndef FILEDICT_BUCKET_ENTRY_COUNT
|
@@ -58,6 +55,7 @@ typedef struct filedict_read_t {
|
|
58
55
|
|
59
56
|
#ifndef FILEDICT_IMPL
|
60
57
|
#define FILEDICT_IMPL
|
58
|
+
#include <sys/stat.h>
|
61
59
|
#include <sys/mman.h>
|
62
60
|
#include <string.h>
|
63
61
|
#include <unistd.h>
|
@@ -80,7 +78,7 @@ static size_t filedict_default_hash_function(const char *input) {
|
|
80
78
|
|
81
79
|
/*
|
82
80
|
* Writes at most max_len chars from src into dest.
|
83
|
-
* Returns the
|
81
|
+
* Returns the string length of src.
|
84
82
|
*/
|
85
83
|
static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
|
86
84
|
size_t src_len = 0;
|
@@ -92,6 +90,23 @@ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len)
|
|
92
90
|
if (c == 0) return src_len;
|
93
91
|
src_len += 1;
|
94
92
|
}
|
93
|
+
|
94
|
+
return src_len;
|
95
|
+
}
|
96
|
+
|
97
|
+
/*
|
98
|
+
* Returns the index of the trailing 0 when str1 and str2 have the same contents.
|
99
|
+
* Returns 0 when str1 and str2 have different contents.
|
100
|
+
*/
|
101
|
+
static size_t filedict_string_includes(const char *str1, const char *str2, size_t max_len) {
|
102
|
+
size_t i;
|
103
|
+
|
104
|
+
for (i = 0; i < max_len; ++i) {
|
105
|
+
if (str1[i] != str2[i]) return 0;
|
106
|
+
if (str1[i] == 0) return i;
|
107
|
+
}
|
108
|
+
|
109
|
+
return 0;
|
95
110
|
}
|
96
111
|
|
97
112
|
static void filedict_init(filedict_t *filedict) {
|
@@ -120,15 +135,39 @@ static void filedict_deinit(filedict_t *filedict) {
|
|
120
135
|
* This computes the size of the entire filedict file given an initial bucket count and hashmap count.
|
121
136
|
*/
|
122
137
|
static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
|
123
|
-
|
124
|
-
|
138
|
+
/*
|
139
|
+
* We used to size each additional hashmap at 2x the previous, but realistically it seems that
|
140
|
+
* most resizes are triggered by keys that are ridiculously large, not by mass collision.
|
141
|
+
*
|
142
|
+
* A more proper fix might be to re-structure the whole filedict. We could keep the existing
|
143
|
+
* hashmap structure, but with buckets that expand dynamically. This would require each bucket
|
144
|
+
* to contain a "pointer" to the next bucket object if present.
|
145
|
+
*
|
146
|
+
* For now, it's easier to just keep the hashmap duplication without the size doubling.
|
147
|
+
*/
|
148
|
+
return sizeof(filedict_header_t) + initial_bucket_count * hashmap_count * sizeof(filedict_bucket_t);
|
149
|
+
}
|
125
150
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
151
|
+
/*
|
152
|
+
* Resizes the filedict based on the header hashmap count and initial bucket count.
|
153
|
+
* Naturally, your pointers into the map will become invalid after calling this.
|
154
|
+
*/
|
155
|
+
static void filedict_resize(filedict_t *filedict) {
|
156
|
+
filedict_header_t *header = (filedict_header_t*)filedict->data;
|
157
|
+
size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
|
158
|
+
if (computed_size <= filedict->data_len) return;
|
130
159
|
|
131
|
-
|
160
|
+
munmap(filedict->data, filedict->data_len);
|
161
|
+
filedict->data = mmap(
|
162
|
+
filedict->data,
|
163
|
+
computed_size,
|
164
|
+
PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
|
165
|
+
MAP_SHARED,
|
166
|
+
filedict->fd,
|
167
|
+
0
|
168
|
+
);
|
169
|
+
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
170
|
+
filedict->data_len = computed_size;
|
132
171
|
}
|
133
172
|
|
134
173
|
/*
|
@@ -149,12 +188,20 @@ static void filedict_open_f(
|
|
149
188
|
int flags,
|
150
189
|
unsigned int initial_bucket_count
|
151
190
|
) {
|
191
|
+
struct stat info;
|
192
|
+
|
152
193
|
filedict->flags = flags;
|
153
194
|
filedict->fd = open(filename, flags, 0666);
|
154
195
|
if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
|
196
|
+
if (fstat(filedict->fd, &info) != 0) { filedict->error = strerror(errno); return; }
|
197
|
+
|
198
|
+
if (info.st_size == 0 && (flags & O_RDWR)) {
|
199
|
+
filedict->data_len = filedict_file_size(initial_bucket_count, 1);
|
200
|
+
ftruncate(filedict->fd, filedict->data_len);
|
201
|
+
} else {
|
202
|
+
filedict->data_len = info.st_size;
|
203
|
+
}
|
155
204
|
|
156
|
-
filedict->data_len = filedict_file_size(initial_bucket_count, 1);
|
157
|
-
ftruncate(filedict->fd, filedict->data_len);
|
158
205
|
filedict->data = mmap(
|
159
206
|
NULL,
|
160
207
|
filedict->data_len,
|
@@ -206,35 +253,42 @@ try_again:
|
|
206
253
|
filedict_bucket_entry_t *entry = &bucket->entries[i];
|
207
254
|
|
208
255
|
/* Easy case: fresh entry. We can just insert here and call it quits. */
|
209
|
-
if (entry->
|
210
|
-
|
211
|
-
size_t value_len = filedict_copy_string(entry->
|
256
|
+
if (entry->bytes[0] == 0) {
|
257
|
+
size_t key_len = filedict_copy_string(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES);
|
258
|
+
size_t value_len = filedict_copy_string(entry->bytes + key_len + 1, value, FILEDICT_BUCKET_ENTRY_BYTES);
|
212
259
|
|
213
|
-
if (value_len >
|
260
|
+
if (key_len + value_len > FILEDICT_BUCKET_ENTRY_BYTES) {
|
214
261
|
filedict->error = "Value too big";
|
215
262
|
}
|
216
263
|
return;
|
217
264
|
}
|
218
265
|
/*
|
219
266
|
* We need to check for room in the value, then append value.
|
220
|
-
* This is also where we might run into a duplicate and duck out.
|
267
|
+
* This is also where we might run into a duplicate and duck out.
|
221
268
|
*/
|
222
|
-
else if (strncmp(entry->
|
269
|
+
else if (strncmp(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES) == 0) {
|
223
270
|
long long first_nonzero = -1;
|
224
271
|
char *candidate = NULL;
|
225
|
-
size_t
|
272
|
+
size_t bytes_i, candidate_max_len;
|
226
273
|
|
227
|
-
for (
|
274
|
+
for (bytes_i = 0; entry->bytes[bytes_i] != 0; ++bytes_i) {
|
275
|
+
if (bytes_i >= FILEDICT_BUCKET_ENTRY_BYTES) {
|
276
|
+
filedict->error = "Mysterious entry overflow!! Does it contain a massive key?";
|
277
|
+
return;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
|
281
|
+
for (bytes_i += 1; bytes_i < FILEDICT_BUCKET_ENTRY_BYTES - 1; ++bytes_i) {
|
228
282
|
if (unique) {
|
229
|
-
if (first_nonzero == -1 && entry->
|
230
|
-
first_nonzero =
|
283
|
+
if (first_nonzero == -1 && entry->bytes[bytes_i] != 0) {
|
284
|
+
first_nonzero = bytes_i;
|
231
285
|
}
|
232
286
|
|
233
|
-
if (entry->
|
287
|
+
if (entry->bytes[bytes_i] == 0) {
|
234
288
|
int cmp = strncmp(
|
235
|
-
&entry->
|
289
|
+
&entry->bytes[first_nonzero],
|
236
290
|
value,
|
237
|
-
|
291
|
+
FILEDICT_BUCKET_ENTRY_BYTES - first_nonzero
|
238
292
|
);
|
239
293
|
if (cmp == 0) {
|
240
294
|
/* Looks like this value already exists! */
|
@@ -244,13 +298,13 @@ try_again:
|
|
244
298
|
}
|
245
299
|
}
|
246
300
|
|
247
|
-
if (entry->
|
248
|
-
candidate = &entry->
|
249
|
-
|
301
|
+
if (entry->bytes[bytes_i] == 0 && entry->bytes[bytes_i + 1] == 0) {
|
302
|
+
candidate = &entry->bytes[bytes_i + 1];
|
303
|
+
candidate_max_len = FILEDICT_BUCKET_ENTRY_BYTES - bytes_i - 1;
|
250
304
|
|
251
|
-
if (strlen(value) >=
|
305
|
+
if (strlen(value) >= candidate_max_len) break;
|
252
306
|
|
253
|
-
strncpy(candidate, value,
|
307
|
+
strncpy(candidate, value, candidate_max_len);
|
254
308
|
return;
|
255
309
|
}
|
256
310
|
}
|
@@ -259,7 +313,6 @@ try_again:
|
|
259
313
|
|
260
314
|
++hashmap_i;
|
261
315
|
hashmap += bucket_count;
|
262
|
-
bucket_count = (bucket_count << 1);
|
263
316
|
}
|
264
317
|
|
265
318
|
/*
|
@@ -293,27 +346,6 @@ try_again:
|
|
293
346
|
goto try_again;
|
294
347
|
}
|
295
348
|
|
296
|
-
/*
|
297
|
-
* Resizes the filedict based on the header hashmap count and initial bucket count.
|
298
|
-
* Naturally, your pointers into the map will become invalid after calling this.
|
299
|
-
*/
|
300
|
-
static void filedict_resize(filedict_t *filedict) {
|
301
|
-
filedict_header_t *header = (filedict_header_t*)filedict->data;
|
302
|
-
size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
|
303
|
-
|
304
|
-
munmap(filedict->data, filedict->data_len);
|
305
|
-
filedict->data = mmap(
|
306
|
-
filedict->data,
|
307
|
-
computed_size,
|
308
|
-
PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
|
309
|
-
MAP_SHARED,
|
310
|
-
filedict->fd,
|
311
|
-
0
|
312
|
-
);
|
313
|
-
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
314
|
-
filedict->data_len = computed_size;
|
315
|
-
}
|
316
|
-
|
317
349
|
/*
|
318
350
|
* There are 3 "levels" to a filedict. From top to bottom:
|
319
351
|
* 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
|
@@ -331,8 +363,8 @@ static void filedict_resize(filedict_t *filedict) {
|
|
331
363
|
static int filedict_read_advance_value(filedict_read_t *read) {
|
332
364
|
assert(read->entry != NULL);
|
333
365
|
|
334
|
-
const char *buffer_begin = read->entry->
|
335
|
-
const char *buffer_end = buffer_begin +
|
366
|
+
const char *buffer_begin = read->entry->bytes;
|
367
|
+
const char *buffer_end = buffer_begin + FILEDICT_BUCKET_ENTRY_BYTES;
|
336
368
|
|
337
369
|
const char *c;
|
338
370
|
for (c = read->value; c < buffer_end; ++c) {
|
@@ -356,8 +388,8 @@ static int filedict_read_advance_value(filedict_read_t *read) {
|
|
356
388
|
* Returns 0 when we exhausted all remaining entries and didn't find a match.
|
357
389
|
*/
|
358
390
|
static int filedict_read_advance_entry(filedict_read_t *read) {
|
359
|
-
|
360
|
-
|
391
|
+
size_t value_start_i;
|
392
|
+
|
361
393
|
assert(read->bucket != NULL);
|
362
394
|
|
363
395
|
while (1) {
|
@@ -365,9 +397,22 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
|
|
365
397
|
|
366
398
|
read->entry = &read->bucket->entries[read->entry_i];
|
367
399
|
|
368
|
-
if (
|
369
|
-
|
370
|
-
|
400
|
+
if (read->key == NULL) {
|
401
|
+
if (read->entry->bytes[0] != 0) {
|
402
|
+
value_start_i = strlen(read->entry->bytes) + 1;
|
403
|
+
read->value = &read->entry->bytes[value_start_i];
|
404
|
+
log_return(1);
|
405
|
+
}
|
406
|
+
}
|
407
|
+
else {
|
408
|
+
value_start_i = filedict_string_includes(read->entry->bytes, read->key, FILEDICT_BUCKET_ENTRY_BYTES);
|
409
|
+
|
410
|
+
if (value_start_i > 0) {
|
411
|
+
/* add 1 because it's pointing to the 0 after key; not the first char of value */
|
412
|
+
value_start_i += 1;
|
413
|
+
read->value = &read->entry->bytes[value_start_i];
|
414
|
+
log_return(1);
|
415
|
+
}
|
371
416
|
}
|
372
417
|
|
373
418
|
read->entry_i += 1;
|
@@ -382,6 +427,7 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
|
|
382
427
|
*/
|
383
428
|
static int filedict_read_advance_hashmap(filedict_read_t *read) {
|
384
429
|
filedict_t *filedict = read->filedict;
|
430
|
+
int success = 0;
|
385
431
|
|
386
432
|
assert(filedict);
|
387
433
|
assert(filedict->data);
|
@@ -400,12 +446,25 @@ static int filedict_read_advance_hashmap(filedict_read_t *read) {
|
|
400
446
|
|
401
447
|
filedict_bucket_t *hashmap = filedict->data + offset;
|
402
448
|
|
403
|
-
read->bucket_count = (size_t)header->initial_bucket_count
|
449
|
+
read->bucket_count = (size_t)header->initial_bucket_count;
|
404
450
|
read->bucket = &hashmap[read->key_hash % read->bucket_count];
|
405
451
|
read->entry = &read->bucket->entries[0];
|
406
452
|
|
407
453
|
read->entry_i = 0;
|
408
454
|
|
455
|
+
if (read->key == NULL) {
|
456
|
+
success = filedict_read_advance_entry(read);
|
457
|
+
while (!success) {
|
458
|
+
read->key_hash += 1;
|
459
|
+
read->bucket = &hashmap[read->key_hash % read->bucket_count];
|
460
|
+
read->entry = &read->bucket->entries[0];
|
461
|
+
read->entry_i = 0;
|
462
|
+
success = filedict_read_advance_entry(read);
|
463
|
+
if (read->key_hash >= read->bucket_count) return 0;
|
464
|
+
}
|
465
|
+
return success;
|
466
|
+
}
|
467
|
+
|
409
468
|
log_return(filedict_read_advance_entry(read));
|
410
469
|
}
|
411
470
|
|
@@ -422,7 +481,14 @@ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
|
|
422
481
|
read.entry_i = 0;
|
423
482
|
read.hashmap_i = 0;
|
424
483
|
read.bucket_count = 0;
|
425
|
-
|
484
|
+
|
485
|
+
/* NULL key means we want to iterate over the entire dictionary */
|
486
|
+
if (key == NULL) {
|
487
|
+
read.key_hash = 0;
|
488
|
+
}
|
489
|
+
else {
|
490
|
+
read.key_hash = filedict->hash_function(key);
|
491
|
+
}
|
426
492
|
|
427
493
|
filedict_read_advance_hashmap(&read);
|
428
494
|
return read;
|
@@ -445,6 +511,19 @@ static int filedict_get_next(filedict_read_t *read) {
|
|
445
511
|
found = filedict_read_advance_entry(read);
|
446
512
|
if (found == 1) return found;
|
447
513
|
|
514
|
+
/*
|
515
|
+
* If read->key is NULL, that means we're iterating through the whole dict.
|
516
|
+
*/
|
517
|
+
if (read->key == NULL) {
|
518
|
+
read->key_hash += 1;
|
519
|
+
if (read->key_hash < read->bucket_count) {
|
520
|
+
return filedict_read_advance_hashmap(read);
|
521
|
+
}
|
522
|
+
else {
|
523
|
+
read->key_hash = 0;
|
524
|
+
}
|
525
|
+
}
|
526
|
+
|
448
527
|
read->hashmap_i += 1;
|
449
528
|
return filedict_read_advance_hashmap(read);
|
450
529
|
}
|
data/lib/filedict/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: filedictrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nigel Baillie
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email:
|