filedictrb 0.1.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/filedict/filedict.h +145 -66
- data/lib/filedict/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54edb33f9c980c2815d486ba98bcc317209325a2f5a8303114ba2912675ebebf
|
4
|
+
data.tar.gz: f5d8eb13dca465d500621c434440ccb648c4e2b52b4f72a70f2e803ac70c1607
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6fa2c3bc8d94db20229ce1f152ef67dfd2bdc503e090ba46420101a958891b5aa8039bded2a23bf0ee6ccec9ff25e602e5c25ba04faa8ac2ee62195f68960111
|
7
|
+
data.tar.gz: 5b2974454d61502919d45dde3b90b400eb77ab3fea4d87c44b9cae0b3c31c24ace549dc9ee314670ee114d0c7e751566c1b5181569e9120ddbd7bd214d702749
|
data/ext/filedict/filedict.h
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
#ifndef FILEDICT_H
|
2
2
|
#define FILEDICT_H 1
|
3
3
|
|
4
|
-
#
|
5
|
-
#define FILEDICT_KEY_SIZE 256
|
6
|
-
#endif
|
4
|
+
#include <stddef.h>
|
7
5
|
|
8
|
-
#ifndef
|
9
|
-
#define
|
6
|
+
#ifndef FILEDICT_BUCKET_ENTRY_BYTES
|
7
|
+
#define FILEDICT_BUCKET_ENTRY_BYTES 512
|
10
8
|
#endif
|
11
9
|
|
12
10
|
typedef struct filedict_bucket_entry_t {
|
13
|
-
char
|
14
|
-
char value[FILEDICT_VALUE_SIZE];
|
11
|
+
char bytes[FILEDICT_BUCKET_ENTRY_BYTES];
|
15
12
|
} filedict_bucket_entry_t;
|
16
13
|
|
17
14
|
#ifndef FILEDICT_BUCKET_ENTRY_COUNT
|
@@ -58,6 +55,7 @@ typedef struct filedict_read_t {
|
|
58
55
|
|
59
56
|
#ifndef FILEDICT_IMPL
|
60
57
|
#define FILEDICT_IMPL
|
58
|
+
#include <sys/stat.h>
|
61
59
|
#include <sys/mman.h>
|
62
60
|
#include <string.h>
|
63
61
|
#include <unistd.h>
|
@@ -80,7 +78,7 @@ static size_t filedict_default_hash_function(const char *input) {
|
|
80
78
|
|
81
79
|
/*
|
82
80
|
* Writes at most max_len chars from src into dest.
|
83
|
-
* Returns the
|
81
|
+
* Returns the string length of src.
|
84
82
|
*/
|
85
83
|
static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
|
86
84
|
size_t src_len = 0;
|
@@ -92,6 +90,23 @@ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len)
|
|
92
90
|
if (c == 0) return src_len;
|
93
91
|
src_len += 1;
|
94
92
|
}
|
93
|
+
|
94
|
+
return src_len;
|
95
|
+
}
|
96
|
+
|
97
|
+
/*
|
98
|
+
* Returns the index of the trailing 0 when str1 and str2 have the same contents.
|
99
|
+
* Returns 0 when str1 and str2 have different contents.
|
100
|
+
*/
|
101
|
+
static size_t filedict_string_includes(const char *str1, const char *str2, size_t max_len) {
|
102
|
+
size_t i;
|
103
|
+
|
104
|
+
for (i = 0; i < max_len; ++i) {
|
105
|
+
if (str1[i] != str2[i]) return 0;
|
106
|
+
if (str1[i] == 0) return i;
|
107
|
+
}
|
108
|
+
|
109
|
+
return 0;
|
95
110
|
}
|
96
111
|
|
97
112
|
static void filedict_init(filedict_t *filedict) {
|
@@ -120,15 +135,39 @@ static void filedict_deinit(filedict_t *filedict) {
|
|
120
135
|
* This computes the size of the entire filedict file given an initial bucket count and hashmap count.
|
121
136
|
*/
|
122
137
|
static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
|
123
|
-
|
124
|
-
|
138
|
+
/*
|
139
|
+
* We used to size each additional hashmap at 2x the previous, but realistically it seems that
|
140
|
+
* most resizes are triggered by keys that are ridiculously large, not by mass collision.
|
141
|
+
*
|
142
|
+
* A more proper fix might be to re-structure the whole filedict. We could keep the existing
|
143
|
+
* hashmap structure, but with buckets that expand dynamically. This would require each bucket
|
144
|
+
* to contain a "pointer" to the next bucket object if present.
|
145
|
+
*
|
146
|
+
* For now, it's easier to just keep the hashmap duplication without the size doubling.
|
147
|
+
*/
|
148
|
+
return sizeof(filedict_header_t) + initial_bucket_count * hashmap_count * sizeof(filedict_bucket_t);
|
149
|
+
}
|
125
150
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
151
|
+
/*
|
152
|
+
* Resizes the filedict based on the header hashmap count and initial bucket count.
|
153
|
+
* Naturally, your pointers into the map will become invalid after calling this.
|
154
|
+
*/
|
155
|
+
static void filedict_resize(filedict_t *filedict) {
|
156
|
+
filedict_header_t *header = (filedict_header_t*)filedict->data;
|
157
|
+
size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
|
158
|
+
if (computed_size <= filedict->data_len) return;
|
130
159
|
|
131
|
-
|
160
|
+
munmap(filedict->data, filedict->data_len);
|
161
|
+
filedict->data = mmap(
|
162
|
+
filedict->data,
|
163
|
+
computed_size,
|
164
|
+
PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
|
165
|
+
MAP_SHARED,
|
166
|
+
filedict->fd,
|
167
|
+
0
|
168
|
+
);
|
169
|
+
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
170
|
+
filedict->data_len = computed_size;
|
132
171
|
}
|
133
172
|
|
134
173
|
/*
|
@@ -149,12 +188,20 @@ static void filedict_open_f(
|
|
149
188
|
int flags,
|
150
189
|
unsigned int initial_bucket_count
|
151
190
|
) {
|
191
|
+
struct stat info;
|
192
|
+
|
152
193
|
filedict->flags = flags;
|
153
194
|
filedict->fd = open(filename, flags, 0666);
|
154
195
|
if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
|
196
|
+
if (fstat(filedict->fd, &info) != 0) { filedict->error = strerror(errno); return; }
|
197
|
+
|
198
|
+
if (info.st_size == 0 && (flags & O_RDWR)) {
|
199
|
+
filedict->data_len = filedict_file_size(initial_bucket_count, 1);
|
200
|
+
ftruncate(filedict->fd, filedict->data_len);
|
201
|
+
} else {
|
202
|
+
filedict->data_len = info.st_size;
|
203
|
+
}
|
155
204
|
|
156
|
-
filedict->data_len = filedict_file_size(initial_bucket_count, 1);
|
157
|
-
ftruncate(filedict->fd, filedict->data_len);
|
158
205
|
filedict->data = mmap(
|
159
206
|
NULL,
|
160
207
|
filedict->data_len,
|
@@ -206,35 +253,42 @@ try_again:
|
|
206
253
|
filedict_bucket_entry_t *entry = &bucket->entries[i];
|
207
254
|
|
208
255
|
/* Easy case: fresh entry. We can just insert here and call it quits. */
|
209
|
-
if (entry->
|
210
|
-
|
211
|
-
size_t value_len = filedict_copy_string(entry->
|
256
|
+
if (entry->bytes[0] == 0) {
|
257
|
+
size_t key_len = filedict_copy_string(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES);
|
258
|
+
size_t value_len = filedict_copy_string(entry->bytes + key_len + 1, value, FILEDICT_BUCKET_ENTRY_BYTES);
|
212
259
|
|
213
|
-
if (value_len >
|
260
|
+
if (key_len + value_len > FILEDICT_BUCKET_ENTRY_BYTES) {
|
214
261
|
filedict->error = "Value too big";
|
215
262
|
}
|
216
263
|
return;
|
217
264
|
}
|
218
265
|
/*
|
219
266
|
* We need to check for room in the value, then append value.
|
220
|
-
* This is also where we might run into a duplicate and duck out.
|
267
|
+
* This is also where we might run into a duplicate and duck out.
|
221
268
|
*/
|
222
|
-
else if (strncmp(entry->
|
269
|
+
else if (strncmp(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES) == 0) {
|
223
270
|
long long first_nonzero = -1;
|
224
271
|
char *candidate = NULL;
|
225
|
-
size_t
|
272
|
+
size_t bytes_i, candidate_max_len;
|
226
273
|
|
227
|
-
for (
|
274
|
+
for (bytes_i = 0; entry->bytes[bytes_i] != 0; ++bytes_i) {
|
275
|
+
if (bytes_i >= FILEDICT_BUCKET_ENTRY_BYTES) {
|
276
|
+
filedict->error = "Mysterious entry overflow!! Does it contain a massive key?";
|
277
|
+
return;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
|
281
|
+
for (bytes_i += 1; bytes_i < FILEDICT_BUCKET_ENTRY_BYTES - 1; ++bytes_i) {
|
228
282
|
if (unique) {
|
229
|
-
if (first_nonzero == -1 && entry->
|
230
|
-
first_nonzero =
|
283
|
+
if (first_nonzero == -1 && entry->bytes[bytes_i] != 0) {
|
284
|
+
first_nonzero = bytes_i;
|
231
285
|
}
|
232
286
|
|
233
|
-
if (entry->
|
287
|
+
if (entry->bytes[bytes_i] == 0) {
|
234
288
|
int cmp = strncmp(
|
235
|
-
&entry->
|
289
|
+
&entry->bytes[first_nonzero],
|
236
290
|
value,
|
237
|
-
|
291
|
+
FILEDICT_BUCKET_ENTRY_BYTES - first_nonzero
|
238
292
|
);
|
239
293
|
if (cmp == 0) {
|
240
294
|
/* Looks like this value already exists! */
|
@@ -244,13 +298,13 @@ try_again:
|
|
244
298
|
}
|
245
299
|
}
|
246
300
|
|
247
|
-
if (entry->
|
248
|
-
candidate = &entry->
|
249
|
-
|
301
|
+
if (entry->bytes[bytes_i] == 0 && entry->bytes[bytes_i + 1] == 0) {
|
302
|
+
candidate = &entry->bytes[bytes_i + 1];
|
303
|
+
candidate_max_len = FILEDICT_BUCKET_ENTRY_BYTES - bytes_i - 1;
|
250
304
|
|
251
|
-
if (strlen(value) >=
|
305
|
+
if (strlen(value) >= candidate_max_len) break;
|
252
306
|
|
253
|
-
strncpy(candidate, value,
|
307
|
+
strncpy(candidate, value, candidate_max_len);
|
254
308
|
return;
|
255
309
|
}
|
256
310
|
}
|
@@ -259,7 +313,6 @@ try_again:
|
|
259
313
|
|
260
314
|
++hashmap_i;
|
261
315
|
hashmap += bucket_count;
|
262
|
-
bucket_count = (bucket_count << 1);
|
263
316
|
}
|
264
317
|
|
265
318
|
/*
|
@@ -293,27 +346,6 @@ try_again:
|
|
293
346
|
goto try_again;
|
294
347
|
}
|
295
348
|
|
296
|
-
/*
|
297
|
-
* Resizes the filedict based on the header hashmap count and initial bucket count.
|
298
|
-
* Naturally, your pointers into the map will become invalid after calling this.
|
299
|
-
*/
|
300
|
-
static void filedict_resize(filedict_t *filedict) {
|
301
|
-
filedict_header_t *header = (filedict_header_t*)filedict->data;
|
302
|
-
size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
|
303
|
-
|
304
|
-
munmap(filedict->data, filedict->data_len);
|
305
|
-
filedict->data = mmap(
|
306
|
-
filedict->data,
|
307
|
-
computed_size,
|
308
|
-
PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
|
309
|
-
MAP_SHARED,
|
310
|
-
filedict->fd,
|
311
|
-
0
|
312
|
-
);
|
313
|
-
if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
|
314
|
-
filedict->data_len = computed_size;
|
315
|
-
}
|
316
|
-
|
317
349
|
/*
|
318
350
|
* There are 3 "levels" to a filedict. From top to bottom:
|
319
351
|
* 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
|
@@ -331,8 +363,8 @@ static void filedict_resize(filedict_t *filedict) {
|
|
331
363
|
static int filedict_read_advance_value(filedict_read_t *read) {
|
332
364
|
assert(read->entry != NULL);
|
333
365
|
|
334
|
-
const char *buffer_begin = read->entry->
|
335
|
-
const char *buffer_end = buffer_begin +
|
366
|
+
const char *buffer_begin = read->entry->bytes;
|
367
|
+
const char *buffer_end = buffer_begin + FILEDICT_BUCKET_ENTRY_BYTES;
|
336
368
|
|
337
369
|
const char *c;
|
338
370
|
for (c = read->value; c < buffer_end; ++c) {
|
@@ -356,8 +388,8 @@ static int filedict_read_advance_value(filedict_read_t *read) {
|
|
356
388
|
* Returns 0 when we exhausted all remaining entries and didn't find a match.
|
357
389
|
*/
|
358
390
|
static int filedict_read_advance_entry(filedict_read_t *read) {
|
359
|
-
|
360
|
-
|
391
|
+
size_t value_start_i;
|
392
|
+
|
361
393
|
assert(read->bucket != NULL);
|
362
394
|
|
363
395
|
while (1) {
|
@@ -365,9 +397,22 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
|
|
365
397
|
|
366
398
|
read->entry = &read->bucket->entries[read->entry_i];
|
367
399
|
|
368
|
-
if (
|
369
|
-
|
370
|
-
|
400
|
+
if (read->key == NULL) {
|
401
|
+
if (read->entry->bytes[0] != 0) {
|
402
|
+
value_start_i = strlen(read->entry->bytes) + 1;
|
403
|
+
read->value = &read->entry->bytes[value_start_i];
|
404
|
+
log_return(1);
|
405
|
+
}
|
406
|
+
}
|
407
|
+
else {
|
408
|
+
value_start_i = filedict_string_includes(read->entry->bytes, read->key, FILEDICT_BUCKET_ENTRY_BYTES);
|
409
|
+
|
410
|
+
if (value_start_i > 0) {
|
411
|
+
/* add 1 because it's pointing to the 0 after key; not the first char of value */
|
412
|
+
value_start_i += 1;
|
413
|
+
read->value = &read->entry->bytes[value_start_i];
|
414
|
+
log_return(1);
|
415
|
+
}
|
371
416
|
}
|
372
417
|
|
373
418
|
read->entry_i += 1;
|
@@ -382,6 +427,7 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
|
|
382
427
|
*/
|
383
428
|
static int filedict_read_advance_hashmap(filedict_read_t *read) {
|
384
429
|
filedict_t *filedict = read->filedict;
|
430
|
+
int success = 0;
|
385
431
|
|
386
432
|
assert(filedict);
|
387
433
|
assert(filedict->data);
|
@@ -400,12 +446,25 @@ static int filedict_read_advance_hashmap(filedict_read_t *read) {
|
|
400
446
|
|
401
447
|
filedict_bucket_t *hashmap = filedict->data + offset;
|
402
448
|
|
403
|
-
read->bucket_count = (size_t)header->initial_bucket_count
|
449
|
+
read->bucket_count = (size_t)header->initial_bucket_count;
|
404
450
|
read->bucket = &hashmap[read->key_hash % read->bucket_count];
|
405
451
|
read->entry = &read->bucket->entries[0];
|
406
452
|
|
407
453
|
read->entry_i = 0;
|
408
454
|
|
455
|
+
if (read->key == NULL) {
|
456
|
+
success = filedict_read_advance_entry(read);
|
457
|
+
while (!success) {
|
458
|
+
read->key_hash += 1;
|
459
|
+
read->bucket = &hashmap[read->key_hash % read->bucket_count];
|
460
|
+
read->entry = &read->bucket->entries[0];
|
461
|
+
read->entry_i = 0;
|
462
|
+
success = filedict_read_advance_entry(read);
|
463
|
+
if (read->key_hash >= read->bucket_count) return 0;
|
464
|
+
}
|
465
|
+
return success;
|
466
|
+
}
|
467
|
+
|
409
468
|
log_return(filedict_read_advance_entry(read));
|
410
469
|
}
|
411
470
|
|
@@ -422,7 +481,14 @@ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
|
|
422
481
|
read.entry_i = 0;
|
423
482
|
read.hashmap_i = 0;
|
424
483
|
read.bucket_count = 0;
|
425
|
-
|
484
|
+
|
485
|
+
/* A NULL key means we want to iterate over the entire dictionary */
|
486
|
+
if (key == NULL) {
|
487
|
+
read.key_hash = 0;
|
488
|
+
}
|
489
|
+
else {
|
490
|
+
read.key_hash = filedict->hash_function(key);
|
491
|
+
}
|
426
492
|
|
427
493
|
filedict_read_advance_hashmap(&read);
|
428
494
|
return read;
|
@@ -445,6 +511,19 @@ static int filedict_get_next(filedict_read_t *read) {
|
|
445
511
|
found = filedict_read_advance_entry(read);
|
446
512
|
if (found == 1) return found;
|
447
513
|
|
514
|
+
/*
|
515
|
+
* If read->key is NULL, that means we're iterating through the whole dict.
|
516
|
+
*/
|
517
|
+
if (read->key == NULL) {
|
518
|
+
read->key_hash += 1;
|
519
|
+
if (read->key_hash < read->bucket_count) {
|
520
|
+
return filedict_read_advance_hashmap(read);
|
521
|
+
}
|
522
|
+
else {
|
523
|
+
read->key_hash = 0;
|
524
|
+
}
|
525
|
+
}
|
526
|
+
|
448
527
|
read->hashmap_i += 1;
|
449
528
|
return filedict_read_advance_hashmap(read);
|
450
529
|
}
|
data/lib/filedict/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: filedictrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nigel Baillie
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email:
|