filedictrb 0.1.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aad965d1fb0dd25c74c20f917b2f71e0d48fc7d6dea59bab8ad2c5990b3fb8fe
4
- data.tar.gz: cd8c479f17d113b19d5ef89d565aeab6c51305e1e0f81dab3d8d6a538555009b
3
+ metadata.gz: 54edb33f9c980c2815d486ba98bcc317209325a2f5a8303114ba2912675ebebf
4
+ data.tar.gz: f5d8eb13dca465d500621c434440ccb648c4e2b52b4f72a70f2e803ac70c1607
5
5
  SHA512:
6
- metadata.gz: 2dc64caded0eadba4cb03da2dfb926b3c488d6773df59087752953371e5dfabeeac4625478f0ac33a26c4a60f76105a8026ee49aa6e2c700437c0e992ba6a6fe
7
- data.tar.gz: 7e392f4e734fbe940fb4d86c7a04700edcad3cbab286d4103f6846dbcadb1f2fd01ac8a6a272dd31e542ded3fb919fcfa6b566919161ab1cc3782a56692a2281
6
+ metadata.gz: 6fa2c3bc8d94db20229ce1f152ef67dfd2bdc503e090ba46420101a958891b5aa8039bded2a23bf0ee6ccec9ff25e602e5c25ba04faa8ac2ee62195f68960111
7
+ data.tar.gz: 5b2974454d61502919d45dde3b90b400eb77ab3fea4d87c44b9cae0b3c31c24ace549dc9ee314670ee114d0c7e751566c1b5181569e9120ddbd7bd214d702749
@@ -1,17 +1,14 @@
1
1
  #ifndef FILEDICT_H
2
2
  #define FILEDICT_H 1
3
3
 
4
- #ifndef FILEDICT_KEY_SIZE
5
- #define FILEDICT_KEY_SIZE 256
6
- #endif
4
+ #include <stddef.h>
7
5
 
8
- #ifndef FILEDICT_VALUE_SIZE
9
- #define FILEDICT_VALUE_SIZE 256
6
+ #ifndef FILEDICT_BUCKET_ENTRY_BYTES
7
+ #define FILEDICT_BUCKET_ENTRY_BYTES 512
10
8
  #endif
11
9
 
12
10
  typedef struct filedict_bucket_entry_t {
13
- char key[FILEDICT_KEY_SIZE];
14
- char value[FILEDICT_VALUE_SIZE];
11
+ char bytes[FILEDICT_BUCKET_ENTRY_BYTES];
15
12
  } filedict_bucket_entry_t;
16
13
 
17
14
  #ifndef FILEDICT_BUCKET_ENTRY_COUNT
@@ -58,6 +55,7 @@ typedef struct filedict_read_t {
58
55
 
59
56
  #ifndef FILEDICT_IMPL
60
57
  #define FILEDICT_IMPL
58
+ #include <sys/stat.h>
61
59
  #include <sys/mman.h>
62
60
  #include <string.h>
63
61
  #include <unistd.h>
@@ -80,7 +78,7 @@ static size_t filedict_default_hash_function(const char *input) {
80
78
 
81
79
  /*
82
80
  * Writes at most max_len chars from src into dest.
83
- * Returns the total number of bytes in src.
81
+ * Returns the string length of src.
84
82
  */
85
83
  static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
86
84
  size_t src_len = 0;
@@ -92,6 +90,23 @@ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len)
92
90
  if (c == 0) return src_len;
93
91
  src_len += 1;
94
92
  }
93
+
94
+ return src_len;
95
+ }
96
+
97
+ /*
98
+ * Returns the index of the trailing 0 when str1 and str2 have the same contents.
99
+ * Returns 0 when str1 and str2 have different contents.
100
+ */
101
+ static size_t filedict_string_includes(const char *str1, const char *str2, size_t max_len) {
102
+ size_t i;
103
+
104
+ for (i = 0; i < max_len; ++i) {
105
+ if (str1[i] != str2[i]) return 0;
106
+ if (str1[i] == 0) return i;
107
+ }
108
+
109
+ return 0;
95
110
  }
96
111
 
97
112
  static void filedict_init(filedict_t *filedict) {
@@ -120,15 +135,39 @@ static void filedict_deinit(filedict_t *filedict) {
120
135
  * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
121
136
  */
122
137
  static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
123
- size_t result = sizeof(filedict_header_t);
124
- size_t i;
138
+ /*
139
+ * We used to size each additional hashmap at 2x the previous, but realistically it seems that
140
+ * most resizes are triggered by keys that are ridiculously large, not by mass collision.
141
+ *
142
+ * A more proper fix might be to re-structure the whole filedict. We could keep the existing
143
+ * hashmap structure, but with buckets that expand dynamically. This would require each bucket
144
+ * to contain a "pointer" to the next bucket object if present.
145
+ *
146
+ * For now, it's easier to just keep the hashmap duplication without the size doubling.
147
+ */
148
+ return sizeof(filedict_header_t) + initial_bucket_count * hashmap_count * sizeof(filedict_bucket_t);
149
+ }
125
150
 
126
- for (i = 0; i < hashmap_count; ++i) {
127
- /* Bucket count is multiplied by 2 for each additional hashmap. */
128
- result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
129
- }
151
+ /*
152
+ * Resizes the filedict based on the header hashmap count and initial bucket count.
153
+ * Naturally, your pointers into the map will become invalid after calling this.
154
+ */
155
+ static void filedict_resize(filedict_t *filedict) {
156
+ filedict_header_t *header = (filedict_header_t*)filedict->data;
157
+ size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
158
+ if (computed_size <= filedict->data_len) return;
130
159
 
131
- return result;
160
+ munmap(filedict->data, filedict->data_len);
161
+ filedict->data = mmap(
162
+ filedict->data,
163
+ computed_size,
164
+ PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
165
+ MAP_SHARED,
166
+ filedict->fd,
167
+ 0
168
+ );
169
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
170
+ filedict->data_len = computed_size;
132
171
  }
133
172
 
134
173
  /*
@@ -149,12 +188,20 @@ static void filedict_open_f(
149
188
  int flags,
150
189
  unsigned int initial_bucket_count
151
190
  ) {
191
+ struct stat info;
192
+
152
193
  filedict->flags = flags;
153
194
  filedict->fd = open(filename, flags, 0666);
154
195
  if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
196
+ if (fstat(filedict->fd, &info) != 0) { filedict->error = strerror(errno); return; }
197
+
198
+ if (info.st_size == 0 && (flags & O_RDWR)) {
199
+ filedict->data_len = filedict_file_size(initial_bucket_count, 1);
200
+ ftruncate(filedict->fd, filedict->data_len);
201
+ } else {
202
+ filedict->data_len = info.st_size;
203
+ }
155
204
 
156
- filedict->data_len = filedict_file_size(initial_bucket_count, 1);
157
- ftruncate(filedict->fd, filedict->data_len);
158
205
  filedict->data = mmap(
159
206
  NULL,
160
207
  filedict->data_len,
@@ -206,35 +253,42 @@ try_again:
206
253
  filedict_bucket_entry_t *entry = &bucket->entries[i];
207
254
 
208
255
  /* Easy case: fresh entry. We can just insert here and call it quits. */
209
- if (entry->key[0] == 0) {
210
- strncpy(entry->key, key, FILEDICT_KEY_SIZE);
211
- size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
256
+ if (entry->bytes[0] == 0) {
257
+ size_t key_len = filedict_copy_string(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES);
258
+ size_t value_len = filedict_copy_string(entry->bytes + key_len + 1, value, FILEDICT_BUCKET_ENTRY_BYTES);
212
259
 
213
- if (value_len > FILEDICT_VALUE_SIZE) {
260
+ if (key_len + value_len > FILEDICT_BUCKET_ENTRY_BYTES) {
214
261
  filedict->error = "Value too big";
215
262
  }
216
263
  return;
217
264
  }
218
265
  /*
219
266
  * We need to check for room in the value, then append value.
220
- * This is also where we might run into a duplicate and duck out.existing
267
+ * This is also where we might run into a duplicate and duck out.
221
268
  */
222
- else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
269
+ else if (strncmp(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES) == 0) {
223
270
  long long first_nonzero = -1;
224
271
  char *candidate = NULL;
225
- size_t value_i, candidate_len;
272
+ size_t bytes_i, candidate_max_len;
226
273
 
227
- for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
274
+ for (bytes_i = 0; entry->bytes[bytes_i] != 0; ++bytes_i) {
275
+ if (bytes_i >= FILEDICT_BUCKET_ENTRY_BYTES) {
276
+ filedict->error = "Mysterious entry overflow!! Does it contain a massive key?";
277
+ return;
278
+ }
279
+ }
280
+
281
+ for (bytes_i += 1; bytes_i < FILEDICT_BUCKET_ENTRY_BYTES - 1; ++bytes_i) {
228
282
  if (unique) {
229
- if (first_nonzero == -1 && entry->value[value_i] != 0) {
230
- first_nonzero = value_i;
283
+ if (first_nonzero == -1 && entry->bytes[bytes_i] != 0) {
284
+ first_nonzero = bytes_i;
231
285
  }
232
286
 
233
- if (entry->value[value_i] == 0) {
287
+ if (entry->bytes[bytes_i] == 0) {
234
288
  int cmp = strncmp(
235
- &entry->value[first_nonzero],
289
+ &entry->bytes[first_nonzero],
236
290
  value,
237
- FILEDICT_VALUE_SIZE - first_nonzero
291
+ FILEDICT_BUCKET_ENTRY_BYTES - first_nonzero
238
292
  );
239
293
  if (cmp == 0) {
240
294
  /* Looks like this value already exists! */
@@ -244,13 +298,13 @@ try_again:
244
298
  }
245
299
  }
246
300
 
247
- if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
248
- candidate = &entry->value[value_i + 1];
249
- candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
301
+ if (entry->bytes[bytes_i] == 0 && entry->bytes[bytes_i + 1] == 0) {
302
+ candidate = &entry->bytes[bytes_i + 1];
303
+ candidate_max_len = FILEDICT_BUCKET_ENTRY_BYTES - bytes_i - 1;
250
304
 
251
- if (strlen(value) >= candidate_len) break;
305
+ if (strlen(value) >= candidate_max_len) break;
252
306
 
253
- strncpy(candidate, value, candidate_len);
307
+ strncpy(candidate, value, candidate_max_len);
254
308
  return;
255
309
  }
256
310
  }
@@ -259,7 +313,6 @@ try_again:
259
313
 
260
314
  ++hashmap_i;
261
315
  hashmap += bucket_count;
262
- bucket_count = (bucket_count << 1);
263
316
  }
264
317
 
265
318
  /*
@@ -293,27 +346,6 @@ try_again:
293
346
  goto try_again;
294
347
  }
295
348
 
296
- /*
297
- * Resizes the filedict based on the header hashmap count and initial bucket count.
298
- * Naturally, your pointers into the map will become invalid after calling this.
299
- */
300
- static void filedict_resize(filedict_t *filedict) {
301
- filedict_header_t *header = (filedict_header_t*)filedict->data;
302
- size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
303
-
304
- munmap(filedict->data, filedict->data_len);
305
- filedict->data = mmap(
306
- filedict->data,
307
- computed_size,
308
- PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
309
- MAP_SHARED,
310
- filedict->fd,
311
- 0
312
- );
313
- if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
314
- filedict->data_len = computed_size;
315
- }
316
-
317
349
  /*
318
350
  * There are 3 "levels" to a filedict. From top to bottom:
319
351
  * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
@@ -331,8 +363,8 @@ static void filedict_resize(filedict_t *filedict) {
331
363
  static int filedict_read_advance_value(filedict_read_t *read) {
332
364
  assert(read->entry != NULL);
333
365
 
334
- const char *buffer_begin = read->entry->value;
335
- const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
366
+ const char *buffer_begin = read->entry->bytes;
367
+ const char *buffer_end = buffer_begin + FILEDICT_BUCKET_ENTRY_BYTES;
336
368
 
337
369
  const char *c;
338
370
  for (c = read->value; c < buffer_end; ++c) {
@@ -356,8 +388,8 @@ static int filedict_read_advance_value(filedict_read_t *read) {
356
388
  * Returns 0 when we exhausted all remaining entries and didn't find a match.
357
389
  */
358
390
  static int filedict_read_advance_entry(filedict_read_t *read) {
359
- assert(read->key != NULL);
360
- assert(strlen(read->key) > 0);
391
+ size_t value_start_i;
392
+
361
393
  assert(read->bucket != NULL);
362
394
 
363
395
  while (1) {
@@ -365,9 +397,22 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
365
397
 
366
398
  read->entry = &read->bucket->entries[read->entry_i];
367
399
 
368
- if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
369
- read->value = read->entry->value;
370
- log_return(1);
400
+ if (read->key == NULL) {
401
+ if (read->entry->bytes[0] != 0) {
402
+ value_start_i = strlen(read->entry->bytes) + 1;
403
+ read->value = &read->entry->bytes[value_start_i];
404
+ log_return(1);
405
+ }
406
+ }
407
+ else {
408
+ value_start_i = filedict_string_includes(read->entry->bytes, read->key, FILEDICT_BUCKET_ENTRY_BYTES);
409
+
410
+ if (value_start_i > 0) {
411
+ /* add 1 because it's pointing to the 0 after key; not the first char of value */
412
+ value_start_i += 1;
413
+ read->value = &read->entry->bytes[value_start_i];
414
+ log_return(1);
415
+ }
371
416
  }
372
417
 
373
418
  read->entry_i += 1;
@@ -382,6 +427,7 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
382
427
  */
383
428
  static int filedict_read_advance_hashmap(filedict_read_t *read) {
384
429
  filedict_t *filedict = read->filedict;
430
+ int success = 0;
385
431
 
386
432
  assert(filedict);
387
433
  assert(filedict->data);
@@ -400,12 +446,25 @@ static int filedict_read_advance_hashmap(filedict_read_t *read) {
400
446
 
401
447
  filedict_bucket_t *hashmap = filedict->data + offset;
402
448
 
403
- read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
449
+ read->bucket_count = (size_t)header->initial_bucket_count;
404
450
  read->bucket = &hashmap[read->key_hash % read->bucket_count];
405
451
  read->entry = &read->bucket->entries[0];
406
452
 
407
453
  read->entry_i = 0;
408
454
 
455
+ if (read->key == NULL) {
456
+ success = filedict_read_advance_entry(read);
457
+ while (!success) {
458
+ read->key_hash += 1;
459
+ read->bucket = &hashmap[read->key_hash % read->bucket_count];
460
+ read->entry = &read->bucket->entries[0];
461
+ read->entry_i = 0;
462
+ success = filedict_read_advance_entry(read);
463
+ if (read->key_hash >= read->bucket_count) return 0;
464
+ }
465
+ return success;
466
+ }
467
+
409
468
  log_return(filedict_read_advance_entry(read));
410
469
  }
411
470
 
@@ -422,7 +481,14 @@ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
422
481
  read.entry_i = 0;
423
482
  read.hashmap_i = 0;
424
483
  read.bucket_count = 0;
425
- read.key_hash = filedict->hash_function(key);
484
+
485
+ /* NULL key means we want to iterate over the entire dictionary */
486
+ if (key == NULL) {
487
+ read.key_hash = 0;
488
+ }
489
+ else {
490
+ read.key_hash = filedict->hash_function(key);
491
+ }
426
492
 
427
493
  filedict_read_advance_hashmap(&read);
428
494
  return read;
@@ -445,6 +511,19 @@ static int filedict_get_next(filedict_read_t *read) {
445
511
  found = filedict_read_advance_entry(read);
446
512
  if (found == 1) return found;
447
513
 
514
+ /*
515
+ * If read->key is NULL, that means we're iterating through the whole dict.
516
+ */
517
+ if (read->key == NULL) {
518
+ read->key_hash += 1;
519
+ if (read->key_hash < read->bucket_count) {
520
+ return filedict_read_advance_hashmap(read);
521
+ }
522
+ else {
523
+ read->key_hash = 0;
524
+ }
525
+ }
526
+
448
527
  read->hashmap_i += 1;
449
528
  return filedict_read_advance_hashmap(read);
450
529
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Filedict
4
- VERSION = "0.1.3"
4
+ VERSION = "1.0.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filedictrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nigel Baillie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-04-03 00:00:00.000000000 Z
11
+ date: 2022-06-19 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: