filedictrb 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aad965d1fb0dd25c74c20f917b2f71e0d48fc7d6dea59bab8ad2c5990b3fb8fe
4
- data.tar.gz: cd8c479f17d113b19d5ef89d565aeab6c51305e1e0f81dab3d8d6a538555009b
3
+ metadata.gz: 54edb33f9c980c2815d486ba98bcc317209325a2f5a8303114ba2912675ebebf
4
+ data.tar.gz: f5d8eb13dca465d500621c434440ccb648c4e2b52b4f72a70f2e803ac70c1607
5
5
  SHA512:
6
- metadata.gz: 2dc64caded0eadba4cb03da2dfb926b3c488d6773df59087752953371e5dfabeeac4625478f0ac33a26c4a60f76105a8026ee49aa6e2c700437c0e992ba6a6fe
7
- data.tar.gz: 7e392f4e734fbe940fb4d86c7a04700edcad3cbab286d4103f6846dbcadb1f2fd01ac8a6a272dd31e542ded3fb919fcfa6b566919161ab1cc3782a56692a2281
6
+ metadata.gz: 6fa2c3bc8d94db20229ce1f152ef67dfd2bdc503e090ba46420101a958891b5aa8039bded2a23bf0ee6ccec9ff25e602e5c25ba04faa8ac2ee62195f68960111
7
+ data.tar.gz: 5b2974454d61502919d45dde3b90b400eb77ab3fea4d87c44b9cae0b3c31c24ace549dc9ee314670ee114d0c7e751566c1b5181569e9120ddbd7bd214d702749
@@ -1,17 +1,14 @@
1
1
  #ifndef FILEDICT_H
2
2
  #define FILEDICT_H 1
3
3
 
4
- #ifndef FILEDICT_KEY_SIZE
5
- #define FILEDICT_KEY_SIZE 256
6
- #endif
4
+ #include <stddef.h>
7
5
 
8
- #ifndef FILEDICT_VALUE_SIZE
9
- #define FILEDICT_VALUE_SIZE 256
6
+ #ifndef FILEDICT_BUCKET_ENTRY_BYTES
7
+ #define FILEDICT_BUCKET_ENTRY_BYTES 512
10
8
  #endif
11
9
 
12
10
  typedef struct filedict_bucket_entry_t {
13
- char key[FILEDICT_KEY_SIZE];
14
- char value[FILEDICT_VALUE_SIZE];
11
+ char bytes[FILEDICT_BUCKET_ENTRY_BYTES];
15
12
  } filedict_bucket_entry_t;
16
13
 
17
14
  #ifndef FILEDICT_BUCKET_ENTRY_COUNT
@@ -58,6 +55,7 @@ typedef struct filedict_read_t {
58
55
 
59
56
  #ifndef FILEDICT_IMPL
60
57
  #define FILEDICT_IMPL
58
+ #include <sys/stat.h>
61
59
  #include <sys/mman.h>
62
60
  #include <string.h>
63
61
  #include <unistd.h>
@@ -80,7 +78,7 @@ static size_t filedict_default_hash_function(const char *input) {
80
78
 
81
79
  /*
82
80
  * Writes at most max_len chars from src into dest.
83
- * Returns the total number of bytes in src.
81
+ * Returns the string length of src.
84
82
  */
85
83
  static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
86
84
  size_t src_len = 0;
@@ -92,6 +90,23 @@ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len)
92
90
  if (c == 0) return src_len;
93
91
  src_len += 1;
94
92
  }
93
+
94
+ return src_len;
95
+ }
96
+
97
+ /*
98
+ * Returns the index of the trailing 0 when str1 and str2 have the same contents.
99
+ * Returns 0 when str1 and str2 differ, when both are empty, or when no trailing 0 is found within max_len.
100
+ */
101
+ static size_t filedict_string_includes(const char *str1, const char *str2, size_t max_len) {
102
+ size_t i;
103
+
104
+ for (i = 0; i < max_len; ++i) {
105
+ if (str1[i] != str2[i]) return 0;
106
+ if (str1[i] == 0) return i;
107
+ }
108
+
109
+ return 0;
95
110
  }
96
111
 
97
112
  static void filedict_init(filedict_t *filedict) {
@@ -120,15 +135,39 @@ static void filedict_deinit(filedict_t *filedict) {
120
135
  * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
121
136
  */
122
137
  static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
123
- size_t result = sizeof(filedict_header_t);
124
- size_t i;
138
+ /*
139
+ * We used to size each additional hashmap at 2x the previous, but realistically it seems that
140
+ * most resizes are triggered by keys that are ridiculously large, not by mass collision.
141
+ *
142
+ * A more proper fix might be to re-structure the whole filedict. We could keep the existing
143
+ * hashmap structure, but with buckets that expand dynamically. This would require each bucket
144
+ * to contain a "pointer" to the next bucket object if present.
145
+ *
146
+ * For now, it's easier to just keep the hashmap duplication without the size doubling.
147
+ */
148
+ return sizeof(filedict_header_t) + initial_bucket_count * hashmap_count * sizeof(filedict_bucket_t);
149
+ }
125
150
 
126
- for (i = 0; i < hashmap_count; ++i) {
127
- /* Bucket count is multiplied by 2 for each additional hashmap. */
128
- result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
129
- }
151
+ /*
152
+ * Resizes the filedict based on the header hashmap count and initial bucket count.
153
+ * Naturally, your pointers into the map will become invalid after calling this.
154
+ */
155
+ static void filedict_resize(filedict_t *filedict) {
156
+ filedict_header_t *header = (filedict_header_t*)filedict->data;
157
+ size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
158
+ if (computed_size <= filedict->data_len) return;
130
159
 
131
- return result;
160
+ munmap(filedict->data, filedict->data_len);
161
+ filedict->data = mmap(
162
+ filedict->data,
163
+ computed_size,
164
+ PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
165
+ MAP_SHARED,
166
+ filedict->fd,
167
+ 0
168
+ );
169
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
170
+ filedict->data_len = computed_size;
132
171
  }
133
172
 
134
173
  /*
@@ -149,12 +188,20 @@ static void filedict_open_f(
149
188
  int flags,
150
189
  unsigned int initial_bucket_count
151
190
  ) {
191
+ struct stat info;
192
+
152
193
  filedict->flags = flags;
153
194
  filedict->fd = open(filename, flags, 0666);
154
195
  if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
196
+ if (fstat(filedict->fd, &info) != 0) { filedict->error = strerror(errno); return; }
197
+
198
+ if (info.st_size == 0 && (flags & O_RDWR)) {
199
+ filedict->data_len = filedict_file_size(initial_bucket_count, 1);
200
+ ftruncate(filedict->fd, filedict->data_len);
201
+ } else {
202
+ filedict->data_len = info.st_size;
203
+ }
155
204
 
156
- filedict->data_len = filedict_file_size(initial_bucket_count, 1);
157
- ftruncate(filedict->fd, filedict->data_len);
158
205
  filedict->data = mmap(
159
206
  NULL,
160
207
  filedict->data_len,
@@ -206,35 +253,42 @@ try_again:
206
253
  filedict_bucket_entry_t *entry = &bucket->entries[i];
207
254
 
208
255
  /* Easy case: fresh entry. We can just insert here and call it quits. */
209
- if (entry->key[0] == 0) {
210
- strncpy(entry->key, key, FILEDICT_KEY_SIZE);
211
- size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
256
+ if (entry->bytes[0] == 0) {
257
+ size_t key_len = filedict_copy_string(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES);
258
+ size_t value_len = filedict_copy_string(entry->bytes + key_len + 1, value, FILEDICT_BUCKET_ENTRY_BYTES);
212
259
 
213
- if (value_len > FILEDICT_VALUE_SIZE) {
260
+ if (key_len + value_len > FILEDICT_BUCKET_ENTRY_BYTES) {
214
261
  filedict->error = "Value too big";
215
262
  }
216
263
  return;
217
264
  }
218
265
  /*
219
266
  * We need to check for room in the value, then append value.
220
- * This is also where we might run into a duplicate and duck out.existing
267
+ * This is also where we might run into a duplicate and duck out.
221
268
  */
222
- else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
269
+ else if (strncmp(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES) == 0) {
223
270
  long long first_nonzero = -1;
224
271
  char *candidate = NULL;
225
- size_t value_i, candidate_len;
272
+ size_t bytes_i, candidate_max_len;
226
273
 
227
- for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
274
+ for (bytes_i = 0; entry->bytes[bytes_i] != 0; ++bytes_i) {
275
+ if (bytes_i >= FILEDICT_BUCKET_ENTRY_BYTES) {
276
+ filedict->error = "Mysterious entry overflow!! Does it contain a massive key?";
277
+ return;
278
+ }
279
+ }
280
+
281
+ for (bytes_i += 1; bytes_i < FILEDICT_BUCKET_ENTRY_BYTES - 1; ++bytes_i) {
228
282
  if (unique) {
229
- if (first_nonzero == -1 && entry->value[value_i] != 0) {
230
- first_nonzero = value_i;
283
+ if (first_nonzero == -1 && entry->bytes[bytes_i] != 0) {
284
+ first_nonzero = bytes_i;
231
285
  }
232
286
 
233
- if (entry->value[value_i] == 0) {
287
+ if (entry->bytes[bytes_i] == 0) {
234
288
  int cmp = strncmp(
235
- &entry->value[first_nonzero],
289
+ &entry->bytes[first_nonzero],
236
290
  value,
237
- FILEDICT_VALUE_SIZE - first_nonzero
291
+ FILEDICT_BUCKET_ENTRY_BYTES - first_nonzero
238
292
  );
239
293
  if (cmp == 0) {
240
294
  /* Looks like this value already exists! */
@@ -244,13 +298,13 @@ try_again:
244
298
  }
245
299
  }
246
300
 
247
- if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
248
- candidate = &entry->value[value_i + 1];
249
- candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
301
+ if (entry->bytes[bytes_i] == 0 && entry->bytes[bytes_i + 1] == 0) {
302
+ candidate = &entry->bytes[bytes_i + 1];
303
+ candidate_max_len = FILEDICT_BUCKET_ENTRY_BYTES - bytes_i - 1;
250
304
 
251
- if (strlen(value) >= candidate_len) break;
305
+ if (strlen(value) >= candidate_max_len) break;
252
306
 
253
- strncpy(candidate, value, candidate_len);
307
+ strncpy(candidate, value, candidate_max_len);
254
308
  return;
255
309
  }
256
310
  }
@@ -259,7 +313,6 @@ try_again:
259
313
 
260
314
  ++hashmap_i;
261
315
  hashmap += bucket_count;
262
- bucket_count = (bucket_count << 1);
263
316
  }
264
317
 
265
318
  /*
@@ -293,27 +346,6 @@ try_again:
293
346
  goto try_again;
294
347
  }
295
348
 
296
- /*
297
- * Resizes the filedict based on the header hashmap count and initial bucket count.
298
- * Naturally, your pointers into the map will become invalid after calling this.
299
- */
300
- static void filedict_resize(filedict_t *filedict) {
301
- filedict_header_t *header = (filedict_header_t*)filedict->data;
302
- size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
303
-
304
- munmap(filedict->data, filedict->data_len);
305
- filedict->data = mmap(
306
- filedict->data,
307
- computed_size,
308
- PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
309
- MAP_SHARED,
310
- filedict->fd,
311
- 0
312
- );
313
- if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
314
- filedict->data_len = computed_size;
315
- }
316
-
317
349
  /*
318
350
  * There are 3 "levels" to a filedict. From top to bottom:
319
351
  * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
@@ -331,8 +363,8 @@ static void filedict_resize(filedict_t *filedict) {
331
363
  static int filedict_read_advance_value(filedict_read_t *read) {
332
364
  assert(read->entry != NULL);
333
365
 
334
- const char *buffer_begin = read->entry->value;
335
- const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
366
+ const char *buffer_begin = read->entry->bytes;
367
+ const char *buffer_end = buffer_begin + FILEDICT_BUCKET_ENTRY_BYTES;
336
368
 
337
369
  const char *c;
338
370
  for (c = read->value; c < buffer_end; ++c) {
@@ -356,8 +388,8 @@ static int filedict_read_advance_value(filedict_read_t *read) {
356
388
  * Returns 0 when we exhausted all remaining entries and didn't find a match.
357
389
  */
358
390
  static int filedict_read_advance_entry(filedict_read_t *read) {
359
- assert(read->key != NULL);
360
- assert(strlen(read->key) > 0);
391
+ size_t value_start_i;
392
+
361
393
  assert(read->bucket != NULL);
362
394
 
363
395
  while (1) {
@@ -365,9 +397,22 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
365
397
 
366
398
  read->entry = &read->bucket->entries[read->entry_i];
367
399
 
368
- if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
369
- read->value = read->entry->value;
370
- log_return(1);
400
+ if (read->key == NULL) {
401
+ if (read->entry->bytes[0] != 0) {
402
+ value_start_i = strlen(read->entry->bytes) + 1;
403
+ read->value = &read->entry->bytes[value_start_i];
404
+ log_return(1);
405
+ }
406
+ }
407
+ else {
408
+ value_start_i = filedict_string_includes(read->entry->bytes, read->key, FILEDICT_BUCKET_ENTRY_BYTES);
409
+
410
+ if (value_start_i > 0) {
411
+ /* add 1 because it's pointing to the 0 after key; not the first char of value */
412
+ value_start_i += 1;
413
+ read->value = &read->entry->bytes[value_start_i];
414
+ log_return(1);
415
+ }
371
416
  }
372
417
 
373
418
  read->entry_i += 1;
@@ -382,6 +427,7 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
382
427
  */
383
428
  static int filedict_read_advance_hashmap(filedict_read_t *read) {
384
429
  filedict_t *filedict = read->filedict;
430
+ int success = 0;
385
431
 
386
432
  assert(filedict);
387
433
  assert(filedict->data);
@@ -400,12 +446,25 @@ static int filedict_read_advance_hashmap(filedict_read_t *read) {
400
446
 
401
447
  filedict_bucket_t *hashmap = filedict->data + offset;
402
448
 
403
- read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
449
+ read->bucket_count = (size_t)header->initial_bucket_count;
404
450
  read->bucket = &hashmap[read->key_hash % read->bucket_count];
405
451
  read->entry = &read->bucket->entries[0];
406
452
 
407
453
  read->entry_i = 0;
408
454
 
455
+ if (read->key == NULL) {
456
+ success = filedict_read_advance_entry(read);
457
+ while (!success) {
458
+ read->key_hash += 1;
459
+ read->bucket = &hashmap[read->key_hash % read->bucket_count];
460
+ read->entry = &read->bucket->entries[0];
461
+ read->entry_i = 0;
462
+ success = filedict_read_advance_entry(read);
463
+ if (read->key_hash >= read->bucket_count) return 0;
464
+ }
465
+ return success;
466
+ }
467
+
409
468
  log_return(filedict_read_advance_entry(read));
410
469
  }
411
470
 
@@ -422,7 +481,14 @@ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
422
481
  read.entry_i = 0;
423
482
  read.hashmap_i = 0;
424
483
  read.bucket_count = 0;
425
- read.key_hash = filedict->hash_function(key);
484
+
485
+ /* NULL key means we want to iterate over the entire dictionary */
486
+ if (key == NULL) {
487
+ read.key_hash = 0;
488
+ }
489
+ else {
490
+ read.key_hash = filedict->hash_function(key);
491
+ }
426
492
 
427
493
  filedict_read_advance_hashmap(&read);
428
494
  return read;
@@ -445,6 +511,19 @@ static int filedict_get_next(filedict_read_t *read) {
445
511
  found = filedict_read_advance_entry(read);
446
512
  if (found == 1) return found;
447
513
 
514
+ /*
515
+ * If read->key is NULL, that means we're iterating through the whole dict.
516
+ */
517
+ if (read->key == NULL) {
518
+ read->key_hash += 1;
519
+ if (read->key_hash < read->bucket_count) {
520
+ return filedict_read_advance_hashmap(read);
521
+ }
522
+ else {
523
+ read->key_hash = 0;
524
+ }
525
+ }
526
+
448
527
  read->hashmap_i += 1;
449
528
  return filedict_read_advance_hashmap(read);
450
529
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Filedict
4
- VERSION = "0.1.3"
4
+ VERSION = "1.0.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filedictrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nigel Baillie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-04-03 00:00:00.000000000 Z
11
+ date: 2022-06-19 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: