RubyGems - filedictrb - Versions diffs - 0.1.3 → 1.0.0 - Mend

filedictrb 0.1.3 → 1.0.0

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: aad965d1fb0dd25c74c20f917b2f71e0d48fc7d6dea59bab8ad2c5990b3fb8fe
-  data.tar.gz: cd8c479f17d113b19d5ef89d565aeab6c51305e1e0f81dab3d8d6a538555009b
+  metadata.gz: 54edb33f9c980c2815d486ba98bcc317209325a2f5a8303114ba2912675ebebf
+  data.tar.gz: f5d8eb13dca465d500621c434440ccb648c4e2b52b4f72a70f2e803ac70c1607
 SHA512:
-  metadata.gz: 2dc64caded0eadba4cb03da2dfb926b3c488d6773df59087752953371e5dfabeeac4625478f0ac33a26c4a60f76105a8026ee49aa6e2c700437c0e992ba6a6fe
-  data.tar.gz: 7e392f4e734fbe940fb4d86c7a04700edcad3cbab286d4103f6846dbcadb1f2fd01ac8a6a272dd31e542ded3fb919fcfa6b566919161ab1cc3782a56692a2281
+  metadata.gz: 6fa2c3bc8d94db20229ce1f152ef67dfd2bdc503e090ba46420101a958891b5aa8039bded2a23bf0ee6ccec9ff25e602e5c25ba04faa8ac2ee62195f68960111
+  data.tar.gz: 5b2974454d61502919d45dde3b90b400eb77ab3fea4d87c44b9cae0b3c31c24ace549dc9ee314670ee114d0c7e751566c1b5181569e9120ddbd7bd214d702749

data/ext/filedict/filedict.h CHANGED Viewed

@@ -1,17 +1,14 @@
 #ifndef FILEDICT_H
 #define FILEDICT_H 1
-#ifndef FILEDICT_KEY_SIZE
-#define FILEDICT_KEY_SIZE 256
-#endif
+#include <stddef.h>
-#ifndef FILEDICT_VALUE_SIZE
-#define FILEDICT_VALUE_SIZE 256
+#ifndef FILEDICT_BUCKET_ENTRY_BYTES
+#define FILEDICT_BUCKET_ENTRY_BYTES 512
 #endif
 typedef struct filedict_bucket_entry_t {
-    char key[FILEDICT_KEY_SIZE];
-    char value[FILEDICT_VALUE_SIZE];
+    char bytes[FILEDICT_BUCKET_ENTRY_BYTES];
 } filedict_bucket_entry_t;
 #ifndef FILEDICT_BUCKET_ENTRY_COUNT
@@ -58,6 +55,7 @@ typedef struct filedict_read_t {
 #ifndef FILEDICT_IMPL
 #define FILEDICT_IMPL
+#include <sys/stat.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <unistd.h>
@@ -80,7 +78,7 @@ static size_t filedict_default_hash_function(const char *input) {
 /*
  * Writes at most max_len chars from src into dest.
- * Returns the total number of bytes in src.
+ * Returns the string length of src.
  */
 static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
     size_t src_len = 0;
@@ -92,6 +90,23 @@ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len)
         if (c == 0) return src_len;
         src_len += 1;
     }
+    return src_len;
+}
+/*
+ * Returns the index of the terminating 0 when str1 and str2 hold the same string.
+ * Returns 0 when they differ, when no terminator is found within max_len chars,
+ * or when both strings are empty (the terminator is then at index 0).
+ */
+static size_t filedict_string_includes(const char *str1, const char *str2, size_t max_len) {
+    size_t i;
+    for (i = 0; i < max_len; ++i) {
+        if (str1[i] != str2[i]) return 0;
+        if (str1[i] == 0) return i;
+    }
+    return 0;
 }
 static void filedict_init(filedict_t *filedict) {
@@ -120,15 +135,39 @@ static void filedict_deinit(filedict_t *filedict) {
  * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
  */
 static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
-    size_t result = sizeof(filedict_header_t);
-    size_t i;
+    /*
+     * We used to size each additional hashmap at 2x the previous, but realistically it seems that
+     * most resizes are triggered by keys that are ridiculously large, not by mass collision.
+     *
+     * A more proper fix might be to re-structure the whole filedict. We could keep the existing
+     * hashmap structure, but with buckets that expand dynamically. This would require each bucket
+     * to contain a "pointer" to the next bucket object if present.
+     *
+     * For now, it's easier to just keep the hashmap duplication without the size doubling.
+     */
+    return sizeof(filedict_header_t) + initial_bucket_count * hashmap_count * sizeof(filedict_bucket_t);
+}
-    for (i = 0; i < hashmap_count; ++i) {
-        /* Bucket count is multiplied by 2 for each additional hashmap. */
-        result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
-    }
+/*
+ * Resizes the filedict based on the header hashmap count and initial bucket count.
+ * Naturally, your pointers into the map will become invalid after calling this.
+ */
+static void filedict_resize(filedict_t *filedict) {
+    filedict_header_t *header = (filedict_header_t*)filedict->data;
+    size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
+    if (computed_size <= filedict->data_len) return;
-    return result;
+    munmap(filedict->data, filedict->data_len);
+    filedict->data = mmap(
+        filedict->data,
+        computed_size,
+        PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
+        MAP_SHARED,
+        filedict->fd,
+        0
+    );
+    if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+    filedict->data_len = computed_size;
 }
 /*
@@ -149,12 +188,20 @@ static void filedict_open_f(
     int flags,
     unsigned int initial_bucket_count
 ) {
+    struct stat info;
     filedict->flags = flags;
     filedict->fd = open(filename, flags, 0666);
     if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
+    if (fstat(filedict->fd, &info) != 0) { filedict->error = strerror(errno); return; }
+    if (info.st_size == 0 && (flags & O_RDWR)) {
+        filedict->data_len = filedict_file_size(initial_bucket_count, 1);
+        ftruncate(filedict->fd, filedict->data_len);
+    } else {
+        filedict->data_len = info.st_size;
+    }
-    filedict->data_len = filedict_file_size(initial_bucket_count, 1);
-    ftruncate(filedict->fd, filedict->data_len);
     filedict->data = mmap(
         NULL,
         filedict->data_len,
@@ -206,35 +253,42 @@ try_again:
             filedict_bucket_entry_t *entry = &bucket->entries[i];
             /* Easy case: fresh entry. We can just insert here and call it quits. */
-            if (entry->key[0] == 0) {
-                strncpy(entry->key, key, FILEDICT_KEY_SIZE);
-                size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
+            if (entry->bytes[0] == 0) {
+                size_t key_len = filedict_copy_string(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES);
+                size_t value_len = filedict_copy_string(entry->bytes + key_len + 1, value, FILEDICT_BUCKET_ENTRY_BYTES);
-                if (value_len > FILEDICT_VALUE_SIZE) {
+                if (key_len + value_len > FILEDICT_BUCKET_ENTRY_BYTES) {
                     filedict->error = "Value too big";
                 }
                 return;
             }
             /*
              * We need to check for room in the value, then append value.
-             * This is also where we might run into a duplicate and duck out.existing
+             * This is also where we might run into a duplicate and duck out.
              */
-            else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
+            else if (strncmp(entry->bytes, key, FILEDICT_BUCKET_ENTRY_BYTES) == 0) {
                 long long first_nonzero = -1;
                 char *candidate = NULL;
-                size_t value_i, candidate_len;
+                size_t bytes_i, candidate_max_len;
-                for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
+                for (bytes_i = 0; entry->bytes[bytes_i] != 0; ++bytes_i) {
+                    if (bytes_i >= FILEDICT_BUCKET_ENTRY_BYTES) {
+                        filedict->error = "Mysterious entry overflow!! Does it contain a massive key?";
+                        return;
+                    }
+                }
+                for (bytes_i += 1; bytes_i < FILEDICT_BUCKET_ENTRY_BYTES - 1; ++bytes_i) {
                     if (unique) {
-                        if (first_nonzero == -1 && entry->value[value_i] != 0) {
-                            first_nonzero = value_i;
+                        if (first_nonzero == -1 && entry->bytes[bytes_i] != 0) {
+                            first_nonzero = bytes_i;
                         }
-                        if (entry->value[value_i] == 0) {
+                        if (entry->bytes[bytes_i] == 0) {
                             int cmp = strncmp(
-                                &entry->value[first_nonzero],
+                                &entry->bytes[first_nonzero],
                                 value,
-                                FILEDICT_VALUE_SIZE - first_nonzero
+                                FILEDICT_BUCKET_ENTRY_BYTES - first_nonzero
                             );
                             if (cmp == 0) {
                                 /* Looks like this value already exists! */
@@ -244,13 +298,13 @@ try_again:
                         }
                     }
-                    if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
-                        candidate = &entry->value[value_i + 1];
-                        candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
+                    if (entry->bytes[bytes_i] == 0 && entry->bytes[bytes_i + 1] == 0) {
+                        candidate = &entry->bytes[bytes_i + 1];
+                        candidate_max_len = FILEDICT_BUCKET_ENTRY_BYTES - bytes_i - 1;
-                        if (strlen(value) >= candidate_len) break;
+                        if (strlen(value) >= candidate_max_len) break;
-                        strncpy(candidate, value, candidate_len);
+                        strncpy(candidate, value, candidate_max_len);
                         return;
                     }
                 }
@@ -259,7 +313,6 @@ try_again:
         ++hashmap_i;
         hashmap += bucket_count;
-        bucket_count = (bucket_count << 1);
     }
     /*
@@ -293,27 +346,6 @@ try_again:
     goto try_again;
 }
-/*
- * Resizes the filedict based on the header hashmap count and initial bucket count.
- * Naturally, your pointers into the map will become invalid after calling this.
- */
-static void filedict_resize(filedict_t *filedict) {
-    filedict_header_t *header = (filedict_header_t*)filedict->data;
-    size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
-    munmap(filedict->data, filedict->data_len);
-    filedict->data = mmap(
-        filedict->data,
-        computed_size,
-        PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
-        MAP_SHARED,
-        filedict->fd,
-        0
-    );
-    if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
-    filedict->data_len = computed_size;
-}
 /*
  * There are 3 "levels" to a filedict. From top to bottom:
  * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
@@ -331,8 +363,8 @@ static void filedict_resize(filedict_t *filedict) {
 static int filedict_read_advance_value(filedict_read_t *read) {
     assert(read->entry != NULL);
-    const char *buffer_begin = read->entry->value;
-    const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
+    const char *buffer_begin = read->entry->bytes;
+    const char *buffer_end = buffer_begin + FILEDICT_BUCKET_ENTRY_BYTES;
     const char *c;
     for (c = read->value; c < buffer_end; ++c) {
@@ -356,8 +388,8 @@ static int filedict_read_advance_value(filedict_read_t *read) {
  * Returns 0 when we exhausted all remaining entries and didn't find a match.
  */
 static int filedict_read_advance_entry(filedict_read_t *read) {
-    assert(read->key != NULL);
-    assert(strlen(read->key) > 0);
+    size_t value_start_i;
     assert(read->bucket != NULL);
     while (1) {
@@ -365,9 +397,22 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
         read->entry = &read->bucket->entries[read->entry_i];
-        if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
-            read->value = read->entry->value;
-            log_return(1);
+        if (read->key == NULL) {
+            if (read->entry->bytes[0] != 0) {
+                value_start_i = strlen(read->entry->bytes) + 1;
+                read->value = &read->entry->bytes[value_start_i];
+                log_return(1);
+            }
+        }
+        else {
+            value_start_i = filedict_string_includes(read->entry->bytes, read->key, FILEDICT_BUCKET_ENTRY_BYTES);
+            if (value_start_i > 0) {
+                /* add 1 because it's pointing to the 0 after key; not the first char of value */
+                value_start_i += 1;
+                read->value = &read->entry->bytes[value_start_i];
+                log_return(1);
+            }
         }
         read->entry_i += 1;
@@ -382,6 +427,7 @@ static int filedict_read_advance_entry(filedict_read_t *read) {
  */
 static int filedict_read_advance_hashmap(filedict_read_t *read) {
     filedict_t *filedict = read->filedict;
+    int success = 0;
     assert(filedict);
     assert(filedict->data);
@@ -400,12 +446,25 @@ static int filedict_read_advance_hashmap(filedict_read_t *read) {
     filedict_bucket_t *hashmap = filedict->data + offset;
-    read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
+    read->bucket_count = (size_t)header->initial_bucket_count;
     read->bucket = &hashmap[read->key_hash % read->bucket_count];
     read->entry = &read->bucket->entries[0];
     read->entry_i = 0;
+    if (read->key == NULL) {
+        success = filedict_read_advance_entry(read);
+        while (!success) {
+            read->key_hash += 1;
+            read->bucket = &hashmap[read->key_hash % read->bucket_count];
+            read->entry = &read->bucket->entries[0];
+            read->entry_i = 0;
+            success = filedict_read_advance_entry(read);
+            if (read->key_hash >= read->bucket_count) return 0;
+        }
+        return success;
+    }
     log_return(filedict_read_advance_entry(read));
 }
@@ -422,7 +481,14 @@ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
     read.entry_i = 0;
     read.hashmap_i = 0;
     read.bucket_count = 0;
-    read.key_hash = filedict->hash_function(key);
+    /* A NULL key means we want to iterate over the entire dictionary */
+    if (key == NULL) {
+        read.key_hash = 0;
+    }
+    else {
+        read.key_hash = filedict->hash_function(key);
+    }
     filedict_read_advance_hashmap(&read);
     return read;
@@ -445,6 +511,19 @@ static int filedict_get_next(filedict_read_t *read) {
     found = filedict_read_advance_entry(read);
     if (found == 1) return found;
+    /*
+     * If read->key is NULL, that means we're iterating through the whole dict.
+     */
+    if (read->key == NULL) {
+        read->key_hash += 1;
+        if (read->key_hash < read->bucket_count) {
+            return filedict_read_advance_hashmap(read);
+        }
+        else {
+            read->key_hash = 0;
+        }
+    }
     read->hashmap_i += 1;
     return filedict_read_advance_hashmap(read);
 }

data/lib/filedict/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Filedict
-  VERSION = "0.1.3"
+  VERSION = "1.0.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: filedictrb
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 1.0.0
 platform: ruby
 authors:
 - Nigel Baillie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-04-03 00:00:00.000000000 Z
+date: 2022-06-19 00:00:00.000000000 Z
 dependencies: []
 description:
 email: