filedictrb 0.1.0 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 74171da9cf75b57d441faad6754c05e718a170fdada7618cd593ddcc8f8b982a
-   data.tar.gz: 6a88336c3e3cb8e1a18d50bc14bd49794f0ba10e614667c0a086dfb9cd35d11c
+   metadata.gz: aad965d1fb0dd25c74c20f917b2f71e0d48fc7d6dea59bab8ad2c5990b3fb8fe
+   data.tar.gz: cd8c479f17d113b19d5ef89d565aeab6c51305e1e0f81dab3d8d6a538555009b
  SHA512:
-   metadata.gz: 9dde01e9a2f1af933ec22e7648797fbbd393362bd9abdfc7ff351104aa0184f02222edcc8c59de98770172f4e1e9c7285d61e9d2b89d575a06aebacb080d86d3
-   data.tar.gz: d569f379a3787a42dcd691a28b57e68ba1dd5579c4afb8741aad66d6ff13d58d2dba5e949772684c41cbec4ecccc9b55de9e1b9506353c447a37d7fd04d14e65
+   metadata.gz: 2dc64caded0eadba4cb03da2dfb926b3c488d6773df59087752953371e5dfabeeac4625478f0ac33a26c4a60f76105a8026ee49aa6e2c700437c0e992ba6a6fe
+   data.tar.gz: 7e392f4e734fbe940fb4d86c7a04700edcad3cbab286d4103f6846dbcadb1f2fd01ac8a6a272dd31e542ded3fb919fcfa6b566919161ab1cc3782a56692a2281
data/bin/setup CHANGED
@@ -3,6 +3,7 @@ set -euo pipefail
  IFS=$'\n\t'
  set -vx

+ git submodule update --init
  bundle install

  # Do any other automated setup that you need to do here
data/ext/filedict/filedict.h ADDED
@@ -0,0 +1,452 @@
+ #ifndef FILEDICT_H
+ #define FILEDICT_H 1
+
+ #ifndef FILEDICT_KEY_SIZE
+ #define FILEDICT_KEY_SIZE 256
+ #endif
+
+ #ifndef FILEDICT_VALUE_SIZE
+ #define FILEDICT_VALUE_SIZE 256
+ #endif
+
+ typedef struct filedict_bucket_entry_t {
+     char key[FILEDICT_KEY_SIZE];
+     char value[FILEDICT_VALUE_SIZE];
+ } filedict_bucket_entry_t;
+
+ #ifndef FILEDICT_BUCKET_ENTRY_COUNT
+ #define FILEDICT_BUCKET_ENTRY_COUNT 4
+ #endif
+
+ typedef struct filedict_bucket_t {
+     filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT];
+ } filedict_bucket_t;
+
+ typedef size_t (*filedict_hash_function_t)(const char *);
+
+ typedef struct filedict_t {
+     const char *error;
+     int fd;
+     int flags;
+     void *data;
+     size_t data_len;
+     filedict_hash_function_t hash_function;
+ } filedict_t;
+
+ typedef struct filedict_header_t {
+     unsigned long long initial_bucket_count : 32;
+     unsigned long long hashmap_count : 32;
+ } __attribute__ ((__packed__)) filedict_header_t;
+
+ typedef struct filedict_read_t {
+     filedict_t *filedict;
+     const char *key;
+     const char *value;
+     filedict_bucket_t *bucket;
+     filedict_bucket_entry_t *entry;
+     size_t entry_i;
+     size_t hashmap_i;
+     size_t bucket_count;
+     size_t key_hash;
+ } filedict_read_t;
+
+ #endif
+
+ /*
+  * Above is the header, below is the implementation
+  */
+
+ #ifndef FILEDICT_IMPL
+ #define FILEDICT_IMPL
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <limits.h>
+ #include <assert.h>
+
+ /* This is "djb2" from http://www.cse.yorku.ca/~oz/hash.html */
+ static size_t filedict_default_hash_function(const char *input) {
+     unsigned long hash = 5381;
+     int c;
+
+     while ((c = *input++) != 0) {
+         hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
+     }
+
+     return hash;
+ }
+
+ /*
+  * Writes at most max_len chars from src into dest.
+  * Returns the total number of bytes in src.
+  */
+ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
+     size_t src_len = 0;
+     char c;
+
+     while (1) {
+         c = *src++;
+         if (src_len < max_len) { *dest++ = c; }
+         if (c == 0) return src_len;
+         src_len += 1;
+     }
+ }
+
+ static void filedict_init(filedict_t *filedict) {
+     filedict->error = NULL;
+     filedict->fd = 0;
+     filedict->flags = 0;
+     filedict->data_len = 0;
+     filedict->data = NULL;
+     filedict->hash_function = filedict_default_hash_function;
+ }
+
+ static void filedict_deinit(filedict_t *filedict) {
+     if (filedict->data) {
+         munmap(filedict->data, filedict->data_len);
+         filedict->data = NULL;
+         filedict->data_len = 0;
+     }
+     if (filedict->fd) {
+         close(filedict->fd);
+         filedict->fd = 0;
+         filedict->flags = 0;
+     }
+ }
+
+ /*
+  * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
+  */
+ static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
+     size_t result = sizeof(filedict_header_t);
+     size_t i;
+
+     for (i = 0; i < hashmap_count; ++i) {
+         /* Bucket count is multiplied by 2 for each additional hashmap. */
+         result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
+     }
+
+     return result;
+ }
+
+ /*
+  * This opens a new file for reading and writing, optionally letting you specify the initial bucket count.
+  */
+ #define filedict_open_new(filedict, filename) \
+     filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
+
+ #define filedict_open_readonly(filedict, filename) \
+     filedict_open_f(filedict, filename, O_RDONLY, 4096)
+
+ #define filedict_open(filedict, filename) \
+     filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
+
+ static void filedict_open_f(
+     filedict_t *filedict,
+     const char *filename,
+     int flags,
+     unsigned int initial_bucket_count
+ ) {
+     filedict->flags = flags;
+     filedict->fd = open(filename, flags, 0666);
+     if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
+
+     filedict->data_len = filedict_file_size(initial_bucket_count, 1);
+     ftruncate(filedict->fd, filedict->data_len);
+     filedict->data = mmap(
+         NULL,
+         filedict->data_len,
+         PROT_READ | ((flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+
+     filedict_header_t *data = (filedict_header_t *)filedict->data;
+     assert(initial_bucket_count <= UINT_MAX);
+
+     if (data->initial_bucket_count == 0) {
+         data->initial_bucket_count = initial_bucket_count;
+         data->hashmap_count = 1;
+     }
+ }
+
+ /*
+  * Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
+  * value onto the end of the entry.
+  */
+ #define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
+ #define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
+
+ static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
+     assert(filedict->fd != 0);
+     assert(filedict->data != NULL);
+
+     size_t i, hashmap_i = 0, bucket_count, key_hash;
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+     filedict_bucket_t *hashmap = filedict->data + sizeof(filedict_header_t);
+     filedict_bucket_t *bucket;
+
+     bucket_count = header->initial_bucket_count;
+
+     key_hash = filedict->hash_function(key);
+
+     /*
+      * Here we loop through each hashmap.
+      */
+     while (hashmap_i < header->hashmap_count) {
+ try_again:
+         /* TODO: can we truncate instead of modulo, like in Ruby? */
+         bucket = &hashmap[key_hash % bucket_count];
+
+         for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
+             filedict_bucket_entry_t *entry = &bucket->entries[i];
+
+             /* Easy case: fresh entry. We can just insert here and call it quits. */
+             if (entry->key[0] == 0) {
+                 strncpy(entry->key, key, FILEDICT_KEY_SIZE);
+                 size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
+
+                 if (value_len > FILEDICT_VALUE_SIZE) {
+                     filedict->error = "Value too big";
+                 }
+                 return;
+             }
+             /*
+              * We need to check for room in the value, then append value.
+              * This is also where we might run into a duplicate and duck out.
+              */
+             else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
+                 long long first_nonzero = -1;
+                 char *candidate = NULL;
+                 size_t value_i, candidate_len;
+
+                 for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
+                     if (unique) {
+                         if (first_nonzero == -1 && entry->value[value_i] != 0) {
+                             first_nonzero = value_i;
+                         }
+
+                         if (entry->value[value_i] == 0) {
+                             int cmp = strncmp(
+                                 &entry->value[first_nonzero],
+                                 value,
+                                 FILEDICT_VALUE_SIZE - first_nonzero
+                             );
+                             if (cmp == 0) {
+                                 /* Looks like this value already exists! */
+                                 return;
+                             }
+                             first_nonzero = -1;
+                         }
+                     }
+
+                     if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
+                         candidate = &entry->value[value_i + 1];
+                         candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
+
+                         if (strlen(value) >= candidate_len) break;
+
+                         strncpy(candidate, value, candidate_len);
+                         return;
+                     }
+                 }
+             }
+         }
+
+         ++hashmap_i;
+         hashmap += bucket_count;
+         bucket_count = (bucket_count << 1);
+     }
+
+     /*
+      * If we fell through to here, that means we need to allocate a new hashmap.
+      */
+     size_t new_hashmap_count = header->hashmap_count + 1;
+     size_t old_data_len = filedict->data_len;
+     size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
+
+     assert(new_data_len > old_data_len);
+     assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
+
+     munmap(filedict->data, filedict->data_len);
+     int truncate_result = ftruncate(filedict->fd, new_data_len);
+     if (truncate_result != 0) { filedict->error = strerror(errno); return; }
+
+     filedict->data = mmap(
+         filedict->data,
+         new_data_len,
+         PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+     header = (filedict_header_t *)filedict->data;
+     hashmap = filedict->data + old_data_len;
+
+     filedict->data_len = new_data_len;
+     header->hashmap_count = new_hashmap_count;
+     goto try_again;
+ }
+
+ /*
+  * Resizes the filedict based on the header hashmap count and initial bucket count.
+  * Naturally, your pointers into the map will become invalid after calling this.
+  */
+ static void filedict_resize(filedict_t *filedict) {
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+     size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
+
+     munmap(filedict->data, filedict->data_len);
+     filedict->data = mmap(
+         filedict->data,
+         computed_size,
+         PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+     filedict->data_len = computed_size;
+ }
+
+ /*
+  * There are 3 "levels" to a filedict. From top to bottom:
+  * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
+  * 2. Entry - which entry in our hashmap bucket are we looking at?
+  * 3. Value - where in the value buffer are we looking? There's 256 bytes, so can be many strings.
+  */
+
+ /* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
+ #define log_return(val) return val
+
+ /*
+  * Returns 1 when we successfully advanced to the next value
+  * Returns 0 when there is no next value
+  */
+ static int filedict_read_advance_value(filedict_read_t *read) {
+     assert(read->entry != NULL);
+
+     const char *buffer_begin = read->entry->value;
+     const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
+
+     const char *c;
+     for (c = read->value; c < buffer_end; ++c) {
+         if (*c == 0) {
+             c += 1;
+             break;
+         }
+     }
+
+     if (c >= buffer_end) log_return(0);
+     if (*c == 0) log_return(0);
+
+     read->value = c;
+     log_return(1);
+ }
+
+ /*
+  * Returns 1 when we successfully find a new entry that matches read->key;
+  * advances read->entry_i and read->entry to the new entry.
+  *
+  * Returns 0 when we exhausted all remaining entries and didn't find a match.
+  */
+ static int filedict_read_advance_entry(filedict_read_t *read) {
+     assert(read->key != NULL);
+     assert(strlen(read->key) > 0);
+     assert(read->bucket != NULL);
+
+     while (1) {
+         if (read->entry_i >= FILEDICT_BUCKET_ENTRY_COUNT) log_return(0);
+
+         read->entry = &read->bucket->entries[read->entry_i];
+
+         if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
+             read->value = read->entry->value;
+             log_return(1);
+         }
+
+         read->entry_i += 1;
+     }
+ }
+
+ /*
+  * Returns 1 when we successfully advanced to the next hashmap.
+  * read->bucket, read->entry, and read->value will be populated.
+  *
+  * Returns 0 when there are no more hashmaps, or the latest hashmap has no matching entries.
+  */
+ static int filedict_read_advance_hashmap(filedict_read_t *read) {
+     filedict_t *filedict = read->filedict;
+
+     assert(filedict);
+     assert(filedict->data);
+
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+
+     if (read->hashmap_i >= header->hashmap_count) log_return(0);
+
+     size_t offset = filedict_file_size(header->initial_bucket_count, read->hashmap_i);
+
+     if (offset >= filedict->data_len) {
+         filedict_resize(filedict);
+         if (filedict->error) log_return(0);
+         header = (filedict_header_t *)filedict->data;
+     }
+
+     filedict_bucket_t *hashmap = filedict->data + offset;
+
+     read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
+     read->bucket = &hashmap[read->key_hash % read->bucket_count];
+     read->entry = &read->bucket->entries[0];
+
+     read->entry_i = 0;
+
+     log_return(filedict_read_advance_entry(read));
+ }
+
+ /*
+  * Returns a "read" at the given key. If there's a hit, <return>.value will have the value.
+  */
+ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
+     filedict_read_t read;
+     read.filedict = filedict;
+     read.key = key;
+     read.value = NULL;
+     read.bucket = NULL;
+     read.entry = NULL;
+     read.entry_i = 0;
+     read.hashmap_i = 0;
+     read.bucket_count = 0;
+     read.key_hash = filedict->hash_function(key);
+
+     filedict_read_advance_hashmap(&read);
+     return read;
+ }
+
+ /*
+  * Lets you find the next value. Pass the return value of filedict_get.
+  *
+  * Returns 1 when a next value was found, 0 otherwise.
+  *
+  * If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
+  */
+ static int filedict_get_next(filedict_read_t *read) {
+     int found = -1;
+
+     found = filedict_read_advance_value(read);
+     if (found == 1) return found;
+
+     read->entry_i += 1;
+     found = filedict_read_advance_entry(read);
+     if (found == 1) return found;
+
+     read->hashmap_i += 1;
+     return filedict_read_advance_hashmap(read);
+ }
+
+ #endif
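
For orientation, here is a minimal usage sketch of the API this new header exposes (open, insert, read values back). It is not part of the gem; the include path and the file name "example.filedict" are illustrative, and error handling is kept to the basics.

#include <stdio.h>
#include "filedict.h"   /* the header added above; adjust the path to wherever it is vendored */

int main(void) {
    filedict_t dict;
    filedict_init(&dict);

    /* filedict_open creates the file if needed and mmaps it read/write. */
    filedict_open(&dict, "example.filedict");
    if (dict.error) { fprintf(stderr, "open failed: %s\n", dict.error); return 1; }

    /* A key holds many values: filedict_insert appends, filedict_insert_unique skips duplicates. */
    filedict_insert(&dict, "fruits", "apple");
    filedict_insert(&dict, "fruits", "banana");
    filedict_insert_unique(&dict, "fruits", "apple");

    /* Walk every value stored under "fruits". */
    filedict_read_t read = filedict_get(&dict, "fruits");
    while (read.value != NULL) {
        printf("fruits -> %s\n", read.value);
        if (!filedict_get_next(&read)) break;
    }

    filedict_deinit(&dict);
    return 0;
}
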
@@ -3,7 +3,6 @@

  extern VALUE mFiledict;
  VALUE cHash;
- VALUE mSetExt;
  VALUE cSet;

  ID id_add;
@@ -89,7 +88,7 @@ static VALUE fd_set_add(int argc, VALUE *argv, VALUE self) {
      VALUE fd_hash_ruby_object = rb_ivar_get(self, id_fd_hash);

      if (fd_hash_ruby_object == Qnil) {
-         return self;
+         return rb_call_super(argc, argv);
      }

      fd_hash_t *fd_hash = RTYPEDDATA_DATA(fd_hash_ruby_object);
@@ -124,8 +123,6 @@ static VALUE fd_hash_access(VALUE self, VALUE key) {
      rb_ivar_set(result, id_fd_hash, self);
      rb_ivar_set(result, id_fd_key, key);

-     rb_extend_object(result, mSetExt);
-
      return result;
  }

@@ -139,9 +136,8 @@ void fdrb_init_hash() {

      VALUE rb_cSet = rb_define_class("Set", rb_cObject);
      cSet = rb_define_class_under(mFiledict, "Set", rb_cSet);
-     mSetExt = rb_define_module_under(mFiledict, "SetExt");

-     rb_define_method(mSetExt, "add", fd_set_add, -1);
+     rb_define_method(cSet, "add", fd_set_add, -1);

      id_add = rb_intern("add");
      id_remove = rb_intern("remove");
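
The extension hunks above drop the old Filedict::SetExt module (previously extended onto each set returned by fd_hash_access) and instead define add directly on Filedict::Set, falling back to the parent Set implementation via rb_call_super when no backing filedict hash is attached. Below is a rough sketch of that pattern outside this gem, assuming the standard MRI C API and compiled as part of an ordinary C extension; the names MySet, @backing, and Init_example are hypothetical.

#include <ruby.h>

static ID id_backing;   /* hypothetical ivar, analogous to id_fd_hash above */

static VALUE my_set_add(int argc, VALUE *argv, VALUE self) {
    VALUE backing = rb_ivar_get(self, id_backing);
    if (backing == Qnil) {
        /* Nothing attached: behave exactly like the parent Set#add. */
        return rb_call_super(argc, argv);
    }
    /* ...otherwise write through to the backing store here... */
    return self;
}

void Init_example(void) {
    VALUE rb_cSet = rb_define_class("Set", rb_cObject);  /* fetch (or define) ::Set, as the gem does */
    VALUE cMySet  = rb_define_class("MySet", rb_cSet);   /* hypothetical subclass */

    id_backing = rb_intern("@backing");
    rb_define_method(cMySet, "add", my_set_add, -1);     /* argc/argv form, so super sees the same args */
}
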
data/filedictrb.gemspec CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
        (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
      end
    end
+   spec.files << 'ext/filedict/filedict.h'
    spec.bindir = "exe"
    spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
    spec.require_paths = ["lib"]
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Filedict
-   VERSION = "0.1.0"
+   VERSION = "0.1.3"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: filedictrb
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.3
  platform: ruby
  authors:
  - Nigel Baillie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-03-30 00:00:00.000000000 Z
+ date: 2022-04-03 00:00:00.000000000 Z
  dependencies: []
  description:
  email:
@@ -26,6 +27,7 @@ files:
  - Rakefile
  - bin/console
  - bin/setup
+ - ext/filedict/filedict.h
  - ext/filedictrb/extconf.rb
  - ext/filedictrb/filedictrb.c
  - ext/filedictrb/filedictrb.h
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubygems_version: 3.3.3
+ rubygems_version: 3.1.6
  signing_key:
  specification_version: 4
  summary: Uses filedict to emulate a file-backed Hash<Set<String>>