RubyGems - filedictrb - Versions diffs - 0.1.0 → 0.1.1 - Mend

filedictrb 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 74171da9cf75b57d441faad6754c05e718a170fdada7618cd593ddcc8f8b982a
-  data.tar.gz: 6a88336c3e3cb8e1a18d50bc14bd49794f0ba10e614667c0a086dfb9cd35d11c
+  metadata.gz: 64d09435d5e09d917e12a06932a75ddf365b081a9caa1253a3b11a53a235196c
+  data.tar.gz: 4cd1fedaa0035da30c2d9572eb28dd190cab901803089b164c36ba4d5776d36c
 SHA512:
-  metadata.gz: 9dde01e9a2f1af933ec22e7648797fbbd393362bd9abdfc7ff351104aa0184f02222edcc8c59de98770172f4e1e9c7285d61e9d2b89d575a06aebacb080d86d3
-  data.tar.gz: d569f379a3787a42dcd691a28b57e68ba1dd5579c4afb8741aad66d6ff13d58d2dba5e949772684c41cbec4ecccc9b55de9e1b9506353c447a37d7fd04d14e65
+  metadata.gz: d232d34e3b99221baf055c56ca672b2536d910103a3a4dd6bab1be9005270db86cda72690f1dd56ed230fb0214ef4cecae45a9f8ae5931b7ebcb3288df8492e2
+  data.tar.gz: de03f2dc36efad352a54f647f3275cc15bae641799e3b2ca258cdbb08100103493ac35e2c37e327f7f39feb2dc92eb80af6fb3c4cd645fad8794612ee0cffcfc

data/bin/setup CHANGED Viewed

@@ -3,6 +3,7 @@ set -euo pipefail
 IFS=$'\n\t'
 set -vx
+git submodule update --init
 bundle install
 # Do any other automated setup that you need to do here

data/ext/filedict/filedict.h ADDED Viewed

@@ -0,0 +1,417 @@
+#ifndef FILEDICT_H
+#define FILEDICT_H 1
+/* size_t appears in the declarations below; include it here so the header stands on its own. */
+#include <stddef.h>
+/* Bytes reserved for the key of one entry. Define before including this file to override. */
+#ifndef FILEDICT_KEY_SIZE
+#define FILEDICT_KEY_SIZE 256
+#endif
+/* Bytes reserved for the value buffer of one entry. Define before including this file to override. */
+#ifndef FILEDICT_VALUE_SIZE
+#define FILEDICT_VALUE_SIZE 256
+#endif
+/*
+ * One key slot inside a bucket.
+ * A slot whose key starts with a 0 byte is treated as unused by the insert code.
+ * The value buffer holds one or more 0-terminated strings stored back to back.
+ */
+typedef struct filedict_bucket_entry_t {
+    char key[FILEDICT_KEY_SIZE];
+    char value[FILEDICT_VALUE_SIZE];
+} filedict_bucket_entry_t;
+/* Number of entries (distinct key slots) per bucket. Define before including this file to override. */
+#ifndef FILEDICT_BUCKET_ENTRY_COUNT
+#define FILEDICT_BUCKET_ENTRY_COUNT 4
+#endif
+/* A bucket: the fixed group of entries that a key hash selects within one hashmap. */
+typedef struct filedict_bucket_t {
+    filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT];
+} filedict_bucket_t;
+/* Signature of the function that turns a 0-terminated key into a hash. */
+typedef size_t (*filedict_hash_function_t)(const char *);
+/* Handle for one opened filedict. Initialize with filedict_init before use. */
+typedef struct filedict_t {
+    const char *error;                      /* NULL, or a message describing the most recent failure */
+    int fd;                                 /* descriptor of the backing file; 0 means "not open" */
+    void *data;                             /* start of the memory-mapped file (header, then hashmaps) */
+    size_t data_len;                        /* length in bytes of the mapping at data */
+    filedict_hash_function_t hash_function; /* hash applied to keys on insert and lookup */
+} filedict_t;
+/*
+ * Header stored at the very start of the mapped file.
+ * The first hashmap has initial_bucket_count buckets and every additional hashmap has twice as
+ * many buckets as the one before it.
+ */
+typedef struct filedict_header_t {
+    unsigned long long initial_bucket_count : 32;
+    unsigned long long hashmap_count : 32;
+} __attribute__ ((__packed__)) filedict_header_t;
+/* Cursor over the values stored under one key; produced by filedict_get. */
+typedef struct filedict_read_t {
+    const filedict_t *filedict;     /* filedict being read */
+    const char *key;                /* key being looked up */
+    const char *value;              /* current value, or NULL when nothing was found */
+    filedict_bucket_t *bucket;      /* bucket selected in the current hashmap */
+    filedict_bucket_entry_t *entry; /* entry currently being examined in that bucket */
+    size_t entry_i;                 /* index of entry within bucket */
+    size_t hashmap_i;               /* index of the hashmap currently being examined */
+    size_t bucket_count;            /* number of buckets in the current hashmap */
+    size_t key_hash;                /* hash of key, computed once up front */
+} filedict_read_t;
+#endif
+/*
+ * Above is the header, below is the implementation
+ */
+#ifndef FILEDICT_IMPL
+#define FILEDICT_IMPL
+#include <sys/mman.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <limits.h>
+#include <assert.h>
+/*
+ * Default key hash: "djb2" from http://www.cse.yorku.ca/~oz/hash.html
+ * Starts at 5381 and, for every char of the 0-terminated input, computes hash * 33 + c.
+ */
+static size_t filedict_default_hash_function(const char *input) {
+    unsigned long acc = 5381;
+    const char *cursor;
+    for (cursor = input; *cursor != 0; ++cursor) {
+        int ch = *cursor;
+        acc = acc * 33 + ch;
+    }
+    return acc;
+}
+/*
+ * Copies the 0-terminated string src into dest, storing no more than max_len bytes.
+ * The terminator is only stored when it falls within the first max_len bytes, so dest is left
+ * unterminated when src has max_len or more characters.
+ * Returns the length of src (terminator excluded), which may exceed what was written.
+ */
+static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
+    size_t seen;
+    for (seen = 0; ; ++seen) {
+        const char ch = src[seen];
+        if (seen < max_len) { dest[seen] = ch; }
+        if (ch == 0) break;
+    }
+    return seen;
+}
+/*
+ * Puts a filedict_t into its "nothing opened yet" state: no error, no descriptor, no mapping,
+ * and the default djb2 hash function installed.
+ */
+static void filedict_init(filedict_t *filedict) {
+    filedict->hash_function = filedict_default_hash_function;
+    filedict->data = NULL;
+    filedict->data_len = 0;
+    filedict->fd = 0;
+    filedict->error = NULL;
+}
+/*
+ * Releases what a filedict_t holds: unmaps the file if a mapping is recorded and closes the
+ * descriptor if one is recorded. Released fields are reset, so a second call does nothing.
+ */
+static void filedict_deinit(filedict_t *filedict) {
+    if (filedict->data != NULL) {
+        munmap(filedict->data, filedict->data_len);
+        filedict->data_len = 0;
+        filedict->data = NULL;
+    }
+    if (filedict->fd != 0) {
+        close(filedict->fd);
+        filedict->fd = 0;
+    }
+}
+/*
+ * Size in bytes of a filedict file holding hashmap_count hashmaps: the header followed by every
+ * hashmap, where hashmap number i (counting from 0) has initial_bucket_count << i buckets.
+ * With hashmap_count == 0 this is just the size of the header.
+ */
+static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
+    size_t total_buckets = 0;
+    size_t level = 0;
+    while (level < hashmap_count) {
+        /* Each additional hashmap doubles the bucket count of the previous one. */
+        total_buckets += initial_bucket_count << level;
+        ++level;
+    }
+    return sizeof(filedict_header_t) + total_buckets * sizeof(filedict_bucket_t);
+}
+/*
+ * Opening a filedict:
+ *   filedict_open_new      - create the file (wiping any existing contents) and open it read/write.
+ *   filedict_open          - open read/write, creating the file if needed and keeping existing contents.
+ *   filedict_open_readonly - open an existing filedict for reading only.
+ * All three use 4096 initial buckets for a freshly created file; call filedict_open_f directly to
+ * choose a different count.
+ */
+#define filedict_open_new(filedict, filename) \
+    filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
+#define filedict_open_readonly(filedict, filename) \
+    filedict_open_f(filedict, filename, O_RDONLY, 4096)
+#define filedict_open(filedict, filename) \
+    filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
+/*
+ * Opens filename with the given open(2) flags and maps it into filedict->data.
+ *
+ * A file too short to contain a header is treated as new: it is sized for a single hashmap of
+ * initial_bucket_count buckets and a header is written. This requires write access.
+ * Otherwise the existing header decides the mapped size and initial_bucket_count is ignored, so
+ * hashmaps added by earlier inserts stay reachable.
+ *
+ * The mapping is read-only when flags request O_RDONLY and read/write otherwise.
+ *
+ * On failure filedict->error is set, the descriptor is closed, and fd/data/data_len are reset to
+ * the values filedict_init gives them, so filedict_deinit remains safe to call.
+ */
+static void filedict_open_f(
+    filedict_t *filedict,
+    const char *filename,
+    int flags,
+    unsigned int initial_bucket_count
+) {
+    const int writable = (flags & O_ACCMODE) != O_RDONLY;
+    const int prot = writable ? (PROT_READ | PROT_WRITE) : PROT_READ;
+    const char *error = NULL;
+    int is_new = 0;
+    off_t file_len;
+    filedict->fd = open(filename, flags, 0666);
+    if (filedict->fd == -1) {
+        filedict->error = strerror(errno);
+        filedict->fd = 0;
+        return;
+    }
+    file_len = lseek(filedict->fd, 0, SEEK_END);
+    if (file_len == -1) { error = strerror(errno); goto fail; }
+    if ((size_t)file_len < sizeof(filedict_header_t)) {
+        /* No header yet: lay out a brand new filedict. */
+        if (!writable) { error = "File is not a filedict"; goto fail; }
+        if (initial_bucket_count == 0) { error = "Initial bucket count must not be zero"; goto fail; }
+        filedict->data_len = filedict_file_size(initial_bucket_count, 1);
+        if (ftruncate(filedict->fd, (off_t)filedict->data_len) != 0) {
+            error = strerror(errno);
+            goto fail;
+        }
+        is_new = 1;
+    } else {
+        /* Existing filedict: trust its header rather than the caller's bucket count. */
+        filedict_header_t existing;
+        size_t existing_buckets, existing_hashmaps;
+        if (pread(filedict->fd, &existing, sizeof(existing), 0) != (ssize_t)sizeof(existing)) {
+            error = "Could not read filedict header";
+            goto fail;
+        }
+        existing_buckets = existing.initial_bucket_count;
+        existing_hashmaps = existing.hashmap_count;
+        /* The hashmap count is used as a shift amount, so it must stay below the width of size_t. */
+        if (existing_buckets == 0 || existing_hashmaps == 0
+                || existing_hashmaps >= sizeof(size_t) * CHAR_BIT) {
+            error = "Corrupt filedict header";
+            goto fail;
+        }
+        filedict->data_len = filedict_file_size(existing_buckets, existing_hashmaps);
+        if (filedict->data_len > (size_t)file_len) {
+            error = "Filedict file is shorter than its header says";
+            goto fail;
+        }
+    }
+    filedict->data = mmap(NULL, filedict->data_len, prot, MAP_SHARED, filedict->fd, 0);
+    if (filedict->data == MAP_FAILED) { error = strerror(errno); goto fail; }
+    if (is_new) {
+        filedict_header_t *header = (filedict_header_t *)filedict->data;
+        header->initial_bucket_count = initial_bucket_count;
+        header->hashmap_count = 1;
+    }
+    return;
+fail:
+    close(filedict->fd);
+    filedict->fd = 0;
+    filedict->data = NULL;
+    filedict->data_len = 0;
+    filedict->error = error;
+}
+/*
+ * Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
+ * value onto the end of the entry.
+ *
+ * filedict_insert always appends; filedict_insert_unique does nothing when the key already holds
+ * an identical value.
+ */
+#define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
+#define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
+/*
+ * Tries to store value (value_len chars, terminator excluded) under key inside one bucket.
+ * Returns 1 when the value was stored, or when unique is set and the value is already present.
+ * Returns 0 when the bucket has no room for it.
+ */
+static int filedict_insert_into_bucket(
+    filedict_bucket_t *bucket,
+    const char *key,
+    const char *value,
+    size_t value_len,
+    int unique
+) {
+    size_t i;
+    for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
+        filedict_bucket_entry_t *entry = &bucket->entries[i];
+        size_t value_i, string_start = 0;
+        int in_string = 0;
+        /* Easy case: fresh entry. We can just insert here and call it quits. */
+        if (entry->key[0] == 0) {
+            /* Keys of FILEDICT_KEY_SIZE or more chars are cut off and stored unterminated;
+             * every key comparison is bounded by FILEDICT_KEY_SIZE to match. */
+            strncpy(entry->key, key, FILEDICT_KEY_SIZE);
+            memcpy(entry->value, value, value_len + 1);
+            return 1;
+        }
+        if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) != 0) continue;
+        /*
+         * Same key: walk the stored strings. Each one is checked for a duplicate when unique is
+         * set, and two consecutive 0 bytes mark where the free space begins.
+         */
+        for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
+            if (entry->value[value_i] != 0) {
+                if (!in_string) { in_string = 1; string_start = value_i; }
+                continue;
+            }
+            /* We are on a terminator. Only compare if a string actually ended here. */
+            if (in_string) {
+                if (unique && strcmp(&entry->value[string_start], value) == 0) {
+                    /* Looks like this value already exists! */
+                    return 1;
+                }
+                in_string = 0;
+            }
+            if (entry->value[value_i + 1] == 0) {
+                size_t room = FILEDICT_VALUE_SIZE - value_i - 1;
+                if (value_len < room) {
+                    memcpy(&entry->value[value_i + 1], value, value_len + 1);
+                    return 1;
+                }
+                /* Not enough space left in this entry; move on to the next one. */
+                break;
+            }
+        }
+        /* A string whose terminator is the very last byte is not seen by the loop above. */
+        if (unique && in_string && entry->value[FILEDICT_VALUE_SIZE - 1] == 0
+                && strcmp(&entry->value[string_start], value) == 0) {
+            return 1;
+        }
+    }
+    return 0;
+}
+/*
+ * Stores value under key, trying each hashmap in order and adding a new, twice as large hashmap
+ * to the end of the file when none of them has room.
+ *
+ * Sets filedict->error and stores nothing when value does not fit in a value buffer together with
+ * its terminator, when the header's bucket count is zero, or when growing the file fails. If
+ * growing fails the previous mapping stays in place.
+ */
+static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
+    assert(filedict->fd != 0);
+    assert(filedict->data != NULL);
+    size_t value_len = strlen(value);
+    if (value_len >= FILEDICT_VALUE_SIZE) {
+        filedict->error = "Value too big";
+        return;
+    }
+    filedict_header_t *header = (filedict_header_t *)filedict->data;
+    filedict_bucket_t *hashmap =
+        (filedict_bucket_t *)((char *)filedict->data + sizeof(filedict_header_t));
+    size_t bucket_count = header->initial_bucket_count;
+    size_t key_hash = filedict->hash_function(key);
+    size_t hashmap_i;
+    if (bucket_count == 0) {
+        filedict->error = "Corrupt filedict header";
+        return;
+    }
+    /*
+     * Here we loop through each hashmap.
+     */
+    for (hashmap_i = 0; hashmap_i < header->hashmap_count; ++hashmap_i) {
+        /* TODO: can we truncate instead of modulo, like in Ruby? */
+        filedict_bucket_t *bucket = &hashmap[key_hash % bucket_count];
+        if (filedict_insert_into_bucket(bucket, key, value, value_len, unique)) return;
+        hashmap += bucket_count;
+        bucket_count = (bucket_count << 1);
+    }
+    /*
+     * If we fell through to here, that means we need to allocate a new hashmap.
+     * bucket_count already holds the size of that new hashmap.
+     */
+    size_t new_hashmap_count = header->hashmap_count + 1;
+    size_t old_data_len = filedict->data_len;
+    size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
+    assert(new_data_len > old_data_len);
+    assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
+    /* Grow and map the file before letting go of the old mapping, so a failure leaves it usable. */
+    if (ftruncate(filedict->fd, (off_t)new_data_len) != 0) {
+        filedict->error = strerror(errno);
+        return;
+    }
+    void *new_data = mmap(
+        NULL,
+        new_data_len,
+        PROT_READ | PROT_WRITE,
+        MAP_SHARED,
+        filedict->fd,
+        0
+    );
+    if (new_data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+    munmap(filedict->data, old_data_len);
+    filedict->data = new_data;
+    filedict->data_len = new_data_len;
+    header = (filedict_header_t *)new_data;
+    header->hashmap_count = new_hashmap_count;
+    /* The new hashmap starts exactly where the old file ended. */
+    hashmap = (filedict_bucket_t *)((char *)new_data + old_data_len);
+    if (!filedict_insert_into_bucket(&hashmap[key_hash % bucket_count], key, value, value_len, unique)) {
+        filedict->error = "No room in newly added hashmap";
+    }
+}
+/*
+ * There are 3 "levels" to a filedict. From top to bottom:
+ * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
+ * 2. Entry   - which entry in our hashmap bucket are we looking at?
+ * 3. Value   - where in the value buffer are we looking? It has FILEDICT_VALUE_SIZE bytes (256 by default), so it can hold many strings.
+ */
+/* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
+#define log_return(val) return val
+/*
+ * Moves read->value to the string that follows the current one in the entry's value buffer.
+ *
+ * Returns 1 when we successfully advanced to the next value
+ * Returns 0 when there is no next value (end of buffer reached, or the next string is empty);
+ *           read->value is left untouched in that case.
+ */
+static int filedict_read_advance_value(filedict_read_t *read) {
+    assert(read->entry != NULL);
+    const char *limit = read->entry->value + FILEDICT_VALUE_SIZE;
+    const char *terminator;
+    const char *next;
+    if (read->value >= limit) log_return(0);
+    /* Find the end of the current string; the following byte starts the next one. */
+    terminator = memchr(read->value, 0, (size_t)(limit - read->value));
+    if (terminator == NULL) log_return(0);
+    next = terminator + 1;
+    if (next >= limit) log_return(0);
+    if (*next == 0) log_return(0);
+    read->value = next;
+    log_return(1);
+}
+/*
+ * Scans the current bucket, starting at read->entry_i, for an entry whose key equals read->key.
+ *
+ * Returns 1 when such an entry is found; read->entry_i and read->entry identify it and
+ *           read->value points at the start of its value buffer.
+ *
+ * Returns 0 when the remaining entries were exhausted without a match.
+ */
+static int filedict_read_advance_entry(filedict_read_t *read) {
+    assert(read->key != NULL);
+    assert(strlen(read->key) > 0);
+    assert(read->bucket != NULL);
+    for (; read->entry_i < FILEDICT_BUCKET_ENTRY_COUNT; read->entry_i += 1) {
+        filedict_bucket_entry_t *candidate = &read->bucket->entries[read->entry_i];
+        read->entry = candidate;
+        if (strncmp(candidate->key, read->key, FILEDICT_KEY_SIZE) != 0) continue;
+        read->value = candidate->value;
+        log_return(1);
+    }
+    log_return(0);
+}
+/*
+ * Searches the hashmaps, starting with number read->hashmap_i, for the first one whose bucket
+ * for read->key_hash contains an entry matching read->key.
+ *
+ * Hashmaps without a match are skipped rather than ending the search: an insert that finds a
+ * bucket full moves on to the next hashmap, so a key's values may live in any later hashmap even
+ * when an earlier one holds nothing for it.
+ *
+ * Returns 1 when a matching entry was found.
+ *           read->hashmap_i, read->bucket, read->entry, and read->value will be populated.
+ *
+ * Returns 0 when no remaining hashmap has a matching entry (or the header has a zero bucket count).
+ */
+static int filedict_read_advance_hashmap(filedict_read_t *read) {
+    const filedict_t *filedict = read->filedict;
+    assert(filedict);
+    assert(filedict->data);
+    const filedict_header_t *header = (const filedict_header_t *)filedict->data;
+    const size_t initial_bucket_count = header->initial_bucket_count;
+    const size_t hashmap_count = header->hashmap_count;
+    /* A zero bucket count would make the modulo below divide by zero. */
+    if (initial_bucket_count == 0) log_return(0);
+    for (; read->hashmap_i < hashmap_count; read->hashmap_i += 1) {
+        /* The size of a file with hashmap_i hashmaps is the offset where hashmap number hashmap_i starts. */
+        size_t offset = filedict_file_size(initial_bucket_count, read->hashmap_i);
+        filedict_bucket_t *hashmap = (filedict_bucket_t *)((char *)filedict->data + offset);
+        read->bucket_count = initial_bucket_count << read->hashmap_i;
+        read->bucket = &hashmap[read->key_hash % read->bucket_count];
+        read->entry = &read->bucket->entries[0];
+        read->entry_i = 0;
+        if (filedict_read_advance_entry(read)) log_return(1);
+    }
+    log_return(0);
+}
+/*
+ * Starts a lookup of key and returns the resulting cursor.
+ * The search itself is done by filedict_read_advance_hashmap. When it finds a match,
+ * <return>.value points at the first value; otherwise <return>.value stays NULL.
+ */
+static filedict_read_t filedict_get(const filedict_t *filedict, const char *key) {
+    filedict_read_t cursor = {
+        .filedict = filedict,
+        .key = key,
+        .value = NULL,
+        .bucket = NULL,
+        .entry = NULL,
+        .entry_i = 0,
+        .hashmap_i = 0,
+        .bucket_count = 0,
+        .key_hash = filedict->hash_function(key)
+    };
+    filedict_read_advance_hashmap(&cursor);
+    return cursor;
+}
+/*
+ * Advances a cursor obtained from filedict_get to the next value stored under its key.
+ * It tries, in order: the next string in the current entry, the next matching entry in the
+ * current bucket, and finally filedict_read_advance_hashmap on the following hashmap.
+ *
+ * Returns 1 when a next value was found, 0 otherwise.
+ *
+ * If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
+ */
+static int filedict_get_next(filedict_read_t *read) {
+    if (filedict_read_advance_value(read) == 1) return 1;
+    read->entry_i += 1;
+    if (filedict_read_advance_entry(read) == 1) return 1;
+    read->hashmap_i += 1;
+    return filedict_read_advance_hashmap(read);
+}
+#endif

data/filedictrb.gemspec CHANGED Viewed

@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
       (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
     end
   end
+  spec.files << 'ext/filedict/filedict.h'
   spec.bindir        = "exe"
   spec.executables   = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

data/lib/filedict/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Filedict
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: filedictrb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nigel Baillie
@@ -26,6 +26,7 @@ files:
 - Rakefile
 - bin/console
 - bin/setup
+- ext/filedict/filedict.h
 - ext/filedictrb/extconf.rb
 - ext/filedictrb/filedictrb.c
 - ext/filedictrb/filedictrb.h