filedictrb 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74171da9cf75b57d441faad6754c05e718a170fdada7618cd593ddcc8f8b982a
4
- data.tar.gz: 6a88336c3e3cb8e1a18d50bc14bd49794f0ba10e614667c0a086dfb9cd35d11c
3
+ metadata.gz: 64d09435d5e09d917e12a06932a75ddf365b081a9caa1253a3b11a53a235196c
4
+ data.tar.gz: 4cd1fedaa0035da30c2d9572eb28dd190cab901803089b164c36ba4d5776d36c
5
5
  SHA512:
6
- metadata.gz: 9dde01e9a2f1af933ec22e7648797fbbd393362bd9abdfc7ff351104aa0184f02222edcc8c59de98770172f4e1e9c7285d61e9d2b89d575a06aebacb080d86d3
7
- data.tar.gz: d569f379a3787a42dcd691a28b57e68ba1dd5579c4afb8741aad66d6ff13d58d2dba5e949772684c41cbec4ecccc9b55de9e1b9506353c447a37d7fd04d14e65
6
+ metadata.gz: d232d34e3b99221baf055c56ca672b2536d910103a3a4dd6bab1be9005270db86cda72690f1dd56ed230fb0214ef4cecae45a9f8ae5931b7ebcb3288df8492e2
7
+ data.tar.gz: de03f2dc36efad352a54f647f3275cc15bae641799e3b2ca258cdbb08100103493ac35e2c37e327f7f39feb2dc92eb80af6fb3c4cd645fad8794612ee0cffcfc
data/bin/setup CHANGED
@@ -3,6 +3,7 @@ set -euo pipefail
3
3
  IFS=$'\n\t'
4
4
  set -vx
5
5
 
6
+ git submodule update --init
6
7
  bundle install
7
8
 
8
9
  # Do any other automated setup that you need to do here
@@ -0,0 +1,417 @@
1
+ #ifndef FILEDICT_H
2
+ #define FILEDICT_H 1
3
+
4
+ #ifndef FILEDICT_KEY_SIZE
5
+ #define FILEDICT_KEY_SIZE 256
6
+ #endif
7
+
8
+ #ifndef FILEDICT_VALUE_SIZE
9
+ #define FILEDICT_VALUE_SIZE 256
10
+ #endif
11
+
12
+ typedef struct filedict_bucket_entry_t { /* One slot in a bucket: a key plus the values stored under it. */
13
+ char key[FILEDICT_KEY_SIZE]; /* Key bytes; filedict_insert_f treats a leading NUL as "slot unused". */
14
+ char value[FILEDICT_VALUE_SIZE]; /* One or more NUL-terminated values stored back to back. */
15
+ } filedict_bucket_entry_t;
16
+
17
+ #ifndef FILEDICT_BUCKET_ENTRY_COUNT
18
+ #define FILEDICT_BUCKET_ENTRY_COUNT 4
19
+ #endif
20
+
21
+ typedef struct filedict_bucket_t { /* A hash bucket: a fixed-size group of entry slots. */
22
+ filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT]; /* Scanned in index order by insert and lookup. */
23
+ } filedict_bucket_t;
24
+
25
+ typedef size_t (*filedict_hash_function_t)(const char *); /* Maps a NUL-terminated key to the hash used to pick a bucket. */
26
+
27
+ typedef struct filedict_t { /* Handle for one filedict file. */
28
+ const char *error; /* NULL after filedict_init; set to a message when an operation fails. */
29
+ int fd; /* Descriptor returned by open(); filedict_init sets it to 0. */
30
+ void *data; /* Start of the mmap'd file contents; NULL after filedict_init. */
31
+ size_t data_len; /* Length in bytes of the mapping at data. */
32
+ filedict_hash_function_t hash_function; /* Hash applied to keys; filedict_init installs filedict_default_hash_function. */
33
+ } filedict_t;
34
+
35
+ typedef struct filedict_header_t { /* Header stored at the very start of the mapped file. */
36
+ unsigned long long initial_bucket_count : 32; /* Number of buckets in the first hashmap. */
37
+ unsigned long long hashmap_count : 32; /* Number of hashmaps currently stored in the file. */
38
+ } __attribute__ ((__packed__)) filedict_header_t;
39
+
40
+ typedef struct filedict_read_t { /* Cursor over the values stored under one key. */
41
+ const filedict_t *filedict; /* Filedict being read. */
42
+ const char *key; /* Key pointer that was passed to filedict_get. */
43
+ const char *value; /* Current value, pointing into the mapped file; NULL until an entry matches. */
44
+ filedict_bucket_t *bucket; /* Bucket selected by key_hash in the current hashmap. */
45
+ filedict_bucket_entry_t *entry; /* Entry most recently examined in bucket. */
46
+ size_t entry_i; /* Index of entry within bucket. */
47
+ size_t hashmap_i; /* Index of the hashmap being searched. */
48
+ size_t bucket_count; /* Number of buckets in the current hashmap. */
49
+ size_t key_hash; /* Hash of key, computed once in filedict_get. */
50
+ } filedict_read_t;
51
+
52
+ #endif
53
+
54
+ /*
55
+ * Above is the header, below is the implementation
56
+ */
57
+
58
+ #ifndef FILEDICT_IMPL
59
+ #define FILEDICT_IMPL
60
+ #include <sys/mman.h>
61
+ #include <string.h>
62
+ #include <unistd.h>
63
+ #include <fcntl.h>
64
+ #include <errno.h>
65
+ #include <limits.h>
66
+ #include <assert.h>
67
+
68
/*
 * Default key hash: "djb2" from http://www.cse.yorku.ca/~oz/hash.html
 *
 * Starts at 5381 and, for every byte of the NUL-terminated input, multiplies the running
 * hash by 33 and adds the byte. Unsigned overflow wraps.
 */
static size_t filedict_default_hash_function(const char *input) {
    unsigned long acc = 5381;
    const char *cursor;

    for (cursor = input; *cursor != 0; ++cursor) {
        int byte = *cursor;
        acc = acc * 33 + byte;
    }

    return acc;
}
79
+
80
/*
 * Copies src (terminator included) into dest, but stores no more than max_len bytes.
 * When src needs more than max_len bytes, dest receives the first max_len bytes and is
 * NOT NUL-terminated.
 *
 * Returns the length of src excluding its terminator, so callers can detect truncation.
 */
static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
    size_t index;

    for (index = 0; ; ++index) {
        const char ch = src[index];

        if (index < max_len) {
            dest[index] = ch;
        }
        if (ch == '\0') {
            break;
        }
    }

    return index;
}
95
+
96
+ static void filedict_init(filedict_t *filedict) {
97
+ filedict->error = NULL;
98
+ filedict->fd = 0;
99
+ filedict->data_len = 0;
100
+ filedict->data = NULL;
101
+ filedict->hash_function = filedict_default_hash_function;
102
+ }
103
+
104
+ static void filedict_deinit(filedict_t *filedict) {
105
+ if (filedict->data) {
106
+ munmap(filedict->data, filedict->data_len);
107
+ filedict->data = NULL;
108
+ filedict->data_len = 0;
109
+ }
110
+ if (filedict->fd) {
111
+ close(filedict->fd);
112
+ filedict->fd = 0;
113
+ }
114
+ }
115
+
116
+ /*
117
+ * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
118
+ */
119
+ static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
120
+ size_t result = sizeof(filedict_header_t);
121
+ size_t i;
122
+
123
+ for (i = 0; i < hashmap_count; ++i) {
124
+ /* Bucket count is multiplied by 2 for each additional hashmap. */
125
+ result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
126
+ }
127
+
128
+ return result;
129
+ }
130
+
131
+ /*
132
+ * Convenience wrappers around filedict_open_f that always pass an initial bucket count of 4096: filedict_open_new uses O_CREAT | O_TRUNC | O_RDWR, filedict_open_readonly uses O_RDONLY, and filedict_open uses O_CREAT | O_RDWR. Call filedict_open_f directly to choose a different bucket count.
133
+ */
134
+ #define filedict_open_new(filedict, filename) \
135
+ filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
136
+
137
+ #define filedict_open_readonly(filedict, filename) \
138
+ filedict_open_f(filedict, filename, O_RDONLY, 4096)
139
+
140
+ #define filedict_open(filedict, filename) \
141
+ filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
142
+
143
+ static void filedict_open_f(
144
+ filedict_t *filedict,
145
+ const char *filename,
146
+ int flags,
147
+ unsigned int initial_bucket_count
148
+ ) {
149
+ filedict->fd = open(filename, flags, 0666);
150
+ if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
151
+
152
+ filedict->data_len = filedict_file_size(initial_bucket_count, 1);
153
+ ftruncate(filedict->fd, filedict->data_len);
154
+ filedict->data = mmap(
155
+ NULL,
156
+ filedict->data_len,
157
+ PROT_READ | PROT_WRITE,
158
+ MAP_SHARED,
159
+ filedict->fd,
160
+ 0
161
+ );
162
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
163
+
164
+ filedict_header_t *data = (filedict_header_t *)filedict->data;
165
+ assert(initial_bucket_count <= UINT_MAX);
166
+ data->initial_bucket_count = initial_bucket_count;
167
+ data->hashmap_count = 1;
168
+ }
169
+
170
+ /*
171
+ * Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
172
+ * value onto the end of the entry.
173
+ */
174
+ #define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
175
+ #define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
176
+
177
+ static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
178
+ assert(filedict->fd != 0);
179
+ assert(filedict->data != NULL);
180
+
181
+ size_t i, hashmap_i = 0, bucket_count, key_hash;
182
+ filedict_header_t *header = (filedict_header_t *)filedict->data;
183
+ filedict_bucket_t *hashmap = filedict->data + sizeof(filedict_header_t);
184
+ filedict_bucket_t *bucket;
185
+
186
+ bucket_count = header->initial_bucket_count;
187
+
188
+ key_hash = filedict->hash_function(key);
189
+
190
+ /*
191
+ * Here we loop through each hashmap.
192
+ */
193
+ while (hashmap_i < header->hashmap_count) {
194
+ try_again:
195
+ /* TODO: can we truncate instead of modulo, like in Ruby? */
196
+ bucket = &hashmap[key_hash % bucket_count];
197
+
198
+ for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
199
+ filedict_bucket_entry_t *entry = &bucket->entries[i];
200
+
201
+ /* Easy case: fresh entry. We can just insert here and call it quits. */
202
+ if (entry->key[0] == 0) {
203
+ strncpy(entry->key, key, FILEDICT_KEY_SIZE);
204
+ size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
205
+
206
+ if (value_len > FILEDICT_VALUE_SIZE) {
207
+ filedict->error = "Value too big";
208
+ }
209
+ return;
210
+ }
211
+ /*
212
+ * We need to check for room in the value, then append value.
213
+ * This is also where we might run into a duplicate and duck out.existing
214
+ */
215
+ else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
216
+ long long first_nonzero = -1;
217
+ char *candidate = NULL;
218
+ size_t value_i, candidate_len;
219
+
220
+ for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
221
+ if (unique) {
222
+ if (first_nonzero == -1 && entry->value[value_i] != 0) {
223
+ first_nonzero = value_i;
224
+ }
225
+
226
+ if (entry->value[value_i] == 0) {
227
+ int cmp = strncmp(
228
+ &entry->value[first_nonzero],
229
+ value,
230
+ FILEDICT_VALUE_SIZE - first_nonzero
231
+ );
232
+ if (cmp == 0) {
233
+ /* Looks like this value already exists! */
234
+ return;
235
+ }
236
+ first_nonzero = -1;
237
+ }
238
+ }
239
+
240
+ if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
241
+ candidate = &entry->value[value_i + 1];
242
+ candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
243
+
244
+ if (strlen(value) >= candidate_len) break;
245
+
246
+ strncpy(candidate, value, candidate_len);
247
+ return;
248
+ }
249
+ }
250
+ }
251
+ }
252
+
253
+ ++hashmap_i;
254
+ hashmap += bucket_count;
255
+ bucket_count = (bucket_count << 1);
256
+ }
257
+
258
+ /*
259
+ * If we fell through to here, that means we need to allocate a new hashmap.
260
+ */
261
+ size_t new_hashmap_count = header->hashmap_count + 1;
262
+ size_t old_data_len = filedict->data_len;
263
+ size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
264
+
265
+ assert(new_data_len > old_data_len);
266
+ assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
267
+
268
+ munmap(filedict->data, filedict->data_len);
269
+ int truncate_result = ftruncate(filedict->fd, new_data_len);
270
+ if (truncate_result != 0) { filedict->error = strerror(errno); return; }
271
+
272
+ filedict->data = mmap(
273
+ filedict->data,
274
+ new_data_len,
275
+ PROT_READ | PROT_WRITE,
276
+ MAP_SHARED,
277
+ filedict->fd,
278
+ 0
279
+ );
280
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
281
+ header = (filedict_header_t *)filedict->data;
282
+ hashmap = filedict->data + old_data_len;
283
+
284
+ filedict->data_len = new_data_len;
285
+ header->hashmap_count = new_hashmap_count;
286
+ goto try_again;
287
+ }
288
+
289
+ /*
290
+ * There are 3 "levels" to a filedict. From top to bottom:
291
+ * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
292
+ * 2. Entry - which entry in our hashmap bucket are we looking at?
293
+ * 3. Value - where in the value buffer are we looking? There's 256 bytes, so can be many strings.
294
+ */
295
+
296
+ /* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
297
+ #define log_return(val) return val /* Plain return; the commented-out variant above also prints the function name and value. */
298
+
299
+ /*
300
+ * Returns 1 when we successfully advanced to the next value
301
+ * Returns 0 when there is no next value
302
+ */
303
+ static int filedict_read_advance_value(filedict_read_t *read) {
304
+ assert(read->entry != NULL);
305
+
306
+ const char *buffer_begin = read->entry->value;
307
+ const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
308
+
309
+ const char *c;
310
+ for (c = read->value; c < buffer_end; ++c) {
311
+ if (*c == 0) {
312
+ c += 1;
313
+ break;
314
+ }
315
+ }
316
+
317
+ if (c >= buffer_end) log_return(0);
318
+ if (*c == 0) log_return(0);
319
+
320
+ read->value = c;
321
+ log_return(1);
322
+ }
323
+
324
+ /*
325
+ * Returns 1 when we successfully find a new entry that matches read->key.
326
+ * advances read->entry_i and read->entry to the new entry.
327
+ *
328
+ * Returns 0 when we exhausted all remaining entries and didn't find a match.
329
+ */
330
+ static int filedict_read_advance_entry(filedict_read_t *read) {
331
+ assert(read->key != NULL);
332
+ assert(strlen(read->key) > 0);
333
+ assert(read->bucket != NULL);
334
+
335
+ while (1) {
336
+ if (read->entry_i >= FILEDICT_BUCKET_ENTRY_COUNT) log_return(0);
337
+
338
+ read->entry = &read->bucket->entries[read->entry_i];
339
+
340
+ if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
341
+ read->value = read->entry->value;
342
+ log_return(1);
343
+ }
344
+
345
+ read->entry_i += 1;
346
+ }
347
+ }
348
+
349
+ /*
350
+ * Returns 1 when we successfully advanced to the next hashmap.
351
+ * read->bucket, read->entry, and read->value will be populated.
352
+ *
353
+ * Returns 0 when there are no more hashmaps, or the latest hashmap has no matching entries.
354
+ */
355
+ static int filedict_read_advance_hashmap(filedict_read_t *read) {
356
+ const filedict_t *filedict = read->filedict;
357
+
358
+ assert(filedict);
359
+ assert(filedict->data);
360
+
361
+ filedict_header_t *header = (filedict_header_t*)filedict->data;
362
+
363
+ if (read->hashmap_i >= header->hashmap_count) log_return(0);
364
+
365
+ size_t offset = filedict_file_size(header->initial_bucket_count, read->hashmap_i);
366
+ filedict_bucket_t *hashmap = filedict->data + offset;
367
+
368
+ read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
369
+ read->bucket = &hashmap[read->key_hash % read->bucket_count];
370
+ read->entry = &read->bucket->entries[0];
371
+
372
+ read->entry_i = 0;
373
+
374
+ log_return(filedict_read_advance_entry(read));
375
+ }
376
+
377
+ /*
378
+ * Returns a "read" at the given key. If there's a hit, <return>.value will have the value.
379
+ */
380
+ static filedict_read_t filedict_get(const filedict_t *filedict, const char *key) {
381
+ filedict_read_t read;
382
+ read.filedict = filedict;
383
+ read.key = key;
384
+ read.value = NULL;
385
+ read.bucket = NULL;
386
+ read.entry = NULL;
387
+ read.entry_i = 0;
388
+ read.hashmap_i = 0;
389
+ read.bucket_count = 0;
390
+ read.key_hash = filedict->hash_function(key);
391
+
392
+ filedict_read_advance_hashmap(&read);
393
+ return read;
394
+ }
395
+
396
+ /*
397
+ * Lets you find the next value. Pass the return value of filedict_get.
398
+ *
399
+ * Returns 1 when a next value was found, 0 otherwise.
400
+ *
401
+ * If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
402
+ */
403
+ static int filedict_get_next(filedict_read_t *read) {
404
+ int found = -1;
405
+
406
+ found = filedict_read_advance_value(read);
407
+ if (found == 1) return found;
408
+
409
+ read->entry_i += 1;
410
+ found = filedict_read_advance_entry(read);
411
+ if (found == 1) return found;
412
+
413
+ read->hashmap_i += 1;
414
+ return filedict_read_advance_hashmap(read);
415
+ }
416
+
417
+ #endif
data/filedictrb.gemspec CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
25
25
  (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
26
26
  end
27
27
  end
28
+ spec.files << 'ext/filedict/filedict.h'
28
29
  spec.bindir = "exe"
29
30
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
30
31
  spec.require_paths = ["lib"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Filedict
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filedictrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nigel Baillie
@@ -26,6 +26,7 @@ files:
26
26
  - Rakefile
27
27
  - bin/console
28
28
  - bin/setup
29
+ - ext/filedict/filedict.h
29
30
  - ext/filedictrb/extconf.rb
30
31
  - ext/filedictrb/filedictrb.c
31
32
  - ext/filedictrb/filedictrb.h