filedictrb 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74171da9cf75b57d441faad6754c05e718a170fdada7618cd593ddcc8f8b982a
4
- data.tar.gz: 6a88336c3e3cb8e1a18d50bc14bd49794f0ba10e614667c0a086dfb9cd35d11c
3
+ metadata.gz: 64d09435d5e09d917e12a06932a75ddf365b081a9caa1253a3b11a53a235196c
4
+ data.tar.gz: 4cd1fedaa0035da30c2d9572eb28dd190cab901803089b164c36ba4d5776d36c
5
5
  SHA512:
6
- metadata.gz: 9dde01e9a2f1af933ec22e7648797fbbd393362bd9abdfc7ff351104aa0184f02222edcc8c59de98770172f4e1e9c7285d61e9d2b89d575a06aebacb080d86d3
7
- data.tar.gz: d569f379a3787a42dcd691a28b57e68ba1dd5579c4afb8741aad66d6ff13d58d2dba5e949772684c41cbec4ecccc9b55de9e1b9506353c447a37d7fd04d14e65
6
+ metadata.gz: d232d34e3b99221baf055c56ca672b2536d910103a3a4dd6bab1be9005270db86cda72690f1dd56ed230fb0214ef4cecae45a9f8ae5931b7ebcb3288df8492e2
7
+ data.tar.gz: de03f2dc36efad352a54f647f3275cc15bae641799e3b2ca258cdbb08100103493ac35e2c37e327f7f39feb2dc92eb80af6fb3c4cd645fad8794612ee0cffcfc
data/bin/setup CHANGED
@@ -3,6 +3,7 @@ set -euo pipefail
3
3
  IFS=$'\n\t'
4
4
  set -vx
5
5
 
6
+ git submodule update --init
6
7
  bundle install
7
8
 
8
9
  # Do any other automated setup that you need to do here
@@ -0,0 +1,417 @@
1
#ifndef FILEDICT_H
#define FILEDICT_H 1

/* size_t is used by the declarations below, so the header must provide it itself. */
#include <stddef.h>

/* Size in bytes of the fixed-width key field of every entry. Override before including. */
#ifndef FILEDICT_KEY_SIZE
#define FILEDICT_KEY_SIZE 256
#endif

/* Size in bytes of the fixed-width value buffer of every entry. Override before including. */
#ifndef FILEDICT_VALUE_SIZE
#define FILEDICT_VALUE_SIZE 256
#endif

/*
 * One key slot. An entry whose key starts with a NUL byte is treated as unused.
 * The value buffer holds one or more NUL-terminated strings laid out back to back.
 */
typedef struct filedict_bucket_entry_t {
    char key[FILEDICT_KEY_SIZE];
    char value[FILEDICT_VALUE_SIZE];
} filedict_bucket_entry_t;

/* Number of entries stored in each bucket. Override before including. */
#ifndef FILEDICT_BUCKET_ENTRY_COUNT
#define FILEDICT_BUCKET_ENTRY_COUNT 4
#endif

/* A bucket is a fixed-size group of entries; a key's hash selects one bucket per hashmap. */
typedef struct filedict_bucket_t {
    filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT];
} filedict_bucket_t;

/* Signature of the function used to hash a NUL-terminated key. */
typedef size_t (*filedict_hash_function_t)(const char *);

/* Handle for an open filedict. */
typedef struct filedict_t {
    const char *error;                      /* last error message, or NULL */
    int fd;                                 /* descriptor of the backing file */
    void *data;                             /* start of the memory-mapped file */
    size_t data_len;                        /* length of the mapping in bytes */
    filedict_hash_function_t hash_function; /* hash applied to every key */
} filedict_t;

/*
 * Header stored at the very beginning of the file. The hashmaps follow it directly;
 * each additional hashmap has twice as many buckets as the one before it.
 */
typedef struct filedict_header_t {
    unsigned long long initial_bucket_count : 32;
    unsigned long long hashmap_count : 32;
} __attribute__ ((__packed__)) filedict_header_t;

/* Cursor used to walk all values stored under one key. */
typedef struct filedict_read_t {
    const filedict_t *filedict;     /* filedict being read */
    const char *key;                /* key being looked up */
    const char *value;              /* current value, or NULL when nothing was found */
    filedict_bucket_t *bucket;      /* bucket selected in the current hashmap */
    filedict_bucket_entry_t *entry; /* entry currently being examined */
    size_t entry_i;                 /* index of `entry` within `bucket` */
    size_t hashmap_i;               /* index of the current hashmap */
    size_t bucket_count;            /* number of buckets in the current hashmap */
    size_t key_hash;                /* cached hash of `key` */
} filedict_read_t;

#endif
53
+
54
+ /*
55
+ * Above is the header, below is the implementation
56
+ */
57
+
58
+ #ifndef FILEDICT_IMPL
59
+ #define FILEDICT_IMPL
60
+ #include <sys/mman.h>
61
+ #include <string.h>
62
+ #include <unistd.h>
63
+ #include <fcntl.h>
64
+ #include <errno.h>
65
+ #include <limits.h>
66
+ #include <assert.h>
67
+
68
/*
 * Default key hash: the "djb2" string hash from http://www.cse.yorku.ca/~oz/hash.html
 *
 * Starts from 5381 and folds in every byte of the NUL-terminated input as
 * hash = hash * 33 + byte.
 */
static size_t filedict_default_hash_function(const char *input) {
    unsigned long hash = 5381;
    const char *cursor;

    for (cursor = input; *cursor != 0; ++cursor) {
        int c = *cursor;
        hash = hash * 33 + c;
    }

    return hash;
}
79
+
80
/*
 * Copies src into dest, which has room for max_len bytes.
 *
 * At most max_len - 1 characters are copied and dest is always NUL-terminated
 * (unless max_len is 0, in which case dest is left untouched).
 *
 * Returns the total number of characters in src, not counting its terminator.
 * A return value >= max_len therefore means the copy was truncated.
 */
static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
    const size_t src_len = strlen(src);

    if (max_len > 0) {
        /* Always leave one byte for the terminator so dest stays a valid C string. */
        const size_t copy_len = src_len < max_len ? src_len : max_len - 1;
        memcpy(dest, src, copy_len);
        dest[copy_len] = '\0';
    }

    return src_len;
}
95
+
96
+ static void filedict_init(filedict_t *filedict) {
97
+ filedict->error = NULL;
98
+ filedict->fd = 0;
99
+ filedict->data_len = 0;
100
+ filedict->data = NULL;
101
+ filedict->hash_function = filedict_default_hash_function;
102
+ }
103
+
104
+ static void filedict_deinit(filedict_t *filedict) {
105
+ if (filedict->data) {
106
+ munmap(filedict->data, filedict->data_len);
107
+ filedict->data = NULL;
108
+ filedict->data_len = 0;
109
+ }
110
+ if (filedict->fd) {
111
+ close(filedict->fd);
112
+ filedict->fd = 0;
113
+ }
114
+ }
115
+
116
+ /*
117
+ * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
118
+ */
119
+ static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
120
+ size_t result = sizeof(filedict_header_t);
121
+ size_t i;
122
+
123
+ for (i = 0; i < hashmap_count; ++i) {
124
+ /* Bucket count is multiplied by 2 for each additional hashmap. */
125
+ result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
126
+ }
127
+
128
+ return result;
129
+ }
130
+
131
+ /*
132
+ * This opens a new file for reading and writing, optionally letting you specify the initial bucket count.
133
+ */
134
+ #define filedict_open_new(filedict, filename) \
135
+ filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
136
+
137
+ #define filedict_open_readonly(filedict, filename) \
138
+ filedict_open_f(filedict, filename, O_RDONLY, 4096)
139
+
140
+ #define filedict_open(filedict, filename) \
141
+ filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
142
+
143
+ static void filedict_open_f(
144
+ filedict_t *filedict,
145
+ const char *filename,
146
+ int flags,
147
+ unsigned int initial_bucket_count
148
+ ) {
149
+ filedict->fd = open(filename, flags, 0666);
150
+ if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
151
+
152
+ filedict->data_len = filedict_file_size(initial_bucket_count, 1);
153
+ ftruncate(filedict->fd, filedict->data_len);
154
+ filedict->data = mmap(
155
+ NULL,
156
+ filedict->data_len,
157
+ PROT_READ | PROT_WRITE,
158
+ MAP_SHARED,
159
+ filedict->fd,
160
+ 0
161
+ );
162
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
163
+
164
+ filedict_header_t *data = (filedict_header_t *)filedict->data;
165
+ assert(initial_bucket_count <= UINT_MAX);
166
+ data->initial_bucket_count = initial_bucket_count;
167
+ data->hashmap_count = 1;
168
+ }
169
+
170
+ /*
171
+ * Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
172
+ * value onto the end of the entry.
173
+ */
174
+ #define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
175
+ #define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
176
+
177
+ static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
178
+ assert(filedict->fd != 0);
179
+ assert(filedict->data != NULL);
180
+
181
+ size_t i, hashmap_i = 0, bucket_count, key_hash;
182
+ filedict_header_t *header = (filedict_header_t *)filedict->data;
183
+ filedict_bucket_t *hashmap = filedict->data + sizeof(filedict_header_t);
184
+ filedict_bucket_t *bucket;
185
+
186
+ bucket_count = header->initial_bucket_count;
187
+
188
+ key_hash = filedict->hash_function(key);
189
+
190
+ /*
191
+ * Here we loop through each hashmap.
192
+ */
193
+ while (hashmap_i < header->hashmap_count) {
194
+ try_again:
195
+ /* TODO: can we truncate instead of modulo, like in Ruby? */
196
+ bucket = &hashmap[key_hash % bucket_count];
197
+
198
+ for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
199
+ filedict_bucket_entry_t *entry = &bucket->entries[i];
200
+
201
+ /* Easy case: fresh entry. We can just insert here and call it quits. */
202
+ if (entry->key[0] == 0) {
203
+ strncpy(entry->key, key, FILEDICT_KEY_SIZE);
204
+ size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
205
+
206
+ if (value_len > FILEDICT_VALUE_SIZE) {
207
+ filedict->error = "Value too big";
208
+ }
209
+ return;
210
+ }
211
+ /*
212
+ * We need to check for room in the value, then append value.
213
+ * This is also where we might run into a duplicate and duck out.existing
214
+ */
215
+ else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
216
+ long long first_nonzero = -1;
217
+ char *candidate = NULL;
218
+ size_t value_i, candidate_len;
219
+
220
+ for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
221
+ if (unique) {
222
+ if (first_nonzero == -1 && entry->value[value_i] != 0) {
223
+ first_nonzero = value_i;
224
+ }
225
+
226
+ if (entry->value[value_i] == 0) {
227
+ int cmp = strncmp(
228
+ &entry->value[first_nonzero],
229
+ value,
230
+ FILEDICT_VALUE_SIZE - first_nonzero
231
+ );
232
+ if (cmp == 0) {
233
+ /* Looks like this value already exists! */
234
+ return;
235
+ }
236
+ first_nonzero = -1;
237
+ }
238
+ }
239
+
240
+ if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
241
+ candidate = &entry->value[value_i + 1];
242
+ candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
243
+
244
+ if (strlen(value) >= candidate_len) break;
245
+
246
+ strncpy(candidate, value, candidate_len);
247
+ return;
248
+ }
249
+ }
250
+ }
251
+ }
252
+
253
+ ++hashmap_i;
254
+ hashmap += bucket_count;
255
+ bucket_count = (bucket_count << 1);
256
+ }
257
+
258
+ /*
259
+ * If we fell through to here, that means we need to allocate a new hashmap.
260
+ */
261
+ size_t new_hashmap_count = header->hashmap_count + 1;
262
+ size_t old_data_len = filedict->data_len;
263
+ size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
264
+
265
+ assert(new_data_len > old_data_len);
266
+ assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
267
+
268
+ munmap(filedict->data, filedict->data_len);
269
+ int truncate_result = ftruncate(filedict->fd, new_data_len);
270
+ if (truncate_result != 0) { filedict->error = strerror(errno); return; }
271
+
272
+ filedict->data = mmap(
273
+ filedict->data,
274
+ new_data_len,
275
+ PROT_READ | PROT_WRITE,
276
+ MAP_SHARED,
277
+ filedict->fd,
278
+ 0
279
+ );
280
+ if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
281
+ header = (filedict_header_t *)filedict->data;
282
+ hashmap = filedict->data + old_data_len;
283
+
284
+ filedict->data_len = new_data_len;
285
+ header->hashmap_count = new_hashmap_count;
286
+ goto try_again;
287
+ }
288
+
289
+ /*
290
+ * There are 3 "levels" to a filedict. From top to bottom:
291
+ * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
292
+ * 2. Entry - which entry in our hashmap bucket are we looking at?
293
+ * 3. Value - where in the value buffer are we looking? There's 256 bytes, so can be many strings.
294
+ */
295
+
296
+ /* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
297
+ #define log_return(val) return val
298
+
299
+ /*
300
+ * Returns 1 when we successfully advanced to the next value
301
+ * Returns 0 when there is no next value
302
+ */
303
+ static int filedict_read_advance_value(filedict_read_t *read) {
304
+ assert(read->entry != NULL);
305
+
306
+ const char *buffer_begin = read->entry->value;
307
+ const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
308
+
309
+ const char *c;
310
+ for (c = read->value; c < buffer_end; ++c) {
311
+ if (*c == 0) {
312
+ c += 1;
313
+ break;
314
+ }
315
+ }
316
+
317
+ if (c >= buffer_end) log_return(0);
318
+ if (*c == 0) log_return(0);
319
+
320
+ read->value = c;
321
+ log_return(1);
322
+ }
323
+
324
+ /*
325
+ * Returns 1 when we successfully find a new entry that matches read->key.
326
+ * advances read->entry_i and read->entry to the new entry.
327
+ *
328
+ * Returns 0 when we exhausted all remaining entries and didn't find a match.
329
+ */
330
+ static int filedict_read_advance_entry(filedict_read_t *read) {
331
+ assert(read->key != NULL);
332
+ assert(strlen(read->key) > 0);
333
+ assert(read->bucket != NULL);
334
+
335
+ while (1) {
336
+ if (read->entry_i >= FILEDICT_BUCKET_ENTRY_COUNT) log_return(0);
337
+
338
+ read->entry = &read->bucket->entries[read->entry_i];
339
+
340
+ if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
341
+ read->value = read->entry->value;
342
+ log_return(1);
343
+ }
344
+
345
+ read->entry_i += 1;
346
+ }
347
+ }
348
+
349
+ /*
350
+ * Returns 1 when we successfully advanced to the next hashmap.
351
+ * read->bucket, read->entry, and read->value will be populated.
352
+ *
353
+ * Returns 0 when there are no more hashmaps, or the latest hashmap has no matching entries.
354
+ */
355
+ static int filedict_read_advance_hashmap(filedict_read_t *read) {
356
+ const filedict_t *filedict = read->filedict;
357
+
358
+ assert(filedict);
359
+ assert(filedict->data);
360
+
361
+ filedict_header_t *header = (filedict_header_t*)filedict->data;
362
+
363
+ if (read->hashmap_i >= header->hashmap_count) log_return(0);
364
+
365
+ size_t offset = filedict_file_size(header->initial_bucket_count, read->hashmap_i);
366
+ filedict_bucket_t *hashmap = filedict->data + offset;
367
+
368
+ read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
369
+ read->bucket = &hashmap[read->key_hash % read->bucket_count];
370
+ read->entry = &read->bucket->entries[0];
371
+
372
+ read->entry_i = 0;
373
+
374
+ log_return(filedict_read_advance_entry(read));
375
+ }
376
+
377
+ /*
378
+ * Returns a "read" at the given key. If there's a hit, <return>.value will have the value.
379
+ */
380
+ static filedict_read_t filedict_get(const filedict_t *filedict, const char *key) {
381
+ filedict_read_t read;
382
+ read.filedict = filedict;
383
+ read.key = key;
384
+ read.value = NULL;
385
+ read.bucket = NULL;
386
+ read.entry = NULL;
387
+ read.entry_i = 0;
388
+ read.hashmap_i = 0;
389
+ read.bucket_count = 0;
390
+ read.key_hash = filedict->hash_function(key);
391
+
392
+ filedict_read_advance_hashmap(&read);
393
+ return read;
394
+ }
395
+
396
+ /*
397
+ * Lets you find the next value. Pass the return value of filedict_get.
398
+ *
399
+ * Returns 1 when a next value was found, 0 otherwise.
400
+ *
401
+ * If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
402
+ */
403
+ static int filedict_get_next(filedict_read_t *read) {
404
+ int found = -1;
405
+
406
+ found = filedict_read_advance_value(read);
407
+ if (found == 1) return found;
408
+
409
+ read->entry_i += 1;
410
+ found = filedict_read_advance_entry(read);
411
+ if (found == 1) return found;
412
+
413
+ read->hashmap_i += 1;
414
+ return filedict_read_advance_hashmap(read);
415
+ }
416
+
417
+ #endif
data/filedictrb.gemspec CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
25
25
  (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
26
26
  end
27
27
  end
28
+ spec.files << 'ext/filedict/filedict.h'
28
29
  spec.bindir = "exe"
29
30
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
30
31
  spec.require_paths = ["lib"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Filedict
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filedictrb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nigel Baillie
@@ -26,6 +26,7 @@ files:
26
26
  - Rakefile
27
27
  - bin/console
28
28
  - bin/setup
29
+ - ext/filedict/filedict.h
29
30
  - ext/filedictrb/extconf.rb
30
31
  - ext/filedictrb/filedictrb.c
31
32
  - ext/filedictrb/filedictrb.h