filedictrb 0.1.0 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 74171da9cf75b57d441faad6754c05e718a170fdada7618cd593ddcc8f8b982a
-   data.tar.gz: 6a88336c3e3cb8e1a18d50bc14bd49794f0ba10e614667c0a086dfb9cd35d11c
+   metadata.gz: aad965d1fb0dd25c74c20f917b2f71e0d48fc7d6dea59bab8ad2c5990b3fb8fe
+   data.tar.gz: cd8c479f17d113b19d5ef89d565aeab6c51305e1e0f81dab3d8d6a538555009b
  SHA512:
-   metadata.gz: 9dde01e9a2f1af933ec22e7648797fbbd393362bd9abdfc7ff351104aa0184f02222edcc8c59de98770172f4e1e9c7285d61e9d2b89d575a06aebacb080d86d3
-   data.tar.gz: d569f379a3787a42dcd691a28b57e68ba1dd5579c4afb8741aad66d6ff13d58d2dba5e949772684c41cbec4ecccc9b55de9e1b9506353c447a37d7fd04d14e65
+   metadata.gz: 2dc64caded0eadba4cb03da2dfb926b3c488d6773df59087752953371e5dfabeeac4625478f0ac33a26c4a60f76105a8026ee49aa6e2c700437c0e992ba6a6fe
+   data.tar.gz: 7e392f4e734fbe940fb4d86c7a04700edcad3cbab286d4103f6846dbcadb1f2fd01ac8a6a272dd31e542ded3fb919fcfa6b566919161ab1cc3782a56692a2281
data/bin/setup CHANGED
@@ -3,6 +3,7 @@ set -euo pipefail
  IFS=$'\n\t'
  set -vx

+ git submodule update --init
  bundle install

  # Do any other automated setup that you need to do here
data/ext/filedict/filedict.h ADDED
@@ -0,0 +1,452 @@
+ #ifndef FILEDICT_H
+ #define FILEDICT_H 1
+
+ #ifndef FILEDICT_KEY_SIZE
+ #define FILEDICT_KEY_SIZE 256
+ #endif
+
+ #ifndef FILEDICT_VALUE_SIZE
+ #define FILEDICT_VALUE_SIZE 256
+ #endif
+
+ typedef struct filedict_bucket_entry_t {
+     char key[FILEDICT_KEY_SIZE];
+     char value[FILEDICT_VALUE_SIZE];
+ } filedict_bucket_entry_t;
+
+ #ifndef FILEDICT_BUCKET_ENTRY_COUNT
+ #define FILEDICT_BUCKET_ENTRY_COUNT 4
+ #endif
+
+ typedef struct filedict_bucket_t {
+     filedict_bucket_entry_t entries[FILEDICT_BUCKET_ENTRY_COUNT];
+ } filedict_bucket_t;
+
+ typedef size_t (*filedict_hash_function_t)(const char *);
+
+ typedef struct filedict_t {
+     const char *error;
+     int fd;
+     int flags;
+     void *data;
+     size_t data_len;
+     filedict_hash_function_t hash_function;
+ } filedict_t;
+
+ typedef struct filedict_header_t {
+     unsigned long long initial_bucket_count : 32;
+     unsigned long long hashmap_count : 32;
+ } __attribute__ ((__packed__)) filedict_header_t;
+
+ typedef struct filedict_read_t {
+     filedict_t *filedict;
+     const char *key;
+     const char *value;
+     filedict_bucket_t *bucket;
+     filedict_bucket_entry_t *entry;
+     size_t entry_i;
+     size_t hashmap_i;
+     size_t bucket_count;
+     size_t key_hash;
+ } filedict_read_t;
+
+ #endif
+
+ /*
+  * Above is the header, below is the implementation
+  */
+
+ #ifndef FILEDICT_IMPL
+ #define FILEDICT_IMPL
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <limits.h>
+ #include <assert.h>
+
+ /* This is "djb2" from http://www.cse.yorku.ca/~oz/hash.html */
+ static size_t filedict_default_hash_function(const char *input) {
+     unsigned long hash = 5381;
+     int c;
+
+     while ((c = *input++) != 0) {
+         hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
+     }
+
+     return hash;
+ }
+
+ /*
+  * Writes at most max_len chars from src into dest.
+  * Returns the total number of bytes in src.
+  */
+ static size_t filedict_copy_string(char *dest, const char *src, size_t max_len) {
+     size_t src_len = 0;
+     char c;
+
+     while (1) {
+         c = *src++;
+         if (src_len < max_len) { *dest++ = c; }
+         if (c == 0) return src_len;
+         src_len += 1;
+     }
+ }
+
+ static void filedict_init(filedict_t *filedict) {
+     filedict->error = NULL;
+     filedict->fd = 0;
+     filedict->flags = 0;
+     filedict->data_len = 0;
+     filedict->data = NULL;
+     filedict->hash_function = filedict_default_hash_function;
+ }
+
+ static void filedict_deinit(filedict_t *filedict) {
+     if (filedict->data) {
+         munmap(filedict->data, filedict->data_len);
+         filedict->data = NULL;
+         filedict->data_len = 0;
+     }
+     if (filedict->fd) {
+         close(filedict->fd);
+         filedict->fd = 0;
+         filedict->flags = 0;
+     }
+ }
+
+ /*
+  * This computes the size of the entire filedict file given an initial bucket count and hashmap count.
+  */
+ static size_t filedict_file_size(size_t initial_bucket_count, size_t hashmap_count) {
+     size_t result = sizeof(filedict_header_t);
+     size_t i;
+
+     for (i = 0; i < hashmap_count; ++i) {
+         /* Bucket count is multiplied by 2 for each additional hashmap. */
+         result += (initial_bucket_count << i) * sizeof(filedict_bucket_t);
+     }
+
+     return result;
+ }
+
+ /*
+  * This opens a new file for reading and writing, optionally letting you specify the initial bucket count.
+  */
+ #define filedict_open_new(filedict, filename) \
+     filedict_open_f(filedict, filename, O_CREAT | O_TRUNC | O_RDWR, 4096)
+
+ #define filedict_open_readonly(filedict, filename) \
+     filedict_open_f(filedict, filename, O_RDONLY, 4096)
+
+ #define filedict_open(filedict, filename) \
+     filedict_open_f(filedict, filename, O_CREAT | O_RDWR, 4096)
+
+ static void filedict_open_f(
+     filedict_t *filedict,
+     const char *filename,
+     int flags,
+     unsigned int initial_bucket_count
+ ) {
+     filedict->flags = flags;
+     filedict->fd = open(filename, flags, 0666);
+     if (filedict->fd == -1) { filedict->error = strerror(errno); return; }
+
+     filedict->data_len = filedict_file_size(initial_bucket_count, 1);
+     ftruncate(filedict->fd, filedict->data_len);
+     filedict->data = mmap(
+         NULL,
+         filedict->data_len,
+         PROT_READ | ((flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+
+     filedict_header_t *data = (filedict_header_t *)filedict->data;
+     assert(initial_bucket_count <= UINT_MAX);
+
+     if (data->initial_bucket_count == 0) {
+         data->initial_bucket_count = initial_bucket_count;
+         data->hashmap_count = 1;
+     }
+ }
+
+ /*
+  * Inserts a new value under "key". Filedict keys have multiple values, so this will "append" a new
+  * value onto the end of the entry.
+  */
+ #define filedict_insert(filedict, key, value) filedict_insert_f(filedict, key, value, 0)
+ #define filedict_insert_unique(filedict, key, value) filedict_insert_f(filedict, key, value, 1)
+
+ static void filedict_insert_f(filedict_t *filedict, const char *key, const char *value, int unique) {
+     assert(filedict->fd != 0);
+     assert(filedict->data != NULL);
+
+     size_t i, hashmap_i = 0, bucket_count, key_hash;
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+     filedict_bucket_t *hashmap = filedict->data + sizeof(filedict_header_t);
+     filedict_bucket_t *bucket;
+
+     bucket_count = header->initial_bucket_count;
+
+     key_hash = filedict->hash_function(key);
+
+     /*
+      * Here we loop through each hashmap.
+      */
+     while (hashmap_i < header->hashmap_count) {
+ try_again:
+         /* TODO: can we truncate instead of modulo, like in Ruby? */
+         bucket = &hashmap[key_hash % bucket_count];
+
+         for (i = 0; i < FILEDICT_BUCKET_ENTRY_COUNT; ++i) {
+             filedict_bucket_entry_t *entry = &bucket->entries[i];
+
+             /* Easy case: fresh entry. We can just insert here and call it quits. */
+             if (entry->key[0] == 0) {
+                 strncpy(entry->key, key, FILEDICT_KEY_SIZE);
+                 size_t value_len = filedict_copy_string(entry->value, value, FILEDICT_VALUE_SIZE);
+
+                 if (value_len > FILEDICT_VALUE_SIZE) {
+                     filedict->error = "Value too big";
+                 }
+                 return;
+             }
+             /*
+              * We need to check for room in the value, then append value.
+              * This is also where we might run into a duplicate and duck out.
+              */
+             else if (strncmp(entry->key, key, FILEDICT_KEY_SIZE) == 0) {
+                 long long first_nonzero = -1;
+                 char *candidate = NULL;
+                 size_t value_i, candidate_len;
+
+                 for (value_i = 0; value_i < FILEDICT_VALUE_SIZE - 1; ++value_i) {
+                     if (unique) {
+                         if (first_nonzero == -1 && entry->value[value_i] != 0) {
+                             first_nonzero = value_i;
+                         }
+
+                         if (entry->value[value_i] == 0) {
+                             int cmp = strncmp(
+                                 &entry->value[first_nonzero],
+                                 value,
+                                 FILEDICT_VALUE_SIZE - first_nonzero
+                             );
+                             if (cmp == 0) {
+                                 /* Looks like this value already exists! */
+                                 return;
+                             }
+                             first_nonzero = -1;
+                         }
+                     }
+
+                     if (entry->value[value_i] == 0 && entry->value[value_i + 1] == 0) {
+                         candidate = &entry->value[value_i + 1];
+                         candidate_len = FILEDICT_VALUE_SIZE - value_i - 1;
+
+                         if (strlen(value) >= candidate_len) break;
+
+                         strncpy(candidate, value, candidate_len);
+                         return;
+                     }
+                 }
+             }
+         }
+
+         ++hashmap_i;
+         hashmap += bucket_count;
+         bucket_count = (bucket_count << 1);
+     }
+
+     /*
+      * If we fell through to here, that means we need to allocate a new hashmap.
+      */
+     size_t new_hashmap_count = header->hashmap_count + 1;
+     size_t old_data_len = filedict->data_len;
+     size_t new_data_len = filedict_file_size(header->initial_bucket_count, new_hashmap_count);
+
+     assert(new_data_len > old_data_len);
+     assert((new_data_len - old_data_len) % header->initial_bucket_count == 0);
+
+     munmap(filedict->data, filedict->data_len);
+     int truncate_result = ftruncate(filedict->fd, new_data_len);
+     if (truncate_result != 0) { filedict->error = strerror(errno); return; }
+
+     filedict->data = mmap(
+         filedict->data,
+         new_data_len,
+         PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+     header = (filedict_header_t *)filedict->data;
+     hashmap = filedict->data + old_data_len;
+
+     filedict->data_len = new_data_len;
+     header->hashmap_count = new_hashmap_count;
+     goto try_again;
+ }
+
+ /*
+  * Resizes the filedict based on the header hashmap count and initial bucket count.
+  * Naturally, your pointers into the map will become invalid after calling this.
+  */
+ static void filedict_resize(filedict_t *filedict) {
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+     size_t computed_size = filedict_file_size(header->initial_bucket_count, header->hashmap_count);
+
+     munmap(filedict->data, filedict->data_len);
+     filedict->data = mmap(
+         filedict->data,
+         computed_size,
+         PROT_READ | ((filedict->flags & O_RDWR) ? PROT_WRITE : 0),
+         MAP_SHARED,
+         filedict->fd,
+         0
+     );
+     if (filedict->data == MAP_FAILED) { filedict->error = strerror(errno); return; }
+     filedict->data_len = computed_size;
+ }
+
+ /*
+  * There are 3 "levels" to a filedict. From top to bottom:
+  * 1. Hashmap - which hashmap are we looking at? We create additional hashmaps to handle overflow.
+  * 2. Entry - which entry in our hashmap bucket are we looking at?
+  * 3. Value - where in the value buffer are we looking? There's 256 bytes, so can be many strings.
+  */
+
+ /* #define log_return(val) do { printf("%s -> %i\n", __func__, (val)); return (val); } while(0) */
+ #define log_return(val) return val
+
+ /*
+  * Returns 1 when we successfully advanced to the next value
+  * Returns 0 when there is no next value
+  */
+ static int filedict_read_advance_value(filedict_read_t *read) {
+     assert(read->entry != NULL);
+
+     const char *buffer_begin = read->entry->value;
+     const char *buffer_end = buffer_begin + FILEDICT_VALUE_SIZE;
+
+     const char *c;
+     for (c = read->value; c < buffer_end; ++c) {
+         if (*c == 0) {
+             c += 1;
+             break;
+         }
+     }
+
+     if (c >= buffer_end) log_return(0);
+     if (*c == 0) log_return(0);
+
+     read->value = c;
+     log_return(1);
+ }
+
+ /*
+  * Returns 1 when we successfully find a new entry that matches read->key;
+  * advances read->entry_i and read->entry to the new entry.
+  *
+  * Returns 0 when we exhausted all remaining entries and didn't find a match.
+  */
+ static int filedict_read_advance_entry(filedict_read_t *read) {
+     assert(read->key != NULL);
+     assert(strlen(read->key) > 0);
+     assert(read->bucket != NULL);
+
+     while (1) {
+         if (read->entry_i >= FILEDICT_BUCKET_ENTRY_COUNT) log_return(0);
+
+         read->entry = &read->bucket->entries[read->entry_i];
+
+         if (strncmp(read->entry->key, read->key, FILEDICT_KEY_SIZE) == 0) {
+             read->value = read->entry->value;
+             log_return(1);
+         }
+
+         read->entry_i += 1;
+     }
+ }
+
+ /*
+  * Returns 1 when we successfully advanced to the next hashmap.
+  * read->bucket, read->entry, and read->value will be populated.
+  *
+  * Returns 0 when there are no more hashmaps, or the latest hashmap has no matching entries.
+  */
+ static int filedict_read_advance_hashmap(filedict_read_t *read) {
+     filedict_t *filedict = read->filedict;
+
+     assert(filedict);
+     assert(filedict->data);
+
+     filedict_header_t *header = (filedict_header_t *)filedict->data;
+
+     if (read->hashmap_i >= header->hashmap_count) log_return(0);
+
+     size_t offset = filedict_file_size(header->initial_bucket_count, read->hashmap_i);
+
+     if (offset >= filedict->data_len) {
+         filedict_resize(filedict);
+         if (filedict->error) log_return(0);
+         header = (filedict_header_t *)filedict->data;
+     }
+
+     filedict_bucket_t *hashmap = filedict->data + offset;
+
+     read->bucket_count = (size_t)header->initial_bucket_count << read->hashmap_i;
+     read->bucket = &hashmap[read->key_hash % read->bucket_count];
+     read->entry = &read->bucket->entries[0];
+
+     read->entry_i = 0;
+
+     log_return(filedict_read_advance_entry(read));
+ }
+
+ /*
+  * Returns a "read" at the given key. If there's a hit, <return>.value will have the value.
+  */
+ static filedict_read_t filedict_get(filedict_t *filedict, const char *key) {
+     filedict_read_t read;
+     read.filedict = filedict;
+     read.key = key;
+     read.value = NULL;
+     read.bucket = NULL;
+     read.entry = NULL;
+     read.entry_i = 0;
+     read.hashmap_i = 0;
+     read.bucket_count = 0;
+     read.key_hash = filedict->hash_function(key);
+
+     filedict_read_advance_hashmap(&read);
+     return read;
+ }
+
+ /*
+  * Lets you find the next value. Pass the return value of filedict_get.
+  *
+  * Returns 1 when a next value was found, 0 otherwise.
+  *
+  * If this returns 0, your filedict_read_t is defunct and shouldn't be used anymore.
+  */
+ static int filedict_get_next(filedict_read_t *read) {
+     int found = -1;
+
+     found = filedict_read_advance_value(read);
+     if (found == 1) return found;
+
+     read->entry_i += 1;
+     found = filedict_read_advance_entry(read);
+     if (found == 1) return found;
+
+     read->hashmap_i += 1;
+     return filedict_read_advance_hashmap(read);
+ }
+
+ #endif
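
For orientation, here is a minimal usage sketch of the API this new header exposes (open, insert, read values back). It is not part of the gem; the include path and the file name "example.filedict" are illustrative, and error handling is kept to the basics.

#include <stdio.h>
#include "filedict.h"   /* the header added above; adjust the path to wherever it is vendored */

int main(void) {
    filedict_t dict;
    filedict_init(&dict);

    /* filedict_open creates the file if needed and mmaps it read/write. */
    filedict_open(&dict, "example.filedict");
    if (dict.error) { fprintf(stderr, "open failed: %s\n", dict.error); return 1; }

    /* A key holds many values: filedict_insert appends, filedict_insert_unique skips duplicates. */
    filedict_insert(&dict, "fruits", "apple");
    filedict_insert(&dict, "fruits", "banana");
    filedict_insert_unique(&dict, "fruits", "apple");

    /* Walk every value stored under "fruits". */
    filedict_read_t read = filedict_get(&dict, "fruits");
    while (read.value != NULL) {
        printf("fruits -> %s\n", read.value);
        if (!filedict_get_next(&read)) break;
    }

    filedict_deinit(&dict);
    return 0;
}
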
@@ -3,7 +3,6 @@

  extern VALUE mFiledict;
  VALUE cHash;
- VALUE mSetExt;
  VALUE cSet;

  ID id_add;
@@ -89,7 +88,7 @@ static VALUE fd_set_add(int argc, VALUE *argv, VALUE self) {
      VALUE fd_hash_ruby_object = rb_ivar_get(self, id_fd_hash);

      if (fd_hash_ruby_object == Qnil) {
-         return self;
+         return rb_call_super(argc, argv);
      }

      fd_hash_t *fd_hash = RTYPEDDATA_DATA(fd_hash_ruby_object);
@@ -124,8 +123,6 @@ static VALUE fd_hash_access(VALUE self, VALUE key) {
      rb_ivar_set(result, id_fd_hash, self);
      rb_ivar_set(result, id_fd_key, key);

-     rb_extend_object(result, mSetExt);
-
      return result;
  }

@@ -139,9 +136,8 @@ void fdrb_init_hash() {

      VALUE rb_cSet = rb_define_class("Set", rb_cObject);
      cSet = rb_define_class_under(mFiledict, "Set", rb_cSet);
-     mSetExt = rb_define_module_under(mFiledict, "SetExt");

-     rb_define_method(mSetExt, "add", fd_set_add, -1);
+     rb_define_method(cSet, "add", fd_set_add, -1);

      id_add = rb_intern("add");
      id_remove = rb_intern("remove");
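
The extension hunks above drop the old Filedict::SetExt module (previously extended onto each set returned by fd_hash_access) and instead define add directly on Filedict::Set, falling back to the parent Set implementation via rb_call_super when no backing filedict hash is attached. Below is a rough sketch of that pattern outside this gem, assuming the standard MRI C API and compiled as part of an ordinary C extension; the names MySet, @backing, and Init_example are hypothetical.

#include <ruby.h>

static ID id_backing;   /* hypothetical ivar, analogous to id_fd_hash above */

static VALUE my_set_add(int argc, VALUE *argv, VALUE self) {
    VALUE backing = rb_ivar_get(self, id_backing);
    if (backing == Qnil) {
        /* Nothing attached: behave exactly like the parent Set#add. */
        return rb_call_super(argc, argv);
    }
    /* ...otherwise write through to the backing store here... */
    return self;
}

void Init_example(void) {
    VALUE rb_cSet = rb_define_class("Set", rb_cObject);  /* fetch (or define) ::Set, as the gem does */
    VALUE cMySet  = rb_define_class("MySet", rb_cSet);   /* hypothetical subclass */

    id_backing = rb_intern("@backing");
    rb_define_method(cMySet, "add", my_set_add, -1);     /* argc/argv form, so super sees the same args */
}
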
data/filedictrb.gemspec CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
        (f == __FILE__) || f.match(%r{\A(?:(?:test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
      end
    end
+   spec.files << 'ext/filedict/filedict.h'
    spec.bindir = "exe"
    spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
    spec.require_paths = ["lib"]
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Filedict
-   VERSION = "0.1.0"
+   VERSION = "0.1.3"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: filedictrb
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.3
  platform: ruby
  authors:
  - Nigel Baillie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-03-30 00:00:00.000000000 Z
+ date: 2022-04-03 00:00:00.000000000 Z
  dependencies: []
  description:
  email:
@@ -26,6 +27,7 @@ files:
  - Rakefile
  - bin/console
  - bin/setup
+ - ext/filedict/filedict.h
  - ext/filedictrb/extconf.rb
  - ext/filedictrb/filedictrb.c
  - ext/filedictrb/filedictrb.h
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubygems_version: 3.3.3
+ rubygems_version: 3.1.6
  signing_key:
  specification_version: 4
  summary: Uses filedict to emulate a file-backed Hash<Set<String>>