blurrily 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ #include <stdlib.h>
2
+ #include <inttypes.h>
3
+ #include "blurrily.h"
4
+ #include "ruby.h"
5
+
6
+ /******************************************************************************/
7
+
8
+ typedef struct blurrily_refs_t {
9
+ VALUE hash;
10
+ } blurrily_refs_t;
11
+
12
+ /******************************************************************************/
13
+
14
+ int blurrily_refs_new(blurrily_refs_t** refs_ptr)
15
+ {
16
+ blurrily_refs_t* refs = NULL;
17
+
18
+ refs = (blurrily_refs_t*) malloc(sizeof(blurrily_refs_t));
19
+ if (!refs) return -1;
20
+
21
+ refs->hash = rb_hash_new();
22
+ *refs_ptr = refs;
23
+ return 0;
24
+ }
25
+
26
+ /******************************************************************************/
27
+
28
+ void blurrily_refs_mark(blurrily_refs_t* refs)
29
+ {
30
+ rb_gc_mark(refs->hash);
31
+ return;
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ void blurrily_refs_free(blurrily_refs_t** refs_ptr)
37
+ {
38
+ blurrily_refs_t* refs = *refs_ptr;
39
+
40
+ refs->hash = Qnil;
41
+ free(refs);
42
+ *refs_ptr = NULL;
43
+ return;
44
+ }
45
+
46
+ /******************************************************************************/
47
+
48
+ void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref)
49
+ {
50
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qtrue);
51
+ return;
52
+ }
53
+
54
+ /******************************************************************************/
55
+
56
+ void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref)
57
+ {
58
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qnil);
59
+ }
60
+
61
+ /******************************************************************************/
62
+
63
+ int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref)
64
+ {
65
+ return rb_hash_aref(refs->hash, UINT2NUM(ref)) == Qtrue ? 1 : 0;
66
+ }
@@ -0,0 +1,30 @@
1
/*

  search_tree.h --

  Set of references that is fast to query for membership.

*/
#ifndef BLURRILY_SEARCH_TREE_H
#define BLURRILY_SEARCH_TREE_H 1

#include <inttypes.h>


/* Opaque handle on a set of references */
typedef struct blurrily_refs_t blurrily_refs_t;


/* Allocate an empty set and store it in *refs_ptr.
   Returns 0 on success, -1 if the allocation failed. */
int blurrily_refs_new(blurrily_refs_t** refs_ptr);

/* Destroy a set and reset *refs_ptr to NULL */
void blurrily_refs_free(blurrily_refs_t** refs_ptr);

/* Mark the set's Ruby object with Ruby's GC */
void blurrily_refs_mark(blurrily_refs_t* refs);

/* Add a reference */
void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref);

/* Remove a reference */
void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref);

/* Test for a reference (1 = present, 0 = absent) */
int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref);

#endif /* BLURRILY_SEARCH_TREE_H */
@@ -4,6 +4,7 @@
4
4
  #include <assert.h>
5
5
  #include <fcntl.h>
6
6
  #include <sys/mman.h>
7
+ #include <sys/errno.h>
7
8
  #include <unistd.h>
8
9
  #include <sys/stat.h>
9
10
 
@@ -21,14 +22,13 @@
21
22
  #endif
22
23
 
23
24
  #include "storage.h"
24
-
25
- #include "log.h"
25
+ #include "search_tree.h"
26
26
 
27
27
  /******************************************************************************/
28
28
 
29
29
  #define PAGE_SIZE 4096
30
30
  #define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
31
- #define TRIGRAM_ENTRIES_START_SIZE PAGE_SIZE/8
31
/* Initial number of entries allocated for a trigram: one page worth.
   Parenthesised so the macro stays correct inside a larger expression. */
#define TRIGRAM_ENTRIES_START_SIZE (PAGE_SIZE / sizeof(trigram_entry_t))
32
32
 
33
33
  /******************************************************************************/
34
34
 
@@ -50,7 +50,7 @@ struct PACKED_STRUCT trigram_entries_t
50
50
  uint32_t used;
51
51
 
52
52
  trigram_entry_t* entries; /* set when the structure is in memory */
53
- size_t entries_offset; /* set when the structure is on disk */
53
+ off_t entries_offset; /* set when the structure is on disk */
54
54
 
55
55
  uint8_t dirty; /* not optimised (presorted) yet */
56
56
  };
@@ -68,7 +68,7 @@ struct PACKED_STRUCT trigram_map_t
68
68
  uint32_t total_references;
69
69
  uint32_t total_trigrams;
70
70
  size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
71
- int mapped_fd; /* when mapped from disk, the file descriptor */
71
+ blurrily_refs_t* refs;
72
72
 
73
73
  trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
74
74
  };
@@ -88,6 +88,17 @@ static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(co
88
88
 
89
89
  /******************************************************************************/
90
90
 
91
/* Allocate an array of _NELEM "scribbled" elements of type _TYPE.
   Evaluates to NULL when the allocation fails. */
#define SMALLOC(_NELEM,_TYPE) ((_TYPE*) smalloc((_NELEM), sizeof(_TYPE)))

/*
 * malloc() wrapper that fills the new block with the 0xAA byte pattern, so
 * that reads of memory the caller forgot to initialise stand out.
 *
 * nelem  -- number of elements
 * length -- size of one element, in bytes
 *
 * Returns the block, or NULL when malloc() fails or when nelem * length
 * does not fit in a size_t (errno is then set to ENOMEM). Without that
 * check the product would wrap and a too-small block would be returned.
 */
static void* smalloc(size_t nelem, size_t length)
{
  size_t size   = 0;
  void*  result = NULL;

  if (length != 0 && nelem > ((size_t)-1) / length) {
    errno = ENOMEM;
    return NULL;
  }

  size   = nelem * length;
  result = malloc(size);
  if (result) memset(result, 0xAA, size);
  return result;
}
99
+
100
+ /******************************************************************************/
101
+
91
102
  /* 1 -> little endian, 2 -> big endian */
92
103
  static uint8_t get_big_endian()
93
104
  {
@@ -171,24 +182,23 @@ int blurrily_storage_new(trigram_map* haystack_ptr)
171
182
  int k = 0;
172
183
 
173
184
  LOG("blurrily_storage_new\n");
174
- haystack = (trigram_map) malloc(sizeof(trigram_map_t));
185
+ haystack = SMALLOC(1, trigram_map_t);
175
186
  if (haystack == NULL) return -1;
176
187
 
177
- memset(haystack, 0x00, sizeof(trigram_map_t));
178
-
179
188
  memcpy(haystack->magic, "trigra", 6);
180
189
  haystack->big_endian = get_big_endian();
181
190
  haystack->pointer_size = get_pointer_size();
182
191
 
183
192
  haystack->mapped_size = 0; /* not mapped, as we just created it in memory */
184
- haystack->mapped_fd = 0;
185
193
  haystack->total_references = 0;
186
194
  haystack->total_trigrams = 0;
195
+ haystack->refs = NULL;
187
196
  for(k = 0, ptr = haystack->map ; k < TRIGRAM_COUNT ; ++k, ++ptr) {
188
197
  ptr->buckets = 0;
189
198
  ptr->used = 0;
190
199
  ptr->dirty = 0;
191
200
  ptr->entries = (trigram_entry_t*)NULL;
201
+ ptr->entries_offset = 0;
192
202
  }
193
203
 
194
204
  *haystack_ptr = haystack;
@@ -212,25 +222,46 @@ int blurrily_storage_load(trigram_map* haystack, const char* path)
212
222
  res = fstat(fd, &metadata);
213
223
  if (res < 0) goto cleanup;
214
224
 
225
+ /* check this file is at least long enough to have a header */
226
+ if (metadata.st_size < (off_t) sizeof(trigram_map_t)) {
227
+ errno = EPROTO;
228
+ res = -1;
229
+ goto cleanup;
230
+ }
231
+
215
232
  header = (trigram_map) mmap(NULL, metadata.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
216
- assert(header != NULL);
233
+ if (header == MAP_FAILED) {
234
+ res = -1;
235
+ header = NULL;
236
+ goto cleanup;
237
+ }
238
+
239
+ /* fd not needed once mapping established */
240
+ res = close(fd);
241
+ if (res < 0) goto cleanup;
242
+ fd = -1;
217
243
 
218
244
  /* check magic */
219
- /* TODO */
245
+ res = memcmp(header->magic, "trigra", 6);
246
+ if (res != 0 || header->big_endian != get_big_endian() || header->pointer_size != get_pointer_size()) {
247
+ errno = EPROTO;
248
+ res = -1;
249
+ goto cleanup;
250
+ }
220
251
 
221
252
  /* fix header data */
222
253
  header->mapped_size = metadata.st_size;
223
- header->mapped_fd = fd;
224
254
  origin = (uint8_t*)header;
225
255
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
226
256
  trigram_entries_t* map = header->map + k;
227
257
  if (map->entries_offset == 0) continue;
228
258
  map->entries = (trigram_entry_t*) (origin + map->entries_offset);
229
- map->entries_offset = 0;
230
259
  }
231
260
  *haystack = header;
232
261
 
233
262
  cleanup:
263
+ if (fd > 0) (void) close(fd);
264
+ if (res < 0 && header != NULL) (void) munmap(header, metadata.st_size);
234
265
  return res;
235
266
  }
236
267
 
@@ -239,29 +270,28 @@ cleanup:
239
270
  int blurrily_storage_close(trigram_map* haystack_ptr)
240
271
  {
241
272
  trigram_map haystack = *haystack_ptr;
242
- int res = -1;
273
+ int res = 0;
274
+ trigram_entries_t* ptr = haystack->map;
243
275
 
244
276
  LOG("blurrily_storage_close\n");
245
277
 
246
- if (haystack->mapped_size) {
247
- int fd = haystack->mapped_fd;
278
+ for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
279
+ if (ptr->entries_offset == 0) free(ptr->entries);
280
+ ++ptr;
281
+ }
248
282
 
249
- res = munmap(haystack, haystack->mapped_size);
250
- assert(res >= 0);
283
+ if (haystack->refs) blurrily_refs_free(&haystack->refs);
251
284
 
252
- res = close(fd);
253
- assert(res >= 0);
285
+ if (haystack->mapped_size) {
286
+ res = munmap(haystack, haystack->mapped_size);
287
+ if (res < 0) goto cleanup;
254
288
  } else {
255
- trigram_entries_t* ptr = haystack->map;
256
- for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
257
- free(ptr->entries);
258
- ++ptr;
259
- }
260
289
  free(haystack);
261
290
  }
262
291
 
292
+ cleanup:
263
293
  *haystack_ptr = NULL;
264
- return 0;
294
+ return res;
265
295
  }
266
296
 
267
297
  /******************************************************************************/
@@ -269,7 +299,7 @@ int blurrily_storage_close(trigram_map* haystack_ptr)
269
299
  int blurrily_storage_save(trigram_map haystack, const char* path)
270
300
  {
271
301
  int fd = -1;
272
- int res = -1;
302
+ int res = 0;
273
303
  uint8_t* ptr = (uint8_t*)NULL;
274
304
  size_t total_size = 0;
275
305
  size_t offset = 0;
@@ -282,7 +312,7 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
282
312
  }
283
313
 
284
314
  /* path for temporary file */
285
- snprintf(path_tmp, PATH_MAX, "%s.tmp", path);
315
+ snprintf(path_tmp, PATH_MAX, "%s.tmp.%ld", path, random());
286
316
 
287
317
  /* compute storage space required */
288
318
  total_size += round_to_page(sizeof(trigram_map_t));
@@ -293,16 +323,19 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
293
323
 
294
324
  /* open and map file */
295
325
  fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
296
- assert(fd >= 0);
326
+ if (fd < 0) goto cleanup;
297
327
 
298
328
  res = ftruncate(fd, total_size);
299
- assert(res >= 0);
329
+ if (res < 0) goto cleanup;
300
330
 
301
331
  ptr = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
302
- assert(ptr != NULL);
332
+ if (ptr == MAP_FAILED) { res = -1 ; goto cleanup ; }
333
+
334
+ (void) close(fd);
335
+ fd = -1;
303
336
 
304
337
  /* flush data */
305
- memset(ptr, 0x00, total_size);
338
+ memset(ptr, 0xFF, total_size);
306
339
 
307
340
  /* copy header & clean copy */
308
341
  memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
@@ -310,7 +343,7 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
310
343
  header = (trigram_map)ptr;
311
344
 
312
345
  header->mapped_size = 0;
313
- header->mapped_fd = 0;
346
+ header->refs = NULL;
314
347
 
315
348
  /* copy each map, set offset in header */
316
349
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
@@ -330,17 +363,34 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
330
363
  }
331
364
  assert(offset == total_size);
332
365
 
333
- res = munmap(ptr, total_size);
334
- assert(res >= 0);
335
-
336
- res = close(fd);
337
- assert(res >= 0);
366
+ cleanup:
367
+ if (ptr != NULL && total_size > 0) {
368
+ res = munmap(ptr, total_size);
369
+ }
338
370
 
339
371
  /* commit by renaming the file */
340
- res = rename(path_tmp, path);
341
- assert(res >= 0);
372
+ if (res >= 0 && path) {
373
+ res = rename(path_tmp, path);
374
+ }
342
375
 
343
- return 0;
376
+ return res;
377
+ }
378
+
379
+ /******************************************************************************/
380
+
381
+ void add_all_refs(trigram_map haystack)
382
+ {
383
+ assert(haystack->refs != NULL);
384
+
385
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
386
+ trigram_entries_t* map = haystack->map + k;
387
+ trigram_entry_t* ptr = map->entries;
388
+ assert(map->used <= map->buckets);
389
+ for (uint32_t j = 0; j < map->used; ++j, ++ptr) {
390
+ uint32_t ref = ptr->reference;
391
+ blurrily_refs_add(haystack->refs, ref);
392
+ }
393
+ }
344
394
  }
345
395
 
346
396
  /******************************************************************************/
@@ -348,13 +398,19 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
348
398
  int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
349
399
  {
350
400
  int nb_trigrams = -1;
351
- int length = strlen(needle);
401
+ size_t length = strlen(needle);
352
402
  trigram_t* trigrams = (trigram_t*)NULL;
353
403
 
354
- trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
404
+ if (!haystack->refs) {
405
+ blurrily_refs_new(&haystack->refs);
406
+ add_all_refs(haystack);
407
+ }
408
+ if (blurrily_refs_test(haystack->refs, reference)) return 0;
409
+ if (weight <= 0) weight = (uint32_t) length;
410
+
411
+ trigrams = SMALLOC(length+1, trigram_t);
355
412
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
356
413
 
357
- if (weight <= 0) weight = length;
358
414
 
359
415
  for (int k = 0; k < nb_trigrams; ++k) {
360
416
  trigram_t t = trigrams[k];
@@ -369,33 +425,51 @@ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t refe
369
425
  LOG("- alloc for %d\n", t);
370
426
 
371
427
  map->buckets = TRIGRAM_ENTRIES_START_SIZE;
372
- map->entries = (trigram_entry_t*) calloc(map->buckets, sizeof(trigram_entry_t));
428
+ map->entries = SMALLOC(map->buckets, trigram_entry_t);
373
429
  }
374
- if (map->used == map->buckets) {
375
- uint32_t new_buckets = map->buckets * 4/3;
430
+ else if (map->used == map->buckets) {
431
+ uint32_t new_buckets = map->buckets * 4/3;
376
432
  trigram_entry_t* new_entries = NULL;
377
433
  LOG("- realloc for %d\n", t);
378
434
 
379
435
  /* copy old data, free old pointer, zero extra space */
380
- new_entries = malloc(new_buckets * sizeof(trigram_entry_t));
436
+ new_entries = SMALLOC(new_buckets, trigram_entry_t);
381
437
  assert(new_entries != NULL);
382
438
  memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t));
383
- free(map->entries);
384
- memset(new_entries + map->buckets, 0x00, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
439
+ /* scribble the rest of the map*/
440
+ // memset(new_entries + map->buckets, 0xFF, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
441
+
442
+ #ifndef NDEBUG
443
+ /* scribble old data */
444
+ memset(map->entries, 0xFF, map->buckets * sizeof(trigram_entry_t));
445
+ #endif
446
+
447
+ if (map->entries_offset) {
448
+ /* old data was on disk, just mark it as no longer on disk */
449
+ map->entries_offset = 0;
450
+ } else {
451
+ /* free old data */
452
+ free(map->entries);
453
+ }
454
+
385
455
  /* swap fields */
386
456
  map->buckets = new_buckets;
387
457
  map->entries = new_entries;
388
458
  }
459
+
460
+ /* insert new entry */
461
+ assert(map->used < map->buckets);
389
462
  map->entries[map->used] = entry;
390
-
391
463
  map->used += 1;
392
464
  map->dirty = 1;
393
465
  }
394
466
  haystack->total_trigrams += nb_trigrams;
395
467
  haystack->total_references += 1;
396
468
 
469
+ blurrily_refs_add(haystack->refs, reference);
470
+
397
471
  free((void*)trigrams);
398
- return 0;
472
+ return nb_trigrams;
399
473
  }
400
474
 
401
475
  /******************************************************************************/
@@ -403,7 +477,7 @@ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t refe
403
477
  int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
404
478
  {
405
479
  int nb_trigrams = -1;
406
- int length = strlen(needle);
480
+ size_t length = strlen(needle);
407
481
  trigram_t* trigrams = (trigram_t*)NULL;
408
482
  int nb_entries = -1;
409
483
  trigram_entry_t* entries = NULL;
@@ -414,7 +488,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
414
488
  uint32_t last_ref = (uint32_t)-1;
415
489
  int nb_results = 0;
416
490
 
417
- trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
491
+ trigrams = SMALLOC(length+1, trigram_t);
418
492
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
419
493
  if (nb_trigrams == 0) goto cleanup;
420
494
 
@@ -429,7 +503,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
429
503
  if (nb_entries == 0) goto cleanup;
430
504
 
431
505
  /* allocate sorting memory */
432
- entries = (trigram_entry_t*) malloc(nb_entries * sizeof(trigram_entry_t));
506
+ entries = SMALLOC(nb_entries, trigram_entry_t);
433
507
  assert(entries != NULL);
434
508
  LOG("allocated space for %zd trigrams entries\n", nb_entries);
435
509
 
@@ -464,7 +538,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
464
538
  LOG("total %zd distinct matches\n", nb_matches);
465
539
 
466
540
  /* allocate matches result */
467
- matches = (trigram_match_t*) calloc(nb_matches, sizeof(trigram_match_t));
541
+ matches = SMALLOC(nb_matches, trigram_match_t);
468
542
  assert(matches != NULL);
469
543
 
470
544
  /* reduction, counting matches per reference */
@@ -519,15 +593,18 @@ int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
519
593
  entry = map->entries + j;
520
594
  if (entry->reference != reference) continue;
521
595
 
596
+ /* swap with the last entry */
522
597
  *entry = map->entries[map->used - 1];
598
+ memset(map->entries + map->used - 1, 0xFF, sizeof(trigram_entry_t));
599
+
523
600
  map->used -= 1;
524
601
 
525
602
  ++trigrams_deleted;
526
603
  --j;
527
604
  }
528
605
  }
529
- haystack->total_trigrams -= trigrams_deleted;
530
- haystack->total_references -= 1;
606
+ haystack->total_trigrams -= trigrams_deleted;
607
+ if (trigrams_deleted > 0) haystack->total_references -= 1;
531
608
  return trigrams_deleted;
532
609
  }
533
610
 
@@ -539,3 +616,11 @@ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
539
616
  stats->trigrams = haystack->total_trigrams;
540
617
  return 0;
541
618
  }
619
+
620
+ /******************************************************************************/
621
+
622
+ void blurrily_storage_mark(trigram_map haystack)
623
+ {
624
+ if (haystack->refs) blurrily_refs_mark(haystack->refs);
625
+ return;
626
+ }