blurrily 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ #include <stdlib.h>
2
+ #include <inttypes.h>
3
+ #include "blurrily.h"
4
+ #include "ruby.h"
5
+
6
+ /******************************************************************************/
7
+
8
+ typedef struct blurrily_refs_t {
9
+ VALUE hash;
10
+ } blurrily_refs_t;
11
+
12
+ /******************************************************************************/
13
+
14
+ int blurrily_refs_new(blurrily_refs_t** refs_ptr)
15
+ {
16
+ blurrily_refs_t* refs = NULL;
17
+
18
+ refs = (blurrily_refs_t*) malloc(sizeof(blurrily_refs_t));
19
+ if (!refs) return -1;
20
+
21
+ refs->hash = rb_hash_new();
22
+ *refs_ptr = refs;
23
+ return 0;
24
+ }
25
+
26
+ /******************************************************************************/
27
+
28
+ void blurrily_refs_mark(blurrily_refs_t* refs)
29
+ {
30
+ rb_gc_mark(refs->hash);
31
+ return;
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ void blurrily_refs_free(blurrily_refs_t** refs_ptr)
37
+ {
38
+ blurrily_refs_t* refs = *refs_ptr;
39
+
40
+ refs->hash = Qnil;
41
+ free(refs);
42
+ *refs_ptr = NULL;
43
+ return;
44
+ }
45
+
46
+ /******************************************************************************/
47
+
48
+ void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref)
49
+ {
50
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qtrue);
51
+ return;
52
+ }
53
+
54
+ /******************************************************************************/
55
+
56
+ void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref)
57
+ {
58
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qnil);
59
+ }
60
+
61
+ /******************************************************************************/
62
+
63
+ int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref)
64
+ {
65
+ return rb_hash_aref(refs->hash, UINT2NUM(ref)) == Qtrue ? 1 : 0;
66
+ }
@@ -0,0 +1,30 @@
1
+ /*
2
+
3
+ search_tree.h --
4
+
5
+ List of all references that's fast to query for existence.
6
+
7
+ */
8
+ #include <inttypes.h>
9
+
10
+
11
+ typedef struct blurrily_refs_t blurrily_refs_t;
12
+
13
+
14
+ /* Allocate a search tree */
15
+ int blurrily_refs_new(blurrily_refs_t** refs_ptr);
16
+
17
+ /* Destroy a search tree */
18
+ void blurrily_refs_free(blurrily_refs_t** refs_ptr);
19
+
20
+ /* Mark with Ruby's GC */
21
+ void blurrily_refs_mark(blurrily_refs_t* refs);
22
+
23
+ /* Add a reference */
24
+ void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref);
25
+
26
+ /* Remove a reference */
27
+ void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref);
28
+
29
+ /* Test for a reference (1 = present, 0 = absent) */
30
+ int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref);
@@ -4,6 +4,7 @@
4
4
  #include <assert.h>
5
5
  #include <fcntl.h>
6
6
  #include <sys/mman.h>
7
+ #include <sys/errno.h>
7
8
  #include <unistd.h>
8
9
  #include <sys/stat.h>
9
10
 
@@ -21,14 +22,13 @@
21
22
  #endif
22
23
 
23
24
  #include "storage.h"
24
-
25
- #include "log.h"
25
+ #include "search_tree.h"
26
26
 
27
27
  /******************************************************************************/
28
28
 
29
29
  #define PAGE_SIZE 4096
30
30
  #define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
31
- #define TRIGRAM_ENTRIES_START_SIZE PAGE_SIZE/8
31
+ #define TRIGRAM_ENTRIES_START_SIZE PAGE_SIZE/sizeof(trigram_entry_t)
32
32
 
33
33
  /******************************************************************************/
34
34
 
@@ -50,7 +50,7 @@ struct PACKED_STRUCT trigram_entries_t
50
50
  uint32_t used;
51
51
 
52
52
  trigram_entry_t* entries; /* set when the structure is in memory */
53
- size_t entries_offset; /* set when the structure is on disk */
53
+ off_t entries_offset; /* set when the structure is on disk */
54
54
 
55
55
  uint8_t dirty; /* not optimised (presorted) yet */
56
56
  };
@@ -68,7 +68,7 @@ struct PACKED_STRUCT trigram_map_t
68
68
  uint32_t total_references;
69
69
  uint32_t total_trigrams;
70
70
  size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
71
- int mapped_fd; /* when mapped from disk, the file descriptor */
71
+ blurrily_refs_t* refs;
72
72
 
73
73
  trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
74
74
  };
@@ -88,6 +88,17 @@ static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(co
88
88
 
89
89
  /******************************************************************************/
90
90
 
91
+ #define SMALLOC(_NELEM,_TYPE) (_TYPE*) smalloc(_NELEM, sizeof(_TYPE))
92
+
93
+ static void* smalloc(size_t nelem, size_t length)
94
+ {
95
+ void* result = malloc(nelem * length);
96
+ if (result) memset(result, 0xAA, nelem * length);
97
+ return result;
98
+ }
99
+
100
+ /******************************************************************************/
101
+
91
102
  /* 1 -> little endian, 2 -> big endian */
92
103
  static uint8_t get_big_endian()
93
104
  {
@@ -171,24 +182,23 @@ int blurrily_storage_new(trigram_map* haystack_ptr)
171
182
  int k = 0;
172
183
 
173
184
  LOG("blurrily_storage_new\n");
174
- haystack = (trigram_map) malloc(sizeof(trigram_map_t));
185
+ haystack = SMALLOC(1, trigram_map_t);
175
186
  if (haystack == NULL) return -1;
176
187
 
177
- memset(haystack, 0x00, sizeof(trigram_map_t));
178
-
179
188
  memcpy(haystack->magic, "trigra", 6);
180
189
  haystack->big_endian = get_big_endian();
181
190
  haystack->pointer_size = get_pointer_size();
182
191
 
183
192
  haystack->mapped_size = 0; /* not mapped, as we just created it in memory */
184
- haystack->mapped_fd = 0;
185
193
  haystack->total_references = 0;
186
194
  haystack->total_trigrams = 0;
195
+ haystack->refs = NULL;
187
196
  for(k = 0, ptr = haystack->map ; k < TRIGRAM_COUNT ; ++k, ++ptr) {
188
197
  ptr->buckets = 0;
189
198
  ptr->used = 0;
190
199
  ptr->dirty = 0;
191
200
  ptr->entries = (trigram_entry_t*)NULL;
201
+ ptr->entries_offset = 0;
192
202
  }
193
203
 
194
204
  *haystack_ptr = haystack;
@@ -212,25 +222,46 @@ int blurrily_storage_load(trigram_map* haystack, const char* path)
212
222
  res = fstat(fd, &metadata);
213
223
  if (res < 0) goto cleanup;
214
224
 
225
+ /* check this file is at least long enough to have a header */
226
+ if (metadata.st_size < (off_t) sizeof(trigram_map_t)) {
227
+ errno = EPROTO;
228
+ res = -1;
229
+ goto cleanup;
230
+ }
231
+
215
232
  header = (trigram_map) mmap(NULL, metadata.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
216
- assert(header != NULL);
233
+ if (header == MAP_FAILED) {
234
+ res = -1;
235
+ header = NULL;
236
+ goto cleanup;
237
+ }
238
+
239
+ /* fd not needed once mapping established */
240
+ res = close(fd);
241
+ if (res < 0) goto cleanup;
242
+ fd = -1;
217
243
 
218
244
  /* check magic */
219
- /* TODO */
245
+ res = memcmp(header->magic, "trigra", 6);
246
+ if (res != 0 || header->big_endian != get_big_endian() || header->pointer_size != get_pointer_size()) {
247
+ errno = EPROTO;
248
+ res = -1;
249
+ goto cleanup;
250
+ }
220
251
 
221
252
  /* fix header data */
222
253
  header->mapped_size = metadata.st_size;
223
- header->mapped_fd = fd;
224
254
  origin = (uint8_t*)header;
225
255
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
226
256
  trigram_entries_t* map = header->map + k;
227
257
  if (map->entries_offset == 0) continue;
228
258
  map->entries = (trigram_entry_t*) (origin + map->entries_offset);
229
- map->entries_offset = 0;
230
259
  }
231
260
  *haystack = header;
232
261
 
233
262
  cleanup:
263
+ if (fd > 0) (void) close(fd);
264
+ if (res < 0 && header != NULL) (void) munmap(header, metadata.st_size);
234
265
  return res;
235
266
  }
236
267
 
@@ -239,29 +270,28 @@ cleanup:
239
270
  int blurrily_storage_close(trigram_map* haystack_ptr)
240
271
  {
241
272
  trigram_map haystack = *haystack_ptr;
242
- int res = -1;
273
+ int res = 0;
274
+ trigram_entries_t* ptr = haystack->map;
243
275
 
244
276
  LOG("blurrily_storage_close\n");
245
277
 
246
- if (haystack->mapped_size) {
247
- int fd = haystack->mapped_fd;
278
+ for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
279
+ if (ptr->entries_offset == 0) free(ptr->entries);
280
+ ++ptr;
281
+ }
248
282
 
249
- res = munmap(haystack, haystack->mapped_size);
250
- assert(res >= 0);
283
+ if (haystack->refs) blurrily_refs_free(&haystack->refs);
251
284
 
252
- res = close(fd);
253
- assert(res >= 0);
285
+ if (haystack->mapped_size) {
286
+ res = munmap(haystack, haystack->mapped_size);
287
+ if (res < 0) goto cleanup;
254
288
  } else {
255
- trigram_entries_t* ptr = haystack->map;
256
- for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
257
- free(ptr->entries);
258
- ++ptr;
259
- }
260
289
  free(haystack);
261
290
  }
262
291
 
292
+ cleanup:
263
293
  *haystack_ptr = NULL;
264
- return 0;
294
+ return res;
265
295
  }
266
296
 
267
297
  /******************************************************************************/
@@ -269,7 +299,7 @@ int blurrily_storage_close(trigram_map* haystack_ptr)
269
299
  int blurrily_storage_save(trigram_map haystack, const char* path)
270
300
  {
271
301
  int fd = -1;
272
- int res = -1;
302
+ int res = 0;
273
303
  uint8_t* ptr = (uint8_t*)NULL;
274
304
  size_t total_size = 0;
275
305
  size_t offset = 0;
@@ -282,7 +312,7 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
282
312
  }
283
313
 
284
314
  /* path for temporary file */
285
- snprintf(path_tmp, PATH_MAX, "%s.tmp", path);
315
+ snprintf(path_tmp, PATH_MAX, "%s.tmp.%ld", path, random());
286
316
 
287
317
  /* compute storage space required */
288
318
  total_size += round_to_page(sizeof(trigram_map_t));
@@ -293,16 +323,19 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
293
323
 
294
324
  /* open and map file */
295
325
  fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
296
- assert(fd >= 0);
326
+ if (fd < 0) goto cleanup;
297
327
 
298
328
  res = ftruncate(fd, total_size);
299
- assert(res >= 0);
329
+ if (res < 0) goto cleanup;
300
330
 
301
331
  ptr = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
302
- assert(ptr != NULL);
332
+ if (ptr == MAP_FAILED) { res = -1 ; goto cleanup ; }
333
+
334
+ (void) close(fd);
335
+ fd = -1;
303
336
 
304
337
  /* fill the whole mapping with a known byte pattern before copying data in */
305
- memset(ptr, 0x00, total_size);
338
+ memset(ptr, 0xFF, total_size);
306
339
 
307
340
  /* copy header & clean copy */
308
341
  memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
@@ -310,7 +343,7 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
310
343
  header = (trigram_map)ptr;
311
344
 
312
345
  header->mapped_size = 0;
313
- header->mapped_fd = 0;
346
+ header->refs = NULL;
314
347
 
315
348
  /* copy each map, set offset in header */
316
349
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
@@ -330,17 +363,34 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
330
363
  }
331
364
  assert(offset == total_size);
332
365
 
333
- res = munmap(ptr, total_size);
334
- assert(res >= 0);
335
-
336
- res = close(fd);
337
- assert(res >= 0);
366
+ cleanup:
367
+ if (ptr != NULL && total_size > 0) {
368
+ res = munmap(ptr, total_size);
369
+ }
338
370
 
339
371
  /* commit by renaming the file */
340
- res = rename(path_tmp, path);
341
- assert(res >= 0);
372
+ if (res >= 0 && path) {
373
+ res = rename(path_tmp, path);
374
+ }
342
375
 
343
- return 0;
376
+ return res;
377
+ }
378
+
379
+ /******************************************************************************/
380
+
381
+ void add_all_refs(trigram_map haystack)
382
+ {
383
+ assert(haystack->refs != NULL);
384
+
385
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
386
+ trigram_entries_t* map = haystack->map + k;
387
+ trigram_entry_t* ptr = map->entries;
388
+ assert(map->used <= map->buckets);
389
+ for (uint32_t j = 0; j < map->used; ++j, ++ptr) {
390
+ uint32_t ref = ptr->reference;
391
+ blurrily_refs_add(haystack->refs, ref);
392
+ }
393
+ }
344
394
  }
345
395
 
346
396
  /******************************************************************************/
@@ -348,13 +398,19 @@ int blurrily_storage_save(trigram_map haystack, const char* path)
348
398
  int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
349
399
  {
350
400
  int nb_trigrams = -1;
351
- int length = strlen(needle);
401
+ size_t length = strlen(needle);
352
402
  trigram_t* trigrams = (trigram_t*)NULL;
353
403
 
354
- trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
404
+ if (!haystack->refs) {
405
+ blurrily_refs_new(&haystack->refs);
406
+ add_all_refs(haystack);
407
+ }
408
+ if (blurrily_refs_test(haystack->refs, reference)) return 0;
409
+ if (weight <= 0) weight = (uint32_t) length;
410
+
411
+ trigrams = SMALLOC(length+1, trigram_t);
355
412
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
356
413
 
357
- if (weight <= 0) weight = length;
358
414
 
359
415
  for (int k = 0; k < nb_trigrams; ++k) {
360
416
  trigram_t t = trigrams[k];
@@ -369,33 +425,51 @@ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t refe
369
425
  LOG("- alloc for %d\n", t);
370
426
 
371
427
  map->buckets = TRIGRAM_ENTRIES_START_SIZE;
372
- map->entries = (trigram_entry_t*) calloc(map->buckets, sizeof(trigram_entry_t));
428
+ map->entries = SMALLOC(map->buckets, trigram_entry_t);
373
429
  }
374
- if (map->used == map->buckets) {
375
- uint32_t new_buckets = map->buckets * 4/3;
430
+ else if (map->used == map->buckets) {
431
+ uint32_t new_buckets = map->buckets * 4/3;
376
432
  trigram_entry_t* new_entries = NULL;
377
433
  LOG("- realloc for %d\n", t);
378
434
 
379
435
  /* copy old data, free old pointer, zero extra space */
380
- new_entries = malloc(new_buckets * sizeof(trigram_entry_t));
436
+ new_entries = SMALLOC(new_buckets, trigram_entry_t);
381
437
  assert(new_entries != NULL);
382
438
  memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t));
383
- free(map->entries);
384
- memset(new_entries + map->buckets, 0x00, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
439
+ /* scribble the rest of the map*/
440
+ // memset(new_entries + map->buckets, 0xFF, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
441
+
442
+ #ifndef NDEBUG
443
+ /* scribble old data */
444
+ memset(map->entries, 0xFF, map->buckets * sizeof(trigram_entry_t));
445
+ #endif
446
+
447
+ if (map->entries_offset) {
448
+ /* old data was on disk, just mark it as no longer on disk */
449
+ map->entries_offset = 0;
450
+ } else {
451
+ /* free old data */
452
+ free(map->entries);
453
+ }
454
+
385
455
  /* swap fields */
386
456
  map->buckets = new_buckets;
387
457
  map->entries = new_entries;
388
458
  }
459
+
460
+ /* insert new entry */
461
+ assert(map->used < map->buckets);
389
462
  map->entries[map->used] = entry;
390
-
391
463
  map->used += 1;
392
464
  map->dirty = 1;
393
465
  }
394
466
  haystack->total_trigrams += nb_trigrams;
395
467
  haystack->total_references += 1;
396
468
 
469
+ blurrily_refs_add(haystack->refs, reference);
470
+
397
471
  free((void*)trigrams);
398
- return 0;
472
+ return nb_trigrams;
399
473
  }
400
474
 
401
475
  /******************************************************************************/
@@ -403,7 +477,7 @@ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t refe
403
477
  int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
404
478
  {
405
479
  int nb_trigrams = -1;
406
- int length = strlen(needle);
480
+ size_t length = strlen(needle);
407
481
  trigram_t* trigrams = (trigram_t*)NULL;
408
482
  int nb_entries = -1;
409
483
  trigram_entry_t* entries = NULL;
@@ -414,7 +488,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
414
488
  uint32_t last_ref = (uint32_t)-1;
415
489
  int nb_results = 0;
416
490
 
417
- trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
491
+ trigrams = SMALLOC(length+1, trigram_t);
418
492
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
419
493
  if (nb_trigrams == 0) goto cleanup;
420
494
 
@@ -429,7 +503,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
429
503
  if (nb_entries == 0) goto cleanup;
430
504
 
431
505
  /* allocate sorting memory */
432
- entries = (trigram_entry_t*) malloc(nb_entries * sizeof(trigram_entry_t));
506
+ entries = SMALLOC(nb_entries, trigram_entry_t);
433
507
  assert(entries != NULL);
434
508
  LOG("allocated space for %zd trigrams entries\n", nb_entries);
435
509
 
@@ -464,7 +538,7 @@ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t lim
464
538
  LOG("total %zd distinct matches\n", nb_matches);
465
539
 
466
540
  /* allocate matches result */
467
- matches = (trigram_match_t*) calloc(nb_matches, sizeof(trigram_match_t));
541
+ matches = SMALLOC(nb_matches, trigram_match_t);
468
542
  assert(matches != NULL);
469
543
 
470
544
  /* reduction, counting matches per reference */
@@ -519,15 +593,18 @@ int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
519
593
  entry = map->entries + j;
520
594
  if (entry->reference != reference) continue;
521
595
 
596
+ /* swap with the last entry */
522
597
  *entry = map->entries[map->used - 1];
598
+ memset(map->entries + map->used - 1, 0xFF, sizeof(trigram_entry_t));
599
+
523
600
  map->used -= 1;
524
601
 
525
602
  ++trigrams_deleted;
526
603
  --j;
527
604
  }
528
605
  }
529
- haystack->total_trigrams -= trigrams_deleted;
530
- haystack->total_references -= 1;
606
+ haystack->total_trigrams -= trigrams_deleted;
607
+ if (trigrams_deleted > 0) haystack->total_references -= 1;
531
608
  return trigrams_deleted;
532
609
  }
533
610
 
@@ -539,3 +616,11 @@ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
539
616
  stats->trigrams = haystack->total_trigrams;
540
617
  return 0;
541
618
  }
619
+
620
+ /******************************************************************************/
621
+
622
+ void blurrily_storage_mark(trigram_map haystack)
623
+ {
624
+ if (haystack->refs) blurrily_refs_mark(haystack->refs);
625
+ return;
626
+ }