vinted-blurrily 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ #include <stdlib.h>
2
+ #include <inttypes.h>
3
+ #include "blurrily.h"
4
+ #include "ruby.h"
5
+
6
+ /******************************************************************************/
7
+
8
+ typedef struct blurrily_refs_t {
9
+ VALUE hash;
10
+ } blurrily_refs_t;
11
+
12
+ /******************************************************************************/
13
+
14
+ int blurrily_refs_new(blurrily_refs_t** refs_ptr)
15
+ {
16
+ blurrily_refs_t* refs = NULL;
17
+
18
+ refs = (blurrily_refs_t*) malloc(sizeof(blurrily_refs_t));
19
+ if (!refs) return -1;
20
+
21
+ refs->hash = rb_hash_new();
22
+ *refs_ptr = refs;
23
+ return 0;
24
+ }
25
+
26
+ /******************************************************************************/
27
+
28
+ void blurrily_refs_mark(blurrily_refs_t* refs)
29
+ {
30
+ rb_gc_mark(refs->hash);
31
+ return;
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ void blurrily_refs_free(blurrily_refs_t** refs_ptr)
37
+ {
38
+ blurrily_refs_t* refs = *refs_ptr;
39
+
40
+ refs->hash = Qnil;
41
+ free(refs);
42
+ *refs_ptr = NULL;
43
+ return;
44
+ }
45
+
46
+ /******************************************************************************/
47
+
48
+ void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref)
49
+ {
50
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qtrue);
51
+ return;
52
+ }
53
+
54
+ /******************************************************************************/
55
+
56
+ void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref)
57
+ {
58
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qnil);
59
+ }
60
+
61
+ /******************************************************************************/
62
+
63
+ int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref)
64
+ {
65
+ return rb_hash_aref(refs->hash, UINT2NUM(ref)) == Qtrue ? 1 : 0;
66
+ }
@@ -0,0 +1,30 @@
1
/*

  search_tree.h --

  Set of all references that's fast to query for existence.

*/
#ifndef BLURRILY_SEARCH_TREE_H
#define BLURRILY_SEARCH_TREE_H

#include <inttypes.h>


/* Opaque handle: the structure layout is private to the implementation. */
typedef struct blurrily_refs_t blurrily_refs_t;


/* Allocate an empty reference set and store it in *refs_ptr. */
/* The int result is a status code for the caller to check. */
int blurrily_refs_new(blurrily_refs_t** refs_ptr);

/* Destroy a reference set previously stored in *refs_ptr */
void blurrily_refs_free(blurrily_refs_t** refs_ptr);

/* Mark with Ruby's GC */
void blurrily_refs_mark(blurrily_refs_t* refs);

/* Add a reference */
void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref);

/* Remove a reference */
void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref);

/* Test for a reference (1 = present, 0 = absent) */
int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref);

#endif /* BLURRILY_SEARCH_TREE_H */
@@ -0,0 +1,629 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+ #include <fcntl.h>
6
+ #include <sys/mman.h>
7
+ #include <sys/errno.h>
8
+ #include <unistd.h>
9
+ #include <sys/stat.h>
10
+
11
+ #ifdef PLATFORM_LINUX
12
+ #include <linux/limits.h>
13
+ #define MERGESORT fake_mergesort
14
+ #else
15
+ #include <limits.h>
16
+ #define MERGESORT mergesort
17
+ #endif
18
+
19
+ #ifndef PATH_MAX
20
+ /* safe default ... */
21
+ #define PATH_MAX 1024
22
+ #endif
23
+
24
+ #include "storage.h"
25
+ #include "search_tree.h"
26
+
27
+ /******************************************************************************/
28
+
29
/* size, in bytes, used as the allocation and file layout granule */
#define PAGE_SIZE 4096
/* number of distinct trigram slots in the map */
#define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
/* initial bucket count for a trigram: as many entries as fit in one page. */
/* Parenthesised so the macro is safe inside larger expressions. */
#define TRIGRAM_ENTRIES_START_SIZE (PAGE_SIZE / sizeof(trigram_entry_t))
32
+
33
+ /******************************************************************************/
34
+
35
+ /* one trigram entry -- client reference and sorting weight */
36
+ struct BR_PACKED_STRUCT trigram_entry_t
37
+ {
38
+ uint32_t reference;
39
+ uint32_t weight;
40
+ };
41
+ typedef struct trigram_entry_t trigram_entry_t;
42
+
43
+
44
+ /* collection of entries for a given trigram */
45
+ /* <entries> points to an array of <buckets> entries */
46
+ /* of which <used> are filled */
47
+ struct BR_PACKED_STRUCT trigram_entries_t
48
+ {
49
+ uint32_t buckets;
50
+ uint32_t used;
51
+
52
+ trigram_entry_t* entries; /* set when the structure is in memory */
53
+ off_t entries_offset; /* set when the structure is on disk */
54
+
55
+ uint8_t dirty; /* not optimised (presorted) yet */
56
+ };
57
+ typedef struct trigram_entries_t trigram_entries_t;
58
+
59
+
60
+ /* hash map of all possible trigrams to collection of entries */
61
+ /* there are 28^3 = 19,683 possible trigrams */
62
+ struct BR_PACKED_STRUCT trigram_map_t
63
+ {
64
+ char magic[6]; /* the string "trigra" */
65
+ uint8_t big_endian;
66
+ uint8_t pointer_size;
67
+
68
+ uint32_t total_references;
69
+ uint32_t total_trigrams;
70
+ size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
71
+ blurrily_refs_t* refs;
72
+
73
+ trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
74
+ };
75
+ typedef struct trigram_map_t trigram_map_t;
76
+
77
+ /******************************************************************************/
78
+
79
+ #ifdef PLATFORM_LINUX
80
/* Stand-in for mergesort(3), which Linux lacks: it forwards to qsort(3) */
/* and always reports success (0), since qsort(3) has no error result. */
static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(const void *, const void *))
{
  const int status_ok = 0;

  qsort(base, nel, width, compar);

  return status_ok;
}
87
+ #endif
88
+
89
+ /******************************************************************************/
90
+
91
/* Typed wrapper around smalloc(): allocates room for _NELEM objects of _TYPE. */
#define SMALLOC(_NELEM,_TYPE) ((_TYPE*) smalloc((_NELEM), sizeof(_TYPE)))

/*
 * Allocate <nelem> objects of <length> bytes each and fill the block
 * with the 0xAA pattern, so that reads of uninitialised data stand out.
 *
 * Returns NULL when the allocation fails, or when nelem * length does
 * not fit in a size_t (previously the product silently wrapped and a
 * too-small block was returned).
 */
static void* smalloc(size_t nelem, size_t length)
{
  size_t total  = 0;
  void*  result = NULL;

  if (length != 0 && nelem > ((size_t) -1) / length) {
    return NULL;
  }

  total  = nelem * length;
  result = malloc(total);
  if (result != NULL) {
    memset(result, 0xAA, total);
  }
  return result;
}
99
+
100
+ /******************************************************************************/
101
+
102
/* Byte-order marker of the running machine: */
/* 1 -> little endian, 2 -> big endian */
static uint8_t get_big_endian()
{
  const uint32_t probe      = 0xAA0000BB;
  uint8_t        first_byte = 0;

  /* look at the byte stored at the lowest address */
  memcpy(&first_byte, &probe, sizeof(first_byte));

  if (first_byte == 0xBB) {
    return 1;
  }
  return 2;
}
110
+
111
+ /******************************************************************************/
112
+
113
/* Width of a data pointer on the running machine, in bytes (4 or 8). */
static uint8_t get_pointer_size()
{
  const size_t width = sizeof(void*);

  return (uint8_t) width;
}
118
+
119
+ /******************************************************************************/
120
+
121
+ static int compare_entries(const void* left_p, const void* right_p)
122
+ {
123
+ trigram_entry_t* left = (trigram_entry_t*)left_p;
124
+ trigram_entry_t* right = (trigram_entry_t*)right_p;
125
+ return (int)left->reference - (int)right->reference;
126
+ }
127
+
128
+ /* compares matches on #matches (descending) then weight (ascending) */
129
+ static int compare_matches(const void* left_p, const void* right_p)
130
+ {
131
+ trigram_match_t* left = (trigram_match_t*)left_p;
132
+ trigram_match_t* right = (trigram_match_t*)right_p;
133
+ /* int delta = (int)left->matches - (int)right->matches; */
134
+ int delta = (int)right->matches - (int)left->matches;
135
+
136
+ return (delta != 0) ? delta : ((int)left->weight - (int)right->weight);
137
+
138
+ }
139
+
140
+ /******************************************************************************/
141
+
142
+ static void sort_map_if_dirty(trigram_entries_t* map)
143
+ {
144
+ int res = -1;
145
+ if (! map->dirty) return;
146
+
147
+ res = MERGESORT(map->entries, map->used, sizeof(trigram_entry_t), &compare_entries);
148
+ assert(res >= 0);
149
+ map->dirty = 0;
150
+ }
151
+
152
+ /******************************************************************************/
153
+
154
+ static size_t round_to_page(size_t value)
155
+ {
156
+ if (value % PAGE_SIZE == 0) return value;
157
+ return (value / PAGE_SIZE + 1) * PAGE_SIZE;
158
+ }
159
+
160
+ /******************************************************************************/
161
+
162
+ static size_t get_map_size(trigram_map haystack, int index)
163
+ {
164
+ return haystack->map[index].buckets * sizeof(trigram_entry_t);
165
+ }
166
+
167
+ /******************************************************************************/
168
+
169
/* Release <ptr> unless it is NULL. */
static void free_if(void* ptr)
{
  if (ptr != NULL) {
    free(ptr);
  }
}
175
+
176
+ /******************************************************************************/
177
+
178
+ int blurrily_storage_new(trigram_map* haystack_ptr)
179
+ {
180
+ trigram_map haystack = (trigram_map)NULL;
181
+ trigram_entries_t* ptr = NULL;
182
+ int k = 0;
183
+
184
+ LOG("blurrily_storage_new\n");
185
+ haystack = SMALLOC(1, trigram_map_t);
186
+ if (haystack == NULL) return -1;
187
+
188
+ memcpy(haystack->magic, "trigra", 6);
189
+ haystack->big_endian = get_big_endian();
190
+ haystack->pointer_size = get_pointer_size();
191
+
192
+ haystack->mapped_size = 0; /* not mapped, as we just created it in memory */
193
+ haystack->total_references = 0;
194
+ haystack->total_trigrams = 0;
195
+ haystack->refs = NULL;
196
+ for(k = 0, ptr = haystack->map ; k < TRIGRAM_COUNT ; ++k, ++ptr) {
197
+ ptr->buckets = 0;
198
+ ptr->used = 0;
199
+ ptr->dirty = 0;
200
+ ptr->entries = (trigram_entry_t*)NULL;
201
+ ptr->entries_offset = 0;
202
+ }
203
+
204
+ *haystack_ptr = haystack;
205
+ return 0;
206
+ }
207
+
208
+ /******************************************************************************/
209
+
210
+ int blurrily_storage_load(trigram_map* haystack, const char* path)
211
+ {
212
+ int fd = -1;
213
+ int res = -1;
214
+ trigram_map header = NULL;
215
+ uint8_t* origin = NULL;
216
+ struct stat metadata;
217
+
218
+ /* open and map file */
219
+ res = fd = open(path, O_RDONLY);
220
+ if (res < 0) goto cleanup;
221
+
222
+ res = fstat(fd, &metadata);
223
+ if (res < 0) goto cleanup;
224
+
225
+ /* check this file is at least lng enough to have a header */
226
+ if (metadata.st_size < (off_t) sizeof(trigram_map_t)) {
227
+ errno = EPROTO;
228
+ res = -1;
229
+ goto cleanup;
230
+ }
231
+
232
+ header = (trigram_map) mmap(NULL, metadata.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
233
+ if (header == MAP_FAILED) {
234
+ res = -1;
235
+ header = NULL;
236
+ goto cleanup;
237
+ }
238
+
239
+ /* fd not needed once mapping established */
240
+ res = close(fd);
241
+ if (res < 0) goto cleanup;
242
+ fd = -1;
243
+
244
+ /* check magic */
245
+ res = memcmp(header->magic, "trigra", 6);
246
+ if (res != 0 || header->big_endian != get_big_endian() || header->pointer_size != get_pointer_size()) {
247
+ errno = EPROTO;
248
+ res = -1;
249
+ goto cleanup;
250
+ }
251
+
252
+ /* fix header data */
253
+ header->mapped_size = metadata.st_size;
254
+ origin = (uint8_t*)header;
255
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
256
+ trigram_entries_t* map = header->map + k;
257
+ if (map->entries_offset == 0) continue;
258
+ map->entries = (trigram_entry_t*) (origin + map->entries_offset);
259
+ }
260
+ *haystack = header;
261
+
262
+ cleanup:
263
+ if (fd > 0) (void) close(fd);
264
+ if (res < 0 && header != NULL) (void) munmap(header, metadata.st_size);
265
+ return res;
266
+ }
267
+
268
+ /******************************************************************************/
269
+
270
+ int blurrily_storage_close(trigram_map* haystack_ptr)
271
+ {
272
+ trigram_map haystack = *haystack_ptr;
273
+ int res = 0;
274
+ trigram_entries_t* ptr = haystack->map;
275
+
276
+ LOG("blurrily_storage_close\n");
277
+
278
+ for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
279
+ if (ptr->entries_offset == 0) free(ptr->entries);
280
+ ++ptr;
281
+ }
282
+
283
+ if (haystack->refs) blurrily_refs_free(&haystack->refs);
284
+
285
+ if (haystack->mapped_size) {
286
+ res = munmap(haystack, haystack->mapped_size);
287
+ if (res < 0) goto cleanup;
288
+ } else {
289
+ free(haystack);
290
+ }
291
+
292
+ cleanup:
293
+ *haystack_ptr = NULL;
294
+ return res;
295
+ }
296
+
297
+ /******************************************************************************/
298
+
299
+ int blurrily_storage_save(trigram_map haystack, const char* path)
300
+ {
301
+ int fd = -1;
302
+ int res = 0;
303
+ uint8_t* ptr = (uint8_t*)NULL;
304
+ size_t total_size = 0;
305
+ size_t offset = 0;
306
+ trigram_map header = NULL;
307
+ char path_tmp[PATH_MAX];
308
+
309
+ /* cleanup maps in memory */
310
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
311
+ sort_map_if_dirty(haystack->map + k);
312
+ }
313
+
314
+ /* path for temporary file */
315
+ snprintf(path_tmp, PATH_MAX, "%s.tmp.%ld", path, random());
316
+
317
+ /* compute storage space required */
318
+ total_size += round_to_page(sizeof(trigram_map_t));
319
+
320
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
321
+ total_size += round_to_page(get_map_size(haystack, k));
322
+ }
323
+
324
+ /* open and map file */
325
+ fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
326
+ if (fd < 0) goto cleanup;
327
+
328
+ res = ftruncate(fd, total_size);
329
+ if (res < 0) goto cleanup;
330
+
331
+ ptr = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
332
+ if (ptr == MAP_FAILED) { res = -1 ; goto cleanup ; }
333
+
334
+ (void) close(fd);
335
+ fd = -1;
336
+
337
+ /* flush data */
338
+ memset(ptr, 0xFF, total_size);
339
+
340
+ /* copy header & clean copy */
341
+ memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
342
+ offset += round_to_page(sizeof(trigram_map_t));
343
+ header = (trigram_map)ptr;
344
+
345
+ header->mapped_size = 0;
346
+ header->refs = NULL;
347
+
348
+ /* copy each map, set offset in header */
349
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
350
+ size_t block_size = get_map_size(haystack, k);
351
+
352
+ if (block_size > 0) {
353
+ memcpy(ptr+offset, haystack->map[k].entries, block_size);
354
+
355
+ header->map[k].entries = NULL;
356
+ header->map[k].entries_offset = offset;
357
+
358
+ offset += round_to_page(block_size);
359
+ } else {
360
+ header->map[k].entries = NULL;
361
+ header->map[k].entries_offset = 0;
362
+ }
363
+ }
364
+ assert(offset == total_size);
365
+
366
+ cleanup:
367
+ if (ptr != NULL && total_size > 0) {
368
+ res = munmap(ptr, total_size);
369
+ }
370
+
371
+ /* commit by renaming the file */
372
+ if (res >= 0 && path) {
373
+ res = rename(path_tmp, path);
374
+ }
375
+
376
+ return res;
377
+ }
378
+
379
+ /******************************************************************************/
380
+
381
+ void add_all_refs(trigram_map haystack)
382
+ {
383
+ assert(haystack->refs != NULL);
384
+
385
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
386
+ trigram_entries_t* map = haystack->map + k;
387
+ trigram_entry_t* ptr = map->entries;
388
+ assert(map->used <= map->buckets);
389
+ for (uint32_t j = 0; j < map->used; ++j, ++ptr) {
390
+ uint32_t ref = ptr->reference;
391
+ blurrily_refs_add(haystack->refs, ref);
392
+ }
393
+ }
394
+ }
395
+
396
+ /******************************************************************************/
397
+
398
+ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
399
+ {
400
+ int nb_trigrams = -1;
401
+ size_t length = strlen(needle);
402
+ trigram_t* trigrams = (trigram_t*)NULL;
403
+
404
+ if (!haystack->refs) {
405
+ blurrily_refs_new(&haystack->refs);
406
+ add_all_refs(haystack);
407
+ }
408
+ if (blurrily_refs_test(haystack->refs, reference)) return 0;
409
+ if (weight <= 0) weight = (uint32_t) length;
410
+
411
+ trigrams = SMALLOC(length+1, trigram_t);
412
+ nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
413
+
414
+
415
+ for (int k = 0; k < nb_trigrams; ++k) {
416
+ trigram_t t = trigrams[k];
417
+ trigram_entries_t* map = &haystack->map[t];
418
+ trigram_entry_t entry = { reference, weight };
419
+
420
+ assert(t < TRIGRAM_COUNT);
421
+ assert(map-> used <= map-> buckets);
422
+
423
+ /* allocate more space as needed (exponential growth) */
424
+ if (map->buckets == 0) {
425
+ LOG("- alloc for %d\n", t);
426
+
427
+ map->buckets = TRIGRAM_ENTRIES_START_SIZE;
428
+ map->entries = SMALLOC(map->buckets, trigram_entry_t);
429
+ }
430
+ else if (map->used == map->buckets) {
431
+ uint32_t new_buckets = map->buckets * 4/3;
432
+ trigram_entry_t* new_entries = NULL;
433
+ LOG("- realloc for %d\n", t);
434
+
435
+ /* copy old data, free old pointer, zero extra space */
436
+ new_entries = SMALLOC(new_buckets, trigram_entry_t);
437
+ assert(new_entries != NULL);
438
+ memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t));
439
+ /* scribble the rest of the map*/
440
+ // memset(new_entries + map->buckets, 0xFF, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
441
+
442
+ #ifndef NDEBUG
443
+ /* scribble old data */
444
+ memset(map->entries, 0xFF, map->buckets * sizeof(trigram_entry_t));
445
+ #endif
446
+
447
+ if (map->entries_offset) {
448
+ /* old data was on disk, just mark it as no longer on disk */
449
+ map->entries_offset = 0;
450
+ } else {
451
+ /* free old data */
452
+ free(map->entries);
453
+ }
454
+
455
+ /* swap fields */
456
+ map->buckets = new_buckets;
457
+ map->entries = new_entries;
458
+ }
459
+
460
+ /* insert new entry */
461
+ assert(map->used < map->buckets);
462
+ map->entries[map->used] = entry;
463
+ map->used += 1;
464
+ map->dirty = 1;
465
+ }
466
+ haystack->total_trigrams += nb_trigrams;
467
+ haystack->total_references += 1;
468
+
469
+ blurrily_refs_add(haystack->refs, reference);
470
+
471
+ free((void*)trigrams);
472
+ return nb_trigrams;
473
+ }
474
+
475
+ /******************************************************************************/
476
+
477
+ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
478
+ {
479
+ int nb_trigrams = -1;
480
+ size_t length = strlen(needle);
481
+ trigram_t* trigrams = (trigram_t*)NULL;
482
+ int nb_entries = -1;
483
+ trigram_entry_t* entries = NULL;
484
+ trigram_entry_t* entry_ptr = NULL;
485
+ int nb_matches = -1;
486
+ trigram_match_t* matches = NULL;
487
+ trigram_match_t* match_ptr = NULL;
488
+ uint32_t last_ref = (uint32_t)-1;
489
+ int nb_results = 0;
490
+
491
+ trigrams = SMALLOC(length+1, trigram_t);
492
+ nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
493
+ if (nb_trigrams == 0) goto cleanup;
494
+
495
+ LOG("%d trigrams in '%s'\n", nb_trigrams, needle);
496
+
497
+ /* measure size required for sorting */
498
+ nb_entries = 0;
499
+ for (int k = 0; k < nb_trigrams; ++k) {
500
+ trigram_t t = trigrams[k];
501
+ nb_entries += haystack->map[t].used;
502
+ }
503
+ if (nb_entries == 0) goto cleanup;
504
+
505
+ /* allocate sorting memory */
506
+ entries = SMALLOC(nb_entries, trigram_entry_t);
507
+ assert(entries != NULL);
508
+ LOG("allocated space for %zd trigrams entries\n", nb_entries);
509
+
510
+ /* copy data for sorting */
511
+ entry_ptr = entries;
512
+ for (int k = 0; k < nb_trigrams; ++k) {
513
+ trigram_t t = trigrams[k];
514
+ size_t buckets = haystack->map[t].used;
515
+
516
+ sort_map_if_dirty(haystack->map + t);
517
+ memcpy(entry_ptr, haystack->map[t].entries, buckets * sizeof(trigram_entry_t));
518
+ entry_ptr += buckets;
519
+ }
520
+ assert(entry_ptr == entries + nb_entries);
521
+
522
+ /* sort data */
523
+ MERGESORT(entries, nb_entries, sizeof(trigram_entry_t), &compare_entries);
524
+ LOG("sorting entries\n");
525
+
526
+ /* count distinct matches */
527
+ entry_ptr = entries;
528
+ last_ref = -1;
529
+ nb_matches = 0;
530
+ for (int k = 0; k < nb_entries; ++k) {
531
+ if (entry_ptr->reference != last_ref) {
532
+ last_ref = entry_ptr->reference;
533
+ ++nb_matches;
534
+ }
535
+ ++entry_ptr;
536
+ }
537
+ assert(entry_ptr == entries + nb_entries);
538
+ LOG("total %zd distinct matches\n", nb_matches);
539
+
540
+ /* allocate maches result */
541
+ matches = SMALLOC(nb_matches, trigram_match_t);
542
+ assert(matches != NULL);
543
+
544
+ /* reduction, counting matches per reference */
545
+ entry_ptr = entries;
546
+ match_ptr = matches;
547
+ match_ptr->matches = 0;
548
+ match_ptr->reference = entry_ptr->reference; /* setup the first match to */
549
+ match_ptr->weight = entry_ptr->weight; /* simplify the loop */
550
+ for (int k = 0; k < nb_entries; ++k) {
551
+ if (entry_ptr->reference != match_ptr->reference) {
552
+ ++match_ptr;
553
+ match_ptr->reference = entry_ptr->reference;
554
+ match_ptr->weight = entry_ptr->weight;
555
+ match_ptr->matches = 1;
556
+ } else {
557
+ match_ptr->matches += 1;
558
+ }
559
+ assert((int) match_ptr->matches <= nb_trigrams);
560
+ ++entry_ptr;
561
+ }
562
+ assert(match_ptr == matches + nb_matches - 1);
563
+ assert(entry_ptr == entries + nb_entries);
564
+
565
+ /* sort by weight (qsort) */
566
+ qsort(matches, nb_matches, sizeof(trigram_match_t), &compare_matches);
567
+
568
+ /* output results */
569
+ nb_results = (limit < nb_matches) ? limit : nb_matches;
570
+ for (int k = 0; k < nb_results; ++k) {
571
+ results[k] = matches[k];
572
+ LOG("match %d: reference %d, matchiness %d, weight %d\n", k, matches[k].reference, matches[k].matches, matches[k].weight);
573
+ }
574
+
575
+ cleanup:
576
+ free_if(entries);
577
+ free_if(matches);
578
+ free_if(trigrams);
579
+ return nb_results;
580
+ }
581
+
582
+ /******************************************************************************/
583
+
584
+ int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
585
+ {
586
+ int trigrams_deleted = 0;
587
+
588
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
589
+ trigram_entries_t* map = haystack->map + k;
590
+ trigram_entry_t* entry = NULL;
591
+
592
+ for (unsigned int j = 0; j < map->used; ++j) {
593
+ entry = map->entries + j;
594
+ if (entry->reference != reference) continue;
595
+
596
+ /* swap with the last entry */
597
+ *entry = map->entries[map->used - 1];
598
+ memset(map->entries + map->used - 1, 0xFF, sizeof(trigram_entry_t));
599
+
600
+ map->used -= 1;
601
+
602
+ ++trigrams_deleted;
603
+ --j;
604
+ }
605
+ }
606
+ haystack->total_trigrams -= trigrams_deleted;
607
+ if (trigrams_deleted > 0) haystack->total_references -= 1;
608
+
609
+ if (haystack->refs) blurrily_refs_remove(haystack->refs, reference);
610
+
611
+ return trigrams_deleted;
612
+ }
613
+
614
+ /******************************************************************************/
615
+
616
+ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
617
+ {
618
+ stats->references = haystack->total_references;
619
+ stats->trigrams = haystack->total_trigrams;
620
+ return 0;
621
+ }
622
+
623
+ /******************************************************************************/
624
+
625
+ void blurrily_storage_mark(trigram_map haystack)
626
+ {
627
+ if (haystack->refs) blurrily_refs_mark(haystack->refs);
628
+ return;
629
+ }