vinted-blurrily 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ #include <stdlib.h>
2
+ #include <inttypes.h>
3
+ #include "blurrily.h"
4
+ #include "ruby.h"
5
+
6
+ /******************************************************************************/
7
+
8
+ typedef struct blurrily_refs_t {
9
+ VALUE hash;
10
+ } blurrily_refs_t;
11
+
12
+ /******************************************************************************/
13
+
14
+ int blurrily_refs_new(blurrily_refs_t** refs_ptr)
15
+ {
16
+ blurrily_refs_t* refs = NULL;
17
+
18
+ refs = (blurrily_refs_t*) malloc(sizeof(blurrily_refs_t));
19
+ if (!refs) return -1;
20
+
21
+ refs->hash = rb_hash_new();
22
+ *refs_ptr = refs;
23
+ return 0;
24
+ }
25
+
26
+ /******************************************************************************/
27
+
28
+ void blurrily_refs_mark(blurrily_refs_t* refs)
29
+ {
30
+ rb_gc_mark(refs->hash);
31
+ return;
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ void blurrily_refs_free(blurrily_refs_t** refs_ptr)
37
+ {
38
+ blurrily_refs_t* refs = *refs_ptr;
39
+
40
+ refs->hash = Qnil;
41
+ free(refs);
42
+ *refs_ptr = NULL;
43
+ return;
44
+ }
45
+
46
+ /******************************************************************************/
47
+
48
+ void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref)
49
+ {
50
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qtrue);
51
+ return;
52
+ }
53
+
54
+ /******************************************************************************/
55
+
56
+ void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref)
57
+ {
58
+ (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qnil);
59
+ }
60
+
61
+ /******************************************************************************/
62
+
63
+ int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref)
64
+ {
65
+ return rb_hash_aref(refs->hash, UINT2NUM(ref)) == Qtrue ? 1 : 0;
66
+ }
@@ -0,0 +1,30 @@
1
/*

  search_tree.h --

  Set of all known references that is fast to query for existence.

*/
#ifndef BLURRILY_SEARCH_TREE_H
#define BLURRILY_SEARCH_TREE_H

#include <inttypes.h>


/* Opaque handle; the structure is defined by the implementation file. */
typedef struct blurrily_refs_t blurrily_refs_t;


/* Allocate a reference set and store it in *refs_ptr. */
/* Returns 0 on success, -1 if the allocation fails. */
int blurrily_refs_new(blurrily_refs_t** refs_ptr);

/* Destroy a reference set and reset *refs_ptr to NULL. */
void blurrily_refs_free(blurrily_refs_t** refs_ptr);

/* Mark with Ruby's GC */
void blurrily_refs_mark(blurrily_refs_t* refs);

/* Add a reference */
void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref);

/* Remove a reference */
void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref);

/* Test for a reference (1 = present, 0 = absent) */
int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref);

#endif /* BLURRILY_SEARCH_TREE_H */
@@ -0,0 +1,629 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+ #include <fcntl.h>
6
+ #include <sys/mman.h>
7
+ #include <sys/errno.h>
8
+ #include <unistd.h>
9
+ #include <sys/stat.h>
10
+
11
+ #ifdef PLATFORM_LINUX
12
+ #include <linux/limits.h>
13
+ #define MERGESORT fake_mergesort
14
+ #else
15
+ #include <limits.h>
16
+ #define MERGESORT mergesort
17
+ #endif
18
+
19
+ #ifndef PATH_MAX
20
+ /* safe default ... */
21
+ #define PATH_MAX 1024
22
+ #endif
23
+
24
+ #include "storage.h"
25
+ #include "search_tree.h"
26
+
27
+ /******************************************************************************/
28
+
29
/* Size in bytes of one storage page; on-disk blocks are padded to this. */
#define PAGE_SIZE 4096
/* Number of distinct trigrams, i.e. the number of slots in the map. */
#define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
/* Initial bucket count of an entry list: as many entries as fit in a page. */
/* Parenthesized so the macro is safe inside larger expressions. */
#define TRIGRAM_ENTRIES_START_SIZE (PAGE_SIZE / sizeof(trigram_entry_t))
32
+
33
+ /******************************************************************************/
34
+
35
+ /* one trigram entry -- client reference and sorting weight */
36
+ struct BR_PACKED_STRUCT trigram_entry_t
37
+ {
38
+ uint32_t reference;
39
+ uint32_t weight;
40
+ };
41
+ typedef struct trigram_entry_t trigram_entry_t;
42
+
43
+
44
+ /* collection of entries for a given trigram */
45
+ /* <entries> points to an array of <buckets> entries */
46
+ /* of which <used> are filled */
47
+ struct BR_PACKED_STRUCT trigram_entries_t
48
+ {
49
+ uint32_t buckets;
50
+ uint32_t used;
51
+
52
+ trigram_entry_t* entries; /* set when the structure is in memory */
53
+ off_t entries_offset; /* set when the structure is on disk */
54
+
55
+ uint8_t dirty; /* not optimised (presorted) yet */
56
+ };
57
+ typedef struct trigram_entries_t trigram_entries_t;
58
+
59
+
60
+ /* hash map of all possible trigrams to collection of entries */
61
+ /* there are 28^3 = 19,683 possible trigrams */
62
+ struct BR_PACKED_STRUCT trigram_map_t
63
+ {
64
+ char magic[6]; /* the string "trigra" */
65
+ uint8_t big_endian;
66
+ uint8_t pointer_size;
67
+
68
+ uint32_t total_references;
69
+ uint32_t total_trigrams;
70
+ size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
71
+ blurrily_refs_t* refs;
72
+
73
+ trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
74
+ };
75
+ typedef struct trigram_map_t trigram_map_t;
76
+
77
+ /******************************************************************************/
78
+
79
#ifdef PLATFORM_LINUX
/* Stand-in for mergesort(3), which Linux lacks: hands the work to qsort(3) */
/* and always reports success. */
static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(const void *, const void *))
{
  const int status = 0;

  qsort(base, nel, width, compar);
  return status;
}
#endif
88
+
89
+ /******************************************************************************/
90
+
91
/* Typed front end for smalloc: allocates NELEM_ items of TYPE_. */
#define SMALLOC(NELEM_,TYPE_) ((TYPE_*) smalloc((NELEM_), sizeof(TYPE_)))

/*
 * Allocate an array of <nelem> items of <length> bytes each and fill it with
 * the byte 0xAA, so slots that were never written are easy to recognise.
 * Returns NULL when the total byte count does not fit in a size_t or when
 * malloc fails.
 */
static void* smalloc(size_t nelem, size_t length)
{
  size_t total  = 0;
  void*  result = NULL;

  /* refuse requests whose byte count would wrap around */
  if (length != 0 && nelem > ((size_t) -1) / length) return NULL;

  total  = nelem * length;
  result = malloc(total);
  if (result != NULL) memset(result, 0xAA, total);
  return result;
}
99
+
100
+ /******************************************************************************/
101
+
102
/* Byte-order tag of the running machine: 1 -> little endian, 2 -> big endian */
static uint8_t get_big_endian()
{
  const uint32_t probe      = 0xAA0000BB;
  const uint8_t* first_byte = (const uint8_t*) &probe;

  /* the low-order byte comes first in memory on a little-endian machine */
  if (*first_byte == 0xBB) {
    return 1;
  }
  return 2;
}
110
+
111
+ /******************************************************************************/
112
+
113
/* Width of a data pointer on this machine, in bytes (typically 4 or 8) */
static uint8_t get_pointer_size()
{
  const size_t width = sizeof(void*);

  return (uint8_t) width;
}
118
+
119
+ /******************************************************************************/
120
+
121
+ static int compare_entries(const void* left_p, const void* right_p)
122
+ {
123
+ trigram_entry_t* left = (trigram_entry_t*)left_p;
124
+ trigram_entry_t* right = (trigram_entry_t*)right_p;
125
+ return (int)left->reference - (int)right->reference;
126
+ }
127
+
128
+ /* compares matches on #matches (descending) then weight (ascending) */
129
+ static int compare_matches(const void* left_p, const void* right_p)
130
+ {
131
+ trigram_match_t* left = (trigram_match_t*)left_p;
132
+ trigram_match_t* right = (trigram_match_t*)right_p;
133
+ /* int delta = (int)left->matches - (int)right->matches; */
134
+ int delta = (int)right->matches - (int)left->matches;
135
+
136
+ return (delta != 0) ? delta : ((int)left->weight - (int)right->weight);
137
+
138
+ }
139
+
140
+ /******************************************************************************/
141
+
142
+ static void sort_map_if_dirty(trigram_entries_t* map)
143
+ {
144
+ int res = -1;
145
+ if (! map->dirty) return;
146
+
147
+ res = MERGESORT(map->entries, map->used, sizeof(trigram_entry_t), &compare_entries);
148
+ assert(res >= 0);
149
+ map->dirty = 0;
150
+ }
151
+
152
+ /******************************************************************************/
153
+
154
+ static size_t round_to_page(size_t value)
155
+ {
156
+ if (value % PAGE_SIZE == 0) return value;
157
+ return (value / PAGE_SIZE + 1) * PAGE_SIZE;
158
+ }
159
+
160
+ /******************************************************************************/
161
+
162
+ static size_t get_map_size(trigram_map haystack, int index)
163
+ {
164
+ return haystack->map[index].buckets * sizeof(trigram_entry_t);
165
+ }
166
+
167
+ /******************************************************************************/
168
+
169
/* Release <ptr> unless it is NULL. */
static void free_if(void* ptr)
{
  if (ptr != NULL) {
    free(ptr);
  }
}
175
+
176
+ /******************************************************************************/
177
+
178
+ int blurrily_storage_new(trigram_map* haystack_ptr)
179
+ {
180
+ trigram_map haystack = (trigram_map)NULL;
181
+ trigram_entries_t* ptr = NULL;
182
+ int k = 0;
183
+
184
+ LOG("blurrily_storage_new\n");
185
+ haystack = SMALLOC(1, trigram_map_t);
186
+ if (haystack == NULL) return -1;
187
+
188
+ memcpy(haystack->magic, "trigra", 6);
189
+ haystack->big_endian = get_big_endian();
190
+ haystack->pointer_size = get_pointer_size();
191
+
192
+ haystack->mapped_size = 0; /* not mapped, as we just created it in memory */
193
+ haystack->total_references = 0;
194
+ haystack->total_trigrams = 0;
195
+ haystack->refs = NULL;
196
+ for(k = 0, ptr = haystack->map ; k < TRIGRAM_COUNT ; ++k, ++ptr) {
197
+ ptr->buckets = 0;
198
+ ptr->used = 0;
199
+ ptr->dirty = 0;
200
+ ptr->entries = (trigram_entry_t*)NULL;
201
+ ptr->entries_offset = 0;
202
+ }
203
+
204
+ *haystack_ptr = haystack;
205
+ return 0;
206
+ }
207
+
208
+ /******************************************************************************/
209
+
210
+ int blurrily_storage_load(trigram_map* haystack, const char* path)
211
+ {
212
+ int fd = -1;
213
+ int res = -1;
214
+ trigram_map header = NULL;
215
+ uint8_t* origin = NULL;
216
+ struct stat metadata;
217
+
218
+ /* open and map file */
219
+ res = fd = open(path, O_RDONLY);
220
+ if (res < 0) goto cleanup;
221
+
222
+ res = fstat(fd, &metadata);
223
+ if (res < 0) goto cleanup;
224
+
225
+ /* check this file is at least lng enough to have a header */
226
+ if (metadata.st_size < (off_t) sizeof(trigram_map_t)) {
227
+ errno = EPROTO;
228
+ res = -1;
229
+ goto cleanup;
230
+ }
231
+
232
+ header = (trigram_map) mmap(NULL, metadata.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
233
+ if (header == MAP_FAILED) {
234
+ res = -1;
235
+ header = NULL;
236
+ goto cleanup;
237
+ }
238
+
239
+ /* fd not needed once mapping established */
240
+ res = close(fd);
241
+ if (res < 0) goto cleanup;
242
+ fd = -1;
243
+
244
+ /* check magic */
245
+ res = memcmp(header->magic, "trigra", 6);
246
+ if (res != 0 || header->big_endian != get_big_endian() || header->pointer_size != get_pointer_size()) {
247
+ errno = EPROTO;
248
+ res = -1;
249
+ goto cleanup;
250
+ }
251
+
252
+ /* fix header data */
253
+ header->mapped_size = metadata.st_size;
254
+ origin = (uint8_t*)header;
255
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
256
+ trigram_entries_t* map = header->map + k;
257
+ if (map->entries_offset == 0) continue;
258
+ map->entries = (trigram_entry_t*) (origin + map->entries_offset);
259
+ }
260
+ *haystack = header;
261
+
262
+ cleanup:
263
+ if (fd > 0) (void) close(fd);
264
+ if (res < 0 && header != NULL) (void) munmap(header, metadata.st_size);
265
+ return res;
266
+ }
267
+
268
+ /******************************************************************************/
269
+
270
+ int blurrily_storage_close(trigram_map* haystack_ptr)
271
+ {
272
+ trigram_map haystack = *haystack_ptr;
273
+ int res = 0;
274
+ trigram_entries_t* ptr = haystack->map;
275
+
276
+ LOG("blurrily_storage_close\n");
277
+
278
+ for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
279
+ if (ptr->entries_offset == 0) free(ptr->entries);
280
+ ++ptr;
281
+ }
282
+
283
+ if (haystack->refs) blurrily_refs_free(&haystack->refs);
284
+
285
+ if (haystack->mapped_size) {
286
+ res = munmap(haystack, haystack->mapped_size);
287
+ if (res < 0) goto cleanup;
288
+ } else {
289
+ free(haystack);
290
+ }
291
+
292
+ cleanup:
293
+ *haystack_ptr = NULL;
294
+ return res;
295
+ }
296
+
297
+ /******************************************************************************/
298
+
299
+ int blurrily_storage_save(trigram_map haystack, const char* path)
300
+ {
301
+ int fd = -1;
302
+ int res = 0;
303
+ uint8_t* ptr = (uint8_t*)NULL;
304
+ size_t total_size = 0;
305
+ size_t offset = 0;
306
+ trigram_map header = NULL;
307
+ char path_tmp[PATH_MAX];
308
+
309
+ /* cleanup maps in memory */
310
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
311
+ sort_map_if_dirty(haystack->map + k);
312
+ }
313
+
314
+ /* path for temporary file */
315
+ snprintf(path_tmp, PATH_MAX, "%s.tmp.%ld", path, random());
316
+
317
+ /* compute storage space required */
318
+ total_size += round_to_page(sizeof(trigram_map_t));
319
+
320
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
321
+ total_size += round_to_page(get_map_size(haystack, k));
322
+ }
323
+
324
+ /* open and map file */
325
+ fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
326
+ if (fd < 0) goto cleanup;
327
+
328
+ res = ftruncate(fd, total_size);
329
+ if (res < 0) goto cleanup;
330
+
331
+ ptr = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
332
+ if (ptr == MAP_FAILED) { res = -1 ; goto cleanup ; }
333
+
334
+ (void) close(fd);
335
+ fd = -1;
336
+
337
+ /* flush data */
338
+ memset(ptr, 0xFF, total_size);
339
+
340
+ /* copy header & clean copy */
341
+ memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
342
+ offset += round_to_page(sizeof(trigram_map_t));
343
+ header = (trigram_map)ptr;
344
+
345
+ header->mapped_size = 0;
346
+ header->refs = NULL;
347
+
348
+ /* copy each map, set offset in header */
349
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
350
+ size_t block_size = get_map_size(haystack, k);
351
+
352
+ if (block_size > 0) {
353
+ memcpy(ptr+offset, haystack->map[k].entries, block_size);
354
+
355
+ header->map[k].entries = NULL;
356
+ header->map[k].entries_offset = offset;
357
+
358
+ offset += round_to_page(block_size);
359
+ } else {
360
+ header->map[k].entries = NULL;
361
+ header->map[k].entries_offset = 0;
362
+ }
363
+ }
364
+ assert(offset == total_size);
365
+
366
+ cleanup:
367
+ if (ptr != NULL && total_size > 0) {
368
+ res = munmap(ptr, total_size);
369
+ }
370
+
371
+ /* commit by renaming the file */
372
+ if (res >= 0 && path) {
373
+ res = rename(path_tmp, path);
374
+ }
375
+
376
+ return res;
377
+ }
378
+
379
+ /******************************************************************************/
380
+
381
+ void add_all_refs(trigram_map haystack)
382
+ {
383
+ assert(haystack->refs != NULL);
384
+
385
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
386
+ trigram_entries_t* map = haystack->map + k;
387
+ trigram_entry_t* ptr = map->entries;
388
+ assert(map->used <= map->buckets);
389
+ for (uint32_t j = 0; j < map->used; ++j, ++ptr) {
390
+ uint32_t ref = ptr->reference;
391
+ blurrily_refs_add(haystack->refs, ref);
392
+ }
393
+ }
394
+ }
395
+
396
+ /******************************************************************************/
397
+
398
+ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
399
+ {
400
+ int nb_trigrams = -1;
401
+ size_t length = strlen(needle);
402
+ trigram_t* trigrams = (trigram_t*)NULL;
403
+
404
+ if (!haystack->refs) {
405
+ blurrily_refs_new(&haystack->refs);
406
+ add_all_refs(haystack);
407
+ }
408
+ if (blurrily_refs_test(haystack->refs, reference)) return 0;
409
+ if (weight <= 0) weight = (uint32_t) length;
410
+
411
+ trigrams = SMALLOC(length+1, trigram_t);
412
+ nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
413
+
414
+
415
+ for (int k = 0; k < nb_trigrams; ++k) {
416
+ trigram_t t = trigrams[k];
417
+ trigram_entries_t* map = &haystack->map[t];
418
+ trigram_entry_t entry = { reference, weight };
419
+
420
+ assert(t < TRIGRAM_COUNT);
421
+ assert(map-> used <= map-> buckets);
422
+
423
+ /* allocate more space as needed (exponential growth) */
424
+ if (map->buckets == 0) {
425
+ LOG("- alloc for %d\n", t);
426
+
427
+ map->buckets = TRIGRAM_ENTRIES_START_SIZE;
428
+ map->entries = SMALLOC(map->buckets, trigram_entry_t);
429
+ }
430
+ else if (map->used == map->buckets) {
431
+ uint32_t new_buckets = map->buckets * 4/3;
432
+ trigram_entry_t* new_entries = NULL;
433
+ LOG("- realloc for %d\n", t);
434
+
435
+ /* copy old data, free old pointer, zero extra space */
436
+ new_entries = SMALLOC(new_buckets, trigram_entry_t);
437
+ assert(new_entries != NULL);
438
+ memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t));
439
+ /* scribble the rest of the map*/
440
+ // memset(new_entries + map->buckets, 0xFF, (new_buckets - map->buckets) * sizeof(trigram_entry_t));
441
+
442
+ #ifndef NDEBUG
443
+ /* scribble old data */
444
+ memset(map->entries, 0xFF, map->buckets * sizeof(trigram_entry_t));
445
+ #endif
446
+
447
+ if (map->entries_offset) {
448
+ /* old data was on disk, just mark it as no longer on disk */
449
+ map->entries_offset = 0;
450
+ } else {
451
+ /* free old data */
452
+ free(map->entries);
453
+ }
454
+
455
+ /* swap fields */
456
+ map->buckets = new_buckets;
457
+ map->entries = new_entries;
458
+ }
459
+
460
+ /* insert new entry */
461
+ assert(map->used < map->buckets);
462
+ map->entries[map->used] = entry;
463
+ map->used += 1;
464
+ map->dirty = 1;
465
+ }
466
+ haystack->total_trigrams += nb_trigrams;
467
+ haystack->total_references += 1;
468
+
469
+ blurrily_refs_add(haystack->refs, reference);
470
+
471
+ free((void*)trigrams);
472
+ return nb_trigrams;
473
+ }
474
+
475
+ /******************************************************************************/
476
+
477
+ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
478
+ {
479
+ int nb_trigrams = -1;
480
+ size_t length = strlen(needle);
481
+ trigram_t* trigrams = (trigram_t*)NULL;
482
+ int nb_entries = -1;
483
+ trigram_entry_t* entries = NULL;
484
+ trigram_entry_t* entry_ptr = NULL;
485
+ int nb_matches = -1;
486
+ trigram_match_t* matches = NULL;
487
+ trigram_match_t* match_ptr = NULL;
488
+ uint32_t last_ref = (uint32_t)-1;
489
+ int nb_results = 0;
490
+
491
+ trigrams = SMALLOC(length+1, trigram_t);
492
+ nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
493
+ if (nb_trigrams == 0) goto cleanup;
494
+
495
+ LOG("%d trigrams in '%s'\n", nb_trigrams, needle);
496
+
497
+ /* measure size required for sorting */
498
+ nb_entries = 0;
499
+ for (int k = 0; k < nb_trigrams; ++k) {
500
+ trigram_t t = trigrams[k];
501
+ nb_entries += haystack->map[t].used;
502
+ }
503
+ if (nb_entries == 0) goto cleanup;
504
+
505
+ /* allocate sorting memory */
506
+ entries = SMALLOC(nb_entries, trigram_entry_t);
507
+ assert(entries != NULL);
508
+ LOG("allocated space for %zd trigrams entries\n", nb_entries);
509
+
510
+ /* copy data for sorting */
511
+ entry_ptr = entries;
512
+ for (int k = 0; k < nb_trigrams; ++k) {
513
+ trigram_t t = trigrams[k];
514
+ size_t buckets = haystack->map[t].used;
515
+
516
+ sort_map_if_dirty(haystack->map + t);
517
+ memcpy(entry_ptr, haystack->map[t].entries, buckets * sizeof(trigram_entry_t));
518
+ entry_ptr += buckets;
519
+ }
520
+ assert(entry_ptr == entries + nb_entries);
521
+
522
+ /* sort data */
523
+ MERGESORT(entries, nb_entries, sizeof(trigram_entry_t), &compare_entries);
524
+ LOG("sorting entries\n");
525
+
526
+ /* count distinct matches */
527
+ entry_ptr = entries;
528
+ last_ref = -1;
529
+ nb_matches = 0;
530
+ for (int k = 0; k < nb_entries; ++k) {
531
+ if (entry_ptr->reference != last_ref) {
532
+ last_ref = entry_ptr->reference;
533
+ ++nb_matches;
534
+ }
535
+ ++entry_ptr;
536
+ }
537
+ assert(entry_ptr == entries + nb_entries);
538
+ LOG("total %zd distinct matches\n", nb_matches);
539
+
540
+ /* allocate maches result */
541
+ matches = SMALLOC(nb_matches, trigram_match_t);
542
+ assert(matches != NULL);
543
+
544
+ /* reduction, counting matches per reference */
545
+ entry_ptr = entries;
546
+ match_ptr = matches;
547
+ match_ptr->matches = 0;
548
+ match_ptr->reference = entry_ptr->reference; /* setup the first match to */
549
+ match_ptr->weight = entry_ptr->weight; /* simplify the loop */
550
+ for (int k = 0; k < nb_entries; ++k) {
551
+ if (entry_ptr->reference != match_ptr->reference) {
552
+ ++match_ptr;
553
+ match_ptr->reference = entry_ptr->reference;
554
+ match_ptr->weight = entry_ptr->weight;
555
+ match_ptr->matches = 1;
556
+ } else {
557
+ match_ptr->matches += 1;
558
+ }
559
+ assert((int) match_ptr->matches <= nb_trigrams);
560
+ ++entry_ptr;
561
+ }
562
+ assert(match_ptr == matches + nb_matches - 1);
563
+ assert(entry_ptr == entries + nb_entries);
564
+
565
+ /* sort by weight (qsort) */
566
+ qsort(matches, nb_matches, sizeof(trigram_match_t), &compare_matches);
567
+
568
+ /* output results */
569
+ nb_results = (limit < nb_matches) ? limit : nb_matches;
570
+ for (int k = 0; k < nb_results; ++k) {
571
+ results[k] = matches[k];
572
+ LOG("match %d: reference %d, matchiness %d, weight %d\n", k, matches[k].reference, matches[k].matches, matches[k].weight);
573
+ }
574
+
575
+ cleanup:
576
+ free_if(entries);
577
+ free_if(matches);
578
+ free_if(trigrams);
579
+ return nb_results;
580
+ }
581
+
582
+ /******************************************************************************/
583
+
584
+ int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
585
+ {
586
+ int trigrams_deleted = 0;
587
+
588
+ for (int k = 0; k < TRIGRAM_COUNT; ++k) {
589
+ trigram_entries_t* map = haystack->map + k;
590
+ trigram_entry_t* entry = NULL;
591
+
592
+ for (unsigned int j = 0; j < map->used; ++j) {
593
+ entry = map->entries + j;
594
+ if (entry->reference != reference) continue;
595
+
596
+ /* swap with the last entry */
597
+ *entry = map->entries[map->used - 1];
598
+ memset(map->entries + map->used - 1, 0xFF, sizeof(trigram_entry_t));
599
+
600
+ map->used -= 1;
601
+
602
+ ++trigrams_deleted;
603
+ --j;
604
+ }
605
+ }
606
+ haystack->total_trigrams -= trigrams_deleted;
607
+ if (trigrams_deleted > 0) haystack->total_references -= 1;
608
+
609
+ if (haystack->refs) blurrily_refs_remove(haystack->refs, reference);
610
+
611
+ return trigrams_deleted;
612
+ }
613
+
614
+ /******************************************************************************/
615
+
616
+ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
617
+ {
618
+ stats->references = haystack->total_references;
619
+ stats->trigrams = haystack->total_trigrams;
620
+ return 0;
621
+ }
622
+
623
+ /******************************************************************************/
624
+
625
+ void blurrily_storage_mark(trigram_map haystack)
626
+ {
627
+ if (haystack->refs) blurrily_refs_mark(haystack->refs);
628
+ return;
629
+ }