fast_bloom_filter 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,271 +1,788 @@
1
- /*
2
- * FastBloomFilter - High-performance Bloom Filter implementation for Ruby
3
- * Copyright (c) 2025
4
- */
5
-
6
1
  #include <ruby.h>
7
2
  #include <stdint.h>
8
3
  #include <string.h>
9
4
  #include <stdlib.h>
10
5
  #include <math.h>
11
6
 
12
- /* Bloom Filter structure */
13
- typedef struct {
14
- uint8_t *bits; /* Bit array */
15
- size_t size; /* Size in bytes */
16
- size_t capacity; /* Expected number of elements */
17
- int num_hashes; /* Number of hash functions */
18
- } BloomFilter;
19
-
20
- /* GC: Free memory */
21
- static void bloom_free(void *ptr) {
22
- BloomFilter *bloom = (BloomFilter *)ptr;
23
- if (bloom->bits) {
24
- free(bloom->bits);
25
- }
26
- free(bloom);
27
- }
28
-
29
- /* GC: Report memory size */
30
- static size_t bloom_memsize(const void *ptr) {
31
- const BloomFilter *bloom = (const BloomFilter *)ptr;
32
- return sizeof(BloomFilter) + bloom->size;
33
- }
34
-
35
- static const rb_data_type_t bloom_type = {
36
- "BloomFilter",
37
- {NULL, bloom_free, bloom_memsize},
38
- NULL, NULL,
39
- RUBY_TYPED_FREE_IMMEDIATELY
40
- };
41
-
42
- /*
43
- * MurmurHash3 32-bit implementation
44
- */
45
- static uint32_t murmur3_32(const uint8_t *key, size_t len, uint32_t seed) {
46
- uint32_t h = seed;
47
- const uint32_t c1 = 0xcc9e2d51;
48
- const uint32_t c2 = 0x1b873593;
49
-
50
- const int nblocks = len / 4;
51
- const uint32_t *blocks = (const uint32_t *)(key);
52
-
53
- for (int i = 0; i < nblocks; i++) {
54
- uint32_t k1 = blocks[i];
7
/*
 * Fetch eight bytes starting at p as a uint64_t in host byte order.
 * Going through memcpy keeps the read valid for unaligned addresses.
 */
static inline uint64_t load_u64(const void *p) {
    uint64_t word = 0;

    memcpy(&word, p, sizeof word);
    return word;
}
12
+
13
/*
 * Count the bits set to 1 in x.
 *
 * Uses the compiler intrinsic where one is available (GCC/Clang builtin,
 * MSVC x64 intrinsic) and a branch-free bit-parallel reduction otherwise.
 */
static inline size_t popcount64(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
    const int ones = __builtin_popcountll(x);
    return (size_t)ones;
#elif defined(_MSC_VER) && defined(_M_X64)
    return (size_t)__popcnt64(x);
#else
    const uint64_t pairs = 0x5555555555555555ULL;
    const uint64_t nibbles = 0x3333333333333333ULL;
    const uint64_t bytes = 0x0F0F0F0F0F0F0F0FULL;
    const uint64_t spread = 0x0101010101010101ULL;

    x -= (x >> 1) & pairs;                      /* 2-bit partial sums */
    x = (x & nibbles) + ((x >> 2) & nibbles);   /* 4-bit partial sums */
    x = (x + (x >> 4)) & bytes;                 /* 8-bit partial sums */
    return (size_t)((x * spread) >> 56);        /* add all bytes into the top byte */
#endif
}
25
+
26
/*
 * Rotate x left by r bit positions (r is taken modulo 64).
 *
 * Both shift counts are masked to 0..63, so a rotation by 0 or by a
 * multiple of 64 returns x unchanged instead of performing a 64-bit shift,
 * which C leaves undefined.
 */
static inline uint64_t rotl64(uint64_t x, int r) {
    const unsigned left = (unsigned)r & 63u;
    const unsigned right = (64u - left) & 63u;

    return (x << left) | (x >> right);
}
29
+
30
/*
 * Store v into dst[0..7] in little-endian order (least significant byte
 * first), independent of the host byte order.
 */
static inline void write_le64(uint8_t *dst, uint64_t v) {
    for (unsigned idx = 0; idx < 8; idx++) {
        dst[idx] = (uint8_t)(v >> (8u * idx));
    }
}
40
+
41
/*
 * Assemble a 64-bit value from src[0..7], where src[0] is the least
 * significant byte (little-endian), independent of the host byte order.
 */
static inline uint64_t read_le64(const uint8_t *src) {
    uint64_t value = 0;

    /* Walk from the most significant byte down so each step is shift-then-or. */
    for (int idx = 7; idx >= 0; idx--) {
        value = (value << 8) | (uint64_t)src[idx];
    }
    return value;
}
46
+
47
/*
 * Store v into dst[0..3] in little-endian order (least significant byte
 * first), independent of the host byte order.
 */
static inline void write_le32(uint8_t *dst, uint32_t v) {
    for (unsigned idx = 0; idx < 4; idx++) {
        dst[idx] = (uint8_t)(v >> (8u * idx));
    }
}
53
+
54
/*
 * Assemble a 32-bit value from src[0..3], where src[0] is the least
 * significant byte (little-endian), independent of the host byte order.
 */
static inline uint32_t read_le32(const uint8_t *src) {
    uint32_t value = 0;

    for (int idx = 3; idx >= 0; idx--) {
        value = (value << 8) | (uint32_t)src[idx];
    }
    return value;
}
58
+
59
/*
 * Serialize a double into dst[0..7]: the value's 64-bit object
 * representation is copied into an integer and written little-endian.
 */
static inline void write_le_double(uint8_t *dst, double v) {
    uint64_t raw;

    memcpy(&raw, &v, 8); /* copy the bit pattern, no numeric conversion */
    write_le64(dst, raw);
}
64
+
65
/*
 * Inverse of write_le_double: read a little-endian 64-bit pattern from
 * src[0..7] and reinterpret those bits as a double.
 */
static inline double read_le_double(const uint8_t *src) {
    const uint64_t raw = read_le64(src);
    double result;

    memcpy(&result, &raw, 8); /* copy the bit pattern, no numeric conversion */
    return result;
}
71
+
72
/*
 * MurmurHash3 (x64, 128-bit variant) of key[0..len).
 *
 * @param key     bytes to hash; may be unaligned
 * @param len     number of bytes in key
 * @param seed    initial value for both 64-bit lanes
 * @param out_h1  receives the first 64 bits of the hash
 * @param out_h2  receives the second 64 bits of the hash
 *
 * Each 16-byte block is decoded as two little-endian 64-bit words, the same
 * order in which the tail bytes are assembled below. On little-endian hosts
 * this equals a native load. On big-endian hosts it makes the result
 * byte-order independent, so a filter serialized on one architecture still
 * answers membership queries on another.
 * NOTE(review): filters dumped by an earlier build on a big-endian host
 * were hashed with native-order block loads and will not match this hash.
 */
static void murmur3_128(const uint8_t *key, size_t len, uint64_t seed, uint64_t *out_h1,
                        uint64_t *out_h2) {
    const uint64_t c1 = 0x87c37b91114253d5ULL;
    const uint64_t c2 = 0x4cf5ad432745937fULL;
    const size_t nblocks = len / 16;
    uint64_t h1 = seed;
    uint64_t h2 = seed;

    /* Body: mix every complete 16-byte block into the two lanes. */
    for (size_t i = 0; i < nblocks; i++) {
        const uint8_t *block = key + i * 16;
        uint64_t k1 = read_le64(block);
        uint64_t k2 = read_le64(block + 8);

        k1 *= c1;
        k1 = rotl64(k1, 31);
        k1 *= c2;
        h1 ^= k1;
        h1 = rotl64(h1, 27);
        h1 += h2;
        h1 = h1 * 5 + 0x52dce729;

        k2 *= c2;
        k2 = rotl64(k2, 33);
        k2 *= c1;
        h2 ^= k2;
        h2 = rotl64(h2, 31);
        h2 += h1;
        h2 = h2 * 5 + 0x38495ab5;
    }

    /*
     * Tail: the remaining 0..15 bytes. Every case deliberately falls through
     * to the next so that all lower-indexed tail bytes are folded in as well.
     */
    const uint8_t *tail = key + nblocks * 16;
    uint64_t k1 = 0;
    uint64_t k2 = 0;

    switch (len & 15) {
    case 15:
        k2 ^= (uint64_t)tail[14] << 48;
        /* fallthrough */
    case 14:
        k2 ^= (uint64_t)tail[13] << 40;
        /* fallthrough */
    case 13:
        k2 ^= (uint64_t)tail[12] << 32;
        /* fallthrough */
    case 12:
        k2 ^= (uint64_t)tail[11] << 24;
        /* fallthrough */
    case 11:
        k2 ^= (uint64_t)tail[10] << 16;
        /* fallthrough */
    case 10:
        k2 ^= (uint64_t)tail[9] << 8;
        /* fallthrough */
    case 9:
        k2 ^= (uint64_t)tail[8];
        k2 *= c2;
        k2 = rotl64(k2, 33);
        k2 *= c1;
        h2 ^= k2;
        /* fallthrough */
    case 8:
        k1 ^= (uint64_t)tail[7] << 56;
        /* fallthrough */
    case 7:
        k1 ^= (uint64_t)tail[6] << 48;
        /* fallthrough */
    case 6:
        k1 ^= (uint64_t)tail[5] << 40;
        /* fallthrough */
    case 5:
        k1 ^= (uint64_t)tail[4] << 32;
        /* fallthrough */
    case 4:
        k1 ^= (uint64_t)tail[3] << 24;
        /* fallthrough */
    case 3:
        k1 ^= (uint64_t)tail[2] << 16;
        /* fallthrough */
    case 2:
        k1 ^= (uint64_t)tail[1] << 8;
        /* fallthrough */
    case 1:
        k1 ^= (uint64_t)tail[0];
        k1 *= c1;
        k1 = rotl64(k1, 31);
        k1 *= c2;
        h1 ^= k1;
        break;
    default: /* len is a multiple of 16: no tail bytes */
        break;
    }

    /* Finalization: fold in the length, then avalanche each lane. */
    h1 ^= (uint64_t)len;
    h2 ^= (uint64_t)len;
    h1 += h2;
    h2 += h1;

    h1 ^= h1 >> 33;
    h1 *= 0xff51afd7ed558ccdULL;
    h1 ^= h1 >> 33;
    h1 *= 0xc4ceb9fe1a85ec53ULL;
    h1 ^= h1 >> 33;

    h2 ^= h2 >> 33;
    h2 *= 0xff51afd7ed558ccdULL;
    h2 ^= h2 >> 33;
    h2 *= 0xc4ceb9fe1a85ec53ULL;
    h2 ^= h2 >> 33;

    h1 += h2;
    h2 += h1;

    *out_h1 = h1;
    *out_h2 = h2;
}
161
+
162
/* One fixed-size Bloom filter; a scalable filter is a stack of these. */
typedef struct {
    uint8_t *bits;   /* bit array, `size` bytes long */
    size_t size;     /* length of `bits` in bytes */
    size_t capacity; /* number of insertions after which the layer counts as full */
    size_t count;    /* insertions recorded so far */
    int num_hashes;  /* bit positions probed per key */
} BloomLayer;
169
+
170
+ typedef struct {
171
+ BloomLayer **layers;
172
+ size_t num_layers;
173
+ size_t layers_cap;
174
+ double error_rate;
175
+ double tightening;
176
+ size_t initial_capacity;
177
+ size_t total_count;
178
+ } ScalableBloom;
179
+
180
/* Defaults applied by the constructor when an option is not supplied. */
#define DEFAULT_ERROR_RATE 0.01
#define DEFAULT_INITIAL_CAP 8192
#define DEFAULT_TIGHTENING 0.85

/* Bounds for the number of hash probes chosen for a layer. */
#define MAX_HASHES 20
#define MIN_HASHES 1

/* Each new layer's capacity is the previous layer's capacity times this. */
#define GROWTH_FACTOR 2.0

/* Seed passed to the 128-bit hash for every key. */
#define MURMUR_SEED 0x9747b28cULL

/* Serialized layout: format version and fixed-size section lengths in bytes. */
#define SERIAL_VERSION 1
#define HEADER_SIZE 48 /* 4 + 4 + 8 + 8 + 8 + 8 + 8 */
#define LAYER_META 32  /* 8 + 8 + 8 + 4 + 4 */

/* Largest bit count a single layer may be created with. */
#define MAX_BITS_ALLOC (1ULL << 36)
191
+
87
192
/* Set bit number `pos` of the bit array; bit 0 is the low bit of bits[0]. */
static inline void set_bit(uint8_t *bits, size_t pos) {
    const size_t byte_index = pos / 8;
    const uint8_t mask = (uint8_t)(1u << (pos % 8));

    bits[byte_index] |= mask;
}
90
195
 
91
- /* Get bit at position */
92
196
/* Return 1 if bit number `pos` of the bit array is set, 0 otherwise. */
static inline int get_bit(const uint8_t *bits, size_t pos) {
    const unsigned byte = bits[pos / 8];
    const unsigned shift = (unsigned)(pos % 8);

    return ((byte >> shift) & 1u) ? 1 : 0;
}
199
+
200
+ static BloomLayer *layer_create(size_t capacity, double error_rate) {
201
+ BloomLayer *layer = (BloomLayer *)calloc(1, sizeof(BloomLayer));
202
+ if (!layer)
203
+ return NULL;
204
+
205
+ const double ln2 = 0.693147180559945309417;
206
+ const double ln2_sq = ln2 * ln2;
207
+
208
+ size_t bits_count = (size_t)(-(double)capacity * log(error_rate) / ln2_sq);
209
+ if (bits_count < 64)
210
+ bits_count = 64;
211
+ if (bits_count > MAX_BITS_ALLOC) {
212
+ free(layer);
213
+ return NULL;
214
+ }
215
+
216
+ layer->size = (bits_count + 7) / 8;
217
+ layer->capacity = capacity;
218
+ layer->count = 0;
219
+ layer->num_hashes = (int)((double)bits_count / (double)capacity * ln2);
220
+
221
+ if (layer->num_hashes < MIN_HASHES)
222
+ layer->num_hashes = MIN_HASHES;
223
+ if (layer->num_hashes > MAX_HASHES)
224
+ layer->num_hashes = MAX_HASHES;
225
+
226
+ layer->bits = (uint8_t *)calloc(layer->size, sizeof(uint8_t));
227
+ if (!layer->bits) {
228
+ free(layer);
229
+ return NULL;
230
+ }
231
+ return layer;
94
232
  }
95
233
 
96
- /* Allocate BloomFilter object */
234
+ static void layer_free(BloomLayer *layer) {
235
+ if (layer) {
236
+ free(layer->bits);
237
+ free(layer);
238
+ }
239
+ }
240
+
241
+ static inline int layer_is_full(const BloomLayer *layer) {
242
+ return layer->count >= layer->capacity;
243
+ }
244
+
245
+ static inline void layer_hash(const char *data, size_t len, uint64_t *h1, uint64_t *h2) {
246
+ murmur3_128((const uint8_t *)data, len, MURMUR_SEED, h1, h2);
247
+ }
248
+
249
+ static void layer_add(BloomLayer *layer, const char *data, size_t len) {
250
+ const size_t bits_count = layer->size * 8;
251
+ uint64_t h1, h2;
252
+ layer_hash(data, len, &h1, &h2);
253
+
254
+ for (int i = 0; i < layer->num_hashes; i++) {
255
+ uint64_t combined = h1 + (uint64_t)i * h2;
256
+ set_bit(layer->bits, (size_t)(combined % bits_count));
257
+ }
258
+ layer->count++;
259
+ }
260
+
261
+ static int layer_include(const BloomLayer *layer, const char *data, size_t len) {
262
+ const size_t bits_count = layer->size * 8;
263
+ uint64_t h1, h2;
264
+ layer_hash(data, len, &h1, &h2);
265
+
266
+ for (int i = 0; i < layer->num_hashes; i++) {
267
+ uint64_t combined = h1 + (uint64_t)i * h2;
268
+ if (!get_bit(layer->bits, (size_t)(combined % bits_count)))
269
+ return 0;
270
+ }
271
+ return 1;
272
+ }
273
+
274
+ static size_t layer_bits_set(const BloomLayer *layer) {
275
+ size_t count = 0;
276
+ size_t i = 0;
277
+ for (; i + 8 <= layer->size; i += 8) {
278
+ uint64_t word;
279
+ memcpy(&word, layer->bits + i, 8);
280
+ count += popcount64(word);
281
+ }
282
+ for (; i < layer->size; i++)
283
+ count += popcount64((uint64_t)layer->bits[i]);
284
+ return count;
285
+ }
286
+
287
+ static double layer_error_rate(double total_fpr, double r, size_t index) {
288
+ return total_fpr * (1.0 - r) * pow(r, (double)index);
289
+ }
290
+
291
+ static double layer_estimated_fpr(const BloomLayer *layer) {
292
+ double m = (double)(layer->size * 8);
293
+ double k = (double)layer->num_hashes;
294
+ double n = (double)layer->count;
295
+ return pow(1.0 - exp(-k * n / m), k);
296
+ }
297
+
298
+ static BloomLayer *scalable_add_layer(ScalableBloom *sb) {
299
+ size_t new_cap;
300
+ if (sb->num_layers == 0) {
301
+ new_cap = sb->initial_capacity;
302
+ } else {
303
+ new_cap = (size_t)(sb->layers[sb->num_layers - 1]->capacity * GROWTH_FACTOR);
304
+ }
305
+
306
+ double fpr = layer_error_rate(sb->error_rate, sb->tightening, sb->num_layers);
307
+ if (fpr < 1e-15)
308
+ fpr = 1e-15;
309
+
310
+ BloomLayer *layer = layer_create(new_cap, fpr);
311
+ if (!layer)
312
+ return NULL;
313
+
314
+ if (sb->num_layers >= sb->layers_cap) {
315
+ size_t new_slots = sb->layers_cap == 0 ? 4 : sb->layers_cap * 2;
316
+ BloomLayer **tmp = (BloomLayer **)realloc(sb->layers, new_slots * sizeof(BloomLayer *));
317
+ if (!tmp) {
318
+ layer_free(layer);
319
+ return NULL;
320
+ }
321
+ sb->layers = tmp;
322
+ sb->layers_cap = new_slots;
323
+ }
324
+
325
+ sb->layers[sb->num_layers++] = layer;
326
+ return layer;
327
+ }
328
+
329
+ static void bloom_free_scalable(void *ptr) {
330
+ ScalableBloom *sb = (ScalableBloom *)ptr;
331
+ for (size_t i = 0; i < sb->num_layers; i++)
332
+ layer_free(sb->layers[i]);
333
+ free(sb->layers);
334
+ free(sb);
335
+ }
336
+
337
+ static size_t bloom_memsize_scalable(const void *ptr) {
338
+ const ScalableBloom *sb = (const ScalableBloom *)ptr;
339
+ size_t total = sizeof(ScalableBloom);
340
+ total += sb->layers_cap * sizeof(BloomLayer *);
341
+ for (size_t i = 0; i < sb->num_layers; i++)
342
+ total += sizeof(BloomLayer) + sb->layers[i]->size;
343
+ return total;
344
+ }
345
+
346
+ static const rb_data_type_t scalable_bloom_type = {
347
+ "ScalableBloomFilter",
348
+ {NULL, bloom_free_scalable, bloom_memsize_scalable},
349
+ NULL,
350
+ NULL,
351
+ RUBY_TYPED_FREE_IMMEDIATELY};
352
+
97
353
/*
 * Allocator for FastBloomFilter::Filter: wraps a zero-filled ScalableBloom
 * (no layers, all parameters 0) in a typed-data object of class `klass`.
 * Raises NoMemoryError if the struct cannot be allocated.
 */
static VALUE bloom_alloc(VALUE klass) {
    ScalableBloom *state = (ScalableBloom *)calloc(1, sizeof(ScalableBloom));

    if (state == NULL) {
        rb_raise(rb_eNoMemError, "failed to allocate ScalableBloom");
    }
    return TypedData_Wrap_Struct(klass, &scalable_bloom_type, state);
}
359
+
113
360
/*
 * Filter#initialize(error_rate: 0.01, initial_capacity: 8192, tightening: 0.85)
 *
 * Accepts no arguments or a single options hash. Recognised keys:
 *   :error_rate        overall target false-positive rate, 0 < x < 1
 *   :initial_capacity  capacity of the first layer, a positive integer
 *   :tightening        per-layer error tightening ratio, 0 < x < 1
 *
 * Raises ArgumentError for a wrong argument shape or an out-of-range
 * value, and NoMemoryError if the first layer cannot be allocated.
 *
 * The capacity is range-checked as a signed integer before being converted
 * to size_t, so negative values are rejected instead of wrapping to a huge
 * capacity. The range checks are written so that NaN fails them.
 */
static VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
    VALUE opts = Qnil;

    if (argc == 1 && RB_TYPE_P(argv[0], T_HASH)) {
        opts = argv[0];
    } else if (argc != 0) {
        rb_raise(rb_eArgError,
                 "wrong number of arguments (given %d, expected 0 or keyword arguments)", argc);
    }

    double error_rate = DEFAULT_ERROR_RATE;
    long capacity_arg = DEFAULT_INITIAL_CAP;
    double tightening = DEFAULT_TIGHTENING;

    if (!NIL_P(opts)) {
        VALUE v;
        v = rb_hash_aref(opts, ID2SYM(rb_intern("error_rate")));
        if (!NIL_P(v))
            error_rate = NUM2DBL(v);
        v = rb_hash_aref(opts, ID2SYM(rb_intern("initial_capacity")));
        if (!NIL_P(v))
            capacity_arg = NUM2LONG(v);
        v = rb_hash_aref(opts, ID2SYM(rb_intern("tightening")));
        if (!NIL_P(v))
            tightening = NUM2DBL(v);
    }

    /* Negated "in range" tests so that NaN is rejected as well. */
    if (!(error_rate > 0 && error_rate < 1))
        rb_raise(rb_eArgError, "error_rate must be between 0 and 1 (exclusive)");
    if (capacity_arg <= 0)
        rb_raise(rb_eArgError, "initial_capacity must be positive");
    if (!(tightening > 0 && tightening < 1))
        rb_raise(rb_eArgError, "tightening must be between 0 and 1 (exclusive)");

    ScalableBloom *sb;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

    /*
     * If initialize runs again on an object that already has layers, drop
     * them first; otherwise the "first" layer would be sized from the old
     * last layer instead of from the new parameters.
     */
    for (size_t i = 0; i < sb->num_layers; i++)
        layer_free(sb->layers[i]);
    sb->num_layers = 0;

    sb->error_rate = error_rate;
    sb->initial_capacity = (size_t)capacity_arg;
    sb->tightening = tightening;
    sb->total_count = 0;

    if (!scalable_add_layer(sb))
        rb_raise(rb_eNoMemError, "failed to allocate initial layer");

    return self;
}
150
408
 
151
- /*
152
- * Add element to filter
153
- */
154
409
/*
 * Filter#add(str) / Filter#<<(str)
 *
 * Inserts the bytes of `str` (converted with StringValue) into the newest
 * layer, appending a new layer first when the newest one is full. Always
 * returns true.
 *
 * Raises RuntimeError when the object has no layers and no initial
 * capacity (e.g. it was allocated without initialize), and NoMemoryError
 * when a needed layer cannot be allocated.
 */
static VALUE bloom_add(VALUE self, VALUE str) {
    ScalableBloom *sb;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

    str = StringValue(str);

    /* With zero layers there is no "last" layer to index. */
    BloomLayer *active = sb->num_layers > 0 ? sb->layers[sb->num_layers - 1] : NULL;
    if (active == NULL && sb->initial_capacity == 0)
        rb_raise(rb_eRuntimeError, "bloom filter is not initialized");

    if (active == NULL || layer_is_full(active)) {
        active = scalable_add_layer(sb);
        if (!active)
            rb_raise(rb_eNoMemError, "failed to allocate new layer");
    }

    layer_add(active, RSTRING_PTR(str), (size_t)RSTRING_LEN(str));
    sb->total_count++;

    return Qtrue;
}
427
+
428
+ static VALUE bloom_add_if_absent(VALUE self, VALUE str) {
429
+ ScalableBloom *sb;
430
+ TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
431
+
432
+ str = StringValue(str);
160
433
  const char *data = RSTRING_PTR(str);
161
434
  size_t len = RSTRING_LEN(str);
162
- size_t bits_count = bloom->size * 8;
163
-
164
- for (int i = 0; i < bloom->num_hashes; i++) {
165
- uint32_t hash = murmur3_32((const uint8_t *)data, len, i);
166
- size_t pos = hash % bits_count;
167
- set_bit(bloom->bits, pos);
435
+
436
+ for (size_t i = sb->num_layers; i > 0; i--) {
437
+ if (sb->layers[i - 1]->count == 0)
438
+ continue;
439
+ if (layer_include(sb->layers[i - 1], data, len))
440
+ return Qfalse;
168
441
  }
169
-
442
+
443
+ BloomLayer *active = sb->layers[sb->num_layers - 1];
444
+ if (layer_is_full(active)) {
445
+ active = scalable_add_layer(sb);
446
+ if (!active)
447
+ rb_raise(rb_eNoMemError, "failed to allocate new layer");
448
+ }
449
+
450
+ layer_add(active, data, len);
451
+ sb->total_count++;
452
+
170
453
  return Qtrue;
171
454
  }
172
455
 
173
- /*
174
- * Check if element might be in filter
175
- */
176
456
/*
 * Filter#include?(str) / Filter#member?(str)
 *
 * Returns true if any layer with at least one recorded insertion reports
 * the bytes of `str` as possibly present, false otherwise. Layers are
 * examined from newest to oldest.
 */
static VALUE bloom_include(VALUE self, VALUE str) {
    ScalableBloom *sb;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

    str = StringValue(str);
    const char *data = RSTRING_PTR(str);
    size_t len = RSTRING_LEN(str);

    size_t idx = sb->num_layers;
    while (idx-- > 0) {
        const BloomLayer *layer = sb->layers[idx];
        if (layer->count != 0 && layer_include(layer, data, len))
            return Qtrue;
    }
    return Qfalse;
}
196
472
 
197
- /*
198
- * Clear all bits
199
- */
200
473
/*
 * Filter#clear
 *
 * Discards every layer and resets the filter to a single empty layer of
 * the initial capacity, with total_count 0. Returns nil.
 *
 * The replacement layer is allocated before the old layers are released.
 * If that allocation fails the filter is left unchanged and NoMemoryError
 * is raised, so the object is never left without a layer. Raises
 * RuntimeError when the filter has no initial capacity (e.g. it was
 * allocated without initialize).
 */
static VALUE bloom_clear(VALUE self) {
    ScalableBloom *sb;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

    if (sb->initial_capacity == 0)
        rb_raise(rb_eRuntimeError, "bloom filter is not initialized");

    /* Detach the current layer array so a fresh one can be built beside it. */
    BloomLayer **old_layers = sb->layers;
    const size_t old_num_layers = sb->num_layers;
    const size_t old_layers_cap = sb->layers_cap;

    sb->layers = NULL;
    sb->num_layers = 0;
    sb->layers_cap = 0;

    if (!scalable_add_layer(sb)) {
        /* Nothing was attached on failure: put the old state back. */
        sb->layers = old_layers;
        sb->num_layers = old_num_layers;
        sb->layers_cap = old_layers_cap;
        rb_raise(rb_eNoMemError, "failed to allocate layer after clear");
    }

    for (size_t i = 0; i < old_num_layers; i++)
        layer_free(old_layers[i]);
    free(old_layers);

    sb->total_count = 0;

    return Qnil;
}
207
487
 
208
- /*
209
- * Get filter statistics
210
- */
211
488
/*
 * Filter#stats
 *
 * Returns a Hash describing the filter:
 *   :total_count, :num_layers, :total_bytes, :total_bits, :total_bits_set,
 *   :fill_ratio, :target_error_rate, :estimated_error_rate, :layers
 * where :layers is an Array with one Hash per layer:
 *   :layer, :capacity, :count, :size_bytes, :num_hashes, :bits_set,
 *   :total_bits, :fill_ratio, :target_error_rate, :estimated_error_rate
 *
 * The overall estimated error rate is 1 - prod(1 - layer_estimate), the
 * probability that at least one layer reports a false positive.
 *
 * size_t values are converted with SIZET2NUM so they are not truncated or
 * turned negative on platforms where long is narrower than size_t. A fill
 * ratio over zero bits is reported as 0.0.
 */
static VALUE bloom_stats(VALUE self) {
    ScalableBloom *sb;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

    size_t total_bytes = 0;
    size_t total_bits = 0;
    size_t total_bits_set = 0;
    double none_fp = 1.0; /* probability that no layer gives a false positive */

    VALUE layers_ary = rb_ary_new_capa((long)sb->num_layers);

    for (size_t i = 0; i < sb->num_layers; i++) {
        const BloomLayer *l = sb->layers[i];
        const size_t bs = layer_bits_set(l);
        const size_t tb = l->size * 8;
        const double est_fpr = layer_estimated_fpr(l);
        const double fill = tb > 0 ? (double)bs / (double)tb : 0.0;

        total_bytes += l->size;
        total_bits += tb;
        total_bits_set += bs;
        none_fp *= (1.0 - est_fpr);

        VALUE lh = rb_hash_new();
        rb_hash_aset(lh, ID2SYM(rb_intern("layer")), SIZET2NUM(i));
        rb_hash_aset(lh, ID2SYM(rb_intern("capacity")), SIZET2NUM(l->capacity));
        rb_hash_aset(lh, ID2SYM(rb_intern("count")), SIZET2NUM(l->count));
        rb_hash_aset(lh, ID2SYM(rb_intern("size_bytes")), SIZET2NUM(l->size));
        rb_hash_aset(lh, ID2SYM(rb_intern("num_hashes")), INT2NUM(l->num_hashes));
        rb_hash_aset(lh, ID2SYM(rb_intern("bits_set")), SIZET2NUM(bs));
        rb_hash_aset(lh, ID2SYM(rb_intern("total_bits")), SIZET2NUM(tb));
        rb_hash_aset(lh, ID2SYM(rb_intern("fill_ratio")), DBL2NUM(fill));
        rb_hash_aset(lh, ID2SYM(rb_intern("target_error_rate")),
                     DBL2NUM(layer_error_rate(sb->error_rate, sb->tightening, i)));
        rb_hash_aset(lh, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(est_fpr));
        rb_ary_push(layers_ary, lh);
    }

    const double total_est_fpr = 1.0 - none_fp;
    const double total_fill =
        total_bits > 0 ? (double)total_bits_set / (double)total_bits : 0.0;

    VALUE hash = rb_hash_new();
    rb_hash_aset(hash, ID2SYM(rb_intern("total_count")), SIZET2NUM(sb->total_count));
    rb_hash_aset(hash, ID2SYM(rb_intern("num_layers")), SIZET2NUM(sb->num_layers));
    rb_hash_aset(hash, ID2SYM(rb_intern("total_bytes")), SIZET2NUM(total_bytes));
    rb_hash_aset(hash, ID2SYM(rb_intern("total_bits")), SIZET2NUM(total_bits));
    rb_hash_aset(hash, ID2SYM(rb_intern("total_bits_set")), SIZET2NUM(total_bits_set));
    rb_hash_aset(hash, ID2SYM(rb_intern("fill_ratio")), DBL2NUM(total_fill));
    rb_hash_aset(hash, ID2SYM(rb_intern("target_error_rate")), DBL2NUM(sb->error_rate));
    rb_hash_aset(hash, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(total_est_fpr));
    rb_hash_aset(hash, ID2SYM(rb_intern("layers")), layers_ary);
    return hash;
}
238
540
 
239
- /*
240
- * Merge another filter
241
- */
541
+ static VALUE bloom_count(VALUE self) {
542
+ ScalableBloom *sb;
543
+ TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
544
+ return LONG2NUM(sb->total_count);
545
+ }
546
+
547
+ static VALUE bloom_num_layers(VALUE self) {
548
+ ScalableBloom *sb;
549
+ TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
550
+ return LONG2NUM(sb->num_layers);
551
+ }
552
+
242
553
/*
 * Filter#merge!(other)
 *
 * Folds `other` into the receiver and returns the receiver. Both filters
 * must have the same error_rate and tightening (within 1e-10), otherwise
 * ArgumentError is raised.
 *
 * For each layer of `other`, in order:
 *   - if the receiver has a layer at the same index with the same byte
 *     size and probe count, the bit arrays are OR-ed together and the
 *     counts added (capped at the layer's capacity);
 *   - otherwise a copy of the layer is appended to the receiver.
 *
 * total_count becomes the sum of both counts, saturating at SIZE_MAX.
 * Raises NoMemoryError if a layer copy or the layer array cannot be
 * allocated.
 */
static VALUE bloom_merge(VALUE self, VALUE other) {
    ScalableBloom *target, *source;
    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, target);
    TypedData_Get_Struct(other, ScalableBloom, &scalable_bloom_type, source);

    if (fabs(target->error_rate - source->error_rate) > 1e-10)
        rb_raise(rb_eArgError, "cannot merge filters with different error rates (%.6f vs %.6f)",
                 target->error_rate, source->error_rate);
    if (fabs(target->tightening - source->tightening) > 1e-10)
        rb_raise(rb_eArgError,
                 "cannot merge filters with different tightening ratios (%.6f vs %.6f)",
                 target->tightening, source->tightening);

    for (size_t idx = 0; idx < source->num_layers; idx++) {
        BloomLayer *src = source->layers[idx];
        BloomLayer *dst = idx < target->num_layers ? target->layers[idx] : NULL;

        /* Same shape at the same index: union the bit arrays in place. */
        if (dst != NULL && dst->size == src->size && dst->num_hashes == src->num_hashes) {
            size_t pos = 0;

            while (pos + 8 <= dst->size) {
                uint64_t mine, theirs;
                memcpy(&mine, dst->bits + pos, 8);
                memcpy(&theirs, src->bits + pos, 8);
                mine |= theirs;
                memcpy(dst->bits + pos, &mine, 8);
                pos += 8;
            }
            while (pos < dst->size) {
                dst->bits[pos] |= src->bits[pos];
                pos++;
            }

            const size_t summed = dst->count + src->count;
            dst->count = summed < dst->capacity ? summed : dst->capacity;
            continue;
        }

        /* No compatible counterpart: append a deep copy of the source layer. */
        BloomLayer *copy = (BloomLayer *)calloc(1, sizeof(BloomLayer));
        if (!copy)
            rb_raise(rb_eNoMemError, "failed to allocate layer copy");

        copy->size = src->size;
        copy->capacity = src->capacity;
        copy->count = src->count;
        copy->num_hashes = src->num_hashes;
        copy->bits = (uint8_t *)malloc(src->size);
        if (!copy->bits) {
            free(copy);
            rb_raise(rb_eNoMemError, "failed to allocate bits");
        }
        memcpy(copy->bits, src->bits, src->size);

        if (target->num_layers >= target->layers_cap) {
            const size_t new_slots = target->layers_cap == 0 ? 4 : target->layers_cap * 2;
            BloomLayer **grown =
                (BloomLayer **)realloc(target->layers, new_slots * sizeof(BloomLayer *));
            if (!grown) {
                layer_free(copy);
                rb_raise(rb_eNoMemError, "realloc failed");
            }
            target->layers = grown;
            target->layers_cap = new_slots;
        }
        target->layers[target->num_layers++] = copy;
    }

    /* Saturate instead of wrapping if the combined count overflows size_t. */
    const size_t combined = target->total_count + source->total_count;
    target->total_count = combined >= target->total_count ? combined : SIZE_MAX;
    return self;
}
257
625
 
626
+ static VALUE bloom_dump(VALUE self) {
627
+ ScalableBloom *sb;
628
+ TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
629
+
630
+ size_t total_size = HEADER_SIZE;
631
+ for (size_t i = 0; i < sb->num_layers; i++)
632
+ total_size += LAYER_META + sb->layers[i]->size;
633
+
634
+ VALUE str = rb_str_buf_new((long)total_size);
635
+ rb_str_set_len(str, (long)total_size);
636
+ uint8_t *buf = (uint8_t *)RSTRING_PTR(str);
637
+ size_t off = 0;
638
+
639
+ write_le32(buf + off, SERIAL_VERSION);
640
+ off += 4;
641
+ write_le32(buf + off, 0);
642
+ off += 4;
643
+ write_le_double(buf + off, sb->error_rate);
644
+ off += 8;
645
+ write_le_double(buf + off, sb->tightening);
646
+ off += 8;
647
+ write_le64(buf + off, (uint64_t)sb->initial_capacity);
648
+ off += 8;
649
+ write_le64(buf + off, (uint64_t)sb->total_count);
650
+ off += 8;
651
+ write_le64(buf + off, (uint64_t)sb->num_layers);
652
+ off += 8;
653
+
654
+ for (size_t i = 0; i < sb->num_layers; i++) {
655
+ BloomLayer *l = sb->layers[i];
656
+ write_le64(buf + off, (uint64_t)l->capacity);
657
+ off += 8;
658
+ write_le64(buf + off, (uint64_t)l->count);
659
+ off += 8;
660
+ write_le64(buf + off, (uint64_t)l->size);
661
+ off += 8;
662
+ write_le32(buf + off, (uint32_t)l->num_hashes);
663
+ off += 4;
664
+ write_le32(buf + off, 0);
665
+ off += 4;
666
+ memcpy(buf + off, l->bits, l->size);
667
+ off += l->size;
668
+ }
669
+
670
+ return str;
671
+ }
672
+
673
+ static VALUE bloom_load(VALUE klass, VALUE data) {
674
+ Check_Type(data, T_STRING);
675
+
676
+ const uint8_t *buf = (const uint8_t *)RSTRING_PTR(data);
677
+ size_t data_len = (size_t)RSTRING_LEN(data);
678
+
679
+ if (data_len < HEADER_SIZE)
680
+ rb_raise(rb_eArgError, "data too short for bloom filter header");
681
+
682
+ size_t off = 0;
683
+ uint32_t version = read_le32(buf + off);
684
+ off += 4;
685
+ if (version != SERIAL_VERSION)
686
+ rb_raise(rb_eArgError, "unsupported serialization version: %u", version);
687
+ off += 4;
688
+
689
+ VALUE obj = bloom_alloc(klass);
690
+ ScalableBloom *sb;
691
+ TypedData_Get_Struct(obj, ScalableBloom, &scalable_bloom_type, sb);
692
+
693
+ sb->error_rate = read_le_double(buf + off);
694
+ off += 8;
695
+ sb->tightening = read_le_double(buf + off);
696
+ off += 8;
697
+ sb->initial_capacity = (size_t)read_le64(buf + off);
698
+ off += 8;
699
+ sb->total_count = (size_t)read_le64(buf + off);
700
+ off += 8;
701
+
702
+ size_t num_layers = (size_t)read_le64(buf + off);
703
+ off += 8;
704
+
705
+ if (sb->error_rate <= 0 || sb->error_rate >= 1)
706
+ rb_raise(rb_eArgError, "invalid error_rate in serialized data");
707
+ if (sb->tightening <= 0 || sb->tightening >= 1)
708
+ rb_raise(rb_eArgError, "invalid tightening in serialized data");
709
+ if (num_layers > 1000)
710
+ rb_raise(rb_eArgError, "unreasonable number of layers: %zu", num_layers);
711
+
712
+ sb->layers_cap = num_layers < 4 ? 4 : num_layers;
713
+ sb->layers = (BloomLayer **)calloc(sb->layers_cap, sizeof(BloomLayer *));
714
+ if (!sb->layers)
715
+ rb_raise(rb_eNoMemError, "failed to allocate layers array");
716
+
717
+ for (size_t i = 0; i < num_layers; i++) {
718
+ if (off + LAYER_META > data_len) {
719
+ for (size_t j = 0; j < sb->num_layers; j++)
720
+ layer_free(sb->layers[j]);
721
+ sb->num_layers = 0;
722
+ rb_raise(rb_eArgError, "data truncated at layer %zu metadata", i);
723
+ }
724
+
725
+ BloomLayer *l = (BloomLayer *)calloc(1, sizeof(BloomLayer));
726
+ if (!l) {
727
+ for (size_t j = 0; j < sb->num_layers; j++)
728
+ layer_free(sb->layers[j]);
729
+ sb->num_layers = 0;
730
+ rb_raise(rb_eNoMemError, "failed to allocate layer");
731
+ }
732
+
733
+ l->capacity = (size_t)read_le64(buf + off);
734
+ off += 8;
735
+ l->count = (size_t)read_le64(buf + off);
736
+ off += 8;
737
+ l->size = (size_t)read_le64(buf + off);
738
+ off += 8;
739
+ l->num_hashes = (int)read_le32(buf + off);
740
+ off += 4;
741
+ off += 4;
742
+
743
+ if (l->size > (1ULL << 30) || off + l->size > data_len) {
744
+ free(l);
745
+ for (size_t j = 0; j < sb->num_layers; j++)
746
+ layer_free(sb->layers[j]);
747
+ sb->num_layers = 0;
748
+ rb_raise(rb_eArgError, "invalid or truncated layer %zu", i);
749
+ }
750
+
751
+ l->bits = (uint8_t *)malloc(l->size);
752
+ if (!l->bits) {
753
+ free(l);
754
+ for (size_t j = 0; j < sb->num_layers; j++)
755
+ layer_free(sb->layers[j]);
756
+ sb->num_layers = 0;
757
+ rb_raise(rb_eNoMemError, "failed to allocate bits");
758
+ }
759
+ memcpy(l->bits, buf + off, l->size);
760
+ off += l->size;
761
+
762
+ sb->layers[sb->num_layers++] = l;
763
+ }
764
+
765
+ return obj;
766
+ }
767
+
258
768
  void Init_fast_bloom_filter(void) {
259
769
  VALUE mFastBloomFilter = rb_define_module("FastBloomFilter");
260
- VALUE cBloomFilter = rb_define_class_under(mFastBloomFilter, "Filter", rb_cObject);
261
-
262
- rb_define_alloc_func(cBloomFilter, bloom_alloc);
263
- rb_define_method(cBloomFilter, "initialize", bloom_initialize, -1);
264
- rb_define_method(cBloomFilter, "add", bloom_add, 1);
265
- rb_define_method(cBloomFilter, "<<", bloom_add, 1);
266
- rb_define_method(cBloomFilter, "include?", bloom_include, 1);
267
- rb_define_method(cBloomFilter, "member?", bloom_include, 1);
268
- rb_define_method(cBloomFilter, "clear", bloom_clear, 0);
269
- rb_define_method(cBloomFilter, "stats", bloom_stats, 0);
270
- rb_define_method(cBloomFilter, "merge!", bloom_merge, 1);
770
+ VALUE cFilter = rb_define_class_under(mFastBloomFilter, "Filter", rb_cObject);
771
+
772
+ rb_define_alloc_func(cFilter, bloom_alloc);
773
+ rb_define_method(cFilter, "initialize", bloom_initialize, -1);
774
+ rb_define_method(cFilter, "add", bloom_add, 1);
775
+ rb_define_method(cFilter, "<<", bloom_add, 1);
776
+ rb_define_method(cFilter, "add_if_absent", bloom_add_if_absent, 1);
777
+ rb_define_method(cFilter, "include?", bloom_include, 1);
778
+ rb_define_method(cFilter, "member?", bloom_include, 1);
779
+ rb_define_method(cFilter, "clear", bloom_clear, 0);
780
+ rb_define_method(cFilter, "stats", bloom_stats, 0);
781
+ rb_define_method(cFilter, "count", bloom_count, 0);
782
+ rb_define_method(cFilter, "size", bloom_count, 0);
783
+ rb_define_method(cFilter, "num_layers", bloom_num_layers, 0);
784
+ rb_define_method(cFilter, "merge!", bloom_merge, 1);
785
+ rb_define_method(cFilter, "dump", bloom_dump, 0);
786
+
787
+ rb_define_singleton_method(cFilter, "load", bloom_load, 1);
271
788
  }