fast_bloom_filter 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +75 -0
- data/README.md +138 -48
- data/ext/fast_bloom_filter/fast_bloom_filter.c +733 -216
- data/lib/fast_bloom_filter/version.rb +1 -1
- data/lib/fast_bloom_filter.rb +13 -13
- metadata +12 -12
|
@@ -1,271 +1,788 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* FastBloomFilter - High-performance Bloom Filter implementation for Ruby
|
|
3
|
-
* Copyright (c) 2025
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
1
|
#include <ruby.h>
|
|
7
2
|
#include <stdint.h>
|
|
8
3
|
#include <string.h>
|
|
9
4
|
#include <stdlib.h>
|
|
10
5
|
#include <math.h>
|
|
11
6
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
return
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
static
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
7
|
+
/* Read 8 bytes at p as a native-endian uint64_t; memcpy avoids alignment UB. */
static inline uint64_t load_u64(const void *p) {
  uint64_t value;
  memcpy(&value, p, sizeof value);
  return value;
}
|
|
12
|
+
|
|
13
|
+
/* Number of set bits in x; uses a compiler intrinsic when one is available. */
static inline size_t popcount64(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return (size_t)__builtin_popcountll(x);
#elif defined(_MSC_VER) && defined(_M_X64)
  return (size_t)__popcnt64(x);
#else
  /* Portable SWAR fallback: pairwise sums, then fold bytes with a multiply. */
  x -= (x >> 1) & 0x5555555555555555ULL;
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  return (size_t)((x * 0x0101010101010101ULL) >> 56);
#endif
}
|
|
25
|
+
|
|
26
|
+
/*
 * Rotate x left by r bits.
 *
 * Fix: the original computed `x >> (64 - r)`, which shifts by the full word
 * width when r == 0 (or a multiple of 64) — undefined behavior in C.  The
 * rotate amount is now masked to 0..63 and the zero case short-circuits.
 * All current callers pass constants in 1..63, so their behavior is
 * unchanged.
 */
static inline uint64_t rotl64(uint64_t x, int r) {
  unsigned s = (unsigned)r & 63u;
  if (s == 0)
    return x;
  return (x << s) | (x >> (64 - s));
}
|
|
29
|
+
|
|
30
|
+
/* Store v at dst in little-endian byte order, independent of host order. */
static inline void write_le64(uint8_t *dst, uint64_t v) {
  for (int i = 0; i < 8; i++)
    dst[i] = (uint8_t)(v >> (8 * i));
}
|
|
40
|
+
|
|
41
|
+
/* Load a little-endian 64-bit value from src, independent of host order. */
static inline uint64_t read_le64(const uint8_t *src) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; i--)
    v = (v << 8) | src[i];
  return v;
}
|
|
46
|
+
|
|
47
|
+
/* Store v at dst in little-endian byte order (32-bit variant). */
static inline void write_le32(uint8_t *dst, uint32_t v) {
  for (int i = 0; i < 4; i++)
    dst[i] = (uint8_t)(v >> (8 * i));
}
|
|
53
|
+
|
|
54
|
+
/* Load a little-endian 32-bit value from src. */
static inline uint32_t read_le32(const uint8_t *src) {
  uint32_t v = 0;
  for (int i = 3; i >= 0; i--)
    v = (v << 8) | src[i];
  return v;
}
|
|
58
|
+
|
|
59
|
+
/* Serialize a double as its IEEE-754 bit pattern, little-endian. */
static inline void write_le_double(uint8_t *dst, double v) {
  uint64_t raw;
  memcpy(&raw, &v, sizeof raw); /* type-pun via memcpy, not pointer cast */
  write_le64(dst, raw);
}
|
|
64
|
+
|
|
65
|
+
/* Deserialize a little-endian IEEE-754 bit pattern back into a double. */
static inline double read_le_double(const uint8_t *src) {
  uint64_t raw = read_le64(src);
  double value;
  memcpy(&value, &raw, sizeof value);
  return value;
}
|
|
71
|
+
|
|
72
|
+
/*
 * MurmurHash3 x64 128-bit variant (Austin Appleby, public domain), producing
 * two 64-bit lanes used for double hashing.
 *
 * @param key    input bytes (may be any length, including 0)
 * @param len    length of key in bytes
 * @param seed   hash seed (both lanes start from it)
 * @param out_h1 first 64-bit hash lane (written on return)
 * @param out_h2 second 64-bit hash lane (written on return)
 *
 * Fix (documentation): the tail switch relies on case fallthrough to fold
 * the remaining 1..15 bytes; each case is now explicitly annotated so the
 * fallthrough reads as intentional rather than as a missing-break bug.
 */
static void murmur3_128(const uint8_t *key, size_t len, uint64_t seed, uint64_t *out_h1,
                        uint64_t *out_h2) {
  const size_t nblocks = len / 16;
  uint64_t h1 = seed, h2 = seed;
  const uint64_t c1 = 0x87c37b91114253d5ULL;
  const uint64_t c2 = 0x4cf5ad432745937fULL;

  /* Body: mix one 16-byte block at a time. */
  const uint8_t *body = key;
  for (size_t i = 0; i < nblocks; i++) {
    uint64_t k1 = load_u64(body + i * 16);
    uint64_t k2 = load_u64(body + i * 16 + 8);
    k1 *= c1;
    k1 = rotl64(k1, 31);
    k1 *= c2;
    h1 ^= k1;
    h1 = rotl64(h1, 27);
    h1 += h2;
    h1 = h1 * 5 + 0x52dce729;
    k2 *= c2;
    k2 = rotl64(k2, 33);
    k2 *= c1;
    h2 ^= k2;
    h2 = rotl64(h2, 31);
    h2 += h1;
    h2 = h2 * 5 + 0x38495ab5;
  }

  /* Tail: remaining len % 16 bytes; cases intentionally fall through so
   * every remaining byte is folded into k1/k2. */
  const uint8_t *tail = key + nblocks * 16;
  uint64_t k1 = 0, k2 = 0;
  switch (len & 15) {
  case 15:
    k2 ^= (uint64_t)tail[14] << 48; /* fallthrough */
  case 14:
    k2 ^= (uint64_t)tail[13] << 40; /* fallthrough */
  case 13:
    k2 ^= (uint64_t)tail[12] << 32; /* fallthrough */
  case 12:
    k2 ^= (uint64_t)tail[11] << 24; /* fallthrough */
  case 11:
    k2 ^= (uint64_t)tail[10] << 16; /* fallthrough */
  case 10:
    k2 ^= (uint64_t)tail[9] << 8; /* fallthrough */
  case 9:
    k2 ^= (uint64_t)tail[8];
    k2 *= c2;
    k2 = rotl64(k2, 33);
    k2 *= c1;
    h2 ^= k2;
    /* fallthrough */
  case 8:
    k1 ^= (uint64_t)tail[7] << 56; /* fallthrough */
  case 7:
    k1 ^= (uint64_t)tail[6] << 48; /* fallthrough */
  case 6:
    k1 ^= (uint64_t)tail[5] << 40; /* fallthrough */
  case 5:
    k1 ^= (uint64_t)tail[4] << 32; /* fallthrough */
  case 4:
    k1 ^= (uint64_t)tail[3] << 24; /* fallthrough */
  case 3:
    k1 ^= (uint64_t)tail[2] << 16; /* fallthrough */
  case 2:
    k1 ^= (uint64_t)tail[1] << 8; /* fallthrough */
  case 1:
    k1 ^= (uint64_t)tail[0];
    k1 *= c1;
    k1 = rotl64(k1, 31);
    k1 *= c2;
    h1 ^= k1;
  }

  /* Finalization: fold in the length, then fmix64 avalanche on both lanes. */
  h1 ^= (uint64_t)len;
  h2 ^= (uint64_t)len;
  h1 += h2;
  h2 += h1;
  h1 ^= h1 >> 33;
  h1 *= 0xff51afd7ed558ccdULL;
  h1 ^= h1 >> 33;
  h1 *= 0xc4ceb9fe1a85ec53ULL;
  h1 ^= h1 >> 33;
  h2 ^= h2 >> 33;
  h2 *= 0xff51afd7ed558ccdULL;
  h2 ^= h2 >> 33;
  h2 *= 0xc4ceb9fe1a85ec53ULL;
  h2 ^= h2 >> 33;
  h1 += h2;
  h2 += h1;
  *out_h1 = h1;
  *out_h2 = h2;
}
|
|
161
|
+
|
|
162
|
+
/* One fixed-size Bloom filter layer within the scalable filter. */
typedef struct {
  uint8_t *bits;   /* heap-allocated bit array, `size` bytes long */
  size_t size;     /* length of `bits` in BYTES (bit count is size * 8) */
  size_t capacity; /* max insertions before the layer is considered full */
  size_t count;    /* insertions performed so far */
  int num_hashes;  /* k: number of bit probes per element */
} BloomLayer;
|
|
169
|
+
|
|
170
|
+
/* Scalable Bloom filter: a growing stack of BloomLayer filters; new layers
 * are added with larger capacity and a tighter false-positive target. */
typedef struct {
  BloomLayer **layers;     /* array of owned layer pointers, oldest first */
  size_t num_layers;       /* layers currently in use */
  size_t layers_cap;       /* allocated slots in `layers` */
  double error_rate;       /* overall target false-positive rate */
  double tightening;       /* per-layer FPR tightening ratio r, in (0, 1) */
  size_t initial_capacity; /* capacity of the first layer */
  size_t total_count;      /* total insertions across all layers */
} ScalableBloom;
|
|
179
|
+
|
|
180
|
+
#define DEFAULT_ERROR_RATE 0.01     /* default overall false-positive target */
#define DEFAULT_INITIAL_CAP 8192    /* default first-layer capacity */
#define DEFAULT_TIGHTENING 0.85     /* default per-layer FPR tightening ratio */
#define MAX_HASHES 20               /* upper clamp on probes per element */
#define MIN_HASHES 1                /* lower clamp on probes per element */
#define GROWTH_FACTOR 2.0           /* capacity multiplier for each new layer */
#define MURMUR_SEED 0x9747b28cULL   /* fixed MurmurHash3 seed */
#define SERIAL_VERSION 1            /* dump/load wire-format version */
#define HEADER_SIZE 48              /* serialized header size in bytes */
#define LAYER_META 32               /* serialized per-layer metadata bytes */
#define MAX_BITS_ALLOC (1ULL << 36) /* sanity cap on bits per layer (~8 GiB) */
|
|
191
|
+
|
|
87
192
|
/* Set the bit at index pos within the byte array bits. */
static inline void set_bit(uint8_t *bits, size_t pos) {
  size_t byte_index = pos >> 3;
  unsigned mask = 1u << (pos & 7);
  bits[byte_index] |= (uint8_t)mask;
}
|
|
90
195
|
|
|
91
|
-
/* Get bit at position */
|
|
92
196
|
/* Return 1 if the bit at index pos is set, 0 otherwise. */
static inline int get_bit(const uint8_t *bits, size_t pos) {
  unsigned mask = 1u << (pos & 7);
  return (bits[pos >> 3] & mask) ? 1 : 0;
}
|
|
199
|
+
|
|
200
|
+
static BloomLayer *layer_create(size_t capacity, double error_rate) {
|
|
201
|
+
BloomLayer *layer = (BloomLayer *)calloc(1, sizeof(BloomLayer));
|
|
202
|
+
if (!layer)
|
|
203
|
+
return NULL;
|
|
204
|
+
|
|
205
|
+
const double ln2 = 0.693147180559945309417;
|
|
206
|
+
const double ln2_sq = ln2 * ln2;
|
|
207
|
+
|
|
208
|
+
size_t bits_count = (size_t)(-(double)capacity * log(error_rate) / ln2_sq);
|
|
209
|
+
if (bits_count < 64)
|
|
210
|
+
bits_count = 64;
|
|
211
|
+
if (bits_count > MAX_BITS_ALLOC) {
|
|
212
|
+
free(layer);
|
|
213
|
+
return NULL;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
layer->size = (bits_count + 7) / 8;
|
|
217
|
+
layer->capacity = capacity;
|
|
218
|
+
layer->count = 0;
|
|
219
|
+
layer->num_hashes = (int)((double)bits_count / (double)capacity * ln2);
|
|
220
|
+
|
|
221
|
+
if (layer->num_hashes < MIN_HASHES)
|
|
222
|
+
layer->num_hashes = MIN_HASHES;
|
|
223
|
+
if (layer->num_hashes > MAX_HASHES)
|
|
224
|
+
layer->num_hashes = MAX_HASHES;
|
|
225
|
+
|
|
226
|
+
layer->bits = (uint8_t *)calloc(layer->size, sizeof(uint8_t));
|
|
227
|
+
if (!layer->bits) {
|
|
228
|
+
free(layer);
|
|
229
|
+
return NULL;
|
|
230
|
+
}
|
|
231
|
+
return layer;
|
|
94
232
|
}
|
|
95
233
|
|
|
96
|
-
|
|
234
|
+
/* Release a layer and its bit array; a NULL argument is a no-op. */
static void layer_free(BloomLayer *layer) {
  if (!layer)
    return;
  free(layer->bits);
  free(layer);
}
|
|
240
|
+
|
|
241
|
+
static inline int layer_is_full(const BloomLayer *layer) {
|
|
242
|
+
return layer->count >= layer->capacity;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
static inline void layer_hash(const char *data, size_t len, uint64_t *h1, uint64_t *h2) {
|
|
246
|
+
murmur3_128((const uint8_t *)data, len, MURMUR_SEED, h1, h2);
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/* Insert an element: set the k bits selected by double hashing h1 + i*h2. */
static void layer_add(BloomLayer *layer, const char *data, size_t len) {
  uint64_t h1, h2;
  layer_hash(data, len, &h1, &h2);

  const size_t nbits = layer->size * 8;
  for (int i = 0; i < layer->num_hashes; i++)
    set_bit(layer->bits, (size_t)((h1 + (uint64_t)i * h2) % nbits));
  layer->count++;
}
|
|
260
|
+
|
|
261
|
+
static int layer_include(const BloomLayer *layer, const char *data, size_t len) {
|
|
262
|
+
const size_t bits_count = layer->size * 8;
|
|
263
|
+
uint64_t h1, h2;
|
|
264
|
+
layer_hash(data, len, &h1, &h2);
|
|
265
|
+
|
|
266
|
+
for (int i = 0; i < layer->num_hashes; i++) {
|
|
267
|
+
uint64_t combined = h1 + (uint64_t)i * h2;
|
|
268
|
+
if (!get_bit(layer->bits, (size_t)(combined % bits_count)))
|
|
269
|
+
return 0;
|
|
270
|
+
}
|
|
271
|
+
return 1;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
static size_t layer_bits_set(const BloomLayer *layer) {
|
|
275
|
+
size_t count = 0;
|
|
276
|
+
size_t i = 0;
|
|
277
|
+
for (; i + 8 <= layer->size; i += 8) {
|
|
278
|
+
uint64_t word;
|
|
279
|
+
memcpy(&word, layer->bits + i, 8);
|
|
280
|
+
count += popcount64(word);
|
|
281
|
+
}
|
|
282
|
+
for (; i < layer->size; i++)
|
|
283
|
+
count += popcount64((uint64_t)layer->bits[i]);
|
|
284
|
+
return count;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
static double layer_error_rate(double total_fpr, double r, size_t index) {
|
|
288
|
+
return total_fpr * (1.0 - r) * pow(r, (double)index);
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
static double layer_estimated_fpr(const BloomLayer *layer) {
|
|
292
|
+
double m = (double)(layer->size * 8);
|
|
293
|
+
double k = (double)layer->num_hashes;
|
|
294
|
+
double n = (double)layer->count;
|
|
295
|
+
return pow(1.0 - exp(-k * n / m), k);
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/*
 * Append a fresh layer: capacity grows by GROWTH_FACTOR over the previous
 * layer, and the target FPR shrinks along the tightening series.  Grows the
 * pointer array geometrically when needed.  Returns the new layer, or NULL
 * on allocation failure (the filter is left unchanged in that case).
 */
static BloomLayer *scalable_add_layer(ScalableBloom *sb) {
  size_t cap = sb->initial_capacity;
  if (sb->num_layers > 0)
    cap = (size_t)(sb->layers[sb->num_layers - 1]->capacity * GROWTH_FACTOR);

  double fpr = layer_error_rate(sb->error_rate, sb->tightening, sb->num_layers);
  if (fpr < 1e-15)
    fpr = 1e-15; /* keep layer_create's log() argument sane after many layers */

  BloomLayer *layer = layer_create(cap, fpr);
  if (!layer)
    return NULL;

  if (sb->num_layers >= sb->layers_cap) {
    size_t slots = sb->layers_cap ? sb->layers_cap * 2 : 4;
    BloomLayer **grown = (BloomLayer **)realloc(sb->layers, slots * sizeof *grown);
    if (!grown) {
      layer_free(layer); /* realloc failure leaves sb->layers valid */
      return NULL;
    }
    sb->layers = grown;
    sb->layers_cap = slots;
  }

  sb->layers[sb->num_layers++] = layer;
  return layer;
}
|
|
328
|
+
|
|
329
|
+
static void bloom_free_scalable(void *ptr) {
|
|
330
|
+
ScalableBloom *sb = (ScalableBloom *)ptr;
|
|
331
|
+
for (size_t i = 0; i < sb->num_layers; i++)
|
|
332
|
+
layer_free(sb->layers[i]);
|
|
333
|
+
free(sb->layers);
|
|
334
|
+
free(sb);
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
static size_t bloom_memsize_scalable(const void *ptr) {
|
|
338
|
+
const ScalableBloom *sb = (const ScalableBloom *)ptr;
|
|
339
|
+
size_t total = sizeof(ScalableBloom);
|
|
340
|
+
total += sb->layers_cap * sizeof(BloomLayer *);
|
|
341
|
+
for (size_t i = 0; i < sb->num_layers; i++)
|
|
342
|
+
total += sizeof(BloomLayer) + sb->layers[i]->size;
|
|
343
|
+
return total;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
/* TypedData descriptor: no mark function (the struct holds no VALUEs),
 * custom free and memsize hooks, freed eagerly during GC. */
static const rb_data_type_t scalable_bloom_type = {
    "ScalableBloomFilter",
    {NULL, bloom_free_scalable, bloom_memsize_scalable},
    NULL,
    NULL,
    RUBY_TYPED_FREE_IMMEDIATELY};
|
|
352
|
+
|
|
97
353
|
/* Allocator: wrap a zeroed ScalableBloom; layers are created in #initialize
 * (or by Filter.load for deserialized objects). */
static VALUE bloom_alloc(VALUE klass) {
  ScalableBloom *sb = (ScalableBloom *)calloc(1, sizeof(*sb));
  if (sb == NULL)
    rb_raise(rb_eNoMemError, "failed to allocate ScalableBloom");
  return TypedData_Wrap_Struct(klass, &scalable_bloom_type, sb);
}
|
|
359
|
+
|
|
113
360
|
/*
 * Filter#initialize(error_rate: 0.01, initial_capacity: 8192, tightening: 0.85)
 *
 * Validates the options, stores them, and builds the first layer.
 * Raises ArgumentError on out-of-range options and NoMemoryError when the
 * initial layer cannot be allocated.
 *
 * Fix: initial_capacity is now read into a signed long and rejected when
 * non-positive.  Previously NUM2LONG's result was cast straight to size_t,
 * so a negative value wrapped to a huge capacity and sailed past the
 * `initial_capacity == 0` check.
 */
static VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
  VALUE opts = Qnil;

  if (argc == 0) {
    /* all defaults */
  } else if (argc == 1 && RB_TYPE_P(argv[0], T_HASH)) {
    opts = argv[0];
  } else {
    rb_raise(rb_eArgError,
             "wrong number of arguments (given %d, expected 0 or keyword arguments)", argc);
  }

  double error_rate = DEFAULT_ERROR_RATE;
  size_t initial_capacity = DEFAULT_INITIAL_CAP;
  double tightening = DEFAULT_TIGHTENING;

  if (!NIL_P(opts)) {
    VALUE v;
    v = rb_hash_aref(opts, ID2SYM(rb_intern("error_rate")));
    if (!NIL_P(v))
      error_rate = NUM2DBL(v);
    v = rb_hash_aref(opts, ID2SYM(rb_intern("initial_capacity")));
    if (!NIL_P(v)) {
      long cap = NUM2LONG(v);
      if (cap <= 0) /* reject before the size_t cast can wrap */
        rb_raise(rb_eArgError, "initial_capacity must be positive");
      initial_capacity = (size_t)cap;
    }
    v = rb_hash_aref(opts, ID2SYM(rb_intern("tightening")));
    if (!NIL_P(v))
      tightening = NUM2DBL(v);
  }

  if (error_rate <= 0 || error_rate >= 1)
    rb_raise(rb_eArgError, "error_rate must be between 0 and 1 (exclusive)");
  if (initial_capacity == 0)
    rb_raise(rb_eArgError, "initial_capacity must be positive");
  if (tightening <= 0 || tightening >= 1)
    rb_raise(rb_eArgError, "tightening must be between 0 and 1 (exclusive)");

  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  sb->error_rate = error_rate;
  sb->initial_capacity = initial_capacity;
  sb->tightening = tightening;
  sb->total_count = 0;

  if (!scalable_add_layer(sb))
    rb_raise(rb_eNoMemError, "failed to allocate initial layer");

  return self;
}
|
|
150
408
|
|
|
151
|
-
/*
|
|
152
|
-
* Add element to filter
|
|
153
|
-
*/
|
|
154
409
|
/*
 * Filter#add / #<< — insert str into the newest layer, growing a new layer
 * when the active one is at capacity.  Always returns true.
 *
 * Fix: the original unconditionally read sb->layers[sb->num_layers - 1];
 * with num_layers == 0 (object created via `allocate` without #initialize,
 * or after a failed grow) that indexes layers[-1] — undefined behavior.
 * A missing layer is now created on demand instead.
 */
static VALUE bloom_add(VALUE self, VALUE str) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  str = StringValue(str);

  BloomLayer *active = sb->num_layers > 0 ? sb->layers[sb->num_layers - 1] : NULL;
  if (active == NULL || layer_is_full(active)) {
    active = scalable_add_layer(sb);
    if (!active)
      rb_raise(rb_eNoMemError, "failed to allocate new layer");
  }

  layer_add(active, RSTRING_PTR(str), RSTRING_LEN(str));
  sb->total_count++;

  return Qtrue;
}
|
|
427
|
+
|
|
428
|
+
/*
 * Filter#add_if_absent — return false without inserting when str already
 * (probabilistically) appears in any layer; otherwise insert and return true.
 * Layers are probed newest-first; empty layers are skipped.
 *
 * Fix: same out-of-bounds hazard as bloom_add — the active layer lookup no
 * longer indexes layers[-1] when num_layers == 0; a layer is created on
 * demand instead.
 */
static VALUE bloom_add_if_absent(VALUE self, VALUE str) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  str = StringValue(str);
  const char *data = RSTRING_PTR(str);
  size_t len = RSTRING_LEN(str);

  for (size_t i = sb->num_layers; i > 0; i--) {
    if (sb->layers[i - 1]->count == 0)
      continue; /* never-written layer cannot contain anything */
    if (layer_include(sb->layers[i - 1], data, len))
      return Qfalse;
  }

  BloomLayer *active = sb->num_layers > 0 ? sb->layers[sb->num_layers - 1] : NULL;
  if (active == NULL || layer_is_full(active)) {
    active = scalable_add_layer(sb);
    if (!active)
      rb_raise(rb_eNoMemError, "failed to allocate new layer");
  }

  layer_add(active, data, len);
  sb->total_count++;

  return Qtrue;
}
|
|
172
455
|
|
|
173
|
-
/*
|
|
174
|
-
* Check if element might be in filter
|
|
175
|
-
*/
|
|
176
456
|
/* Filter#include? / #member? — probe layers newest-first; any layer hit
 * means "possibly present" (Bloom semantics: false positives possible,
 * false negatives not).  Empty layers are skipped. */
static VALUE bloom_include(VALUE self, VALUE str) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  str = StringValue(str);
  const char *data = RSTRING_PTR(str);
  size_t len = RSTRING_LEN(str);

  size_t i = sb->num_layers;
  while (i-- > 0) {
    const BloomLayer *l = sb->layers[i];
    if (l->count != 0 && layer_include(l, data, len))
      return Qtrue;
  }
  return Qfalse;
}
|
|
196
472
|
|
|
197
|
-
/*
|
|
198
|
-
* Clear all bits
|
|
199
|
-
*/
|
|
200
473
|
/* Filter#clear — drop every layer and restart with a single fresh layer;
 * the configured error_rate/tightening/initial_capacity are kept. */
static VALUE bloom_clear(VALUE self) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  while (sb->num_layers > 0)
    layer_free(sb->layers[--sb->num_layers]);
  sb->total_count = 0;

  if (!scalable_add_layer(sb))
    rb_raise(rb_eNoMemError, "failed to allocate layer after clear");

  return Qnil;
}
|
|
207
487
|
|
|
208
|
-
/*
|
|
209
|
-
* Get filter statistics
|
|
210
|
-
*/
|
|
211
488
|
/*
 * Filter#stats — return a Hash of aggregate and per-layer statistics:
 * counts, byte/bit sizes, fill ratios, and target vs. estimated
 * false-positive rates.  The combined estimated FPR is
 * 1 - prod_i(1 - fpr_i) over all layers.
 */
static VALUE bloom_stats(VALUE self) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  size_t total_bytes = 0;
  size_t total_bits = 0;
  size_t total_bits_set = 0;
  double combined_fpr = 1.0; /* running product of per-layer (1 - fpr_i) */

  VALUE layers_ary = rb_ary_new_capa((long)sb->num_layers);

  for (size_t i = 0; i < sb->num_layers; i++) {
    BloomLayer *l = sb->layers[i];
    size_t bs = layer_bits_set(l);
    size_t tb = l->size * 8;
    double est_fpr = layer_estimated_fpr(l);

    total_bytes += l->size;
    total_bits += tb;
    total_bits_set += bs;
    combined_fpr *= (1.0 - est_fpr);

    /* Per-layer detail hash appended to :layers. */
    VALUE lh = rb_hash_new();
    rb_hash_aset(lh, ID2SYM(rb_intern("layer")), LONG2NUM(i));
    rb_hash_aset(lh, ID2SYM(rb_intern("capacity")), LONG2NUM(l->capacity));
    rb_hash_aset(lh, ID2SYM(rb_intern("count")), LONG2NUM(l->count));
    rb_hash_aset(lh, ID2SYM(rb_intern("size_bytes")), LONG2NUM(l->size));
    rb_hash_aset(lh, ID2SYM(rb_intern("num_hashes")), INT2NUM(l->num_hashes));
    rb_hash_aset(lh, ID2SYM(rb_intern("bits_set")), LONG2NUM(bs));
    rb_hash_aset(lh, ID2SYM(rb_intern("total_bits")), LONG2NUM(tb));
    rb_hash_aset(lh, ID2SYM(rb_intern("fill_ratio")), DBL2NUM((double)bs / tb));
    rb_hash_aset(lh, ID2SYM(rb_intern("target_error_rate")),
                 DBL2NUM(layer_error_rate(sb->error_rate, sb->tightening, i)));
    rb_hash_aset(lh, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(est_fpr));
    rb_ary_push(layers_ary, lh);
  }

  double total_est_fpr = 1.0 - combined_fpr;

  /* Top-level aggregate hash. */
  VALUE hash = rb_hash_new();
  rb_hash_aset(hash, ID2SYM(rb_intern("total_count")), LONG2NUM(sb->total_count));
  rb_hash_aset(hash, ID2SYM(rb_intern("num_layers")), LONG2NUM(sb->num_layers));
  rb_hash_aset(hash, ID2SYM(rb_intern("total_bytes")), LONG2NUM(total_bytes));
  rb_hash_aset(hash, ID2SYM(rb_intern("total_bits")), LONG2NUM(total_bits));
  rb_hash_aset(hash, ID2SYM(rb_intern("total_bits_set")), LONG2NUM(total_bits_set));
  rb_hash_aset(hash, ID2SYM(rb_intern("fill_ratio")),
               DBL2NUM((double)total_bits_set / total_bits));
  rb_hash_aset(hash, ID2SYM(rb_intern("target_error_rate")), DBL2NUM(sb->error_rate));
  rb_hash_aset(hash, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(total_est_fpr));
  rb_hash_aset(hash, ID2SYM(rb_intern("layers")), layers_ary);
  return hash;
}
|
|
238
540
|
|
|
239
|
-
|
|
240
|
-
*
|
|
241
|
-
|
|
541
|
+
/* Filter#count / #size — total insertions performed across all layers. */
static VALUE bloom_count(VALUE self) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
  return LONG2NUM(sb->total_count);
}
|
|
546
|
+
|
|
547
|
+
/* Filter#num_layers — how many Bloom layers the filter has grown to. */
static VALUE bloom_num_layers(VALUE self) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
  return LONG2NUM(sb->num_layers);
}
|
|
552
|
+
|
|
242
553
|
/*
 * Filter#merge! — fold `other` into self (set union); returns self.
 *
 * Both filters must share error_rate and tightening, otherwise layer
 * geometry differs and a bitwise union would be meaningless.  Layer i of
 * `other` is OR-ed into layer i of self when both exist with identical
 * size and hash count; otherwise a deep copy of the source layer is
 * appended.  Counts saturate at capacity / SIZE_MAX and become upper
 * bounds after a union.
 */
static VALUE bloom_merge(VALUE self, VALUE other) {
  ScalableBloom *sb1, *sb2;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb1);
  TypedData_Get_Struct(other, ScalableBloom, &scalable_bloom_type, sb2);

  if (fabs(sb1->error_rate - sb2->error_rate) > 1e-10)
    rb_raise(rb_eArgError, "cannot merge filters with different error rates (%.6f vs %.6f)",
             sb1->error_rate, sb2->error_rate);
  if (fabs(sb1->tightening - sb2->tightening) > 1e-10)
    rb_raise(rb_eArgError,
             "cannot merge filters with different tightening ratios (%.6f vs %.6f)",
             sb1->tightening, sb2->tightening);

  for (size_t i = 0; i < sb2->num_layers; i++) {
    BloomLayer *src = sb2->layers[i];
    int merged = 0;

    if (i < sb1->num_layers) {
      BloomLayer *dst = sb1->layers[i];
      if (dst->size == src->size && dst->num_hashes == src->num_hashes) {
        /* Same geometry: bitwise OR, 8 bytes at a time, then a byte tail. */
        size_t j = 0;
        for (; j + 8 <= dst->size; j += 8) {
          uint64_t a, b;
          memcpy(&a, dst->bits + j, 8);
          memcpy(&b, src->bits + j, 8);
          a |= b;
          memcpy(dst->bits + j, &a, 8);
        }
        for (; j < dst->size; j++)
          dst->bits[j] |= src->bits[j];

        /* count is only an upper bound after a union; cap at capacity. */
        size_t new_count = dst->count + src->count;
        dst->count = new_count < dst->capacity ? new_count : dst->capacity;
        merged = 1;
      }
    }

    if (!merged) {
      /* Geometry mismatch or no layer i in self: append a deep copy. */
      BloomLayer *copy = (BloomLayer *)calloc(1, sizeof(BloomLayer));
      if (!copy)
        rb_raise(rb_eNoMemError, "failed to allocate layer copy");

      copy->size = src->size;
      copy->capacity = src->capacity;
      copy->count = src->count;
      copy->num_hashes = src->num_hashes;
      copy->bits = (uint8_t *)malloc(src->size);
      if (!copy->bits) {
        free(copy);
        rb_raise(rb_eNoMemError, "failed to allocate bits");
      }
      memcpy(copy->bits, src->bits, src->size);

      /* Grow the destination's layer slot array if necessary. */
      if (sb1->num_layers >= sb1->layers_cap) {
        size_t new_slots = sb1->layers_cap == 0 ? 4 : sb1->layers_cap * 2;
        BloomLayer **tmp =
            (BloomLayer **)realloc(sb1->layers, new_slots * sizeof(BloomLayer *));
        if (!tmp) {
          layer_free(copy);
          rb_raise(rb_eNoMemError, "realloc failed");
        }
        sb1->layers = tmp;
        sb1->layers_cap = new_slots;
      }
      sb1->layers[sb1->num_layers++] = copy;
    }
  }

  /* Saturating add of the element counters. */
  size_t new_total = sb1->total_count + sb2->total_count;
  sb1->total_count = new_total >= sb1->total_count ? new_total : SIZE_MAX;
  return self;
}
|
|
257
625
|
|
|
626
|
+
/*
 * Filter#dump — serialize the filter to a binary Ruby String.
 *
 * Wire format (all little-endian, version SERIAL_VERSION):
 *   header (HEADER_SIZE = 48 bytes): version u32, reserved u32,
 *     error_rate f64, tightening f64, initial_capacity u64,
 *     total_count u64, num_layers u64
 *   then per layer (LAYER_META = 32 bytes + payload): capacity u64,
 *     count u64, size u64, num_hashes u32, reserved u32, and `size`
 *     raw bit-array bytes.
 */
static VALUE bloom_dump(VALUE self) {
  ScalableBloom *sb;
  TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);

  /* Pre-compute the exact serialized size so the String is built once. */
  size_t total_size = HEADER_SIZE;
  for (size_t i = 0; i < sb->num_layers; i++)
    total_size += LAYER_META + sb->layers[i]->size;

  VALUE str = rb_str_buf_new((long)total_size);
  rb_str_set_len(str, (long)total_size);
  uint8_t *buf = (uint8_t *)RSTRING_PTR(str);
  size_t off = 0;

  write_le32(buf + off, SERIAL_VERSION);
  off += 4;
  write_le32(buf + off, 0); /* reserved / alignment padding */
  off += 4;
  write_le_double(buf + off, sb->error_rate);
  off += 8;
  write_le_double(buf + off, sb->tightening);
  off += 8;
  write_le64(buf + off, (uint64_t)sb->initial_capacity);
  off += 8;
  write_le64(buf + off, (uint64_t)sb->total_count);
  off += 8;
  write_le64(buf + off, (uint64_t)sb->num_layers);
  off += 8;

  for (size_t i = 0; i < sb->num_layers; i++) {
    BloomLayer *l = sb->layers[i];
    write_le64(buf + off, (uint64_t)l->capacity);
    off += 8;
    write_le64(buf + off, (uint64_t)l->count);
    off += 8;
    write_le64(buf + off, (uint64_t)l->size);
    off += 8;
    write_le32(buf + off, (uint32_t)l->num_hashes);
    off += 4;
    write_le32(buf + off, 0); /* reserved / alignment padding */
    off += 4;
    memcpy(buf + off, l->bits, l->size);
    off += l->size;
  }

  return str;
}
|
|
672
|
+
|
|
673
|
+
static VALUE bloom_load(VALUE klass, VALUE data) {
|
|
674
|
+
Check_Type(data, T_STRING);
|
|
675
|
+
|
|
676
|
+
const uint8_t *buf = (const uint8_t *)RSTRING_PTR(data);
|
|
677
|
+
size_t data_len = (size_t)RSTRING_LEN(data);
|
|
678
|
+
|
|
679
|
+
if (data_len < HEADER_SIZE)
|
|
680
|
+
rb_raise(rb_eArgError, "data too short for bloom filter header");
|
|
681
|
+
|
|
682
|
+
size_t off = 0;
|
|
683
|
+
uint32_t version = read_le32(buf + off);
|
|
684
|
+
off += 4;
|
|
685
|
+
if (version != SERIAL_VERSION)
|
|
686
|
+
rb_raise(rb_eArgError, "unsupported serialization version: %u", version);
|
|
687
|
+
off += 4;
|
|
688
|
+
|
|
689
|
+
VALUE obj = bloom_alloc(klass);
|
|
690
|
+
ScalableBloom *sb;
|
|
691
|
+
TypedData_Get_Struct(obj, ScalableBloom, &scalable_bloom_type, sb);
|
|
692
|
+
|
|
693
|
+
sb->error_rate = read_le_double(buf + off);
|
|
694
|
+
off += 8;
|
|
695
|
+
sb->tightening = read_le_double(buf + off);
|
|
696
|
+
off += 8;
|
|
697
|
+
sb->initial_capacity = (size_t)read_le64(buf + off);
|
|
698
|
+
off += 8;
|
|
699
|
+
sb->total_count = (size_t)read_le64(buf + off);
|
|
700
|
+
off += 8;
|
|
701
|
+
|
|
702
|
+
size_t num_layers = (size_t)read_le64(buf + off);
|
|
703
|
+
off += 8;
|
|
704
|
+
|
|
705
|
+
if (sb->error_rate <= 0 || sb->error_rate >= 1)
|
|
706
|
+
rb_raise(rb_eArgError, "invalid error_rate in serialized data");
|
|
707
|
+
if (sb->tightening <= 0 || sb->tightening >= 1)
|
|
708
|
+
rb_raise(rb_eArgError, "invalid tightening in serialized data");
|
|
709
|
+
if (num_layers > 1000)
|
|
710
|
+
rb_raise(rb_eArgError, "unreasonable number of layers: %zu", num_layers);
|
|
711
|
+
|
|
712
|
+
sb->layers_cap = num_layers < 4 ? 4 : num_layers;
|
|
713
|
+
sb->layers = (BloomLayer **)calloc(sb->layers_cap, sizeof(BloomLayer *));
|
|
714
|
+
if (!sb->layers)
|
|
715
|
+
rb_raise(rb_eNoMemError, "failed to allocate layers array");
|
|
716
|
+
|
|
717
|
+
for (size_t i = 0; i < num_layers; i++) {
|
|
718
|
+
if (off + LAYER_META > data_len) {
|
|
719
|
+
for (size_t j = 0; j < sb->num_layers; j++)
|
|
720
|
+
layer_free(sb->layers[j]);
|
|
721
|
+
sb->num_layers = 0;
|
|
722
|
+
rb_raise(rb_eArgError, "data truncated at layer %zu metadata", i);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
BloomLayer *l = (BloomLayer *)calloc(1, sizeof(BloomLayer));
|
|
726
|
+
if (!l) {
|
|
727
|
+
for (size_t j = 0; j < sb->num_layers; j++)
|
|
728
|
+
layer_free(sb->layers[j]);
|
|
729
|
+
sb->num_layers = 0;
|
|
730
|
+
rb_raise(rb_eNoMemError, "failed to allocate layer");
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
l->capacity = (size_t)read_le64(buf + off);
|
|
734
|
+
off += 8;
|
|
735
|
+
l->count = (size_t)read_le64(buf + off);
|
|
736
|
+
off += 8;
|
|
737
|
+
l->size = (size_t)read_le64(buf + off);
|
|
738
|
+
off += 8;
|
|
739
|
+
l->num_hashes = (int)read_le32(buf + off);
|
|
740
|
+
off += 4;
|
|
741
|
+
off += 4;
|
|
742
|
+
|
|
743
|
+
if (l->size > (1ULL << 30) || off + l->size > data_len) {
|
|
744
|
+
free(l);
|
|
745
|
+
for (size_t j = 0; j < sb->num_layers; j++)
|
|
746
|
+
layer_free(sb->layers[j]);
|
|
747
|
+
sb->num_layers = 0;
|
|
748
|
+
rb_raise(rb_eArgError, "invalid or truncated layer %zu", i);
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
l->bits = (uint8_t *)malloc(l->size);
|
|
752
|
+
if (!l->bits) {
|
|
753
|
+
free(l);
|
|
754
|
+
for (size_t j = 0; j < sb->num_layers; j++)
|
|
755
|
+
layer_free(sb->layers[j]);
|
|
756
|
+
sb->num_layers = 0;
|
|
757
|
+
rb_raise(rb_eNoMemError, "failed to allocate bits");
|
|
758
|
+
}
|
|
759
|
+
memcpy(l->bits, buf + off, l->size);
|
|
760
|
+
off += l->size;
|
|
761
|
+
|
|
762
|
+
sb->layers[sb->num_layers++] = l;
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
return obj;
|
|
766
|
+
}
|
|
767
|
+
|
|
258
768
|
/*
 * Extension entry point: defines FastBloomFilter::Filter and binds its
 * methods, including the aliases << (add), member? (include?) and
 * size (count), plus the Filter.load class-level deserializer.
 */
void Init_fast_bloom_filter(void) {
  VALUE mFastBloomFilter = rb_define_module("FastBloomFilter");
  VALUE cFilter = rb_define_class_under(mFastBloomFilter, "Filter", rb_cObject);

  rb_define_alloc_func(cFilter, bloom_alloc);
  rb_define_method(cFilter, "initialize", bloom_initialize, -1);
  rb_define_method(cFilter, "add", bloom_add, 1);
  rb_define_method(cFilter, "<<", bloom_add, 1);
  rb_define_method(cFilter, "add_if_absent", bloom_add_if_absent, 1);
  rb_define_method(cFilter, "include?", bloom_include, 1);
  rb_define_method(cFilter, "member?", bloom_include, 1);
  rb_define_method(cFilter, "clear", bloom_clear, 0);
  rb_define_method(cFilter, "stats", bloom_stats, 0);
  rb_define_method(cFilter, "count", bloom_count, 0);
  rb_define_method(cFilter, "size", bloom_count, 0);
  rb_define_method(cFilter, "num_layers", bloom_num_layers, 0);
  rb_define_method(cFilter, "merge!", bloom_merge, 1);
  rb_define_method(cFilter, "dump", bloom_dump, 0);

  rb_define_singleton_method(cFilter, "load", bloom_load, 1);
}