RubyGems - fast_bloom_filter - Versions diffs - 1.0.0 → 2.1.0 - Mend

fast_bloom_filter 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/CHANGELOG.md +75 -0
data/README.md +138 -48
data/ext/fast_bloom_filter/fast_bloom_filter.c +733 -216
data/lib/fast_bloom_filter/version.rb +1 -1
data/lib/fast_bloom_filter.rb +13 -13
metadata +12 -12

data/ext/fast_bloom_filter/fast_bloom_filter.c CHANGED

@@ -1,271 +1,788 @@
-/*
- * FastBloomFilter - High-performance Bloom Filter implementation for Ruby
- * Copyright (c) 2025
- */
 #include <ruby.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
-/* Bloom Filter structure */
-typedef struct {
-    uint8_t *bits;      /* Bit array */
-    size_t size;        /* Size in bytes */
-    size_t capacity;    /* Expected number of elements */
-    int num_hashes;     /* Number of hash functions */
-} BloomFilter;
-/* GC: Free memory */
-static void bloom_free(void *ptr) {
-    BloomFilter *bloom = (BloomFilter *)ptr;
-    if (bloom->bits) {
-        free(bloom->bits);
-    }
-    free(bloom);
-}
-/* GC: Report memory size */
-static size_t bloom_memsize(const void *ptr) {
-    const BloomFilter *bloom = (const BloomFilter *)ptr;
-    return sizeof(BloomFilter) + bloom->size;
-}
-static const rb_data_type_t bloom_type = {
-    "BloomFilter",
-    {NULL, bloom_free, bloom_memsize},
-    NULL, NULL,
-    RUBY_TYPED_FREE_IMMEDIATELY
-};
-/*
- * MurmurHash3 32-bit implementation
- */
-static uint32_t murmur3_32(const uint8_t *key, size_t len, uint32_t seed) {
-    uint32_t h = seed;
-    const uint32_t c1 = 0xcc9e2d51;
-    const uint32_t c2 = 0x1b873593;
-    const int nblocks = len / 4;
-    const uint32_t *blocks = (const uint32_t *)(key);
-    for (int i = 0; i < nblocks; i++) {
-        uint32_t k1 = blocks[i];
+/* Read 8 bytes starting at p in host byte order; memcpy keeps the access
+ * valid for any alignment. */
+static inline uint64_t load_u64(const void *p) {
+    uint64_t word;
+    memcpy(&word, p, sizeof word);
+    return word;
+}
+/* Count the 1 bits in x, using a compiler intrinsic when one is available. */
+static inline size_t popcount64(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+    return (size_t)__builtin_popcountll(x);
+#elif defined(_MSC_VER) && defined(_M_X64)
+    return (size_t)__popcnt64(x);
+#else
+    /* Portable fallback: sum bits in 2-, 4- and 8-bit groups, then add the
+     * eight byte sums together with one multiply. */
+    const uint64_t pairs = 0x5555555555555555ULL;
+    const uint64_t quads = 0x3333333333333333ULL;
+    const uint64_t bytes = 0x0F0F0F0F0F0F0F0FULL;
+    const uint64_t ones = 0x0101010101010101ULL;
+    x -= (x >> 1) & pairs;
+    x = (x & quads) + ((x >> 2) & quads);
+    x = (x + (x >> 4)) & bytes;
+    return (size_t)((x * ones) >> 56);
+#endif
+}
+/* Rotate x left by r bits. r must stay in 1..63: r == 0 would shift right by
+ * 64, which is not a valid shift count for a 64-bit value. */
+static inline uint64_t rotl64(uint64_t x, int r) {
+    const uint64_t upper = x << r;
+    const uint64_t wrapped = x >> (64 - r);
+    return upper | wrapped;
+}
+/* Store v into dst[0..7] least-significant byte first, independent of host
+ * byte order. */
+static inline void write_le64(uint8_t *dst, uint64_t v) {
+    for (int i = 0; i < 8; i++) {
+        dst[i] = (uint8_t)(v >> (8 * i));
+    }
+}
+/* Assemble a 64-bit value from src[0..7], where src[0] is the least
+ * significant byte. */
+static inline uint64_t read_le64(const uint8_t *src) {
+    uint64_t value = 0;
+    for (int i = 7; i >= 0; i--) {
+        value = (value << 8) | (uint64_t)src[i];
+    }
+    return value;
+}
+/* Store v into dst[0..3] least-significant byte first. */
+static inline void write_le32(uint8_t *dst, uint32_t v) {
+    for (int i = 0; i < 4; i++) {
+        dst[i] = (uint8_t)(v >> (8 * i));
+    }
+}
+/* Assemble a 32-bit value from src[0..3], where src[0] is the least
+ * significant byte. */
+static inline uint32_t read_le32(const uint8_t *src) {
+    uint32_t value = 0;
+    for (int i = 3; i >= 0; i--) {
+        value = (value << 8) | (uint32_t)src[i];
+    }
+    return value;
+}
+/* Serialize a double as the little-endian bytes of its 64-bit object
+ * representation. */
+static inline void write_le_double(uint8_t *dst, double v) {
+    uint64_t raw;
+    memcpy(&raw, &v, sizeof raw);
+    write_le64(dst, raw);
+}
+/* Inverse of write_le_double(): rebuild a double from 8 little-endian bytes. */
+static inline double read_le_double(const uint8_t *src) {
+    const uint64_t raw = read_le64(src);
+    double value;
+    memcpy(&value, &raw, sizeof raw);
+    return value;
+}
+/*
+ * Hash `len` bytes at `key` into two 64-bit words.
+ *
+ * key:    input bytes (may be unaligned; blocks are read through load_u64).
+ * len:    number of input bytes.
+ * seed:   initial value for both state words.
+ * out_h1: receives the first 64-bit result word.
+ * out_h2: receives the second 64-bit result word.
+ *
+ * Full 16-byte blocks are loaded in host byte order while the tail is
+ * assembled byte by byte, so results for inputs of 16 bytes or more depend on
+ * the host's endianness.
+ * NOTE(review): the constants and structure look like MurmurHash3 x64-128;
+ * confirm against the reference implementation before relying on
+ * cross-implementation compatibility.
+ */
+static void murmur3_128(const uint8_t *key, size_t len, uint64_t seed, uint64_t *out_h1,
+                        uint64_t *out_h2) {
+    const size_t nblocks = len / 16;
+    uint64_t h1 = seed, h2 = seed;
+    const uint64_t c1 = 0x87c37b91114253d5ULL;
+    const uint64_t c2 = 0x4cf5ad432745937fULL;
+    const uint8_t *body = key;
+    /* Body: mix each 16-byte block as two 64-bit lanes. */
+    for (size_t i = 0; i < nblocks; i++) {
+        uint64_t k1 = load_u64(body + i * 16);
+        uint64_t k2 = load_u64(body + i * 16 + 8);
         k1 *= c1;
-        k1 = (k1 << 15) | (k1 >> 17);
+        k1 = rotl64(k1, 31);
         k1 *= c2;
-        h ^= k1;
-        h = (h << 13) | (h >> 19);
-        h = h * 5 + 0xe6546b64;
+        h1 ^= k1;
+        h1 = rotl64(h1, 27);
+        h1 += h2;
+        h1 = h1 * 5 + 0x52dce729;
+        k2 *= c2;
+        k2 = rotl64(k2, 33);
+        k2 *= c1;
+        h2 ^= k2;
+        h2 = rotl64(h2, 31);
+        h2 += h1;
+        h2 = h2 * 5 + 0x38495ab5;
     }
-    const uint8_t *tail = (const uint8_t *)(key + nblocks * 4);
-    uint32_t k1 = 0;
-    switch (len & 3) {
-        case 3: k1 ^= tail[2] << 16;
-        case 2: k1 ^= tail[1] << 8;
-        case 1: k1 ^= tail[0];
-            k1 *= c1;
-            k1 = (k1 << 15) | (k1 >> 17);
-            k1 *= c2;
-            h ^= k1;
+    /* Tail: the remaining 0..15 bytes that did not fill a block. */
+    const uint8_t *tail = key + nblocks * 16;
+    uint64_t k1 = 0, k2 = 0;
+    /* Every case deliberately falls through to the next: entering at
+     * `len & 15` folds in that byte and all lower-indexed tail bytes. */
+    switch (len & 15) {
+    case 15:
+        k2 ^= (uint64_t)tail[14] << 48;
+    case 14:
+        k2 ^= (uint64_t)tail[13] << 40;
+    case 13:
+        k2 ^= (uint64_t)tail[12] << 32;
+    case 12:
+        k2 ^= (uint64_t)tail[11] << 24;
+    case 11:
+        k2 ^= (uint64_t)tail[10] << 16;
+    case 10:
+        k2 ^= (uint64_t)tail[9] << 8;
+    case 9:
+        k2 ^= (uint64_t)tail[8];
+        k2 *= c2;
+        k2 = rotl64(k2, 33);
+        k2 *= c1;
+        h2 ^= k2;
+    case 8:
+        k1 ^= (uint64_t)tail[7] << 56;
+    case 7:
+        k1 ^= (uint64_t)tail[6] << 48;
+    case 6:
+        k1 ^= (uint64_t)tail[5] << 40;
+    case 5:
+        k1 ^= (uint64_t)tail[4] << 32;
+    case 4:
+        k1 ^= (uint64_t)tail[3] << 24;
+    case 3:
+        k1 ^= (uint64_t)tail[2] << 16;
+    case 2:
+        k1 ^= (uint64_t)tail[1] << 8;
+    case 1:
+        k1 ^= (uint64_t)tail[0];
+        k1 *= c1;
+        k1 = rotl64(k1, 31);
+        k1 *= c2;
+        h1 ^= k1;
     }
-    h ^= len;
-    h ^= h >> 16;
-    h *= 0x85ebca6b;
-    h ^= h >> 13;
-    h *= 0xc2b2ae35;
-    h ^= h >> 16;
-    return h;
-}
-/* Set bit at position */
+    /* Finalization: fold in the length, cross-mix the two words, then run the
+     * same shift/multiply avalanche on each word. */
+    h1 ^= (uint64_t)len;
+    h2 ^= (uint64_t)len;
+    h1 += h2;
+    h2 += h1;
+    h1 ^= h1 >> 33;
+    h1 *= 0xff51afd7ed558ccdULL;
+    h1 ^= h1 >> 33;
+    h1 *= 0xc4ceb9fe1a85ec53ULL;
+    h1 ^= h1 >> 33;
+    h2 ^= h2 >> 33;
+    h2 *= 0xff51afd7ed558ccdULL;
+    h2 ^= h2 >> 33;
+    h2 *= 0xc4ceb9fe1a85ec53ULL;
+    h2 ^= h2 >> 33;
+    h1 += h2;
+    h2 += h1;
+    *out_h1 = h1;
+    *out_h2 = h2;
+}
+/* One fixed-size bloom filter; a ScalableBloom holds an array of these. */
+typedef struct {
+    /* Heap-allocated bit array of `size` bytes. */
+    uint8_t *bits;
+    /* Length of `bits` in bytes. */
+    size_t size;
+    /* Insert count at which layer_is_full() reports the layer as full. */
+    size_t capacity;
+    /* Inserts recorded so far (incremented by layer_add()). */
+    size_t count;
+    /* Number of bit positions set/tested per key. */
+    int num_hashes;
+} BloomLayer;
+/* A growable bloom filter: a list of layers where new keys go to the last one. */
+typedef struct {
+    /* Array of owned layer pointers; the first `num_layers` slots are valid. */
+    BloomLayer **layers;
+    /* Number of layers currently in use. */
+    size_t num_layers;
+    /* Number of slots allocated in `layers`. */
+    size_t layers_cap;
+    /* Overall target false-positive rate requested for the filter. */
+    double error_rate;
+    /* Ratio r used by layer_error_rate() to shrink each layer's target rate. */
+    double tightening;
+    /* Capacity given to the first layer. */
+    size_t initial_capacity;
+    /* Running total of inserts across all layers. */
+    size_t total_count;
+} ScalableBloom;
+/* Defaults applied by Filter#initialize when an option is not given. */
+#define DEFAULT_ERROR_RATE  0.01
+#define DEFAULT_INITIAL_CAP 8192
+#define DEFAULT_TIGHTENING  0.85
+/* Clamp range for the per-layer hash count computed in layer_create(). */
+#define MAX_HASHES          20
+#define MIN_HASHES          1
+/* Each new layer's capacity is the previous layer's capacity times this. */
+#define GROWTH_FACTOR       2.0
+/* Seed passed to murmur3_128() for every key. */
+#define MURMUR_SEED         0x9747b28cULL
+/* Serialization format: version tag, fixed header size in bytes
+ * (4+4+8+8+8+8+8) and per-layer metadata size in bytes (8+8+8+4+4). */
+#define SERIAL_VERSION      1
+#define HEADER_SIZE         48
+#define LAYER_META          32
+/* Upper bound on the number of bits layer_create() will allocate for a layer. */
+#define MAX_BITS_ALLOC      (1ULL << 36)
+/* Set bit number `pos` in `bits`: byte pos / 8, bit pos % 8 within that byte.
+ * The caller must keep `pos` inside the array. */
 static inline void set_bit(uint8_t *bits, size_t pos) {
-    bits[pos / 8] |= (1 << (pos % 8));
+    bits[pos >> 3] |= (uint8_t)(1u << (pos & 7));
 }
-/* Get bit at position */
+/* Return 1 if bit number `pos` in `bits` is set, 0 otherwise. The caller must
+ * keep `pos` inside the array. */
 static inline int get_bit(const uint8_t *bits, size_t pos) {
-    return (bits[pos / 8] & (1 << (pos % 8))) != 0;
+    return (bits[pos >> 3] & (1u << (pos & 7))) != 0;
+}
+/*
+ * Allocate a zero-filled layer sized for `capacity` insertions at
+ * false-positive rate `error_rate`.
+ *
+ * Sizing: m = -n * ln(p) / ln(2)^2 bits (at least 64) and k = (m / n) * ln(2)
+ * hash probes, clamped to MIN_HASHES..MAX_HASHES.
+ *
+ * Returns the new layer, or NULL when capacity is 0, error_rate is not inside
+ * (0, 1), the bit array would exceed MAX_BITS_ALLOC, or allocation fails.
+ * The caller owns the result and releases it with layer_free().
+ */
+static BloomLayer *layer_create(size_t capacity, double error_rate) {
+    /* capacity == 0 would divide by zero when deriving num_hashes; the negated
+     * range test also rejects NaN. */
+    if (capacity == 0 || !(error_rate > 0.0 && error_rate < 1.0))
+        return NULL;
+    const double ln2 = 0.693147180559945309417;
+    const double ln2_sq = ln2 * ln2;
+    /* Range-check while still in floating point: converting a double that
+     * does not fit into size_t is undefined behavior. */
+    const double bits_wanted = -(double)capacity * log(error_rate) / ln2_sq;
+    if (!(bits_wanted <= (double)MAX_BITS_ALLOC) || bits_wanted >= (double)SIZE_MAX)
+        return NULL;
+    size_t bits_count = (size_t)bits_wanted;
+    if (bits_count < 64)
+        bits_count = 64;
+    BloomLayer *layer = (BloomLayer *)calloc(1, sizeof(BloomLayer));
+    if (!layer)
+        return NULL;
+    layer->size = (bits_count + 7) / 8;
+    layer->capacity = capacity;
+    layer->count = 0;
+    layer->num_hashes = (int)((double)bits_count / (double)capacity * ln2);
+    if (layer->num_hashes < MIN_HASHES)
+        layer->num_hashes = MIN_HASHES;
+    if (layer->num_hashes > MAX_HASHES)
+        layer->num_hashes = MAX_HASHES;
+    layer->bits = (uint8_t *)calloc(layer->size, sizeof(uint8_t));
+    if (!layer->bits) {
+        free(layer);
+        return NULL;
+    }
+    return layer;
 }
-/* Allocate BloomFilter object */
+/* Release a layer together with its bit array; a NULL layer is ignored. */
+static void layer_free(BloomLayer *layer) {
+    if (layer == NULL)
+        return;
+    free(layer->bits);
+    free(layer);
+}
+/* Return 1 once the layer has recorded at least `capacity` inserts, else 0. */
+static inline int layer_is_full(const BloomLayer *layer) {
+    return !(layer->count < layer->capacity);
+}
+/* Hash `len` bytes of `data` with the fixed MURMUR_SEED, writing the two
+ * 64-bit result words to *h1 and *h2. */
+static inline void layer_hash(const char *data, size_t len, uint64_t *h1, uint64_t *h2) {
+    const uint8_t *bytes = (const uint8_t *)data;
+    murmur3_128(bytes, len, MURMUR_SEED, h1, h2);
+}
+/* Insert a key into one layer: set num_hashes bits chosen by double hashing
+ * (h1 + i * h2, reduced modulo the bit count) and bump the layer's count. */
+static void layer_add(BloomLayer *layer, const char *data, size_t len) {
+    const size_t total_bits = layer->size * 8;
+    uint64_t base, step;
+    layer_hash(data, len, &base, &step);
+    /* Adding `step` each round yields base + i * step in 64-bit wrapping
+     * arithmetic. */
+    uint64_t probe = base;
+    for (int round = 0; round < layer->num_hashes; round++, probe += step) {
+        set_bit(layer->bits, (size_t)(probe % total_bits));
+    }
+    layer->count += 1;
+}
+/* Return 1 if every bit the key maps to in this layer is set (the key may be
+ * present), 0 as soon as one of them is clear (the key is definitely absent). */
+static int layer_include(const BloomLayer *layer, const char *data, size_t len) {
+    const size_t total_bits = layer->size * 8;
+    uint64_t base, step;
+    layer_hash(data, len, &base, &step);
+    /* Same probe sequence as layer_add(): base + i * step, wrapping. */
+    uint64_t probe = base;
+    for (int round = 0; round < layer->num_hashes; round++, probe += step) {
+        const size_t pos = (size_t)(probe % total_bits);
+        if (get_bit(layer->bits, pos) == 0)
+            return 0;
+    }
+    return 1;
+}
+/* Count the set bits in a layer's bit array, 8 bytes at a time where possible. */
+static size_t layer_bits_set(const BloomLayer *layer) {
+    const uint8_t *cursor = layer->bits;
+    size_t remaining = layer->size;
+    size_t total = 0;
+    while (remaining >= 8) {
+        uint64_t word;
+        memcpy(&word, cursor, sizeof word);
+        total += popcount64(word);
+        cursor += 8;
+        remaining -= 8;
+    }
+    /* Trailing bytes that do not fill a whole 64-bit word. */
+    while (remaining > 0) {
+        total += popcount64((uint64_t)*cursor);
+        cursor++;
+        remaining--;
+    }
+    return total;
+}
+/* Target false-positive rate for layer `index`: total_fpr * (1 - r) * r^index,
+ * so each successive layer gets a geometrically smaller share. */
+static double layer_error_rate(double total_fpr, double r, size_t index) {
+    const double first_share = total_fpr * (1.0 - r);
+    const double decay = pow(r, (double)index);
+    return first_share * decay;
+}
+/* Estimate a layer's current false-positive rate from its insert count:
+ * (1 - e^(-k*n/m))^k with m bits, k hashes and n recorded inserts. */
+static double layer_estimated_fpr(const BloomLayer *layer) {
+    const double bit_count = (double)(layer->size * 8);
+    const double hashes = (double)layer->num_hashes;
+    const double inserted = (double)layer->count;
+    const double bit_set_probability = 1.0 - exp(-hashes * inserted / bit_count);
+    return pow(bit_set_probability, hashes);
+}
+/*
+ * Append a fresh layer to the filter and return it.
+ *
+ * The first layer uses initial_capacity; later layers use the previous layer's
+ * capacity times GROWTH_FACTOR. The layer's target rate comes from
+ * layer_error_rate() and is floored at 1e-15. Returns NULL, leaving the filter
+ * unchanged, if the layer or the enlarged pointer array cannot be allocated.
+ */
+static BloomLayer *scalable_add_layer(ScalableBloom *sb) {
+    const size_t index = sb->num_layers;
+    size_t next_capacity = sb->initial_capacity;
+    if (index != 0)
+        next_capacity = (size_t)(sb->layers[index - 1]->capacity * GROWTH_FACTOR);
+    double target = layer_error_rate(sb->error_rate, sb->tightening, index);
+    if (target < 1e-15)
+        target = 1e-15;
+    BloomLayer *fresh = layer_create(next_capacity, target);
+    if (fresh == NULL)
+        return NULL;
+    if (index >= sb->layers_cap) {
+        /* Start with 4 slots, then double. */
+        const size_t slots = sb->layers_cap != 0 ? sb->layers_cap * 2 : 4;
+        BloomLayer **grown = (BloomLayer **)realloc(sb->layers, slots * sizeof(BloomLayer *));
+        if (grown == NULL) {
+            layer_free(fresh);
+            return NULL;
+        }
+        sb->layers_cap = slots;
+        sb->layers = grown;
+    }
+    sb->layers[index] = fresh;
+    sb->num_layers = index + 1;
+    return fresh;
+}
+/* Free callback registered in scalable_bloom_type: release every layer, the
+ * layer pointer array and the ScalableBloom itself. */
+static void bloom_free_scalable(void *ptr) {
+    ScalableBloom *filter = (ScalableBloom *)ptr;
+    size_t remaining = filter->num_layers;
+    while (remaining > 0) {
+        remaining--;
+        layer_free(filter->layers[remaining]);
+    }
+    free(filter->layers);
+    free(filter);
+}
+/* Size callback registered in scalable_bloom_type: bytes held by the struct,
+ * the layer pointer array, and each layer with its bit array. */
+static size_t bloom_memsize_scalable(const void *ptr) {
+    const ScalableBloom *filter = (const ScalableBloom *)ptr;
+    size_t bytes = sizeof(ScalableBloom) + filter->layers_cap * sizeof(BloomLayer *);
+    for (size_t i = filter->num_layers; i > 0; i--) {
+        bytes += sizeof(BloomLayer) + filter->layers[i - 1]->size;
+    }
+    return bytes;
+}
+/* Typed-data descriptor used to wrap a ScalableBloom in a Ruby object: no
+ * mark function (first slot NULL), bloom_free_scalable to free and
+ * bloom_memsize_scalable to report size. */
+static const rb_data_type_t scalable_bloom_type = {
+    "ScalableBloomFilter",
+    {NULL, bloom_free_scalable, bloom_memsize_scalable},
+    NULL,
+    NULL,
+    RUBY_TYPED_FREE_IMMEDIATELY};
+/* Allocator for Filter: wrap a zero-initialized ScalableBloom (no layers yet)
+ * in a Ruby object of `klass`. Raises NoMemoryError if calloc fails. */
 static VALUE bloom_alloc(VALUE klass) {
-    BloomFilter *bloom = ALLOC(BloomFilter);
-    bloom->bits = NULL;
-    bloom->size = 0;
-    bloom->capacity = 0;
-    bloom->num_hashes = 0;
-    return TypedData_Wrap_Struct(klass, &bloom_type, bloom);
-}
-/*
- * Initialize Bloom Filter
- *
- * @param capacity [Integer] Expected number of elements
- * @param error_rate [Float] Desired false positive rate (default: 0.01)
- */
+    ScalableBloom *sb = (ScalableBloom *)calloc(1, sizeof(ScalableBloom));
+    if (!sb)
+        rb_raise(rb_eNoMemError, "failed to allocate ScalableBloom");
+    return TypedData_Wrap_Struct(klass, &scalable_bloom_type, sb);
+}
 static VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
-    VALUE capacity_val, error_rate_val;
-    rb_scan_args(argc, argv, "11", &capacity_val, &error_rate_val);
-    long capacity = NUM2LONG(capacity_val);
-    double error_rate = NIL_P(error_rate_val) ? 0.01 : NUM2DBL(error_rate_val);
-    if (capacity <= 0) {
-        rb_raise(rb_eArgError, "capacity must be positive");
-    }
-    if (error_rate <= 0 || error_rate >= 1) {
-        rb_raise(rb_eArgError, "error_rate must be between 0 and 1");
+    VALUE opts = Qnil;
+    if (argc == 0) {
+    } else if (argc == 1 && RB_TYPE_P(argv[0], T_HASH)) {
+        opts = argv[0];
+    } else {
+        rb_raise(rb_eArgError,
+                 "wrong number of arguments (given %d, expected 0 or keyword arguments)", argc);
     }
-    BloomFilter *bloom;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom);
-    /* Calculate optimal parameters */
-    double ln2 = 0.693147180559945309417;
-    double ln2_sq = ln2 * ln2;
-    size_t bits_count = (size_t)(-(capacity * log(error_rate)) / ln2_sq);
-    bloom->size = (bits_count + 7) / 8;
-    bloom->capacity = capacity;
-    bloom->num_hashes = (int)((bits_count / (double)capacity) * ln2);
-    if (bloom->num_hashes < 1) bloom->num_hashes = 1;
-    if (bloom->num_hashes > 10) bloom->num_hashes = 10;
-    bloom->bits = (uint8_t *)calloc(bloom->size, sizeof(uint8_t));
-    if (!bloom->bits) {
-        rb_raise(rb_eNoMemError, "failed to allocate memory");
+    double error_rate = DEFAULT_ERROR_RATE;
+    size_t initial_capacity = DEFAULT_INITIAL_CAP;
+    double tightening = DEFAULT_TIGHTENING;
+    if (!NIL_P(opts)) {
+        VALUE v;
+        v = rb_hash_aref(opts, ID2SYM(rb_intern("error_rate")));
+        if (!NIL_P(v))
+            error_rate = NUM2DBL(v);
+        v = rb_hash_aref(opts, ID2SYM(rb_intern("initial_capacity")));
+        if (!NIL_P(v))
+            initial_capacity = (size_t)NUM2LONG(v);
+        v = rb_hash_aref(opts, ID2SYM(rb_intern("tightening")));
+        if (!NIL_P(v))
+            tightening = NUM2DBL(v);
     }
+    if (error_rate <= 0 || error_rate >= 1)
+        rb_raise(rb_eArgError, "error_rate must be between 0 and 1 (exclusive)");
+    if (initial_capacity == 0)
+        rb_raise(rb_eArgError, "initial_capacity must be positive");
+    if (tightening <= 0 || tightening >= 1)
+        rb_raise(rb_eArgError, "tightening must be between 0 and 1 (exclusive)");
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    sb->error_rate = error_rate;
+    sb->initial_capacity = initial_capacity;
+    sb->tightening = tightening;
+    sb->total_count = 0;
+    if (!scalable_add_layer(sb))
+        rb_raise(rb_eNoMemError, "failed to allocate initial layer");
     return self;
 }
-/*
- * Add element to filter
- */
 static VALUE bloom_add(VALUE self, VALUE str) {
-    BloomFilter *bloom;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom);
-    Check_Type(str, T_STRING);
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    str = StringValue(str);
+    BloomLayer *active = sb->layers[sb->num_layers - 1];
+    if (layer_is_full(active)) {
+        active = scalable_add_layer(sb);
+        if (!active)
+            rb_raise(rb_eNoMemError, "failed to allocate new layer");
+    }
+    layer_add(active, RSTRING_PTR(str), RSTRING_LEN(str));
+    sb->total_count++;
+    return Qtrue;
+}
+/*
+ * Filter#add_if_absent(str)
+ *
+ * Check every non-empty layer, newest first. Returns false without modifying
+ * the filter if any layer reports `str` as possibly present; otherwise inserts
+ * it into the newest layer (appending a layer when that one is full) and
+ * returns true.
+ *
+ * Raises RuntimeError when the filter has no layers and no initial capacity,
+ * and NoMemoryError when a needed layer cannot be allocated.
+ */
+static VALUE bloom_add_if_absent(VALUE self, VALUE str) {
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    str = StringValue(str);
     const char *data = RSTRING_PTR(str);
     size_t len = RSTRING_LEN(str);
-    size_t bits_count = bloom->size * 8;
-    for (int i = 0; i < bloom->num_hashes; i++) {
-        uint32_t hash = murmur3_32((const uint8_t *)data, len, i);
-        size_t pos = hash % bits_count;
-        set_bit(bloom->bits, pos);
+    for (size_t i = sb->num_layers; i > 0; i--) {
+        if (sb->layers[i - 1]->count == 0)
+            continue;
+        if (layer_include(sb->layers[i - 1], data, len))
+            return Qfalse;
     }
+    /* A filter can have zero layers (never initialized, loaded from a dump
+     * with no layers, or left empty by a failed clear); indexing
+     * layers[num_layers - 1] in that state would read out of bounds. */
+    BloomLayer *active = sb->num_layers > 0 ? sb->layers[sb->num_layers - 1] : NULL;
+    if (!active || layer_is_full(active)) {
+        if (!active && sb->initial_capacity == 0)
+            rb_raise(rb_eRuntimeError, "filter is not initialized");
+        active = scalable_add_layer(sb);
+        if (!active)
+            rb_raise(rb_eNoMemError, "failed to allocate new layer");
+    }
+    layer_add(active, data, len);
+    sb->total_count++;
     return Qtrue;
 }
-/*
- * Check if element might be in filter
- */
+/*
+ * Filter#include?(str) / Filter#member?(str)
+ *
+ * Return true if any non-empty layer, checked newest first, reports `str`
+ * (coerced with StringValue) as possibly present; false otherwise. Layers
+ * whose count is 0 are skipped without hashing.
+ */
 static VALUE bloom_include(VALUE self, VALUE str) {
-    BloomFilter *bloom;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom);
-    Check_Type(str, T_STRING);
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    str = StringValue(str);
     const char *data = RSTRING_PTR(str);
     size_t len = RSTRING_LEN(str);
-    size_t bits_count = bloom->size * 8;
-    for (int i = 0; i < bloom->num_hashes; i++) {
-        uint32_t hash = murmur3_32((const uint8_t *)data, len, i);
-        size_t pos = hash % bits_count;
-        if (!get_bit(bloom->bits, pos)) {
-            return Qfalse;
-        }
+    for (size_t i = sb->num_layers; i > 0; i--) {
+        if (sb->layers[i - 1]->count == 0)
+            continue;
+        if (layer_include(sb->layers[i - 1], data, len))
+            return Qtrue;
     }
-    return Qtrue;
+    return Qfalse;
 }
-/*
- * Clear all bits
- */
+/*
+ * Filter#clear
+ *
+ * Free every layer, reset the counters and create one fresh first layer.
+ * Returns nil. Raises NoMemoryError if the fresh layer cannot be allocated;
+ * in that case the filter is left with zero layers.
+ */
 static VALUE bloom_clear(VALUE self) {
-    BloomFilter *bloom;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom);
-    memset(bloom->bits, 0, bloom->size);
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    for (size_t i = 0; i < sb->num_layers; i++)
+        layer_free(sb->layers[i]);
+    /* The pointer array is kept; its stale entries are unreachable once
+     * num_layers is 0. */
+    sb->num_layers = 0;
+    sb->total_count = 0;
+    if (!scalable_add_layer(sb))
+        rb_raise(rb_eNoMemError, "failed to allocate layer after clear");
     return Qnil;
 }
-/*
- * Get filter statistics
- */
+/*
+ * Filter#stats
+ *
+ * Return a Hash of filter-wide figures (:total_count, :num_layers,
+ * :total_bytes, :total_bits, :total_bits_set, :fill_ratio,
+ * :target_error_rate, :estimated_error_rate) plus :layers, an Array holding
+ * one Hash of per-layer figures for each layer.
+ *
+ * NOTE(review): with zero layers total_bits is 0, so :fill_ratio is computed
+ * as 0.0 / 0 — confirm callers tolerate NaN.
+ * NOTE(review): size_t values are converted with LONG2NUM; values above
+ * LONG_MAX would presumably come out negative — consider SIZET2NUM.
+ */
 static VALUE bloom_stats(VALUE self) {
-    BloomFilter *bloom;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom);
-    size_t bits_set = 0;
-    size_t total_bits = bloom->size * 8;
-    for (size_t i = 0; i < bloom->size; i++) {
-        uint8_t byte = bloom->bits[i];
-        while (byte) {
-            bits_set += byte & 1;
-            byte >>= 1;
-        }
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    size_t total_bytes = 0;
+    size_t total_bits = 0;
+    size_t total_bits_set = 0;
+    /* Running product of (1 - layer estimate): the estimated chance that no
+     * layer gives a false positive. */
+    double combined_fpr = 1.0;
+    VALUE layers_ary = rb_ary_new_capa((long)sb->num_layers);
+    for (size_t i = 0; i < sb->num_layers; i++) {
+        BloomLayer *l = sb->layers[i];
+        size_t bs = layer_bits_set(l);
+        size_t tb = l->size * 8;
+        double est_fpr = layer_estimated_fpr(l);
+        total_bytes += l->size;
+        total_bits += tb;
+        total_bits_set += bs;
+        combined_fpr *= (1.0 - est_fpr);
+        VALUE lh = rb_hash_new();
+        rb_hash_aset(lh, ID2SYM(rb_intern("layer")), LONG2NUM(i));
+        rb_hash_aset(lh, ID2SYM(rb_intern("capacity")), LONG2NUM(l->capacity));
+        rb_hash_aset(lh, ID2SYM(rb_intern("count")), LONG2NUM(l->count));
+        rb_hash_aset(lh, ID2SYM(rb_intern("size_bytes")), LONG2NUM(l->size));
+        rb_hash_aset(lh, ID2SYM(rb_intern("num_hashes")), INT2NUM(l->num_hashes));
+        rb_hash_aset(lh, ID2SYM(rb_intern("bits_set")), LONG2NUM(bs));
+        rb_hash_aset(lh, ID2SYM(rb_intern("total_bits")), LONG2NUM(tb));
+        rb_hash_aset(lh, ID2SYM(rb_intern("fill_ratio")), DBL2NUM((double)bs / tb));
+        rb_hash_aset(lh, ID2SYM(rb_intern("target_error_rate")),
+                     DBL2NUM(layer_error_rate(sb->error_rate, sb->tightening, i)));
+        rb_hash_aset(lh, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(est_fpr));
+        rb_ary_push(layers_ary, lh);
     }
-    double fill_ratio = (double)bits_set / total_bits;
+    /* Chance that at least one layer gives a false positive. */
+    double total_est_fpr = 1.0 - combined_fpr;
     VALUE hash = rb_hash_new();
-    rb_hash_aset(hash, ID2SYM(rb_intern("capacity")), LONG2NUM(bloom->capacity));
-    rb_hash_aset(hash, ID2SYM(rb_intern("size_bytes")), LONG2NUM(bloom->size));
-    rb_hash_aset(hash, ID2SYM(rb_intern("num_hashes")), INT2NUM(bloom->num_hashes));
-    rb_hash_aset(hash, ID2SYM(rb_intern("bits_set")), LONG2NUM(bits_set));
+    rb_hash_aset(hash, ID2SYM(rb_intern("total_count")), LONG2NUM(sb->total_count));
+    rb_hash_aset(hash, ID2SYM(rb_intern("num_layers")), LONG2NUM(sb->num_layers));
+    rb_hash_aset(hash, ID2SYM(rb_intern("total_bytes")), LONG2NUM(total_bytes));
     rb_hash_aset(hash, ID2SYM(rb_intern("total_bits")), LONG2NUM(total_bits));
-    rb_hash_aset(hash, ID2SYM(rb_intern("fill_ratio")), DBL2NUM(fill_ratio));
+    rb_hash_aset(hash, ID2SYM(rb_intern("total_bits_set")), LONG2NUM(total_bits_set));
+    rb_hash_aset(hash, ID2SYM(rb_intern("fill_ratio")),
+                 DBL2NUM((double)total_bits_set / total_bits));
+    rb_hash_aset(hash, ID2SYM(rb_intern("target_error_rate")), DBL2NUM(sb->error_rate));
+    rb_hash_aset(hash, ID2SYM(rb_intern("estimated_error_rate")), DBL2NUM(total_est_fpr));
+    rb_hash_aset(hash, ID2SYM(rb_intern("layers")), layers_ary);
     return hash;
 }
-/*
- * Merge another filter
- */
+/* Filter#count / Filter#size: the filter's running total of inserts. */
+static VALUE bloom_count(VALUE self) {
+    ScalableBloom *filter;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, filter);
+    const size_t inserted = filter->total_count;
+    return LONG2NUM(inserted);
+}
+/* Filter#num_layers: how many layers the filter currently holds. */
+static VALUE bloom_num_layers(VALUE self) {
+    ScalableBloom *filter;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, filter);
+    const size_t layer_total = filter->num_layers;
+    return LONG2NUM(layer_total);
+}
+/*
+ * Filter#merge!(other)
+ *
+ * Fold `other` into this filter and return self. Both filters must have the
+ * same error_rate and tightening (within 1e-10) or ArgumentError is raised.
+ * For each layer of `other`: if this filter has a layer at the same index with
+ * the same byte size and hash count, the bit arrays are OR-ed together and the
+ * counts added (capped at the layer's capacity); otherwise a deep copy of the
+ * layer is appended to this filter. total_count is summed, saturating at
+ * SIZE_MAX.
+ *
+ * Raises NoMemoryError if a copy cannot be allocated; layers merged before the
+ * failure stay merged.
+ * NOTE(review): merging a filter with itself doubles the counts — confirm
+ * whether self-merge should be a no-op.
+ */
 static VALUE bloom_merge(VALUE self, VALUE other) {
-    BloomFilter *bloom1, *bloom2;
-    TypedData_Get_Struct(self, BloomFilter, &bloom_type, bloom1);
-    TypedData_Get_Struct(other, BloomFilter, &bloom_type, bloom2);
-    if (bloom1->size != bloom2->size || bloom1->num_hashes != bloom2->num_hashes) {
-        rb_raise(rb_eArgError, "cannot merge filters with different parameters");
-    }
-    for (size_t i = 0; i < bloom1->size; i++) {
-        bloom1->bits[i] |= bloom2->bits[i];
+    ScalableBloom *sb1, *sb2;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb1);
+    TypedData_Get_Struct(other, ScalableBloom, &scalable_bloom_type, sb2);
+    if (fabs(sb1->error_rate - sb2->error_rate) > 1e-10)
+        rb_raise(rb_eArgError, "cannot merge filters with different error rates (%.6f vs %.6f)",
+                 sb1->error_rate, sb2->error_rate);
+    if (fabs(sb1->tightening - sb2->tightening) > 1e-10)
+        rb_raise(rb_eArgError,
+                 "cannot merge filters with different tightening ratios (%.6f vs %.6f)",
+                 sb1->tightening, sb2->tightening);
+    for (size_t i = 0; i < sb2->num_layers; i++) {
+        BloomLayer *src = sb2->layers[i];
+        int merged = 0;
+        if (i < sb1->num_layers) {
+            BloomLayer *dst = sb1->layers[i];
+            /* Bitwise OR is only meaningful when both layers map keys to the
+             * same positions: same bit count and same number of probes. */
+            if (dst->size == src->size && dst->num_hashes == src->num_hashes) {
+                size_t j = 0;
+                /* OR 8 bytes at a time, then finish byte by byte. */
+                for (; j + 8 <= dst->size; j += 8) {
+                    uint64_t a, b;
+                    memcpy(&a, dst->bits + j, 8);
+                    memcpy(&b, src->bits + j, 8);
+                    a |= b;
+                    memcpy(dst->bits + j, &a, 8);
+                }
+                for (; j < dst->size; j++)
+                    dst->bits[j] |= src->bits[j];
+                size_t new_count = dst->count + src->count;
+                dst->count = new_count < dst->capacity ? new_count : dst->capacity;
+                merged = 1;
+            }
+        }
+        if (!merged) {
+            /* Incompatible or missing counterpart: append a deep copy. */
+            BloomLayer *copy = (BloomLayer *)calloc(1, sizeof(BloomLayer));
+            if (!copy)
+                rb_raise(rb_eNoMemError, "failed to allocate layer copy");
+            copy->size = src->size;
+            copy->capacity = src->capacity;
+            copy->count = src->count;
+            copy->num_hashes = src->num_hashes;
+            copy->bits = (uint8_t *)malloc(src->size);
+            if (!copy->bits) {
+                free(copy);
+                rb_raise(rb_eNoMemError, "failed to allocate bits");
+            }
+            memcpy(copy->bits, src->bits, src->size);
+            if (sb1->num_layers >= sb1->layers_cap) {
+                size_t new_slots = sb1->layers_cap == 0 ? 4 : sb1->layers_cap * 2;
+                BloomLayer **tmp =
+                    (BloomLayer **)realloc(sb1->layers, new_slots * sizeof(BloomLayer *));
+                if (!tmp) {
+                    layer_free(copy);
+                    rb_raise(rb_eNoMemError, "realloc failed");
+                }
+                sb1->layers = tmp;
+                sb1->layers_cap = new_slots;
+            }
+            sb1->layers[sb1->num_layers++] = copy;
+        }
     }
+    /* Unsigned wrap-around shows up as a sum smaller than the old total. */
+    size_t new_total = sb1->total_count + sb2->total_count;
+    sb1->total_count = new_total >= sb1->total_count ? new_total : SIZE_MAX;
     return self;
 }
+/*
+ * Filter#dump
+ *
+ * Serialize the filter into a String, all integers little-endian:
+ *   header (HEADER_SIZE bytes): u32 version, u32 zero, f64 error_rate,
+ *     f64 tightening, u64 initial_capacity, u64 total_count, u64 num_layers
+ *   per layer (LAYER_META bytes + bit array): u64 capacity, u64 count,
+ *     u64 size, u32 num_hashes, u32 zero, then `size` raw bytes
+ */
+static VALUE bloom_dump(VALUE self) {
+    ScalableBloom *sb;
+    TypedData_Get_Struct(self, ScalableBloom, &scalable_bloom_type, sb);
+    size_t total_size = HEADER_SIZE;
+    for (size_t i = sb->num_layers; i > 0; i--)
+        total_size += LAYER_META + sb->layers[i - 1]->size;
+    VALUE str = rb_str_buf_new((long)total_size);
+    rb_str_set_len(str, (long)total_size);
+    uint8_t *cur = (uint8_t *)RSTRING_PTR(str);
+    /* Fixed header. */
+    write_le32(cur, SERIAL_VERSION);
+    write_le32(cur + 4, 0);
+    write_le_double(cur + 8, sb->error_rate);
+    write_le_double(cur + 16, sb->tightening);
+    write_le64(cur + 24, (uint64_t)sb->initial_capacity);
+    write_le64(cur + 32, (uint64_t)sb->total_count);
+    write_le64(cur + 40, (uint64_t)sb->num_layers);
+    cur += HEADER_SIZE;
+    /* One metadata record plus raw bit array per layer, oldest first. */
+    for (size_t i = 0; i < sb->num_layers; i++) {
+        const BloomLayer *layer = sb->layers[i];
+        write_le64(cur, (uint64_t)layer->capacity);
+        write_le64(cur + 8, (uint64_t)layer->count);
+        write_le64(cur + 16, (uint64_t)layer->size);
+        write_le32(cur + 24, (uint32_t)layer->num_hashes);
+        write_le32(cur + 28, 0);
+        cur += LAYER_META;
+        memcpy(cur, layer->bits, layer->size);
+        cur += layer->size;
+    }
+    return str;
+}
+/* Free every layer attached to sb so far and mark the filter as empty; used
+ * by bloom_load() before raising on malformed input. */
+static void load_discard_layers(ScalableBloom *sb) {
+    for (size_t j = 0; j < sb->num_layers; j++)
+        layer_free(sb->layers[j]);
+    sb->num_layers = 0;
+}
+/*
+ * Filter.load(data)
+ *
+ * Rebuild a filter from a String in the layout written by Filter#dump and
+ * return the new object. The input is treated as untrusted: every header and
+ * layer field is validated before it is used.
+ *
+ * Raises TypeError if `data` is not a String, ArgumentError for a short
+ * header, an unsupported version, out-of-range parameters, or a truncated or
+ * invalid layer, and NoMemoryError if an allocation fails.
+ */
+static VALUE bloom_load(VALUE klass, VALUE data) {
+    Check_Type(data, T_STRING);
+    const uint8_t *buf = (const uint8_t *)RSTRING_PTR(data);
+    size_t data_len = (size_t)RSTRING_LEN(data);
+    if (data_len < HEADER_SIZE)
+        rb_raise(rb_eArgError, "data too short for bloom filter header");
+    size_t off = 0;
+    uint32_t version = read_le32(buf + off);
+    off += 4;
+    if (version != SERIAL_VERSION)
+        rb_raise(rb_eArgError, "unsupported serialization version: %u", version);
+    off += 4; /* reserved word */
+    VALUE obj = bloom_alloc(klass);
+    ScalableBloom *sb;
+    TypedData_Get_Struct(obj, ScalableBloom, &scalable_bloom_type, sb);
+    sb->error_rate = read_le_double(buf + off);
+    off += 8;
+    sb->tightening = read_le_double(buf + off);
+    off += 8;
+    const uint64_t initial_capacity = read_le64(buf + off);
+    off += 8;
+    sb->total_count = (size_t)read_le64(buf + off);
+    off += 8;
+    const uint64_t num_layers = read_le64(buf + off);
+    off += 8;
+    /* Negated range tests so that NaN is rejected as well. */
+    if (!(sb->error_rate > 0 && sb->error_rate < 1))
+        rb_raise(rb_eArgError, "invalid error_rate in serialized data");
+    if (!(sb->tightening > 0 && sb->tightening < 1))
+        rb_raise(rb_eArgError, "invalid tightening in serialized data");
+    /* A zero initial capacity would make later layer creation divide by zero. */
+    if (initial_capacity == 0 || initial_capacity > SIZE_MAX)
+        rb_raise(rb_eArgError, "invalid initial_capacity in serialized data");
+    sb->initial_capacity = (size_t)initial_capacity;
+    if (num_layers > 1000)
+        rb_raise(rb_eArgError, "unreasonable number of layers: %zu", (size_t)num_layers);
+    sb->layers_cap = num_layers < 4 ? 4 : (size_t)num_layers;
+    sb->layers = (BloomLayer **)calloc(sb->layers_cap, sizeof(BloomLayer *));
+    if (!sb->layers)
+        rb_raise(rb_eNoMemError, "failed to allocate layers array");
+    for (size_t i = 0; i < (size_t)num_layers; i++) {
+        if (off + LAYER_META > data_len) {
+            load_discard_layers(sb);
+            rb_raise(rb_eArgError, "data truncated at layer %zu metadata", i);
+        }
+        /* Read the record into 64-bit locals and validate before narrowing
+         * or allocating anything. */
+        const uint64_t capacity = read_le64(buf + off);
+        const uint64_t count = read_le64(buf + off + 8);
+        const uint64_t size = read_le64(buf + off + 16);
+        const uint32_t num_hashes = read_le32(buf + off + 24);
+        off += LAYER_META;
+        /* size == 0 would later cause a modulo by zero when hashing into the
+         * layer; num_hashes and capacity must lie in the ranges that
+         * layer_create() produces. off <= data_len holds here, so the
+         * subtraction cannot wrap. */
+        if (size == 0 || size > (1ULL << 30) || size > data_len - off || capacity == 0 ||
+            capacity > SIZE_MAX || count > SIZE_MAX || num_hashes < MIN_HASHES ||
+            num_hashes > MAX_HASHES) {
+            load_discard_layers(sb);
+            rb_raise(rb_eArgError, "invalid or truncated layer %zu", i);
+        }
+        BloomLayer *l = (BloomLayer *)calloc(1, sizeof(BloomLayer));
+        if (!l) {
+            load_discard_layers(sb);
+            rb_raise(rb_eNoMemError, "failed to allocate layer");
+        }
+        l->capacity = (size_t)capacity;
+        l->count = (size_t)count;
+        l->size = (size_t)size;
+        l->num_hashes = (int)num_hashes;
+        l->bits = (uint8_t *)malloc(l->size);
+        if (!l->bits) {
+            free(l);
+            load_discard_layers(sb);
+            rb_raise(rb_eNoMemError, "failed to allocate bits");
+        }
+        memcpy(l->bits, buf + off, l->size);
+        off += l->size;
+        sb->layers[sb->num_layers++] = l;
+    }
+    return obj;
+}
+/* Extension entry point: define FastBloomFilter::Filter and register its
+ * allocator, instance methods and the `load` singleton method. */
 void Init_fast_bloom_filter(void) {
     VALUE mFastBloomFilter = rb_define_module("FastBloomFilter");
-    VALUE cBloomFilter = rb_define_class_under(mFastBloomFilter, "Filter", rb_cObject);
-    rb_define_alloc_func(cBloomFilter, bloom_alloc);
-    rb_define_method(cBloomFilter, "initialize", bloom_initialize, -1);
-    rb_define_method(cBloomFilter, "add", bloom_add, 1);
-    rb_define_method(cBloomFilter, "<<", bloom_add, 1);
-    rb_define_method(cBloomFilter, "include?", bloom_include, 1);
-    rb_define_method(cBloomFilter, "member?", bloom_include, 1);
-    rb_define_method(cBloomFilter, "clear", bloom_clear, 0);
-    rb_define_method(cBloomFilter, "stats", bloom_stats, 0);
-    rb_define_method(cBloomFilter, "merge!", bloom_merge, 1);
+    VALUE cFilter = rb_define_class_under(mFastBloomFilter, "Filter", rb_cObject);
+    rb_define_alloc_func(cFilter, bloom_alloc);
+    /* Arity -1: initialize receives its arguments as (argc, argv). */
+    rb_define_method(cFilter, "initialize", bloom_initialize, -1);
+    rb_define_method(cFilter, "add", bloom_add, 1);
+    rb_define_method(cFilter, "<<", bloom_add, 1);
+    rb_define_method(cFilter, "add_if_absent", bloom_add_if_absent, 1);
+    rb_define_method(cFilter, "include?", bloom_include, 1);
+    rb_define_method(cFilter, "member?", bloom_include, 1);
+    rb_define_method(cFilter, "clear", bloom_clear, 0);
+    rb_define_method(cFilter, "stats", bloom_stats, 0);
+    /* `count` and `size` are the same function. */
+    rb_define_method(cFilter, "count", bloom_count, 0);
+    rb_define_method(cFilter, "size", bloom_count, 0);
+    rb_define_method(cFilter, "num_layers", bloom_num_layers, 0);
+    rb_define_method(cFilter, "merge!", bloom_merge, 1);
+    rb_define_method(cFilter, "dump", bloom_dump, 0);
+    rb_define_singleton_method(cFilter, "load", bloom_load, 1);
 }