RubyGems - bloom_fit - Versions diffs - 0.2.0 → 0.3.1 - Mend

bloom_fit 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +16 -13
data/ext/cbloomfilter/cbloomfilter.c +33 -89
data/lib/bloom_fit/configuration_mismatch.rb +4 -0
data/lib/bloom_fit/version.rb +1 -1
data/lib/bloom_fit.rb +193 -47
data/lib/cbloomfilter.bundle +0 -0
data/test/bloom_fit_test.rb +344 -0
data/test/test_helper.rb +6 -0
metadata +8 -7
data/spec/bloom_fit_spec.rb +0 -129
data/spec/helper.rb +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f542d198165a81ecdc9307e3d2b9a9168608197c117245cb89b087f5fde31081
-  data.tar.gz: 60c9bed4dfbf8b6d5e8d4cb47350b9ec31ee22ce9eae3fbcc92628cc8e4aed53
+  metadata.gz: cd631cdb483e0a84fa05d56eb962fda0f7c7d7a0b002ea708024ce82505a9054
+  data.tar.gz: ee781997465d6f5b590828082e4fadd5b00768298bbdec7845b9f07c3d046549
 SHA512:
-  metadata.gz: 55e33f10d0c71aa77bece3ba974995144f44cfc644d7bbf773de9b5ea562078df4511905de6ae87f15a9b95c12975fd423c1fbe6fccfb22c4a0073b2cdf66362
-  data.tar.gz: a2cca2d8c5c2ea66979ad93030b75fc7e64f5258650dc43b13e1cbf7080ee32ae44f77ac205b9055d3174d2e95dbe32fff61b949e27fc5f3306a3a332673bf57
+  metadata.gz: 7862f2d0189bae865c6fc5e7c7ad24f5c7ab0420415a455a1a0b130835d639c536cb8925b08219eab7dd7a10db1e9299b2868019d3e2259db4dce96de01e50a2
+  data.tar.gz: 41cb7f2fcb8cf80f5345785ce0110e242a29fbe6177284b13b701973ec7b0e7010d788585e406f77712f7ee284ff308633fe060e492b0e153a4a5598658fd465

data/README.md CHANGED Viewed

@@ -4,7 +4,12 @@
 [![CI](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml/badge.svg)](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
 [![Gem Downloads](https://img.shields.io/gem/dt/bloom_fit.svg)](https://rubygems.org/gems/bloom_fit)
-BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but provides a better hashing distribution by using DJB2 over CRC32, avoids the need to supply a seed, removes counting abilities, improves performance for very large datasets, and will automatically calculate the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate.
+BloomFit provides an MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but differs in the following ways:
+- uses DJB2 over CRC32 yielding better hash distribution
+- improves performance for very large datasets
+- avoids the need to supply a seed
+- automatically calculates the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate
 A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a DJB2 hash with k seeds from the CRC table.
@@ -13,8 +18,6 @@ Performance of the Bloom filter depends on the following:
 - size of the bit array
 - number of hash functions
-BloomFit is a fork of [bloomfilter-rb].
 ## Resources
 - Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
@@ -40,11 +43,11 @@ bf["bird"] = "bar"
 bf["bird"]             # => true
 bf["mouse"]            # => false
-bf.stats
-# => Number of filter bits (m): 3600
-# => Number of set bits (n): 20
-# => Number of filter hashes (k) : 10
-# => Predicted false positive rate = 0.00%
+puts bf.stats
+# Number of filter bits (m): 3600
+# Number of set bits (n): 20
+# Number of filter hashes (k) : 10
+# Predicted false positive rate = 0.00%
 ```
 If you'd like more control over the traditional inputs like bit size and the number of hashes:
@@ -62,11 +65,11 @@ bf["bird"] = "bar"
 bf["bird"]             # => true
 bf["mouse"]            # => false
-bf.stats
-# => Number of filter bits (m): 100
-# => Number of set bits (n): 4
-# => Number of filter hashes (k) : 2
-# => Predicted false positive rate = 10.87%
+puts bf.stats
+# Number of filter bits (m): 100
+# Number of set bits (n): 4
+# Number of filter hashes (k) : 2
+# Predicted false positive rate = 10.87%
 ```
 ## Credits

data/ext/cbloomfilter/cbloomfilter.c CHANGED Viewed

@@ -17,8 +17,7 @@ static unsigned int *salts = crc_table;
 static VALUE cBloomFilter;
 struct BloomFilter {
-    int m; /* # of buckets in a bloom filter */
-    int b; /* # of bits in a bloom filter bucket */
+    int m; /* # of bits in a bloom filter */
     int k; /* # of hash functions */
     unsigned char *ptr; /* bits data */
     int bytes; /* size of byte data */
@@ -72,7 +71,6 @@ static VALUE bf_alloc(VALUE klass) {
     VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
     bf->m = 0;
-    bf->b = 0;
     bf->k = 0;
     bf->ptr = NULL;
     bf->bytes = 0;
@@ -80,52 +78,24 @@ static VALUE bf_alloc(VALUE klass) {
     return obj;
 }
-void bucket_unset(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    if ((c & mask) == 0) {
-      // do nothing
-    } else {
-        // reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
-        c -= (1 << bit_offset) & ((1 << 8) -1);
-        // shift the bitmap right by 1 bit: 10 00 => 01 00
-        c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
-        bf->ptr[byte_offset] = c & ((1 << 8) - 1);
-        bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
-    }
-}
+static void bucket_set(struct BloomFilter *bf, int index) {
+    int byte_offset = index / 8;
+    int bit_offset = index % 8;
-void bucket_set(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    if ((c & mask) != mask) {
-        c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
-        bf->ptr[byte_offset] = c & ((1 << 8) - 1);
-        bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
-    }
+    bf->ptr[byte_offset] |= (unsigned char) (1U << bit_offset);
 }
-int bucket_check(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
+static int bucket_check(struct BloomFilter *bf, int index) {
+    int byte_offset = index / 8;
+    int bit_offset = index % 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    return (c & mask) >> bit_offset;
+    return (bf->ptr[byte_offset] >> bit_offset) & 1;
 }
 static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
     struct BloomFilter *bf;
     VALUE arg1, arg2;
-    int m, k, b;
+    int m, k;
     bf = bf_ptr(self);
@@ -143,21 +113,20 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
     m = FIX2INT(arg1);
     k = FIX2INT(arg2);
-    b = 1;
     if (m < 1)
         rb_raise(rb_eArgError, "array size");
     if (k < 1)
         rb_raise(rb_eArgError, "hash length");
-    bf->b = b;
     bf->m = m;
     bf->k = k;
     ruby_xfree(bf->ptr);
     bf->ptr = NULL;
     bf->bytes = 0;
-    bf->bytes = ((m * b) + 15) / 8;
+    /* Preserve the existing serialized bitmap length, including one padding byte. */
+    bf->bytes = (m + 15) / 8;
     bf->ptr = ALLOC_N(unsigned char, bf->bytes);
     /* initialize the bits with zeros */
@@ -194,7 +163,7 @@ static VALUE bf_set_bits(VALUE self){
     return INT2FIX(count);
 }
-static VALUE bf_insert(VALUE self, VALUE key) {
+static VALUE bf_add(VALUE self, VALUE key) {
     VALUE skey;
     unsigned long hash;
     int index;
@@ -268,57 +237,34 @@ static VALUE bf_or(VALUE self, VALUE other) {
     return obj;
 }
-static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
+static VALUE bf_include(VALUE self, VALUE key) {
+    VALUE skey;
     unsigned long hash;
-    int i, len, m, k;
     int index;
-    long tests_idx, vlen;
+    int i, len, m, k;
     char *ckey;
-    VALUE tests, key, skey;
-    struct BloomFilter *bf;
+    struct BloomFilter *bf = bf_ptr(self);
-    rb_scan_args(argc, argv, "*", &tests);
+    skey = rb_obj_as_string(key);
+    ckey = StringValuePtr(skey);
+    len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
-    bf = bf_ptr(self);
-    vlen = RARRAY_LEN(tests);
-    for (tests_idx = 0; tests_idx < vlen; tests_idx++) {
-        key = rb_ary_entry(tests, tests_idx);
-        skey = rb_obj_as_string(key);
-        ckey = StringValuePtr(skey);
-        len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
-        m = bf->m;
-        k = bf->k;
-        hash = (unsigned long) djb2(ckey, len);
-        for (i = 0; i <= k - 1; i++) {
-            index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
-            /* check the bit at the index */
-            if (!bucket_check(bf, index)) {
-                return Qfalse; /* i.e., it is a new entry ; escape the loop */
-            }
+    m = bf->m;
+    k = bf->k;
+    hash = (unsigned long) djb2(ckey, len);
+    for (i = 0; i <= k - 1; i++) {
+        index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
+        /* check the bit at the index */
+        if (!bucket_check(bf, index)) {
+            return Qfalse; /* i.e., it is a new entry ; escape the loop */
         }
     }
     return Qtrue;
 }
-static VALUE bf_to_s(VALUE self) {
-    struct BloomFilter *bf = bf_ptr(self);
-    unsigned char *ptr;
-    int i;
-    VALUE str;
-    str = rb_str_new(0, bf->m);
-    ptr = (unsigned char *) RSTRING_PTR(str);
-    for (i = 0; i < bf->m; i++)
-        *ptr++ = bucket_check(bf, i) ? '1' : '0';
-    return str;
-}
 static VALUE bf_bitmap(VALUE self) {
     struct BloomFilter *bf = bf_ptr(self);
@@ -346,15 +292,13 @@ void Init_cbloomfilter(void) {
     rb_define_method(cBloomFilter, "m", bf_m, 0);
     rb_define_method(cBloomFilter, "k", bf_k, 0);
     rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
-    /* rb_define_method(cBloomFilter, "s", bf_s, 0); */
-    rb_define_method(cBloomFilter, "insert", bf_insert, 1);
-    rb_define_method(cBloomFilter, "include?", bf_include, -1);
+    rb_define_method(cBloomFilter, "add", bf_add, 1);
+    rb_define_method(cBloomFilter, "include?", bf_include, 1);
     rb_define_method(cBloomFilter, "clear", bf_clear, 0);
-    rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
+    rb_define_method(cBloomFilter, "merge", bf_merge, 1);
     rb_define_method(cBloomFilter, "&", bf_and, 1);
     rb_define_method(cBloomFilter, "|", bf_or, 1);
-    rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
     rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
     rb_define_method(cBloomFilter, "load", bf_load, 1);

data/lib/bloom_fit/configuration_mismatch.rb ADDED Viewed

@@ -0,0 +1,4 @@
+class BloomFit
+  class ConfigurationMismatch < ArgumentError
+  end
+end

data/lib/bloom_fit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BloomFit
-  VERSION = "0.2.0".freeze
+  VERSION = "0.3.1".freeze
 end

data/lib/bloom_fit.rb CHANGED Viewed

@@ -1,65 +1,214 @@
+require "forwardable"
 require "cbloomfilter"
+require "bloom_fit/configuration_mismatch"
 require "bloom_fit/version"
+# BloomFit is an in-memory Bloom filter with a small, Set-like API.
+#
+# Bloom filters are probabilistic membership structures: they can report false
+# positives, but they do not report false negatives for values that have been
+# added. That makes BloomFit useful for cheaply ruling out missing values
+# before doing more expensive work, while keeping memory usage low.
+#
+# The class wraps the native +CBloomFilter+ implementation in Ruby-friendly
+# methods such as +add+, +include?+, +merge+, +&+, and +|+. Instances can be
+# serialized with +save+ and reloaded with +BloomFit.load+.
+#
+# Filters can only be combined when they were created with the same +size+ and
+# +hashes+ values; otherwise +BloomFit::ConfigurationMismatch+ is raised.
+#
+#   filter = BloomFit.new(size: 10_000, hashes: 6)
+#   filter.add("cat")
+#   filter.include?("cat") # => true
+#   filter.include?("dog") # => false
+#
+# Choose +size+ and +hashes+ based on the expected number of inserts and the
+# false-positive rate you can tolerate.
 class BloomFit
-  class ConfigurationMismatch < ArgumentError
-  end
+  extend Forwardable
+  # The wrapped native +CBloomFilter+ instance.
+  #
+  # This is mostly useful for low-level integrations and internal filter
+  # operations such as merge, union, and intersection.
   attr_reader :bf
+  # Creates an empty Bloom filter.
+  #
+  # The defaults are a reasonable starting point for small in-memory filters,
+  # but the best values depend on how many keys you expect to insert and how
+  # many false positives you can tolerate.
+  #
+  # @param size [Integer] number of buckets in a bloom filter
+  # @param hashes [Integer] number of hash functions
   def initialize(size: 1_000, hashes: 4)
-    @size = size
-    @hashes = hashes
+    @bf = CBloomFilter.new(size, hashes)
+  end
-    # arg 1: m => size : number of buckets in a bloom filter
-    # arg 2: k => hashes : number of hash functions
-    @bf = CBloomFilter.new(@size, @hashes)
+  # :method: m
+  #
+  # Returns the configured filter width.
+  # :method: k
+  #
+  # Returns the number of hash functions applied to each key.
+  # :method: bitmap
+  #
+  # Returns the raw bitmap as a binary string.
+  #
+  # The returned bytes reflect the native representation, so the string may
+  # include padding beyond the configured filter size.
+  # :method: include?
+  #
+  # Returns +true+ when +key+ may be present and +false+ when it is definitely
+  # absent.
+  #
+  # Positive results are probabilistic and may be false positives.
+  # :method: clear
+  #
+  # Clears the filter by resetting all bits to +0+.
+  # :method: set_bits
+  #
+  # Returns the number of bits currently set to +1+.
+  def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
+  # Returns the configured filter width.
+  alias size m
+  # Returns the number of hash functions used for each inserted key.
+  alias hashes k
+  alias key? include?
+  alias [] include?
+  alias n set_bits
+  # Returns +true+ when no bits are set.
+  #
+  # This is an exact check on the filter state, unlike +include?+, which is
+  # probabilistic for positive matches.
+  def empty?
+    set_bits.zero?
   end
-  def insert(key)
-    @bf.insert(key)
+  # Adds +key+ to the filter and returns +self+.
+  #
+  # This mimics the behavior of Set#add and allows chaining with #<<.
+  def add(key)
+    @bf.add(key)
+    self
+  end
+  alias << add
+  # Adds +key+ to the filter when +value+ is truthy.
+  #
+  # This makes BloomFit behave like a write-only membership hash: truthy values
+  # add the key, while +false+ and +nil+ are ignored.
+  def []=(key, value)
+    @bf.add(key) if value
   end
-  alias []= insert
-  def include?(*keys)
-    @bf.include?(*keys)
+  # Adds +key+ only if it does not already appear to be present.
+  #
+  # Returns +self+ when the key is added and +nil+ when +include?+ is already
+  # true. This mimics Set#add?.
+  #
+  # Because Bloom filters can return false positives, +add?+ may occasionally
+  # return +nil+ for a key that has not actually been inserted before.
+  def add?(key)
+    return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+    add(key)
   end
-  alias key? include?
-  alias [] include?
-  def clear = @bf.clear
-  def size = @bf.set_bits
-  def merge!(other) = @bf.merge!(other.bf)
+  # Returns the bitmap as a hexadecimal string.
+  #
+  # This is useful for debugging, logging, or comparing filter state in a more
+  # compact form than +to_binary+.
+  def to_hex
+    length = ((size / 8.0).ceil * 8 / 4)
+    bitmap.unpack1("H*")[0...length]
+  end
-  # Returns the number of bits that are set to 1 in the filter.
-  def set_bits
-    @bf.set_bits
+  # Returns the bitmap as a binary string of +0+ and +1+ characters.
+  #
+  # The output is truncated to the configured filter width, so it omits any
+  # trailing padding present in the native bitmap.
+  def to_binary
+    bitmap.unpack1("B*")[0...size]
   end
-  # Computes the intersection of two Bloom filters.
-  # It assumes that both filters have the same size -
-  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
+  # Merges another filter or collection of keys into this filter.
+  #
+  # When +other+ is a +BloomFit+, the merge is performed bitwise and both
+  # filters must have the same +size+ and +hashes+ values. When +other+
+  # behaves like a hash, only keys with truthy values are added. Any other
+  # enumerable is treated as a list of keys.
+  #
+  # This method mutates the receiver and mimics Set#merge.
+  def merge(other)
+    if other.is_a?(BloomFit)
+      raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
+      @bf.merge(other.bf)
+    elsif other.respond_to?(:each_key)
+      other.each { |k, v| add(k) if v }
+    elsif other.is_a?(Enumerable)
+      other.each { |k| add(k) }
+    else
+      raise ArgumentError, "value must be enumerable or another BloomFit filter"
+    end
+  end
+  # Returns a new filter containing the bitwise intersection of two filters.
+  #
+  # Both filters must have the same +size+ and +hashes+ values or
+  # +BloomFit::ConfigurationMismatch+ is raised.
+  #
+  # Like all Bloom filter operations, membership checks on the result remain
+  # probabilistic and may still produce false positives.
   def &(other)
     raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
-    result = self.class.new
-    result.instance_variable_set(:@bf, @bf.&(other.bf))
-    result
+    self.class.new(size:, hashes:).tap do |result|
+      result.instance_variable_set(:@bf, @bf.&(other.bf))
+    end
   end
-  # Computes the union of two Bloom filters.
-  # It assumes that both filters have the same size -
-  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
+  alias intersection &
+  # Returns a new filter containing the bitwise union of two filters.
+  #
+  # Both filters must have the same +size+ and +hashes+ values or
+  # +BloomFit::ConfigurationMismatch+ is raised.
+  #
+  # The receiver and +other+ are left unchanged.
   def |(other)
     raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
-    result = self.class.new
-    result.instance_variable_set(:@bf, @bf.|(other.bf))
-    result
+    self.class.new(size:, hashes:).tap do |result|
+      result.instance_variable_set(:@bf, @bf.|(other.bf))
+    end
   end
+  alias union |
+  # Returns a human-readable summary of the filter's current state.
+  #
+  # The report includes the configured width (+m+), the current number of set
+  # bits (+n+), the hash count (+k+), and the predicted false-positive rate
+  # based on the current fill level.
+  def stats
+    fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
-  def bitmap
-    @bf.bitmap
+    (+"").tap do |s|
+      s << format("Number of filter buckets (m):  %d\n",     m)
+      s << format("Number of set bits (n):        %d\n",     n)
+      s << format("Number of filter hashes (k):   %d\n",     k)
+      s << format("Predicted false positive rate: %.2f%%\n", fpr)
+    end
   end
+  # Rebuilds the filter from the serialized data returned by +marshal_dump+.
+  #
+  # This hook is used by Ruby's +Marshal+ support.
   def marshal_load(ary)
     size, hashes, bitmap = *ary
@@ -67,32 +216,29 @@ class BloomFit
     @bf.load(bitmap) if bitmap
   end
+  # Returns the data Ruby's +Marshal+ uses to serialize this filter.
   def marshal_dump
-    [@size, @hashes, @bf.bitmap]
+    [size, hashes, bitmap]
   end
+  # Loads a filter from a file previously written by +save+.
+  #
+  # The file is read using Ruby's +Marshal+ format, so it should only be used
+  # with trusted input.
   def self.load(filename)
-    Marshal.load(File.open(filename, "r"))
+    Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
   end
+  # Writes the filter to +filename+ using Ruby's +Marshal+ format.
   def save(filename)
     File.open(filename, "w") do |f|
       f << Marshal.dump(self)
     end
   end
-  def stats
-    fp = ((1.0 - Math.exp(-(@hashes * size).to_f / @size))**@hashes) * 100
-    printf "Number of filter buckets (m): %d\n", @size
-    printf "Number of set bits (n): %d\n", set_bits
-    printf "Number of filter hashes (k) : %d\n", @hashes
-    printf "Predicted false positive rate = %.2f%%\n", fp
-  end
   protected
-  # Returns true if parameters of the +other+ filter are
-  # the same.
+  # Returns +true+ when +other+ has the same +size+ and +hashes+ values.
   def same_parameters?(other)
     bf.m == other.bf.m && bf.k == other.bf.k
   end

data/lib/cbloomfilter.bundle CHANGED Viewed

Binary file

data/test/bloom_fit_test.rb ADDED Viewed

@@ -0,0 +1,344 @@
+require "test_helper"
+class BloomFitTest < Minitest::Spec
+  subject { BloomFit.new(size: 100, hashes: 4) }
+  describe "#empty?" do
+    it "returns true when nothing set" do
+      assert_equal true, subject.empty? # rubocop:disable Minitest/AssertTruthy
+      assert_empty subject
+    end
+    it "returns false when something set" do
+      subject << "key"
+      assert_equal false, subject.empty? # rubocop:disable Minitest/RefuteFalse
+      refute_empty subject
+    end
+  end
+  describe "#add" do
+    it "adds the key and returns self" do
+      assert_equal subject, subject.add("test1")
+      assert_equal subject, subject.add("test2")
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "is aliased as #<<" do
+      subject << "test1" << "test2"
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "is aliased as #[]=, and handles truthy/falsey values" do
+      subject["dog"] = :bar
+      subject["cat"] = :foo
+      assert_includes subject, "dog"
+      assert_includes subject, "cat"
+      subject["bat"] = nil
+      subject["pig"] = false
+      refute_includes subject, "bat"
+      refute_includes subject, "pig"
+    end
+    it "casts using #to_s as necessary" do
+      subject << :symbol << true << 12_345
+      assert_includes subject, "symbol"
+      assert_includes subject, :symbol
+      assert_includes subject, "true"
+      assert_includes subject, "12345"
+      assert_includes subject, 12_345
+    end
+  end
+  describe "#add?" do
+    it "adds new key and returns self" do
+      assert_equal subject, subject.add("test1")
+      assert_equal subject, subject.add("test2")
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "return nil if the key already exists" do
+      subject << "test1"
+      subject << "test2"
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+      assert_nil subject.add?("test1")
+      assert_nil subject.add?("test2")
+    end
+  end
+  describe "#include?" do
+    it "returns true when a key is in the set" do
+      subject << "test1"
+      subject << "test2"
+      assert_equal true, subject.include?("test1") # rubocop:disable Minitest/AssertTruthy
+      assert_equal true, subject.include?("test2") # rubocop:disable Minitest/AssertTruthy
+    end
+    it "returns false when a key is not in the set" do
+      assert_equal false, subject.include?("test") # rubocop:disable Minitest/RefuteFalse
+      assert_equal false, subject.include?("nada") # rubocop:disable Minitest/RefuteFalse
+    end
+    it "is aliased as #key?" do
+      subject << "test1"
+      subject << "test2"
+      assert subject.key?("test1")
+      assert subject.key?("test2")
+      refute subject.key?("test3")
+    end
+    it "is aliased as #[]" do
+      subject << "test1"
+      subject << "test2"
+      assert subject["test1"]
+      assert subject["test2"]
+      refute subject["test3"]
+    end
+  end
+  describe "#clear" do
+    it "zeroes the bits" do
+      subject.add("test")
+      assert_includes subject, "test"
+      assert_includes subject.to_binary, "1"
+      subject.clear
+      refute_includes subject, "test"
+      refute_includes subject.to_binary, "1"
+    end
+  end
+  describe "#set_bits" do
+    it "returns the number of bits set to 1" do
+      bf = BloomFit.new(size: 100, hashes: 4)
+      bf.add("bits")
+      assert_equal 4, bf.set_bits
+      bf = BloomFit.new(size: 100, hashes: 1)
+      bf.add("bits")
+      assert_equal 1, bf.set_bits
+    end
+  end
+  describe "#bitmap" do
+    it "returns a binary bitmap of all zeros when empty (including a terminating byte)" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "\x00\x00\x00".b, bf.bitmap
+    end
+    it "returns a binary bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf.add("something")
+      assert_equal "(\x82\x00".b, bf.bitmap
+    end
+    it "returns a binary bitmap representing the set even if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 20, hashes: 4)
+      bf.add("wow")
+      assert_equal "\x04\x14\x00\x00".b, bf.bitmap
+    end
+  end
+  describe "#to_hex" do
+    it "returns a hex bitmap of all zeros when empty" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "0000", bf.to_hex
+    end
+    it "returns a hex bitmap of all zeros when empty if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 18)
+      assert_equal "000000", bf.to_hex
+    end
+    it "returns a hex bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf.add("cool")
+      assert_equal "1441", bf.to_hex
+    end
+  end
+  describe "#to_binary" do
+    it "returns a binary bitmap of all zeros when empty" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "0000000000000000", bf.to_binary
+    end
+    it "returns a binary bitmap of all zeros when empty if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 19)
+      assert_equal "0000000000000000000", bf.to_binary
+    end
+    it "returns a binary bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf << "cool" << "cat"
+      assert_equal "1001011001101001", bf.to_binary
+    end
+  end
+  describe "#merge" do
+    it "merges another BloomFit filter" do
+      bf1 = BloomFit.new(size: 100, hashes: 2)
+      bf2 = BloomFit.new(size: 100, hashes: 2)
+      bf1 << "mouse"
+      bf2 << "cat" << "dog"
+      refute_includes bf1, "cat"
+      refute_includes bf1, "dog"
+      bf1.merge(bf2)
+      assert_includes bf1, "mouse"
+      assert_includes bf1, "cat"
+      assert_includes bf1, "dog"
+      refute_includes bf2, "mouse"
+      assert_includes bf2, "cat"
+      assert_includes bf2, "dog"
+    end
+    it "merges an array" do
+      subject << "mouse"
+      subject.merge %i[cat dog]
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+    end
+    it "merges a set" do
+      subject << "mouse"
+      subject.merge Set.new(%w[cat dog])
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+    end
+    it "merges a hash ignoring falsey values" do
+      subject << "mouse"
+      subject.merge({ cat: 1, dog: 2, ant: false, bug: nil })
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+      refute_includes subject, "ant"
+      refute_includes subject, "bug"
+    end
+    it "raises when merge is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1.merge(bf2) }
+    end
+  end
+  describe "#&" do
+    it "returns intersection of both filters" do
+      bf1 = BloomFit.new(size: 35, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 35, hashes: 4)
+      bf2.add("test")
+      bf2.add("test2")
+      bf3 = bf1 & bf2
+      assert_equal 35, bf3.size
+      assert_equal 4, bf3.hashes
+      assert_includes bf3, "test"
+      refute_includes bf3, "test1"
+      refute_includes bf3, "test2"
+    end
+    it "is aliased as #intersection" do
+      bf1 = BloomFit.new(size: 20, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 20, hashes: 4)
+      bf2.add("test")
+      bf3 = bf1.intersection(bf2)
+      assert_includes bf3, "test"
+      refute_includes bf3, "test1"
+    end
+    it "raises when intersection is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
+      bf1 = BloomFit.new(size: 10, hashes: 2)
+      bf2 = BloomFit.new(size: 10, hashes: 4)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
+    end
+  end
+  describe "#|" do
+    it "returns union with other filter" do
+      bf1 = BloomFit.new
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new
+      bf2.add("test")
+      bf2.add("test2")
+      bf3 = bf1 | bf2
+      assert_includes bf3, "test"
+      assert_includes bf3, "test1"
+      assert_includes bf3, "test2"
+    end
+    it "is aliased as #union" do
+      bf1 = BloomFit.new(size: 20, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 20, hashes: 4)
+      bf2.add("test")
+      bf3 = bf1.union(bf2)
+      assert_includes bf3, "test"
+      assert_includes bf3, "test1"
+    end
+    it "raises when union is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 | bf2 }
+    end
+  end
+  describe "#stats" do
+    it "returns current stats" do
+      bf = BloomFit.new(size: 10, hashes: 3)
+      expected = <<~STATS
+        Number of filter buckets (m):  10
+        Number of set bits (n):        0
+        Number of filter hashes (k):   3
+        Predicted false positive rate: 0.00%
+      STATS
+      assert_equal expected, bf.stats
+    end
+  end
+  describe "serialization" do
+    after { File.unlink("bf.out") }
+    it "marshalls" do
+      bf = BloomFit.new
+      assert bf.save("bf.out")
+    end
+    it "loads from marshalled" do
+      subject.add("foo")
+      subject.add("bar")
+      subject.save("bf.out")
+      bf2 = BloomFit.load("bf.out")
+      assert_includes bf2, "foo"
+      assert_includes bf2, "bar"
+      refute_includes bf2, "baz"
+      assert subject.send(:same_parameters?, bf2)
+    end
+  end
+end

data/test/test_helper.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require "minitest/autorun"
+require "minitest/reporters"
+Minitest::Reporters.use! # override with MINITEST_REPORTER env var
+require "bloom_fit"

metadata CHANGED Viewed

@@ -1,22 +1,22 @@
 --- !ruby/object:Gem::Specification
 name: bloom_fit
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.1
 platform: ruby
 authors:
-- Ilya Grigorik
-- Tatsuya Mori
 - Ryan McGeary
 - Beshad Talayeminaei
+- Ilya Grigorik
+- Tatsuya Mori
 bindir: bin
 cert_chain: []
 date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 email:
-- ilya@grigorik.com
-- valdzone@gmail.com
 - ryan@mcgeary.org
 - 'btalayeminaei@gmail.com '
+- ilya@grigorik.com
+- valdzone@gmail.com
 executables: []
 extensions:
 - ext/cbloomfilter/extconf.rb
@@ -27,10 +27,11 @@ files:
 - ext/cbloomfilter/crc32.h
 - ext/cbloomfilter/extconf.rb
 - lib/bloom_fit.rb
+- lib/bloom_fit/configuration_mismatch.rb
 - lib/bloom_fit/version.rb
 - lib/cbloomfilter.bundle
-- spec/bloom_fit_spec.rb
-- spec/helper.rb
+- test/bloom_fit_test.rb
+- test/test_helper.rb
 homepage: https://github.com/rmm5t/bloom_fit
 licenses: []
 metadata:

data/spec/bloom_fit_spec.rb DELETED Viewed

@@ -1,129 +0,0 @@
-require "helper"
-describe BloomFit do
-  it "clears" do
-    bf = BloomFit.new(size: 100, hashes: 2)
-    bf.insert("test")
-    expect(bf.include?("test")).to be true
-    bf.clear
-    expect(bf.include?("test")).to be false
-  end
-  it "merges" do
-    bf1 = BloomFit.new(size: 100, hashes: 2)
-    bf2 = BloomFit.new(size: 100, hashes: 2)
-    bf2.insert("test")
-    expect(bf1.include?("test")).to be false
-    bf1.merge!(bf2)
-    expect(bf1.include?("test")).to be true
-    expect(bf2.include?("test")).to be true
-  end
-  it "tests set membership" do
-    bf = BloomFit.new(size: 100, hashes: 2)
-    bf.insert("test")
-    bf.insert("test1")
-    expect(bf.include?("test")).to be true
-    expect(bf.include?("abcd")).to be false
-    expect(bf.include?("test", "test1")).to be true
-    expect(bf.include?("test1", "abcd")).to be false
-  end
-  it "works with any object's to_s" do
-    subject.insert(:test)
-    subject.insert(:test1)
-    subject.insert(12_345)
-    expect(subject.include?("test")).to be true
-    expect(subject.include?("abcd")).to be false
-    expect(subject.include?("12345")).to be true
-  end
-  it "returns the number of bits set to 1" do
-    bf = BloomFit.new(hashes: 4)
-    bf.insert("test")
-    expect(bf.set_bits).to eq 4
-    bf = BloomFit.new(hashes: 1)
-    bf.insert("test")
-    expect(bf.set_bits).to eq 1
-  end
-  it "returns intersection with other filter" do
-    bf1 = BloomFit.new
-    bf1.insert("test")
-    bf1.insert("test1")
-    bf2 = BloomFit.new
-    bf2.insert("test")
-    bf2.insert("test2")
-    bf3 = bf1 & bf2
-    expect(bf3.include?("test")).to be true
-    expect(bf3.include?("test1")).to be false
-    expect(bf3.include?("test2")).to be false
-  end
-  it "raises an exception when intersection is to be computed for incompatible filters" do
-    bf1 = BloomFit.new(size: 10)
-    bf1.insert("test")
-    bf2 = BloomFit.new(size: 20)
-    bf2.insert("test")
-    expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
-  end
-  it "returns union with other filter" do
-    bf1 = BloomFit.new
-    bf1.insert("test")
-    bf1.insert("test1")
-    bf2 = BloomFit.new
-    bf2.insert("test")
-    bf2.insert("test2")
-    bf3 = bf1 | bf2
-    expect(bf3.include?("test")).to be true
-    expect(bf3.include?("test1")).to be true
-    expect(bf3.include?("test2")).to be true
-  end
-  it "raises an exception when union is to be computed for incompatible filters" do
-    bf1 = BloomFit.new(size: 10)
-    bf1.insert("test")
-    bf2 = BloomFit.new(size: 20)
-    bf2.insert("test")
-    expect { bf1 | bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
-  end
-  it "outputs current stats" do
-    subject.insert("test")
-    expect { subject.stats }.not_to raise_error
-  end
-  context "serialization" do
-    after { File.unlink("bf.out") }
-    it "marshalls" do
-      bf = BloomFit.new
-      expect { bf.save("bf.out") }.not_to raise_error
-    end
-    it "loads from marshalled" do
-      subject.insert("foo")
-      subject.insert("bar")
-      subject.save("bf.out")
-      bf2 = BloomFit.load("bf.out")
-      expect(bf2.include?("foo")).to be true
-      expect(bf2.include?("bar")).to be true
-      expect(bf2.include?("baz")).to be false
-      expect(subject.send(:same_parameters?, bf2)).to be true
-    end
-  end
-end

data/spec/helper.rb DELETED Viewed

@@ -1 +0,0 @@
-require "bloom_fit"