RubyGems - bloom_fit - Versions diffs - 0.2.0 → 0.3.0 - Mend

bloom_fit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +16 -13
data/ext/cbloomfilter/cbloomfilter.c +33 -89
data/lib/bloom_fit/configuration_mismatch.rb +4 -0
data/lib/bloom_fit/version.rb +1 -1
data/lib/bloom_fit.rb +83 -44
data/lib/cbloomfilter.bundle +0 -0
data/test/bloom_fit_test.rb +344 -0
data/test/test_helper.rb +6 -0
metadata +8 -7
data/spec/bloom_fit_spec.rb +0 -129
data/spec/helper.rb +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f542d198165a81ecdc9307e3d2b9a9168608197c117245cb89b087f5fde31081
-  data.tar.gz: 60c9bed4dfbf8b6d5e8d4cb47350b9ec31ee22ce9eae3fbcc92628cc8e4aed53
+  metadata.gz: efa22c92049e3607485a8fcfe471b15cca6e85e6da0c7b19b65f74b9f6ad5fe9
+  data.tar.gz: 5e8432456b1258111671d536165217bc3e82e0e430c3bc63112abc4670f91e78
 SHA512:
-  metadata.gz: 55e33f10d0c71aa77bece3ba974995144f44cfc644d7bbf773de9b5ea562078df4511905de6ae87f15a9b95c12975fd423c1fbe6fccfb22c4a0073b2cdf66362
-  data.tar.gz: a2cca2d8c5c2ea66979ad93030b75fc7e64f5258650dc43b13e1cbf7080ee32ae44f77ac205b9055d3174d2e95dbe32fff61b949e27fc5f3306a3a332673bf57
+  metadata.gz: 72738a57ccb3a1a8989e86993490c3ba6a4f90925c834c1acd70ba104df8ef2bb318d5c66830786ba662e88df09f9ce46d7184810e3d4ec1c6b4cc0b41fcec44
+  data.tar.gz: 7472e370d1a66a6034ecb2f0d4720b9edd12f21e181a37cae2869e0e34c70a829366e7ee6caf880f4c4d8c789bc18bdbe2f83e6699617ccc77460320f2a2a1af

data/README.md CHANGED Viewed

@@ -4,7 +4,12 @@
 [![CI](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml/badge.svg)](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
 [![Gem Downloads](https://img.shields.io/gem/dt/bloom_fit.svg)](https://rubygems.org/gems/bloom_fit)
-BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but provides a better hashing distribution by using DJB2 over CRC32, avoids the need to supply a seed, removes counting abilities, improves performance for very large datasets, and will automatically calculate the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate.
+BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but differs in the following ways:
+- uses DJB2 over CRC32 yielding better hash distribution
+- improves performance for very large datasets
+- avoids the need to supply a seed
+- automatically calculates the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate
 A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a DJB2 hash with k seeds from the CRC table.
@@ -13,8 +18,6 @@ Performance of the Bloom filter depends on the following:
 - size of the bit array
 - number of hash functions
-BloomFit is a fork of [bloomfilter-rb].
 ## Resources
 - Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
@@ -40,11 +43,11 @@ bf["bird"] = "bar"
 bf["bird"]             # => true
 bf["mouse"]            # => false
-bf.stats
-# => Number of filter bits (m): 3600
-# => Number of set bits (n): 20
-# => Number of filter hashes (k) : 10
-# => Predicted false positive rate = 0.00%
+puts bf.stats
+# Number of filter bits (m): 3600
+# Number of set bits (n): 20
+# Number of filter hashes (k) : 10
+# Predicted false positive rate = 0.00%
 ```
 If you'd like more control over the traditional inputs like bit size and the number of hashes:
@@ -62,11 +65,11 @@ bf["bird"] = "bar"
 bf["bird"]             # => true
 bf["mouse"]            # => false
-bf.stats
-# => Number of filter bits (m): 100
-# => Number of set bits (n): 4
-# => Number of filter hashes (k) : 2
-# => Predicted false positive rate = 10.87%
+puts bf.stats
+# Number of filter bits (m): 100
+# Number of set bits (n): 4
+# Number of filter hashes (k) : 2
+# Predicted false positive rate = 10.87%
 ```
 ## Credits

data/ext/cbloomfilter/cbloomfilter.c CHANGED Viewed

@@ -17,8 +17,7 @@ static unsigned int *salts = crc_table;
 static VALUE cBloomFilter;
 struct BloomFilter {
-    int m; /* # of buckets in a bloom filter */
-    int b; /* # of bits in a bloom filter bucket */
+    int m; /* # of bits in a bloom filter */
     int k; /* # of hash functions */
     unsigned char *ptr; /* bits data */
     int bytes; /* size of byte data */
@@ -72,7 +71,6 @@ static VALUE bf_alloc(VALUE klass) {
     VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
     bf->m = 0;
-    bf->b = 0;
     bf->k = 0;
     bf->ptr = NULL;
     bf->bytes = 0;
@@ -80,52 +78,24 @@ static VALUE bf_alloc(VALUE klass) {
     return obj;
 }
-void bucket_unset(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    if ((c & mask) == 0) {
-      // do nothing
-    } else {
-        // reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
-        c -= (1 << bit_offset) & ((1 << 8) -1);
-        // shift the bitmap right by 1 bit: 10 00 => 01 00
-        c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
-        bf->ptr[byte_offset] = c & ((1 << 8) - 1);
-        bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
-    }
-}
+static void bucket_set(struct BloomFilter *bf, int index) {
+    int byte_offset = index / 8;
+    int bit_offset = index % 8;
-void bucket_set(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    if ((c & mask) != mask) {
-        c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
-        bf->ptr[byte_offset] = c & ((1 << 8) - 1);
-        bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
-    }
+    bf->ptr[byte_offset] |= (unsigned char) (1U << bit_offset);
 }
-int bucket_check(struct BloomFilter *bf, int index) {
-    int byte_offset = (index * bf->b) / 8;
-    int bit_offset = (index * bf->b) % 8;
-    unsigned int c = bf->ptr[byte_offset];
-    c += bf->ptr[byte_offset + 1] << 8;
+static int bucket_check(struct BloomFilter *bf, int index) {
+    int byte_offset = index / 8;
+    int bit_offset = index % 8;
-    unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
-    return (c & mask) >> bit_offset;
+    return (bf->ptr[byte_offset] >> bit_offset) & 1;
 }
 static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
     struct BloomFilter *bf;
     VALUE arg1, arg2;
-    int m, k, b;
+    int m, k;
     bf = bf_ptr(self);
@@ -143,21 +113,20 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
     m = FIX2INT(arg1);
     k = FIX2INT(arg2);
-    b = 1;
     if (m < 1)
         rb_raise(rb_eArgError, "array size");
     if (k < 1)
         rb_raise(rb_eArgError, "hash length");
-    bf->b = b;
     bf->m = m;
     bf->k = k;
     ruby_xfree(bf->ptr);
     bf->ptr = NULL;
     bf->bytes = 0;
-    bf->bytes = ((m * b) + 15) / 8;
+    /* Preserve the existing serialized bitmap length, including one padding byte. */
+    bf->bytes = (m + 15) / 8;
     bf->ptr = ALLOC_N(unsigned char, bf->bytes);
     /* initialize the bits with zeros */
@@ -194,7 +163,7 @@ static VALUE bf_set_bits(VALUE self){
     return INT2FIX(count);
 }
-static VALUE bf_insert(VALUE self, VALUE key) {
+static VALUE bf_add(VALUE self, VALUE key) {
     VALUE skey;
     unsigned long hash;
     int index;
@@ -268,57 +237,34 @@ static VALUE bf_or(VALUE self, VALUE other) {
     return obj;
 }
-static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
+static VALUE bf_include(VALUE self, VALUE key) {
+    VALUE skey;
     unsigned long hash;
-    int i, len, m, k;
     int index;
-    long tests_idx, vlen;
+    int i, len, m, k;
     char *ckey;
-    VALUE tests, key, skey;
-    struct BloomFilter *bf;
+    struct BloomFilter *bf = bf_ptr(self);
-    rb_scan_args(argc, argv, "*", &tests);
+    skey = rb_obj_as_string(key);
+    ckey = StringValuePtr(skey);
+    len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
-    bf = bf_ptr(self);
-    vlen = RARRAY_LEN(tests);
-    for (tests_idx = 0; tests_idx < vlen; tests_idx++) {
-        key = rb_ary_entry(tests, tests_idx);
-        skey = rb_obj_as_string(key);
-        ckey = StringValuePtr(skey);
-        len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
-        m = bf->m;
-        k = bf->k;
-        hash = (unsigned long) djb2(ckey, len);
-        for (i = 0; i <= k - 1; i++) {
-            index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
-            /* check the bit at the index */
-            if (!bucket_check(bf, index)) {
-                return Qfalse; /* i.e., it is a new entry ; escape the loop */
-            }
+    m = bf->m;
+    k = bf->k;
+    hash = (unsigned long) djb2(ckey, len);
+    for (i = 0; i <= k - 1; i++) {
+        index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
+        /* check the bit at the index */
+        if (!bucket_check(bf, index)) {
+            return Qfalse; /* i.e., it is a new entry ; escape the loop */
         }
     }
     return Qtrue;
 }
-static VALUE bf_to_s(VALUE self) {
-    struct BloomFilter *bf = bf_ptr(self);
-    unsigned char *ptr;
-    int i;
-    VALUE str;
-    str = rb_str_new(0, bf->m);
-    ptr = (unsigned char *) RSTRING_PTR(str);
-    for (i = 0; i < bf->m; i++)
-        *ptr++ = bucket_check(bf, i) ? '1' : '0';
-    return str;
-}
 static VALUE bf_bitmap(VALUE self) {
     struct BloomFilter *bf = bf_ptr(self);
@@ -346,15 +292,13 @@ void Init_cbloomfilter(void) {
     rb_define_method(cBloomFilter, "m", bf_m, 0);
     rb_define_method(cBloomFilter, "k", bf_k, 0);
     rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
-    /* rb_define_method(cBloomFilter, "s", bf_s, 0); */
-    rb_define_method(cBloomFilter, "insert", bf_insert, 1);
-    rb_define_method(cBloomFilter, "include?", bf_include, -1);
+    rb_define_method(cBloomFilter, "add", bf_add, 1);
+    rb_define_method(cBloomFilter, "include?", bf_include, 1);
     rb_define_method(cBloomFilter, "clear", bf_clear, 0);
-    rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
+    rb_define_method(cBloomFilter, "merge", bf_merge, 1);
     rb_define_method(cBloomFilter, "&", bf_and, 1);
     rb_define_method(cBloomFilter, "|", bf_or, 1);
-    rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
     rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
     rb_define_method(cBloomFilter, "load", bf_load, 1);

data/lib/bloom_fit/configuration_mismatch.rb ADDED Viewed

@@ -0,0 +1,4 @@
+class BloomFit
+  class ConfigurationMismatch < ArgumentError
+  end
+end

data/lib/bloom_fit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BloomFit
-  VERSION = "0.2.0".freeze
+  VERSION = "0.3.0".freeze
 end

data/lib/bloom_fit.rb CHANGED Viewed

@@ -1,63 +1,110 @@
+require "forwardable"
 require "cbloomfilter"
+require "bloom_fit/configuration_mismatch"
 require "bloom_fit/version"
 class BloomFit
-  class ConfigurationMismatch < ArgumentError
-  end
+  extend Forwardable
   attr_reader :bf
+  # @param size [Integer] number of buckets in a bloom filter
+  # @param hashes [Integer] number of hash functions
   def initialize(size: 1_000, hashes: 4)
-    @size = size
-    @hashes = hashes
+    @bf = CBloomFilter.new(size, hashes)
+  end
+  def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
+  alias size m
+  alias hashes k
+  alias key? include?
+  alias [] include?
+  alias n set_bits
-    # arg 1: m => size : number of buckets in a bloom filter
-    # arg 2: k => hashes : number of hash functions
-    @bf = CBloomFilter.new(@size, @hashes)
+  def empty?
+    set_bits.zero?
   end
-  def insert(key)
-    @bf.insert(key)
+  # Adds the given key to the set and returns +self+.  Mimics the behavior of
+  # +Set#add+
+  def add(key)
+    @bf.add(key)
+    self
   end
-  alias []= insert
+  alias << add
-  def include?(*keys)
-    @bf.include?(*keys)
+  # Adds the given key to the set if the value is truthy.  Mimics the behavior of
+  # +Hash#[]=+
+  def []=(key, value)
+    @bf.add(key) if value
   end
-  alias key? include?
-  alias [] include?
-  def clear = @bf.clear
-  def size = @bf.set_bits
-  def merge!(other) = @bf.merge!(other.bf)
+  # Adds the given key to the set and returns +self+. If the key is already
+  # in the set, returns +nil+. Mimics the behavior of +Set#add?+
+  def add?(key)
+    return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
+    add(key)
+  end
+  # Returns a string of the set bits in hex format
+  def to_hex
+    length = ((size / 8.0).ceil * 8 / 4)
+    bitmap.unpack1("H*")[0...length]
+  end
-  # Returns the number of bits that are set to 1 in the filter.
-  def set_bits
-    @bf.set_bits
+  # Returns a string of the set bits in binary format
+  def to_binary
+    bitmap.unpack1("B*")[0...size]
   end
-  # Computes the intersection of two Bloom filters.
-  # It assumes that both filters have the same size -
-  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
+  # Adds the set from another BloomFit filter or adds all the elements from an
+  # enumerable.  Mimics the behavior of +Set#merge+
+  def merge(other)
+    if other.is_a?(BloomFit)
+      raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
+      @bf.merge(other.bf)
+    elsif other.respond_to?(:each_key)
+      other.each { |k, v| add(k) if v }
+    elsif other.is_a?(Enumerable)
+      other.each { |k| add(k) }
+    else
+      raise ArgumentError, "value must be enumerable or another BloomFit filter"
+    end
+  end
+  # Computes the intersection of two Bloom filters. It requires that both
+  # filters have the same size; otherwise, +BloomFit::ConfigurationMismatch+
+  # is raised.
   def &(other)
     raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
-    result = self.class.new
-    result.instance_variable_set(:@bf, @bf.&(other.bf))
-    result
+    self.class.new(size:, hashes:).tap do |result|
+      result.instance_variable_set(:@bf, @bf.&(other.bf))
+    end
   end
+  alias intersection &
-  # Computes the union of two Bloom filters.
-  # It assumes that both filters have the same size -
-  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
+  # Computes the union of two Bloom filters. It requires that both filters
+  # have the same size; otherwise, +BloomFit::ConfigurationMismatch+ is
+  # raised.
   def |(other)
     raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
-    result = self.class.new
-    result.instance_variable_set(:@bf, @bf.|(other.bf))
-    result
+    self.class.new(size:, hashes:).tap do |result|
+      result.instance_variable_set(:@bf, @bf.|(other.bf))
+    end
   end
+  alias union |
-  def bitmap
-    @bf.bitmap
+  def stats
+    fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
+    (+"").tap do |s|
+      s << format("Number of filter buckets (m):  %d\n",     m)
+      s << format("Number of set bits (n):        %d\n",     n)
+      s << format("Number of filter hashes (k):   %d\n",     k)
+      s << format("Predicted false positive rate: %.2f%%\n", fpr)
+    end
   end
   def marshal_load(ary)
@@ -68,11 +115,11 @@ class BloomFit
   end
   def marshal_dump
-    [@size, @hashes, @bf.bitmap]
+    [size, hashes, bitmap]
   end
   def self.load(filename)
-    Marshal.load(File.open(filename, "r"))
+    Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
   end
   def save(filename)
@@ -81,14 +128,6 @@ class BloomFit
     end
   end
-  def stats
-    fp = ((1.0 - Math.exp(-(@hashes * size).to_f / @size))**@hashes) * 100
-    printf "Number of filter buckets (m): %d\n", @size
-    printf "Number of set bits (n): %d\n", set_bits
-    printf "Number of filter hashes (k) : %d\n", @hashes
-    printf "Predicted false positive rate = %.2f%%\n", fp
-  end
   protected
   # Returns true if parameters of the +other+ filter are

data/lib/cbloomfilter.bundle CHANGED Viewed

Binary file

data/test/bloom_fit_test.rb ADDED Viewed

@@ -0,0 +1,344 @@
+require "test_helper"
+class BloomFitTest < Minitest::Spec
+  subject { BloomFit.new(size: 100, hashes: 4) }
+  describe "#empty?" do
+    it "returns true when nothing set" do
+      assert_equal true, subject.empty? # rubocop:disable Minitest/AssertTruthy
+      assert_empty subject
+    end
+    it "returns false when something set" do
+      subject << "key"
+      assert_equal false, subject.empty? # rubocop:disable Minitest/RefuteFalse
+      refute_empty subject
+    end
+  end
+  describe "#add" do
+    it "adds the key and returns self" do
+      assert_equal subject, subject.add("test1")
+      assert_equal subject, subject.add("test2")
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "is aliased as #<<" do
+      subject << "test1" << "test2"
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "is aliased as #[]=, and handles truthy/falsey values" do
+      subject["dog"] = :bar
+      subject["cat"] = :foo
+      assert_includes subject, "dog"
+      assert_includes subject, "cat"
+      subject["bat"] = nil
+      subject["pig"] = false
+      refute_includes subject, "bat"
+      refute_includes subject, "pig"
+    end
+    it "casts using #to_s as necessary" do
+      subject << :symbol << true << 12_345
+      assert_includes subject, "symbol"
+      assert_includes subject, :symbol
+      assert_includes subject, "true"
+      assert_includes subject, "12345"
+      assert_includes subject, 12_345
+    end
+  end
+  describe "#add?" do
+    it "adds new key and returns self" do
+      assert_equal subject, subject.add("test1")
+      assert_equal subject, subject.add("test2")
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+    end
+    it "return nil if the key already exists" do
+      subject << "test1"
+      subject << "test2"
+      assert_includes subject, "test1"
+      assert_includes subject, "test2"
+      assert_nil subject.add?("test1")
+      assert_nil subject.add?("test2")
+    end
+  end
+  describe "#include?" do
+    it "returns true when a key is in the set" do
+      subject << "test1"
+      subject << "test2"
+      assert_equal true, subject.include?("test1") # rubocop:disable Minitest/AssertTruthy
+      assert_equal true, subject.include?("test2") # rubocop:disable Minitest/AssertTruthy
+    end
+    it "returns false when a key is not in the set" do
+      assert_equal false, subject.include?("test") # rubocop:disable Minitest/RefuteFalse
+      assert_equal false, subject.include?("nada") # rubocop:disable Minitest/RefuteFalse
+    end
+    it "is aliased as #key?" do
+      subject << "test1"
+      subject << "test2"
+      assert subject.key?("test1")
+      assert subject.key?("test2")
+      refute subject.key?("test3")
+    end
+    it "is aliased as #[]" do
+      subject << "test1"
+      subject << "test2"
+      assert subject["test1"]
+      assert subject["test2"]
+      refute subject["test3"]
+    end
+  end
+  describe "#clear" do
+    it "zeroes the bits" do
+      subject.add("test")
+      assert_includes subject, "test"
+      assert_includes subject.to_binary, "1"
+      subject.clear
+      refute_includes subject, "test"
+      refute_includes subject.to_binary, "1"
+    end
+  end
+  describe "#set_bits" do
+    it "returns the number of bits set to 1" do
+      bf = BloomFit.new(size: 100, hashes: 4)
+      bf.add("bits")
+      assert_equal 4, bf.set_bits
+      bf = BloomFit.new(size: 100, hashes: 1)
+      bf.add("bits")
+      assert_equal 1, bf.set_bits
+    end
+  end
+  describe "#bitmap" do
+    it "returns a binary bitmap of all zeros when empty (including a terminating byte)" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "\x00\x00\x00".b, bf.bitmap
+    end
+    it "returns a binary bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf.add("something")
+      assert_equal "(\x82\x00".b, bf.bitmap
+    end
+    it "returns a binary bitmap representing the set even if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 20, hashes: 4)
+      bf.add("wow")
+      assert_equal "\x04\x14\x00\x00".b, bf.bitmap
+    end
+  end
+  describe "#to_hex" do
+    it "returns a hex bitmap of all zeros when empty" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "0000", bf.to_hex
+    end
+    it "returns a hex bitmap of all zeros when empty if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 18)
+      assert_equal "000000", bf.to_hex
+    end
+    it "returns a hex bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf.add("cool")
+      assert_equal "1441", bf.to_hex
+    end
+  end
+  describe "#to_binary" do
+    it "returns a binary bitmap of all zeros when empty" do
+      bf = BloomFit.new(size: 16)
+      assert_equal "0000000000000000", bf.to_binary
+    end
+    it "returns a binary bitmap of all zeros when empty if not a multiple of 8 bits" do
+      bf = BloomFit.new(size: 19)
+      assert_equal "0000000000000000000", bf.to_binary
+    end
+    it "returns a binary bitmap representing the set" do
+      bf = BloomFit.new(size: 16, hashes: 4)
+      bf << "cool" << "cat"
+      assert_equal "1001011001101001", bf.to_binary
+    end
+  end
+  describe "#merge" do
+    it "merges another BloomFit filter" do
+      bf1 = BloomFit.new(size: 100, hashes: 2)
+      bf2 = BloomFit.new(size: 100, hashes: 2)
+      bf1 << "mouse"
+      bf2 << "cat" << "dog"
+      refute_includes bf1, "cat"
+      refute_includes bf1, "dog"
+      bf1.merge(bf2)
+      assert_includes bf1, "mouse"
+      assert_includes bf1, "cat"
+      assert_includes bf1, "dog"
+      refute_includes bf2, "mouse"
+      assert_includes bf2, "cat"
+      assert_includes bf2, "dog"
+    end
+    it "merges an array" do
+      subject << "mouse"
+      subject.merge %i[cat dog]
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+    end
+    it "merges a set" do
+      subject << "mouse"
+      subject.merge Set.new(%w[cat dog])
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+    end
+    it "merges a hash ignoring falsey values" do
+      subject << "mouse"
+      subject.merge({ cat: 1, dog: 2, ant: false, bug: nil })
+      assert_includes subject, "mouse"
+      assert_includes subject, "cat"
+      assert_includes subject, "dog"
+      refute_includes subject, "ant"
+      refute_includes subject, "bug"
+    end
+    it "raises when merge is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1.merge(bf2) }
+    end
+  end
+  describe "#&" do
+    it "returns intersection of both filters" do
+      bf1 = BloomFit.new(size: 35, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 35, hashes: 4)
+      bf2.add("test")
+      bf2.add("test2")
+      bf3 = bf1 & bf2
+      assert_equal 35, bf3.size
+      assert_equal 4, bf3.hashes
+      assert_includes bf3, "test"
+      refute_includes bf3, "test1"
+      refute_includes bf3, "test2"
+    end
+    it "is aliased as #intersection" do
+      bf1 = BloomFit.new(size: 20, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 20, hashes: 4)
+      bf2.add("test")
+      bf3 = bf1.intersection(bf2)
+      assert_includes bf3, "test"
+      refute_includes bf3, "test1"
+    end
+    it "raises when intersection is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
+      bf1 = BloomFit.new(size: 10, hashes: 2)
+      bf2 = BloomFit.new(size: 10, hashes: 4)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
+    end
+  end
+  describe "#|" do
+    it "returns union with other filter" do
+      bf1 = BloomFit.new
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new
+      bf2.add("test")
+      bf2.add("test2")
+      bf3 = bf1 | bf2
+      assert_includes bf3, "test"
+      assert_includes bf3, "test1"
+      assert_includes bf3, "test2"
+    end
+    it "is aliased as #union" do
+      bf1 = BloomFit.new(size: 20, hashes: 4)
+      bf1.add("test")
+      bf1.add("test1")
+      bf2 = BloomFit.new(size: 20, hashes: 4)
+      bf2.add("test")
+      bf3 = bf1.union(bf2)
+      assert_includes bf3, "test"
+      assert_includes bf3, "test1"
+    end
+    it "raises when union is between incompatible filters" do
+      bf1 = BloomFit.new(size: 10)
+      bf2 = BloomFit.new(size: 20)
+      assert_raises(BloomFit::ConfigurationMismatch) { bf1 | bf2 }
+    end
+  end
+  describe "#stats" do
+    it "returns current stats" do
+      bf = BloomFit.new(size: 10, hashes: 3)
+      expected = <<~STATS
+        Number of filter buckets (m):  10
+        Number of set bits (n):        0
+        Number of filter hashes (k):   3
+        Predicted false positive rate: 0.00%
+      STATS
+      assert_equal expected, bf.stats
+    end
+  end
+  describe "serialization" do
+    after { File.unlink("bf.out") }
+    it "marshalls" do
+      bf = BloomFit.new
+      assert bf.save("bf.out")
+    end
+    it "loads from marshalled" do
+      subject.add("foo")
+      subject.add("bar")
+      subject.save("bf.out")
+      bf2 = BloomFit.load("bf.out")
+      assert_includes bf2, "foo"
+      assert_includes bf2, "bar"
+      refute_includes bf2, "baz"
+      assert subject.send(:same_parameters?, bf2)
+    end
+  end
+end

data/test/test_helper.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require "minitest/autorun"
+require "minitest/reporters"
+Minitest::Reporters.use! # override with MINITEST_REPORTER env var
+require "bloom_fit"

metadata CHANGED Viewed

@@ -1,22 +1,22 @@
 --- !ruby/object:Gem::Specification
 name: bloom_fit
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
-- Ilya Grigorik
-- Tatsuya Mori
 - Ryan McGeary
 - Beshad Talayeminaei
+- Ilya Grigorik
+- Tatsuya Mori
 bindir: bin
 cert_chain: []
 date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 email:
-- ilya@grigorik.com
-- valdzone@gmail.com
 - ryan@mcgeary.org
 - 'btalayeminaei@gmail.com '
+- ilya@grigorik.com
+- valdzone@gmail.com
 executables: []
 extensions:
 - ext/cbloomfilter/extconf.rb
@@ -27,10 +27,11 @@ files:
 - ext/cbloomfilter/crc32.h
 - ext/cbloomfilter/extconf.rb
 - lib/bloom_fit.rb
+- lib/bloom_fit/configuration_mismatch.rb
 - lib/bloom_fit/version.rb
 - lib/cbloomfilter.bundle
-- spec/bloom_fit_spec.rb
-- spec/helper.rb
+- test/bloom_fit_test.rb
+- test/test_helper.rb
 homepage: https://github.com/rmm5t/bloom_fit
 licenses: []
 metadata:

data/spec/bloom_fit_spec.rb DELETED Viewed

@@ -1,129 +0,0 @@
-require "helper"
-describe BloomFit do
-  it "clears" do
-    bf = BloomFit.new(size: 100, hashes: 2)
-    bf.insert("test")
-    expect(bf.include?("test")).to be true
-    bf.clear
-    expect(bf.include?("test")).to be false
-  end
-  it "merges" do
-    bf1 = BloomFit.new(size: 100, hashes: 2)
-    bf2 = BloomFit.new(size: 100, hashes: 2)
-    bf2.insert("test")
-    expect(bf1.include?("test")).to be false
-    bf1.merge!(bf2)
-    expect(bf1.include?("test")).to be true
-    expect(bf2.include?("test")).to be true
-  end
-  it "tests set membership" do
-    bf = BloomFit.new(size: 100, hashes: 2)
-    bf.insert("test")
-    bf.insert("test1")
-    expect(bf.include?("test")).to be true
-    expect(bf.include?("abcd")).to be false
-    expect(bf.include?("test", "test1")).to be true
-    expect(bf.include?("test1", "abcd")).to be false
-  end
-  it "works with any object's to_s" do
-    subject.insert(:test)
-    subject.insert(:test1)
-    subject.insert(12_345)
-    expect(subject.include?("test")).to be true
-    expect(subject.include?("abcd")).to be false
-    expect(subject.include?("12345")).to be true
-  end
-  it "returns the number of bits set to 1" do
-    bf = BloomFit.new(hashes: 4)
-    bf.insert("test")
-    expect(bf.set_bits).to eq 4
-    bf = BloomFit.new(hashes: 1)
-    bf.insert("test")
-    expect(bf.set_bits).to eq 1
-  end
-  it "returns intersection with other filter" do
-    bf1 = BloomFit.new
-    bf1.insert("test")
-    bf1.insert("test1")
-    bf2 = BloomFit.new
-    bf2.insert("test")
-    bf2.insert("test2")
-    bf3 = bf1 & bf2
-    expect(bf3.include?("test")).to be true
-    expect(bf3.include?("test1")).to be false
-    expect(bf3.include?("test2")).to be false
-  end
-  it "raises an exception when intersection is to be computed for incompatible filters" do
-    bf1 = BloomFit.new(size: 10)
-    bf1.insert("test")
-    bf2 = BloomFit.new(size: 20)
-    bf2.insert("test")
-    expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
-  end
-  it "returns union with other filter" do
-    bf1 = BloomFit.new
-    bf1.insert("test")
-    bf1.insert("test1")
-    bf2 = BloomFit.new
-    bf2.insert("test")
-    bf2.insert("test2")
-    bf3 = bf1 | bf2
-    expect(bf3.include?("test")).to be true
-    expect(bf3.include?("test1")).to be true
-    expect(bf3.include?("test2")).to be true
-  end
-  it "raises an exception when union is to be computed for incompatible filters" do
-    bf1 = BloomFit.new(size: 10)
-    bf1.insert("test")
-    bf2 = BloomFit.new(size: 20)
-    bf2.insert("test")
-    expect { bf1 | bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
-  end
-  it "outputs current stats" do
-    subject.insert("test")
-    expect { subject.stats }.not_to raise_error
-  end
-  context "serialization" do
-    after { File.unlink("bf.out") }
-    it "marshalls" do
-      bf = BloomFit.new
-      expect { bf.save("bf.out") }.not_to raise_error
-    end
-    it "loads from marshalled" do
-      subject.insert("foo")
-      subject.insert("bar")
-      subject.save("bf.out")
-      bf2 = BloomFit.load("bf.out")
-      expect(bf2.include?("foo")).to be true
-      expect(bf2.include?("bar")).to be true
-      expect(bf2.include?("baz")).to be false
-      expect(subject.send(:same_parameters?, bf2)).to be true
-    end
-  end
-end

data/spec/helper.rb DELETED Viewed

	@@ -1 +0,0 @@
1	- require "bloom_fit"