RubyGems - bloomer - Versions diffs - 0.0.1 → 0.0.2 - Mend

bloomer 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/README.md CHANGED Viewed

@@ -1,9 +1,14 @@
-# Bloomer: a pure-ruby bloom filter with no extra fluff
+# Bloomer: A pure-ruby bloom filter with no extra fluff
 [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
 a given string has been seen before--in constant time, and using a fixed amount of RAM.
+Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
+* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
+* if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
+```false_positive_probability``` parameter provided to the constructor).
 This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
 * uses a robust set of hashing functions
@@ -38,4 +43,10 @@ new_b.include? "a"
 * 0.0.1 Bloom, there it is.
+* 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
+  md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
+  multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%

data/bloomer.gemspec CHANGED Viewed

@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
   s.version     = Bloomer::VERSION
   s.authors     = ["Matthew McEachen"]
   s.email       = ["matthew+github@mceachen.org"]
-  s.homepage    = ""
+  s.homepage    = "https://github.com/mceachen/bloomer"
   s.summary     = %q{Pure-ruby bloom filter with minimal dependencies}
   s.description = %q{Pure-ruby bloom filter with minimal dependencies}

data/lib/bloomer.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 require 'bitarray'
+require 'digest/md5'
 class Bloomer
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
   def initialize(expected_size, false_positive_probability = 0.001, opts = {})
     @ba = opts[:ba] || begin
@@ -9,104 +10,48 @@ class Bloomer
       m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
       BitArray.new(m.round)
     end
     # k is the number of hash functions that minimizes the probability of false positives
-    k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
-    @hashes = Hashes.build(k.round)
+    @k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
   end
+  # returns true if every bit for string was already set (string had probably been added before), false if string was definitely new
   def add string
-    indicies(string).each { |ea| @ba[ea] = 1 }
+    count = 0
+    hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
+    count == @k
   end
+  # returns false if string has certainly not been added
+  # returns true if string has probably been added (false positives are possible; see the false_positive_probability constructor argument)
   def include? string
-    !indicies(string).any? { |ea| @ba[ea] == 0 }
+    !hashes(string).any? { |ea| @ba[ea] == 0 }
   end
   def _dump(depth)
-    [@hashes.size, Marshal.dump(@ba)].join("\n")
+    [@k, Marshal.dump(@ba)].join(" ")
   end
   def self._load(data)
-    k, ba = data.split("\n", 2)
+    k, ba = data.split(" ", 2)
     new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
   end
   private
-  def indicies string
-    @hashes.collect do |h|
-      h.call(string) % @ba.size
-    end
-  end
-  class CircularQueue < Array
-    def rot!
-      first = self.shift
-      self.push(first)
-      first
-    end
-  end
-  class Hashes
-    PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317]
-    def self.build(number_of_hashes)
-      hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
-      primes = CircularQueue.new PRIMES
-      while (number_of_hashes > hashes.size)
-        hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
-          send(ea, primes.rot!, primes.rot!)
-        end
-      end
-      return hashes.first(number_of_hashes)
-    end
-    MAX = 2**31 - 1
-    # written by Professor Daniel J. Bernstein from comp.lang.c
-    def self.djb_hash(a = 5381, b = nil)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          ((hash << 5) + hash + ea) % MAX
-        end
-      end
-    end
-    # bitwise hash function written by Justin Sobel
-    def self.js_hash(a = 1315423911, b = nil)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
-        end
-      end
-    end
-    # simple hash function from Robert Sedgewick's "Algorithms in C" book
-    def self.rs_hash(a = 63689, b = 378551)
-      lambda do |data|
-        i, j = a, b
-        data.each_byte.inject(0) do |hash, ea|
-          i = (i * j) % MAX
-          (hash * i + ea) % MAX
-        end
-      end
-    end
-    # From Kernighan and Ritchie's "The C Programming Language"
-    def self.knr_hash(a = 1619, b = 911)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          ((hash * b) + ea) % MAX
-        end
-      end
-    end
-    # default hash
-    def self.ruby_hash(a = 1, b = 1)
-      lambda do |data|
-        (data.hash * a) % MAX
-      end
-    end
-  end
-end
+  # Return an array of hash indices to set.
+  # Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
+  def hashes(data)
+    m = @ba.size
+    h = Digest::MD5.hexdigest(data.to_s).to_i(16)
+    x = h % m
+    h /= m
+    y = h % m
+    h /= m
+    z = h % m
+    [x] + 1.upto(@k - 1).collect do |i|
+      x = (x + y) % m
+      y = (y + z) % m
+      x
+    end
+  end
+end

data/spec/bloomer_spec.rb CHANGED Viewed

@@ -1,42 +1,64 @@
 require "spec_helper"
+require "benchmark"
-def rand_alpha(size)
-  chars = ('a'..'z').to_a + ('A'..'Z').to_a
-  (0...size).collect { chars[Kernel.rand(chars.length)] }.join
+def rand_word(length = 8)
+  ('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
 end
 describe Bloomer do
   it "should work trivially" do
     b = Bloomer.new(10, 0.001)
-    b.add("a")
+    b.add("a").should be_false
+    b.add("a").should be_true
     b.should include("a")
     b.should_not include("")
     b.should_not include("b")
-    b.add("b")
+    b.add("b").should be_false
+    b.add("b").should be_true
     b.should include("b")
     b.should_not include("")
     b.add("")
     b.should include("")
   end
-  it "should find random strings" do
-    b = Bloomer.new(5_000, 0.001)
-    inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
-    inputs.each { |ea| b.add(ea) }
-    inputs.each { |ea| b.include?(ea).should be_true }
-    5000.times.each do
-      s = rand_alpha(Kernel.rand(50))
-      b.include?(s).should == inputs.include?(s)
-    end
-  end
   it "should marshal state correctly" do
     b = Bloomer.new(10, 0.001)
     inputs = %q(a b c d)
-    inputs.each{|ea|b.add(ea)}
+    inputs.each { |ea| b.add(ea) }
     s = Marshal.dump(b)
     new_b = Marshal.load(s)
-    inputs.each{|ea|new_b.should include(ea)}
+    inputs.each { |ea| new_b.should include(ea) }
+  end
+  it "should result in similar-to-expected false positives" do
+    max_false_prob = 0.001
+    size = 50_000
+    bloom = Bloomer.new(size, max_false_prob)
+    set = Set.new
+    size.times do
+      w = rand_word
+      bloom.add(w)
+      set.add(w)
+    end
+    set.each { |ea| bloom.include?(ea).should be_true }
+    tries = size * 3
+    false_hits = 0
+    hits = 0
+    tries.times.each do
+      word = rand_word
+      b_inc, s_inc = bloom.include?(word), set.include?(word)
+      hits += 1 if s_inc
+      if s_inc && !b_inc
+        fail "'#{word}': false negative on include"
+      elsif !s_inc && b_inc
+        false_hits += 1
+      end
+    end
+    false_positive_failure_rate = false_hits.to_f / tries
+    puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
+    if (false_positive_failure_rate) > max_false_prob * 2
+      fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: bloomer
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 27
   prerelease:
   segments:
   - 0
   - 0
-  - 1
-  version: 0.0.1
+  - 2
+  version: 0.0.2
 platform: ruby
 authors:
 - Matthew McEachen
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-01-18 00:00:00 -08:00
+date: 2012-01-21 00:00:00 -08:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -52,7 +52,7 @@ files:
 - spec/bloomer_spec.rb
 - spec/spec_helper.rb
 has_rdoc: true
-homepage: ""
+homepage: https://github.com/mceachen/bloomer
 licenses: []
 post_install_message: