RubyGems - bloomer - Versions diffs - 0.0.1 → 0.0.2 - Mend

bloomer 0.0.1 → 0.0.2

Files changed (5) hide show

data/README.md CHANGED Viewed

@@ -1,9 +1,14 @@
-# Bloomer: a pure-ruby bloom filter with no extra fluff
+# Bloomer: A pure-ruby bloom filter with no extra fluff
 [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
 a given string has been seen before--in constant time, and using a fixed amount of RAM.
+Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
+* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
+* if ```include?``` returns *true*, that string was *probably* ```add```ed, but it may be a false positive (how often
+depends on the ```false_positive_probability``` parameter provided to the constructor).
 This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
 * uses a robust set of hashing functions
@@ -38,4 +43,10 @@ new_b.include? "a"
 * 0.0.1 Bloom, there it is.
+* 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
+  md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
+  multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%

data/bloomer.gemspec CHANGED Viewed

@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
   s.version     = Bloomer::VERSION
   s.authors     = ["Matthew McEachen"]
   s.email       = ["matthew+github@mceachen.org"]
-  s.homepage    = ""
+  s.homepage    = "https://github.com/mceachen/bloomer"
   s.summary     = %q{Pure-ruby bloom filter with minimal dependencies}
   s.description = %q{Pure-ruby bloom filter with minimal dependencies}

data/lib/bloomer.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 require 'bitarray'
+require 'digest/md5'
 class Bloomer
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
   def initialize(expected_size, false_positive_probability = 0.001, opts = {})
     @ba = opts[:ba] || begin
@@ -9,104 +10,48 @@ class Bloomer
       m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
       BitArray.new(m.round)
     end
     # k is the number of hash functions that minimizes the probability of false positives
-    k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
-    @hashes = Hashes.build(k.round)
+    @k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
   end
+  # returns true if every bit for string was already set (string had probably been added before); false if it was definitely new
   def add string
-    indicies(string).each { |ea| @ba[ea] = 1 }
+    count = 0
+    hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
+    count == @k
   end
+  # returns false if the item hadn't already been added
+  # returns true if it is likely that string had been added. See the false_positive_probability constructor parameter
   def include? string
-    !indicies(string).any? { |ea| @ba[ea] == 0 }
+    !hashes(string).any? { |ea| @ba[ea] == 0 }
   end
   def _dump(depth)
-    [@hashes.size, Marshal.dump(@ba)].join("\n")
+    [@k, Marshal.dump(@ba)].join(" ")
   end
   def self._load(data)
-    k, ba = data.split("\n", 2)
+    k, ba = data.split(" ", 2)
     new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
   end
   private
-  def indicies string
-    @hashes.collect do |h|
-      h.call(string) % @ba.size
-    end
-  end
-  class CircularQueue < Array
-    def rot!
-      first = self.shift
-      self.push(first)
-      first
-    end
-  end
-  class Hashes
-    PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317]
-    def self.build(number_of_hashes)
-      hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
-      primes = CircularQueue.new PRIMES
-      while (number_of_hashes > hashes.size)
-        hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
-          send(ea, primes.rot!, primes.rot!)
-        end
-      end
-      return hashes.first(number_of_hashes)
-    end
-    MAX = 2**31 - 1
-    # written by Professor Daniel J. Bernstein from comp.lang.c
-    def self.djb_hash(a = 5381, b = nil)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          ((hash << 5) + hash + ea) % MAX
-        end
-      end
-    end
-    # bitwise hash function written by Justin Sobel
-    def self.js_hash(a = 1315423911, b = nil)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
-        end
-      end
-    end
-    # simple hash function from Robert Sedgewick's "Algorithms in C" book
-    def self.rs_hash(a = 63689, b = 378551)
-      lambda do |data|
-        i, j = a, b
-        data.each_byte.inject(0) do |hash, ea|
-          i = (i * j) % MAX
-          (hash * i + ea) % MAX
-        end
-      end
-    end
-    # From Kernighan and Ritchie's "The C Programming Language"
-    def self.knr_hash(a = 1619, b = 911)
-      lambda do |data|
-        data.each_byte.inject(a) do |hash, ea|
-          ((hash * b) + ea) % MAX
-        end
-      end
-    end
-    # default hash
-    def self.ruby_hash(a = 1, b = 1)
-      lambda do |data|
-        (data.hash * a) % MAX
-      end
-    end
-  end
-end
+  # Return an array of hash indices to set.
+  # Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
+  def hashes(data)
+    m = @ba.size
+    h = Digest::MD5.hexdigest(data.to_s).to_i(16)
+    x = h % m
+    h /= m
+    y = h % m
+    h /= m
+    z = h % m
+    [x] + 1.upto(@k - 1).collect do |i|
+      x = (x + y) % m
+      y = (y + z) % m
+      x
+    end
+  end
+end

data/spec/bloomer_spec.rb CHANGED Viewed

@@ -1,42 +1,64 @@
 require "spec_helper"
+require "benchmark"
-def rand_alpha(size)
-  chars = ('a'..'z').to_a + ('A'..'Z').to_a
-  (0...size).collect { chars[Kernel.rand(chars.length)] }.join
+def rand_word(length = 8)
+  ('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
 end
 describe Bloomer do
   it "should work trivially" do
     b = Bloomer.new(10, 0.001)
-    b.add("a")
+    b.add("a").should be_false
+    b.add("a").should be_true
     b.should include("a")
     b.should_not include("")
     b.should_not include("b")
-    b.add("b")
+    b.add("b").should be_false
+    b.add("b").should be_true
     b.should include("b")
     b.should_not include("")
     b.add("")
     b.should include("")
   end
-  it "should find random strings" do
-    b = Bloomer.new(5_000, 0.001)
-    inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
-    inputs.each { |ea| b.add(ea) }
-    inputs.each { |ea| b.include?(ea).should be_true }
-    5000.times.each do
-      s = rand_alpha(Kernel.rand(50))
-      b.include?(s).should == inputs.include?(s)
-    end
-  end
   it "should marshal state correctly" do
     b = Bloomer.new(10, 0.001)
     inputs = %q(a b c d)
-    inputs.each{|ea|b.add(ea)}
+    inputs.each { |ea| b.add(ea) }
     s = Marshal.dump(b)
     new_b = Marshal.load(s)
-    inputs.each{|ea|new_b.should include(ea)}
+    inputs.each { |ea| new_b.should include(ea) }
+  end
+  it "should result in similar-to-expected false positives" do
+    max_false_prob = 0.001
+    size = 50_000
+    bloom = Bloomer.new(size, max_false_prob)
+    set = Set.new
+    size.times do
+      w = rand_word
+      bloom.add(w)
+      set.add(w)
+    end
+    set.each { |ea| bloom.include?(ea).should be_true }
+    tries = size * 3
+    false_hits = 0
+    hits = 0
+    tries.times.each do
+      word = rand_word
+      b_inc, s_inc = bloom.include?(word), set.include?(word)
+      hits += 1 if s_inc
+      if s_inc && !b_inc
+        fail "'#{word}': false negative on include"
+      elsif !s_inc && b_inc
+        false_hits += 1
+      end
+    end
+    false_positive_failure_rate = false_hits.to_f / tries
+    puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
+    if (false_positive_failure_rate) > max_false_prob * 2
+      fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: bloomer
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 27
   prerelease:
   segments:
   - 0
   - 0
-  - 1
-  version: 0.0.1
+  - 2
+  version: 0.0.2
 platform: ruby
 authors:
 - Matthew McEachen
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-01-18 00:00:00 -08:00
+date: 2012-01-21 00:00:00 -08:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -52,7 +52,7 @@ files:
 - spec/bloomer_spec.rb
 - spec/spec_helper.rb
 has_rdoc: true
-homepage: ""
+homepage: https://github.com/mceachen/bloomer
 licenses: []
 post_install_message: