bloomer 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,9 +1,14 @@
1
- # Bloomer: a pure-ruby bloom filter with no extra fluff
2
-
1
+ # Bloomer: A pure-ruby bloom filter with no extra fluff
3
2
 
4
3
  [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
5
4
  a given string has been seen before--in constant time, and using a fixed amount of RAM.
6
5
 
6
+ Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
7
+
8
+ * if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
9
+ * if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
10
+ ```false_positive_probability``` parameter provided to the constructor).
11
+
7
12
  This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
8
13
 
9
14
  * uses a robust set of hashing functions
@@ -38,4 +43,10 @@ new_b.include? "a"
38
43
 
39
44
  * 0.0.1 Bloom, there it is.
40
45
 
46
+ * 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
47
+
48
+ md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
49
+ multihash (v0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
50
+
51
+
41
52
 
data/bloomer.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
8
8
  s.version = Bloomer::VERSION
9
9
  s.authors = ["Matthew McEachen"]
10
10
  s.email = ["matthew+github@mceachen.org"]
11
- s.homepage = ""
11
+ s.homepage = "https://github.com/mceachen/bloomer"
12
12
  s.summary = %q{Pure-ruby bloom filter with minimal dependencies}
13
13
  s.description = %q{Pure-ruby bloom filter with minimal dependencies}
14
14
 
data/lib/bloomer.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'bitarray'
2
+ require 'digest/md5'
2
3
 
3
4
  class Bloomer
4
- VERSION = "0.0.1"
5
+ VERSION = "0.0.2"
5
6
 
6
7
  def initialize(expected_size, false_positive_probability = 0.001, opts = {})
7
8
  @ba = opts[:ba] || begin
@@ -9,104 +10,48 @@ class Bloomer
9
10
  m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
10
11
  BitArray.new(m.round)
11
12
  end
12
-
13
13
  # k is the number of hash functions that minimizes the probability of false positives
14
- k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
15
- @hashes = Hashes.build(k.round)
14
+ @k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
16
15
  end
17
16
 
17
+ # returns true if the item had (probably) already been added, false if it was definitely new
18
18
  def add string
19
- indicies(string).each { |ea| @ba[ea] = 1 }
19
+ count = 0
20
+ hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
21
+ count == @k
20
22
  end
21
23
 
24
+ # returns false if the item hadn't already been added
25
+ # returns true if it is likely that string had been added. See the false_positive_probability constructor parameter
22
26
  def include? string
23
- !indicies(string).any? { |ea| @ba[ea] == 0 }
27
+ !hashes(string).any? { |ea| @ba[ea] == 0 }
24
28
  end
25
29
 
26
30
  def _dump(depth)
27
- [@hashes.size, Marshal.dump(@ba)].join("\n")
31
+ [@k, Marshal.dump(@ba)].join(" ")
28
32
  end
29
33
 
30
34
  def self._load(data)
31
- k, ba = data.split("\n", 2)
35
+ k, ba = data.split(" ", 2)
32
36
  new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
33
37
  end
34
38
 
35
39
  private
36
40
 
37
- def indicies string
38
- @hashes.collect do |h|
39
- h.call(string) % @ba.size
40
- end
41
- end
42
-
43
- class CircularQueue < Array
44
- def rot!
45
- first = self.shift
46
- self.push(first)
47
- first
48
- end
49
- end
50
-
51
- class Hashes
52
- PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317]
53
-
54
- def self.build(number_of_hashes)
55
- hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
56
- primes = CircularQueue.new PRIMES
57
- while (number_of_hashes > hashes.size)
58
- hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
59
- send(ea, primes.rot!, primes.rot!)
60
- end
61
- end
62
- return hashes.first(number_of_hashes)
63
- end
64
-
65
- MAX = 2**31 - 1
66
-
67
- # written by Professor Daniel J. Bernstein from comp.lang.c
68
- def self.djb_hash(a = 5381, b = nil)
69
- lambda do |data|
70
- data.each_byte.inject(a) do |hash, ea|
71
- ((hash << 5) + hash + ea) % MAX
72
- end
73
- end
74
- end
75
-
76
- # bitwise hash function written by Justin Sobel
77
- def self.js_hash(a = 1315423911, b = nil)
78
- lambda do |data|
79
- data.each_byte.inject(a) do |hash, ea|
80
- (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
81
- end
82
- end
83
- end
84
-
85
- # simple hash function from Robert Sedgwicks Algorithms in C book
86
- def self.rs_hash(a = 63689, b = 378551)
87
- lambda do |data|
88
- i, j = a, b
89
- data.each_byte.inject(0) do |hash, ea|
90
- i = (i * j) % MAX
91
- (hash * i + ea) % MAX
92
- end
93
- end
94
- end
95
-
96
- # From Kernigham and Ritchie's "The C Programming Language"
97
- def self.knr_hash(a = 1619, b = 911)
98
- lambda do |data|
99
- data.each_byte.inject(a) do |hash, ea|
100
- ((hash * b) + ea) % MAX
101
- end
102
- end
103
- end
104
-
105
- # default hash
106
- def self.ruby_hash(a = 1, b = 1)
107
- lambda do |data|
108
- (data.hash * a) % MAX
109
- end
110
- end
111
- end
112
- end
41
+ # Return an array of hash indices to set.
42
+ # Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
43
+ def hashes(data)
44
+ m = @ba.size
45
+ h = Digest::MD5.hexdigest(data.to_s).to_i(16)
46
+ x = h % m
47
+ h /= m
48
+ y = h % m
49
+ h /= m
50
+ z = h % m
51
+ [x] + 1.upto(@k - 1).collect do |i|
52
+ x = (x + y) % m
53
+ y = (y + z) % m
54
+ x
55
+ end
56
+ end
57
+ end
data/spec/bloomer_spec.rb CHANGED
@@ -1,42 +1,64 @@
1
1
  require "spec_helper"
2
+ require "benchmark"
2
3
 
3
- def rand_alpha(size)
4
- chars = ('a'..'z').to_a + ('A'..'Z').to_a
5
- (0...size).collect { chars[Kernel.rand(chars.length)] }.join
4
+ def rand_word(length = 8)
5
+ ('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
6
6
  end
7
7
 
8
8
  describe Bloomer do
9
-
10
9
  it "should work trivially" do
11
10
  b = Bloomer.new(10, 0.001)
12
- b.add("a")
11
+ b.add("a").should be_false
12
+ b.add("a").should be_true
13
13
  b.should include("a")
14
14
  b.should_not include("")
15
15
  b.should_not include("b")
16
- b.add("b")
16
+ b.add("b").should be_false
17
+ b.add("b").should be_true
17
18
  b.should include("b")
18
19
  b.should_not include("")
19
20
  b.add("")
20
21
  b.should include("")
21
22
  end
22
23
 
23
- it "should find random strings" do
24
- b = Bloomer.new(5_000, 0.001)
25
- inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
26
- inputs.each { |ea| b.add(ea) }
27
- inputs.each { |ea| b.include?(ea).should be_true }
28
- 5000.times.each do
29
- s = rand_alpha(Kernel.rand(50))
30
- b.include?(s).should == inputs.include?(s)
31
- end
32
- end
33
-
34
24
  it "should marshal state correctly" do
35
25
  b = Bloomer.new(10, 0.001)
36
26
  inputs = %q(a b c d)
37
- inputs.each{|ea|b.add(ea)}
27
+ inputs.each { |ea| b.add(ea) }
38
28
  s = Marshal.dump(b)
39
29
  new_b = Marshal.load(s)
40
- inputs.each{|ea|new_b.should include(ea)}
30
+ inputs.each { |ea| new_b.should include(ea) }
31
+ end
32
+
33
+ it "should result in similar-to-expected false positives" do
34
+ max_false_prob = 0.001
35
+ size = 50_000
36
+ bloom = Bloomer.new(size, max_false_prob)
37
+ set = Set.new
38
+ size.times do
39
+ w = rand_word
40
+ bloom.add(w)
41
+ set.add(w)
42
+ end
43
+ set.each { |ea| bloom.include?(ea).should be_true }
44
+ tries = size * 3
45
+ false_hits = 0
46
+ hits = 0
47
+ tries.times.each do
48
+ word = rand_word
49
+ b_inc, s_inc = bloom.include?(word), set.include?(word)
50
+ hits += 1 if s_inc
51
+ if s_inc && !b_inc
52
+ fail "'#{word}': false negative on include"
53
+ elsif !s_inc && b_inc
54
+ false_hits += 1
55
+ end
56
+ end
57
+
58
+ false_positive_failure_rate = false_hits.to_f / tries
59
+ puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
60
+ if (false_positive_failure_rate) > max_false_prob * 2
61
+ fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
62
+ end
41
63
  end
42
64
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bloomer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matthew McEachen
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-01-18 00:00:00 -08:00
18
+ date: 2012-01-21 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -52,7 +52,7 @@ files:
52
52
  - spec/bloomer_spec.rb
53
53
  - spec/spec_helper.rb
54
54
  has_rdoc: true
55
- homepage: ""
55
+ homepage: https://github.com/mceachen/bloomer
56
56
  licenses: []
57
57
 
58
58
  post_install_message: