bloomer 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,9 +1,14 @@
1
- # Bloomer: a pure-ruby bloom filter with no extra fluff
2
-
1
+ # Bloomer: A pure-ruby bloom filter with no extra fluff
3
2
 
4
3
  [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
5
4
  a given string has been seen before--in constant time, and using a fixed amount of RAM.
6
5
 
6
+ Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
7
+
8
+ * if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
9
+ * if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
10
+ ```false_positive_probability``` parameter provided to the constructor).
11
+
7
12
  This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
8
13
 
9
14
  * uses a robust set of hashing functions
@@ -38,4 +43,10 @@ new_b.include? "a"
38
43
 
39
44
  * 0.0.1 Bloom, there it is.
40
45
 
46
+ * 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
47
+
48
+ md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
49
+ multihash (v0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
50
+
51
+
41
52
 
data/bloomer.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
8
8
  s.version = Bloomer::VERSION
9
9
  s.authors = ["Matthew McEachen"]
10
10
  s.email = ["matthew+github@mceachen.org"]
11
- s.homepage = ""
11
+ s.homepage = "https://github.com/mceachen/bloomer"
12
12
  s.summary = %q{Pure-ruby bloom filter with minimal dependencies}
13
13
  s.description = %q{Pure-ruby bloom filter with minimal dependencies}
14
14
 
data/lib/bloomer.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'bitarray'
2
+ require 'digest/md5'
2
3
 
3
4
  class Bloomer
4
- VERSION = "0.0.1"
5
+ VERSION = "0.0.2"
5
6
 
6
7
  def initialize(expected_size, false_positive_probability = 0.001, opts = {})
7
8
  @ba = opts[:ba] || begin
@@ -9,104 +10,48 @@ class Bloomer
9
10
  m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
10
11
  BitArray.new(m.round)
11
12
  end
12
-
13
13
  # k is the number of hash functions that minimizes the probability of false positives
14
- k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
15
- @hashes = Hashes.build(k.round)
14
+ @k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
16
15
  end
17
16
 
17
+ # returns true if the item had (probably) already been added, i.e. all of its bits were already set
18
18
  def add string
19
- indicies(string).each { |ea| @ba[ea] = 1 }
19
+ count = 0
20
+ hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
21
+ count == @k
20
22
  end
21
23
 
24
+ # returns false if the item hadn't already been added
25
+ # returns true if it is likely that string had been added. See #false_positive_probability
22
26
  def include? string
23
- !indicies(string).any? { |ea| @ba[ea] == 0 }
27
+ !hashes(string).any? { |ea| @ba[ea] == 0 }
24
28
  end
25
29
 
26
30
  def _dump(depth)
27
- [@hashes.size, Marshal.dump(@ba)].join("\n")
31
+ [@k, Marshal.dump(@ba)].join(" ")
28
32
  end
29
33
 
30
34
  def self._load(data)
31
- k, ba = data.split("\n", 2)
35
+ k, ba = data.split(" ", 2)
32
36
  new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
33
37
  end
34
38
 
35
39
  private
36
40
 
37
- def indicies string
38
- @hashes.collect do |h|
39
- h.call(string) % @ba.size
40
- end
41
- end
42
-
43
- class CircularQueue < Array
44
- def rot!
45
- first = self.shift
46
- self.push(first)
47
- first
48
- end
49
- end
50
-
51
- class Hashes
52
- PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317]
53
-
54
- def self.build(number_of_hashes)
55
- hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
56
- primes = CircularQueue.new PRIMES
57
- while (number_of_hashes > hashes.size)
58
- hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
59
- send(ea, primes.rot!, primes.rot!)
60
- end
61
- end
62
- return hashes.first(number_of_hashes)
63
- end
64
-
65
- MAX = 2**31 - 1
66
-
67
- # written by Professor Daniel J. Bernstein from comp.lang.c
68
- def self.djb_hash(a = 5381, b = nil)
69
- lambda do |data|
70
- data.each_byte.inject(a) do |hash, ea|
71
- ((hash << 5) + hash + ea) % MAX
72
- end
73
- end
74
- end
75
-
76
- # bitwise hash function written by Justin Sobel
77
- def self.js_hash(a = 1315423911, b = nil)
78
- lambda do |data|
79
- data.each_byte.inject(a) do |hash, ea|
80
- (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
81
- end
82
- end
83
- end
84
-
85
- # simple hash function from Robert Sedgewick's Algorithms in C book
86
- def self.rs_hash(a = 63689, b = 378551)
87
- lambda do |data|
88
- i, j = a, b
89
- data.each_byte.inject(0) do |hash, ea|
90
- i = (i * j) % MAX
91
- (hash * i + ea) % MAX
92
- end
93
- end
94
- end
95
-
96
- # From Kernighan and Ritchie's "The C Programming Language"
97
- def self.knr_hash(a = 1619, b = 911)
98
- lambda do |data|
99
- data.each_byte.inject(a) do |hash, ea|
100
- ((hash * b) + ea) % MAX
101
- end
102
- end
103
- end
104
-
105
- # default hash
106
- def self.ruby_hash(a = 1, b = 1)
107
- lambda do |data|
108
- (data.hash * a) % MAX
109
- end
110
- end
111
- end
112
- end
41
+ # Return an array of hash indices to set.
42
+ # Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
43
+ def hashes(data)
44
+ m = @ba.size
45
+ h = Digest::MD5.hexdigest(data.to_s).to_i(16)
46
+ x = h % m
47
+ h /= m
48
+ y = h % m
49
+ h /= m
50
+ z = h % m
51
+ [x] + 1.upto(@k - 1).collect do |i|
52
+ x = (x + y) % m
53
+ y = (y + z) % m
54
+ x
55
+ end
56
+ end
57
+ end
data/spec/bloomer_spec.rb CHANGED
@@ -1,42 +1,64 @@
1
1
  require "spec_helper"
2
+ require "benchmark"
2
3
 
3
- def rand_alpha(size)
4
- chars = ('a'..'z').to_a + ('A'..'Z').to_a
5
- (0...size).collect { chars[Kernel.rand(chars.length)] }.join
4
+ def rand_word(length = 8)
5
+ ('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
6
6
  end
7
7
 
8
8
  describe Bloomer do
9
-
10
9
  it "should work trivially" do
11
10
  b = Bloomer.new(10, 0.001)
12
- b.add("a")
11
+ b.add("a").should be_false
12
+ b.add("a").should be_true
13
13
  b.should include("a")
14
14
  b.should_not include("")
15
15
  b.should_not include("b")
16
- b.add("b")
16
+ b.add("b").should be_false
17
+ b.add("b").should be_true
17
18
  b.should include("b")
18
19
  b.should_not include("")
19
20
  b.add("")
20
21
  b.should include("")
21
22
  end
22
23
 
23
- it "should find random strings" do
24
- b = Bloomer.new(5_000, 0.001)
25
- inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
26
- inputs.each { |ea| b.add(ea) }
27
- inputs.each { |ea| b.include?(ea).should be_true }
28
- 5000.times.each do
29
- s = rand_alpha(Kernel.rand(50))
30
- b.include?(s).should == inputs.include?(s)
31
- end
32
- end
33
-
34
24
  it "should marshal state correctly" do
35
25
  b = Bloomer.new(10, 0.001)
36
26
  inputs = %q(a b c d)
37
- inputs.each{|ea|b.add(ea)}
27
+ inputs.each { |ea| b.add(ea) }
38
28
  s = Marshal.dump(b)
39
29
  new_b = Marshal.load(s)
40
- inputs.each{|ea|new_b.should include(ea)}
30
+ inputs.each { |ea| new_b.should include(ea) }
31
+ end
32
+
33
+ it "should result in similar-to-expected false positives" do
34
+ max_false_prob = 0.001
35
+ size = 50_000
36
+ bloom = Bloomer.new(size, max_false_prob)
37
+ set = Set.new
38
+ size.times do
39
+ w = rand_word
40
+ bloom.add(w)
41
+ set.add(w)
42
+ end
43
+ set.each { |ea| bloom.include?(ea).should be_true }
44
+ tries = size * 3
45
+ false_hits = 0
46
+ hits = 0
47
+ tries.times.each do
48
+ word = rand_word
49
+ b_inc, s_inc = bloom.include?(word), set.include?(word)
50
+ hits += 1 if s_inc
51
+ if s_inc && !b_inc
52
+ fail "'#{word}': false negative on include"
53
+ elsif !s_inc && b_inc
54
+ false_hits += 1
55
+ end
56
+ end
57
+
58
+ false_positive_failure_rate = false_hits.to_f / tries
59
+ puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
60
+ if (false_positive_failure_rate) > max_false_prob * 2
61
+ fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
62
+ end
41
63
  end
42
64
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bloomer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matthew McEachen
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-01-18 00:00:00 -08:00
18
+ date: 2012-01-21 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -52,7 +52,7 @@ files:
52
52
  - spec/bloomer_spec.rb
53
53
  - spec/spec_helper.rb
54
54
  has_rdoc: true
55
- homepage: ""
55
+ homepage: https://github.com/mceachen/bloomer
56
56
  licenses: []
57
57
 
58
58
  post_install_message: