bloomer 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -2
- data/bloomer.gemspec +1 -1
- data/lib/bloomer.rb +29 -84
- data/spec/bloomer_spec.rb +41 -19
- metadata +5 -5
data/README.md
CHANGED
@@ -1,9 +1,14 @@
|
|
1
|
-
# Bloomer:
|
2
|
-
|
1
|
+
# Bloomer: A pure-ruby bloom filter with no extra fluff
|
3
2
|
|
4
3
|
[Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
|
5
4
|
a given string has been seen before--in constant time, and using a fixed amount of RAM.
|
6
5
|
|
6
|
+
Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
|
7
|
+
|
8
|
+
* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
|
9
|
+
* if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
|
10
|
+
```false_positive_probability``` parameter provided to the constructor).
|
11
|
+
|
7
12
|
This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
|
8
13
|
|
9
14
|
* uses a robust set of hashing functions
|
@@ -38,4 +43,10 @@ new_b.include? "a"
|
|
38
43
|
|
39
44
|
* 0.0.1 Bloom, there it is.
|
40
45
|
|
46
|
+
* 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
|
47
|
+
|
48
|
+
md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
|
49
|
+
multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
|
50
|
+
|
51
|
+
|
41
52
|
|
data/bloomer.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.version = Bloomer::VERSION
|
9
9
|
s.authors = ["Matthew McEachen"]
|
10
10
|
s.email = ["matthew+github@mceachen.org"]
|
11
|
-
s.homepage = ""
|
11
|
+
s.homepage = "https://github.com/mceachen/bloomer"
|
12
12
|
s.summary = %q{Pure-ruby bloom filter with minimal dependencies}
|
13
13
|
s.description = %q{Pure-ruby bloom filter with minimal dependencies}
|
14
14
|
|
data/lib/bloomer.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'bitarray'
|
2
|
+
require 'digest/md5'
|
2
3
|
|
3
4
|
class Bloomer
|
4
|
-
VERSION = "0.0.1"
|
5
|
+
VERSION = "0.0.2"
|
5
6
|
|
6
7
|
def initialize(expected_size, false_positive_probability = 0.001, opts = {})
|
7
8
|
@ba = opts[:ba] || begin
|
@@ -9,104 +10,48 @@ class Bloomer
|
|
9
10
|
m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
|
10
11
|
BitArray.new(m.round)
|
11
12
|
end
|
12
|
-
|
13
13
|
# k is the number of hash functions that minimizes the probability of false positives
|
14
|
-
k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
|
15
|
-
@hashes = Hashes.build(k.round)
|
14
|
+
@k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
|
16
15
|
end
|
17
16
|
|
17
|
+
# returns true if item hadn't already been added
|
18
18
|
def add string
|
19
|
-
|
19
|
+
count = 0
|
20
|
+
hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
|
21
|
+
count == @k
|
20
22
|
end
|
21
23
|
|
24
|
+
# returns false if the item hadn't already been added
|
25
|
+
# returns true if it is likely that string had been added. See #false_positive_probability
|
22
26
|
def include? string
|
23
|
-
!
|
27
|
+
!hashes(string).any? { |ea| @ba[ea] == 0 }
|
24
28
|
end
|
25
29
|
|
26
30
|
def _dump(depth)
|
27
|
-
[@
|
31
|
+
[@k, Marshal.dump(@ba)].join(" ")
|
28
32
|
end
|
29
33
|
|
30
34
|
def self._load(data)
|
31
|
-
k, ba = data.split("
|
35
|
+
k, ba = data.split(" ", 2)
|
32
36
|
new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
|
33
37
|
end
|
34
38
|
|
35
39
|
private
|
36
40
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
def self.build(number_of_hashes)
|
55
|
-
hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
|
56
|
-
primes = CircularQueue.new PRIMES
|
57
|
-
while (number_of_hashes > hashes.size)
|
58
|
-
hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
|
59
|
-
send(ea, primes.rot!, primes.rot!)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
return hashes.first(number_of_hashes)
|
63
|
-
end
|
64
|
-
|
65
|
-
MAX = 2**31 - 1
|
66
|
-
|
67
|
-
# written by Professor Daniel J. Bernstein from comp.lang.c
|
68
|
-
def self.djb_hash(a = 5381, b = nil)
|
69
|
-
lambda do |data|
|
70
|
-
data.each_byte.inject(a) do |hash, ea|
|
71
|
-
((hash << 5) + hash + ea) % MAX
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# bitwise hash function written by Justin Sobel
|
77
|
-
def self.js_hash(a = 1315423911, b = nil)
|
78
|
-
lambda do |data|
|
79
|
-
data.each_byte.inject(a) do |hash, ea|
|
80
|
-
(hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# simple hash function from Robert Sedgwicks Algorithms in C book
|
86
|
-
def self.rs_hash(a = 63689, b = 378551)
|
87
|
-
lambda do |data|
|
88
|
-
i, j = a, b
|
89
|
-
data.each_byte.inject(0) do |hash, ea|
|
90
|
-
i = (i * j) % MAX
|
91
|
-
(hash * i + ea) % MAX
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
# From Kernigham and Ritchie's "The C Programming Language"
|
97
|
-
def self.knr_hash(a = 1619, b = 911)
|
98
|
-
lambda do |data|
|
99
|
-
data.each_byte.inject(a) do |hash, ea|
|
100
|
-
((hash * b) + ea) % MAX
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
# default hash
|
106
|
-
def self.ruby_hash(a = 1, b = 1)
|
107
|
-
lambda do |data|
|
108
|
-
(data.hash * a) % MAX
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
41
|
+
# Return an array of hash indices to set.
|
42
|
+
# Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
|
43
|
+
def hashes(data)
|
44
|
+
m = @ba.size
|
45
|
+
h = Digest::MD5.hexdigest(data.to_s).to_i(16)
|
46
|
+
x = h % m
|
47
|
+
h /= m
|
48
|
+
y = h % m
|
49
|
+
h /= m
|
50
|
+
z = h % m
|
51
|
+
[x] + 1.upto(@k - 1).collect do |i|
|
52
|
+
x = (x + y) % m
|
53
|
+
y = (y + z) % m
|
54
|
+
x
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/spec/bloomer_spec.rb
CHANGED
@@ -1,42 +1,64 @@
|
|
1
1
|
require "spec_helper"
|
2
|
+
require "benchmark"
|
2
3
|
|
3
|
-
def
|
4
|
-
|
5
|
-
(0...size).collect { chars[Kernel.rand(chars.length)] }.join
|
4
|
+
def rand_word(length = 8)
|
5
|
+
('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
|
6
6
|
end
|
7
7
|
|
8
8
|
describe Bloomer do
|
9
|
-
|
10
9
|
it "should work trivially" do
|
11
10
|
b = Bloomer.new(10, 0.001)
|
12
|
-
b.add("a")
|
11
|
+
b.add("a").should be_false
|
12
|
+
b.add("a").should be_true
|
13
13
|
b.should include("a")
|
14
14
|
b.should_not include("")
|
15
15
|
b.should_not include("b")
|
16
|
-
b.add("b")
|
16
|
+
b.add("b").should be_false
|
17
|
+
b.add("b").should be_true
|
17
18
|
b.should include("b")
|
18
19
|
b.should_not include("")
|
19
20
|
b.add("")
|
20
21
|
b.should include("")
|
21
22
|
end
|
22
23
|
|
23
|
-
it "should find random strings" do
|
24
|
-
b = Bloomer.new(5_000, 0.001)
|
25
|
-
inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
|
26
|
-
inputs.each { |ea| b.add(ea) }
|
27
|
-
inputs.each { |ea| b.include?(ea).should be_true }
|
28
|
-
5000.times.each do
|
29
|
-
s = rand_alpha(Kernel.rand(50))
|
30
|
-
b.include?(s).should == inputs.include?(s)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
24
|
it "should marshal state correctly" do
|
35
25
|
b = Bloomer.new(10, 0.001)
|
36
26
|
inputs = %q(a b c d)
|
37
|
-
inputs.each{|ea|b.add(ea)}
|
27
|
+
inputs.each { |ea| b.add(ea) }
|
38
28
|
s = Marshal.dump(b)
|
39
29
|
new_b = Marshal.load(s)
|
40
|
-
inputs.each{|ea|new_b.should include(ea)}
|
30
|
+
inputs.each { |ea| new_b.should include(ea) }
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should result in similar-to-expected false positives" do
|
34
|
+
max_false_prob = 0.001
|
35
|
+
size = 50_000
|
36
|
+
bloom = Bloomer.new(size, max_false_prob)
|
37
|
+
set = Set.new
|
38
|
+
size.times do
|
39
|
+
w = rand_word
|
40
|
+
bloom.add(w)
|
41
|
+
set.add(w)
|
42
|
+
end
|
43
|
+
set.each { |ea| bloom.include?(ea).should be_true }
|
44
|
+
tries = size * 3
|
45
|
+
false_hits = 0
|
46
|
+
hits = 0
|
47
|
+
tries.times.each do
|
48
|
+
word = rand_word
|
49
|
+
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
50
|
+
hits += 1 if s_inc
|
51
|
+
if s_inc && !b_inc
|
52
|
+
fail "'#{word}': false negative on include"
|
53
|
+
elsif !s_inc && b_inc
|
54
|
+
false_hits += 1
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
false_positive_failure_rate = false_hits.to_f / tries
|
59
|
+
puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
|
60
|
+
if (false_positive_failure_rate) > max_false_prob * 2
|
61
|
+
fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
|
62
|
+
end
|
41
63
|
end
|
42
64
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bloomer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 1
|
10
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew McEachen
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-01-
|
18
|
+
date: 2012-01-21 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -52,7 +52,7 @@ files:
|
|
52
52
|
- spec/bloomer_spec.rb
|
53
53
|
- spec/spec_helper.rb
|
54
54
|
has_rdoc: true
|
55
|
-
homepage:
|
55
|
+
homepage: https://github.com/mceachen/bloomer
|
56
56
|
licenses: []
|
57
57
|
|
58
58
|
post_install_message:
|