bloomer 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +13 -2
- data/bloomer.gemspec +1 -1
- data/lib/bloomer.rb +29 -84
- data/spec/bloomer_spec.rb +41 -19
- metadata +5 -5
data/README.md
CHANGED
@@ -1,9 +1,14 @@
|
|
1
|
-
# Bloomer:
|
2
|
-
|
1
|
+
# Bloomer: A pure-ruby bloom filter with no extra fluff
|
3
2
|
|
4
3
|
[Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
|
5
4
|
a given string has been seen before--in constant time, and using a fixed amount of RAM.
|
6
5
|
|
6
|
+
Note that false positives with bloom filters *are possible*, but false negatives are not. In other words,
|
7
|
+
|
8
|
+
* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
|
9
|
+
* if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
|
10
|
+
```false_positive_probability``` parameter provided to the constructor).
|
11
|
+
|
7
12
|
This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
|
8
13
|
|
9
14
|
* uses a robust set of hashing functions
|
@@ -38,4 +43,10 @@ new_b.include? "a"
|
|
38
43
|
|
39
44
|
* 0.0.1 Bloom, there it is.
|
40
45
|
|
46
|
+
* 0.0.2 Switch to triple hash chaining, which resulted in better, faster hashing (!!):
|
47
|
+
|
48
|
+
md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
|
49
|
+
multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
|
50
|
+
|
51
|
+
|
41
52
|
|
data/bloomer.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.version = Bloomer::VERSION
|
9
9
|
s.authors = ["Matthew McEachen"]
|
10
10
|
s.email = ["matthew+github@mceachen.org"]
|
11
|
-
s.homepage = ""
|
11
|
+
s.homepage = "https://github.com/mceachen/bloomer"
|
12
12
|
s.summary = %q{Pure-ruby bloom filter with minimal dependencies}
|
13
13
|
s.description = %q{Pure-ruby bloom filter with minimal dependencies}
|
14
14
|
|
data/lib/bloomer.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'bitarray'
|
2
|
+
require 'digest/md5'
|
2
3
|
|
3
4
|
class Bloomer
|
4
|
-
VERSION = "0.0.1"
|
5
|
+
VERSION = "0.0.2"
|
5
6
|
|
6
7
|
def initialize(expected_size, false_positive_probability = 0.001, opts = {})
|
7
8
|
@ba = opts[:ba] || begin
|
@@ -9,104 +10,48 @@ class Bloomer
|
|
9
10
|
m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
|
10
11
|
BitArray.new(m.round)
|
11
12
|
end
|
12
|
-
|
13
13
|
# k is the number of hash functions that minimizes the probability of false positives
|
14
|
-
k = opts[:k] || Math.log(2) * (@ba.size / expected_size)
|
15
|
-
@hashes = Hashes.build(k.round)
|
14
|
+
@k = (opts[:k] || Math.log(2) * (@ba.size / expected_size)).round
|
16
15
|
end
|
17
16
|
|
17
|
+
# returns true if the item had (probably) already been added; false if it is certainly new
|
18
18
|
def add string
|
19
|
-
|
19
|
+
count = 0
|
20
|
+
hashes(string).each { |ea| count += @ba[ea] ; @ba[ea] = 1 }
|
21
|
+
count == @k
|
20
22
|
end
|
21
23
|
|
24
|
+
# returns false if the item hadn't already been added
|
25
|
+
# returns true if it is likely that string had been added. See #false_positive_probability
|
22
26
|
def include? string
|
23
|
-
!
|
27
|
+
!hashes(string).any? { |ea| @ba[ea] == 0 }
|
24
28
|
end
|
25
29
|
|
26
30
|
def _dump(depth)
|
27
|
-
[@
|
31
|
+
[@k, Marshal.dump(@ba)].join(" ")
|
28
32
|
end
|
29
33
|
|
30
34
|
def self._load(data)
|
31
|
-
k, ba = data.split("
|
35
|
+
k, ba = data.split(" ", 2)
|
32
36
|
new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
|
33
37
|
end
|
34
38
|
|
35
39
|
private
|
36
40
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
def self.build(number_of_hashes)
|
55
|
-
hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
|
56
|
-
primes = CircularQueue.new PRIMES
|
57
|
-
while (number_of_hashes > hashes.size)
|
58
|
-
hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
|
59
|
-
send(ea, primes.rot!, primes.rot!)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
return hashes.first(number_of_hashes)
|
63
|
-
end
|
64
|
-
|
65
|
-
MAX = 2**31 - 1
|
66
|
-
|
67
|
-
# written by Professor Daniel J. Bernstein from comp.lang.c
|
68
|
-
def self.djb_hash(a = 5381, b = nil)
|
69
|
-
lambda do |data|
|
70
|
-
data.each_byte.inject(a) do |hash, ea|
|
71
|
-
((hash << 5) + hash + ea) % MAX
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# bitwise hash function written by Justin Sobel
|
77
|
-
def self.js_hash(a = 1315423911, b = nil)
|
78
|
-
lambda do |data|
|
79
|
-
data.each_byte.inject(a) do |hash, ea|
|
80
|
-
(hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# simple hash function from Robert Sedgwicks Algorithms in C book
|
86
|
-
def self.rs_hash(a = 63689, b = 378551)
|
87
|
-
lambda do |data|
|
88
|
-
i, j = a, b
|
89
|
-
data.each_byte.inject(0) do |hash, ea|
|
90
|
-
i = (i * j) % MAX
|
91
|
-
(hash * i + ea) % MAX
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
# From Kernigham and Ritchie's "The C Programming Language"
|
97
|
-
def self.knr_hash(a = 1619, b = 911)
|
98
|
-
lambda do |data|
|
99
|
-
data.each_byte.inject(a) do |hash, ea|
|
100
|
-
((hash * b) + ea) % MAX
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
# default hash
|
106
|
-
def self.ruby_hash(a = 1, b = 1)
|
107
|
-
lambda do |data|
|
108
|
-
(data.hash * a) % MAX
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
41
|
+
# Return an array of hash indices to set.
|
42
|
+
# Uses triple hashing as described in http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf
|
43
|
+
def hashes(data)
|
44
|
+
m = @ba.size
|
45
|
+
h = Digest::MD5.hexdigest(data.to_s).to_i(16)
|
46
|
+
x = h % m
|
47
|
+
h /= m
|
48
|
+
y = h % m
|
49
|
+
h /= m
|
50
|
+
z = h % m
|
51
|
+
[x] + 1.upto(@k - 1).collect do |i|
|
52
|
+
x = (x + y) % m
|
53
|
+
y = (y + z) % m
|
54
|
+
x
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/spec/bloomer_spec.rb
CHANGED
@@ -1,42 +1,64 @@
|
|
1
1
|
require "spec_helper"
|
2
|
+
require "benchmark"
|
2
3
|
|
3
|
-
def
|
4
|
-
|
5
|
-
(0...size).collect { chars[Kernel.rand(chars.length)] }.join
|
4
|
+
def rand_word(length = 8)
|
5
|
+
('a'..'z').to_a.shuffle.first(length).join # not random enough to cause hits.
|
6
6
|
end
|
7
7
|
|
8
8
|
describe Bloomer do
|
9
|
-
|
10
9
|
it "should work trivially" do
|
11
10
|
b = Bloomer.new(10, 0.001)
|
12
|
-
b.add("a")
|
11
|
+
b.add("a").should be_false
|
12
|
+
b.add("a").should be_true
|
13
13
|
b.should include("a")
|
14
14
|
b.should_not include("")
|
15
15
|
b.should_not include("b")
|
16
|
-
b.add("b")
|
16
|
+
b.add("b").should be_false
|
17
|
+
b.add("b").should be_true
|
17
18
|
b.should include("b")
|
18
19
|
b.should_not include("")
|
19
20
|
b.add("")
|
20
21
|
b.should include("")
|
21
22
|
end
|
22
23
|
|
23
|
-
it "should find random strings" do
|
24
|
-
b = Bloomer.new(5_000, 0.001)
|
25
|
-
inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
|
26
|
-
inputs.each { |ea| b.add(ea) }
|
27
|
-
inputs.each { |ea| b.include?(ea).should be_true }
|
28
|
-
5000.times.each do
|
29
|
-
s = rand_alpha(Kernel.rand(50))
|
30
|
-
b.include?(s).should == inputs.include?(s)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
24
|
it "should marshal state correctly" do
|
35
25
|
b = Bloomer.new(10, 0.001)
|
36
26
|
inputs = %q(a b c d)
|
37
|
-
inputs.each{|ea|b.add(ea)}
|
27
|
+
inputs.each { |ea| b.add(ea) }
|
38
28
|
s = Marshal.dump(b)
|
39
29
|
new_b = Marshal.load(s)
|
40
|
-
inputs.each{|ea|new_b.should include(ea)}
|
30
|
+
inputs.each { |ea| new_b.should include(ea) }
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should result in similar-to-expected false positives" do
|
34
|
+
max_false_prob = 0.001
|
35
|
+
size = 50_000
|
36
|
+
bloom = Bloomer.new(size, max_false_prob)
|
37
|
+
set = Set.new
|
38
|
+
size.times do
|
39
|
+
w = rand_word
|
40
|
+
bloom.add(w)
|
41
|
+
set.add(w)
|
42
|
+
end
|
43
|
+
set.each { |ea| bloom.include?(ea).should be_true }
|
44
|
+
tries = size * 3
|
45
|
+
false_hits = 0
|
46
|
+
hits = 0
|
47
|
+
tries.times.each do
|
48
|
+
word = rand_word
|
49
|
+
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
50
|
+
hits += 1 if s_inc
|
51
|
+
if s_inc && !b_inc
|
52
|
+
fail "'#{word}': false negative on include"
|
53
|
+
elsif !s_inc && b_inc
|
54
|
+
false_hits += 1
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
false_positive_failure_rate = false_hits.to_f / tries
|
59
|
+
puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
|
60
|
+
if (false_positive_failure_rate) > max_false_prob * 2
|
61
|
+
fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
|
62
|
+
end
|
41
63
|
end
|
42
64
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bloomer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 1
|
10
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew McEachen
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-01-
|
18
|
+
date: 2012-01-21 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -52,7 +52,7 @@ files:
|
|
52
52
|
- spec/bloomer_spec.rb
|
53
53
|
- spec/spec_helper.rb
|
54
54
|
has_rdoc: true
|
55
|
-
homepage: ""
|
55
|
+
homepage: https://github.com/mceachen/bloomer
|
56
56
|
licenses: []
|
57
57
|
|
58
58
|
post_install_message:
|