bloomer 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +28 -11
- data/bloomer.gemspec +2 -2
- data/lib/bloomer.rb +54 -16
- data/spec/bloomer_spec.rb +84 -43
- metadata +5 -5
data/README.md
CHANGED
@@ -1,18 +1,29 @@
|
|
1
|
-
# Bloomer: A pure-ruby
|
1
|
+
# Bloomer: A Scalable pure-ruby Bloom filter
|
2
2
|
|
3
3
|
[Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
|
4
|
-
a given string has been seen before--in constant time, and using a fixed amount of RAM
|
4
|
+
a given string has been seen before--in constant time, and using a fixed amount of RAM, as long
|
5
|
+
as you know the expected number of elements up front.
|
5
6
|
|
6
|
-
|
7
|
+
[Scalable Bloom Filters](http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf) allow you to establish an
|
8
|
+
initial capacity, but dynamically scale past that and maintain a false_positive_probability at the expense of
|
9
|
+
growing the RAM requirements.
|
10
|
+
|
11
|
+
```Bloomer``` is a Bloom Filter. ```Bloomer::Scalable``` is a Scalable Bloom Filter.
|
12
|
+
|
13
|
+
Keep in mind that false positives with Bloom Filters *are expected* with a specified probability rate.
|
14
|
+
False negatives, however, are not. In other words,
|
7
15
|
|
8
16
|
* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
|
9
17
|
* if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
|
10
18
|
```false_positive_probability``` parameter provided to the constructor).
|
11
19
|
|
12
|
-
This implementation is
|
20
|
+
This implementation is unique in that Bloomer
|
13
21
|
|
14
|
-
*
|
22
|
+
* supports scalable bloom filters (SBF)
|
23
|
+
* uses triple hash chains (see [the paper](http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf))
|
15
24
|
* can marshal state quickly
|
25
|
+
* has rigorous tests
|
26
|
+
* is pure ruby
|
16
27
|
* does not require EM or Redis or something else unrelated to simply implementing a bloom filter
|
17
28
|
|
18
29
|
## Usage
|
@@ -28,6 +39,16 @@ bf.include? "dog"
|
|
28
39
|
#=> false
|
29
40
|
```
|
30
41
|
|
42
|
+
Scalable Bloom filters use the same API:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
b = Bloomer::Scalable.new
|
46
|
+
b.add "boom"
|
47
|
+
b.include? "boom"
|
48
|
+
#=> true
|
49
|
+
b.include? "badda"
|
50
|
+
#=> false
```
|
51
|
+
|
31
52
|
Serialization is through [Marshal](http://ruby-doc.org/core-1.8.7/Marshal.html):
|
32
53
|
|
33
54
|
```ruby
|
@@ -42,11 +63,7 @@ new_b.include? "a"
|
|
42
63
|
## History
|
43
64
|
|
44
65
|
* 0.0.1 Bloom, there it is.
|
45
|
-
|
46
|
-
* 0.0.
|
47
|
-
|
48
|
-
md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
|
49
|
-
multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
|
50
|
-
|
66
|
+
* 0.0.2 Switch to triple-hash chaining (simpler, faster, and better false-positive rate)
|
67
|
+
* 0.0.3 Added support for scalable bloom filters (SBF)
|
51
68
|
|
52
69
|
|
data/bloomer.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.authors = ["Matthew McEachen"]
|
10
10
|
s.email = ["matthew+github@mceachen.org"]
|
11
11
|
s.homepage = "https://github.com/mceachen/bloomer"
|
12
|
-
s.summary = %q{Pure-ruby bloom filter}
|
13
|
-
s.description = %q{
|
12
|
+
s.summary = %q{Pure-ruby scalable bloom filter}
|
13
|
+
s.description = %q{Bloomer implements both simple Bloom filters as well as Scalable Bloom Filters (SBF), in pure ruby and with minimal external dependencies}
|
14
14
|
|
15
15
|
s.rubyforge_project = "bloomer"
|
16
16
|
|
data/lib/bloomer.rb
CHANGED
@@ -2,23 +2,26 @@ require 'bitarray'
|
|
2
2
|
require 'digest/md5'
|
3
3
|
|
4
4
|
class Bloomer
|
5
|
-
VERSION = "0.0.2"
|
5
|
+
VERSION = "0.0.3"
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
7
|
+
def initialize(capacity, false_positive_probability = 0.001)
|
8
|
+
@capacity = capacity.round
|
9
|
+
# m is the required number of bits in the array
|
10
|
+
m = -(capacity * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
|
11
|
+
@ba = BitArray.new(m.round)
|
12
|
+
# count is the number of unique additions to this filter.
|
13
|
+
@count = 0
|
13
14
|
# k is the number of hash functions that minimizes the probability of false positives
|
14
|
-
@k = (
|
15
|
+
@k = (Math.log(2) * (@ba.size / capacity)).round
|
15
16
|
end
|
16
17
|
|
17
|
-
# returns true if item
|
18
|
+
# returns true if item had not already been added
|
18
19
|
def add string
|
19
20
|
count = 0
|
20
|
-
hashes(string).each { |ea| count += @ba[ea]
|
21
|
-
count == @k
|
21
|
+
hashes(string).each { |ea| count += @ba[ea]; @ba[ea] = 1 }
|
22
|
+
previously_included = (count == @k)
|
23
|
+
@count += 1 unless previously_included
|
24
|
+
!previously_included
|
22
25
|
end
|
23
26
|
|
24
27
|
# returns false if the item hadn't already been added
|
@@ -27,13 +30,15 @@ class Bloomer
|
|
27
30
|
!hashes(string).any? { |ea| @ba[ea] == 0 }
|
28
31
|
end
|
29
32
|
|
30
|
-
|
31
|
-
|
33
|
+
# The number of strings passed to #add that were not already reported as present (false positives mean
|
34
|
+
# this number under-counts)
|
35
|
+
def count
|
36
|
+
@count
|
32
37
|
end
|
33
38
|
|
34
|
-
|
35
|
-
|
36
|
-
|
39
|
+
# If count exceeds capacity, the provided #false_positive_probability will probably be exceeded.
|
40
|
+
def capacity
|
41
|
+
@capacity
|
37
42
|
end
|
38
43
|
|
39
44
|
private
|
@@ -54,4 +59,37 @@ class Bloomer
|
|
54
59
|
x
|
55
60
|
end
|
56
61
|
end
|
62
|
+
|
63
|
+
# Automatically expanding bloom filter.
|
64
|
+
# See http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf
|
65
|
+
class Scalable
|
66
|
+
S = 2
|
67
|
+
R = Math.log(2) ** 2
|
68
|
+
def initialize(initial_capacity = 256, false_positive_probability = 0.001)
|
69
|
+
@false_positive_probability = false_positive_probability
|
70
|
+
@bloomers = [Bloomer.new(initial_capacity, false_positive_probability * R)]
|
71
|
+
end
|
72
|
+
|
73
|
+
def capacity
|
74
|
+
@bloomers.last.capacity
|
75
|
+
end
|
76
|
+
|
77
|
+
def count
|
78
|
+
@bloomers.inject(0) {|i,b|i + b.count}
|
79
|
+
end
|
80
|
+
|
81
|
+
def add string
|
82
|
+
l = @bloomers.last
|
83
|
+
r = l.add(string)
|
84
|
+
if r && (l.count > l.capacity)
|
85
|
+
@bloomers << Bloomer.new(l.capacity * S, @false_positive_probability * (R**@bloomers.size))
|
86
|
+
end
|
87
|
+
r
|
88
|
+
end
|
89
|
+
|
90
|
+
# only return false if no bloomers include string.
|
91
|
+
def include? string
|
92
|
+
@bloomers.any? { |ea| ea.include? string }
|
93
|
+
end
|
94
|
+
end
|
57
95
|
end
|
data/spec/bloomer_spec.rb
CHANGED
@@ -1,64 +1,105 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
require "benchmark"
|
3
3
|
|
4
|
+
C = ('a'..'z').to_a
|
4
5
|
def rand_word(length = 8)
|
5
|
-
|
6
|
+
C.shuffle.first(length).join # not random enough to cause hits.
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_bloom(size, max_false_prob, bloom)
|
10
|
+
set = Set.new
|
11
|
+
size.times do
|
12
|
+
w = rand_word
|
13
|
+
bloom.add(w)
|
14
|
+
set.add(w)
|
15
|
+
end
|
16
|
+
set.each { |ea| bloom.include?(ea).should be_true }
|
17
|
+
tries = size * 3
|
18
|
+
false_hits = 0
|
19
|
+
hits = 0
|
20
|
+
tries.times.each do
|
21
|
+
word = rand_word
|
22
|
+
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
23
|
+
hits += 1 if s_inc
|
24
|
+
if s_inc && !b_inc
|
25
|
+
fail "'#{word}': false negative on include"
|
26
|
+
elsif !s_inc && b_inc
|
27
|
+
false_hits += 1
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
false_positive_failure_rate = false_hits.to_f / tries
|
32
|
+
puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
|
33
|
+
if (false_positive_failure_rate) > max_false_prob * 2
|
34
|
+
fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_marshal_state(b)
|
39
|
+
inputs = b.capacity.times.collect { rand_word }
|
40
|
+
inputs.each { |ea| b.add(ea) }
|
41
|
+
new_b = Marshal.load(Marshal.dump(b))
|
42
|
+
new_b.count.should == b.count
|
43
|
+
new_b.capacity.should == b.capacity
|
44
|
+
inputs.each { |ea| new_b.should include(ea) }
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_simple(b)
|
48
|
+
b.add("a").should be_true
|
49
|
+
b.add("a").should be_false
|
50
|
+
b.should include("a")
|
51
|
+
b.should_not include("")
|
52
|
+
b.should_not include("b")
|
53
|
+
b.add("b").should be_true
|
54
|
+
b.add("b").should be_false
|
55
|
+
b.should include("b")
|
56
|
+
b.should_not include("")
|
57
|
+
b.add("")
|
58
|
+
b.should include("")
|
6
59
|
end
|
7
60
|
|
8
61
|
describe Bloomer do
|
9
62
|
it "should work trivially" do
|
10
63
|
b = Bloomer.new(10, 0.001)
|
11
|
-
b
|
12
|
-
b.add("a").should be_true
|
13
|
-
b.should include("a")
|
14
|
-
b.should_not include("")
|
15
|
-
b.should_not include("b")
|
16
|
-
b.add("b").should be_false
|
17
|
-
b.add("b").should be_true
|
18
|
-
b.should include("b")
|
19
|
-
b.should_not include("")
|
20
|
-
b.add("")
|
21
|
-
b.should include("")
|
64
|
+
test_simple(b)
|
22
65
|
end
|
23
66
|
|
24
67
|
it "should marshal state correctly" do
|
25
68
|
b = Bloomer.new(10, 0.001)
|
26
|
-
|
27
|
-
inputs.each { |ea| b.add(ea) }
|
28
|
-
s = Marshal.dump(b)
|
29
|
-
new_b = Marshal.load(s)
|
30
|
-
inputs.each { |ea| new_b.should include(ea) }
|
69
|
+
test_marshal_state(b)
|
31
70
|
end
|
32
71
|
|
33
72
|
it "should result in similar-to-expected false positives" do
|
34
73
|
max_false_prob = 0.001
|
35
74
|
size = 50_000
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
bloom.add(w)
|
41
|
-
set.add(w)
|
42
|
-
end
|
43
|
-
set.each { |ea| bloom.include?(ea).should be_true }
|
44
|
-
tries = size * 3
|
45
|
-
false_hits = 0
|
46
|
-
hits = 0
|
47
|
-
tries.times.each do
|
48
|
-
word = rand_word
|
49
|
-
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
50
|
-
hits += 1 if s_inc
|
51
|
-
if s_inc && !b_inc
|
52
|
-
fail "'#{word}': false negative on include"
|
53
|
-
elsif !s_inc && b_inc
|
54
|
-
false_hits += 1
|
55
|
-
end
|
56
|
-
end
|
75
|
+
b = Bloomer.new(size, max_false_prob)
|
76
|
+
test_bloom(size, max_false_prob, b)
|
77
|
+
end
|
78
|
+
end
|
57
79
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
80
|
+
describe Bloomer::Scalable do
|
81
|
+
it "should work trivially" do
|
82
|
+
b = Bloomer::Scalable.new
|
83
|
+
test_simple(b)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should marshal state correctly" do
|
87
|
+
b = Bloomer::Scalable.new(10, 0.001)
|
88
|
+
100.times.each { b.add(rand_word) }
|
89
|
+
test_marshal_state(b)
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should result in similar-to-expected false positives" do
|
93
|
+
max_false_prob = 0.001
|
94
|
+
size = 10_000
|
95
|
+
b = Bloomer::Scalable.new(1024, max_false_prob)
|
96
|
+
test_bloom(size, max_false_prob, b)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should result in similar-to-expected false positives" do
|
100
|
+
max_false_prob = 0.01
|
101
|
+
size = 50_000
|
102
|
+
b = Bloomer::Scalable.new(1024, max_false_prob)
|
103
|
+
test_bloom(size, max_false_prob, b)
|
63
104
|
end
|
64
105
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bloomer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew McEachen
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
type: :runtime
|
33
33
|
name: bitarray
|
34
34
|
version_requirements: *id001
|
35
|
-
description:
|
35
|
+
description: Bloomer implements both simple Bloom filters as well as Scalable Bloom Filters (SBF), in pure ruby and with minimal external dependencies
|
36
36
|
email:
|
37
37
|
- matthew+github@mceachen.org
|
38
38
|
executables: []
|
@@ -84,7 +84,7 @@ rubyforge_project: bloomer
|
|
84
84
|
rubygems_version: 1.6.2
|
85
85
|
signing_key:
|
86
86
|
specification_version: 3
|
87
|
-
summary: Pure-ruby bloom filter
|
87
|
+
summary: Pure-ruby scalable bloom filter
|
88
88
|
test_files:
|
89
89
|
- spec/bloomer_spec.rb
|
90
90
|
- spec/spec_helper.rb
|