bloomer 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +28 -11
- data/bloomer.gemspec +2 -2
- data/lib/bloomer.rb +54 -16
- data/spec/bloomer_spec.rb +84 -43
- metadata +5 -5
data/README.md
CHANGED
@@ -1,18 +1,29 @@
|
|
1
|
-
# Bloomer: A pure-ruby
|
1
|
+
# Bloomer: A Scalable pure-ruby Bloom filter
|
2
2
|
|
3
3
|
[Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
|
4
|
-
a given string has been seen before--in constant time, and using a fixed amount of RAM
|
4
|
+
a given string has been seen before--in constant time, and using a fixed amount of RAM, as long
|
5
|
+
as you know the expected number of elements up front.
|
5
6
|
|
6
|
-
|
7
|
+
[Scalable Bloom Filters](http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf) allow you to establish an
|
8
|
+
initial capacity, but dynamically scale past that and maintain a false_positive_probability at the expense of
|
9
|
+
growing the RAM requirements.
|
10
|
+
|
11
|
+
```Bloomer``` is a Bloom Filter. ```Bloomer::Scalable``` is a Scalable Bloom Filter.
|
12
|
+
|
13
|
+
Keep in mind that false positives with Bloom Filters *are expected* with a specified probability rate.
|
14
|
+
False negatives, however, are not. In other words,
|
7
15
|
|
8
16
|
* if ```include?``` returns *false*, that string has *certainly not* been ```add```ed
|
9
17
|
* if ```include?``` returns *true*, it *might* mean that string was ```add```ed (depending on the
|
10
18
|
```false_positive_probability``` parameter provided to the constructor).
|
11
19
|
|
12
|
-
This implementation is
|
20
|
+
This implementation is unique in that Bloomer
|
13
21
|
|
14
|
-
*
|
22
|
+
* supports scalable bloom filters (SBF)
|
23
|
+
* uses triple hash chains (see [the paper](http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf))
|
15
24
|
* can marshal state quickly
|
25
|
+
* has rigorous tests
|
26
|
+
* is pure ruby
|
16
27
|
* does not require EM or Redis or something else unrelated to simply implementing a bloom filter
|
17
28
|
|
18
29
|
## Usage
|
@@ -28,6 +39,16 @@ bf.include? "dog"
|
|
28
39
|
#=> false
|
29
40
|
```
|
30
41
|
|
42
|
+
Scalable Bloom filters use the same API:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
b = Bloomer::Scalable.new
|
46
|
+
b.add "boom"
|
47
|
+
b.include? "boom"
|
48
|
+
#=> true
|
49
|
+
b.include? "badda"
|
50
|
+
#=> false
|
51
|
+
|
31
52
|
Serialization is through [Marshal](http://ruby-doc.org/core-1.8.7/Marshal.html):
|
32
53
|
|
33
54
|
```ruby
|
@@ -42,11 +63,7 @@ new_b.include? "a"
|
|
42
63
|
## History
|
43
64
|
|
44
65
|
* 0.0.1 Bloom, there it is.
|
45
|
-
|
46
|
-
* 0.0.
|
47
|
-
|
48
|
-
md5 (v0.0.2): 66 sec, false positive rate = 1.116%, expected 1.0%
|
49
|
-
multihash (0.0.1): 92 sec, false positive rate = 1.27%, expected 1.0%
|
50
|
-
|
66
|
+
* 0.0.2 Switch to triple-hash chaining (simpler, faster, and better false-positive rate)
|
67
|
+
* 0.0.3 Added support for scalable bloom filters (SBF)
|
51
68
|
|
52
69
|
|
data/bloomer.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.authors = ["Matthew McEachen"]
|
10
10
|
s.email = ["matthew+github@mceachen.org"]
|
11
11
|
s.homepage = "https://github.com/mceachen/bloomer"
|
12
|
-
s.summary = %q{Pure-ruby bloom filter
|
13
|
-
s.description = %q{
|
12
|
+
s.summary = %q{Pure-ruby scalable bloom filter}
|
13
|
+
s.description = %q{Bloomer implements both simple Bloom filters as well as Scalable Bloom Filters (SBF), in pure ruby and with minimal external dependencies}
|
14
14
|
|
15
15
|
s.rubyforge_project = "bloomer"
|
16
16
|
|
data/lib/bloomer.rb
CHANGED
@@ -2,23 +2,26 @@ require 'bitarray'
|
|
2
2
|
require 'digest/md5'
|
3
3
|
|
4
4
|
class Bloomer
|
5
|
-
VERSION = "0.0.
|
5
|
+
VERSION = "0.0.3"
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
7
|
+
def initialize(capacity, false_positive_probability = 0.001)
|
8
|
+
@capacity = capacity.round
|
9
|
+
# m is the required number of bits in the array
|
10
|
+
m = -(capacity * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
|
11
|
+
@ba = BitArray.new(m.round)
|
12
|
+
# count is the number of unique additions to this filter.
|
13
|
+
@count = 0
|
13
14
|
# k is the number of hash functions that minimizes the probability of false positives
|
14
|
-
@k = (
|
15
|
+
@k = (Math.log(2) * (@ba.size / capacity)).round
|
15
16
|
end
|
16
17
|
|
17
|
-
# returns true if item
|
18
|
+
# returns true if item had not already been added
|
18
19
|
def add string
|
19
20
|
count = 0
|
20
|
-
hashes(string).each { |ea| count += @ba[ea]
|
21
|
-
count == @k
|
21
|
+
hashes(string).each { |ea| count += @ba[ea]; @ba[ea] = 1 }
|
22
|
+
previously_included = (count == @k)
|
23
|
+
@count += 1 unless previously_included
|
24
|
+
!previously_included
|
22
25
|
end
|
23
26
|
|
24
27
|
# returns false if the item hadn't already been added
|
@@ -27,13 +30,15 @@ class Bloomer
|
|
27
30
|
!hashes(string).any? { |ea| @ba[ea] == 0 }
|
28
31
|
end
|
29
32
|
|
30
|
-
|
31
|
-
|
33
|
+
# The number of unique strings given to #add (not counting false positives, which can mean
|
34
|
+
# this number under-counts)
|
35
|
+
def count
|
36
|
+
@count
|
32
37
|
end
|
33
38
|
|
34
|
-
|
35
|
-
|
36
|
-
|
39
|
+
# If count exceeds capacity, the provided #false_positive_probability will probably be exceeded.
|
40
|
+
def capacity
|
41
|
+
@capacity
|
37
42
|
end
|
38
43
|
|
39
44
|
private
|
@@ -54,4 +59,37 @@ class Bloomer
|
|
54
59
|
x
|
55
60
|
end
|
56
61
|
end
|
62
|
+
|
63
|
+
# Automatically expanding bloom filter.
|
64
|
+
# See http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf
|
65
|
+
class Scalable
|
66
|
+
S = 2
|
67
|
+
R = Math.log(2) ** 2
|
68
|
+
def initialize(initial_capacity = 256, false_positive_probability = 0.001)
|
69
|
+
@false_positive_probability = false_positive_probability
|
70
|
+
@bloomers = [Bloomer.new(initial_capacity, false_positive_probability * R)]
|
71
|
+
end
|
72
|
+
|
73
|
+
def capacity
|
74
|
+
@bloomers.last.capacity
|
75
|
+
end
|
76
|
+
|
77
|
+
def count
|
78
|
+
@bloomers.inject(0) {|i,b|i + b.count}
|
79
|
+
end
|
80
|
+
|
81
|
+
def add string
|
82
|
+
l = @bloomers.last
|
83
|
+
r = l.add(string)
|
84
|
+
if r && (l.count > l.capacity)
|
85
|
+
@bloomers << Bloomer.new(l.capacity * S, @false_positive_probability * (R**@bloomers.size))
|
86
|
+
end
|
87
|
+
r
|
88
|
+
end
|
89
|
+
|
90
|
+
# only return false if no bloomers include string.
|
91
|
+
def include? string
|
92
|
+
@bloomers.any? { |ea| ea.include? string }
|
93
|
+
end
|
94
|
+
end
|
57
95
|
end
|
data/spec/bloomer_spec.rb
CHANGED
@@ -1,64 +1,105 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
require "benchmark"
|
3
3
|
|
4
|
+
C = ('a'..'z').to_a
|
4
5
|
def rand_word(length = 8)
|
5
|
-
|
6
|
+
C.shuffle.first(length).join # not random enough to cause hits.
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_bloom(size, max_false_prob, bloom)
|
10
|
+
set = Set.new
|
11
|
+
size.times do
|
12
|
+
w = rand_word
|
13
|
+
bloom.add(w)
|
14
|
+
set.add(w)
|
15
|
+
end
|
16
|
+
set.each { |ea| bloom.include?(ea).should be_true }
|
17
|
+
tries = size * 3
|
18
|
+
false_hits = 0
|
19
|
+
hits = 0
|
20
|
+
tries.times.each do
|
21
|
+
word = rand_word
|
22
|
+
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
23
|
+
hits += 1 if s_inc
|
24
|
+
if s_inc && !b_inc
|
25
|
+
fail "'#{word}': false negative on include"
|
26
|
+
elsif !s_inc && b_inc
|
27
|
+
false_hits += 1
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
false_positive_failure_rate = false_hits.to_f / tries
|
32
|
+
puts "False positive rate = #{false_positive_failure_rate * 100}%, expected #{max_false_prob * 100}% (#{false_hits} false positives, #{hits} hits)"
|
33
|
+
if (false_positive_failure_rate) > max_false_prob * 2
|
34
|
+
fail "False-positive failure rate was bad: #{false_positive_failure_rate}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_marshal_state(b)
|
39
|
+
inputs = b.capacity.times.collect { rand_word }
|
40
|
+
inputs.each { |ea| b.add(ea) }
|
41
|
+
new_b = Marshal.load(Marshal.dump(b))
|
42
|
+
new_b.count.should == b.count
|
43
|
+
new_b.capacity.should == b.capacity
|
44
|
+
inputs.each { |ea| new_b.should include(ea) }
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_simple(b)
|
48
|
+
b.add("a").should be_true
|
49
|
+
b.add("a").should be_false
|
50
|
+
b.should include("a")
|
51
|
+
b.should_not include("")
|
52
|
+
b.should_not include("b")
|
53
|
+
b.add("b").should be_true
|
54
|
+
b.add("b").should be_false
|
55
|
+
b.should include("b")
|
56
|
+
b.should_not include("")
|
57
|
+
b.add("")
|
58
|
+
b.should include("")
|
6
59
|
end
|
7
60
|
|
8
61
|
describe Bloomer do
|
9
62
|
it "should work trivially" do
|
10
63
|
b = Bloomer.new(10, 0.001)
|
11
|
-
b
|
12
|
-
b.add("a").should be_true
|
13
|
-
b.should include("a")
|
14
|
-
b.should_not include("")
|
15
|
-
b.should_not include("b")
|
16
|
-
b.add("b").should be_false
|
17
|
-
b.add("b").should be_true
|
18
|
-
b.should include("b")
|
19
|
-
b.should_not include("")
|
20
|
-
b.add("")
|
21
|
-
b.should include("")
|
64
|
+
test_simple(b)
|
22
65
|
end
|
23
66
|
|
24
67
|
it "should marshal state correctly" do
|
25
68
|
b = Bloomer.new(10, 0.001)
|
26
|
-
|
27
|
-
inputs.each { |ea| b.add(ea) }
|
28
|
-
s = Marshal.dump(b)
|
29
|
-
new_b = Marshal.load(s)
|
30
|
-
inputs.each { |ea| new_b.should include(ea) }
|
69
|
+
test_marshal_state(b)
|
31
70
|
end
|
32
71
|
|
33
72
|
it "should result in similar-to-expected false positives" do
|
34
73
|
max_false_prob = 0.001
|
35
74
|
size = 50_000
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
bloom.add(w)
|
41
|
-
set.add(w)
|
42
|
-
end
|
43
|
-
set.each { |ea| bloom.include?(ea).should be_true }
|
44
|
-
tries = size * 3
|
45
|
-
false_hits = 0
|
46
|
-
hits = 0
|
47
|
-
tries.times.each do
|
48
|
-
word = rand_word
|
49
|
-
b_inc, s_inc = bloom.include?(word), set.include?(word)
|
50
|
-
hits += 1 if s_inc
|
51
|
-
if s_inc && !b_inc
|
52
|
-
fail "'#{word}': false negative on include"
|
53
|
-
elsif !s_inc && b_inc
|
54
|
-
false_hits += 1
|
55
|
-
end
|
56
|
-
end
|
75
|
+
b = Bloomer.new(size, max_false_prob)
|
76
|
+
test_bloom(size, max_false_prob, b)
|
77
|
+
end
|
78
|
+
end
|
57
79
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
80
|
+
describe Bloomer::Scalable do
|
81
|
+
it "should work trivially" do
|
82
|
+
b = Bloomer::Scalable.new
|
83
|
+
test_simple(b)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should marshal state correctly" do
|
87
|
+
b = Bloomer::Scalable.new(10, 0.001)
|
88
|
+
100.times.each { b.add(rand_word) }
|
89
|
+
test_marshal_state(b)
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should result in similar-to-expected false positives" do
|
93
|
+
max_false_prob = 0.001
|
94
|
+
size = 10_000
|
95
|
+
b = Bloomer::Scalable.new(1024, max_false_prob)
|
96
|
+
test_bloom(size, max_false_prob, b)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should result in similar-to-expected false positives" do
|
100
|
+
max_false_prob = 0.01
|
101
|
+
size = 50_000
|
102
|
+
b = Bloomer::Scalable.new(1024, max_false_prob)
|
103
|
+
test_bloom(size, max_false_prob, b)
|
63
104
|
end
|
64
105
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bloomer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew McEachen
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
type: :runtime
|
33
33
|
name: bitarray
|
34
34
|
version_requirements: *id001
|
35
|
-
description:
|
35
|
+
description: Bloomer implements both simple Bloom filters as well as Scalable Bloom Filters (SBF), in pure ruby and with minimal external dependencies
|
36
36
|
email:
|
37
37
|
- matthew+github@mceachen.org
|
38
38
|
executables: []
|
@@ -84,7 +84,7 @@ rubyforge_project: bloomer
|
|
84
84
|
rubygems_version: 1.6.2
|
85
85
|
signing_key:
|
86
86
|
specification_version: 3
|
87
|
-
summary: Pure-ruby bloom filter
|
87
|
+
summary: Pure-ruby scalable bloom filter
|
88
88
|
test_files:
|
89
89
|
- spec/bloomer_spec.rb
|
90
90
|
- spec/spec_helper.rb
|