bloomfilter-rb 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rspec +0 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +29 -0
- data/README.md +87 -0
- data/Rakefile +9 -0
- data/bloomfilter-rb.gemspec +26 -0
- data/examples/counting-redis.rb +12 -0
- data/examples/pure-ruby-bf.rb +52 -0
- data/examples/simple-native.rb +25 -0
- data/examples/simple-redis.rb +31 -0
- data/ext/cbloomfilter/cbloomfilter.c +359 -0
- data/ext/cbloomfilter/crc32.c +32 -0
- data/ext/cbloomfilter/crc32.h +78 -0
- data/ext/cbloomfilter/extconf.rb +4 -0
- data/lib/bloomfilter-rb.rb +9 -0
- data/lib/bloomfilter/counting_redis.rb +61 -0
- data/lib/bloomfilter/filter.rb +13 -0
- data/lib/bloomfilter/native.rb +65 -0
- data/lib/bloomfilter/redis.rb +69 -0
- data/lib/bloomfilter/version.rb +3 -0
- data/spec/counting_redis_spec.rb +52 -0
- data/spec/helper.rb +2 -0
- data/spec/native_spec.rb +79 -0
- data/spec/redis_spec.rb +54 -0
- metadata +133 -0
data/.gitignore
ADDED
data/.rspec
ADDED
File without changes
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bloomfilter-rb (2.0.0)
|
5
|
+
redis (>= 2.1.1)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.2)
|
11
|
+
rake (0.8.7)
|
12
|
+
redis (2.1.1)
|
13
|
+
rspec (2.3.0)
|
14
|
+
rspec-core (~> 2.3.0)
|
15
|
+
rspec-expectations (~> 2.3.0)
|
16
|
+
rspec-mocks (~> 2.3.0)
|
17
|
+
rspec-core (2.3.1)
|
18
|
+
rspec-expectations (2.3.0)
|
19
|
+
diff-lcs (~> 1.1.2)
|
20
|
+
rspec-mocks (2.3.0)
|
21
|
+
|
22
|
+
PLATFORMS
|
23
|
+
ruby
|
24
|
+
|
25
|
+
DEPENDENCIES
|
26
|
+
bloomfilter-rb!
|
27
|
+
rake
|
28
|
+
redis (>= 2.1.1)
|
29
|
+
rspec
|
data/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# BloomFilter(s) in Ruby
|
2
|
+
|
3
|
+
- Native (MRI/C) counting bloom filter
|
4
|
+
- Redis-backed getbit/setbit non-counting bloom filter
|
5
|
+
- Redis-backed set-based counting (+TTL) bloom filter
|
6
|
+
|
7
|
+
A Bloom filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. For more detail, check the [Wikipedia article](http://en.wikipedia.org/wiki/Bloom_filter). Instead of using k different hash functions, this implementation seeds the CRC32 hash with k different initial values (0, 1, ..., k-1). This may or may not give you a good distribution; it all depends on the data.
|
8
|
+
|
9
|
+
Performance of the Bloom filter depends on a number of variables:
|
10
|
+
|
11
|
+
- size of the bit array
|
12
|
+
- size of the counter bucket
|
13
|
+
- number of hash functions
|
14
|
+
|
15
|
+
## Resources
|
16
|
+
|
17
|
+
- Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
|
18
|
+
- Applications of, and motivation for, bloom filters: [Flow analysis: Time-based bloom filters](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
|
19
|
+
|
20
|
+
***
|
21
|
+
|
22
|
+
## MRI/C API Example
|
23
|
+
|
24
|
+
An MRI/C implementation that creates an in-memory filter, which can be saved to and reloaded from disk.
|
25
|
+
|
26
|
+
require 'bloomfilter-rb'
|
27
|
+
|
28
|
+
bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
|
29
|
+
bf.insert("test")
|
30
|
+
bf.include?("test") # => true
|
31
|
+
bf.include?("blah") # => false
|
32
|
+
|
33
|
+
bf.delete("test")
|
34
|
+
bf.include?("test") # => false
|
35
|
+
|
36
|
+
# Hash with a bloom filter!
|
37
|
+
bf["test2"] = "bar"
|
38
|
+
bf["test2"] # => true
|
39
|
+
bf["test3"] # => false
|
40
|
+
|
41
|
+
bf.stats
|
42
|
+
Number of filter bits (m): 10
|
43
|
+
Number of filter elements (n): 2
|
44
|
+
Number of filter hashes (k) : 2
|
45
|
+
Predicted false positive rate = 10.87%
|
46
|
+
|
47
|
+
***
|
48
|
+
|
49
|
+
## Redis-backed setbit/getbit bloom filter
|
50
|
+
|
51
|
+
Uses [getbit](http://redis.io/commands/getbit)/[setbit](http://redis.io/commands/setbit) on Redis strings: efficient, fast, and shareable across multiple concurrent processes.
|
52
|
+
|
53
|
+
bf = BloomFilter::Redis.new
|
54
|
+
|
55
|
+
bf.insert('test')
|
56
|
+
bf.include?('test') # => true
|
57
|
+
bf.include?('blah') # => false
|
58
|
+
|
59
|
+
bf.delete('test')
|
60
|
+
bf.include?('test') # => false
|
61
|
+
|
62
|
+
### Memory footprint
|
63
|
+
|
64
|
+
- 1.0% error rate for 1M items, 10 bits per item: *2.5 MB*
|
65
|
+
- 1.0% error rate for 150M items, 10 bits per item: *358.52 MB*
|
66
|
+
- 0.1% error rate for 150M items, 15 bits per item: *537.33 MB*
|
67
|
+
|
68
|
+
***
|
69
|
+
|
70
|
+
## Redis-backed counting bloom filter with TTLs
|
71
|
+
Uses regular Redis get/set counters to implement a counting filter with optional TTL expiry. Because each "bit" requires its own key in Redis, this approach incurs a much larger memory overhead.
|
72
|
+
|
73
|
+
bf = BloomFilter::CountingRedis.new(:ttl => 2)
|
74
|
+
|
75
|
+
bf.insert('test')
|
76
|
+
bf.include?('test') # => true
|
77
|
+
|
78
|
+
sleep(2)
|
79
|
+
bf.include?('test') # => false
|
80
|
+
|
81
|
+
## Credits
|
82
|
+
|
83
|
+
Tatsuya Mori <valdzone@gmail.com> (Original C implementation: http://vald.x0.com/sb/)
|
84
|
+
|
85
|
+
## License
|
86
|
+
|
87
|
+
(MIT License) - Copyright (c) 2011 Ilya Grigorik
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "bloomfilter/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "bloomfilter-rb"
|
7
|
+
s.version = BloomFilter::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Ilya Grigorik", "Tatsuya Mori"]
|
10
|
+
s.email = ["ilya@igvita.com"]
|
11
|
+
s.homepage = "http://github.com/igrigorik/bloomfilter"
|
12
|
+
s.summary = "Counting Bloom Filter implemented in Ruby"
|
13
|
+
s.description = s.summary
|
14
|
+
s.rubyforge_project = "bloomfilter-rb"
|
15
|
+
|
16
|
+
s.add_dependency "redis", ">= 2.1.1"
|
17
|
+
s.add_development_dependency "rspec"
|
18
|
+
s.add_development_dependency "rake"
|
19
|
+
|
20
|
+
s.extensions = ["ext/cbloomfilter/extconf.rb"]
|
21
|
+
|
22
|
+
s.files = `git ls-files`.split("\n")
|
23
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
24
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
25
|
+
s.require_paths = ["lib"]
|
26
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#
|
2
|
+
# Pure ruby implementation of a Bloom filter, just for kicks
|
3
|
+
#
|
4
|
+
|
5
|
+
require 'bitset'
|
6
|
+
require 'zlib'
|
7
|
+
|
8
|
+
class BloomFilter
|
9
|
+
|
10
|
+
def initialize(max_entries, num_hashes, seed)
|
11
|
+
@num_hashes = num_hashes
|
12
|
+
@size = max_entries.to_i
|
13
|
+
@bitmap = BitSet.new(@size)
|
14
|
+
@__mask = BitSet.new(@size)
|
15
|
+
@seed = seed
|
16
|
+
end
|
17
|
+
|
18
|
+
def insert(key)
|
19
|
+
mask = make_mask(key)
|
20
|
+
@bitmap |= mask
|
21
|
+
end
|
22
|
+
|
23
|
+
def new?(key)
|
24
|
+
mask = make_mask(key)
|
25
|
+
return ((@bitmap & mask) != mask);
|
26
|
+
end
|
27
|
+
|
28
|
+
def make_mask(key)
|
29
|
+
@__mask.clear
|
30
|
+
0.upto(@num_hashes.to_i - 1) do |i|
|
31
|
+
hash = Zlib.crc32(key, i + @seed)
|
32
|
+
@__mask.set(hash % @size, 1)
|
33
|
+
end
|
34
|
+
return @__mask
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def main
|
39
|
+
bf = BloomFilter.new(1000000, 4, 0)
|
40
|
+
num = 0
|
41
|
+
while line = ARGF.gets
|
42
|
+
data = line.chop
|
43
|
+
|
44
|
+
if bf.new_entry?(data)
|
45
|
+
num += 1
|
46
|
+
bf.insert(data)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
print "#element = #{num}\n"
|
50
|
+
end
|
51
|
+
|
52
|
+
main
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bloomfilter-rb'
|
3
|
+
|
4
|
+
WORDS = %w(duck penguin bear panda)
|
5
|
+
TEST = %w(penguin moose racooon)
|
6
|
+
|
7
|
+
bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
|
8
|
+
|
9
|
+
WORDS.each { |w| bf.insert(w) }
|
10
|
+
TEST.each do |w|
|
11
|
+
puts "#{w}: #{bf.include?(w)}"
|
12
|
+
end
|
13
|
+
|
14
|
+
bf.stats
|
15
|
+
|
16
|
+
# penguin: true
|
17
|
+
# moose: false
|
18
|
+
# racooon: false
|
19
|
+
#
|
20
|
+
# Number of filter buckets (m): 100
|
21
|
+
# Number of bits per buckets (b): 1
|
22
|
+
# Number of filter elements (n): 4
|
23
|
+
# Number of filter hashes (k) : 4
|
24
|
+
# Raise on overflow? (r) : false
|
25
|
+
# Predicted false positive rate = 0.05%
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'lib/bloomfilter-rb'
|
3
|
+
|
4
|
+
items = 1_000_00
|
5
|
+
bits = 1
|
6
|
+
|
7
|
+
# p BloomFilter::Redis.new(:size => items*bits, :hashes => 7) # 2.5 mb
|
8
|
+
# p BloomFilter::Redis.new(:size => items*bits*5, :hashes => 7) # 13 mb
|
9
|
+
# p BloomFilter::Redis.new(:size => items*bits*30, :hashes => 7) # 73 mb
|
10
|
+
|
11
|
+
# 1% error rate for 5M items/day, 10 bits per item, for 30 days of data: 358.52 mb
|
12
|
+
# 0.1% error rate for 5M items/day, 15 bits per item, for 30 days of data: 537.33 mb
|
13
|
+
|
14
|
+
bf = BloomFilter::Redis.new(:size => items*bits, :hashes => 7) # 2.5 mb
|
15
|
+
|
16
|
+
seen = Set.new
|
17
|
+
err = 0
|
18
|
+
num = 100000
|
19
|
+
|
20
|
+
num.times do
|
21
|
+
item = rand(items)
|
22
|
+
|
23
|
+
if bf.include?(item) != seen.include?(item)
|
24
|
+
err += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
seen << item
|
28
|
+
bf.insert(item)
|
29
|
+
end
|
30
|
+
|
31
|
+
p [:error_rate, (err.to_f / num) * 100]
|
@@ -0,0 +1,359 @@
|
|
1
|
+
/*
|
2
|
+
* cbloomfilter.c - simple Bloom Filter
|
3
|
+
* (c) Tatsuya Mori <valdzone@gmail.com>
|
4
|
+
*/
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "crc32.h"
|
8
|
+
|
9
|
+
#if !defined(RSTRING_LEN)
|
10
|
+
# define RSTRING_LEN(x) (RSTRING(x)->len)
|
11
|
+
# define RSTRING_PTR(x) (RSTRING(x)->ptr)
|
12
|
+
#endif
|
13
|
+
|
14
|
+
static VALUE cBloomFilter;
|
15
|
+
|
16
|
+
struct BloomFilter {
|
17
|
+
int m; /* # of buckets in a bloom filter */
|
18
|
+
int b; /* # of bits in a bloom filter bucket */
|
19
|
+
int k; /* # of hash functions */
|
20
|
+
int s; /* # seed of hash functions */
|
21
|
+
int r; /* # raise on bucket overflow? */
|
22
|
+
int num_set; /* # of set bits */
|
23
|
+
unsigned char *ptr; /* bits data */
|
24
|
+
int bytes; /* size of byte data */
|
25
|
+
};
|
26
|
+
|
27
|
+
void bits_free(struct BloomFilter *bf) {
|
28
|
+
ruby_xfree(bf->ptr);
|
29
|
+
}
|
30
|
+
|
31
|
+
void bucket_unset(struct BloomFilter *bf, int index) {
|
32
|
+
int byte_offset = (index * bf->b) / 8;
|
33
|
+
int bit_offset = (index * bf->b) % 8;
|
34
|
+
unsigned int c = bf->ptr[byte_offset];
|
35
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
36
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
37
|
+
if ((c & mask) == 0) {
|
38
|
+
// do nothing
|
39
|
+
} else {
|
40
|
+
bf->ptr[byte_offset] -= (1 << bit_offset) & ((1 << 8) - 1);
|
41
|
+
bf->ptr[byte_offset + 1] -= ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
42
|
+
}
|
43
|
+
|
44
|
+
}
|
45
|
+
|
46
|
+
void bucket_set(struct BloomFilter *bf, int index) {
|
47
|
+
int byte_offset = (index * bf->b) / 8;
|
48
|
+
int bit_offset = (index * bf->b) % 8;
|
49
|
+
unsigned int c = bf->ptr[byte_offset];
|
50
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
51
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
52
|
+
if ((c & mask) == mask) {
|
53
|
+
if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
|
54
|
+
} else {
|
55
|
+
bf->ptr[byte_offset] += (1 << bit_offset) & ((1 << 8) - 1);
|
56
|
+
bf->ptr[byte_offset + 1] += ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
int bucket_check(struct BloomFilter *bf, int index) {
|
61
|
+
int byte_offset = (index * bf->b) / 8;
|
62
|
+
int bit_offset = (index * bf->b) % 8;
|
63
|
+
unsigned int c = bf->ptr[byte_offset];
|
64
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
65
|
+
|
66
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
67
|
+
return (c & mask) >> bit_offset;
|
68
|
+
}
|
69
|
+
|
70
|
+
int bucket_get(struct BloomFilter *bf, int index) {
|
71
|
+
int byte_offset = (index * bf->b) / 8;
|
72
|
+
int bit_offset = (index * bf->b) % 8;
|
73
|
+
unsigned int c = bf->ptr[byte_offset];
|
74
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
75
|
+
|
76
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
77
|
+
return (c & mask) >> bit_offset;
|
78
|
+
}
|
79
|
+
|
80
|
+
static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
|
81
|
+
struct BloomFilter *bf;
|
82
|
+
VALUE arg1, arg2, arg3, arg4, arg5, obj;
|
83
|
+
int m, k, s, b, r, bytes;
|
84
|
+
|
85
|
+
obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
|
86
|
+
|
87
|
+
/* default = Fugou approach :-) */
|
88
|
+
arg1 = INT2FIX(100000000);
|
89
|
+
arg2 = INT2FIX(4);
|
90
|
+
arg3 = INT2FIX(0);
|
91
|
+
arg4 = INT2FIX(1);
|
92
|
+
arg5 = INT2FIX(0);
|
93
|
+
|
94
|
+
switch (argc) {
|
95
|
+
case 5:
|
96
|
+
if (argv[4] == Qtrue) {
|
97
|
+
arg5 = INT2FIX(1);
|
98
|
+
}
|
99
|
+
case 4:
|
100
|
+
arg4 = argv[3];
|
101
|
+
case 3:
|
102
|
+
arg3 = argv[2];
|
103
|
+
case 2:
|
104
|
+
arg2 = argv[1];
|
105
|
+
case 1:
|
106
|
+
arg1 = argv[0];
|
107
|
+
break;
|
108
|
+
}
|
109
|
+
|
110
|
+
m = FIX2INT(arg1);
|
111
|
+
k = FIX2INT(arg2);
|
112
|
+
s = FIX2INT(arg3);
|
113
|
+
b = FIX2INT(arg4);
|
114
|
+
r = FIX2INT(arg5);
|
115
|
+
|
116
|
+
if (b < 1 || b > 8)
|
117
|
+
rb_raise(rb_eArgError, "bucket size");
|
118
|
+
if (m < 1)
|
119
|
+
rb_raise(rb_eArgError, "array size");
|
120
|
+
if (k < 1)
|
121
|
+
rb_raise(rb_eArgError, "hash length");
|
122
|
+
if (s < 0)
|
123
|
+
rb_raise(rb_eArgError, "random seed");
|
124
|
+
|
125
|
+
bf->b = b;
|
126
|
+
bf->m = m;
|
127
|
+
bf->k = k;
|
128
|
+
bf->s = s;
|
129
|
+
bf->r = r;
|
130
|
+
bf->num_set = 0;
|
131
|
+
|
132
|
+
bf->bytes = ((m * b) + 15) / 8;
|
133
|
+
bf->ptr = ALLOC_N(unsigned char, bf->bytes);
|
134
|
+
|
135
|
+
/* initialize the bits with zeros */
|
136
|
+
memset(bf->ptr, 0, bf->bytes);
|
137
|
+
rb_iv_set(obj, "@hash_value", rb_hash_new());
|
138
|
+
|
139
|
+
return obj;
|
140
|
+
}
|
141
|
+
|
142
|
+
static VALUE bf_clear(VALUE self) {
|
143
|
+
struct BloomFilter *bf;
|
144
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
145
|
+
memset(bf->ptr, 0, bf->bytes);
|
146
|
+
return Qtrue;
|
147
|
+
}
|
148
|
+
|
149
|
+
static VALUE bf_m(VALUE self) {
|
150
|
+
struct BloomFilter *bf;
|
151
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
152
|
+
return INT2FIX(bf->m);
|
153
|
+
}
|
154
|
+
|
155
|
+
static VALUE bf_k(VALUE self) {
|
156
|
+
struct BloomFilter *bf;
|
157
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
158
|
+
return INT2FIX(bf->k);
|
159
|
+
}
|
160
|
+
|
161
|
+
static VALUE bf_b(VALUE self) {
|
162
|
+
struct BloomFilter *bf;
|
163
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
164
|
+
return INT2FIX(bf->b);
|
165
|
+
}
|
166
|
+
|
167
|
+
static VALUE bf_r(VALUE self) {
|
168
|
+
struct BloomFilter *bf;
|
169
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
170
|
+
return bf->r == 0 ? Qfalse : Qtrue;
|
171
|
+
}
|
172
|
+
|
173
|
+
static VALUE bf_num_set(VALUE self) {
|
174
|
+
struct BloomFilter *bf;
|
175
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
176
|
+
return INT2FIX(bf->num_set);
|
177
|
+
}
|
178
|
+
|
179
|
+
static VALUE bf_insert(VALUE self, VALUE key) {
|
180
|
+
VALUE skey;
|
181
|
+
int index, seed;
|
182
|
+
int i, len, m, k, s;
|
183
|
+
char *ckey;
|
184
|
+
struct BloomFilter *bf;
|
185
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
186
|
+
|
187
|
+
skey = rb_obj_as_string(key);
|
188
|
+
ckey = StringValuePtr(skey);
|
189
|
+
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
190
|
+
|
191
|
+
m = bf->m;
|
192
|
+
k = bf->k;
|
193
|
+
s = bf->s;
|
194
|
+
|
195
|
+
for (i = 0; i <= k - 1; i++) {
|
196
|
+
/* seeds for hash functions */
|
197
|
+
seed = i + s;
|
198
|
+
|
199
|
+
/* hash */
|
200
|
+
index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
|
201
|
+
|
202
|
+
/* set a bit at the index */
|
203
|
+
bucket_set(bf, index);
|
204
|
+
}
|
205
|
+
|
206
|
+
bf->num_set += 1;
|
207
|
+
return Qnil;
|
208
|
+
}
|
209
|
+
|
210
|
+
static VALUE bf_merge(VALUE self, VALUE other) {
|
211
|
+
struct BloomFilter *bf, *target;
|
212
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
213
|
+
Data_Get_Struct(other, struct BloomFilter, target);
|
214
|
+
int i;
|
215
|
+
for (i = 0; i < bf->bytes; i++) {
|
216
|
+
bf->ptr[i] |= target->ptr[i];
|
217
|
+
}
|
218
|
+
return Qnil;
|
219
|
+
}
|
220
|
+
|
221
|
+
static VALUE bf_delete(VALUE self, VALUE key) {
|
222
|
+
int index, seed;
|
223
|
+
int i, len, m, k, s;
|
224
|
+
char *ckey;
|
225
|
+
VALUE skey;
|
226
|
+
struct BloomFilter *bf;
|
227
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
228
|
+
|
229
|
+
skey = rb_obj_as_string(key);
|
230
|
+
ckey = StringValuePtr(skey);
|
231
|
+
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
232
|
+
|
233
|
+
m = bf->m;
|
234
|
+
k = bf->k;
|
235
|
+
s = bf->s;
|
236
|
+
|
237
|
+
for (i = 0; i <= k - 1; i++) {
|
238
|
+
/* seeds for hash functions */
|
239
|
+
seed = i + s;
|
240
|
+
|
241
|
+
/* hash */
|
242
|
+
index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
|
243
|
+
|
244
|
+
/* unset (decrement) the bucket at the index */
|
245
|
+
bucket_unset(bf, index);
|
246
|
+
}
|
247
|
+
|
248
|
+
bf->num_set += 1;
|
249
|
+
return Qnil;
|
250
|
+
}
|
251
|
+
|
252
|
+
|
253
|
+
static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
|
254
|
+
int index, seed;
|
255
|
+
int i, len, m, k, s, tests_idx, vlen;
|
256
|
+
char *ckey;
|
257
|
+
VALUE tests, key, skey;
|
258
|
+
struct BloomFilter *bf;
|
259
|
+
|
260
|
+
rb_scan_args(argc, argv, "*", &tests);
|
261
|
+
|
262
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
263
|
+
vlen = RARRAY_LEN(tests);
|
264
|
+
for(tests_idx = 0; tests_idx < vlen; tests_idx++) {
|
265
|
+
key = rb_ary_entry(tests, tests_idx);
|
266
|
+
skey = rb_obj_as_string(key);
|
267
|
+
ckey = StringValuePtr(skey);
|
268
|
+
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
269
|
+
|
270
|
+
m = bf->m;
|
271
|
+
k = bf->k;
|
272
|
+
s = bf->s;
|
273
|
+
|
274
|
+
for (i = 0; i <= k - 1; i++) {
|
275
|
+
/* seeds for hash functions */
|
276
|
+
seed = i + s;
|
277
|
+
|
278
|
+
/* hash */
|
279
|
+
index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
|
280
|
+
|
281
|
+
/* check the bit at the index */
|
282
|
+
if (!bucket_check(bf, index)) {
|
283
|
+
return Qfalse; /* i.e., it is a new entry ; escape the loop */
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
return Qtrue;
|
288
|
+
}
|
289
|
+
|
290
|
+
}
|
291
|
+
|
292
|
+
static VALUE bf_to_s(VALUE self) {
|
293
|
+
struct BloomFilter *bf;
|
294
|
+
unsigned char *ptr;
|
295
|
+
int i;
|
296
|
+
VALUE str;
|
297
|
+
|
298
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
299
|
+
str = rb_str_new(0, bf->m);
|
300
|
+
|
301
|
+
ptr = (unsigned char *) RSTRING_PTR(str);
|
302
|
+
for (i = 0; i < bf->m; i++)
|
303
|
+
*ptr++ = bucket_get(bf, i) ? '1' : '0';
|
304
|
+
|
305
|
+
return str;
|
306
|
+
}
|
307
|
+
|
308
|
+
static VALUE bf_bitmap(VALUE self) {
|
309
|
+
struct BloomFilter *bf;
|
310
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
311
|
+
|
312
|
+
VALUE str = rb_str_new(0, bf->m);
|
313
|
+
unsigned char* ptr = (unsigned char *) RSTRING_PTR(str);
|
314
|
+
|
315
|
+
int i;
|
316
|
+
for (i = 0; i < bf->m; i++)
|
317
|
+
*ptr++ = bucket_get(bf, i);
|
318
|
+
|
319
|
+
return str;
|
320
|
+
}
|
321
|
+
|
322
|
+
static VALUE bf_load(VALUE self, VALUE bitmap) {
|
323
|
+
struct BloomFilter *bf;
|
324
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
325
|
+
unsigned char* ptr = (unsigned char *) RSTRING_PTR(bitmap);
|
326
|
+
|
327
|
+
int i;
|
328
|
+
for (i = 0; i < bf->m; i++) {
|
329
|
+
if (*ptr++)
|
330
|
+
bucket_set(bf, i);
|
331
|
+
}
|
332
|
+
|
333
|
+
return Qnil;
|
334
|
+
}
|
335
|
+
|
336
|
+
void Init_cbloomfilter(void) {
|
337
|
+
cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
|
338
|
+
rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
|
339
|
+
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
340
|
+
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
341
|
+
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
342
|
+
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
343
|
+
rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
|
344
|
+
rb_define_method(cBloomFilter, "insert", bf_insert, 1);
|
345
|
+
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
346
|
+
rb_define_method(cBloomFilter, "include?", bf_include, -1);
|
347
|
+
rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
348
|
+
rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
|
349
|
+
|
350
|
+
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
351
|
+
rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
|
352
|
+
rb_define_method(cBloomFilter, "load", bf_load, 1);
|
353
|
+
|
354
|
+
/* functions that have not been implemented, yet */
|
355
|
+
|
356
|
+
// rb_define_method(cBloomFilter, "&", bf_and, 1);
|
357
|
+
// rb_define_method(cBloomFilter, "|", bf_or, 1);
|
358
|
+
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
359
|
+
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
#include "crc32.h"
|
24
|
+
|
25
|
+
unsigned int crc32(unsigned int crc, char *buf, int len) {
|
26
|
+
while (len > 0) {
|
27
|
+
crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
|
28
|
+
--len;
|
29
|
+
++buf;
|
30
|
+
}
|
31
|
+
return crc;
|
32
|
+
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
static unsigned int crc_table[] = {
|
24
|
+
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
|
25
|
+
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
|
26
|
+
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
|
27
|
+
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
|
28
|
+
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
|
29
|
+
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
|
30
|
+
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
|
31
|
+
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
|
32
|
+
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
|
33
|
+
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
|
34
|
+
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
|
35
|
+
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
|
36
|
+
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
|
37
|
+
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
|
38
|
+
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
|
39
|
+
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
|
40
|
+
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
|
41
|
+
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
|
42
|
+
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
|
43
|
+
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
|
44
|
+
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
|
45
|
+
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
|
46
|
+
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
|
47
|
+
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
|
48
|
+
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
|
49
|
+
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
|
50
|
+
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
|
51
|
+
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
|
52
|
+
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
|
53
|
+
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
|
54
|
+
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
|
55
|
+
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
|
56
|
+
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
|
57
|
+
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
|
58
|
+
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
|
59
|
+
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
|
60
|
+
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
|
61
|
+
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
|
62
|
+
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
|
63
|
+
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
|
64
|
+
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
|
65
|
+
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
|
66
|
+
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
|
67
|
+
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
|
68
|
+
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
|
69
|
+
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
|
70
|
+
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
|
71
|
+
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
|
72
|
+
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
|
73
|
+
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
|
74
|
+
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
|
75
|
+
0x2d02ef8dUL
|
76
|
+
};
|
77
|
+
|
78
|
+
unsigned int crc32(unsigned int crc, char *buf, int len);
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module BloomFilter
|
2
|
+
class CountingRedis < Filter
|
3
|
+
|
4
|
+
def initialize(opts = {})
|
5
|
+
@opts = {
|
6
|
+
:size => 100,
|
7
|
+
:hashes => 4,
|
8
|
+
:seed => Time.now.to_i,
|
9
|
+
:bucket => 3,
|
10
|
+
:ttl => false,
|
11
|
+
:server => {}
|
12
|
+
}.merge opts
|
13
|
+
@db = ::Redis.new(@opts[:server])
|
14
|
+
end
|
15
|
+
|
16
|
+
def insert(key, ttl=nil)
|
17
|
+
ttl = @opts[:ttl] if ttl.nil?
|
18
|
+
|
19
|
+
indexes_for(key).each do |idx|
|
20
|
+
@db.incr idx
|
21
|
+
@db.expire(idx, ttl) if ttl
|
22
|
+
end
|
23
|
+
end
|
24
|
+
alias :[]= :insert
|
25
|
+
|
26
|
+
def delete(key)
|
27
|
+
indexes_for(key).each do |idx|
|
28
|
+
if @db.decr(idx).to_i <= 0
|
29
|
+
@db.del(idx)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def include?(*keys)
|
35
|
+
indexes = keys.collect { |key| indexes_for(key) }
|
36
|
+
not @db.mget(*indexes.flatten).include? nil
|
37
|
+
end
|
38
|
+
alias :key? :include?
|
39
|
+
|
40
|
+
def num_set
|
41
|
+
@db.keys("rbloom:*").size
|
42
|
+
end
|
43
|
+
alias :size :num_set
|
44
|
+
|
45
|
+
def clear
|
46
|
+
@db.flushdb
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# compute index offsets for provided key
|
52
|
+
def indexes_for(key)
|
53
|
+
indexes = []
|
54
|
+
@opts[:hashes].times do |i|
|
55
|
+
indexes.push "rbloom:" + (Zlib.crc32("#{key}:#{i+@opts[:seed]}") % @opts[:size]).to_s
|
56
|
+
end
|
57
|
+
|
58
|
+
indexes
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module BloomFilter
|
2
|
+
class Filter
|
3
|
+
def stats
|
4
|
+
fp = ((1.0 - Math.exp(-(@opts[:hashes] * size).to_f / @opts[:size])) ** @opts[:hashes]) * 100
|
5
|
+
printf "Number of filter buckets (m): %d\n" % @opts[:size]
|
6
|
+
printf "Number of bits per buckets (b): %d\n" % @opts[:bucket]
|
7
|
+
printf "Number of filter elements (n): %d\n" % size
|
8
|
+
printf "Number of filter hashes (k) : %d\n" % @opts[:hashes]
|
9
|
+
printf "Raise on overflow? (r) : %s\n" % @opts[:raise].to_s
|
10
|
+
printf "Predicted false positive rate = %.2f%\n" % fp
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module BloomFilter
  # Bloom filter backed by the CBloomFilter C extension.
  #
  # Membership operations are forwarded to the wrapped CBloomFilter (exposed
  # through +bf+). On top of that this class supplies option defaults,
  # Hash-like aliases and Marshal-based save/load.
  class Native < Filter
    # The wrapped CBloomFilter instance.
    attr_reader :bf

    # Builds a new filter.
    #
    # opts - Hash overriding any of the defaults listed in the body
    #        (:size, :hashes, :seed, :bucket, :raise).
    def initialize(opts = {})
      @opts = {
        :size    => 100,
        :hashes  => 4,
        :seed    => Time.now.to_i,
        :bucket  => 3,
        :raise   => false
      }.merge(opts)

      # arg 1: m => size : number of buckets in a bloom filter
      # arg 2: k => hashes : number of hash functions
      # arg 3: s => seed : seed of hash functions
      # arg 4: b => bucket : number of bits in a bloom filter bucket
      # arg 5: r => raise : raise on bucket overflow?

      @bf = CBloomFilter.new(@opts[:size], @opts[:hashes], @opts[:seed], @opts[:bucket], @opts[:raise])
    end

    # Adds +key+ to the filter.
    def insert(key)
      @bf.insert(key)
    end
    alias :[]= :insert

    # Forwards all given keys to the underlying filter's include? check.
    def include?(*keys)
      @bf.include?(*keys)
    end
    alias :key? :include?
    alias :[] :include?

    def delete(key); @bf.delete(key); end
    def clear; @bf.clear; end
    def size; @bf.num_set; end

    # Merges another Native filter into this one; +o+ must expose its raw
    # filter through #bf.
    def merge!(o); @bf.merge!(o.bf); end

    # Raw bitmap of the underlying filter (the payload written by #save).
    def bitmap
      @bf.bitmap
    end

    # Marshal hook: rebuilds this instance from the [opts, bitmap] pair
    # produced by #marshal_dump.
    def marshal_load(ary)
      opts, bitmap = *ary

      # Marshal allocates the object without running the constructor, so run
      # it here on this very instance. That restores @opts and makes @bf a
      # real CBloomFilter; wrapping a second Native in @bf instead would leave
      # @opts nil, which breaks a later save/load round trip and merge!.
      initialize(opts)
      @bf.load(bitmap) unless bitmap.nil?
    end

    # Marshal hook: the options used to build the filter plus its bitmap.
    def marshal_dump
      [@opts, @bf.bitmap]
    end

    # Restores a filter previously written with #save.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects; only load files
    # from a trusted source.
    def self.load(filename)
      # Block form closes the handle; binary mode keeps the dump byte-exact.
      File.open(filename, 'rb') { |f| Marshal.load(f) }
    end

    # Serializes this filter to +filename+ using Marshal.
    def save(filename)
      File.open(filename, 'wb') do |f|
        f << Marshal.dump(self)
      end
    end

  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module BloomFilter
  # Bloom filter stored as a single Redis bit string.
  #
  # Each key is hashed :hashes times; every hash selects one bit offset
  # (modulo :size) inside the Redis string named by :namespace.
  class Redis < Filter

    # opts - Hash overriding any of the defaults listed in the body.
    #        :server is handed to ::Redis.new unchanged.
    def initialize(opts = {})
      @opts = {
        :size      => 100,
        :hashes    => 4,
        :seed      => Time.now.to_i,
        :namespace => 'redis',
        :eager     => true,
        :server    => {}
      }.merge opts
      @db = ::Redis.new(@opts[:server])

      if @opts[:eager]
        # allocate the memory immediately: touch the bit at offset :size,
        # then reset it so it does not count as set
        @db.setbit @opts[:namespace], @opts[:size], 1
        @db.setbit @opts[:namespace], @opts[:size], 0
      end
    end

    # Sets every bit that +key+ hashes to. +ttl+ is accepted but not used by
    # this implementation.
    def insert(key, ttl=nil)
      indexes_for(key) { |idx| @db.setbit @opts[:namespace], idx, 1 }
    end
    alias :[]= :insert

    # Returns true only if every bit of every given key is set; bails out
    # with false on the first zero bit.
    def include?(*keys)
      keys.each do |key|
        indexes_for(key) do |idx|
          return false if @db.getbit(@opts[:namespace], idx).zero?
        end
      end

      true
    end
    alias :key? :include?

    # Clears every bit that +key+ hashes to. The filter keeps plain bits, not
    # counters, so any other key sharing one of those bits will also stop
    # matching afterwards.
    def delete(key)
      indexes_for(key) do |idx|
        @db.setbit @opts[:namespace], idx, 0
      end
    end

    # Empties the filter by removing its Redis key; GETBIT on a missing key
    # reads as 0 and SETBIT recreates the string on demand.
    #
    # Storing the value 0 instead would write the ASCII character "0" (0x30),
    # which leaves two bits set in the supposedly empty filter.
    def clear
      @db.del @opts[:namespace]
    end

    # NOTE(review): STRLEN reports the length of the stored string in bytes,
    # not the number of set bits - confirm this is what callers of
    # #size / #num_set expect.
    def num_set
      @db.strlen @opts[:namespace]
    end
    alias :size :num_set

    # Prints the configured bucket and hash counts to stdout.
    def stats
      printf "Number of filter buckets (m): %d\n" % @opts[:size]
      printf "Number of filter hashes (k) : %d\n" % @opts[:hashes]
    end

    private

      # compute index offsets for provided key
      #
      # Yields one bit offset per hash function: CRC32 of "key:<i + seed>"
      # reduced modulo the filter size.
      def indexes_for(key)
        @opts[:hashes].times do |i|
          yield Zlib.crc32("#{key}:#{i+@opts[:seed]}") % @opts[:size]
        end
      end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'helper'

describe BloomFilter::CountingRedis do
  include BloomFilter

  context "use Redis for storage" do
    # Filter built with default options; created on first use in an example.
    let(:filter) { CountingRedis.new }

    it "should store data in Redis" do
      filter.insert(:abcd)
      filter.insert('test')

      filter.include?('test').should be_true
      filter.key?('test').should be_true

      # a multi-key lookup only passes when every key is present
      filter.include?('test', 'test2').should be_false
      filter.include?('test', 'abcd').should be_true
    end

    it "should accept a TTL value for a key" do
      expiring = CountingRedis.new(:ttl => 1)

      expiring.insert('test')
      expiring.include?('test').should be_true

      # wait longer than the configured TTL before checking again
      sleep(2)
      expiring.include?('test').should be_false
    end

    it "should delete keys from Redis" do
      filter.insert('test')
      filter.include?('test').should be_true

      filter.delete('test')
      filter.include?('test').should be_false
    end

    it "should output current stats" do
      filter.clear

      filter.insert('test')
      filter.size.should == 4
      lambda { filter.stats }.should_not raise_error
    end

    it "should connect to remote redis server" do
      lambda { CountingRedis.new }.should_not raise_error
    end
  end
end
|
data/spec/helper.rb
ADDED
data/spec/native_spec.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'helper'

describe BloomFilter::Native do
  include BloomFilter

  it "should clear" do
    bf = Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf.insert("test")
    bf.include?("test").should be_true
    bf.clear
    bf.include?("test").should be_false
  end

  it "should merge" do
    bf1 = Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf2 = Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf2.insert("test")
    bf1.include?("test").should be_false
    bf1.merge!(bf2)
    bf1.include?("test").should be_true
    bf2.include?("test").should be_true
  end

  context "behave like a bloomfilter" do
    it "should test set membership" do
      bf = Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
      bf.insert("test")
      bf.insert("test1")

      bf.include?("test").should be_true
      bf.include?("abcd").should be_false
      bf.include?("test", "test1").should be_true
    end

    it "should work with any object's to_s" do
      bf = Native.new
      bf.insert(:test)
      bf.insert(:test1)
      bf.insert(12345)

      bf.include?("test").should be_true
      bf.include?("abcd").should be_false
      bf.include?("test", "test1", '12345').should be_true
    end
  end

  context "behave like counting bloom filter" do
    it "should delete / decrement keys" do
      bf = Native.new

      bf.insert("test")
      bf.include?("test").should be_true

      bf.delete("test")
      bf.include?("test").should be_false
    end
  end

  context "serialize" do
    # Remove the dump file only if an example actually created it, so a
    # failing save is not masked by Errno::ENOENT raised from this hook.
    after(:each) { File.unlink('bf.out') if File.exist?('bf.out') }

    it "should marshal the bloomfilter" do
      bf = Native.new
      lambda { bf.save('bf.out') }.should_not raise_error
    end

    it "should load marshalled bloomfilter" do
      bf = Native.new
      bf.insert('foo')
      bf.insert('bar')
      bf.save('bf.out')

      bf = Native.load('bf.out')
      bf.include?('foo').should be_true
      bf.include?('bar').should be_true
      bf.include?('baz').should be_false
    end
  end
end
|
data/spec/redis_spec.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'helper'

describe BloomFilter::Redis do
  include BloomFilter

  context "use Redis bitstring for storage" do
    let(:bf) { Redis.new }

    # The default filter lives in one persistent Redis key and is seeded from
    # the current time, so bits left behind by earlier examples or earlier
    # runs would pile up and eventually break the be_false expectations.
    # Reset the filter before every example.
    before(:each) { bf.clear }

    it "should store data in Redis" do
      bf.insert(:abcd)
      bf.insert('test')
      bf.include?('test').should be_true
      bf.key?('test').should be_true

      # a multi-key lookup only passes when every key is present
      bf.include?('test', 'test2').should be_false
      bf.include?('test', 'abcd').should be_true
    end

    it "should delete keys from Redis" do
      bf.insert('test')
      bf.include?('test').should be_true

      bf.delete('test')
      bf.include?('test').should be_false
    end

    it "should clear Redis filter" do
      bf.insert('test')
      bf.include?('test').should be_true

      bf.clear
      bf.include?('test').should be_false
    end

    it "should output current stats" do
      bf.clear
      bf.insert('test')
      lambda { bf.stats }.should_not raise_error
    end

    it "should connect to remote redis server" do
      lambda { Redis.new }.should_not raise_error
    end

    it "should allow namespaced BloomFilters" do
      bf1 = Redis.new(:namespace => :a)
      bf2 = Redis.new(:namespace => :b)

      bf1.insert('test')
      bf1.include?('test').should be_true
      bf2.include?('test').should be_false
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bloomfilter-rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 2
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
version: 2.0.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Ilya Grigorik
|
13
|
+
- Tatsuya Mori
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: redis
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
segments:
|
30
|
+
- 2
|
31
|
+
- 1
|
32
|
+
- 1
|
33
|
+
version: 2.1.1
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rake
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
type: :development
|
61
|
+
version_requirements: *id003
|
62
|
+
description: Counting Bloom Filter implemented in Ruby
|
63
|
+
email:
|
64
|
+
- ilya@igvita.com
|
65
|
+
executables: []
|
66
|
+
|
67
|
+
extensions:
|
68
|
+
- ext/cbloomfilter/extconf.rb
|
69
|
+
extra_rdoc_files: []
|
70
|
+
|
71
|
+
files:
|
72
|
+
- .gitignore
|
73
|
+
- .rspec
|
74
|
+
- Gemfile
|
75
|
+
- Gemfile.lock
|
76
|
+
- README.md
|
77
|
+
- Rakefile
|
78
|
+
- bloomfilter-rb.gemspec
|
79
|
+
- examples/counting-redis.rb
|
80
|
+
- examples/pure-ruby-bf.rb
|
81
|
+
- examples/simple-native.rb
|
82
|
+
- examples/simple-redis.rb
|
83
|
+
- ext/cbloomfilter/cbloomfilter.c
|
84
|
+
- ext/cbloomfilter/crc32.c
|
85
|
+
- ext/cbloomfilter/crc32.h
|
86
|
+
- ext/cbloomfilter/extconf.rb
|
87
|
+
- lib/bloomfilter-rb.rb
|
88
|
+
- lib/bloomfilter/counting_redis.rb
|
89
|
+
- lib/bloomfilter/filter.rb
|
90
|
+
- lib/bloomfilter/native.rb
|
91
|
+
- lib/bloomfilter/redis.rb
|
92
|
+
- lib/bloomfilter/version.rb
|
93
|
+
- spec/counting_redis_spec.rb
|
94
|
+
- spec/helper.rb
|
95
|
+
- spec/native_spec.rb
|
96
|
+
- spec/redis_spec.rb
|
97
|
+
has_rdoc: true
|
98
|
+
homepage: http://github.com/igrigorik/bloomfilter
|
99
|
+
licenses: []
|
100
|
+
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project: bloomfilter-rb
|
125
|
+
rubygems_version: 1.3.7
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: Counting Bloom Filter implemented in Ruby
|
129
|
+
test_files:
|
130
|
+
- spec/counting_redis_spec.rb
|
131
|
+
- spec/helper.rb
|
132
|
+
- spec/native_spec.rb
|
133
|
+
- spec/redis_spec.rb
|