bloomfilter-rb 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rspec +0 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +29 -0
- data/README.md +87 -0
- data/Rakefile +9 -0
- data/bloomfilter-rb.gemspec +26 -0
- data/examples/counting-redis.rb +12 -0
- data/examples/pure-ruby-bf.rb +52 -0
- data/examples/simple-native.rb +25 -0
- data/examples/simple-redis.rb +31 -0
- data/ext/cbloomfilter/cbloomfilter.c +359 -0
- data/ext/cbloomfilter/crc32.c +32 -0
- data/ext/cbloomfilter/crc32.h +78 -0
- data/ext/cbloomfilter/extconf.rb +4 -0
- data/lib/bloomfilter-rb.rb +9 -0
- data/lib/bloomfilter/counting_redis.rb +61 -0
- data/lib/bloomfilter/filter.rb +13 -0
- data/lib/bloomfilter/native.rb +65 -0
- data/lib/bloomfilter/redis.rb +69 -0
- data/lib/bloomfilter/version.rb +3 -0
- data/spec/counting_redis_spec.rb +52 -0
- data/spec/helper.rb +2 -0
- data/spec/native_spec.rb +79 -0
- data/spec/redis_spec.rb +54 -0
- metadata +133 -0
data/.gitignore
ADDED
data/.rspec
ADDED
File without changes
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bloomfilter-rb (2.0.0)
|
5
|
+
redis (>= 2.1.1)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.2)
|
11
|
+
rake (0.8.7)
|
12
|
+
redis (2.1.1)
|
13
|
+
rspec (2.3.0)
|
14
|
+
rspec-core (~> 2.3.0)
|
15
|
+
rspec-expectations (~> 2.3.0)
|
16
|
+
rspec-mocks (~> 2.3.0)
|
17
|
+
rspec-core (2.3.1)
|
18
|
+
rspec-expectations (2.3.0)
|
19
|
+
diff-lcs (~> 1.1.2)
|
20
|
+
rspec-mocks (2.3.0)
|
21
|
+
|
22
|
+
PLATFORMS
|
23
|
+
ruby
|
24
|
+
|
25
|
+
DEPENDENCIES
|
26
|
+
bloomfilter-rb!
|
27
|
+
rake
|
28
|
+
redis (>= 2.1.1)
|
29
|
+
rspec
|
data/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# BloomFilter(s) in Ruby
|
2
|
+
|
3
|
+
- Native (MRI/C) counting bloom filter
|
4
|
+
- Redis-backed getbit/setbit non-counting bloom filter
|
5
|
+
- Redis-backed set-based counting (+TTL) bloom filter
|
6
|
+
|
7
|
+
A Bloom filter is a space-efficient probabilistic data structure used to test whether an element is a member of a set. False positives are possible, but false negatives are not. For more detail, check the [wikipedia article](http://en.wikipedia.org/wiki/Bloom_filter). Instead of using k different hash functions, this implementation seeds the CRC32 hash with k different initial values (0, 1, ..., k-1). This may or may not give you a good distribution; it all depends on the data.
|
8
|
+
|
9
|
+
Performance of the Bloom filter depends on a number of variables:
|
10
|
+
|
11
|
+
- size of the bit array
|
12
|
+
- size of the counter bucket
|
13
|
+
- number of hash functions
|
14
|
+
|
15
|
+
## Resources
|
16
|
+
|
17
|
+
- Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
|
18
|
+
- Applications & reasons behind bloom filter: [Flow analysis: Time based bloom filter](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
|
19
|
+
|
20
|
+
***
|
21
|
+
|
22
|
+
## MRI/C API Example
|
23
|
+
|
24
|
+
MRI/C implementation which creates an in-memory filter which can be saved and reloaded from disk.
|
25
|
+
|
26
|
+
require 'bloomfilter-rb'
|
27
|
+
|
28
|
+
bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
|
29
|
+
bf.insert("test")
|
30
|
+
bf.include?("test") # => true
|
31
|
+
bf.include?("blah") # => false
|
32
|
+
|
33
|
+
bf.delete("test")
|
34
|
+
bf.include?("test") # => false
|
35
|
+
|
36
|
+
# Hash with a bloom filter!
|
37
|
+
bf["test2"] = "bar"
|
38
|
+
bf["test2"] # => true
|
39
|
+
bf["test3"] # => false
|
40
|
+
|
41
|
+
bf.stats
|
42
|
+
Number of filter bits (m): 10
|
43
|
+
Number of filter elements (n): 2
|
44
|
+
Number of filter hashes (k) : 2
|
45
|
+
Predicted false positive rate = 10.87%
|
46
|
+
|
47
|
+
***
|
48
|
+
|
49
|
+
## Redis-backed setbit/getbit bloom filter
|
50
|
+
|
51
|
+
Uses [getbit](http://redis.io/commands/getbit)/[setbit](http://redis.io/commands/setbit) on Redis strings - efficient, fast, can be shared by multiple/concurrent processes.
|
52
|
+
|
53
|
+
bf = BloomFilter::Redis.new
|
54
|
+
|
55
|
+
bf.insert('test')
|
56
|
+
bf.include?('test') # => true
|
57
|
+
bf.include?('blah') # => false
|
58
|
+
|
59
|
+
bf.delete('test')
|
60
|
+
bf.include?('test') # => false
|
61
|
+
|
62
|
+
### Memory footprint
|
63
|
+
|
64
|
+
- 1.0% error rate for 1M items, 10 bits/item: *2.5 mb*
|
65
|
+
- 1.0% error rate for 150M items, 10 bits per item: *358.52 mb*
|
66
|
+
- 0.1% error rate for 150M items, 15 bits per item: *537.33 mb*
|
67
|
+
|
68
|
+
***
|
69
|
+
|
70
|
+
## Redis-backed counting bloom filter with TTLs
|
71
|
+
Uses regular Redis get/set counters to implement a counting filter with optional TTL expiry. Because each "bit" requires its own key in Redis, you do incur a much larger memory overhead.
|
72
|
+
|
73
|
+
bf = BloomFilter::CountingRedis.new(:ttl => 2)
|
74
|
+
|
75
|
+
bf.insert('test')
|
76
|
+
bf.include?('test') # => true
|
77
|
+
|
78
|
+
sleep(2)
|
79
|
+
bf.include?('test') # => false
|
80
|
+
|
81
|
+
## Credits
|
82
|
+
|
83
|
+
Tatsuya Mori <valdzone@gmail.com> (Original C implementation: http://vald.x0.com/sb/)
|
84
|
+
|
85
|
+
## License
|
86
|
+
|
87
|
+
(MIT License) - Copyright (c) 2011 Ilya Grigorik
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "bloomfilter/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "bloomfilter-rb"
|
7
|
+
s.version = BloomFilter::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Ilya Grigorik", "Tatsuya Mori"]
|
10
|
+
s.email = ["ilya@igvita.com"]
|
11
|
+
s.homepage = "http://github.com/igrigorik/bloomfilter"
|
12
|
+
s.summary = "Counting Bloom Filter implemented in Ruby"
|
13
|
+
s.description = s.summary
|
14
|
+
s.rubyforge_project = "bloomfilter-rb"
|
15
|
+
|
16
|
+
s.add_dependency "redis", ">= 2.1.1"
|
17
|
+
s.add_development_dependency "rspec"
|
18
|
+
s.add_development_dependency "rake"
|
19
|
+
|
20
|
+
s.extensions = ["ext/cbloomfilter/extconf.rb"]
|
21
|
+
|
22
|
+
s.files = `git ls-files`.split("\n")
|
23
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
24
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
25
|
+
s.require_paths = ["lib"]
|
26
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#
|
2
|
+
# Pure ruby implementation of a Bloom filter, just for kicks
|
3
|
+
#
|
4
|
+
|
5
|
+
require 'bitset'
|
6
|
+
require 'zlib'
|
7
|
+
|
8
|
+
class BloomFilter
|
9
|
+
|
10
|
+
def initialize(max_entries, num_hashes, seed)
|
11
|
+
@num_hashes = num_hashes
|
12
|
+
@size = max_entries.to_i
|
13
|
+
@bitmap = BitSet.new(@size)
|
14
|
+
@__mask = BitSet.new(@size)
|
15
|
+
@seed = seed
|
16
|
+
end
|
17
|
+
|
18
|
+
def insert(key)
|
19
|
+
mask = make_mask(key)
|
20
|
+
@bitmap |= mask
|
21
|
+
end
|
22
|
+
|
23
|
+
def new?(key)
|
24
|
+
mask = make_mask(key)
|
25
|
+
return ((@bitmap & mask) != mask);
|
26
|
+
end
|
27
|
+
|
28
|
+
def make_mask(key)
|
29
|
+
@__mask.clear
|
30
|
+
0.upto(@num_hashes.to_i - 1) do |i|
|
31
|
+
hash = Zlib.crc32(key, i + @seed)
|
32
|
+
@__mask.set(hash % @size, 1)
|
33
|
+
end
|
34
|
+
return @__mask
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Reads lines from ARGF, counts how many of them the filter reports as not yet
# seen (inserting each such line), and prints the count.
def main
  bf = BloomFilter.new(1000000, 4, 0)
  num = 0
  while line = ARGF.gets
    # chomp removes only a trailing line terminator; chop would also drop the
    # last real character of a final line that has no newline.
    data = line.chomp

    # The class above defines #new? -- there is no #new_entry? method.
    if bf.new?(data)
      num += 1
      bf.insert(data)
    end
  end
  print "#element = #{num}\n"
end
|
51
|
+
|
52
|
+
main
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bloomfilter-rb'
|
3
|
+
|
4
|
+
WORDS = %w(duck penguin bear panda)
|
5
|
+
TEST = %w(penguin moose racooon)
|
6
|
+
|
7
|
+
bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
|
8
|
+
|
9
|
+
WORDS.each { |w| bf.insert(w) }
|
10
|
+
TEST.each do |w|
|
11
|
+
puts "#{w}: #{bf.include?(w)}"
|
12
|
+
end
|
13
|
+
|
14
|
+
bf.stats
|
15
|
+
|
16
|
+
# penguin: true
|
17
|
+
# moose: false
|
18
|
+
# racooon: false
|
19
|
+
#
|
20
|
+
# Number of filter buckets (m): 100
|
21
|
+
# Number of bits per buckets (b): 1
|
22
|
+
# Number of filter elements (n): 4
|
23
|
+
# Number of filter hashes (k) : 4
|
24
|
+
# Raise on overflow? (r) : false
|
25
|
+
# Predicted false positive rate = 0.05%
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'lib/bloomfilter-rb'
|
3
|
+
|
4
|
+
items = 1_000_00
|
5
|
+
bits = 1
|
6
|
+
|
7
|
+
# p BloomFilter::Redis.new(:size => items*bits, :hashes => 7) # 2.5 mb
|
8
|
+
# p BloomFilter::Redis.new(:size => items*bits*5, :hashes => 7) # 13 mb
|
9
|
+
# p BloomFilter::Redis.new(:size => items*bits*30, :hashes => 7) # 73 mb
|
10
|
+
|
11
|
+
# 1% error rate for 5M items/day, 10 bits per item, for 30 days of data: 358.52 mb
|
12
|
+
# 0.1% error rate for 5M items/day, 15 bits per item, for 30 days of data: 537.33 mb
|
13
|
+
|
14
|
+
bf = BloomFilter::Redis.new(:size => items*bits, :hashes => 7) # 2.5 mb
|
15
|
+
|
16
|
+
seen = Set.new
|
17
|
+
err = 0
|
18
|
+
num = 100000
|
19
|
+
|
20
|
+
num.times do
|
21
|
+
item = rand(items)
|
22
|
+
|
23
|
+
if bf.include?(item) != seen.include?(item)
|
24
|
+
err += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
seen << item
|
28
|
+
bf.insert(item)
|
29
|
+
end
|
30
|
+
|
31
|
+
p [:error_rate, (err.to_f / num) * 100]
|
@@ -0,0 +1,359 @@
|
|
1
|
+
/*
|
2
|
+
* cbloomfilter.c - simple Bloom Filter
|
3
|
+
* (c) Tatsuya Mori <valdzone@gmail.com>
|
4
|
+
*/
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "crc32.h"
|
8
|
+
|
9
|
+
#if !defined(RSTRING_LEN)
|
10
|
+
# define RSTRING_LEN(x) (RSTRING(x)->len)
|
11
|
+
# define RSTRING_PTR(x) (RSTRING(x)->ptr)
|
12
|
+
#endif
|
13
|
+
|
14
|
+
static VALUE cBloomFilter;
|
15
|
+
|
16
|
+
struct BloomFilter {
|
17
|
+
int m; /* # of buckets in a bloom filter */
|
18
|
+
int b; /* # of bits in a bloom filter bucket */
|
19
|
+
int k; /* # of hash functions */
|
20
|
+
int s; /* # seed of hash functions */
|
21
|
+
int r; /* # raise on bucket overflow? */
|
22
|
+
int num_set; /* # of set bits */
|
23
|
+
unsigned char *ptr; /* bits data */
|
24
|
+
int bytes; /* size of byte data */
|
25
|
+
};
|
26
|
+
|
27
|
+
void bits_free(struct BloomFilter *bf) {
|
28
|
+
ruby_xfree(bf->ptr);
|
29
|
+
}
|
30
|
+
|
31
|
+
void bucket_unset(struct BloomFilter *bf, int index) {
|
32
|
+
int byte_offset = (index * bf->b) / 8;
|
33
|
+
int bit_offset = (index * bf->b) % 8;
|
34
|
+
unsigned int c = bf->ptr[byte_offset];
|
35
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
36
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
37
|
+
if ((c & mask) == 0) {
|
38
|
+
// do nothing
|
39
|
+
} else {
|
40
|
+
bf->ptr[byte_offset] -= (1 << bit_offset) & ((1 << 8) - 1);
|
41
|
+
bf->ptr[byte_offset + 1] -= ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
42
|
+
}
|
43
|
+
|
44
|
+
}
|
45
|
+
|
46
|
+
void bucket_set(struct BloomFilter *bf, int index) {
|
47
|
+
int byte_offset = (index * bf->b) / 8;
|
48
|
+
int bit_offset = (index * bf->b) % 8;
|
49
|
+
unsigned int c = bf->ptr[byte_offset];
|
50
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
51
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
52
|
+
if ((c & mask) == mask) {
|
53
|
+
if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
|
54
|
+
} else {
|
55
|
+
bf->ptr[byte_offset] += (1 << bit_offset) & ((1 << 8) - 1);
|
56
|
+
bf->ptr[byte_offset + 1] += ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
int bucket_check(struct BloomFilter *bf, int index) {
|
61
|
+
int byte_offset = (index * bf->b) / 8;
|
62
|
+
int bit_offset = (index * bf->b) % 8;
|
63
|
+
unsigned int c = bf->ptr[byte_offset];
|
64
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
65
|
+
|
66
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
67
|
+
return (c & mask) >> bit_offset;
|
68
|
+
}
|
69
|
+
|
70
|
+
int bucket_get(struct BloomFilter *bf, int index) {
|
71
|
+
int byte_offset = (index * bf->b) / 8;
|
72
|
+
int bit_offset = (index * bf->b) % 8;
|
73
|
+
unsigned int c = bf->ptr[byte_offset];
|
74
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
75
|
+
|
76
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
77
|
+
return (c & mask) >> bit_offset;
|
78
|
+
}
|
79
|
+
|
80
|
+
static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
|
81
|
+
struct BloomFilter *bf;
|
82
|
+
VALUE arg1, arg2, arg3, arg4, arg5, obj;
|
83
|
+
int m, k, s, b, r, bytes;
|
84
|
+
|
85
|
+
obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
|
86
|
+
|
87
|
+
/* default = Fugou approach :-) */
|
88
|
+
arg1 = INT2FIX(100000000);
|
89
|
+
arg2 = INT2FIX(4);
|
90
|
+
arg3 = INT2FIX(0);
|
91
|
+
arg4 = INT2FIX(1);
|
92
|
+
arg5 = INT2FIX(0);
|
93
|
+
|
94
|
+
switch (argc) {
|
95
|
+
case 5:
|
96
|
+
if (argv[4] == Qtrue) {
|
97
|
+
arg5 = INT2FIX(1);
|
98
|
+
}
|
99
|
+
case 4:
|
100
|
+
arg4 = argv[3];
|
101
|
+
case 3:
|
102
|
+
arg3 = argv[2];
|
103
|
+
case 2:
|
104
|
+
arg2 = argv[1];
|
105
|
+
case 1:
|
106
|
+
arg1 = argv[0];
|
107
|
+
break;
|
108
|
+
}
|
109
|
+
|
110
|
+
m = FIX2INT(arg1);
|
111
|
+
k = FIX2INT(arg2);
|
112
|
+
s = FIX2INT(arg3);
|
113
|
+
b = FIX2INT(arg4);
|
114
|
+
r = FIX2INT(arg5);
|
115
|
+
|
116
|
+
if (b < 1 || b > 8)
|
117
|
+
rb_raise(rb_eArgError, "bucket size");
|
118
|
+
if (m < 1)
|
119
|
+
rb_raise(rb_eArgError, "array size");
|
120
|
+
if (k < 1)
|
121
|
+
rb_raise(rb_eArgError, "hash length");
|
122
|
+
if (s < 0)
|
123
|
+
rb_raise(rb_eArgError, "random seed");
|
124
|
+
|
125
|
+
bf->b = b;
|
126
|
+
bf->m = m;
|
127
|
+
bf->k = k;
|
128
|
+
bf->s = s;
|
129
|
+
bf->r = r;
|
130
|
+
bf->num_set = 0;
|
131
|
+
|
132
|
+
bf->bytes = ((m * b) + 15) / 8;
|
133
|
+
bf->ptr = ALLOC_N(unsigned char, bf->bytes);
|
134
|
+
|
135
|
+
/* initialize the bits with zeros */
|
136
|
+
memset(bf->ptr, 0, bf->bytes);
|
137
|
+
rb_iv_set(obj, "@hash_value", rb_hash_new());
|
138
|
+
|
139
|
+
return obj;
|
140
|
+
}
|
141
|
+
|
142
|
+
/*
 * CBloomFilter#clear -- zero every bucket and reset the element counter.
 * Always returns true.
 */
static VALUE bf_clear(VALUE self) {
    struct BloomFilter *bf;
    Data_Get_Struct(self, struct BloomFilter, bf);
    memset(bf->ptr, 0, bf->bytes);
    /* an emptied filter holds no elements; keep num_set in step */
    bf->num_set = 0;
    return Qtrue;
}
|
148
|
+
|
149
|
+
static VALUE bf_m(VALUE self) {
|
150
|
+
struct BloomFilter *bf;
|
151
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
152
|
+
return INT2FIX(bf->m);
|
153
|
+
}
|
154
|
+
|
155
|
+
static VALUE bf_k(VALUE self) {
|
156
|
+
struct BloomFilter *bf;
|
157
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
158
|
+
return INT2FIX(bf->k);
|
159
|
+
}
|
160
|
+
|
161
|
+
static VALUE bf_b(VALUE self) {
|
162
|
+
struct BloomFilter *bf;
|
163
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
164
|
+
return INT2FIX(bf->b);
|
165
|
+
}
|
166
|
+
|
167
|
+
static VALUE bf_r(VALUE self) {
|
168
|
+
struct BloomFilter *bf;
|
169
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
170
|
+
return bf->r == 0 ? Qfalse : Qtrue;
|
171
|
+
}
|
172
|
+
|
173
|
+
static VALUE bf_num_set(VALUE self) {
|
174
|
+
struct BloomFilter *bf;
|
175
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
176
|
+
return INT2FIX(bf->num_set);
|
177
|
+
}
|
178
|
+
|
179
|
+
static VALUE bf_insert(VALUE self, VALUE key) {
|
180
|
+
VALUE skey;
|
181
|
+
int index, seed;
|
182
|
+
int i, len, m, k, s;
|
183
|
+
char *ckey;
|
184
|
+
struct BloomFilter *bf;
|
185
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
186
|
+
|
187
|
+
skey = rb_obj_as_string(key);
|
188
|
+
ckey = StringValuePtr(skey);
|
189
|
+
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
190
|
+
|
191
|
+
m = bf->m;
|
192
|
+
k = bf->k;
|
193
|
+
s = bf->s;
|
194
|
+
|
195
|
+
for (i = 0; i <= k - 1; i++) {
|
196
|
+
/* seeds for hash functions */
|
197
|
+
seed = i + s;
|
198
|
+
|
199
|
+
/* hash */
|
200
|
+
index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
|
201
|
+
|
202
|
+
/* set a bit at the index */
|
203
|
+
bucket_set(bf, index);
|
204
|
+
}
|
205
|
+
|
206
|
+
bf->num_set += 1;
|
207
|
+
return Qnil;
|
208
|
+
}
|
209
|
+
|
210
|
+
/*
 * CBloomFilter#merge!(other) -- OR the raw bucket bytes of `other` into self.
 * Returns nil.
 *
 * Raises TypeError when `other` is not a CBloomFilter and ArgumentError when
 * the two filters differ in bucket count (m) or bucket width (b); without
 * these checks the loop below could read past the end of other's buffer.
 *
 * NOTE(review): the bytes are OR-ed, not added, so counters are not summed;
 * the hash count and seed of the two filters are not compared here.
 */
static VALUE bf_merge(VALUE self, VALUE other) {
    struct BloomFilter *bf, *target;
    int i;

    if (!RTEST(rb_obj_is_kind_of(other, cBloomFilter)))
        rb_raise(rb_eTypeError, "expected a CBloomFilter");

    Data_Get_Struct(self, struct BloomFilter, bf);
    Data_Get_Struct(other, struct BloomFilter, target);

    if (bf->m != target->m || bf->b != target->b)
        rb_raise(rb_eArgError, "cannot merge filters of different size");

    for (i = 0; i < bf->bytes; i++) {
        bf->ptr[i] |= target->ptr[i];
    }
    return Qnil;
}
|
220
|
+
|
221
|
+
/*
 * CBloomFilter#delete(key) -- decrement the k buckets that `key` hashes to
 * and lower the element counter. The key is converted with to_s first.
 * Returns nil.
 */
static VALUE bf_delete(VALUE self, VALUE key) {
    int index, seed;
    int i, len, m, k, s;
    char *ckey;
    VALUE skey;
    struct BloomFilter *bf;
    Data_Get_Struct(self, struct BloomFilter, bf);

    skey = rb_obj_as_string(key);
    ckey = StringValuePtr(skey);
    len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */

    m = bf->m;
    k = bf->k;
    s = bf->s;

    for (i = 0; i < k; i++) {
        /* seeds for hash functions */
        seed = i + s;

        /* hash */
        index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));

        /* decrement the bucket at the index */
        bucket_unset(bf, index);
    }

    /* one element fewer; never let the counter go negative */
    if (bf->num_set > 0)
        bf->num_set -= 1;
    return Qnil;
}
|
251
|
+
|
252
|
+
|
253
|
+
/*
 * CBloomFilter#include?(*keys) -- true only when every given key hashes to
 * k non-empty buckets; false as soon as one bucket of one key is empty.
 * Each key is converted with to_s first. With no keys the result is true.
 */
static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
    int index, seed;
    int i, len, m, k, s, tests_idx, vlen;
    char *ckey;
    VALUE tests, key, skey;
    struct BloomFilter *bf;

    rb_scan_args(argc, argv, "*", &tests);

    Data_Get_Struct(self, struct BloomFilter, bf);
    m = bf->m;
    k = bf->k;
    s = bf->s;

    vlen = (int) RARRAY_LEN(tests);
    for (tests_idx = 0; tests_idx < vlen; tests_idx++) {
        key = rb_ary_entry(tests, tests_idx);
        skey = rb_obj_as_string(key);
        ckey = StringValuePtr(skey);
        len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */

        for (i = 0; i < k; i++) {
            /* seeds for hash functions */
            seed = i + s;

            /* hash */
            index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));

            /* check the bucket at the index */
            if (!bucket_check(bf, index)) {
                return Qfalse; /* i.e., it is a new entry ; escape the loops */
            }
        }
    }

    /* reached only after every key passed every bucket check */
    return Qtrue;
}
|
291
|
+
|
292
|
+
static VALUE bf_to_s(VALUE self) {
|
293
|
+
struct BloomFilter *bf;
|
294
|
+
unsigned char *ptr;
|
295
|
+
int i;
|
296
|
+
VALUE str;
|
297
|
+
|
298
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
299
|
+
str = rb_str_new(0, bf->m);
|
300
|
+
|
301
|
+
ptr = (unsigned char *) RSTRING_PTR(str);
|
302
|
+
for (i = 0; i < bf->m; i++)
|
303
|
+
*ptr++ = bucket_get(bf, i) ? '1' : '0';
|
304
|
+
|
305
|
+
return str;
|
306
|
+
}
|
307
|
+
|
308
|
+
static VALUE bf_bitmap(VALUE self) {
|
309
|
+
struct BloomFilter *bf;
|
310
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
311
|
+
|
312
|
+
VALUE str = rb_str_new(0, bf->m);
|
313
|
+
unsigned char* ptr = (unsigned char *) RSTRING_PTR(str);
|
314
|
+
|
315
|
+
int i;
|
316
|
+
for (i = 0; i < bf->m; i++)
|
317
|
+
*ptr++ = bucket_get(bf, i);
|
318
|
+
|
319
|
+
return str;
|
320
|
+
}
|
321
|
+
|
322
|
+
/*
 * CBloomFilter#load(bitmap) -- for each of the m buckets, increment the
 * bucket once when the corresponding byte of `bitmap` is non-zero.
 * Returns nil.
 *
 * Raises TypeError when `bitmap` is not a String and ArgumentError when it
 * holds fewer than m bytes; without these checks the loop would read past
 * the end of the string.
 *
 * NOTE(review): a byte value above one still produces a single increment,
 * so counters larger than one are not restored -- confirm this is intended.
 */
static VALUE bf_load(VALUE self, VALUE bitmap) {
    struct BloomFilter *bf;
    unsigned char *ptr;
    int i;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(bitmap, T_STRING);
    if (RSTRING_LEN(bitmap) < bf->m)
        rb_raise(rb_eArgError, "bitmap is shorter than the filter");

    ptr = (unsigned char *) RSTRING_PTR(bitmap);
    for (i = 0; i < bf->m; i++) {
        if (*ptr++)
            bucket_set(bf, i);
    }

    return Qnil;
}
|
335
|
+
|
336
|
+
void Init_cbloomfilter(void) {
|
337
|
+
cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
|
338
|
+
rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
|
339
|
+
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
340
|
+
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
341
|
+
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
342
|
+
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
343
|
+
rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
|
344
|
+
rb_define_method(cBloomFilter, "insert", bf_insert, 1);
|
345
|
+
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
346
|
+
rb_define_method(cBloomFilter, "include?", bf_include, -1);
|
347
|
+
rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
348
|
+
rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
|
349
|
+
|
350
|
+
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
351
|
+
rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
|
352
|
+
rb_define_method(cBloomFilter, "load", bf_load, 1);
|
353
|
+
|
354
|
+
/* functions that have not been implemented, yet */
|
355
|
+
|
356
|
+
// rb_define_method(cBloomFilter, "&", bf_and, 1);
|
357
|
+
// rb_define_method(cBloomFilter, "|", bf_or, 1);
|
358
|
+
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
359
|
+
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
#include "crc32.h"
|
24
|
+
|
25
|
+
unsigned int crc32(unsigned int crc, char *buf, int len) {
|
26
|
+
while (len > 0) {
|
27
|
+
crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
|
28
|
+
--len;
|
29
|
+
++buf;
|
30
|
+
}
|
31
|
+
return crc;
|
32
|
+
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
static unsigned int crc_table[] = {
|
24
|
+
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
|
25
|
+
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
|
26
|
+
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
|
27
|
+
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
|
28
|
+
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
|
29
|
+
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
|
30
|
+
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
|
31
|
+
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
|
32
|
+
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
|
33
|
+
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
|
34
|
+
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
|
35
|
+
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
|
36
|
+
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
|
37
|
+
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
|
38
|
+
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
|
39
|
+
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
|
40
|
+
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
|
41
|
+
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
|
42
|
+
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
|
43
|
+
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
|
44
|
+
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
|
45
|
+
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
|
46
|
+
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
|
47
|
+
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
|
48
|
+
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
|
49
|
+
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
|
50
|
+
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
|
51
|
+
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
|
52
|
+
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
|
53
|
+
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
|
54
|
+
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
|
55
|
+
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
|
56
|
+
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
|
57
|
+
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
|
58
|
+
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
|
59
|
+
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
|
60
|
+
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
|
61
|
+
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
|
62
|
+
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
|
63
|
+
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
|
64
|
+
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
|
65
|
+
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
|
66
|
+
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
|
67
|
+
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
|
68
|
+
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
|
69
|
+
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
|
70
|
+
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
|
71
|
+
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
|
72
|
+
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
|
73
|
+
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
|
74
|
+
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
|
75
|
+
0x2d02ef8dUL
|
76
|
+
};
|
77
|
+
|
78
|
+
unsigned int crc32(unsigned int crc, char *buf, int len);
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module BloomFilter
|
2
|
+
class CountingRedis < Filter
|
3
|
+
|
4
|
+
def initialize(opts = {})
|
5
|
+
@opts = {
|
6
|
+
:size => 100,
|
7
|
+
:hashes => 4,
|
8
|
+
:seed => Time.now.to_i,
|
9
|
+
:bucket => 3,
|
10
|
+
:ttl => false,
|
11
|
+
:server => {}
|
12
|
+
}.merge opts
|
13
|
+
@db = ::Redis.new(@opts[:server])
|
14
|
+
end
|
15
|
+
|
16
|
+
def insert(key, ttl=nil)
|
17
|
+
ttl = @opts[:ttl] if ttl.nil?
|
18
|
+
|
19
|
+
indexes_for(key).each do |idx|
|
20
|
+
@db.incr idx
|
21
|
+
@db.expire(idx, ttl) if ttl
|
22
|
+
end
|
23
|
+
end
|
24
|
+
alias :[]= :insert
|
25
|
+
|
26
|
+
def delete(key)
|
27
|
+
indexes_for(key).each do |idx|
|
28
|
+
if @db.decr(idx).to_i <= 0
|
29
|
+
@db.del(idx)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def include?(*keys)
|
35
|
+
indexes = keys.collect { |key| indexes_for(key) }
|
36
|
+
not @db.mget(*indexes.flatten).include? nil
|
37
|
+
end
|
38
|
+
alias :key? :include?
|
39
|
+
|
40
|
+
def num_set
|
41
|
+
@db.keys("rbloom:*").size
|
42
|
+
end
|
43
|
+
alias :size :num_set
|
44
|
+
|
45
|
+
def clear
|
46
|
+
@db.flushdb
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# compute index offsets for provided key
|
52
|
+
def indexes_for(key)
|
53
|
+
indexes = []
|
54
|
+
@opts[:hashes].times do |i|
|
55
|
+
indexes.push "rbloom:" + (Zlib.crc32("#{key}:#{i+@opts[:seed]}") % @opts[:size]).to_s
|
56
|
+
end
|
57
|
+
|
58
|
+
indexes
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module BloomFilter
  # Common base for the filter implementations. #stats reads the @opts hash
  # (keys :size, :hashes, :bucket, :raise) and calls #size, both of which the
  # including subclass has to provide.
  class Filter
    # Prints the filter parameters and the predicted false positive rate,
    # (1 - e^(-k*n/m))^k expressed as a percentage, to standard output.
    def stats
      fp = ((1.0 - Math.exp(-(@opts[:hashes] * size).to_f / @opts[:size])) ** @opts[:hashes]) * 100

      # Format each line in a single pass; a literal percent sign is spelled
      # %% so it does not depend on how a trailing lone % is parsed.
      printf("Number of filter buckets (m): %d\n", @opts[:size])
      printf("Number of bits per buckets (b): %d\n", @opts[:bucket])
      printf("Number of filter elements (n): %d\n", size)
      printf("Number of filter hashes (k) : %d\n", @opts[:hashes])
      printf("Raise on overflow? (r) : %s\n", @opts[:raise].to_s)
      printf("Predicted false positive rate = %.2f%%\n", fp)
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module BloomFilter
  # In-memory counting filter that delegates to the CBloomFilter C extension.
  # Instances can be marshalled, and written to / read from a file.
  class Native < Filter
    # The underlying CBloomFilter instance.
    attr_reader :bf

    # opts (all optional): :size, :hashes, :seed, :bucket, :raise.
    def initialize(opts = {})
      @opts = {
        :size => 100,
        :hashes => 4,
        :seed => Time.now.to_i,
        :bucket => 3,
        :raise => false
      }.merge(opts)

      # arg 1: m => size : number of buckets in a bloom filter
      # arg 2: k => hashes : number of hash functions
      # arg 3: s => seed : seed of hash functions
      # arg 4: b => bucket : number of bits in a bloom filter bucket
      # arg 5: r => raise : raise on bucket overflow?

      @bf = CBloomFilter.new(@opts[:size], @opts[:hashes], @opts[:seed], @opts[:bucket], @opts[:raise])
    end

    def insert(key)
      @bf.insert(key)
    end

    # Hash-style insertion: bf[key] = anything. Ruby passes both the key and
    # the assigned value to []=, so it cannot be a plain alias of the
    # one-argument #insert; the value itself is ignored.
    def []=(key, _value)
      insert(key)
    end

    def include?(*keys)
      @bf.include?(*keys)
    end
    alias :key? :include?
    alias :[] :include?

    def delete(key); @bf.delete(key); end
    def clear; @bf.clear; end
    def size; @bf.num_set; end
    def merge!(o); @bf.merge!(o.bf); end

    def bitmap
      @bf.bitmap
    end

    # Marshal hook: rebuild the filter from the [opts, bitmap] pair produced
    # by #marshal_dump. Re-running initialize restores both @opts and a real
    # CBloomFilter in @bf, so the restored object behaves like a fresh one.
    def marshal_load(ary)
      opts, bitmap = *ary

      initialize(opts)
      @bf.load(bitmap) unless bitmap.nil?
    end

    # Marshal hook: the options hash plus the per-bucket bitmap.
    def marshal_dump
      [@opts, @bf.bitmap]
    end

    # Reads a filter previously written by #save. The file is opened in
    # binary mode and closed when the block exits.
    # NOTE: Marshal.load must only be used on files from a trusted source.
    def self.load(filename)
      File.open(filename, 'rb') { |f| Marshal.load(f) }
    end

    # Writes the marshalled filter to filename (binary mode).
    def save(filename)
      File.open(filename, 'wb') do |f|
        f << Marshal.dump(self)
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module BloomFilter
  # Non-counting filter stored as a single Redis string, addressed with
  # SETBIT/GETBIT under the key given by the :namespace option.
  class Redis < Filter

    # opts (all optional): :size, :hashes, :seed, :namespace, :eager, :server.
    def initialize(opts = {})
      @opts = {
        :size => 100,
        :hashes => 4,
        :seed => Time.now.to_i,
        :namespace => 'redis',
        :eager => true,
        :server => {}
      }.merge opts
      @db = ::Redis.new(@opts[:server])

      if @opts[:eager]
        # allocate the memory immediately
        @db.setbit @opts[:namespace], @opts[:size], 1
        @db.setbit @opts[:namespace], @opts[:size], 0
      end
    end

    # Sets the bit for every index of key. The ttl argument is accepted but
    # not used by this implementation.
    def insert(key, ttl=nil)
      indexes_for(key) { |idx| @db.setbit @opts[:namespace], idx, 1 }
    end
    alias :[]= :insert

    # True only when every bit of every given key is set.
    def include?(*keys)
      keys.each do |key|
        indexes_for(key) do |idx|
          return false if @db.getbit(@opts[:namespace], idx).zero?
        end
      end

      true
    end
    alias :key? :include?

    # Clears the bits of key. The bits are not counted, so this also clears
    # them for any other key that maps to the same indexes.
    def delete(key)
      indexes_for(key) do |idx|
        @db.setbit @opts[:namespace], idx, 0
      end
    end

    # Removes the bit string altogether. Writing the value 0 instead would
    # store the one-character string "0" (byte 0x30), leaving two bits set.
    def clear
      @db.del @opts[:namespace]
    end

    # NOTE(review): this is STRLEN of the bit string, i.e. its length in
    # bytes, not a count of inserted elements -- confirm callers expect that.
    def num_set
      @db.strlen @opts[:namespace]
    end
    alias :size :num_set

    def stats
      printf "Number of filter buckets (m): %d\n" % @opts[:size]
      printf "Number of filter hashes (k) : %d\n" % @opts[:hashes]
    end

    private

    # yields each bit index computed for the provided key
    def indexes_for(key)
      @opts[:hashes].times do |i|
        yield Zlib.crc32("#{key}:#{i+@opts[:seed]}") % @opts[:size]
      end
    end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Specs for the Redis-backed counting filter. These talk to a Redis server
# created with the filter's default connection options.
describe BloomFilter::CountingRedis do
  include BloomFilter

  # The class is referenced by its full name inside the examples: constants in
  # blocks are resolved lexically, so a bare CountingRedis does not reliably
  # pick up the module included above.
  context "use Redis for storage" do
    it "should store data in Redis" do
      bf = BloomFilter::CountingRedis.new

      bf.insert(:abcd)
      bf.insert('test')
      bf.include?('test').should be_true
      bf.key?('test').should be_true

      bf.include?('test', 'test2').should be_false
      bf.include?('test', 'abcd').should be_true
    end

    it "should accept a TTL value for a key" do
      bf = BloomFilter::CountingRedis.new(:ttl => 1)

      bf.insert('test')
      bf.include?('test').should be_true

      # wait past the 1 second TTL
      sleep(2)
      bf.include?('test').should be_false
    end

    it "should delete keys from Redis" do
      bf = BloomFilter::CountingRedis.new

      bf.insert('test')
      bf.include?('test').should be_true

      bf.delete('test')
      bf.include?('test').should be_false
    end

    it "should output current stats" do
      bf = BloomFilter::CountingRedis.new
      bf.clear

      bf.insert('test')
      bf.size.should == 4
      lambda { bf.stats }.should_not raise_error
    end

    it "should connect to remote redis server" do
      lambda { BloomFilter::CountingRedis.new }.should_not raise_error
    end
  end
end
|
data/spec/helper.rb
ADDED
data/spec/native_spec.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Specs for the native (C extension) filter.
describe BloomFilter::Native do
  include BloomFilter

  # The class is referenced by its full name inside the examples: constants in
  # blocks are resolved lexically, so a bare Native does not reliably pick up
  # the module included above.

  it "should clear" do
    bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf.insert("test")
    bf.include?("test").should be_true
    bf.clear
    bf.include?("test").should be_false
  end

  it "should merge" do
    bf1 = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf2 = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
    bf2.insert("test")
    bf1.include?("test").should be_false
    bf1.merge!(bf2)
    bf1.include?("test").should be_true
    bf2.include?("test").should be_true
  end

  context "behave like a bloomfilter" do
    it "should test set membership" do
      bf = BloomFilter::Native.new(:size => 100, :hashes => 2, :seed => 1, :bucket => 3, :raise => false)
      bf.insert("test")
      bf.insert("test1")

      bf.include?("test").should be_true
      bf.include?("abcd").should be_false
      bf.include?("test", "test1").should be_true
    end

    it "should work with any object's to_s" do
      bf = BloomFilter::Native.new
      bf.insert(:test)
      bf.insert(:test1)
      bf.insert(12345)

      bf.include?("test").should be_true
      bf.include?("abcd").should be_false
      bf.include?("test", "test1", '12345').should be_true
    end
  end

  context "behave like counting bloom filter" do
    it "should delete / decrement keys" do
      bf = BloomFilter::Native.new

      bf.insert("test")
      bf.include?("test").should be_true

      bf.delete("test")
      bf.include?("test").should be_false
    end
  end

  context "serialize" do
    # Only remove the dump if the example got far enough to write it, so a
    # failing save is not masked by a second error from the cleanup hook.
    after(:each) { File.unlink('bf.out') if File.exist?('bf.out') }

    it "should marshall the bloomfilter" do
      bf = BloomFilter::Native.new
      lambda { bf.save('bf.out') }.should_not raise_error
    end

    it "should load marshalled bloomfilter" do
      bf = BloomFilter::Native.new
      bf.insert('foo')
      bf.insert('bar')
      bf.save('bf.out')

      bf = BloomFilter::Native.load('bf.out')
      bf.include?('foo').should be_true
      bf.include?('bar').should be_true
      bf.include?('baz').should be_false
    end
  end
end
|
data/spec/redis_spec.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Specs for the Redis bit-string filter. These talk to a Redis server created
# with the filter's default connection options.
describe BloomFilter::Redis do
  include BloomFilter

  context "use Redis bitstring for storage" do
    # Fully qualified on purpose: a bare Redis inside these blocks can resolve
    # to the redis client library's top-level ::Redis class rather than the
    # filter under test.
    let(:bf) { BloomFilter::Redis.new }

    it "should store data in Redis" do
      bf.insert(:abcd)
      bf.insert('test')
      bf.include?('test').should be_true
      bf.key?('test').should be_true

      bf.include?('test', 'test2').should be_false
      bf.include?('test', 'abcd').should be_true
    end

    it "should delete keys from Redis" do
      bf.insert('test')
      bf.include?('test').should be_true

      bf.delete('test')
      bf.include?('test').should be_false
    end

    it "should clear Redis filter" do
      bf.insert('test')
      bf.include?('test').should be_true

      bf.clear
      bf.include?('test').should be_false
    end

    it "should output current stats" do
      bf.clear
      bf.insert('test')
      lambda { bf.stats }.should_not raise_error
    end

    it "should connect to remote redis server" do
      lambda { BloomFilter::Redis.new }.should_not raise_error
    end

    it "should allow namespaced BloomFilters" do
      bf1 = BloomFilter::Redis.new(:namespace => :a)
      bf2 = BloomFilter::Redis.new(:namespace => :b)

      bf1.insert('test')
      bf1.include?('test').should be_true
      bf2.include?('test').should be_false
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bloomfilter-rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 2
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
version: 2.0.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Ilya Grigorik
|
13
|
+
- Tatsuya Mori
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: redis
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
segments:
|
30
|
+
- 2
|
31
|
+
- 1
|
32
|
+
- 1
|
33
|
+
version: 2.1.1
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rake
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
type: :development
|
61
|
+
version_requirements: *id003
|
62
|
+
description: Counting Bloom Filter implemented in Ruby
|
63
|
+
email:
|
64
|
+
- ilya@igvita.com
|
65
|
+
executables: []
|
66
|
+
|
67
|
+
extensions:
|
68
|
+
- ext/cbloomfilter/extconf.rb
|
69
|
+
extra_rdoc_files: []
|
70
|
+
|
71
|
+
files:
|
72
|
+
- .gitignore
|
73
|
+
- .rspec
|
74
|
+
- Gemfile
|
75
|
+
- Gemfile.lock
|
76
|
+
- README.md
|
77
|
+
- Rakefile
|
78
|
+
- bloomfilter-rb.gemspec
|
79
|
+
- examples/counting-redis.rb
|
80
|
+
- examples/pure-ruby-bf.rb
|
81
|
+
- examples/simple-native.rb
|
82
|
+
- examples/simple-redis.rb
|
83
|
+
- ext/cbloomfilter/cbloomfilter.c
|
84
|
+
- ext/cbloomfilter/crc32.c
|
85
|
+
- ext/cbloomfilter/crc32.h
|
86
|
+
- ext/cbloomfilter/extconf.rb
|
87
|
+
- lib/bloomfilter-rb.rb
|
88
|
+
- lib/bloomfilter/counting_redis.rb
|
89
|
+
- lib/bloomfilter/filter.rb
|
90
|
+
- lib/bloomfilter/native.rb
|
91
|
+
- lib/bloomfilter/redis.rb
|
92
|
+
- lib/bloomfilter/version.rb
|
93
|
+
- spec/counting_redis_spec.rb
|
94
|
+
- spec/helper.rb
|
95
|
+
- spec/native_spec.rb
|
96
|
+
- spec/redis_spec.rb
|
97
|
+
has_rdoc: true
|
98
|
+
homepage: http://github.com/igrigorik/bloomfilter
|
99
|
+
licenses: []
|
100
|
+
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project: bloomfilter-rb
|
125
|
+
rubygems_version: 1.3.7
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: Counting Bloom Filter implemented in Ruby
|
129
|
+
test_files:
|
130
|
+
- spec/counting_redis_spec.rb
|
131
|
+
- spec/helper.rb
|
132
|
+
- spec/native_spec.rb
|
133
|
+
- spec/redis_spec.rb
|