igrigorik-bloomfilter 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +58 -0
- data/Rakefile +64 -0
- data/examples/bf.rb +54 -0
- data/examples/simple.rb +26 -0
- data/ext/crc32.c +32 -0
- data/ext/crc32.h +78 -0
- data/ext/extconf.rb +4 -0
- data/ext/sbloomfilter.c +312 -0
- data/lib/bloomfilter.rb +31 -0
- data/test/helper.rb +2 -0
- data/test/test_bloom_filter.rb +44 -0
- metadata +64 -0
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
= BloomFilter
|
2
|
+
|
3
|
+
Counting Bloom Filter implemented in Ruby.
|
4
|
+
|
5
|
+
A Bloom filter is a space-efficient probabilistic data structure that is used to
|
6
|
+
test whether an element is a member of a set. False positives are possible, but
|
7
|
+
false negatives are not. For more detail: http://en.wikipedia.org/wiki/Bloom_filter
|
8
|
+
|
9
|
+
== Implementation
|
10
|
+
|
11
|
+
Instead of using k different hash functions, this implementation seeds the CRC32 hash
|
12
|
+
with k different initial values (0, 1, ..., k-1). This may or may not give you a good
|
13
|
+
distribution; it all depends on the data.
|
14
|
+
|
15
|
+
== Example
|
16
|
+
|
17
|
+
require 'bloomfilter'
|
18
|
+
|
19
|
+
# M (size of bit array)
|
20
|
+
# K (number of hash functions)
|
21
|
+
# R (random seed), e.g. M = 100000000, K = 4, R = 1
|
22
|
+
|
23
|
+
# M, K, R
|
24
|
+
bf = BloomFilter.new(10, 2, 1)
|
25
|
+
bf.insert("test")
|
26
|
+
bf.include?("test")
|
27
|
+
=> true
|
28
|
+
bf.include?("test2")
|
29
|
+
=> false
|
30
|
+
bf.delete("test")
|
31
|
+
bf.include?("test")
|
32
|
+
=> false
|
33
|
+
|
34
|
+
# Hash with a bloom filter!
|
35
|
+
bf["test2"] = "bar"
|
36
|
+
bf["test2"]
|
37
|
+
=> "bar"
|
38
|
+
bf["test3"]
|
39
|
+
=> nil
|
40
|
+
|
41
|
+
bf.stats
|
42
|
+
Number of filter bits (m): 10
|
43
|
+
Number of filter elements (n): 2
|
44
|
+
Number of filter hashes (k) : 2
|
45
|
+
Predicted false positive rate = 10.87%
|
46
|
+
|
47
|
+
|
48
|
+
== Configuring Bloom Filter
|
49
|
+
|
50
|
+
Performance of the Bloom filter depends on a number of variables:
|
51
|
+
- size of the bit array
|
52
|
+
- number of hash functions
|
53
|
+
|
54
|
+
To figure out the values for these parameters, refer to:
|
55
|
+
http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/
|
56
|
+
|
57
|
+
== Credits
|
58
|
+
Tatsuya Mori <valdzone@gmail.com> (Original: http://vald.x0.com/sb/)
|
data/Rakefile
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/clean'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
require 'rake/testtask'
|
6
|
+
require 'fileutils'
|
7
|
+
include FileUtils
|
8
|
+
|
9
|
+
# Default Rake task is compile
|
10
|
+
task :default => :compile
|
11
|
+
|
12
|
+
def make(makedir)
|
13
|
+
Dir.chdir(makedir) { sh 'make' }
|
14
|
+
end
|
15
|
+
|
16
|
+
def extconf(dir)
|
17
|
+
Dir.chdir(dir) { ruby "extconf.rb" }
|
18
|
+
end
|
19
|
+
|
20
|
+
# Defines the Rake tasks that configure, build and install one native extension.
#
# dir       - subdirectory of ext/ that holds the sources ("" for ext/ itself)
# extension - base name of the shared library; also used as the task name
def setup_extension(dir, extension)
  ext = "ext/#{dir}"
  # RbConfig is the supported name of the build configuration module.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    "lib"
  ]

  # Declared as a directory task so that lib/ is actually created when it is
  # missing; the shared library is copied into it below.
  directory "lib"

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # The Makefile is regenerated whenever extconf.rb changes.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    extconf ext
  end

  # Compile the extension and install the result next to the Ruby sources.
  file ext_so => ext_files do
    make ext
    cp ext_so, "lib"
  end
end
|
47
|
+
|
48
|
+
setup_extension("", "sbloomfilter")
|
49
|
+
|
50
|
+
task :compile => [:sbloomfilter]
|
51
|
+
|
52
|
+
CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', '**/*.log', 'pkg']
|
53
|
+
CLEAN.include ['ext/Makefile']
|
54
|
+
|
55
|
+
Rake::TestTask.new do |t|
|
56
|
+
%w[ ext lib test ].each do |dir|
|
57
|
+
t.libs << dir
|
58
|
+
end
|
59
|
+
|
60
|
+
t.test_files = FileList['test/test_*.rb']
|
61
|
+
t.verbose = true
|
62
|
+
end
|
63
|
+
Rake::Task[:test].prerequisites << :compile
|
64
|
+
|
data/examples/bf.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bitset'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Pure ruby implementation of a Bloom filter, just for kicks
|
7
|
+
#
|
8
|
+
|
9
|
+
class BloomFilter
|
10
|
+
|
11
|
+
def initialize(max_entries, num_hashes, seed)
|
12
|
+
@num_hashes = num_hashes
|
13
|
+
@size = max_entries.to_i
|
14
|
+
@bitmap = BitSet.new(@size)
|
15
|
+
@__mask = BitSet.new(@size)
|
16
|
+
@seed = seed
|
17
|
+
end
|
18
|
+
|
19
|
+
def insert(key)
|
20
|
+
mask = make_mask(key)
|
21
|
+
@bitmap |= mask
|
22
|
+
end
|
23
|
+
|
24
|
+
def new?(key)
|
25
|
+
mask = make_mask(key)
|
26
|
+
return ((@bitmap & mask) != mask);
|
27
|
+
end
|
28
|
+
|
29
|
+
def make_mask(key)
|
30
|
+
@__mask.clear
|
31
|
+
0.upto(@num_hashes.to_i - 1) do |i|
|
32
|
+
hash = Zlib.crc32(key, i + @seed)
|
33
|
+
@__mask.set(hash % @size, 1)
|
34
|
+
end
|
35
|
+
return @__mask
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
# Counts the distinct lines read from ARGF (the files named on the command
# line, or standard input) and prints the total.
#
# A line is counted the first time the filter reports it as new; because the
# filter is probabilistic, the printed number is an estimate.
def main
  bf = BloomFilter.new(1000000, 4, 0)
  num = 0
  ARGF.each_line do |line|
    # chomp strips only a trailing line terminator, so a final line without
    # a newline keeps its last character.
    data = line.chomp

    # BloomFilter exposes the membership test as new?.
    if bf.new?(data)
      num += 1
      bf.insert(data)
    end
  end
  print "#element = #{num}\n"
end
|
53
|
+
|
54
|
+
main
|
data/examples/simple.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bloomfilter'
|
3
|
+
|
4
|
+
WORDS = %w(duck penguin bear panda)
|
5
|
+
TEST = %w(penguin moose racooon)
|
6
|
+
|
7
|
+
# m = 100, k = 4, seed = 1
|
8
|
+
bf = BloomFilter.new(100, 4, 1)
|
9
|
+
|
10
|
+
WORDS.each { |w| bf.insert(w) }
|
11
|
+
TEST.each do |w|
|
12
|
+
puts "#{w}: #{bf.include?(w)}"
|
13
|
+
end
|
14
|
+
|
15
|
+
bf.stats
|
16
|
+
|
17
|
+
# penguin: true
|
18
|
+
# moose: false
|
19
|
+
# racooon: false
|
20
|
+
#
|
21
|
+
# Number of filter buckets (m): 100
|
22
|
+
# Number of bits per buckets (b): 1
|
23
|
+
# Number of filter elements (n): 4
|
24
|
+
# Number of filter hashes (k) : 4
|
25
|
+
# Raise on overflow? (r) : false
|
26
|
+
# Predicted false positive rate = 0.05%
|
data/ext/crc32.c
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
#include "crc32.h"
|
24
|
+
|
25
|
+
unsigned int crc32(unsigned int crc, char *buf, int len) {
|
26
|
+
while (len > 0) {
|
27
|
+
crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
|
28
|
+
--len;
|
29
|
+
++buf;
|
30
|
+
}
|
31
|
+
return crc;
|
32
|
+
}
|
data/ext/crc32.h
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
static unsigned int crc_table[] = {
|
24
|
+
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
|
25
|
+
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
|
26
|
+
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
|
27
|
+
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
|
28
|
+
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
|
29
|
+
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
|
30
|
+
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
|
31
|
+
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
|
32
|
+
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
|
33
|
+
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
|
34
|
+
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
|
35
|
+
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
|
36
|
+
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
|
37
|
+
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
|
38
|
+
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
|
39
|
+
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
|
40
|
+
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
|
41
|
+
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
|
42
|
+
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
|
43
|
+
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
|
44
|
+
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
|
45
|
+
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
|
46
|
+
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
|
47
|
+
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
|
48
|
+
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
|
49
|
+
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
|
50
|
+
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
|
51
|
+
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
|
52
|
+
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
|
53
|
+
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
|
54
|
+
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
|
55
|
+
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
|
56
|
+
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
|
57
|
+
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
|
58
|
+
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
|
59
|
+
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
|
60
|
+
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
|
61
|
+
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
|
62
|
+
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
|
63
|
+
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
|
64
|
+
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
|
65
|
+
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
|
66
|
+
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
|
67
|
+
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
|
68
|
+
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
|
69
|
+
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
|
70
|
+
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
|
71
|
+
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
|
72
|
+
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
|
73
|
+
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
|
74
|
+
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
|
75
|
+
0x2d02ef8dUL
|
76
|
+
};
|
77
|
+
|
78
|
+
unsigned int crc32(unsigned int crc, char *buf, int len);
|
data/ext/extconf.rb
ADDED
data/ext/sbloomfilter.c
ADDED
@@ -0,0 +1,312 @@
|
|
1
|
+
/*
|
2
|
+
* sbloomfilter.c - simple Bloom Filter
|
3
|
+
* (c) Tatsuya Mori <valdzone@gmail.com>
|
4
|
+
*/
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "crc32.h"
|
8
|
+
|
9
|
+
static VALUE cBloomFilter;
|
10
|
+
|
11
|
+
struct BloomFilter {
|
12
|
+
int m; /* # of buckets in a bloom filter */
|
13
|
+
int b; /* # of bits in a bloom filter bucket */
|
14
|
+
int k; /* # of hash functions */
|
15
|
+
int s; /* # seed of hash functions */
|
16
|
+
int r; /* # raise on bucket overflow? */
|
17
|
+
int num_set; /* # of set bits */
|
18
|
+
unsigned char *ptr; /* bits data */
|
19
|
+
};
|
20
|
+
|
21
|
+
void bits_free(struct BloomFilter *bf) {
|
22
|
+
ruby_xfree(bf->ptr);
|
23
|
+
}
|
24
|
+
|
25
|
+
|
26
|
+
void bucket_unset(struct BloomFilter *bf, int index) {
|
27
|
+
int byte_offset = (index * bf->b) / 8;
|
28
|
+
int bit_offset = (index * bf->b) % 8;
|
29
|
+
unsigned int c = bf->ptr[byte_offset];
|
30
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
31
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
32
|
+
if ((c & mask) == 0) {
|
33
|
+
// do nothing
|
34
|
+
} else {
|
35
|
+
bf->ptr[byte_offset] -= (1 << bit_offset) & ((1 << 8) - 1);
|
36
|
+
bf->ptr[byte_offset + 1] -= ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
37
|
+
}
|
38
|
+
|
39
|
+
}
|
40
|
+
|
41
|
+
void bucket_set(struct BloomFilter *bf, int index) {
|
42
|
+
int byte_offset = (index * bf->b) / 8;
|
43
|
+
int bit_offset = (index * bf->b) % 8;
|
44
|
+
unsigned int c = bf->ptr[byte_offset];
|
45
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
46
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
47
|
+
if ((c & mask) == mask) {
|
48
|
+
if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
|
49
|
+
} else {
|
50
|
+
bf->ptr[byte_offset] += (1 << bit_offset) & ((1 << 8) - 1);
|
51
|
+
bf->ptr[byte_offset + 1] += ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
52
|
+
}
|
53
|
+
|
54
|
+
}
|
55
|
+
|
56
|
+
int bucket_check(struct BloomFilter *bf, int index) {
|
57
|
+
int byte_offset = (index * bf->b) / 8;
|
58
|
+
int bit_offset = (index * bf->b) % 8;
|
59
|
+
unsigned int c = bf->ptr[byte_offset];
|
60
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
61
|
+
|
62
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
63
|
+
return (c & mask) >> bit_offset;
|
64
|
+
}
|
65
|
+
|
66
|
+
int bucket_get(struct BloomFilter *bf, int index) {
|
67
|
+
int byte_offset = (index * bf->b) / 8;
|
68
|
+
int bit_offset = (index * bf->b) % 8;
|
69
|
+
unsigned int c = bf->ptr[byte_offset];
|
70
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
71
|
+
|
72
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
73
|
+
return (c & mask) >> bit_offset;
|
74
|
+
}
|
75
|
+
|
76
|
+
/*
 * BloomFilter.new(m = 100000000, k = 4, s = 0, b = 1, r = false)
 *
 *   m - number of buckets
 *   k - number of hash functions
 *   s - seed of the first hash function
 *   b - bits per bucket (1..8)
 *   r - raise when a bucket overflows?
 *
 * Between one and five arguments override the defaults from the left; any
 * other argument count uses the defaults for every parameter.
 * Raises ArgumentError when b, m, k or s is out of range.
 */
static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
    struct BloomFilter *bf;
    VALUE args[5], obj;
    int i, m, k, s, b, r, bytes;

    args[0] = INT2FIX(100000000);
    args[1] = INT2FIX(4);
    args[2] = INT2FIX(0);
    args[3] = INT2FIX(1);
    args[4] = Qfalse;

    if (argc >= 1 && argc <= 5) {
        for (i = 0; i < argc; i++)
            args[i] = argv[i];
    }

    /* NUM2INT type-checks its argument instead of reinterpreting its bits */
    m = NUM2INT(args[0]);
    k = NUM2INT(args[1]);
    s = NUM2INT(args[2]);
    b = NUM2INT(args[3]);

    /* the flag may be a boolean or, for older callers, an integer (0 = off) */
    if (FIXNUM_P(args[4]))
        r = (FIX2LONG(args[4]) != 0) ? 1 : 0;
    else
        r = RTEST(args[4]) ? 1 : 0;

    if (b < 1 || b > 8)
        rb_raise(rb_eArgError, "bucket size");
    if (m < 1)
        rb_raise(rb_eArgError, "array size");
    if (k < 1)
        rb_raise(rb_eArgError, "hash length");
    if (s < 0)
        rb_raise(rb_eArgError, "random seed");

    /* allocate only after validation so a rejected call leaves nothing behind */
    obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);

    bf->b = b;
    bf->m = m;
    bf->k = k;
    bf->s = s;
    bf->r = r;
    bf->num_set = 0;

    /* one spare byte: the bucket helpers always read byte_offset + 1 */
    bytes = ((m * b) + 15) / 8;
    bf->ptr = ALLOC_N(unsigned char, bytes);

    /* initialize the bits with zeros */
    memset(bf->ptr, 0, bytes);
    rb_iv_set(obj, "@hash_value", rb_hash_new());

    return obj;
}
|
152
|
+
|
153
|
+
static VALUE bf_m(VALUE self) {
|
154
|
+
struct BloomFilter *bf;
|
155
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
156
|
+
return INT2FIX(bf->m);
|
157
|
+
}
|
158
|
+
|
159
|
+
static VALUE bf_k(VALUE self) {
|
160
|
+
struct BloomFilter *bf;
|
161
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
162
|
+
return INT2FIX(bf->k);
|
163
|
+
}
|
164
|
+
|
165
|
+
static VALUE bf_b(VALUE self) {
|
166
|
+
struct BloomFilter *bf;
|
167
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
168
|
+
return INT2FIX(bf->b);
|
169
|
+
}
|
170
|
+
|
171
|
+
static VALUE bf_r(VALUE self) {
|
172
|
+
struct BloomFilter *bf;
|
173
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
174
|
+
return bf->r == 0 ? Qfalse : Qtrue;
|
175
|
+
}
|
176
|
+
|
177
|
+
static VALUE bf_num_set(VALUE self) {
|
178
|
+
struct BloomFilter *bf;
|
179
|
+
Data_Get_Struct(self, struct BloomFilter, bf);
|
180
|
+
return INT2FIX(bf->num_set);
|
181
|
+
}
|
182
|
+
|
183
|
+
/*
 * BloomFilter#insert(key) -> nil
 *
 * Hashes the String key with CRC32 under the seeds s, s+1, ..., s+k-1,
 * increments the bucket selected by each hash, and then increments the
 * element counter. Raises TypeError (via Check_Type) when key is not a
 * String.
 */
static VALUE bf_insert(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    /* RSTRING_PTR / RSTRING_LEN are the supported accessors for the string
     * buffer and its length in bytes */
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key);

    for (i = 0; i < bf->k; i++) {
        /* hash with the i-th seed and map the result onto a bucket */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* increment the bucket at the index */
        bucket_set(bf, index);
    }

    bf->num_set += 1;
    return Qnil;
}
|
213
|
+
|
214
|
+
/*
 * BloomFilter#delete(key) -> nil
 *
 * Hashes the String key with CRC32 under the seeds s, s+1, ..., s+k-1,
 * decrements the bucket selected by each hash, and then decrements the
 * element counter (never below zero). Raises TypeError (via Check_Type)
 * when key is not a String.
 *
 * The buckets are decremented unconditionally, so deleting a key that was
 * never inserted also lowers buckets shared with other keys.
 */
static VALUE bf_delete(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    /* RSTRING_PTR / RSTRING_LEN are the supported accessors for the string
     * buffer and its length in bytes */
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key);

    for (i = 0; i < bf->k; i++) {
        /* hash with the i-th seed and map the result onto a bucket */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* decrement the bucket at the index */
        bucket_unset(bf, index);
    }

    /* one element fewer; num_set feeds the false positive estimate */
    if (bf->num_set > 0)
        bf->num_set -= 1;
    return Qnil;
}
|
244
|
+
|
245
|
+
|
246
|
+
/*
 * BloomFilter#include?(key) -> true or false
 *
 * Hashes the String key with CRC32 under the seeds s, s+1, ..., s+k-1 and
 * returns false as soon as one of the selected buckets is zero; returns
 * true when all k buckets are non-zero. Raises TypeError (via Check_Type)
 * when key is not a String.
 */
static VALUE bf_include(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    /* RSTRING_PTR / RSTRING_LEN are the supported accessors for the string
     * buffer and its length in bytes */
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key);

    for (i = 0; i < bf->k; i++) {
        /* hash with the i-th seed and map the result onto a bucket */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* an empty bucket proves the key was not inserted; stop early */
        if (!bucket_check(bf, index))
            return Qfalse;
    }

    return Qtrue;
}
|
276
|
+
|
277
|
+
/*
 * BloomFilter#to_s -> String
 *
 * Returns a string of m characters, one per bucket: '1' when the bucket is
 * non-zero and '0' when it is empty.
 */
static VALUE bf_to_s(VALUE self) {
    struct BloomFilter *bf;
    unsigned char *ptr;
    int i;
    VALUE str;

    Data_Get_Struct(self, struct BloomFilter, bf);
    str = rb_str_new(0, bf->m);

    /* RSTRING_PTR is the supported accessor for the string buffer */
    ptr = (unsigned char *) RSTRING_PTR(str);
    for (i = 0; i < bf->m; i++)
        *ptr++ = bucket_get(bf, i) ? '1' : '0';

    return str;
}
|
292
|
+
|
293
|
+
void Init_sbloomfilter(void) {
|
294
|
+
cBloomFilter = rb_define_class("BloomFilter", rb_cObject);
|
295
|
+
rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
|
296
|
+
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
297
|
+
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
298
|
+
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
299
|
+
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
300
|
+
rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
|
301
|
+
rb_define_method(cBloomFilter, "insert", bf_insert, 1);
|
302
|
+
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
303
|
+
rb_define_method(cBloomFilter, "include?", bf_include, 1);
|
304
|
+
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
305
|
+
|
306
|
+
/* functions that have not been implemented, yet */
|
307
|
+
|
308
|
+
// rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
309
|
+
// rb_define_method(cBloomFilter, "&", bf_and, 1);
|
310
|
+
// rb_define_method(cBloomFilter, "|", bf_or, 1);
|
311
|
+
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
312
|
+
}
|
data/lib/bloomfilter.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'sbloomfilter'
|
2
|
+
|
3
|
+
# Ruby-level additions to BloomFilter: a statistics report and a Hash-like
# interface that stores values in @hash_value next to the filter.
class BloomFilter
  # Prints the filter parameters and the predicted false positive rate,
  # (1 - e^(-k*n/m))^k as a percentage, to standard output.
  def stats
    fp = ((1.0 - Math.exp(-(k * num_set).to_f / m))**k) * 100
    # Values are passed as printf arguments so each line is formatted once.
    printf "Number of filter buckets (m): %d\n", m
    printf "Number of bits per buckets (b): %d\n", b
    printf "Number of filter elements (n): %d\n", num_set
    printf "Number of filter hashes (k) : %d\n", k
    printf "Raise on overflow? (r) : %s\n", r.to_s
    printf "Predicted false positive rate = %.2f%%\n", fp
  end

  # Inserts key into the filter and remembers value for it.
  def []=(key, value)
    insert(key)
    @hash_value[key] = value
  end

  # Returns the value stored for key, or nil when the filter reports the key
  # as absent or no value was stored for it.
  def [](key)
    return nil unless include?(key)

    @hash_value[key]
  end

  # Returns whether the filter reports key as present (same as include?).
  def key?(key)
    include?(key)
  end

  # Returns the keys that were stored through []=.
  def keys
    @hash_value.keys
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
class TestBloomFilter < Test::Unit::TestCase
|
4
|
+
def test_include?
|
5
|
+
bf = BloomFilter.new(10, 2, 1, 1, false)
|
6
|
+
bf.insert("test")
|
7
|
+
bf.insert("test")
|
8
|
+
bf.insert("test")
|
9
|
+
bf.insert("test")
|
10
|
+
bf.insert("test")
|
11
|
+
assert bf.include?("test")
|
12
|
+
assert !bf.include?("lkajdsfhlkajsdfhlakjsdfhalsjdkfh")
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_hash_key_insert
|
16
|
+
bf = BloomFilter.new(10, 2, 1)
|
17
|
+
bf['foo'] = 'bar'
|
18
|
+
assert bf.key?('foo')
|
19
|
+
assert_equal 'bar', bf['foo']
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_hash_key?
|
23
|
+
bf = BloomFilter.new(10, 2, 1)
|
24
|
+
assert !bf.key?('foo')
|
25
|
+
bf['foo'] = 'bar'
|
26
|
+
assert bf.key?('foo')
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_keys
|
30
|
+
bf = BloomFilter.new(10, 2, 1)
|
31
|
+
bf['foo'] = 'bar'
|
32
|
+
bf['awesome'] = 'bar'
|
33
|
+
assert_equal %w{ awesome foo }.sort, bf.keys.sort
|
34
|
+
end
|
35
|
+
|
36
|
+
# delete removes a previously inserted key, so include? must report false afterwards.
|
37
|
+
def test_delete
|
38
|
+
bf = BloomFilter.new(10, 2, 1, 2, false)
|
39
|
+
bf.insert("test")
|
40
|
+
assert bf.include?("test")
|
41
|
+
bf.delete("test")
|
42
|
+
assert !bf.include?("test")
|
43
|
+
end
|
44
|
+
end
|
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: igrigorik-bloomfilter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ilya Grigorik
|
8
|
+
- Tatsuya Mori
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2009-02-21 00:00:00 -08:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Counting Bloom Filter in Ruby
|
18
|
+
email: ilya@igvita.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions:
|
22
|
+
- ext/extconf.rb
|
23
|
+
extra_rdoc_files: []
|
24
|
+
|
25
|
+
files:
|
26
|
+
- README.rdoc
|
27
|
+
- Rakefile
|
28
|
+
- ext/crc32.c
|
29
|
+
- ext/crc32.h
|
30
|
+
- ext/extconf.rb
|
31
|
+
- ext/sbloomfilter.c
|
32
|
+
- lib/bloomfilter.rb
|
33
|
+
- examples/bf.rb
|
34
|
+
- examples/simple.rb
|
35
|
+
- test/helper.rb
|
36
|
+
- test/test_bloom_filter.rb
|
37
|
+
has_rdoc: true
|
38
|
+
homepage: http://github.com/igrigorik/bloomfilter
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
requirements: []
|
57
|
+
|
58
|
+
rubyforge_project: bloomfilter
|
59
|
+
rubygems_version: 1.2.0
|
60
|
+
signing_key:
|
61
|
+
specification_version: 2
|
62
|
+
summary: Counting Bloom Filter in Ruby
|
63
|
+
test_files: []
|
64
|
+
|