igrigorik-bloomfilter 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +58 -0
- data/Rakefile +64 -0
- data/examples/bf.rb +54 -0
- data/examples/simple.rb +26 -0
- data/ext/crc32.c +32 -0
- data/ext/crc32.h +78 -0
- data/ext/extconf.rb +4 -0
- data/ext/sbloomfilter.c +312 -0
- data/lib/bloomfilter.rb +31 -0
- data/test/helper.rb +2 -0
- data/test/test_bloom_filter.rb +44 -0
- metadata +64 -0
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
= BloomFilter
|
2
|
+
|
3
|
+
Counting Bloom Filter implemented in Ruby.
|
4
|
+
|
5
|
+
A Bloom filter is a space-efficient probabilistic data structure that is used to
|
6
|
+
test whether an element is a member of a set. False positives are possible, but
|
7
|
+
false negatives are not. For more detail: http://en.wikipedia.org/wiki/Bloom_filter
|
8
|
+
|
9
|
+
== Implementation
|
10
|
+
|
11
|
+
Instead of using k different hash functions, this implementation seeds the CRC32 hash
|
12
|
+
with k different initial values (0, 1, ..., k-1). This may or may not give you a good
|
13
|
+
distribution; it all depends on the data.
|
14
|
+
|
15
|
+
== Example
|
16
|
+
|
17
|
+
require 'bloomfilter'
|
18
|
+
|
19
|
+
# M (size of bit array)
|
20
|
+
# K (number of hash functions)
|
21
|
+
# R (random seed) -- e.g. m=100000000, k=4, random seed=1
|
22
|
+
|
23
|
+
# M, K, R
|
24
|
+
bf = BloomFilter.new(10, 2, 1)
|
25
|
+
bf.insert("test")
|
26
|
+
bf.include?("test")
|
27
|
+
=> true
|
28
|
+
bf.include?("test2")
|
29
|
+
=> false
|
30
|
+
bf.delete("test")
|
31
|
+
bf.include?("test")
|
32
|
+
=> false
|
33
|
+
|
34
|
+
# Hash with a bloom filter!
|
35
|
+
bf["test2"] = "bar"
|
36
|
+
bf["test2"]
|
37
|
+
=> "bar"
|
38
|
+
bf["test3"]
|
39
|
+
=> nil
|
40
|
+
|
41
|
+
bf.stats
|
42
|
+
Number of filter bits (m): 10
|
43
|
+
Number of filter elements (n): 2
|
44
|
+
Number of filter hashes (k) : 2
|
45
|
+
Predicted false positive rate = 10.87%
|
46
|
+
|
47
|
+
|
48
|
+
== Configuring Bloom Filter
|
49
|
+
|
50
|
+
Performance of the Bloom filter depends on a number of variables:
|
51
|
+
- size of the bit array
|
52
|
+
- number of hash functions
|
53
|
+
|
54
|
+
To figure out the values for these parameters, refer to:
|
55
|
+
http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/
|
56
|
+
|
57
|
+
== Credits
|
58
|
+
Tatsuya Mori <valdzone@gmail.com> (Original: http://vald.x0.com/sb/)
|
data/Rakefile
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/clean'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
require 'rake/testtask'
|
6
|
+
require 'fileutils'
|
7
|
+
include FileUtils
|
8
|
+
|
9
|
+
# Default Rake task is compile
|
10
|
+
task :default => :compile
|
11
|
+
|
12
|
+
# Runs `make` from inside +makedir+; Dir.chdir's block form switches back to
# the previous working directory afterwards.
def make(makedir)
  Dir.chdir(makedir) do
    sh 'make'
  end
end
|
15
|
+
|
16
|
+
# Generates the extension Makefile by running extconf.rb from inside +dir+.
def extconf(dir)
  Dir.chdir(dir) do
    ruby "extconf.rb"
  end
end
|
19
|
+
|
20
|
+
# Defines the Rake tasks that build one C extension.
#
# dir       - sub-directory of ext/ holding the sources ("" for ext/ itself)
# extension - base name of the shared object; also the name of the build task
#
# The task named after +extension+ generates the Makefile through extconf.rb,
# runs make, and copies the compiled library into lib/.
def setup_extension(dir, extension)
  # RbConfig replaces the top-level Config constant, which newer Rubies
  # deprecated and then removed (Ruby 2.2).
  require 'rbconfig'

  ext = "ext/#{dir}"
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    "lib"
  ]

  task "lib" do
    directory "lib"
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # The Makefile is regenerated whenever extconf.rb changes.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    extconf ext.to_s
  end

  # Rebuild the shared object when any source changes, then install it in lib/.
  file ext_so => ext_files do
    make ext.to_s
    cp ext_so, "lib"
  end
end
|
47
|
+
|
48
|
+
# Register the build tasks for the sbloomfilter extension that lives in ext/.
setup_extension("", "sbloomfilter")

task :compile => [:sbloomfilter]

# Build artefacts removed by `rake clean`.
CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', '**/*.log', 'pkg']
CLEAN.include ['ext/Makefile']

Rake::TestTask.new do |t|
  t.libs.concat(%w[ext lib test])
  t.test_files = FileList['test/test_*.rb']
  t.verbose = true
end

# The extension has to be compiled before the tests can load it.
Rake::Task[:test].prerequisites << :compile
|
64
|
+
|
data/examples/bf.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bitset'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Pure ruby implementation of a Bloom filter, just for kicks
|
7
|
+
#
|
8
|
+
|
9
|
+
# Pure-Ruby Bloom filter backed by BitSet. The bits for a key are chosen by
# running CRC32 over it with consecutive seeds.
class BloomFilter
  # max_entries - number of bits in the filter (coerced with to_i)
  # num_hashes  - how many CRC32 rounds are used per key
  # seed        - offset added to the seed of every round
  def initialize(max_entries, num_hashes, seed)
    @size = max_entries.to_i
    @num_hashes = num_hashes
    @seed = seed
    @bitmap = BitSet.new(@size)
    @__mask = BitSet.new(@size) # scratch bitset reused by make_mask
  end

  # Sets every bit selected for +key+.
  def insert(key)
    @bitmap |= make_mask(key)
  end

  # True when at least one of the bits selected for +key+ is still unset.
  def new?(key)
    wanted = make_mask(key)
    (@bitmap & wanted) != wanted
  end

  # Rebuilds the scratch mask for +key+ and returns it.
  def make_mask(key)
    @__mask.clear
    @num_hashes.to_i.times do |round|
      @__mask.set(Zlib.crc32(key, round + @seed) % @size, 1)
    end
    @__mask
  end
end
|
38
|
+
|
39
|
+
|
40
|
+
# Reads lines from ARGF and prints how many of them the filter had not seen
# before, i.e. an approximate count of distinct lines.
def main
  bf = BloomFilter.new(1000000, 4, 0)
  num = 0
  while line = ARGF.gets
    # chomp strips only the line terminator; chop would also eat the last
    # character of a final line that has no trailing newline.
    data = line.chomp

    # The class in this file defines new?, not new_entry?.
    if bf.new?(data)
      num += 1
      bf.insert(data)
    end
  end
  print "#element = #{num}\n"
end
|
53
|
+
|
54
|
+
main
|
data/examples/simple.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bloomfilter'
|
3
|
+
|
4
|
+
# Words loaded into the filter, and the probes checked against it afterwards.
WORDS = %w(duck penguin bear panda)
TEST = %w(penguin moose racooon)

# m = 100, k = 4, seed = 1
bf = BloomFilter.new(100, 4, 1)
WORDS.each { |word| bf.insert(word) }

# Report membership for every probe word.
TEST.each { |probe| puts "#{probe}: #{bf.include?(probe)}" }

bf.stats
|
16
|
+
|
17
|
+
# penguin: true
|
18
|
+
# moose: false
|
19
|
+
# racooon: false
|
20
|
+
#
|
21
|
+
# Number of filter buckets (m): 100
|
22
|
+
# Number of bits per buckets (b): 1
|
23
|
+
# Number of filter elements (n): 4
|
24
|
+
# Number of filter hashes (k) : 4
|
25
|
+
# Raise on overflow? (r) : false
|
26
|
+
# Predicted false positive rate = 0.05%
|
data/ext/crc32.c
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
#include "crc32.h"
|
24
|
+
|
25
|
+
unsigned int crc32(unsigned int crc, char *buf, int len) {
|
26
|
+
while (len > 0) {
|
27
|
+
crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
|
28
|
+
--len;
|
29
|
+
++buf;
|
30
|
+
}
|
31
|
+
return crc;
|
32
|
+
}
|
data/ext/crc32.h
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
/* simple CRC32 code */
|
2
|
+
/*
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
4
|
+
*
|
5
|
+
* This file is part of the SSH Library
|
6
|
+
*
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
10
|
+
* option) any later version.
|
11
|
+
*
|
12
|
+
*
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
16
|
+
* License for more details.
|
17
|
+
*
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
21
|
+
* MA 02111-1307, USA. */
|
22
|
+
|
23
|
+
static unsigned int crc_table[] = {
|
24
|
+
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
|
25
|
+
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
|
26
|
+
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
|
27
|
+
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
|
28
|
+
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
|
29
|
+
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
|
30
|
+
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
|
31
|
+
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
|
32
|
+
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
|
33
|
+
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
|
34
|
+
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
|
35
|
+
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
|
36
|
+
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
|
37
|
+
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
|
38
|
+
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
|
39
|
+
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
|
40
|
+
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
|
41
|
+
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
|
42
|
+
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
|
43
|
+
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
|
44
|
+
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
|
45
|
+
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
|
46
|
+
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
|
47
|
+
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
|
48
|
+
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
|
49
|
+
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
|
50
|
+
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
|
51
|
+
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
|
52
|
+
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
|
53
|
+
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
|
54
|
+
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
|
55
|
+
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
|
56
|
+
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
|
57
|
+
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
|
58
|
+
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
|
59
|
+
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
|
60
|
+
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
|
61
|
+
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
|
62
|
+
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
|
63
|
+
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
|
64
|
+
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
|
65
|
+
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
|
66
|
+
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
|
67
|
+
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
|
68
|
+
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
|
69
|
+
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
|
70
|
+
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
|
71
|
+
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
|
72
|
+
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
|
73
|
+
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
|
74
|
+
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
|
75
|
+
0x2d02ef8dUL
|
76
|
+
};
|
77
|
+
|
78
|
+
unsigned int crc32(unsigned int crc, char *buf, int len);
|
data/ext/extconf.rb
ADDED
data/ext/sbloomfilter.c
ADDED
@@ -0,0 +1,312 @@
|
|
1
|
+
/*
|
2
|
+
* sbloomfilter.c - simple Bloom Filter
|
3
|
+
* (c) Tatsuya Mori <valdzone@gmail.com>
|
4
|
+
*/
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "crc32.h"
|
8
|
+
|
9
|
+
static VALUE cBloomFilter;
|
10
|
+
|
11
|
+
/*
 * State of one counting Bloom filter.  Buckets are `b` bits wide and packed
 * back to back in `ptr`.
 */
struct BloomFilter {
|
12
|
+
int m; /* number of buckets in the filter */
|
13
|
+
int b; /* bits per bucket (the constructor accepts 1..8) */
|
14
|
+
int k; /* number of hash rounds per key */
|
15
|
+
int s; /* seed offset: round i hashes with seed i + s */
|
16
|
+
int r; /* 1 => raise when inserting into a bucket that is already full */
|
17
|
+
int num_set; /* element counter maintained by insert/delete; not a count of set bits */
|
18
|
+
unsigned char *ptr; /* packed bucket storage */
|
19
|
+
};
|
20
|
+
|
21
|
+
void bits_free(struct BloomFilter *bf) {
|
22
|
+
ruby_xfree(bf->ptr);
|
23
|
+
}
|
24
|
+
|
25
|
+
|
26
|
+
void bucket_unset(struct BloomFilter *bf, int index) {
|
27
|
+
int byte_offset = (index * bf->b) / 8;
|
28
|
+
int bit_offset = (index * bf->b) % 8;
|
29
|
+
unsigned int c = bf->ptr[byte_offset];
|
30
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
31
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
32
|
+
if ((c & mask) == 0) {
|
33
|
+
// do nothing
|
34
|
+
} else {
|
35
|
+
bf->ptr[byte_offset] -= (1 << bit_offset) & ((1 << 8) - 1);
|
36
|
+
bf->ptr[byte_offset + 1] -= ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
37
|
+
}
|
38
|
+
|
39
|
+
}
|
40
|
+
|
41
|
+
void bucket_set(struct BloomFilter *bf, int index) {
|
42
|
+
int byte_offset = (index * bf->b) / 8;
|
43
|
+
int bit_offset = (index * bf->b) % 8;
|
44
|
+
unsigned int c = bf->ptr[byte_offset];
|
45
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
46
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
47
|
+
if ((c & mask) == mask) {
|
48
|
+
if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
|
49
|
+
} else {
|
50
|
+
bf->ptr[byte_offset] += (1 << bit_offset) & ((1 << 8) - 1);
|
51
|
+
bf->ptr[byte_offset + 1] += ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
|
52
|
+
}
|
53
|
+
|
54
|
+
}
|
55
|
+
|
56
|
+
int bucket_check(struct BloomFilter *bf, int index) {
|
57
|
+
int byte_offset = (index * bf->b) / 8;
|
58
|
+
int bit_offset = (index * bf->b) % 8;
|
59
|
+
unsigned int c = bf->ptr[byte_offset];
|
60
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
61
|
+
|
62
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
63
|
+
return (c & mask) >> bit_offset;
|
64
|
+
}
|
65
|
+
|
66
|
+
int bucket_get(struct BloomFilter *bf, int index) {
|
67
|
+
int byte_offset = (index * bf->b) / 8;
|
68
|
+
int bit_offset = (index * bf->b) % 8;
|
69
|
+
unsigned int c = bf->ptr[byte_offset];
|
70
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
71
|
+
|
72
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
73
|
+
return (c & mask) >> bit_offset;
|
74
|
+
}
|
75
|
+
|
76
|
+
/*
 * BloomFilter.new(m = 100000000, k = 4, s = 0, b = 1, r = false)
 *
 *   m - number of buckets
 *   k - number of hash rounds per key
 *   s - seed offset for the hash rounds
 *   b - bits per bucket (1..8)
 *   r - raise when inserting into a bucket that is already full?
 *
 * Trailing arguments may be omitted.  Any other argument count (none, or more
 * than five) selects every default, as it always has.
 *
 * Raises ArgumentError when b, m, k or s is out of range.  The numeric
 * arguments go through NUM2INT, so a non-numeric value is rejected by the
 * conversion instead of being reinterpreted bit-wise by FIX2INT.
 */
static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
    struct BloomFilter *bf;
    VALUE arg1, arg2, arg3, arg4, arg5, obj;
    int m, k, s, b, r, bytes;

    obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);

    /* unexpected argument counts fall back to the all-defaults form */
    if (argc < 1 || argc > 5)
        argc = 0;

    arg1 = (argc >= 1) ? argv[0] : INT2FIX(100000000);
    arg2 = (argc >= 2) ? argv[1] : INT2FIX(4);
    arg3 = (argc >= 3) ? argv[2] : INT2FIX(0);
    arg4 = (argc >= 4) ? argv[3] : INT2FIX(1);
    arg5 = (argc >= 5) ? argv[4] : Qfalse;

    m = NUM2INT(arg1);
    k = NUM2INT(arg2);
    s = NUM2INT(arg3);
    b = NUM2INT(arg4);

    /*
     * r is documented and tested as a boolean.  true/false/nil are not
     * Fixnums, so they are interpreted with RTEST; an integer flag keeps its
     * numeric value exactly as before.
     */
    if (FIXNUM_P(arg5))
        r = FIX2INT(arg5);
    else
        r = RTEST(arg5) ? 1 : 0;

    if (b < 1 || b > 8)
        rb_raise(rb_eArgError, "bucket size");
    if (m < 1)
        rb_raise(rb_eArgError, "array size");
    if (k < 1)
        rb_raise(rb_eArgError, "hash length");
    if (s < 0)
        rb_raise(rb_eArgError, "random seed");

    bf->b = b;
    bf->m = m;
    bf->k = k;
    bf->s = s;
    bf->r = r;
    bf->num_set = 0;

    /*
     * +7 rounds up to whole bytes; the extra 8 bits give the bucket helpers a
     * valid byte_offset + 1 even for the last bucket.
     */
    bytes = ((m * b) + 15) / 8;
    bf->ptr = ALLOC_N(unsigned char, bytes);

    /* initialize the bits with zeros */
    memset(bf->ptr, 0, bytes);

    /* backing store for the Hash-like helpers defined in lib/bloomfilter.rb */
    rb_iv_set(obj, "@hash_value", rb_hash_new());

    return obj;
}
|
152
|
+
|
153
|
+
/* BloomFilter#m -> Integer: number of buckets. */
static VALUE bf_m(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    return INT2FIX(filter->m);
}

/* BloomFilter#k -> Integer: number of hash rounds per key. */
static VALUE bf_k(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    return INT2FIX(filter->k);
}

/* BloomFilter#b -> Integer: bits per bucket. */
static VALUE bf_b(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    return INT2FIX(filter->b);
}

/* BloomFilter#r -> true/false: is the raise-on-overflow flag non-zero? */
static VALUE bf_r(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    if (filter->r == 0)
        return Qfalse;
    return Qtrue;
}

/* BloomFilter#num_set -> Integer: the filter's element counter. */
static VALUE bf_num_set(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    return INT2FIX(filter->num_set);
}
|
182
|
+
|
183
|
+
/*
 * BloomFilter#insert(key) -> nil
 *
 * Hashes the String `key` once per round (seed = round + bf->s), increments
 * the bucket each hash selects, then bumps the element counter.  Raises
 * TypeError when `key` is not a String.  bucket_set may raise RuntimeError
 * when overflow raising is enabled, in which case earlier rounds have already
 * been applied.
 *
 * The string is read through RSTRING_PTR / RSTRING_LEN instead of STR2CSTR
 * and RSTRING(key)->len, which only exist on Ruby 1.8.
 */
static VALUE bf_insert(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key); /* length of the string in bytes */

    for (i = 0; i < bf->k; i++) {
        /* each round uses its own seed so the k hashes differ */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* increment the bucket at the index */
        bucket_set(bf, index);
    }

    bf->num_set += 1;
    return Qnil;
}
|
213
|
+
|
214
|
+
/*
 * BloomFilter#delete(key) -> nil
 *
 * Hashes the String `key` exactly as insert does and decrements every bucket
 * it selects, then lowers the element counter.  The counter used to be
 * incremented here, which inflated the `n` reported by stats after every
 * delete.  Raises TypeError when `key` is not a String.
 *
 * No membership check is made: deleting a key that was never inserted still
 * decrements whichever of its buckets are non-empty.
 */
static VALUE bf_delete(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    /* RSTRING_PTR / RSTRING_LEN replace the 1.8-only STR2CSTR and RSTRING(key)->len */
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key); /* length of the string in bytes */

    for (i = 0; i < bf->k; i++) {
        /* each round uses its own seed so the k hashes differ */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* decrement the bucket at the index */
        bucket_unset(bf, index);
    }

    /* never let the element counter go negative */
    if (bf->num_set > 0)
        bf->num_set -= 1;
    return Qnil;
}
|
244
|
+
|
245
|
+
|
246
|
+
/*
 * BloomFilter#include?(key) -> true or false
 *
 * Returns false as soon as one of the buckets selected for the String `key`
 * is empty, and true when all of them are non-empty.  Raises TypeError when
 * `key` is not a String.
 *
 * The string is read through RSTRING_PTR / RSTRING_LEN instead of STR2CSTR
 * and RSTRING(key)->len, which only exist on Ruby 1.8.
 */
static VALUE bf_include(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    char *ckey;
    int i, len, index;

    Data_Get_Struct(self, struct BloomFilter, bf);

    Check_Type(key, T_STRING);
    ckey = RSTRING_PTR(key);
    len = (int) RSTRING_LEN(key); /* length of the string in bytes */

    for (i = 0; i < bf->k; i++) {
        /* each round uses its own seed so the k hashes differ */
        index = (int) (crc32((unsigned int) (i + bf->s), ckey, len) % (unsigned int) (bf->m));

        /* an empty bucket proves the key was not inserted; stop early */
        if (!bucket_check(bf, index))
            return Qfalse;
    }

    return Qtrue;
}
|
276
|
+
|
277
|
+
/*
 * BloomFilter#to_s -> String
 *
 * Returns a string of bf->m characters, one per bucket: '1' when the bucket
 * is non-empty and '0' otherwise.
 *
 * The buffer is reached through RSTRING_PTR instead of RSTRING(str)->ptr,
 * which only compiles against Ruby 1.8.
 */
static VALUE bf_to_s(VALUE self) {
    struct BloomFilter *bf;
    unsigned char *ptr;
    int i;
    VALUE str;

    Data_Get_Struct(self, struct BloomFilter, bf);
    str = rb_str_new(0, bf->m);

    ptr = (unsigned char *) RSTRING_PTR(str);
    for (i = 0; i < bf->m; i++)
        *ptr++ = bucket_get(bf, i) ? '1' : '0';

    return str;
}
|
292
|
+
|
293
|
+
void Init_sbloomfilter(void) {
|
294
|
+
cBloomFilter = rb_define_class("BloomFilter", rb_cObject);
|
295
|
+
rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
|
296
|
+
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
297
|
+
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
298
|
+
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
299
|
+
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
300
|
+
rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
|
301
|
+
rb_define_method(cBloomFilter, "insert", bf_insert, 1);
|
302
|
+
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
303
|
+
rb_define_method(cBloomFilter, "include?", bf_include, 1);
|
304
|
+
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
305
|
+
|
306
|
+
/* functions that have not been implemented, yet */
|
307
|
+
|
308
|
+
// rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
309
|
+
// rb_define_method(cBloomFilter, "&", bf_and, 1);
|
310
|
+
// rb_define_method(cBloomFilter, "|", bf_or, 1);
|
311
|
+
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
312
|
+
}
|
data/lib/bloomfilter.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'sbloomfilter'
|
2
|
+
|
3
|
+
# Ruby-level conveniences layered on top of the BloomFilter class: a
# statistics printer and a small Hash-like interface backed by @hash_value.
class BloomFilter
  # Prints the filter parameters and the predicted false positive rate,
  # (1 - e^(-k*n/m))^k as a percentage, to standard output.
  def stats
    fp = ((1.0 - Math.exp(-(k * num_set).to_f / m))**k) * 100

    # Every line is formatted exactly once, and the literal percent sign is
    # written as %%. Formatting with String#% and then handing the result to
    # printf parsed the text as a format string a second time.
    printf("Number of filter buckets (m): %d\n", m)
    printf("Number of bits per buckets (b): %d\n", b)
    printf("Number of filter elements (n): %d\n", num_set)
    printf("Number of filter hashes (k) : %d\n", k)
    printf("Raise on overflow? (r) : %s\n", r.to_s)
    printf("Predicted false positive rate = %.2f%%\n", fp)
  end

  # Inserts +key+ into the filter and remembers +value+ for it.
  def []=(key, value)
    insert(key)
    @hash_value[key] = value
  end

  # Returns the value stored for +key+, or nil when the filter reports that
  # +key+ is absent.
  def [](key)
    return nil unless include?(key)

    @hash_value[key]
  end

  # Hash-style alias for include?.
  def key?(key)
    include?(key)
  end

  # Keys that were stored through []=.
  def keys
    @hash_value.keys
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises BloomFilter: membership, the Hash-style helpers, and deletion.
class TestBloomFilter < Test::Unit::TestCase
  # Repeated inserts into 1-bit buckets (fifth argument false) keep the key
  # visible and do not make an unrelated key appear present.
  def test_include?
    bf = BloomFilter.new(10, 2, 1, 1, false)
    5.times { bf.insert("test") }
    assert bf.include?("test")
    assert !bf.include?("lkajdsfhlkajsdfhlakjsdfhalsjdkfh")
  end

  # A value stored with []= is reported by key? and returned by [].
  def test_hash_key_insert
    bf = BloomFilter.new(10, 2, 1)
    bf['foo'] = 'bar'
    assert bf.key?('foo')
    assert_equal 'bar', bf['foo']
  end

  # key? is false before the key is stored and true afterwards.
  def test_hash_key?
    bf = BloomFilter.new(10, 2, 1)
    assert !bf.key?('foo')
    bf['foo'] = 'bar'
    assert bf.key?('foo')
  end

  # keys lists every key stored through []=.
  def test_keys
    bf = BloomFilter.new(10, 2, 1)
    bf['foo'] = 'bar'
    bf['awesome'] = 'bar'
    assert_equal %w{ awesome foo }.sort, bf.keys.sort
  end

  # With 2-bit buckets, a single insert followed by delete removes the key.
  def test_delete
    bf = BloomFilter.new(10, 2, 1, 2, false)
    bf.insert("test")
    assert bf.include?("test")
    bf.delete("test")
    assert !bf.include?("test")
  end
end
|
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: igrigorik-bloomfilter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ilya Grigorik
|
8
|
+
- Tatsuya Mori
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2009-02-21 00:00:00 -08:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Counting Bloom Filter in Ruby
|
18
|
+
email: ilya@igvita.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions:
|
22
|
+
- ext/extconf.rb
|
23
|
+
extra_rdoc_files: []
|
24
|
+
|
25
|
+
files:
|
26
|
+
- README.rdoc
|
27
|
+
- Rakefile
|
28
|
+
- ext/crc32.c
|
29
|
+
- ext/crc32.h
|
30
|
+
- ext/extconf.rb
|
31
|
+
- ext/sbloomfilter.c
|
32
|
+
- lib/bloomfilter.rb
|
33
|
+
- examples/bf.rb
|
34
|
+
- examples/simple.rb
|
35
|
+
- test/helper.rb
|
36
|
+
- test/test_bloom_filter.rb
|
37
|
+
has_rdoc: true
|
38
|
+
homepage: http://github.com/igrigorik/bloomfilter
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
requirements: []
|
57
|
+
|
58
|
+
rubyforge_project: bloomfilter
|
59
|
+
rubygems_version: 1.2.0
|
60
|
+
signing_key:
|
61
|
+
specification_version: 2
|
62
|
+
summary: Counting Bloom Filter in Ruby
|
63
|
+
test_files: []
|
64
|
+
|