igrigorik-bloomfilter 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ = BloomFilter
2
+
3
+ Counting Bloom Filter implemented in Ruby.
4
+
5
+ A Bloom filter is a space-efficient probabilistic data structure that is used to
6
+ test whether an element is a member of a set. False positives are possible, but
7
+ false negatives are not. For more detail: http://en.wikipedia.org/wiki/Bloom_filter
8
+
9
+ == Implementation
10
+
11
+ Instead of using k different hash functions, this implementation seeds the CRC32 hash
12
+ with k different initial values (seed, seed+1, ..., seed+k-1). This may or may not give you a good
13
+ distribution; it all depends on the data.
14
+
15
+ == Example
16
+
17
+ require 'bloomfilter'
18
+
19
+ # M (size of bit array)
20
+ # K (number of hash functions)
21
+ # R (random seed); with no arguments the defaults are M=100000000, K=4, R=0
22
+
23
+ # M, K, R
24
+ bf = BloomFilter.new(10, 2, 1)
25
+ bf.insert("test")
26
+ bf.include?("test")
27
+ => true
28
+ bf.include?("test2")
29
+ => false
30
+ bf.delete("test")
31
+ bf.include?("test")
32
+ => false
33
+
34
+ # Hash with a bloom filter!
35
+ bf["test2"] = "bar"
36
+ bf["test2"]
37
+ => "bar"
38
+ bf["test3"]
39
+ => nil
40
+
41
+ bf.stats
42
+ Number of filter bits (m): 10
43
+ Number of filter elements (n): 2
44
+ Number of filter hashes (k) : 2
45
+ Predicted false positive rate = 10.87%
46
+
47
+
48
+ == Configuring Bloom Filter
49
+
50
+ Performance of the Bloom filter depends on a number of variables:
51
+ - size of the bit array
52
+ - number of hash functions
53
+
54
+ To figure out the values for these parameters, refer to:
55
+ http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/
56
+
57
+ == Credits
58
+ Tatsuya Mori <valdzone@gmail.com> (Original: http://vald.x0.com/sb/)
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
+ require 'rake'
2
+ require 'rake/clean'
3
+ require 'rake/rdoctask'
4
+ require 'rake/gempackagetask'
5
+ require 'rake/testtask'
6
+ require 'fileutils'
7
+ include FileUtils
8
+
9
+ # Default Rake task is compile
10
+ task :default => :compile
11
+
12
# Runs `make` with +makedir+ as the working directory. The block form of
# Dir.chdir restores the previous working directory when the block exits.
def make(makedir)
  Dir.chdir(makedir) do
    sh 'make'
  end
end
15
+
16
# Runs extconf.rb (through the `ruby` helper) with +dir+ as the working
# directory; the previous working directory is restored afterwards.
def extconf(dir)
  Dir.chdir(dir) do
    ruby "extconf.rb"
  end
end
19
+
20
# Defines the Rake tasks that build one C extension located under ext/<dir>.
#
# dir       - subdirectory of ext/ holding the sources ("" for ext/ itself)
# extension - base name of the shared object; also the name of the build task
#
# Tasks defined: "lib", <extension>, a file task that generates the Makefile
# from extconf.rb, and a file task that runs make and copies the resulting
# shared object into lib/.
def setup_extension(dir, extension)
  ext = "ext/#{dir}"
  # RbConfig is the supported constant; the bare Config alias was deprecated
  # and later removed from Ruby.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    "lib"
  ]

  task "lib" do
    directory "lib"
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # The Makefile is regenerated whenever extconf.rb changes.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    extconf ext.to_s
  end

  # Build the shared object, then copy it next to the Ruby sources in lib/.
  file ext_so => ext_files do
    make ext.to_s
    cp ext_so, "lib"
  end
end
47
+
48
+ setup_extension("", "sbloomfilter")
49
+
50
+ task :compile => [:sbloomfilter]
51
+
52
+ CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', '**/*.log', 'pkg']
53
+ CLEAN.include ['ext/Makefile']
54
+
55
+ Rake::TestTask.new do |t|
56
+ %w[ ext lib test ].each do |dir|
57
+ t.libs << dir
58
+ end
59
+
60
+ t.test_files = FileList['test/test_*.rb']
61
+ t.verbose = true
62
+ end
63
+ Rake::Task[:test].prerequisites << :compile
64
+
data/examples/bf.rb ADDED
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bitset'
3
+ require 'zlib'
4
+
5
+ #
6
+ # Pure ruby implementation of a Bloom filter, just for kicks
7
+ #
8
+
9
# Pure-Ruby Bloom filter: keys are hashed with seeded CRC32 and the resulting
# bit positions are kept in BitSet bitmaps.
class BloomFilter
  # max_entries - number of bits in the filter (converted with to_i)
  # num_hashes  - number of seeded CRC32 hashes computed per key
  # seed        - base value added to the hash number to form each CRC32 seed
  def initialize(max_entries, num_hashes, seed)
    @size = max_entries.to_i
    @num_hashes = num_hashes
    @seed = seed
    @bitmap = BitSet.new(@size)
    # Scratch mask that make_mask clears and refills on every call.
    @__mask = BitSet.new(@size)
  end

  # ORs the key's mask into the filter bitmap.
  def insert(key)
    @bitmap |= make_mask(key)
  end

  # Returns true when at least one bit of the key's mask is absent from the
  # bitmap, i.e. the key has definitely not been inserted.
  def new?(key)
    probe = make_mask(key)
    (@bitmap & probe) != probe
  end

  # Rebuilds the shared scratch mask for +key+ and returns it. Hash number i
  # is CRC32 seeded with i + seed, reduced modulo the filter size.
  def make_mask(key)
    @__mask.clear
    @num_hashes.to_i.times do |round|
      @__mask.set(Zlib.crc32(key, round + @seed) % @size, 1)
    end
    @__mask
  end
end
38
+
39
+
40
# Reads lines from ARGF and prints how many of them the Bloom filter reported
# as not seen before (each such line is then inserted into the filter).
def main
  bf = BloomFilter.new(1000000, 4, 0)
  num = 0
  while (line = ARGF.gets)
    # chomp strips only a trailing line terminator; chop would also remove the
    # last character of a final line that has no newline.
    data = line.chomp

    # The filter's membership predicate is new?; new_entry? is not defined.
    if bf.new?(data)
      num += 1
      bf.insert(data)
    end
  end
  print "#element = #{num}\n"
end
53
+
54
+ main
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bloomfilter'
3
+
4
+ WORDS = %w(duck penguin bear panda)
5
+ TEST = %w(penguin moose racooon)
6
+
7
+ # m = 100, k = 4, seed = 1
8
+ bf = BloomFilter.new(100, 4, 1)
9
+
10
+ WORDS.each { |w| bf.insert(w) }
11
+ TEST.each do |w|
12
+ puts "#{w}: #{bf.include?(w)}"
13
+ end
14
+
15
+ bf.stats
16
+
17
+ # penguin: true
18
+ # moose: false
19
+ # racooon: false
20
+ #
21
+ # Number of filter buckets (m): 100
22
+ # Number of bits per buckets (b): 1
23
+ # Number of filter elements (n): 4
24
+ # Number of filter hashes (k) : 4
25
+ # Raise on overflow? (r) : false
26
+ # Predicted false positive rate = 0.05%
data/ext/crc32.c ADDED
@@ -0,0 +1,32 @@
1
+ /* simple CRC32 code */
2
+ /*
3
+ * Copyright 2005 Aris Adamantiadis
4
+ *
5
+ * This file is part of the SSH Library
6
+ *
7
+ * The SSH Library is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU Lesser General Public License as published by
9
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
10
+ * option) any later version.
11
+ *
12
+ *
13
+ * The SSH Library is distributed in the hope that it will be useful, but
14
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
+ * License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public License
19
+ * along with the SSH Library; see the file COPYING. If not, write to
20
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
+ * MA 02111-1307, USA. */
22
+
23
+ #include "crc32.h"
24
+
25
+ unsigned int crc32(unsigned int crc, char *buf, int len) {
26
+ while (len > 0) {
27
+ crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
28
+ --len;
29
+ ++buf;
30
+ }
31
+ return crc;
32
+ }
data/ext/crc32.h ADDED
@@ -0,0 +1,78 @@
1
+ /* simple CRC32 code */
2
+ /*
3
+ * Copyright 2005 Aris Adamantiadis
4
+ *
5
+ * This file is part of the SSH Library
6
+ *
7
+ * The SSH Library is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU Lesser General Public License as published by
9
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
10
+ * option) any later version.
11
+ *
12
+ *
13
+ * The SSH Library is distributed in the hope that it will be useful, but
14
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
+ * License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public License
19
+ * along with the SSH Library; see the file COPYING. If not, write to
20
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
+ * MA 02111-1307, USA. */
22
+
23
+ static unsigned int crc_table[] = {
24
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
25
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
26
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
27
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
28
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
29
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
30
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
31
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
32
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
33
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
34
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
35
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
36
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
37
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
38
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
39
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
40
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
41
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
42
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
43
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
44
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
45
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
46
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
47
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
48
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
49
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
50
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
51
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
52
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
53
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
54
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
55
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
56
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
57
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
58
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
59
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
60
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
61
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
62
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
63
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
64
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
65
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
66
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
67
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
68
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
69
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
70
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
71
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
72
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
73
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
74
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
75
+ 0x2d02ef8dUL
76
+ };
77
+
78
+ unsigned int crc32(unsigned int crc, char *buf, int len);
data/ext/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require "mkmf"
3
+
4
+ create_makefile("sbloomfilter")
@@ -0,0 +1,312 @@
1
+ /*
2
+ * sbloomfilter.c - simple Bloom Filter
3
+ * (c) Tatsuya Mori <valdzone@gmail.com>
4
+ */
5
+
6
+ #include "ruby.h"
7
+ #include "crc32.h"
8
+
9
+ static VALUE cBloomFilter;
10
+
11
/* State stored behind each Ruby BloomFilter object. */
struct BloomFilter {
    int m;              /* number of buckets in the filter */
    int b;              /* width of one bucket in bits */
    int k;              /* number of hash functions applied per key */
    int s;              /* base seed; hash number i is seeded with s + i */
    int r;              /* 1 = raise when a full bucket is incremented */
    int num_set;        /* key counter updated by insert/delete and exposed to
                           Ruby as num_set (not a count of set bits) */
    unsigned char *ptr; /* packed bucket storage */
};
20
+
21
+ void bits_free(struct BloomFilter *bf) {
22
+ ruby_xfree(bf->ptr);
23
+ }
24
+
25
+
26
+ void bucket_unset(struct BloomFilter *bf, int index) {
27
+ int byte_offset = (index * bf->b) / 8;
28
+ int bit_offset = (index * bf->b) % 8;
29
+ unsigned int c = bf->ptr[byte_offset];
30
+ c += bf->ptr[byte_offset + 1] << 8;
31
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
32
+ if ((c & mask) == 0) {
33
+ // do nothing
34
+ } else {
35
+ bf->ptr[byte_offset] -= (1 << bit_offset) & ((1 << 8) - 1);
36
+ bf->ptr[byte_offset + 1] -= ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
37
+ }
38
+
39
+ }
40
+
41
+ void bucket_set(struct BloomFilter *bf, int index) {
42
+ int byte_offset = (index * bf->b) / 8;
43
+ int bit_offset = (index * bf->b) % 8;
44
+ unsigned int c = bf->ptr[byte_offset];
45
+ c += bf->ptr[byte_offset + 1] << 8;
46
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
47
+ if ((c & mask) == mask) {
48
+ if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
49
+ } else {
50
+ bf->ptr[byte_offset] += (1 << bit_offset) & ((1 << 8) - 1);
51
+ bf->ptr[byte_offset + 1] += ((1 << bit_offset) & ((1 << 16) - 1)) >> 8;
52
+ }
53
+
54
+ }
55
+
56
+ int bucket_check(struct BloomFilter *bf, int index) {
57
+ int byte_offset = (index * bf->b) / 8;
58
+ int bit_offset = (index * bf->b) % 8;
59
+ unsigned int c = bf->ptr[byte_offset];
60
+ c += bf->ptr[byte_offset + 1] << 8;
61
+
62
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
63
+ return (c & mask) >> bit_offset;
64
+ }
65
+
66
+ int bucket_get(struct BloomFilter *bf, int index) {
67
+ int byte_offset = (index * bf->b) / 8;
68
+ int bit_offset = (index * bf->b) % 8;
69
+ unsigned int c = bf->ptr[byte_offset];
70
+ c += bf->ptr[byte_offset + 1] << 8;
71
+
72
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
73
+ return (c & mask) >> bit_offset;
74
+ }
75
+
76
+ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
77
+ struct BloomFilter *bf;
78
+ VALUE arg1, arg2, arg3, arg4, arg5, obj;
79
+ int m, k, s, b, r, bytes;
80
+
81
+ obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
82
+
83
+ if (argc == 5) {
84
+ arg1 = argv[0];
85
+ arg2 = argv[1];
86
+ arg3 = argv[2];
87
+ arg4 = argv[3];
88
+ arg5 = argv[4];
89
+ } else if (argc == 4) {
90
+ arg1 = argv[0];
91
+ arg2 = argv[1];
92
+ arg3 = argv[2];
93
+ arg4 = argv[3];
94
+ arg5 = 0;
95
+ } else if (argc == 3) {
96
+ arg1 = argv[0];
97
+ arg2 = argv[1];
98
+ arg3 = argv[2];
99
+ arg4 = INT2FIX(1);
100
+ arg5 = 0;
101
+ } else if (argc == 2) {
102
+ arg1 = argv[0];
103
+ arg2 = argv[1];
104
+ arg3 = INT2FIX(0);
105
+ arg4 = INT2FIX(1);
106
+ arg5 = 0;
107
+ } else if (argc == 1) {
108
+ arg1 = argv[0];
109
+ arg2 = INT2FIX(4);
110
+ arg3 = INT2FIX(0);
111
+ arg4 = INT2FIX(1);
112
+ arg5 = 0;
113
+ } else { /* default = Fugou approach :-) */
114
+ arg1 = INT2FIX(100000000);
115
+ arg2 = INT2FIX(4);
116
+ arg3 = INT2FIX(0);
117
+ arg4 = INT2FIX(1);
118
+ arg5 = 0;
119
+ }
120
+
121
+ m = FIX2INT(arg1);
122
+ k = FIX2INT(arg2);
123
+ s = FIX2INT(arg3);
124
+ b = FIX2INT(arg4);
125
+ r = FIX2INT(arg5);
126
+
127
+ if (b < 1 || b > 8)
128
+ rb_raise(rb_eArgError, "bucket size");
129
+ if (m < 1)
130
+ rb_raise(rb_eArgError, "array size");
131
+ if (k < 1)
132
+ rb_raise(rb_eArgError, "hash length");
133
+ if (s < 0)
134
+ rb_raise(rb_eArgError, "random seed");
135
+
136
+ bf->b = b;
137
+ bf->m = m;
138
+ bf->k = k;
139
+ bf->s = s;
140
+ bf->r = r;
141
+ bf->num_set = 0;
142
+
143
+ bytes = ((m * b) + 15) / 8;
144
+ bf->ptr = ALLOC_N(unsigned char, bytes);
145
+
146
+ /* initialize the bits with zeros */
147
+ memset(bf->ptr, 0, bytes);
148
+ rb_iv_set(obj, "@hash_value", rb_hash_new());
149
+
150
+ return obj;
151
+ }
152
+
153
+ static VALUE bf_m(VALUE self) {
154
+ struct BloomFilter *bf;
155
+ Data_Get_Struct(self, struct BloomFilter, bf);
156
+ return INT2FIX(bf->m);
157
+ }
158
+
159
+ static VALUE bf_k(VALUE self) {
160
+ struct BloomFilter *bf;
161
+ Data_Get_Struct(self, struct BloomFilter, bf);
162
+ return INT2FIX(bf->k);
163
+ }
164
+
165
+ static VALUE bf_b(VALUE self) {
166
+ struct BloomFilter *bf;
167
+ Data_Get_Struct(self, struct BloomFilter, bf);
168
+ return INT2FIX(bf->b);
169
+ }
170
+
171
+ static VALUE bf_r(VALUE self) {
172
+ struct BloomFilter *bf;
173
+ Data_Get_Struct(self, struct BloomFilter, bf);
174
+ return bf->r == 0 ? Qfalse : Qtrue;
175
+ }
176
+
177
+ static VALUE bf_num_set(VALUE self) {
178
+ struct BloomFilter *bf;
179
+ Data_Get_Struct(self, struct BloomFilter, bf);
180
+ return INT2FIX(bf->num_set);
181
+ }
182
+
183
+ static VALUE bf_insert(VALUE self, VALUE key) {
184
+ int index, seed;
185
+ int i, len, m, k, s;
186
+ char *ckey;
187
+
188
+ struct BloomFilter *bf;
189
+ Data_Get_Struct(self, struct BloomFilter, bf);
190
+
191
+ Check_Type(key, T_STRING);
192
+ ckey = STR2CSTR(key);
193
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
194
+
195
+ m = bf->m;
196
+ k = bf->k;
197
+ s = bf->s;
198
+
199
+ for (i = 0; i <= k - 1; i++) {
200
+ /* seeds for hash functions */
201
+ seed = i + s;
202
+
203
+ /* hash */
204
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
205
+
206
+ /* set a bit at the index */
207
+ bucket_set(bf, index);
208
+ }
209
+
210
+ bf->num_set += 1;
211
+ return Qnil;
212
+ }
213
+
214
+ static VALUE bf_delete(VALUE self, VALUE key) {
215
+ int index, seed;
216
+ int i, len, m, k, s;
217
+ char *ckey;
218
+
219
+ struct BloomFilter *bf;
220
+ Data_Get_Struct(self, struct BloomFilter, bf);
221
+
222
+ Check_Type(key, T_STRING);
223
+ ckey = STR2CSTR(key);
224
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
225
+
226
+ m = bf->m;
227
+ k = bf->k;
228
+ s = bf->s;
229
+
230
+ for (i = 0; i <= k - 1; i++) {
231
+ /* seeds for hash functions */
232
+ seed = i + s;
233
+
234
+ /* hash */
235
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
236
+
237
+ /* set a bit at the index */
238
+ bucket_unset(bf, index);
239
+ }
240
+
241
+ bf->num_set += 1;
242
+ return Qnil;
243
+ }
244
+
245
+
246
+ static VALUE bf_include(VALUE self, VALUE key) {
247
+ int index, seed;
248
+ int i, len, m, k, s;
249
+ char *ckey;
250
+
251
+ struct BloomFilter *bf;
252
+ Data_Get_Struct(self, struct BloomFilter, bf);
253
+
254
+ Check_Type(key, T_STRING);
255
+ ckey = STR2CSTR(key);
256
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
257
+
258
+ m = bf->m;
259
+ k = bf->k;
260
+ s = bf->s;
261
+
262
+ for (i = 0; i <= k - 1; i++) {
263
+ /* seeds for hash functions */
264
+ seed = i + s;
265
+
266
+ /* hash */
267
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
268
+
269
+ /* check the bit at the index */
270
+ if (!bucket_check(bf, index))
271
+ return Qfalse; /* i.e., it is a new entry ; escape the loop */
272
+ }
273
+
274
+ return Qtrue;
275
+ }
276
+
277
+ static VALUE bf_to_s(VALUE self) {
278
+ struct BloomFilter *bf;
279
+ unsigned char *ptr;
280
+ int i;
281
+ VALUE str;
282
+
283
+ Data_Get_Struct(self, struct BloomFilter, bf);
284
+ str = rb_str_new(0, bf->m);
285
+
286
+ ptr = (unsigned char *) RSTRING(str)->ptr;
287
+ for (i = 0; i < bf->m; i++)
288
+ *ptr++ = bucket_get(bf, i) ? '1' : '0';
289
+
290
+ return str;
291
+ }
292
+
293
+ void Init_sbloomfilter(void) {
294
+ cBloomFilter = rb_define_class("BloomFilter", rb_cObject);
295
+ rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
296
+ rb_define_method(cBloomFilter, "m", bf_m, 0);
297
+ rb_define_method(cBloomFilter, "k", bf_k, 0);
298
+ rb_define_method(cBloomFilter, "b", bf_b, 0);
299
+ rb_define_method(cBloomFilter, "r", bf_r, 0);
300
+ rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
301
+ rb_define_method(cBloomFilter, "insert", bf_insert, 1);
302
+ rb_define_method(cBloomFilter, "delete", bf_delete, 1);
303
+ rb_define_method(cBloomFilter, "include?", bf_include, 1);
304
+ rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
305
+
306
+ /* functions that have not been implemented, yet */
307
+
308
+ // rb_define_method(cBloomFilter, "clear", bf_clear, 0);
309
+ // rb_define_method(cBloomFilter, "&", bf_and, 1);
310
+ // rb_define_method(cBloomFilter, "|", bf_or, 1);
311
+ // rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
312
+ }
@@ -0,0 +1,31 @@
1
+ require 'sbloomfilter'
2
+
3
# Ruby-level additions to BloomFilter: a statistics printer and a small
# Hash-like interface that stores values in @hash_value while using the
# filter itself for membership checks.
class BloomFilter
  # Prints the filter parameters and the predicted false positive rate,
  # (1 - e^(-k*n/m))^k as a percentage, to standard output.
  #
  # Values are passed to printf as arguments rather than being pre-formatted
  # with String#% and then fed to printf as a second format string, and the
  # literal percent sign is written as %%.
  def stats
    fp = ((1.0 - Math.exp(-(k * num_set).to_f / m)) ** k) * 100
    printf("Number of filter buckets (m): %d\n", m)
    printf("Number of bits per buckets (b): %d\n", b)
    printf("Number of filter elements (n): %d\n", num_set)
    printf("Number of filter hashes (k) : %d\n", k)
    printf("Raise on overflow? (r) : %s\n", r.to_s)
    printf("Predicted false positive rate = %.2f%%\n", fp)
  end

  # Inserts +key+ into the filter and remembers +value+ for it.
  def []=(key, value)
    insert(key)
    @hash_value[key] = value
  end

  # Returns the value stored for +key+, or nil when the filter reports that
  # the key is absent.
  def [](key)
    return nil unless include?(key)

    @hash_value[key]
  end

  # Hash-style alias for include?.
  def key?(key)
    include?(key)
  end

  # Keys that have been stored through []=.
  def keys
    @hash_value.keys
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'bloomfilter'
2
+ require 'test/unit'
@@ -0,0 +1,44 @@
1
+ require 'helper'
2
+
3
+ class TestBloomFilter < Test::Unit::TestCase
4
+ def test_include?
5
+ bf = BloomFilter.new(10, 2, 1, 1, false)
6
+ bf.insert("test")
7
+ bf.insert("test")
8
+ bf.insert("test")
9
+ bf.insert("test")
10
+ bf.insert("test")
11
+ assert bf.include?("test")
12
+ assert !bf.include?("lkajdsfhlkajsdfhlakjsdfhalsjdkfh")
13
+ end
14
+
15
+ def test_hash_key_insert
16
+ bf = BloomFilter.new(10, 2, 1)
17
+ bf['foo'] = 'bar'
18
+ assert bf.key?('foo')
19
+ assert_equal 'bar', bf['foo']
20
+ end
21
+
22
+ def test_hash_key?
23
+ bf = BloomFilter.new(10, 2, 1)
24
+ assert !bf.key?('foo')
25
+ bf['foo'] = 'bar'
26
+ assert bf.key?('foo')
27
+ end
28
+
29
+ def test_keys
30
+ bf = BloomFilter.new(10, 2, 1)
31
+ bf['foo'] = 'bar'
32
+ bf['awesome'] = 'bar'
33
+ assert_equal %w{ awesome foo }.sort, bf.keys.sort
34
+ end
35
+
36
+ # After a key is deleted, include? must report it as absent again.
37
+ def test_delete
38
+ bf = BloomFilter.new(10, 2, 1, 2, false)
39
+ bf.insert("test")
40
+ assert bf.include?("test")
41
+ bf.delete("test")
42
+ assert !bf.include?("test")
43
+ end
44
+ end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: igrigorik-bloomfilter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Ilya Grigorik
8
+ - Tatsuya Mori
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-02-21 00:00:00 -08:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Counting Bloom Filter in Ruby
18
+ email: ilya@igvita.com
19
+ executables: []
20
+
21
+ extensions:
22
+ - ext/extconf.rb
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - README.rdoc
27
+ - Rakefile
28
+ - ext/crc32.c
29
+ - ext/crc32.h
30
+ - ext/extconf.rb
31
+ - ext/sbloomfilter.c
32
+ - lib/bloomfilter.rb
33
+ - examples/bf.rb
34
+ - examples/simple.rb
35
+ - test/helper.rb
36
+ - test/test_bloom_filter.rb
37
+ has_rdoc: true
38
+ homepage: http://github.com/igrigorik/bloomfilter
39
+ post_install_message:
40
+ rdoc_options: []
41
+
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ requirements: []
57
+
58
+ rubyforge_project: bloomfilter
59
+ rubygems_version: 1.2.0
60
+ signing_key:
61
+ specification_version: 2
62
+ summary: Counting Bloom Filter in Ruby
63
+ test_files: []
64
+