igrigorik-bloomfilter 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ = BloomFilter
2
+
3
+ Counting Bloom Filter implemented in Ruby.
4
+
5
+ A Bloom filter is a space-efficient probabilistic data structure that is used to
6
+ test whether an element is a member of a set. False positives are possible, but
7
+ false negatives are not. For more detail: http://en.wikipedia.org/wiki/Bloom_filter
8
+
9
+ == Implementation
10
+
11
+ Instead of using k different hash functions, this implementation seeds the CRC32 hash
12
+ with k different initial values (seed, seed+1, ..., seed+k-1). This may or may not give
13
+ you a good distribution; it all depends on the data.
14
+
15
+ == Example
16
+
17
+ require 'bloomfilter'
18
+
19
+ # M (size of bit array)
20
+ # K (number of hash functions)
21
+ # R (random seed); when the arguments are omitted the defaults are M=100000000, K=4, R=0
22
+
23
+ # M, K, R
24
+ bf = BloomFilter.new(10, 2, 1)
25
+ bf.insert("test")
26
+ bf.include?("test")
27
+ => true
28
+ bf.include?("test2")
29
+ => false
30
+ bf.delete("test")
31
+ bf.include?("test")
32
+ => false
33
+
34
+ # Hash with a bloom filter!
35
+ bf["test2"] = "bar"
36
+ bf["test2"]
37
+ => "bar"
38
+ bf["test3"]
39
+ => nil
40
+
41
+ bf.stats
42
+ Number of filter bits (m): 10
43
+ Number of filter elements (n): 2
44
+ Number of filter hashes (k) : 2
45
+ Predicted false positive rate = 10.87%
46
+
47
+
48
+ == Configuring Bloom Filter
49
+
50
+ Performance of the Bloom filter depends on a number of variables:
51
+ - size of the bit array
52
+ - number of hash functions
53
+
54
+ To figure out the values for these parameters, refer to:
55
+ http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/
56
+
57
+ == Credits
58
+ Tatsuya Mori <valdzone@gmail.com> (Original: http://vald.x0.com/sb/)
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
+ require 'rake'
2
+ require 'rake/clean'
3
+ require 'rake/rdoctask'
4
+ require 'rake/gempackagetask'
5
+ require 'rake/testtask'
6
+ require 'fileutils'
7
+ include FileUtils
8
+
9
+ # Default Rake task is compile
10
+ task :default => :compile
11
+
12
+ def make(makedir) # Runs `make` with makedir as the working directory.
13
+ Dir.chdir(makedir) { sh 'make' } # block form of Dir.chdir: the directory change only applies inside the block
14
+ end
15
+
16
+ def extconf(dir) # Runs extconf.rb with dir as the working directory.
17
+ Dir.chdir(dir) { ruby "extconf.rb" } # block form of Dir.chdir: the directory change only applies inside the block
18
+ end
19
+
20
+ def setup_extension(dir, extension)
21
+ ext = "ext/#{dir}"
22
+ ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
23
+ ext_files = FileList[
24
+ "#{ext}/*.c",
25
+ "#{ext}/*.h",
26
+ "#{ext}/extconf.rb",
27
+ "#{ext}/Makefile",
28
+ "lib"
29
+ ]
30
+
31
+ task "lib" do
32
+ directory "lib"
33
+ end
34
+
35
+ desc "Builds just the #{extension} extension"
36
+ task extension.to_sym => ["#{ext}/Makefile", ext_so ]
37
+
38
+ file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
39
+ extconf "#{ext}"
40
+ end
41
+
42
+ file ext_so => ext_files do
43
+ make "#{ext}"
44
+ cp ext_so, "lib"
45
+ end
46
+ end
47
+
48
+ setup_extension("", "sbloomfilter")
49
+
50
+ task :compile => [:sbloomfilter]
51
+
52
+ CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', '**/*.log', 'pkg']
53
+ CLEAN.include ['ext/Makefile']
54
+
55
+ Rake::TestTask.new do |t|
56
+ %w[ ext lib test ].each do |dir|
57
+ t.libs << dir
58
+ end
59
+
60
+ t.test_files = FileList['test/test_*.rb']
61
+ t.verbose = true
62
+ end
63
+ Rake::Task[:test].prerequisites << :compile
64
+
data/examples/bf.rb ADDED
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bitset'
3
+ require 'zlib'
4
+
5
+ #
6
+ # Pure ruby implementation of a Bloom filter, just for kicks
7
+ #
8
+
9
+ class BloomFilter # Bloom filter over BitSet bitmaps; BitSet is expected to come from the required 'bitset' library.
10
+
11
+ def initialize(max_entries, num_hashes, seed) # max_entries: bitmap size; num_hashes: hashes per key; seed: base CRC32 seed
12
+ @num_hashes = num_hashes
13
+ @size = max_entries.to_i
14
+ @bitmap = BitSet.new(@size) # accumulated bits of every inserted key
15
+ @__mask = BitSet.new(@size) # scratch bitmap, cleared and refilled by make_mask on each call
16
+ @seed = seed
17
+ end
18
+
19
+ def insert(key) # Adds key by combining its mask into the filter bitmap (assumes BitSet#| is a bitwise OR -- TODO confirm).
20
+ mask = make_mask(key)
21
+ @bitmap |= mask
22
+ end
23
+
24
+ def new?(key) # True when the key's mask is not fully contained in the filter bitmap.
25
+ mask = make_mask(key)
26
+ return ((@bitmap & mask) != mask);
27
+ end
28
+
29
+ def make_mask(key) # Returns the shared scratch bitmap with one position set per hash of key.
30
+ @__mask.clear
31
+ 0.upto(@num_hashes.to_i - 1) do |i|
32
+ hash = Zlib.crc32(key, i + @seed) # i-th hash: CRC32 started from the initial value i + @seed
33
+ @__mask.set(hash % @size, 1)
34
+ end
35
+ return @__mask
36
+ end
37
+ end
38
+
39
+
40
+ def main
41
+ bf = BloomFilter.new(1000000, 4, 0)
42
+ num = 0
43
+ while line = ARGF.gets
44
+ data = line.chop
45
+
46
+ if bf.new_entry?(data)
47
+ num += 1
48
+ bf.insert(data)
49
+ end
50
+ end
51
+ print "#element = #{num}\n"
52
+ end
53
+
54
+ main
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bloomfilter'
3
+
4
+ WORDS = %w(duck penguin bear panda)
5
+ TEST = %w(penguin moose racooon)
6
+
7
+ # m = 100, k = 4, seed = 1
8
+ bf = BloomFilter.new(100, 4, 1)
9
+
10
+ WORDS.each { |w| bf.insert(w) }
11
+ TEST.each do |w|
12
+ puts "#{w}: #{bf.include?(w)}"
13
+ end
14
+
15
+ bf.stats
16
+
17
+ # penguin: true
18
+ # moose: false
19
+ # racooon: false
20
+ #
21
+ # Number of filter buckets (m): 100
22
+ # Number of bits per buckets (b): 1
23
+ # Number of filter elements (n): 4
24
+ # Number of filter hashes (k) : 4
25
+ # Raise on overflow? (r) : false
26
+ # Predicted false positive rate = 0.05%
data/ext/crc32.c ADDED
@@ -0,0 +1,32 @@
1
+ /* simple CRC32 code */
2
+ /*
3
+ * Copyright 2005 Aris Adamantiadis
4
+ *
5
+ * This file is part of the SSH Library
6
+ *
7
+ * The SSH Library is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU Lesser General Public License as published by
9
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
10
+ * option) any later version.
11
+ *
12
+ *
13
+ * The SSH Library is distributed in the hope that it will be useful, but
14
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
+ * License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public License
19
+ * along with the SSH Library; see the file COPYING. If not, write to
20
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
+ * MA 02111-1307, USA. */
22
+
23
+ #include "crc32.h"
24
+
25
+ unsigned int crc32(unsigned int crc, char *buf, int len) { /* Table-driven CRC over len bytes of buf, starting from the caller-supplied value crc. */
26
+ while (len > 0) {
27
+ crc = crc_table[(crc ^ *buf) & 0xff] ^ (crc >> 8); /* & 0xff keeps the table index in 0..255 */
28
+ --len;
29
+ ++buf;
30
+ }
31
+ return crc; /* returned as-is: no inversion is applied on entry or on exit */
32
+ }
data/ext/crc32.h ADDED
@@ -0,0 +1,78 @@
1
+ /* simple CRC32 code */
2
+ /*
3
+ * Copyright 2005 Aris Adamantiadis
4
+ *
5
+ * This file is part of the SSH Library
6
+ *
7
+ * The SSH Library is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU Lesser General Public License as published by
9
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
10
+ * option) any later version.
11
+ *
12
+ *
13
+ * The SSH Library is distributed in the hope that it will be useful, but
14
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
+ * License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public License
19
+ * along with the SSH Library; see the file COPYING. If not, write to
20
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
+ * MA 02111-1307, USA. */
22
+
23
+ static unsigned int crc_table[] = {
24
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
25
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
26
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
27
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
28
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
29
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
30
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
31
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
32
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
33
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
34
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
35
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
36
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
37
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
38
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
39
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
40
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
41
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
42
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
43
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
44
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
45
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
46
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
47
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
48
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
49
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
50
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
51
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
52
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
53
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
54
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
55
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
56
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
57
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
58
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
59
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
60
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
61
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
62
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
63
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
64
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
65
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
66
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
67
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
68
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
69
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
70
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
71
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
72
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
73
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
74
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
75
+ 0x2d02ef8dUL
76
+ };
77
+
78
+ unsigned int crc32(unsigned int crc, char *buf, int len);
data/ext/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require "mkmf"
3
+
4
+ create_makefile("sbloomfilter")
@@ -0,0 +1,312 @@
1
+ /*
2
+ * sbloomfilter.c - simple Bloom Filter
3
+ * (c) Tatsuya Mori <valdzone@gmail.com>
4
+ */
5
+
6
+ #include "ruby.h"
7
+ #include "crc32.h"
8
+
9
+ static VALUE cBloomFilter;
10
+
11
+ struct BloomFilter {
12
+ int m; /* # of buckets in a bloom filter */
13
+ int b; /* # of bits in a bloom filter bucket (validated to 1..8 in bf_s_new) */
14
+ int k; /* # of hash functions */
15
+ int s; /* base seed: the i-th hash function is seeded with s + i */
16
+ int r; /* raise on bucket overflow? (bucket_set raises only when this is 1) */
17
+ int num_set; /* # of inserted keys, maintained by bf_insert/bf_delete (not a count of set bits) */
18
+ unsigned char *ptr; /* packed bucket data, allocated in bf_s_new */
19
+ };
20
+
21
+ void bits_free(struct BloomFilter *bf) { /* Free callback registered through Data_Make_Struct in bf_s_new. */
22
+ ruby_xfree(bf->ptr); /* packed bucket data */
23
+ ruby_xfree(bf); /* the struct itself: with a custom free callback nothing else releases it */
24
+ }
25
+
26
+ void bucket_unset(struct BloomFilter *bf, int index) { /* Decrements bucket `index`; an empty bucket is left at 0. */
27
+ int byte_offset = (index * bf->b) / 8;
28
+ int bit_offset = (index * bf->b) % 8;
29
+ unsigned int c = bf->ptr[byte_offset]; /* 16-bit window: a bucket of up to 8 bits may straddle two bytes */
30
+ c += bf->ptr[byte_offset + 1] << 8;
31
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
32
+ if ((c & mask) == 0) {
33
+ // do nothing
34
+ } else {
35
+ c -= 1u << bit_offset; /* subtract on the whole window so a borrow can cross the byte boundary */
36
+ bf->ptr[byte_offset] = c & 0xff;
37
+ bf->ptr[byte_offset + 1] = (c >> 8) & 0xff;
38
+ }
39
+ }
40
+
41
+ void bucket_set(struct BloomFilter *bf, int index) { /* Increments bucket `index`; a full bucket stays full (raises RuntimeError when bf->r == 1). */
42
+ int byte_offset = (index * bf->b) / 8;
43
+ int bit_offset = (index * bf->b) % 8;
44
+ unsigned int c = bf->ptr[byte_offset]; /* 16-bit window: a bucket of up to 8 bits may straddle two bytes */
45
+ c += bf->ptr[byte_offset + 1] << 8;
46
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
47
+ if ((c & mask) == mask) {
48
+ if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
49
+ } else {
50
+ c += 1u << bit_offset; /* add on the whole window so a carry can cross the byte boundary */
51
+ bf->ptr[byte_offset] = c & 0xff;
52
+ bf->ptr[byte_offset + 1] = (c >> 8) & 0xff;
53
+ }
54
+ }
55
+
56
+ int bucket_check(struct BloomFilter *bf, int index) { /* Returns the current value of bucket `index` (0 when empty). */
57
+ int byte_offset = (index * bf->b) / 8;
58
+ int bit_offset = (index * bf->b) % 8;
59
+ unsigned int c = bf->ptr[byte_offset]; /* read two consecutive bytes as one 16-bit window */
60
+ c += bf->ptr[byte_offset + 1] << 8;
61
+
62
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset; /* b one-bits positioned over the bucket */
63
+ return (c & mask) >> bit_offset;
64
+ }
65
+
66
+ int bucket_get(struct BloomFilter *bf, int index) { /* Returns the current value of bucket `index`; same computation as bucket_check. */
67
+ int byte_offset = (index * bf->b) / 8;
68
+ int bit_offset = (index * bf->b) % 8;
69
+ unsigned int c = bf->ptr[byte_offset]; /* read two consecutive bytes as one 16-bit window */
70
+ c += bf->ptr[byte_offset + 1] << 8;
71
+
72
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset; /* b one-bits positioned over the bucket */
73
+ return (c & mask) >> bit_offset;
74
+ }
75
+
76
+ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) { /* BloomFilter.new(m, k, s, b, r): trailing arguments are optional; raises ArgumentError on out-of-range values. */
77
+ struct BloomFilter *bf;
78
+ VALUE arg1, arg2, arg3, arg4, arg5, obj;
79
+ int m, k, s, b, r, bytes;
80
+
81
+ obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
82
+
83
+ if (argc == 5) {
84
+ arg1 = argv[0];
85
+ arg2 = argv[1];
86
+ arg3 = argv[2];
87
+ arg4 = argv[3];
88
+ arg5 = argv[4];
89
+ } else if (argc == 4) {
90
+ arg1 = argv[0];
91
+ arg2 = argv[1];
92
+ arg3 = argv[2];
93
+ arg4 = argv[3];
94
+ arg5 = Qfalse;
95
+ } else if (argc == 3) {
96
+ arg1 = argv[0];
97
+ arg2 = argv[1];
98
+ arg3 = argv[2];
99
+ arg4 = INT2FIX(1);
100
+ arg5 = Qfalse;
101
+ } else if (argc == 2) {
102
+ arg1 = argv[0];
103
+ arg2 = argv[1];
104
+ arg3 = INT2FIX(0);
105
+ arg4 = INT2FIX(1);
106
+ arg5 = Qfalse;
107
+ } else if (argc == 1) {
108
+ arg1 = argv[0];
109
+ arg2 = INT2FIX(4);
110
+ arg3 = INT2FIX(0);
111
+ arg4 = INT2FIX(1);
112
+ arg5 = Qfalse;
113
+ } else { /* default = Fugou approach :-) */
114
+ arg1 = INT2FIX(100000000);
115
+ arg2 = INT2FIX(4);
116
+ arg3 = INT2FIX(0);
117
+ arg4 = INT2FIX(1);
118
+ arg5 = Qfalse;
119
+ }
120
+
121
+ m = NUM2INT(arg1); /* NUM2INT type-checks its argument; FIX2INT would silently decode any VALUE */
122
+ k = NUM2INT(arg2);
123
+ s = NUM2INT(arg3);
124
+ b = NUM2INT(arg4);
125
+ r = FIXNUM_P(arg5) ? (FIX2INT(arg5) != 0) : RTEST(arg5); /* accept true/false/nil as well as 0/1 without relying on how Qtrue is encoded */
126
+
127
+ if (b < 1 || b > 8)
128
+ rb_raise(rb_eArgError, "bucket size");
129
+ if (m < 1)
130
+ rb_raise(rb_eArgError, "array size");
131
+ if (k < 1)
132
+ rb_raise(rb_eArgError, "hash length");
133
+ if (s < 0)
134
+ rb_raise(rb_eArgError, "random seed");
135
+
136
+ bf->b = b;
137
+ bf->m = m;
138
+ bf->k = k;
139
+ bf->s = s;
140
+ bf->r = r;
141
+ bf->num_set = 0;
142
+
143
+ bytes = ((m * b) + 15) / 8; /* round up to whole bytes, plus one spare byte because the bucket helpers always read ptr[byte_offset + 1] */
144
+ bf->ptr = ALLOC_N(unsigned char, bytes);
145
+
146
+ /* initialize the bits with zeros */
147
+ memset(bf->ptr, 0, bytes);
148
+ rb_iv_set(obj, "@hash_value", rb_hash_new()); /* backing Hash used by the []/[]=/keys methods defined in Ruby */
149
+
150
+ return obj;
151
+ }
152
+
153
+ static VALUE bf_m(VALUE self) { /* BloomFilter#m: number of buckets, as a Fixnum. */
154
+ struct BloomFilter *bf;
155
+ Data_Get_Struct(self, struct BloomFilter, bf);
156
+ return INT2FIX(bf->m);
157
+ }
158
+
159
+ static VALUE bf_k(VALUE self) { /* BloomFilter#k: number of hash functions, as a Fixnum. */
160
+ struct BloomFilter *bf;
161
+ Data_Get_Struct(self, struct BloomFilter, bf);
162
+ return INT2FIX(bf->k);
163
+ }
164
+
165
+ static VALUE bf_b(VALUE self) { /* BloomFilter#b: number of bits per bucket, as a Fixnum. */
166
+ struct BloomFilter *bf;
167
+ Data_Get_Struct(self, struct BloomFilter, bf);
168
+ return INT2FIX(bf->b);
169
+ }
170
+
171
+ static VALUE bf_r(VALUE self) { /* BloomFilter#r: false when the raise-on-overflow field is 0, true otherwise. */
172
+ struct BloomFilter *bf;
173
+ Data_Get_Struct(self, struct BloomFilter, bf);
174
+ return bf->r == 0 ? Qfalse : Qtrue;
175
+ }
176
+
177
+ static VALUE bf_num_set(VALUE self) { /* BloomFilter#num_set: current value of the num_set counter, as a Fixnum. */
178
+ struct BloomFilter *bf;
179
+ Data_Get_Struct(self, struct BloomFilter, bf);
180
+ return INT2FIX(bf->num_set);
181
+ }
182
+
183
+ static VALUE bf_insert(VALUE self, VALUE key) { /* BloomFilter#insert(key): increments the k buckets selected by key; key must be a String; returns nil. */
184
+ int index, seed;
185
+ int i, len, m, k, s;
186
+ char *ckey;
187
+
188
+ struct BloomFilter *bf;
189
+ Data_Get_Struct(self, struct BloomFilter, bf);
190
+
191
+ Check_Type(key, T_STRING);
192
+ ckey = STR2CSTR(key);
193
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
194
+
195
+ m = bf->m;
196
+ k = bf->k;
197
+ s = bf->s;
198
+
199
+ for (i = 0; i <= k - 1; i++) {
200
+ /* seeds for hash functions */
201
+ seed = i + s;
202
+
203
+ /* hash */
204
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m)); /* reduce the CRC to a bucket index in 0..m-1 */
205
+
206
+ /* set a bit at the index */
207
+ bucket_set(bf, index);
208
+ }
209
+
210
+ bf->num_set += 1; /* counts insert calls; repeated inserts of the same key are counted again */
211
+ return Qnil;
212
+ }
213
+
214
+ static VALUE bf_delete(VALUE self, VALUE key) { /* BloomFilter#delete(key): decrements the k buckets selected by key; key must be a String; returns nil. */
215
+ int index, seed;
216
+ int i, len, m, k, s;
217
+ char *ckey;
218
+
219
+ struct BloomFilter *bf;
220
+ Data_Get_Struct(self, struct BloomFilter, bf);
221
+
222
+ Check_Type(key, T_STRING);
223
+ ckey = STR2CSTR(key);
224
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
225
+
226
+ m = bf->m;
227
+ k = bf->k;
228
+ s = bf->s;
229
+
230
+ for (i = 0; i <= k - 1; i++) {
231
+ /* seeds for hash functions */
232
+ seed = i + s;
233
+
234
+ /* hash */
235
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m));
236
+
237
+ /* decrement the bucket at the index */
238
+ bucket_unset(bf, index);
239
+ }
240
+
241
+ if (bf->num_set > 0) bf->num_set -= 1; /* a removal lowers the element count; never let it go negative */
242
+ return Qnil;
243
+ }
244
+
245
+
246
+ static VALUE bf_include(VALUE self, VALUE key) { /* BloomFilter#include?(key): true when all k buckets selected by key are non-zero; key must be a String. */
247
+ int index, seed;
248
+ int i, len, m, k, s;
249
+ char *ckey;
250
+
251
+ struct BloomFilter *bf;
252
+ Data_Get_Struct(self, struct BloomFilter, bf);
253
+
254
+ Check_Type(key, T_STRING);
255
+ ckey = STR2CSTR(key);
256
+ len = (int) (RSTRING(key)->len); /* length of the string in bytes */
257
+
258
+ m = bf->m;
259
+ k = bf->k;
260
+ s = bf->s;
261
+
262
+ for (i = 0; i <= k - 1; i++) {
263
+ /* seeds for hash functions */
264
+ seed = i + s;
265
+
266
+ /* hash */
267
+ index = (int) (crc32((unsigned int) (seed), ckey, len) % (unsigned int) (m)); /* same index computation as bf_insert */
268
+
269
+ /* check the bit at the index */
270
+ if (!bucket_check(bf, index))
271
+ return Qfalse; /* i.e., it is a new entry ; escape the loop */
272
+ }
273
+
274
+ return Qtrue;
275
+ }
276
+
277
+ static VALUE bf_to_s(VALUE self) { /* BloomFilter#to_s: String of m characters, '1' for each non-zero bucket and '0' otherwise. */
278
+ struct BloomFilter *bf;
279
+ unsigned char *ptr;
280
+ int i;
281
+ VALUE str;
282
+
283
+ Data_Get_Struct(self, struct BloomFilter, bf);
284
+ str = rb_str_new(0, bf->m); /* string of length m whose bytes are filled in below */
285
+
286
+ ptr = (unsigned char *) RSTRING(str)->ptr;
287
+ for (i = 0; i < bf->m; i++)
288
+ *ptr++ = bucket_get(bf, i) ? '1' : '0';
289
+
290
+ return str;
291
+ }
292
+
293
+ void Init_sbloomfilter(void) { /* Extension entry point: defines the BloomFilter class and binds its methods to the C functions above. */
294
+ cBloomFilter = rb_define_class("BloomFilter", rb_cObject);
295
+ rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1); /* -1: receives (argc, argv, self), so the argument count is variable */
296
+ rb_define_method(cBloomFilter, "m", bf_m, 0);
297
+ rb_define_method(cBloomFilter, "k", bf_k, 0);
298
+ rb_define_method(cBloomFilter, "b", bf_b, 0);
299
+ rb_define_method(cBloomFilter, "r", bf_r, 0);
300
+ rb_define_method(cBloomFilter, "num_set", bf_num_set, 0);
301
+ rb_define_method(cBloomFilter, "insert", bf_insert, 1);
302
+ rb_define_method(cBloomFilter, "delete", bf_delete, 1);
303
+ rb_define_method(cBloomFilter, "include?", bf_include, 1);
304
+ rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
305
+
306
+ /* functions that have not been implemented, yet */
307
+
308
+ // rb_define_method(cBloomFilter, "clear", bf_clear, 0);
309
+ // rb_define_method(cBloomFilter, "&", bf_and, 1);
310
+ // rb_define_method(cBloomFilter, "|", bf_or, 1);
311
+ // rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
312
+ }
@@ -0,0 +1,31 @@
1
+ require 'sbloomfilter'
2
+
3
+ class BloomFilter
4
+ def stats
5
+ fp = ((1.0 - Math.exp(-(self.k * self.num_set).to_f / self.m)) ** self.k) * 100
6
+ printf "Number of filter buckets (m): %d\n" % self.m
7
+ printf "Number of bits per buckets (b): %d\n" % self.b
8
+ printf "Number of filter elements (n): %d\n" % self.num_set
9
+ printf "Number of filter hashes (k) : %d\n" % self.k
10
+ printf "Raise on overflow? (r) : %s\n" % self.r.to_s
11
+ printf "Predicted false positive rate = %.2f%\n" % fp
12
+ end
13
+
14
+ def []= key, value
15
+ insert(key)
16
+ @hash_value[key] = value
17
+ end
18
+
19
+ def [] key
20
+ return nil unless include?(key)
21
+ @hash_value[key]
22
+ end
23
+
24
+ def key? key
25
+ include?(key)
26
+ end
27
+
28
+ def keys
29
+ @hash_value.keys
30
+ end
31
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'bloomfilter'
2
+ require 'test/unit'
@@ -0,0 +1,44 @@
1
+ require 'helper'
2
+
3
+ class TestBloomFilter < Test::Unit::TestCase
4
+ def test_include?
5
+ bf = BloomFilter.new(10, 2, 1, 1, false)
6
+ bf.insert("test")
7
+ bf.insert("test")
8
+ bf.insert("test")
9
+ bf.insert("test")
10
+ bf.insert("test")
11
+ assert bf.include?("test")
12
+ assert !bf.include?("lkajdsfhlkajsdfhlakjsdfhalsjdkfh")
13
+ end
14
+
15
+ def test_hash_key_insert
16
+ bf = BloomFilter.new(10, 2, 1)
17
+ bf['foo'] = 'bar'
18
+ assert bf.key?('foo')
19
+ assert_equal 'bar', bf['foo']
20
+ end
21
+
22
+ def test_hash_key?
23
+ bf = BloomFilter.new(10, 2, 1)
24
+ assert !bf.key?('foo')
25
+ bf['foo'] = 'bar'
26
+ assert bf.key?('foo')
27
+ end
28
+
29
+ def test_keys
30
+ bf = BloomFilter.new(10, 2, 1)
31
+ bf['foo'] = 'bar'
32
+ bf['awesome'] = 'bar'
33
+ assert_equal %w{ awesome foo }.sort, bf.keys.sort
34
+ end
35
+
36
+ # Covers BloomFilter#delete (the filter is built with 2-bit buckets).
37
+ def test_delete
38
+ bf = BloomFilter.new(10, 2, 1, 2, false)
39
+ bf.insert("test")
40
+ assert bf.include?("test")
41
+ bf.delete("test")
42
+ assert !bf.include?("test")
43
+ end
44
+ end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: igrigorik-bloomfilter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Ilya Grigorik
8
+ - Tatsuya Mori
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-02-21 00:00:00 -08:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Counting Bloom Filter in Ruby
18
+ email: ilya@igvita.com
19
+ executables: []
20
+
21
+ extensions:
22
+ - ext/extconf.rb
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - README.rdoc
27
+ - Rakefile
28
+ - ext/crc32.c
29
+ - ext/crc32.h
30
+ - ext/extconf.rb
31
+ - ext/sbloomfilter.c
32
+ - lib/bloomfilter.rb
33
+ - examples/bf.rb
34
+ - examples/simple.rb
35
+ - test/helper.rb
36
+ - test/test_bloom_filter.rb
37
+ has_rdoc: true
38
+ homepage: http://github.com/igrigorik/bloomfilter
39
+ post_install_message:
40
+ rdoc_options: []
41
+
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ requirements: []
57
+
58
+ rubyforge_project: bloomfilter
59
+ rubygems_version: 1.2.0
60
+ signing_key:
61
+ specification_version: 2
62
+ summary: Counting Bloom Filter in Ruby
63
+ test_files: []
64
+