bloom-ruby 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ === 0.2.1 (2013-01-21)
2
+
3
+ * Clean up compiler warnings.
4
+
5
+ === 0.2.0 (2012-04-27)
6
+
7
+ * Added BloomFilter#bits
8
+
9
+ === 0.1.1 (2012-01-26)
10
+
11
+ * Store table size in dumps.
12
+
13
+ === 0.1.0 (2012-01-25)
14
+
15
+ * Initial version.
@@ -0,0 +1,67 @@
1
+ # BloomFilter
2
+
3
+ BloomFilter is a Ruby library that implements an in-memory [Bloom Filter](http://en.wikipedia.org/wiki/Bloom_filter).
4
+
5
+ ## Dependencies
6
+
7
+ * ruby 1.9.1 or later
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ gem install bloom-filter
13
+ ```
14
+
15
+ ## API
16
+
17
+ ```
18
+ BloomFilter
19
+ .new
20
+ .load
21
+ #dump
22
+ #insert
23
+ #include?
24
+ #bits
25
+ #binary
26
+
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ```ruby
32
+ require 'bloom-filter'
33
+
34
+ filter = BloomFilter.new
35
+
36
+ # auto-calculate optimum bitmap size based on maximum number of items stored and desired max error rate.
37
+ filter = BloomFilter.new size: 100_000, error_rate: 0.01
38
+
39
+ # specify bitmap size & number of hash functions explicitly.
40
+ filter = BloomFilter.new bits: 1_000_000, hashes: 4
41
+
42
+ filter.insert "foo"
43
+ filter.include? "foo" #=> true
44
+ filter.include? "bar" #=> false
45
+
46
+ filter.dump "/tmp/random.bloom"
47
+ filter = BloomFilter.load "/tmp/random.bloom"
48
+
49
+ bits = filter.bits #=> "10010100100111..."
50
+ binary = filter.binary #=> "\x83Ö\xAC\xEA\u00..."
51
+
52
+ filter2 = BloomFilter.new bits: 1_000_000, hashes: 4
53
+ filter2.binary = binary
54
+ filter2.include? "foo" #=> true
55
+ ```
56
+
57
+ ## See Also
58
+
59
+ [https://github.com/igrigorik/bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
60
+
61
+ ## Home Page
62
+
63
+ [https://github.com/deepfryed/bloom-filter](https://github.com/deepfryed/bloom-filter)
64
+
65
+ ## License
66
+
67
+ MIT
@@ -0,0 +1,265 @@
1
+ /*
2
+
3
+ Copyright (c) 2005-2008, Simon Howard
4
+
5
+ Permission to use, copy, modify, and/or distribute this software
6
+ for any purpose with or without fee is hereby granted, provided
7
+ that the above copyright notice and this permission notice appear
8
+ in all copies.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
11
+ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
12
+ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
13
+ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
14
+ CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
16
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
17
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18
+
19
+ */
20
+
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+
24
+ #include "bloom-filter.h"
25
+
26
+ /* malloc() / free() testing */
27
+
28
+ #ifdef ALLOC_TESTING
29
+ #include "alloc-testing.h"
30
+ #endif
31
+
32
+ /* Salt values. These salts are XORed with the output of the hash
33
+ * function to give multiple unique hashes. */
34
+
35
+ static const unsigned int salts[] = {
36
+ 0x5cee4612, 0xb5587b1c, 0xa250f2b0, 0xa3bf6d2a,
37
+ 0x7a81bd1a, 0x92888d7f, 0x1dc977c7, 0xedc96624,
38
+ 0x920c85d9, 0xf16066b3, 0xc6f0d4b3, 0x2b76eb86,
39
+ 0xcacb3893, 0x493d81c5, 0xf5a133ac, 0x039740bf,
40
+ 0x162b8224, 0xf841de90, 0xc3e5090d, 0x3bce93a7,
41
+ 0xf1860334, 0xe832b5f1, 0xf5b6535b, 0xe4cf4fa6,
42
+ 0x8357b769, 0x1442b07a, 0x21c5863d, 0xabc0d846,
43
+ 0x6dc0d77a, 0x23a3992c, 0xe12179ba, 0xd81d1e23,
44
+ 0xcff4727b, 0xe957ecfb, 0xee8f391a, 0x426efa23,
45
+ 0x3a34ff2c, 0x8b875d94, 0x34fd0f63, 0xf159daae,
46
+ 0xaabab8b3, 0xa83a07ba, 0x4e54fb33, 0xfb82fab8,
47
+ 0x2ae2888f, 0xd1a307a8, 0xbe33322d, 0x87c73f86,
48
+ 0x7270fa7e, 0x68673c55, 0x2c8026d0, 0xead8e422,
49
+ 0xa3ee5132, 0xecb67767, 0x1c3b1ae5, 0x47adf5b6,
50
+ 0xf4518d30, 0x46e62797, 0x9889aa76, 0x1405aadf,
51
+ 0xf62f9124, 0x5c435ac5, 0x35b8dfe3, 0x651c08c5,
52
+ };
53
+
54
+ BloomFilter *bloom_filter_new (unsigned int table_size, BloomFilterHashFunc hash_func, unsigned int num_functions) {
55
+ BloomFilter *filter;
56
+
57
+ /* There is a limit on the number of functions which can be
58
+ * applied, due to the table size */
59
+
60
+ if (num_functions > sizeof (salts) / sizeof (*salts)) {
61
+ return NULL;
62
+ }
63
+
64
+ /* Allocate bloom filter structure */
65
+
66
+ filter = malloc (sizeof (BloomFilter));
67
+
68
+ if (filter == NULL) {
69
+ return NULL;
70
+ }
71
+
72
+ /* Allocate table, each entry is one bit; these are packed into
73
+ * bytes. When allocating we must round the length up to the nearest
74
+ * byte. */
75
+
76
+ filter->table = calloc ((table_size + 7) / 8, 1);
77
+
78
+ if (filter->table == NULL) {
79
+ free (filter);
80
+ return NULL;
81
+ }
82
+
83
+ filter->hash_func = hash_func;
84
+ filter->num_functions = num_functions;
85
+ filter->table_size = table_size;
86
+
87
+ return filter;
88
+ }
89
+
90
+ void bloom_filter_free (BloomFilter * bloomfilter) {
91
+ free (bloomfilter->table);
92
+ free (bloomfilter);
93
+ }
94
+
95
+ void bloom_filter_insert (BloomFilter * bloomfilter, BloomFilterValue value) {
96
+ unsigned long hash;
97
+ unsigned long subhash;
98
+ unsigned int index;
99
+ unsigned int i;
100
+
101
+ /* Generate hash of the value to insert */
102
+
103
+ hash = bloomfilter->hash_func (value);
104
+
105
+ /* Generate multiple unique hashes by XORing with values in the
106
+ * salt table. */
107
+
108
+ for (i = 0; i < bloomfilter->num_functions; ++i) {
109
+
110
+ /* Generate a unique hash */
111
+
112
+ subhash = hash ^ salts[i];
113
+
114
+ /* Find the index into the table */
115
+
116
+ index = subhash % bloomfilter->table_size;
117
+
118
+ /* Insert into the table.
119
+ * index / 8 finds the byte index of the table,
120
+ * index % 8 gives the bit index within that byte to set. */
121
+
122
+ bloomfilter->table[index / 8] |= 1 << (index % 8);
123
+ }
124
+ }
125
+
126
+ int bloom_filter_query (BloomFilter * bloomfilter, BloomFilterValue value) {
127
+ unsigned long hash;
128
+ unsigned long subhash;
129
+ unsigned int index;
130
+ unsigned int i;
131
+ unsigned char b;
132
+ int bit;
133
+
134
+ /* Generate hash of the value to lookup */
135
+
136
+ hash = bloomfilter->hash_func (value);
137
+
138
+ /* Generate multiple unique hashes by XORing with values in the
139
+ * salt table. */
140
+
141
+ for (i = 0; i < bloomfilter->num_functions; ++i) {
142
+
143
+ /* Generate a unique hash */
144
+
145
+ subhash = hash ^ salts[i];
146
+
147
+ /* Find the index into the table to test */
148
+
149
+ index = subhash % bloomfilter->table_size;
150
+
151
+ /* The byte at index / 8 holds the value to test */
152
+
153
+ b = bloomfilter->table[index / 8];
154
+ bit = 1 << (index % 8);
155
+
156
+ /* Test if the particular bit is set; if it is not set,
157
+ * this value can not have been inserted. */
158
+
159
+ if ((b & bit) == 0) {
160
+ return 0;
161
+ }
162
+ }
163
+
164
+ /* All necessary bits were set. This may indicate that the value
165
+ * was inserted, or the values could have been set through other
166
+ * insertions. */
167
+
168
+ return 1;
169
+ }
170
+
171
+ void bloom_filter_read (BloomFilter * bloomfilter, unsigned char *array) {
172
+ unsigned int array_size;
173
+
174
+ /* The table is an array of bits, packed into bytes. Round up
175
+ * to the nearest byte. */
176
+
177
+ array_size = (bloomfilter->table_size + 7) / 8;
178
+
179
+ /* Copy into the buffer of the calling routine. */
180
+
181
+ memcpy (array, bloomfilter->table, array_size);
182
+ }
183
+
184
+ void bloom_filter_load (BloomFilter * bloomfilter, unsigned char *array) {
185
+ unsigned int array_size;
186
+
187
+ /* The table is an array of bits, packed into bytes. Round up
188
+ * to the nearest byte. */
189
+
190
+ array_size = (bloomfilter->table_size + 7) / 8;
191
+
192
+ /* Copy from the buffer of the calling routine. */
193
+
194
+ memcpy (bloomfilter->table, array, array_size);
195
+ }
196
+
197
+ BloomFilter *bloom_filter_union (BloomFilter * filter1, BloomFilter * filter2) {
198
+ BloomFilter *result;
199
+ unsigned int i;
200
+ unsigned int array_size;
201
+
202
+ /* To perform this operation, both filters must be created with
203
+ * the same values. */
204
+
205
+ if (filter1->table_size != filter2->table_size
206
+ || filter1->num_functions != filter2->num_functions || filter1->hash_func != filter2->hash_func) {
207
+ return NULL;
208
+ }
209
+
210
+ /* Create a new bloom filter for the result */
211
+
212
+ result = bloom_filter_new (filter1->table_size, filter1->hash_func, filter1->num_functions);
213
+
214
+ if (result == NULL) {
215
+ return NULL;
216
+ }
217
+
218
+ /* The table is an array of bits, packed into bytes. Round up
219
+ * to the nearest byte. */
220
+
221
+ array_size = (filter1->table_size + 7) / 8;
222
+
223
+ /* Populate the table of the new filter */
224
+
225
+ for (i = 0; i < array_size; ++i) {
226
+ result->table[i] = filter1->table[i] | filter2->table[i];
227
+ }
228
+
229
+ return result;
230
+ }
231
+
232
+ BloomFilter *bloom_filter_intersection (BloomFilter * filter1, BloomFilter * filter2) {
233
+ BloomFilter *result;
234
+ unsigned int i;
235
+ unsigned int array_size;
236
+
237
+ /* To perform this operation, both filters must be created with
238
+ * the same values. */
239
+
240
+ if (filter1->table_size != filter2->table_size
241
+ || filter1->num_functions != filter2->num_functions || filter1->hash_func != filter2->hash_func) {
242
+ return NULL;
243
+ }
244
+
245
+ /* Create a new bloom filter for the result */
246
+
247
+ result = bloom_filter_new (filter1->table_size, filter1->hash_func, filter1->num_functions);
248
+
249
+ if (result == NULL) {
250
+ return NULL;
251
+ }
252
+
253
+ /* The table is an array of bits, packed into bytes. Round up
254
+ * to the nearest byte. */
255
+
256
+ array_size = (filter1->table_size + 7) / 8;
257
+
258
+ /* Populate the table of the new filter */
259
+
260
+ for (i = 0; i < array_size; ++i) {
261
+ result->table[i] = filter1->table[i] & filter2->table[i];
262
+ }
263
+
264
+ return result;
265
+ }
@@ -0,0 +1,193 @@
1
+ /*
2
+
3
+ Copyright (c) 2005-2008, Simon Howard
4
+
5
+ Permission to use, copy, modify, and/or distribute this software
6
+ for any purpose with or without fee is hereby granted, provided
7
+ that the above copyright notice and this permission notice appear
8
+ in all copies.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
11
+ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
12
+ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
13
+ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
14
+ CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
16
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
17
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18
+
19
+ */
20
+
21
+ /**
22
+ * @file bloom-filter.h
23
+ *
24
+ * @brief Bloom filter
25
+ *
26
+ * A bloom filter is a space efficient data structure that can be
27
+ * used to test whether a given element is part of a set. Lookups
28
+ * will occasionally generate false positives, but never false
29
+ * negatives.
30
+ *
31
+ * To create a bloom filter, use @ref bloom_filter_new. To destroy a
32
+ * bloom filter, use @ref bloom_filter_free.
33
+ *
34
+ * To insert a value into a bloom filter, use @ref bloom_filter_insert.
35
+ *
36
+ * To query whether a value is part of the set, use
37
+ * @ref bloom_filter_query.
38
+ */
39
+
40
+ #ifndef ALGORITHM_BLOOM_FILTER_H
41
+ #define ALGORITHM_BLOOM_FILTER_H
42
+
43
+ #ifdef __cplusplus
44
+ extern "C" {
45
+ #endif
46
+
47
+ /**
48
+ * A value stored in a @ref BloomFilter.
49
+ */
50
+
51
typedef void *BloomFilterValue;

/**
 * Signature of the hash callback a bloom filter applies to each value
 * that is inserted or queried.
 *
 * @param data  The value to hash.
 * @return      The hash of the value.
 */

typedef unsigned long (*BloomFilterHashFunc) (BloomFilterValue data);

/**
 * State of a single bloom filter.
 */

struct BloomFilter {
    BloomFilterHashFunc hash_func;  /* hash applied to every value */
    unsigned char *table;           /* bit table, eight bits packed per byte */
    unsigned int table_size;        /* number of bits in the table */
    unsigned int num_functions;     /* salted hashes derived per value */
};

typedef struct BloomFilter BloomFilter;
73
+
74
+
75
+ /**
76
+ * Create a new bloom filter.
77
+ *
78
+ * @param table_size The size of the bloom filter. The greater
79
+ * the table size, the more elements can be
80
+ * stored, and the lesser the chance of false
81
+ * positives.
82
+ * @param hash_func Hash function to use on values stored in the
83
+ * filter.
84
+ * @param num_functions Number of hash functions to apply to each
85
+ * element on insertion. The running time for
86
+ * insertion and queries is proportional to this
87
+ * value. The more functions applied, the lesser
88
+ * the chance of false positives. The maximum
89
+ * number of functions is 64.
90
+ * @return A new bloom filter structure, or NULL if it
91
+ * was not possible to allocate the new bloom
92
+ * filter.
93
+ */
94
+
95
+ BloomFilter *bloom_filter_new (unsigned int table_size, BloomFilterHashFunc hash_func, unsigned int num_functions);
96
+
97
+ /**
98
+ * Destroy a bloom filter.
99
+ *
100
+ * @param bloomfilter The bloom filter to destroy.
101
+ */
102
+
103
+ void bloom_filter_free (BloomFilter * bloomfilter);
104
+
105
+ /**
106
+ * Insert a value into a bloom filter.
107
+ *
108
+ * @param bloomfilter The bloom filter.
109
+ * @param value The value to insert.
110
+ */
111
+
112
+ void bloom_filter_insert (BloomFilter * bloomfilter, BloomFilterValue value);
113
+
114
+ /**
115
+ * Query a bloom filter for a particular value.
116
+ *
117
+ * @param bloomfilter The bloom filter.
118
+ * @param value The value to look up.
119
+ * @return Zero if the value was definitely not
120
+ * inserted into the filter. Non-zero
121
+ * indicates that it either may or may not
122
+ * have been inserted.
123
+ */
124
+
125
+ int bloom_filter_query (BloomFilter * bloomfilter, BloomFilterValue value);
126
+
127
+ /**
128
+ * Read the contents of a bloom filter into an array.
129
+ *
130
+ * @param bloomfilter The bloom filter.
131
+ * @param array Pointer to the array to read into. This
132
+ * should be (table_size + 7) / 8 bytes in
133
+ * length.
134
+ */
135
+
136
+ void bloom_filter_read (BloomFilter * bloomfilter, unsigned char *array);
137
+
138
+ /**
139
+ * Load the contents of a bloom filter from an array.
140
+ * The data loaded should be the output read from @ref bloom_filter_read,
141
+ * from a bloom filter created using the same arguments used to create
142
+ * the original filter.
143
+ *
144
+ * @param bloomfilter The bloom filter.
145
+ * @param array Pointer to the array to load from. This
146
+ * should be (table_size + 7) / 8 bytes in
147
+ * length.
148
+ */
149
+
150
+ void bloom_filter_load (BloomFilter * bloomfilter, unsigned char *array);
151
+
152
+ /**
153
+ * Find the union of two bloom filters. Values are present in the
154
+ * resulting filter if they are present in either of the original
155
+ * filters.
156
+ *
157
+ * Both of the original filters must have been created using the
158
+ * same parameters to @ref bloom_filter_new.
159
+ *
160
+ * @param filter1 The first filter.
161
+ * @param filter2 The second filter.
162
+ * @return A new filter which is the union of the
163
+ * two filters, or NULL if it was not possible
164
+ * to allocate memory for the new filter, or
165
+ * if the two filters specified were created
166
+ * with different parameters.
167
+ */
168
+
169
+ BloomFilter *bloom_filter_union (BloomFilter * filter1, BloomFilter * filter2);
170
+
171
+ /**
172
+ * Find the intersection of two bloom filters. Values are only ever
173
+ * present in the resulting filter if they are present in both of the
174
+ * original filters.
175
+ *
176
+ * Both of the original filters must have been created using the
177
+ * same parameters to @ref bloom_filter_new.
178
+ *
179
+ * @param filter1 The first filter.
180
+ * @param filter2 The second filter.
181
+ * @return A new filter which is an intersection of the
182
+ * two filters, or NULL if it was not possible
183
+ * to allocate memory for the new filter, or
184
+ * if the two filters specified were created
185
+ * with different parameters.
186
+ */
187
+
188
+ BloomFilter *bloom_filter_intersection (BloomFilter * filter1, BloomFilter * filter2);
189
+
190
+ #ifdef __cplusplus
191
+ }
192
+ #endif
193
+ #endif /* #ifndef ALGORITHM_BLOOM_FILTER_H */
@@ -0,0 +1,248 @@
1
+ #include <sys/types.h>
2
+ #include <sys/stat.h>
3
+ #include <fcntl.h>
4
+ #include <unistd.h>
5
+ #include <errno.h>
6
+ #include <string.h>
7
+ #include <stdlib.h>
8
+ #include <math.h>
9
+
10
+ #include "ruby/ruby.h"
11
+ #include "bloom-filter.h"
12
+ #include "hash-string.h"
13
+ #include "version.h"
14
+
15
+ #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
16
+ #define CSTRING(v) RSTRING_PTR(TO_S(v))
17
+
18
/* Fixed-size header stored at the start of a dump file: bloom_dump
 * writes it before the bit table and bloom_load reads it back to
 * recreate a filter with the same parameters. */
struct FileHeader {
    uint64_t table_size;    /* number of bits in the filter */
    uint64_t num_functions; /* number of hash functions */
};

typedef struct FileHeader FileHeader;
22
+
23
+ static void bloom_free(BloomFilter *filter) {
24
+ if (filter)
25
+ bloom_filter_free(filter);
26
+ }
27
+
28
+ VALUE bloom_allocate(VALUE klass) {
29
+ BloomFilter *filter = 0;
30
+ return Data_Wrap_Struct(klass, 0, bloom_free, filter);
31
+ }
32
+
33
+ BloomFilter* bloom_handle(VALUE self) {
34
+ BloomFilter *filter = 0;
35
+ Data_Get_Struct(self, BloomFilter, filter);
36
+ if (!filter)
37
+ rb_raise(rb_eArgError, "invalid BloomFilter instance");
38
+ return filter;
39
+ }
40
+
41
+ VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
42
+ double error;
43
+ size_t nbits, nhash, nmax;
44
+
45
+ VALUE max_size, error_rate, bitmap_size, hash_count, options;
46
+ BloomFilter *filter = 0;
47
+
48
+ rb_scan_args(argc, argv, "01", &options);
49
+ if (!NIL_P(options) && TYPE(options) != T_HASH)
50
+ rb_raise(rb_eArgError, "invalid options, expect hash");
51
+
52
+ if (NIL_P(options)) {
53
+ nbits = 1000000;
54
+ nhash = 4;
55
+ }
56
+ else {
57
+ max_size = rb_hash_aref(options, ID2SYM(rb_intern("size")));
58
+ error_rate = rb_hash_aref(options, ID2SYM(rb_intern("error_rate")));
59
+ bitmap_size = rb_hash_aref(options, ID2SYM(rb_intern("bits")));
60
+ hash_count = rb_hash_aref(options, ID2SYM(rb_intern("hashes")));
61
+
62
+ nhash = NIL_P(hash_count) ? 4 : NUM2ULONG(hash_count);
63
+
64
+ if (!NIL_P(bitmap_size))
65
+ nbits = NUM2ULONG(bitmap_size);
66
+ else if (!NIL_P(max_size)) {
67
+ nmax = NUM2ULONG(max_size);
68
+ error = NIL_P(error_rate) ? 0.01 : NUM2DBL(error_rate);
69
+ nbits = ceil(fabs(log(error) * (double)nmax / pow(log(2), 2)));
70
+ nhash = ceil(0.7 * (double)nbits / (double)nmax);
71
+ }
72
+ else
73
+ rb_raise(rb_eArgError, "requires either size & error_rate or bits & hashes");
74
+ }
75
+
76
+ filter = bloom_filter_new(nbits, string_nocase_hash, nhash);
77
+
78
+ if (!filter)
79
+ rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");
80
+
81
+ DATA_PTR(self) = filter;
82
+ return self;
83
+ }
84
+
85
+
86
+ VALUE bloom_insert(VALUE klass, VALUE string) {
87
+ BloomFilter *filter = bloom_handle(klass);
88
+ bloom_filter_insert(filter, (BloomFilterValue)CSTRING(string));
89
+ return Qtrue;
90
+ }
91
+
92
+ VALUE bloom_include(VALUE klass, VALUE string) {
93
+ BloomFilter *filter = bloom_handle(klass);
94
+ return bloom_filter_query(filter, (BloomFilterValue)CSTRING(string)) ? Qtrue : Qfalse;
95
+ }
96
+
97
+ VALUE bloom_dump(VALUE klass, VALUE file) {
98
+ int fd;
99
+ void *buffer;
100
+ uint64_t nbits;
101
+ FileHeader header;
102
+ BloomFilter *filter = bloom_handle(klass);
103
+
104
+ nbits = (filter->table_size + 7) / 8;
105
+ buffer = malloc(nbits);
106
+
107
+ if (!buffer)
108
+ rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
109
+
110
+ bloom_filter_read(filter, buffer);
111
+
112
+ fd = open(CSTRING(file), O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR);
113
+ if (fd == -1) {
114
+ free(buffer);
115
+ rb_raise(rb_eIOError, "unable to open file. %s", strerror(errno));
116
+ }
117
+
118
+ header.table_size = filter->table_size;
119
+ header.num_functions = filter->num_functions;
120
+
121
+ if (write(fd, &header, sizeof(header)) == -1) {
122
+ free(buffer);
123
+ close(fd);
124
+ rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
125
+ }
126
+
127
+ if (write(fd, buffer, nbits) != -1) {
128
+ free(buffer);
129
+ close(fd);
130
+ return Qtrue;
131
+ }
132
+ else {
133
+ free(buffer);
134
+ close(fd);
135
+ rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
136
+ }
137
+
138
+ return Qfalse; // not reachable
139
+ }
140
+
141
+ VALUE bloom_bits(VALUE klass) {
142
+ BloomFilter *filter = bloom_handle(klass);
143
+
144
+ VALUE bitmap;
145
+ char *buffer;
146
+ unsigned char b;
147
+ int i = 0, bit, nbits = filter->table_size;
148
+
149
+ buffer = (char *)malloc(nbits);
150
+ if (!buffer)
151
+ rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
152
+
153
+ for (i = 0; i < nbits; i++) {
154
+ b = filter->table[i / 8];
155
+ bit = 1 << (i % 8);
156
+
157
+ if ((b & bit) == 0)
158
+ buffer[i] = '0';
159
+ else
160
+ buffer[i] = '1';
161
+ }
162
+
163
+ bitmap = rb_str_new(buffer, nbits);
164
+ free(buffer);
165
+ return bitmap;
166
+ }
167
+
168
+ VALUE bloom_binary(VALUE klass) {
169
+ BloomFilter *filter = bloom_handle(klass);
170
+
171
+ VALUE bitmap;
172
+ char *buffer;
173
+
174
+ int nbytes = (filter->table_size + 7) / 8;
175
+ buffer = (char *)malloc(nbytes);
176
+
177
+ if (!buffer)
178
+ rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
179
+
180
+ bloom_filter_read(filter, buffer);
181
+
182
+ bitmap = rb_str_new(buffer, nbytes);
183
+ free(buffer);
184
+ return bitmap;
185
+ }
186
+
187
+ VALUE bloom_binary_set(VALUE klass, VALUE buffer) {
188
+ BloomFilter *filter = bloom_handle(klass);
189
+ char* ptr = (char *) RSTRING_PTR(buffer);
190
+ bloom_filter_load(filter, ptr);
191
+ return Qtrue;
192
+ }
193
+
194
+ VALUE bloom_load(VALUE klass, VALUE file) {
195
+ int fd;
196
+ void *buffer;
197
+ size_t nbits, bytes;
198
+ FileHeader header;
199
+ BloomFilter *filter;
200
+ VALUE instance;
201
+
202
+ fd = open(CSTRING(file), O_RDONLY);
203
+ if (fd == -1)
204
+ rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));
205
+
206
+ if (read(fd, &header, sizeof(header)) != sizeof(header)) {
207
+ close(fd);
208
+ rb_raise(rb_eIOError, "unable to read file, header corrupted\n");
209
+ }
210
+
211
+ nbits = (header.table_size + 7) / 8;
212
+ buffer = malloc(nbits);
213
+ if (!buffer) {
214
+ close(fd);
215
+ rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
216
+ }
217
+
218
+ bytes = read(fd, buffer, nbits);
219
+ if (bytes != nbits) {
220
+ free(buffer);
221
+ close(fd);
222
+ rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", nbits, bytes);
223
+ }
224
+
225
+ filter = bloom_filter_new(header.table_size, string_nocase_hash, header.num_functions);
226
+ bloom_filter_load(filter, buffer);
227
+ instance = Data_Wrap_Struct(klass, 0, bloom_free, filter);
228
+
229
+ free(buffer);
230
+ close(fd);
231
+ return instance;
232
+ }
233
+
234
+ void Init_bloom_filter() {
235
+ VALUE cBloom = rb_define_class("BloomFilter", rb_cObject);
236
+
237
+ rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
238
+ rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
239
+ rb_define_method(cBloom, "bits", RUBY_METHOD_FUNC(bloom_bits), 0);
240
+ rb_define_method(cBloom, "binary", RUBY_METHOD_FUNC(bloom_binary), 0);
241
+ rb_define_method(cBloom, "binary=", RUBY_METHOD_FUNC(bloom_binary_set), 1);
242
+ rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
243
+ rb_define_method(cBloom, "include?", RUBY_METHOD_FUNC(bloom_include), 1);
244
+
245
+ rb_define_alloc_func(cBloom, bloom_allocate);
246
+ rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), 1);
247
+ rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
248
+ }
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby

# Generate the Makefile that builds the bloom_filter extension.
require 'mkmf'

create_makefile('bloom_filter')
@@ -0,0 +1,58 @@
1
+ /*
2
+
3
+ Copyright (c) 2005-2008, Simon Howard
4
+
5
+ Permission to use, copy, modify, and/or distribute this software
6
+ for any purpose with or without fee is hereby granted, provided
7
+ that the above copyright notice and this permission notice appear
8
+ in all copies.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
11
+ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
12
+ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
13
+ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
14
+ CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
16
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
17
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18
+
19
+ */
20
+
21
+ #include <ctype.h>
22
+
23
+ #include "hash-string.h"
24
+
25
+ /* String hash function */
26
+
27
/* Hash a NUL-terminated string. Starting from 5381, each byte is folded
 * in as (h << 5) ^ h ^ byte, a shift/XOR hash in the style of djb2. */
unsigned long string_hash (void *string) {
    const unsigned char *cursor;
    unsigned long acc = 5381;

    for (cursor = (const unsigned char *) string; *cursor != '\0'; ++cursor) {
        acc = ((acc << 5) ^ acc) ^ (*cursor);
    }

    return acc;
}
42
+
43
+ /* The same function, with a tolower on every character so that
44
+ * case is ignored. This code is duplicated for performance. */
45
+
46
/* Case-insensitive counterpart of string_hash: the same fold, with
 * every byte passed through tolower() first. */
unsigned long string_nocase_hash (void *string) {
    const unsigned char *cursor;
    unsigned long acc = 5381;

    for (cursor = (const unsigned char *) string; *cursor != '\0'; ++cursor) {
        acc = ((acc << 5) ^ acc) ^ tolower (*cursor);
    }

    return acc;
}
@@ -0,0 +1,56 @@
1
+ /*
2
+
3
+ Copyright (c) 2005-2008, Simon Howard
4
+
5
+ Permission to use, copy, modify, and/or distribute this software
6
+ for any purpose with or without fee is hereby granted, provided
7
+ that the above copyright notice and this permission notice appear
8
+ in all copies.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
11
+ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
12
+ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
13
+ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
14
+ CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
16
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
17
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18
+
19
+ */
20
+
21
+ /**
22
+ * @file hash-string.h
23
+ *
24
+ * Hash functions for text strings. For more information
25
+ * see @ref string_hash or @ref string_nocase_hash.
26
+ */
27
+
28
+ #ifndef ALGORITHM_HASH_STRING_H
29
+ #define ALGORITHM_HASH_STRING_H
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ /**
36
+ * Generate a hash key from a string.
37
+ *
38
+ * @param string The string.
39
+ * @return A hash key for the string.
40
+ */
41
+
42
+ unsigned long string_hash (void *string);
43
+
44
+ /**
45
+ * Generate a hash key from a string, ignoring the case of letters.
46
+ *
47
+ * @param string The string.
48
+ * @return A hash key for the string.
49
+ */
50
+
51
+ unsigned long string_nocase_hash (void *string);
52
+
53
+ #ifdef __cplusplus
54
+ }
55
+ #endif
56
+ #endif /* #ifndef ALGORITHM_HASH_STRING_H */
@@ -0,0 +1 @@
1
+ #define RUBY_BLOOM_FILTER_VERSION "0.2.1"
@@ -0,0 +1 @@
1
+ require 'bloom_filter'
@@ -0,0 +1,4 @@
1
# Shared test setup: load the library, then minitest. minitest/autorun
# pulls in the spec DSL and schedules the tests to run at exit; it
# replaces MiniTest::Unit.autorun, which minitest 5 removed.
require 'bloom-filter'
require 'minitest/autorun'
@@ -0,0 +1,29 @@
1
require 'helper'

# Construction, insertion, membership and bit-string behaviour.
describe 'BloomFilter primitives' do
  it 'should create one with default size' do
    created = BloomFilter.new
    assert created
  end

  it 'should create one with given size' do
    created = BloomFilter.new(size: 100)
    assert created
  end

  it 'should insert' do
    filter = BloomFilter.new(size: 100)
    assert filter
    assert filter.insert("foo")
  end

  it 'should allow membership checks' do
    filter = BloomFilter.new(size: 100)
    assert filter
    assert filter.insert("foo")
    assert filter.include?("foo")
    refute filter.include?("bar")
  end

  it 'should return bits as a string' do
    filter = BloomFilter.new(bits: 5, hashes: 2)
    assert filter
    assert filter.insert("a")
    assert_equal "10010", filter.bits
  end
end
@@ -0,0 +1,27 @@
1
require 'helper'
require 'tempfile'

# Round-trips a filter through a dump file and through its raw binary
# representation.
describe 'BloomFilter load & dump' do
  it 'should dump and load a filter' do
    file = Tempfile.new("bloom-filter-test")

    begin
      assert filter = BloomFilter.new(size: 100)
      assert filter.insert("foo")
      assert filter.dump(file.path), "dump filter"
      assert filter = BloomFilter.load(file.path)
      assert filter.include?("foo")
      assert !filter.include?("bar")
    ensure
      # Remove the temporary dump even when an assertion fails.
      file.close
      file.unlink
    end
  end

  it 'should accept assigning the bits directly' do
    assert filter = BloomFilter.new(bits: 80)
    assert filter.insert("foo")
    assert filter2 = BloomFilter.new(bits: 80)
    assert filter2.binary = filter.binary
    assert filter2.include?("foo")
    assert !filter2.include?("bar")
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bloom-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Bharanee Rathna
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-21 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake-compiler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: A fast Bloom Filter library for Ruby for unices.
47
+ email:
48
+ - deepfryed@gmail.com
49
+ executables: []
50
+ extensions:
51
+ - ext/extconf.rb
52
+ extra_rdoc_files: []
53
+ files:
54
+ - ext/bloom-filter.c
55
+ - ext/hash-string.c
56
+ - ext/bloom_filter.c
57
+ - ext/bloom-filter.h
58
+ - ext/hash-string.h
59
+ - ext/version.h
60
+ - ext/extconf.rb
61
+ - test/helper.rb
62
+ - test/test_io.rb
63
+ - test/test_basic.rb
64
+ - README.md
65
+ - CHANGELOG
66
+ - lib/bloom-filter.rb
67
+ homepage: http://github.com/deepfryed/bloom-filter
68
+ licenses: []
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ! '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ requirements: []
86
+ rubyforge_project:
87
+ rubygems_version: 1.8.25
88
+ signing_key:
89
+ specification_version: 3
90
+ summary: A fast Bloom Filter library for Ruby.
91
+ test_files: []