bloom-filter 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/README.md +49 -0
- data/ext/bloom-filter.c +265 -0
- data/ext/bloom-filter.h +193 -0
- data/ext/bloom_filter.c +150 -0
- data/ext/extconf.rb +4 -0
- data/ext/hash-string.c +58 -0
- data/ext/hash-string.h +56 -0
- data/ext/version.h +1 -0
- data/lib/bloom-filter.rb +1 -0
- data/lib/bloom_filter.so +0 -0
- data/test/helper.rb +4 -0
- data/test/test_basic.rb +23 -0
- data/test/test_io.rb +15 -0
- metadata +103 -0
data/CHANGELOG
ADDED
data/README.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# BloomFilter
|
2
|
+
|
3
|
+
BloomFilter is a Ruby library that implements an in-memory [Bloom Filter](http://en.wikipedia.org/wiki/Bloom_filter)
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
* ruby 1.9.1 or later
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```
|
12
|
+
gem install bloom-filter
|
13
|
+
```
|
14
|
+
|
15
|
+
## API
|
16
|
+
|
17
|
+
```
|
18
|
+
BloomFilter
|
19
|
+
.new
|
20
|
+
.load
|
21
|
+
#dump
|
22
|
+
#insert
|
23
|
+
#exists?
|
24
|
+
|
25
|
+
```
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
require 'bloom-filter'
|
31
|
+
|
32
|
+
filter = BloomFilter.new
|
33
|
+
filter = BloomFilter.new 100_000
|
34
|
+
|
35
|
+
filter.insert "foo"
|
36
|
+
filter.exists? "foo" #=> true
|
37
|
+
filter.exists? "bar" #=> false
|
38
|
+
|
39
|
+
filter.dump "/tmp/random.bloom"
|
40
|
+
filter = BloomFilter.load "/tmp/random.bloom", 100_000
|
41
|
+
```
|
42
|
+
|
43
|
+
## See Also
|
44
|
+
|
45
|
+
[https://github.com/igrigorik/bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
|
46
|
+
|
47
|
+
## License
|
48
|
+
|
49
|
+
[Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
|
data/ext/bloom-filter.c
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
Copyright (c) 2005-2008, Simon Howard
|
4
|
+
|
5
|
+
Permission to use, copy, modify, and/or distribute this software
|
6
|
+
for any purpose with or without fee is hereby granted, provided
|
7
|
+
that the above copyright notice and this permission notice appear
|
8
|
+
in all copies.
|
9
|
+
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
11
|
+
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
12
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
13
|
+
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
14
|
+
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
15
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
16
|
+
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
17
|
+
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
18
|
+
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include "bloom-filter.h"
|
25
|
+
|
26
|
+
/* malloc() / free() testing */
|
27
|
+
|
28
|
+
#ifdef ALLOC_TESTING
|
29
|
+
#include "alloc-testing.h"
|
30
|
+
#endif
|
31
|
+
|
32
|
+
/* Salt values. These salts are XORed with the output of the hash
|
33
|
+
* function to give multiple unique hashes. */
|
34
|
+
|
35
|
+
static const unsigned int salts[] = {
|
36
|
+
0x5cee4612, 0xb5587b1c, 0xa250f2b0, 0xa3bf6d2a,
|
37
|
+
0x7a81bd1a, 0x92888d7f, 0x1dc977c7, 0xedc96624,
|
38
|
+
0x920c85d9, 0xf16066b3, 0xc6f0d4b3, 0x2b76eb86,
|
39
|
+
0xcacb3893, 0x493d81c5, 0xf5a133ac, 0x039740bf,
|
40
|
+
0x162b8224, 0xf841de90, 0xc3e5090d, 0x3bce93a7,
|
41
|
+
0xf1860334, 0xe832b5f1, 0xf5b6535b, 0xe4cf4fa6,
|
42
|
+
0x8357b769, 0x1442b07a, 0x21c5863d, 0xabc0d846,
|
43
|
+
0x6dc0d77a, 0x23a3992c, 0xe12179ba, 0xd81d1e23,
|
44
|
+
0xcff4727b, 0xe957ecfb, 0xee8f391a, 0x426efa23,
|
45
|
+
0x3a34ff2c, 0x8b875d94, 0x34fd0f63, 0xf159daae,
|
46
|
+
0xaabab8b3, 0xa83a07ba, 0x4e54fb33, 0xfb82fab8,
|
47
|
+
0x2ae2888f, 0xd1a307a8, 0xbe33322d, 0x87c73f86,
|
48
|
+
0x7270fa7e, 0x68673c55, 0x2c8026d0, 0xead8e422,
|
49
|
+
0xa3ee5132, 0xecb67767, 0x1c3b1ae5, 0x47adf5b6,
|
50
|
+
0xf4518d30, 0x46e62797, 0x9889aa76, 0x1405aadf,
|
51
|
+
0xf62f9124, 0x5c435ac5, 0x35b8dfe3, 0x651c08c5,
|
52
|
+
};
|
53
|
+
|
54
|
+
/* Create a bloom filter with table_size bits that applies num_functions
 * salted variants of hash_func to every value.
 *
 * Returns NULL when num_functions exceeds the number of available salts,
 * when table_size is zero or too large to round up to whole bytes, or
 * when memory cannot be allocated.  The caller owns the result and
 * releases it with bloom_filter_free. */
BloomFilter *bloom_filter_new (unsigned int table_size, BloomFilterHashFunc hash_func, unsigned int num_functions) {
    BloomFilter *filter;

    /* Every hash function needs its own salt, so the size of the salt
     * table bounds how many functions can be applied. */

    if (num_functions > sizeof (salts) / sizeof (*salts)) {
        return NULL;
    }

    /* A zero-bit table would make insert/query evaluate "hash % 0".
     * A size within 7 of the unsigned maximum would wrap the
     * (table_size + 7) / 8 byte count used throughout this file and
     * yield a table far smaller than the indices written to it. */

    if (table_size == 0 || table_size > (unsigned int) -1 - 7) {
        return NULL;
    }

    /* Allocate bloom filter structure */

    filter = malloc (sizeof (*filter));

    if (filter == NULL) {
        return NULL;
    }

    /* Allocate table, each entry is one bit; these are packed into
     * bytes.  When allocating we must round the length up to the nearest
     * byte.  calloc leaves every bit cleared. */

    filter->table = calloc ((table_size + 7) / 8, 1);

    if (filter->table == NULL) {
        free (filter);
        return NULL;
    }

    filter->hash_func = hash_func;
    filter->num_functions = num_functions;
    filter->table_size = table_size;

    return filter;
}
|
89
|
+
|
90
|
+
/* Release a bloom filter together with its bit table.
 * Passing NULL is a no-op, mirroring free(). */
void bloom_filter_free (BloomFilter * bloomfilter) {
    if (bloomfilter == NULL) {
        return;
    }

    free (bloomfilter->table);
    free (bloomfilter);
}
|
94
|
+
|
95
|
+
/* Record a value in the filter by setting one bit per hash function. */
void bloom_filter_insert (BloomFilter * bloomfilter, BloomFilterValue value) {
    /* The value is hashed once; each salt then derives a distinct hash. */
    const unsigned long base_hash = bloomfilter->hash_func (value);
    unsigned int n;

    for (n = 0; n < bloomfilter->num_functions; ++n) {
        unsigned long salted = base_hash ^ salts[n];

        /* Bit position inside the table. */
        unsigned int bit_pos = salted % bloomfilter->table_size;

        /* bit_pos >> 3 selects the byte, bit_pos & 7 the bit within it. */
        bloomfilter->table[bit_pos >> 3] |= 1 << (bit_pos & 7);
    }
}
|
125
|
+
|
126
|
+
/* Test whether a value may have been inserted.
 * Returns 0 when at least one of its bits is clear (definitely absent),
 * 1 when every bit is set (present, or a false positive). */
int bloom_filter_query (BloomFilter * bloomfilter, BloomFilterValue value) {
    /* The value is hashed once; each salt then derives a distinct hash. */
    const unsigned long base_hash = bloomfilter->hash_func (value);
    unsigned int n = 0;

    while (n < bloomfilter->num_functions) {
        unsigned int bit_pos = (base_hash ^ salts[n]) % bloomfilter->table_size;
        unsigned char cell = bloomfilter->table[bit_pos / 8];
        int mask = 1 << (bit_pos % 8);

        /* A single clear bit proves the value was never inserted. */
        if (!(cell & mask)) {
            return 0;
        }

        ++n;
    }

    /* Every probed bit was set, possibly by other insertions. */
    return 1;
}
|
170
|
+
|
171
|
+
/* Copy the packed bit table into the caller's buffer, which must hold
 * (table_size + 7) / 8 bytes. */
void bloom_filter_read (BloomFilter * bloomfilter, unsigned char *array) {
    /* Bits are packed eight to a byte; round up to whole bytes. */
    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;

    memcpy (array, bloomfilter->table, byte_count);
}
|
183
|
+
|
184
|
+
/* Overwrite the packed bit table with (table_size + 7) / 8 bytes taken
 * from the caller's buffer. */
void bloom_filter_load (BloomFilter * bloomfilter, unsigned char *array) {
    /* Bits are packed eight to a byte; round up to whole bytes. */
    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;

    memcpy (bloomfilter->table, array, byte_count);
}
|
196
|
+
|
197
|
+
/* Build a new filter whose table is the bitwise OR of both inputs.
 * Returns NULL when the filters differ in table size, function count or
 * hash function, or when the result cannot be allocated. */
BloomFilter *bloom_filter_union (BloomFilter * filter1, BloomFilter * filter2) {
    BloomFilter *merged;
    unsigned int byte_count;
    unsigned int pos;

    /* Tables are only comparable when built with identical parameters. */
    if (filter1->table_size != filter2->table_size) {
        return NULL;
    }
    if (filter1->num_functions != filter2->num_functions) {
        return NULL;
    }
    if (filter1->hash_func != filter2->hash_func) {
        return NULL;
    }

    merged = bloom_filter_new (filter1->table_size, filter1->hash_func, filter1->num_functions);
    if (merged == NULL) {
        return NULL;
    }

    /* Bits are packed eight to a byte; round up to whole bytes. */
    byte_count = (filter1->table_size + 7) / 8;

    for (pos = 0; pos != byte_count; pos++) {
        merged->table[pos] = filter1->table[pos] | filter2->table[pos];
    }

    return merged;
}
|
231
|
+
|
232
|
+
/* Build a new filter whose table is the bitwise AND of both inputs.
 * Returns NULL when the filters differ in table size, function count or
 * hash function, or when the result cannot be allocated. */
BloomFilter *bloom_filter_intersection (BloomFilter * filter1, BloomFilter * filter2) {
    BloomFilter *common;
    unsigned int byte_count;
    unsigned int pos;

    /* Tables are only comparable when built with identical parameters. */
    if (filter1->table_size != filter2->table_size) {
        return NULL;
    }
    if (filter1->num_functions != filter2->num_functions) {
        return NULL;
    }
    if (filter1->hash_func != filter2->hash_func) {
        return NULL;
    }

    common = bloom_filter_new (filter1->table_size, filter1->hash_func, filter1->num_functions);
    if (common == NULL) {
        return NULL;
    }

    /* Bits are packed eight to a byte; round up to whole bytes. */
    byte_count = (filter1->table_size + 7) / 8;

    for (pos = 0; pos != byte_count; pos++) {
        common->table[pos] = filter1->table[pos] & filter2->table[pos];
    }

    return common;
}
|
data/ext/bloom-filter.h
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
Copyright (c) 2005-2008, Simon Howard
|
4
|
+
|
5
|
+
Permission to use, copy, modify, and/or distribute this software
|
6
|
+
for any purpose with or without fee is hereby granted, provided
|
7
|
+
that the above copyright notice and this permission notice appear
|
8
|
+
in all copies.
|
9
|
+
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
11
|
+
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
12
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
13
|
+
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
14
|
+
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
15
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
16
|
+
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
17
|
+
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
18
|
+
|
19
|
+
*/
|
20
|
+
|
21
|
+
/**
|
22
|
+
* @file bloom-filter.h
|
23
|
+
*
|
24
|
+
* @brief Bloom filter
|
25
|
+
*
|
26
|
+
* A bloom filter is a space efficient data structure that can be
|
27
|
+
* used to test whether a given element is part of a set. Lookups
|
28
|
+
* will occasionally generate false positives, but never false
|
29
|
+
* negatives.
|
30
|
+
*
|
31
|
+
* To create a bloom filter, use @ref bloom_filter_new. To destroy a
|
32
|
+
* bloom filter, use @ref bloom_filter_free.
|
33
|
+
*
|
34
|
+
* To insert a value into a bloom filter, use @ref bloom_filter_insert.
|
35
|
+
*
|
36
|
+
* To query whether a value is part of the set, use
|
37
|
+
* @ref bloom_filter_query.
|
38
|
+
*/
|
39
|
+
|
40
|
+
#ifndef ALGORITHM_BLOOM_FILTER_H
|
41
|
+
#define ALGORITHM_BLOOM_FILTER_H
|
42
|
+
|
43
|
+
#ifdef __cplusplus
|
44
|
+
extern "C" {
|
45
|
+
#endif
|
46
|
+
|
47
|
+
/**
|
48
|
+
* A value stored in a @ref BloomFilter.
|
49
|
+
*/
|
50
|
+
|
51
|
+
typedef void *BloomFilterValue;
|
52
|
+
|
53
|
+
/**
|
54
|
+
* Hash function used to generate hash values for values inserted into a
|
55
|
+
* bloom filter.
|
56
|
+
*
|
57
|
+
* @param data The value to generate a hash value for.
|
58
|
+
* @return The hash value.
|
59
|
+
*/
|
60
|
+
|
61
|
+
typedef unsigned long (*BloomFilterHashFunc) (BloomFilterValue data);
|
62
|
+
|
63
|
+
/**
|
64
|
+
* A bloom filter structure.
|
65
|
+
*/
|
66
|
+
|
67
|
+
typedef struct BloomFilter {
|
68
|
+
BloomFilterHashFunc hash_func; /* hash applied to each value before salting */
|
69
|
+
unsigned char *table; /* bit table, packed into (table_size + 7) / 8 bytes */
|
70
|
+
unsigned int table_size; /* number of bits in the table */
|
71
|
+
unsigned int num_functions; /* number of salted hashes used per insert/query */
|
72
|
+
} BloomFilter;
|
73
|
+
|
74
|
+
|
75
|
+
/**
|
76
|
+
* Create a new bloom filter.
|
77
|
+
*
|
78
|
+
* @param table_size The size of the bloom filter. The greater
|
79
|
+
* the table size, the more elements can be
|
80
|
+
* stored, and the lesser the chance of false
|
81
|
+
* positives.
|
82
|
+
* @param hash_func Hash function to use on values stored in the
|
83
|
+
* filter.
|
84
|
+
* @param num_functions Number of hash functions to apply to each
|
85
|
+
* element on insertion. The running time for
|
86
|
+
* insertion and queries is proportional to this
|
87
|
+
* value. The more functions applied, the lesser
|
88
|
+
* the chance of false positives. The maximum
|
89
|
+
* number of functions is 64.
|
90
|
+
* @return A new bloom filter structure, or NULL if it
|
91
|
+
* was not possible to allocate the new bloom
|
92
|
+
* filter.
|
93
|
+
*/
|
94
|
+
|
95
|
+
BloomFilter *bloom_filter_new (unsigned int table_size, BloomFilterHashFunc hash_func, unsigned int num_functions);
|
96
|
+
|
97
|
+
/**
|
98
|
+
* Destroy a bloom filter.
|
99
|
+
*
|
100
|
+
* @param bloomfilter The bloom filter to destroy.
|
101
|
+
*/
|
102
|
+
|
103
|
+
void bloom_filter_free (BloomFilter * bloomfilter);
|
104
|
+
|
105
|
+
/**
|
106
|
+
* Insert a value into a bloom filter.
|
107
|
+
*
|
108
|
+
* @param bloomfilter The bloom filter.
|
109
|
+
* @param value The value to insert.
|
110
|
+
*/
|
111
|
+
|
112
|
+
void bloom_filter_insert (BloomFilter * bloomfilter, BloomFilterValue value);
|
113
|
+
|
114
|
+
/**
|
115
|
+
* Query a bloom filter for a particular value.
|
116
|
+
*
|
117
|
+
* @param bloomfilter The bloom filter.
|
118
|
+
* @param value The value to look up.
|
119
|
+
* @return Zero if the value was definitely not
|
120
|
+
* inserted into the filter. Non-zero
|
121
|
+
* indicates that it either may or may not
|
122
|
+
* have been inserted.
|
123
|
+
*/
|
124
|
+
|
125
|
+
int bloom_filter_query (BloomFilter * bloomfilter, BloomFilterValue value);
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Read the contents of a bloom filter into an array.
|
129
|
+
*
|
130
|
+
* @param bloomfilter The bloom filter.
|
131
|
+
* @param array Pointer to the array to read into. This
|
132
|
+
* should be (table_size + 7) / 8 bytes in
|
133
|
+
* length.
|
134
|
+
*/
|
135
|
+
|
136
|
+
void bloom_filter_read (BloomFilter * bloomfilter, unsigned char *array);
|
137
|
+
|
138
|
+
/**
|
139
|
+
* Load the contents of a bloom filter from an array.
|
140
|
+
* The data loaded should be the output read from @ref bloom_filter_read,
|
141
|
+
* from a bloom filter created using the same arguments used to create
|
142
|
+
* the original filter.
|
143
|
+
*
|
144
|
+
* @param bloomfilter The bloom filter.
|
145
|
+
* @param array Pointer to the array to load from. This
|
146
|
+
* should be (table_size + 7) / 8 bytes in
|
147
|
+
* length.
|
148
|
+
*/
|
149
|
+
|
150
|
+
void bloom_filter_load (BloomFilter * bloomfilter, unsigned char *array);
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Find the union of two bloom filters. Values are present in the
|
154
|
+
* resulting filter if they are present in either of the original
|
155
|
+
* filters.
|
156
|
+
*
|
157
|
+
* Both of the original filters must have been created using the
|
158
|
+
* same parameters to @ref bloom_filter_new.
|
159
|
+
*
|
160
|
+
* @param filter1 The first filter.
|
161
|
+
* @param filter2 The second filter.
|
162
|
+
* @return A new filter which is the union of the
|
163
|
+
* two filters, or NULL if it was not possible
|
164
|
+
* to allocate memory for the new filter, or
|
165
|
+
* if the two filters specified were created
|
166
|
+
* with different parameters.
|
167
|
+
*/
|
168
|
+
|
169
|
+
BloomFilter *bloom_filter_union (BloomFilter * filter1, BloomFilter * filter2);
|
170
|
+
|
171
|
+
/**
|
172
|
+
* Find the intersection of two bloom filters. Values are only ever
|
173
|
+
* present in the resulting filter if they are present in both of the
|
174
|
+
* original filters.
|
175
|
+
*
|
176
|
+
* Both of the original filters must have been created using the
|
177
|
+
* same parameters to @ref bloom_filter_new.
|
178
|
+
*
|
179
|
+
* @param filter1 The first filter.
|
180
|
+
* @param filter2 The second filter.
|
181
|
+
* @return A new filter which is an intersection of the
|
182
|
+
* two filters, or NULL if it was not possible
|
183
|
+
* to allocate memory for the new filter, or
|
184
|
+
* if the two filters specified were created
|
185
|
+
* with different parameters.
|
186
|
+
*/
|
187
|
+
|
188
|
+
BloomFilter *bloom_filter_intersection (BloomFilter * filter1, BloomFilter * filter2);
|
189
|
+
|
190
|
+
#ifdef __cplusplus
|
191
|
+
}
|
192
|
+
#endif
|
193
|
+
#endif /* #ifndef ALGORITHM_BLOOM_FILTER_H */
|
data/ext/bloom_filter.c
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#include <sys/types.h>
|
2
|
+
#include <sys/stat.h>
|
3
|
+
#include <fcntl.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include <errno.h>
|
6
|
+
#include <string.h>
|
7
|
+
#include <stdlib.h>
|
8
|
+
|
9
|
+
#include "ruby/ruby.h"
|
10
|
+
#include "bloom-filter.h"
|
11
|
+
#include "hash-string.h"
|
12
|
+
#include "version.h"
|
13
|
+
|
14
|
+
/* Call the Ruby method #to_s on v and yield the resulting VALUE. */
#define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
|
15
|
+
/* C pointer to the bytes of v.to_s.
 * NOTE(review): the to_s result is an unnamed temporary, so the pointer
 * should presumably be consumed immediately -- confirm GC safety. */
#define CSTRING(v) RSTRING_PTR(TO_S(v))
|
16
|
+
|
17
|
+
/* Free callback registered with Data_Wrap_Struct: releases the wrapped
 * filter, skipping objects whose filter was never created. */
static void bloom_free(BloomFilter *filter) {
    if (!filter)
        return;

    bloom_filter_free(filter);
}
|
21
|
+
|
22
|
+
/* Allocator for BloomFilter objects: wraps a null filter pointer that
 * bloom_initialize fills in later; bloom_free is the free callback. */
VALUE bloom_allocate(VALUE klass) {
    BloomFilter *unset = 0;
    VALUE wrapped = Data_Wrap_Struct(klass, 0, bloom_free, unset);

    return wrapped;
}
|
26
|
+
|
27
|
+
/* Fetch the BloomFilter wrapped by a Ruby object.
 * Raises ArgumentError when the object holds no filter. */
BloomFilter* bloom_handle(VALUE self) {
    BloomFilter *handle = 0;

    Data_Get_Struct(self, BloomFilter, handle);
    if (handle)
        return handle;

    rb_raise(rb_eArgError, "invalid BloomFilter instance");
    return handle; /* not reached: rb_raise does not return */
}
|
34
|
+
|
35
|
+
/* BloomFilter#initialize([table_size])
 *
 * Creates the underlying filter with table_size bits (default 1000000)
 * using the case-insensitive string hash and 4 hash functions.  The
 * size is read from the decimal text of table_size.to_s.
 *
 * Raises ArgumentError when the size is not positive or does not fit in
 * an unsigned int, and NoMemoryError when the filter cannot be created.
 * Returns self. */
VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
    long requested;
    unsigned int bits;
    VALUE table_size, text;
    BloomFilter *filter = 0;

    rb_scan_args(argc, argv, "01", &table_size);
    if (NIL_P(table_size)) {
        requested = 1000000;
    }
    else {
        /* Keep the to_s result in a named local while its bytes are read. */
        text = TO_S(table_size);
        requested = atol(StringValueCStr(text));
    }

    /* Zero would make every insert/query divide by zero, and a negative
     * number would silently turn into a huge unsigned size. */
    if (requested <= 0)
        rb_raise(rb_eArgError, "BloomFilter size must be a positive integer");

    /* bloom_filter_new takes an unsigned int; refuse to truncate. */
    bits = (unsigned int)requested;
    if ((unsigned long)bits != (unsigned long)requested)
        rb_raise(rb_eArgError, "BloomFilter size is too large");

    filter = bloom_filter_new(bits, string_nocase_hash, 4);

    if (!filter)
        rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");

    /* Release the filter of an earlier initialize call, if any. */
    if (DATA_PTR(self))
        bloom_filter_free((BloomFilter *)DATA_PTR(self));

    DATA_PTR(self) = filter;
    return self;
}
|
53
|
+
|
54
|
+
|
55
|
+
/* BloomFilter#insert(string): adds string.to_s to the filter.
 * Always returns true. */
VALUE bloom_insert(VALUE klass, VALUE string) {
    BloomFilter *filter;
    BloomFilterValue key;

    /* Despite its name, klass is the receiver the method was called on. */
    filter = bloom_handle(klass);
    key = (BloomFilterValue)CSTRING(string);

    bloom_filter_insert(filter, key);
    return Qtrue;
}
|
60
|
+
|
61
|
+
/* BloomFilter#exists?(string): true when string.to_s may have been
 * inserted, false when it definitely was not. */
VALUE bloom_exists(VALUE klass, VALUE string) {
    BloomFilter *filter;
    BloomFilterValue key;

    /* Despite its name, klass is the receiver the method was called on. */
    filter = bloom_handle(klass);
    key = (BloomFilterValue)CSTRING(string);

    if (bloom_filter_query(filter, key))
        return Qtrue;

    return Qfalse;
}
|
65
|
+
|
66
|
+
/* BloomFilter#dump(file)
 *
 * Writes the packed bit table ((table_size + 7) / 8 bytes) to the path
 * given by file.to_s, creating or truncating it with mode 0600.
 *
 * Raises NoMemoryError when the staging buffer cannot be allocated and
 * IOError when the file cannot be opened, written in full, or closed.
 * Returns true. */
VALUE bloom_dump(VALUE klass, VALUE file) {
    int fd;
    int saved_errno = 0;
    size_t size, written = 0;
    unsigned char *buffer;
    const char *path;
    VALUE path_string;
    BloomFilter *filter = bloom_handle(klass);

    /* Convert the path first: both calls may raise, and nothing that
     * needs cleanup has been acquired yet. */
    path_string = TO_S(file);
    path = StringValueCStr(path_string);

    size = (filter->table_size + 7) / 8;
    buffer = malloc(size);

    if (!buffer)
        rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");

    bloom_filter_read(filter, buffer);

    fd = open(path, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR);
    if (fd == -1) {
        /* Capture errno before free() has a chance to change it. */
        saved_errno = errno;
        free(buffer);
        rb_raise(rb_eIOError, "unable to open file. %s", strerror(saved_errno));
    }

    /* write() may transfer fewer bytes than requested; keep going until
     * the whole table is on its way to the file. */
    while (written < size) {
        ssize_t n = write(fd, buffer + written, size - written);

        if (n == -1) {
            if (errno == EINTR)
                continue;
            saved_errno = errno;
            break;
        }
        if (n == 0) {
            /* No progress and no error code: report a generic I/O error
             * rather than spinning. */
            saved_errno = EIO;
            break;
        }
        written += (size_t)n;
    }

    free(buffer);

    /* A failing close() can be the only report of a write error. */
    if (close(fd) == -1 && saved_errno == 0)
        saved_errno = errno;

    if (saved_errno != 0)
        rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(saved_errno));

    return Qtrue;
}
|
96
|
+
|
97
|
+
/* BloomFilter.load(file [, table_size])
 *
 * Builds a new BloomFilter of table_size bits (default 1000000) and fills
 * its bit table from the path given by file.to_s.  table_size must match
 * the size of the filter that produced the dump.
 *
 * Raises whatever bloom_initialize raises for a bad size, IOError when
 * the file cannot be opened or read, NoMemoryError when the staging
 * buffer cannot be allocated, and StandardError when the file holds
 * fewer bytes than the table needs.  Returns the new instance. */
VALUE bloom_load(int argc, VALUE *argv, VALUE klass) {
    int fd;
    int saved_errno = 0;
    unsigned char *buffer;
    const char *path;
    size_t size, bytes = 0;
    BloomFilter *filter;
    VALUE file, table_size, path_string, instance;

    rb_scan_args(argc, argv, "11", &file, &table_size);
    if (NIL_P(table_size))
        table_size = INT2NUM(1000000);

    /* Create the instance before acquiring the descriptor and buffer:
     * initialization may raise, which would otherwise leak both. */
    instance = bloom_allocate(klass);
    bloom_initialize(1, &table_size, instance);
    filter = bloom_handle(instance);

    /* Use the size the filter was actually built with, matching dump. */
    size = (filter->table_size + 7) / 8;

    /* Path conversion may raise too; still nothing to clean up here. */
    path_string = TO_S(file);
    path = StringValueCStr(path_string);

    fd = open(path, O_RDONLY);
    if (fd == -1)
        rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));

    buffer = malloc(size);
    if (!buffer) {
        close(fd);
        rb_raise(rb_eNoMemError, "out of memory loading BloomFilter");
    }

    /* read() may return fewer bytes than requested; continue until the
     * table is complete, end of file is reached, or an error occurs. */
    while (bytes < size) {
        ssize_t n = read(fd, buffer + bytes, size - bytes);

        if (n == -1) {
            if (errno == EINTR)
                continue;
            saved_errno = errno;
            break;
        }
        if (n == 0)
            break;
        bytes += (size_t)n;
    }

    close(fd);

    if (saved_errno != 0) {
        free(buffer);
        rb_raise(rb_eIOError, "error loading BloomFilter: %s", strerror(saved_errno));
    }

    if (bytes != size) {
        free(buffer);
        rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", (long)size, (long)bytes);
    }

    bloom_filter_load(filter, buffer);
    free(buffer);
    return instance;
}
|
138
|
+
|
139
|
+
Init_bloom_filter() {
|
140
|
+
VALUE cBloom = rb_define_class("BloomFilter", rb_cObject);
|
141
|
+
|
142
|
+
rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
|
143
|
+
rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
|
144
|
+
rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
|
145
|
+
rb_define_method(cBloom, "exists?", RUBY_METHOD_FUNC(bloom_exists), 1);
|
146
|
+
|
147
|
+
rb_define_alloc_func(cBloom, bloom_allocate);
|
148
|
+
rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), -1);
|
149
|
+
rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
|
150
|
+
}
|
data/ext/extconf.rb
ADDED
data/ext/hash-string.c
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
Copyright (c) 2005-2008, Simon Howard
|
4
|
+
|
5
|
+
Permission to use, copy, modify, and/or distribute this software
|
6
|
+
for any purpose with or without fee is hereby granted, provided
|
7
|
+
that the above copyright notice and this permission notice appear
|
8
|
+
in all copies.
|
9
|
+
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
11
|
+
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
12
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
13
|
+
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
14
|
+
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
15
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
16
|
+
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
17
|
+
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
18
|
+
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <ctype.h>
|
22
|
+
|
23
|
+
#include "hash-string.h"
|
24
|
+
|
25
|
+
/* String hash function */
|
26
|
+
|
27
|
+
/* Hash a NUL-terminated string.  The accumulator starts at 5381 (the
 * djb2 seed) and each byte is mixed in with a shift-by-5 and XOR. */
unsigned long string_hash (void *string) {
    const unsigned char *cursor = (const unsigned char *) string;
    unsigned long acc = 5381;

    for (; *cursor != '\0'; ++cursor) {
        acc = ((acc << 5) ^ acc) ^ (*cursor);
    }

    return acc;
}
|
42
|
+
|
43
|
+
/* The same function, with a tolower on every character so that
|
44
|
+
* case is ignored. This code is duplicated for performance. */
|
45
|
+
|
46
|
+
/* Case-insensitive counterpart of string_hash: every byte goes through
 * tolower() before it is mixed into the accumulator. */
unsigned long string_nocase_hash (void *string) {
    const unsigned char *cursor = (const unsigned char *) string;
    unsigned long acc = 5381;

    for (; *cursor != '\0'; ++cursor) {
        acc = ((acc << 5) ^ acc) ^ tolower (*cursor);
    }

    return acc;
}
|
data/ext/hash-string.h
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
Copyright (c) 2005-2008, Simon Howard
|
4
|
+
|
5
|
+
Permission to use, copy, modify, and/or distribute this software
|
6
|
+
for any purpose with or without fee is hereby granted, provided
|
7
|
+
that the above copyright notice and this permission notice appear
|
8
|
+
in all copies.
|
9
|
+
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
11
|
+
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
12
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
13
|
+
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
14
|
+
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
15
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
16
|
+
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
17
|
+
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
18
|
+
|
19
|
+
*/
|
20
|
+
|
21
|
+
/**
|
22
|
+
* @file hash-string.h
|
23
|
+
*
|
24
|
+
* Hash functions for text strings. For more information
|
25
|
+
* see @ref string_hash or @ref string_nocase_hash.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef ALGORITHM_HASH_STRING_H
|
29
|
+
#define ALGORITHM_HASH_STRING_H
|
30
|
+
|
31
|
+
#ifdef __cplusplus
|
32
|
+
extern "C" {
|
33
|
+
#endif
|
34
|
+
|
35
|
+
/**
|
36
|
+
* Generate a hash key from a string.
|
37
|
+
*
|
38
|
+
* @param string The string.
|
39
|
+
* @return A hash key for the string.
|
40
|
+
*/
|
41
|
+
|
42
|
+
unsigned long string_hash (void *string);
|
43
|
+
|
44
|
+
/**
|
45
|
+
* Generate a hash key from a string, ignoring the case of letters.
|
46
|
+
*
|
47
|
+
* @param string The string.
|
48
|
+
* @return A hash key for the string.
|
49
|
+
*/
|
50
|
+
|
51
|
+
unsigned long string_nocase_hash (void *string);
|
52
|
+
|
53
|
+
#ifdef __cplusplus
|
54
|
+
}
|
55
|
+
#endif
|
56
|
+
#endif /* #ifndef ALGORITHM_HASH_STRING_H */
|
data/ext/version.h
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
#define RUBY_BLOOM_FILTER_VERSION "0.1.0"
|
data/lib/bloom-filter.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bloom_filter'
|
data/lib/bloom_filter.so
ADDED
Binary file
|
data/test/helper.rb
ADDED
data/test/test_basic.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Basic construction, insertion and membership checks.
describe 'BloomFilter primitives' do
  it 'should create one with default size' do
    bloom = BloomFilter.new
    assert bloom
  end

  it 'should create one with given size' do
    bloom = BloomFilter.new(100)
    assert bloom
  end

  it 'should insert' do
    bloom = BloomFilter.new(100)
    assert bloom
    assert bloom.insert("foo")
  end

  it 'should allow membership checks' do
    bloom = BloomFilter.new(100)
    assert bloom
    assert bloom.insert("foo")
    assert bloom.exists?("foo")
    assert !bloom.exists?("bar")
  end
end
|
data/test/test_io.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
# Round-trips a filter through dump and load.
describe 'BloomFilter load & dump' do
  it 'should dump and load a filter' do
    file = Tempfile.new("bloom-filter-test")

    begin
      assert filter = BloomFilter.new(100)
      assert filter.insert("foo")
      assert filter.dump(file.path), "dump filter"
      assert filter = BloomFilter.load(file.path, 100)
      assert filter.exists?("foo")
      assert !filter.exists?("bar")
    ensure
      # Close and delete the temp file even when an assertion fails.
      file.close!
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bloom-filter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Bharanee Rathna
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2012-01-25 00:00:00 +11:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rake
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :development
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rake-compiler
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
type: :development
|
45
|
+
version_requirements: *id002
|
46
|
+
description: A fast Bloom Filter library for Ruby for unices.
|
47
|
+
email:
|
48
|
+
- deepfryed@gmail.com
|
49
|
+
executables: []
|
50
|
+
|
51
|
+
extensions:
|
52
|
+
- ext/extconf.rb
|
53
|
+
extra_rdoc_files: []
|
54
|
+
|
55
|
+
files:
|
56
|
+
- ext/hash-string.c
|
57
|
+
- ext/bloom-filter.c
|
58
|
+
- ext/bloom_filter.c
|
59
|
+
- ext/bloom-filter.h
|
60
|
+
- ext/version.h
|
61
|
+
- ext/hash-string.h
|
62
|
+
- ext/extconf.rb
|
63
|
+
- test/test_io.rb
|
64
|
+
- test/helper.rb
|
65
|
+
- test/test_basic.rb
|
66
|
+
- README.md
|
67
|
+
- CHANGELOG
|
68
|
+
- lib/bloom-filter.rb
|
69
|
+
- lib/bloom_filter.so
|
70
|
+
has_rdoc: true
|
71
|
+
homepage: http://github.com/deepfryed/bloom-filter
|
72
|
+
licenses: []
|
73
|
+
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
|
77
|
+
require_paths:
|
78
|
+
- lib
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
segments:
|
85
|
+
- 0
|
86
|
+
version: "0"
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
requirements: []
|
96
|
+
|
97
|
+
rubyforge_project:
|
98
|
+
rubygems_version: 1.3.7
|
99
|
+
signing_key:
|
100
|
+
specification_version: 3
|
101
|
+
summary: A fast Bloom Filter library for Ruby.
|
102
|
+
test_files: []
|
103
|
+
|