bloom_fit 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -14
- data/ext/cbloomfilter/cbloomfilter.c +104 -194
- data/ext/cbloomfilter/extconf.rb +0 -1
- data/lib/bloom_fit/configuration_mismatch.rb +4 -0
- data/lib/bloom_fit/version.rb +1 -1
- data/lib/bloom_fit.rb +96 -65
- data/lib/cbloomfilter.bundle +0 -0
- data/test/bloom_fit_test.rb +344 -0
- data/test/test_helper.rb +6 -0
- metadata +12 -69
- data/Rakefile +0 -12
- data/spec/bloom_fit_spec.rb +0 -152
- data/spec/helper.rb +0 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: efa22c92049e3607485a8fcfe471b15cca6e85e6da0c7b19b65f74b9f6ad5fe9
|
|
4
|
+
data.tar.gz: 5e8432456b1258111671d536165217bc3e82e0e430c3bc63112abc4670f91e78
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 72738a57ccb3a1a8989e86993490c3ba6a4f90925c834c1acd70ba104df8ef2bb318d5c66830786ba662e88df09f9ce46d7184810e3d4ec1c6b4cc0b41fcec44
|
|
7
|
+
data.tar.gz: 7472e370d1a66a6034ecb2f0d4720b9edd12f21e181a37cae2869e0e34c70a829366e7ee6caf880f4c4d8c789bc18bdbe2f83e6699617ccc77460320f2a2a1af
|
data/README.md
CHANGED
|
@@ -4,7 +4,12 @@
|
|
|
4
4
|
[](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
|
|
5
5
|
[](https://rubygems.org/gems/bloom_fit)
|
|
6
6
|
|
|
7
|
-
BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but
|
|
7
|
+
BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but differs in the following ways:
|
|
8
|
+
|
|
9
|
+
- uses DJB2 over CRC32 yielding better hash distribution
|
|
10
|
+
- improves performance for very large datasets
|
|
11
|
+
- avoids the need to supply a seed
|
|
12
|
+
- automatically calculates the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate
|
|
8
13
|
|
|
9
14
|
A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a DJB2 hash with k seeds from the CRC table.
|
|
10
15
|
|
|
@@ -13,8 +18,6 @@ Performance of the Bloom filter depends on the following:
|
|
|
13
18
|
- size of the bit array
|
|
14
19
|
- number of hash functions
|
|
15
20
|
|
|
16
|
-
BloomFit is a fork of [bloomfilter-rb].
|
|
17
|
-
|
|
18
21
|
## Resources
|
|
19
22
|
|
|
20
23
|
- Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
|
|
@@ -25,7 +28,7 @@ BloomFit is a fork of [bloomfilter-rb].
|
|
|
25
28
|
|
|
26
29
|
MRI/C implementation which creates an in-memory filter which can be saved and reloaded from disk.
|
|
27
30
|
|
|
28
|
-
(COMING SOON) If you'd like to specify an expected item count and a false-positive rate that you can tolerate
|
|
31
|
+
(COMING SOON) Specify an expected item count and a false-positive rate that you can tolerate. Visit the [Bloom Filter Calculator](https://hur.st/bloomfilter/) to learn more.
|
|
29
32
|
|
|
30
33
|
```ruby
|
|
31
34
|
require "bloom_fit"
|
|
@@ -40,11 +43,11 @@ bf["bird"] = "bar"
|
|
|
40
43
|
bf["bird"] # => true
|
|
41
44
|
bf["mouse"] # => false
|
|
42
45
|
|
|
43
|
-
bf.stats
|
|
44
|
-
#
|
|
45
|
-
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
46
|
+
puts bf.stats
|
|
47
|
+
# Number of filter bits (m): 3600
|
|
48
|
+
# Number of set bits (n): 20
|
|
49
|
+
# Number of filter hashes (k) : 10
|
|
50
|
+
# Predicted false positive rate = 0.00%
|
|
48
51
|
```
|
|
49
52
|
|
|
50
53
|
If you'd like more control over the traditional inputs like bit size and the number of hashes:
|
|
@@ -62,11 +65,11 @@ bf["bird"] = "bar"
|
|
|
62
65
|
bf["bird"] # => true
|
|
63
66
|
bf["mouse"] # => false
|
|
64
67
|
|
|
65
|
-
bf.stats
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
68
|
+
puts bf.stats
|
|
69
|
+
# Number of filter bits (m): 100
|
|
70
|
+
# Number of set bits (n): 4
|
|
71
|
+
# Number of filter hashes (k) : 2
|
|
72
|
+
# Predicted false positive rate = 10.87%
|
|
70
73
|
```
|
|
71
74
|
|
|
72
75
|
## Credits
|
|
@@ -11,110 +11,99 @@
|
|
|
11
11
|
# define RSTRING_PTR(x) (RSTRING(x)->ptr)
|
|
12
12
|
#endif
|
|
13
13
|
|
|
14
|
-
/* Reuse the standard CRC table for consistent
|
|
15
|
-
static unsigned int *
|
|
14
|
+
/* Reuse the standard CRC table for consistent salts */
|
|
15
|
+
static unsigned int *salts = crc_table;
|
|
16
16
|
|
|
17
17
|
static VALUE cBloomFilter;
|
|
18
18
|
|
|
19
19
|
struct BloomFilter {
|
|
20
|
-
int m; /* # of
|
|
21
|
-
int b; /* # of bits in a bloom filter bucket */
|
|
20
|
+
int m; /* # of bits in a bloom filter */
|
|
22
21
|
int k; /* # of hash functions */
|
|
23
|
-
int r; /* # raise on bucket overflow? */
|
|
24
22
|
unsigned char *ptr; /* bits data */
|
|
25
23
|
int bytes; /* size of byte data */
|
|
26
24
|
};
|
|
27
25
|
|
|
28
|
-
unsigned long djb2(
|
|
26
|
+
unsigned long djb2(const char *str, int len) {
|
|
29
27
|
unsigned long hash = 5381;
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
while (len > 0) {
|
|
33
|
-
hash = ((hash << 5) ^ hash) ^ (*c);
|
|
34
|
-
--len;
|
|
35
|
-
++c;
|
|
28
|
+
for (int i = 0; i < len; i++) {
|
|
29
|
+
hash = ((hash << 5) + hash) + str[i];
|
|
36
30
|
}
|
|
37
31
|
return hash;
|
|
38
32
|
}
|
|
39
33
|
|
|
40
|
-
void
|
|
34
|
+
static void bf_free(void *ptr) {
|
|
35
|
+
struct BloomFilter *bf = ptr;
|
|
36
|
+
|
|
37
|
+
if (bf == NULL) {
|
|
38
|
+
return;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
41
|
ruby_xfree(bf->ptr);
|
|
42
|
+
ruby_xfree(bf);
|
|
42
43
|
}
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
50
|
-
if ((c & mask) == 0) {
|
|
51
|
-
// do nothing
|
|
52
|
-
} else {
|
|
53
|
-
// reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
|
|
54
|
-
c -= (1 << bit_offset) & ((1 << 8) -1);
|
|
55
|
-
// shift the bitmap right by 1 bit: 10 00 => 01 00
|
|
56
|
-
c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
|
|
57
|
-
|
|
58
|
-
bf->ptr[byte_offset] = c & ((1 << 8) - 1);
|
|
59
|
-
bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
|
|
45
|
+
static size_t bf_memsize(const void *ptr) {
|
|
46
|
+
const struct BloomFilter *bf = ptr;
|
|
47
|
+
|
|
48
|
+
if (bf == NULL) {
|
|
49
|
+
return 0;
|
|
60
50
|
}
|
|
51
|
+
|
|
52
|
+
return sizeof(*bf) + (bf->ptr == NULL ? 0 : (size_t) bf->bytes);
|
|
61
53
|
}
|
|
62
54
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
55
|
+
static const rb_data_type_t bf_type = {
|
|
56
|
+
"CBloomFilter",
|
|
57
|
+
{0, bf_free, bf_memsize,},
|
|
58
|
+
0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
static struct BloomFilter *bf_ptr(VALUE obj) {
|
|
62
|
+
struct BloomFilter *bf;
|
|
63
|
+
|
|
64
|
+
TypedData_Get_Struct(obj, struct BloomFilter, &bf_type, bf);
|
|
65
|
+
|
|
66
|
+
return bf;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
static VALUE bf_alloc(VALUE klass) {
|
|
70
|
+
struct BloomFilter *bf;
|
|
71
|
+
VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
|
|
72
|
+
|
|
73
|
+
bf->m = 0;
|
|
74
|
+
bf->k = 0;
|
|
75
|
+
bf->ptr = NULL;
|
|
76
|
+
bf->bytes = 0;
|
|
77
|
+
|
|
78
|
+
return obj;
|
|
76
79
|
}
|
|
77
80
|
|
|
78
|
-
|
|
79
|
-
int byte_offset =
|
|
80
|
-
int bit_offset =
|
|
81
|
-
unsigned int c = bf->ptr[byte_offset];
|
|
82
|
-
c += bf->ptr[byte_offset + 1] << 8;
|
|
81
|
+
static void bucket_set(struct BloomFilter *bf, int index) {
|
|
82
|
+
int byte_offset = index / 8;
|
|
83
|
+
int bit_offset = index % 8;
|
|
83
84
|
|
|
84
|
-
|
|
85
|
-
return (c & mask) >> bit_offset;
|
|
85
|
+
bf->ptr[byte_offset] |= (unsigned char) (1U << bit_offset);
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
-
int
|
|
89
|
-
int byte_offset =
|
|
90
|
-
int bit_offset =
|
|
91
|
-
unsigned int c = bf->ptr[byte_offset];
|
|
92
|
-
c += bf->ptr[byte_offset + 1] << 8;
|
|
88
|
+
static int bucket_check(struct BloomFilter *bf, int index) {
|
|
89
|
+
int byte_offset = index / 8;
|
|
90
|
+
int bit_offset = index % 8;
|
|
93
91
|
|
|
94
|
-
|
|
95
|
-
return (c & mask) >> bit_offset;
|
|
92
|
+
return (bf->ptr[byte_offset] >> bit_offset) & 1;
|
|
96
93
|
}
|
|
97
94
|
|
|
98
|
-
static VALUE
|
|
95
|
+
static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
|
|
99
96
|
struct BloomFilter *bf;
|
|
100
|
-
VALUE arg1, arg2
|
|
101
|
-
int m, k
|
|
97
|
+
VALUE arg1, arg2;
|
|
98
|
+
int m, k;
|
|
102
99
|
|
|
103
|
-
|
|
100
|
+
bf = bf_ptr(self);
|
|
104
101
|
|
|
105
102
|
/* default = Fugou approach :-) */
|
|
106
|
-
arg1 = INT2FIX(
|
|
103
|
+
arg1 = INT2FIX(1000);
|
|
107
104
|
arg2 = INT2FIX(4);
|
|
108
|
-
arg3 = INT2FIX(1);
|
|
109
|
-
arg4 = INT2FIX(0);
|
|
110
105
|
|
|
111
106
|
switch (argc) {
|
|
112
|
-
case 4:
|
|
113
|
-
if (argv[3] == Qtrue) {
|
|
114
|
-
arg4 = INT2FIX(1);
|
|
115
|
-
}
|
|
116
|
-
case 3:
|
|
117
|
-
arg3 = argv[2];
|
|
118
107
|
case 2:
|
|
119
108
|
arg2 = argv[1];
|
|
120
109
|
case 1:
|
|
@@ -124,66 +113,48 @@ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
|
|
|
124
113
|
|
|
125
114
|
m = FIX2INT(arg1);
|
|
126
115
|
k = FIX2INT(arg2);
|
|
127
|
-
b = FIX2INT(arg3);
|
|
128
|
-
r = FIX2INT(arg4);
|
|
129
116
|
|
|
130
|
-
if (b < 1 || b > 8)
|
|
131
|
-
rb_raise(rb_eArgError, "bucket size");
|
|
132
117
|
if (m < 1)
|
|
133
118
|
rb_raise(rb_eArgError, "array size");
|
|
134
119
|
if (k < 1)
|
|
135
120
|
rb_raise(rb_eArgError, "hash length");
|
|
136
121
|
|
|
137
|
-
bf->b = b;
|
|
138
122
|
bf->m = m;
|
|
139
123
|
bf->k = k;
|
|
140
|
-
bf->r = r;
|
|
141
124
|
|
|
142
|
-
bf->
|
|
125
|
+
ruby_xfree(bf->ptr);
|
|
126
|
+
bf->ptr = NULL;
|
|
127
|
+
bf->bytes = 0;
|
|
128
|
+
/* Preserve the existing serialized bitmap length, including one padding byte. */
|
|
129
|
+
bf->bytes = (m + 15) / 8;
|
|
143
130
|
bf->ptr = ALLOC_N(unsigned char, bf->bytes);
|
|
144
131
|
|
|
145
132
|
/* initialize the bits with zeros */
|
|
146
133
|
memset(bf->ptr, 0, bf->bytes);
|
|
147
|
-
rb_iv_set(
|
|
134
|
+
rb_iv_set(self, "@hash_value", rb_hash_new());
|
|
148
135
|
|
|
149
|
-
return
|
|
136
|
+
return self;
|
|
150
137
|
}
|
|
151
138
|
|
|
152
139
|
static VALUE bf_clear(VALUE self) {
|
|
153
|
-
struct BloomFilter *bf;
|
|
154
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
140
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
155
141
|
memset(bf->ptr, 0, bf->bytes);
|
|
156
142
|
return Qtrue;
|
|
157
143
|
}
|
|
158
144
|
|
|
159
145
|
static VALUE bf_m(VALUE self) {
|
|
160
|
-
struct BloomFilter *bf;
|
|
161
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
146
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
162
147
|
return INT2FIX(bf->m);
|
|
163
148
|
}
|
|
164
149
|
|
|
165
150
|
static VALUE bf_k(VALUE self) {
|
|
166
|
-
struct BloomFilter *bf;
|
|
167
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
151
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
168
152
|
return INT2FIX(bf->k);
|
|
169
153
|
}
|
|
170
154
|
|
|
171
|
-
static VALUE bf_b(VALUE self) {
|
|
172
|
-
struct BloomFilter *bf;
|
|
173
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
174
|
-
return INT2FIX(bf->b);
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
static VALUE bf_r(VALUE self) {
|
|
178
|
-
struct BloomFilter *bf;
|
|
179
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
180
|
-
return bf->r == 0 ? Qfalse : Qtrue;
|
|
181
|
-
}
|
|
182
|
-
|
|
183
155
|
static VALUE bf_set_bits(VALUE self){
|
|
184
|
-
struct BloomFilter *bf;
|
|
156
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
185
157
|
int i,j,count = 0;
|
|
186
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
187
158
|
for (i = 0; i < bf->bytes; i++) {
|
|
188
159
|
for (j = 0; j < 8; j++) {
|
|
189
160
|
count += (bf->ptr[i] >> j) & 1;
|
|
@@ -192,13 +163,13 @@ static VALUE bf_set_bits(VALUE self){
|
|
|
192
163
|
return INT2FIX(count);
|
|
193
164
|
}
|
|
194
165
|
|
|
195
|
-
static VALUE
|
|
166
|
+
static VALUE bf_add(VALUE self, VALUE key) {
|
|
196
167
|
VALUE skey;
|
|
197
|
-
unsigned long hash
|
|
168
|
+
unsigned long hash;
|
|
169
|
+
int index;
|
|
198
170
|
int i, len, m, k;
|
|
199
171
|
char *ckey;
|
|
200
|
-
struct BloomFilter *bf;
|
|
201
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
172
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
202
173
|
|
|
203
174
|
skey = rb_obj_as_string(key);
|
|
204
175
|
ckey = StringValuePtr(skey);
|
|
@@ -209,7 +180,7 @@ static VALUE bf_insert(VALUE self, VALUE key) {
|
|
|
209
180
|
|
|
210
181
|
hash = (unsigned long) djb2(ckey, len);
|
|
211
182
|
for (i = 0; i <= k - 1; i++) {
|
|
212
|
-
index = (
|
|
183
|
+
index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
|
|
213
184
|
|
|
214
185
|
/* set a bit at the index */
|
|
215
186
|
bucket_set(bf, index);
|
|
@@ -219,10 +190,9 @@ static VALUE bf_insert(VALUE self, VALUE key) {
|
|
|
219
190
|
}
|
|
220
191
|
|
|
221
192
|
static VALUE bf_merge(VALUE self, VALUE other) {
|
|
222
|
-
struct BloomFilter *bf
|
|
193
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
194
|
+
struct BloomFilter *target = bf_ptr(other);
|
|
223
195
|
int i;
|
|
224
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
225
|
-
Data_Get_Struct(other, struct BloomFilter, target);
|
|
226
196
|
for (i = 0; i < bf->bytes; i++) {
|
|
227
197
|
bf->ptr[i] |= target->ptr[i];
|
|
228
198
|
}
|
|
@@ -230,19 +200,17 @@ static VALUE bf_merge(VALUE self, VALUE other) {
|
|
|
230
200
|
}
|
|
231
201
|
|
|
232
202
|
static VALUE bf_and(VALUE self, VALUE other) {
|
|
233
|
-
struct BloomFilter *bf
|
|
203
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
204
|
+
struct BloomFilter *bf_other = bf_ptr(other);
|
|
205
|
+
struct BloomFilter *target;
|
|
234
206
|
VALUE klass, obj, args[5];
|
|
235
207
|
int i;
|
|
236
208
|
|
|
237
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
238
|
-
Data_Get_Struct(other, struct BloomFilter, bf_other);
|
|
239
209
|
args[0] = INT2FIX(bf->m);
|
|
240
210
|
args[1] = INT2FIX(bf->k);
|
|
241
|
-
args[2] = INT2FIX(bf->b);
|
|
242
|
-
args[3] = INT2FIX(bf->r);
|
|
243
211
|
klass = rb_funcall(self,rb_intern("class"),0);
|
|
244
|
-
obj =
|
|
245
|
-
|
|
212
|
+
obj = rb_class_new_instance(2, args, klass);
|
|
213
|
+
target = bf_ptr(obj);
|
|
246
214
|
for (i = 0; i < bf->bytes; i++){
|
|
247
215
|
target->ptr[i] = bf->ptr[i] & bf_other->ptr[i];
|
|
248
216
|
}
|
|
@@ -251,19 +219,17 @@ static VALUE bf_and(VALUE self, VALUE other) {
|
|
|
251
219
|
}
|
|
252
220
|
|
|
253
221
|
static VALUE bf_or(VALUE self, VALUE other) {
|
|
254
|
-
struct BloomFilter *bf
|
|
222
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
223
|
+
struct BloomFilter *bf_other = bf_ptr(other);
|
|
224
|
+
struct BloomFilter *target;
|
|
255
225
|
VALUE klass, obj, args[5];
|
|
256
226
|
int i;
|
|
257
227
|
|
|
258
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
259
|
-
Data_Get_Struct(other, struct BloomFilter, bf_other);
|
|
260
228
|
args[0] = INT2FIX(bf->m);
|
|
261
229
|
args[1] = INT2FIX(bf->k);
|
|
262
|
-
args[2] = INT2FIX(bf->b);
|
|
263
|
-
args[3] = INT2FIX(bf->r);
|
|
264
230
|
klass = rb_funcall(self,rb_intern("class"),0);
|
|
265
|
-
obj =
|
|
266
|
-
|
|
231
|
+
obj = rb_class_new_instance(2, args, klass);
|
|
232
|
+
target = bf_ptr(obj);
|
|
267
233
|
for (i = 0; i < bf->bytes; i++){
|
|
268
234
|
target->ptr[i] = bf->ptr[i] | bf_other->ptr[i];
|
|
269
235
|
}
|
|
@@ -271,13 +237,13 @@ static VALUE bf_or(VALUE self, VALUE other) {
|
|
|
271
237
|
return obj;
|
|
272
238
|
}
|
|
273
239
|
|
|
274
|
-
static VALUE
|
|
275
|
-
|
|
240
|
+
static VALUE bf_include(VALUE self, VALUE key) {
|
|
241
|
+
VALUE skey;
|
|
242
|
+
unsigned long hash;
|
|
243
|
+
int index;
|
|
276
244
|
int i, len, m, k;
|
|
277
245
|
char *ckey;
|
|
278
|
-
|
|
279
|
-
struct BloomFilter *bf;
|
|
280
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
246
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
281
247
|
|
|
282
248
|
skey = rb_obj_as_string(key);
|
|
283
249
|
ckey = StringValuePtr(skey);
|
|
@@ -288,69 +254,19 @@ static VALUE bf_delete(VALUE self, VALUE key) {
|
|
|
288
254
|
|
|
289
255
|
hash = (unsigned long) djb2(ckey, len);
|
|
290
256
|
for (i = 0; i <= k - 1; i++) {
|
|
291
|
-
index = (
|
|
292
|
-
|
|
293
|
-
/* set a bit at the index */
|
|
294
|
-
bucket_unset(bf, index);
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
return Qnil;
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
|
|
302
|
-
unsigned long hash, index;
|
|
303
|
-
int i, len, m, k, tests_idx, vlen;
|
|
304
|
-
char *ckey;
|
|
305
|
-
VALUE tests, key, skey;
|
|
306
|
-
struct BloomFilter *bf;
|
|
307
|
-
|
|
308
|
-
rb_scan_args(argc, argv, "*", &tests);
|
|
309
|
-
|
|
310
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
311
|
-
vlen = RARRAY_LEN(tests);
|
|
312
|
-
for(tests_idx = 0; tests_idx < vlen; tests_idx++) {
|
|
313
|
-
key = rb_ary_entry(tests, tests_idx);
|
|
314
|
-
skey = rb_obj_as_string(key);
|
|
315
|
-
ckey = StringValuePtr(skey);
|
|
316
|
-
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
|
257
|
+
index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
|
|
317
258
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
for (i = 0; i <= k - 1; i++) {
|
|
323
|
-
index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
|
|
324
|
-
|
|
325
|
-
/* check the bit at the index */
|
|
326
|
-
if (!bucket_check(bf, index)) {
|
|
327
|
-
return Qfalse; /* i.e., it is a new entry ; escape the loop */
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
return Qtrue;
|
|
259
|
+
/* check the bit at the index */
|
|
260
|
+
if (!bucket_check(bf, index)) {
|
|
261
|
+
return Qfalse; /* i.e., it is a new entry ; escape the loop */
|
|
262
|
+
}
|
|
332
263
|
}
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
static VALUE bf_to_s(VALUE self) {
|
|
336
|
-
struct BloomFilter *bf;
|
|
337
|
-
unsigned char *ptr;
|
|
338
|
-
int i;
|
|
339
|
-
VALUE str;
|
|
340
264
|
|
|
341
|
-
|
|
342
|
-
str = rb_str_new(0, bf->m);
|
|
343
|
-
|
|
344
|
-
ptr = (unsigned char *) RSTRING_PTR(str);
|
|
345
|
-
for (i = 0; i < bf->m; i++)
|
|
346
|
-
*ptr++ = bucket_get(bf, i) ? '1' : '0';
|
|
347
|
-
|
|
348
|
-
return str;
|
|
265
|
+
return Qtrue;
|
|
349
266
|
}
|
|
350
267
|
|
|
351
268
|
static VALUE bf_bitmap(VALUE self) {
|
|
352
|
-
struct BloomFilter *bf;
|
|
353
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
269
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
354
270
|
|
|
355
271
|
VALUE str = rb_str_new(0, bf->bytes);
|
|
356
272
|
unsigned char* ptr = (unsigned char *) RSTRING_PTR(str);
|
|
@@ -361,8 +277,7 @@ static VALUE bf_bitmap(VALUE self) {
|
|
|
361
277
|
}
|
|
362
278
|
|
|
363
279
|
static VALUE bf_load(VALUE self, VALUE bitmap) {
|
|
364
|
-
struct BloomFilter *bf;
|
|
365
|
-
Data_Get_Struct(self, struct BloomFilter, bf);
|
|
280
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
366
281
|
unsigned char* ptr = (unsigned char *) RSTRING_PTR(bitmap);
|
|
367
282
|
|
|
368
283
|
memcpy(bf->ptr, ptr, bf->bytes);
|
|
@@ -372,26 +287,21 @@ static VALUE bf_load(VALUE self, VALUE bitmap) {
|
|
|
372
287
|
|
|
373
288
|
void Init_cbloomfilter(void) {
|
|
374
289
|
cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
|
|
375
|
-
|
|
290
|
+
rb_define_alloc_func(cBloomFilter, bf_alloc);
|
|
291
|
+
rb_define_method(cBloomFilter, "initialize", bf_initialize, -1);
|
|
376
292
|
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
|
377
293
|
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
|
378
|
-
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
|
379
|
-
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
|
380
294
|
rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
|
|
381
|
-
|
|
382
|
-
rb_define_method(cBloomFilter, "
|
|
383
|
-
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
|
384
|
-
rb_define_method(cBloomFilter, "include?", bf_include, -1);
|
|
295
|
+
rb_define_method(cBloomFilter, "add", bf_add, 1);
|
|
296
|
+
rb_define_method(cBloomFilter, "include?", bf_include, 1);
|
|
385
297
|
rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
|
386
|
-
rb_define_method(cBloomFilter, "merge
|
|
298
|
+
rb_define_method(cBloomFilter, "merge", bf_merge, 1);
|
|
387
299
|
rb_define_method(cBloomFilter, "&", bf_and, 1);
|
|
388
300
|
rb_define_method(cBloomFilter, "|", bf_or, 1);
|
|
389
301
|
|
|
390
|
-
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
|
391
302
|
rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
|
|
392
303
|
rb_define_method(cBloomFilter, "load", bf_load, 1);
|
|
393
304
|
|
|
394
305
|
/* functions that have not been implemented, yet */
|
|
395
|
-
|
|
396
306
|
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
|
397
307
|
}
|
data/ext/cbloomfilter/extconf.rb
CHANGED
data/lib/bloom_fit/version.rb
CHANGED