bloom-filter 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +10 -5
- data/ext/bloom_filter.c +74 -34
- data/ext/version.h +1 -1
- data/test/test_basic.rb +5 -5
- data/test/test_io.rb +4 -4
- metadata +3 -4
- data/lib/bloom_filter.so +0 -0
data/README.md
CHANGED
@@ -20,7 +20,7 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
|
|
20
20
|
.load
|
21
21
|
#dump
|
22
22
|
#insert
|
23
|
-
#
|
23
|
+
#include?
|
24
24
|
|
25
25
|
```
|
26
26
|
|
@@ -30,14 +30,19 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
|
|
30
30
|
require 'bloom-filter'
|
31
31
|
|
32
32
|
filter = BloomFilter.new
|
33
|
-
|
33
|
+
|
34
|
+
# auto-calculate optimum bitmap size based on maximum number of items stored and desired max error rate.
|
35
|
+
filter = BloomFilter.new size: 100_000, error_rate: 0.01
|
36
|
+
|
37
|
+
# specify bitmap size & number of hash functions explicitly.
|
38
|
+
filter = BloomFilter.new bits: 100_000_0, hashes: 4
|
34
39
|
|
35
40
|
filter.insert "foo"
|
36
|
-
filter.
|
37
|
-
filter.
|
41
|
+
filter.include? "foo" #=> true
|
42
|
+
filter.include? "bar" #=> false
|
38
43
|
|
39
44
|
filter.dump "/tmp/random.bloom"
|
40
|
-
filter = BloomFilter.load "/tmp/random.bloom"
|
45
|
+
filter = BloomFilter.load "/tmp/random.bloom"
|
41
46
|
```
|
42
47
|
|
43
48
|
## See Also
|
data/ext/bloom_filter.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include <errno.h>
|
6
6
|
#include <string.h>
|
7
7
|
#include <stdlib.h>
|
8
|
+
#include <math.h>
|
8
9
|
|
9
10
|
#include "ruby/ruby.h"
|
10
11
|
#include "bloom-filter.h"
|
@@ -14,6 +15,11 @@
|
|
14
15
|
#define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
|
15
16
|
#define CSTRING(v) RSTRING_PTR(TO_S(v))
|
16
17
|
|
18
|
+
typedef struct FileHeader {
|
19
|
+
uint64_t table_size;
|
20
|
+
uint64_t num_functions;
|
21
|
+
} FileHeader;
|
22
|
+
|
17
23
|
static void bloom_free(BloomFilter *filter) {
|
18
24
|
if (filter)
|
19
25
|
bloom_filter_free(filter);
|
@@ -33,17 +39,41 @@ BloomFilter* bloom_handle(VALUE self) {
|
|
33
39
|
}
|
34
40
|
|
35
41
|
VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
|
36
|
-
|
37
|
-
|
42
|
+
double error;
|
43
|
+
size_t nbits, nhash, nmax;
|
44
|
+
|
45
|
+
VALUE max_size, error_rate, bitmap_size, hash_count, options;
|
38
46
|
BloomFilter *filter = 0;
|
39
47
|
|
40
|
-
rb_scan_args(argc, argv, "01", &
|
41
|
-
if (NIL_P(
|
42
|
-
|
43
|
-
else
|
44
|
-
size = atol(CSTRING(table_size));
|
48
|
+
rb_scan_args(argc, argv, "01", &options);
|
49
|
+
if (!NIL_P(options) && TYPE(options) != T_HASH)
|
50
|
+
rb_raise(rb_eArgError, "invalid options, expect hash");
|
45
51
|
|
46
|
-
|
52
|
+
if (NIL_P(options)) {
|
53
|
+
nbits = 1000000;
|
54
|
+
nhash = 4;
|
55
|
+
}
|
56
|
+
else {
|
57
|
+
max_size = rb_hash_aref(options, ID2SYM(rb_intern("size")));
|
58
|
+
error_rate = rb_hash_aref(options, ID2SYM(rb_intern("error_rate")));
|
59
|
+
bitmap_size = rb_hash_aref(options, ID2SYM(rb_intern("bits")));
|
60
|
+
hash_count = rb_hash_aref(options, ID2SYM(rb_intern("hashes")));
|
61
|
+
|
62
|
+
nhash = NIL_P(hash_count) ? 4 : NUM2ULONG(hash_count);
|
63
|
+
|
64
|
+
if (!NIL_P(bitmap_size))
|
65
|
+
nbits = NUM2ULONG(bitmap_size);
|
66
|
+
else if (!NIL_P(max_size)) {
|
67
|
+
nmax = NUM2ULONG(max_size);
|
68
|
+
error = NIL_P(error_rate) ? 0.01 : NUM2DBL(error_rate);
|
69
|
+
nbits = ceil(fabs(log(error) * (double)nmax / pow(log(2), 2)));
|
70
|
+
nhash = ceil(0.7 * (double)nbits / (double)nmax);
|
71
|
+
}
|
72
|
+
else
|
73
|
+
rb_raise(rb_eArgError, "requires either size & error_rate or bits & hashes");
|
74
|
+
}
|
75
|
+
|
76
|
+
filter = bloom_filter_new(nbits, string_nocase_hash, nhash);
|
47
77
|
|
48
78
|
if (!filter)
|
49
79
|
rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");
|
@@ -58,17 +88,19 @@ VALUE bloom_insert(VALUE klass, VALUE string) {
|
|
58
88
|
return Qtrue;
|
59
89
|
}
|
60
90
|
|
61
|
-
VALUE
|
91
|
+
VALUE bloom_include(VALUE klass, VALUE string) {
|
62
92
|
BloomFilter *filter = bloom_handle(klass);
|
63
93
|
return bloom_filter_query(filter, (BloomFilterValue)CSTRING(string)) ? Qtrue : Qfalse;
|
64
94
|
}
|
65
95
|
|
66
96
|
VALUE bloom_dump(VALUE klass, VALUE file) {
|
67
97
|
int fd;
|
98
|
+
uint64_t nbits;
|
99
|
+
FileHeader header;
|
68
100
|
BloomFilter *filter = bloom_handle(klass);
|
69
101
|
|
70
|
-
|
71
|
-
void *buffer = malloc(
|
102
|
+
nbits = (filter->table_size + 7) / 8;
|
103
|
+
void *buffer = malloc(nbits);
|
72
104
|
|
73
105
|
if (!buffer)
|
74
106
|
rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
|
@@ -81,7 +113,16 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
|
|
81
113
|
rb_raise(rb_eIOError, "unable to open file. %s", strerror(errno));
|
82
114
|
}
|
83
115
|
|
84
|
-
|
116
|
+
header.table_size = filter->table_size;
|
117
|
+
header.num_functions = filter->num_functions;
|
118
|
+
|
119
|
+
if (write(fd, &header, sizeof(header)) == -1) {
|
120
|
+
free(buffer);
|
121
|
+
close(fd);
|
122
|
+
rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
|
123
|
+
}
|
124
|
+
|
125
|
+
if (write(fd, buffer, nbits) != -1) {
|
85
126
|
free(buffer);
|
86
127
|
close(fd);
|
87
128
|
return Qtrue;
|
@@ -90,47 +131,46 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
|
|
90
131
|
free(buffer);
|
91
132
|
close(fd);
|
92
133
|
rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
|
93
|
-
return Qfalse; // not reachable
|
94
134
|
}
|
135
|
+
|
136
|
+
return Qfalse; // not reachable
|
95
137
|
}
|
96
138
|
|
97
|
-
VALUE bloom_load(
|
139
|
+
VALUE bloom_load(VALUE klass, VALUE file) {
|
98
140
|
int fd;
|
99
141
|
void *buffer;
|
100
|
-
size_t
|
142
|
+
size_t nbits, bytes;
|
143
|
+
FileHeader header;
|
101
144
|
BloomFilter *filter;
|
102
|
-
VALUE
|
103
|
-
|
104
|
-
rb_scan_args(argc, argv, "11", &file, &table_size);
|
105
|
-
if (NIL_P(table_size)) {
|
106
|
-
size = 1000000;
|
107
|
-
table_size = INT2NUM(size);
|
108
|
-
}
|
109
|
-
else
|
110
|
-
size = atol(CSTRING(table_size));
|
145
|
+
VALUE instance;
|
111
146
|
|
112
147
|
fd = open(CSTRING(file), O_RDONLY);
|
113
148
|
if (fd == -1)
|
114
149
|
rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));
|
115
150
|
|
116
|
-
|
117
|
-
|
151
|
+
if (read(fd, &header, sizeof(header)) != sizeof(header)) {
|
152
|
+
close(fd);
|
153
|
+
rb_raise(rb_eIOError, "unable to read file, header corrupted\n");
|
154
|
+
}
|
155
|
+
|
156
|
+
nbits = (header.table_size + 7) / 8;
|
157
|
+
buffer = malloc(nbits);
|
118
158
|
if (!buffer) {
|
119
159
|
close(fd);
|
120
160
|
rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
|
121
161
|
}
|
122
162
|
|
123
|
-
bytes = read(fd, buffer,
|
124
|
-
if (bytes !=
|
163
|
+
bytes = read(fd, buffer, nbits);
|
164
|
+
if (bytes != nbits) {
|
125
165
|
free(buffer);
|
126
166
|
close(fd);
|
127
|
-
rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes",
|
167
|
+
rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", nbits, bytes);
|
128
168
|
}
|
129
169
|
|
130
|
-
|
131
|
-
|
170
|
+
filter = bloom_filter_new(header.table_size, string_nocase_hash, header.num_functions);
|
171
|
+
bloom_filter_load(filter, buffer);
|
172
|
+
instance = Data_Wrap_Struct(klass, 0, bloom_free, filter);
|
132
173
|
|
133
|
-
bloom_filter_load(bloom_handle(instance), buffer);
|
134
174
|
free(buffer);
|
135
175
|
close(fd);
|
136
176
|
return instance;
|
@@ -142,9 +182,9 @@ Init_bloom_filter() {
|
|
142
182
|
rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
|
143
183
|
rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
|
144
184
|
rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
|
145
|
-
rb_define_method(cBloom, "
|
185
|
+
rb_define_method(cBloom, "include?", RUBY_METHOD_FUNC(bloom_include), 1);
|
146
186
|
|
147
187
|
rb_define_alloc_func(cBloom, bloom_allocate);
|
148
|
-
rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load),
|
188
|
+
rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), 1);
|
149
189
|
rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
|
150
190
|
}
|
data/ext/version.h
CHANGED
@@ -1 +1 @@
|
|
1
|
-
#define RUBY_BLOOM_FILTER_VERSION "0.1.
|
1
|
+
#define RUBY_BLOOM_FILTER_VERSION "0.1.1"
|
data/test/test_basic.rb
CHANGED
@@ -6,18 +6,18 @@ describe 'BloomFilter primitives' do
|
|
6
6
|
end
|
7
7
|
|
8
8
|
it 'should create one with given size' do
|
9
|
-
assert BloomFilter.new(100)
|
9
|
+
assert BloomFilter.new(size: 100)
|
10
10
|
end
|
11
11
|
|
12
12
|
it 'should insert' do
|
13
|
-
assert filter = BloomFilter.new(100)
|
13
|
+
assert filter = BloomFilter.new(size: 100)
|
14
14
|
assert filter.insert("foo")
|
15
15
|
end
|
16
16
|
|
17
17
|
it 'should allow membership checks' do
|
18
|
-
assert filter = BloomFilter.new(100)
|
18
|
+
assert filter = BloomFilter.new(size: 100)
|
19
19
|
assert filter.insert("foo")
|
20
|
-
assert filter.
|
21
|
-
assert !filter.
|
20
|
+
assert filter.include?("foo")
|
21
|
+
assert !filter.include?("bar")
|
22
22
|
end
|
23
23
|
end
|
data/test/test_io.rb
CHANGED
@@ -5,11 +5,11 @@ describe 'BloomFilter load & dump' do
|
|
5
5
|
it 'should dump and load a filter' do
|
6
6
|
file = Tempfile.new("bloom-filter-test")
|
7
7
|
|
8
|
-
assert filter = BloomFilter.new(100)
|
8
|
+
assert filter = BloomFilter.new(size: 100)
|
9
9
|
assert filter.insert("foo")
|
10
10
|
assert filter.dump(file.path), "dump filter"
|
11
|
-
assert filter = BloomFilter.load(file.path
|
12
|
-
assert filter.
|
13
|
-
assert !filter.
|
11
|
+
assert filter = BloomFilter.load(file.path)
|
12
|
+
assert filter.include?("foo")
|
13
|
+
assert !filter.include?("bar")
|
14
14
|
end
|
15
15
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2012-01-
|
17
|
+
date: 2012-01-27 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -66,7 +66,6 @@ files:
|
|
66
66
|
- README.md
|
67
67
|
- CHANGELOG
|
68
68
|
- lib/bloom-filter.rb
|
69
|
-
- lib/bloom_filter.so
|
70
69
|
has_rdoc: true
|
71
70
|
homepage: http://github.com/deepfryed/bloom-filter
|
72
71
|
licenses: []
|
data/lib/bloom_filter.so
DELETED
Binary file
|