bloom-filter 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +10 -5
- data/ext/bloom_filter.c +74 -34
- data/ext/version.h +1 -1
- data/test/test_basic.rb +5 -5
- data/test/test_io.rb +4 -4
- metadata +3 -4
- data/lib/bloom_filter.so +0 -0
data/README.md
CHANGED
@@ -20,7 +20,7 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
|
|
20
20
|
.load
|
21
21
|
#dump
|
22
22
|
#insert
|
23
|
-
#
|
23
|
+
#include?
|
24
24
|
|
25
25
|
```
|
26
26
|
|
@@ -30,14 +30,19 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
|
|
30
30
|
require 'bloom-filter'
|
31
31
|
|
32
32
|
filter = BloomFilter.new
|
33
|
-
|
33
|
+
|
34
|
+
# auto-calculate optimum bitmap size based on maximum number of items stored and desired max error rate.
|
35
|
+
filter = BloomFilter.new size: 100_000, error_rate: 0.01
|
36
|
+
|
37
|
+
# specify bitmap size & number of hash functions explicitly.
|
38
|
+
filter = BloomFilter.new bits: 100_000_0, hashes: 4
|
34
39
|
|
35
40
|
filter.insert "foo"
|
36
|
-
filter.
|
37
|
-
filter.
|
41
|
+
filter.include? "foo" #=> true
|
42
|
+
filter.include? "bar" #=> false
|
38
43
|
|
39
44
|
filter.dump "/tmp/random.bloom"
|
40
|
-
filter = BloomFilter.load "/tmp/random.bloom"
|
45
|
+
filter = BloomFilter.load "/tmp/random.bloom"
|
41
46
|
```
|
42
47
|
|
43
48
|
## See Also
|
data/ext/bloom_filter.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include <errno.h>
|
6
6
|
#include <string.h>
|
7
7
|
#include <stdlib.h>
|
8
|
+
#include <math.h>
|
8
9
|
|
9
10
|
#include "ruby/ruby.h"
|
10
11
|
#include "bloom-filter.h"
|
@@ -14,6 +15,11 @@
|
|
14
15
|
#define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
|
15
16
|
#define CSTRING(v) RSTRING_PTR(TO_S(v))
|
16
17
|
|
18
|
+
typedef struct FileHeader {
|
19
|
+
uint64_t table_size;
|
20
|
+
uint64_t num_functions;
|
21
|
+
} FileHeader;
|
22
|
+
|
17
23
|
static void bloom_free(BloomFilter *filter) {
|
18
24
|
if (filter)
|
19
25
|
bloom_filter_free(filter);
|
@@ -33,17 +39,41 @@ BloomFilter* bloom_handle(VALUE self) {
|
|
33
39
|
}
|
34
40
|
|
35
41
|
VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
|
36
|
-
|
37
|
-
|
42
|
+
double error;
|
43
|
+
size_t nbits, nhash, nmax;
|
44
|
+
|
45
|
+
VALUE max_size, error_rate, bitmap_size, hash_count, options;
|
38
46
|
BloomFilter *filter = 0;
|
39
47
|
|
40
|
-
rb_scan_args(argc, argv, "01", &
|
41
|
-
if (NIL_P(
|
42
|
-
|
43
|
-
else
|
44
|
-
size = atol(CSTRING(table_size));
|
48
|
+
rb_scan_args(argc, argv, "01", &options);
|
49
|
+
if (!NIL_P(options) && TYPE(options) != T_HASH)
|
50
|
+
rb_raise(rb_eArgError, "invalid options, expect hash");
|
45
51
|
|
46
|
-
|
52
|
+
if (NIL_P(options)) {
|
53
|
+
nbits = 1000000;
|
54
|
+
nhash = 4;
|
55
|
+
}
|
56
|
+
else {
|
57
|
+
max_size = rb_hash_aref(options, ID2SYM(rb_intern("size")));
|
58
|
+
error_rate = rb_hash_aref(options, ID2SYM(rb_intern("error_rate")));
|
59
|
+
bitmap_size = rb_hash_aref(options, ID2SYM(rb_intern("bits")));
|
60
|
+
hash_count = rb_hash_aref(options, ID2SYM(rb_intern("hashes")));
|
61
|
+
|
62
|
+
nhash = NIL_P(hash_count) ? 4 : NUM2ULONG(hash_count);
|
63
|
+
|
64
|
+
if (!NIL_P(bitmap_size))
|
65
|
+
nbits = NUM2ULONG(bitmap_size);
|
66
|
+
else if (!NIL_P(max_size)) {
|
67
|
+
nmax = NUM2ULONG(max_size);
|
68
|
+
error = NIL_P(error_rate) ? 0.01 : NUM2DBL(error_rate);
|
69
|
+
nbits = ceil(fabs(log(error) * (double)nmax / pow(log(2), 2)));
|
70
|
+
nhash = ceil(0.7 * (double)nbits / (double)nmax);
|
71
|
+
}
|
72
|
+
else
|
73
|
+
rb_raise(rb_eArgError, "requires either size & error_rate or bits & hashes");
|
74
|
+
}
|
75
|
+
|
76
|
+
filter = bloom_filter_new(nbits, string_nocase_hash, nhash);
|
47
77
|
|
48
78
|
if (!filter)
|
49
79
|
rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");
|
@@ -58,17 +88,19 @@ VALUE bloom_insert(VALUE klass, VALUE string) {
|
|
58
88
|
return Qtrue;
|
59
89
|
}
|
60
90
|
|
61
|
-
VALUE
|
91
|
+
VALUE bloom_include(VALUE klass, VALUE string) {
|
62
92
|
BloomFilter *filter = bloom_handle(klass);
|
63
93
|
return bloom_filter_query(filter, (BloomFilterValue)CSTRING(string)) ? Qtrue : Qfalse;
|
64
94
|
}
|
65
95
|
|
66
96
|
VALUE bloom_dump(VALUE klass, VALUE file) {
|
67
97
|
int fd;
|
98
|
+
uint64_t nbits;
|
99
|
+
FileHeader header;
|
68
100
|
BloomFilter *filter = bloom_handle(klass);
|
69
101
|
|
70
|
-
|
71
|
-
void *buffer = malloc(
|
102
|
+
nbits = (filter->table_size + 7) / 8;
|
103
|
+
void *buffer = malloc(nbits);
|
72
104
|
|
73
105
|
if (!buffer)
|
74
106
|
rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
|
@@ -81,7 +113,16 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
|
|
81
113
|
rb_raise(rb_eIOError, "unable to open file. %s", strerror(errno));
|
82
114
|
}
|
83
115
|
|
84
|
-
|
116
|
+
header.table_size = filter->table_size;
|
117
|
+
header.num_functions = filter->num_functions;
|
118
|
+
|
119
|
+
if (write(fd, &header, sizeof(header)) == -1) {
|
120
|
+
free(buffer);
|
121
|
+
close(fd);
|
122
|
+
rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
|
123
|
+
}
|
124
|
+
|
125
|
+
if (write(fd, buffer, nbits) != -1) {
|
85
126
|
free(buffer);
|
86
127
|
close(fd);
|
87
128
|
return Qtrue;
|
@@ -90,47 +131,46 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
|
|
90
131
|
free(buffer);
|
91
132
|
close(fd);
|
92
133
|
rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
|
93
|
-
return Qfalse; // not reachable
|
94
134
|
}
|
135
|
+
|
136
|
+
return Qfalse; // not reachable
|
95
137
|
}
|
96
138
|
|
97
|
-
VALUE bloom_load(
|
139
|
+
VALUE bloom_load(VALUE klass, VALUE file) {
|
98
140
|
int fd;
|
99
141
|
void *buffer;
|
100
|
-
size_t
|
142
|
+
size_t nbits, bytes;
|
143
|
+
FileHeader header;
|
101
144
|
BloomFilter *filter;
|
102
|
-
VALUE
|
103
|
-
|
104
|
-
rb_scan_args(argc, argv, "11", &file, &table_size);
|
105
|
-
if (NIL_P(table_size)) {
|
106
|
-
size = 1000000;
|
107
|
-
table_size = INT2NUM(size);
|
108
|
-
}
|
109
|
-
else
|
110
|
-
size = atol(CSTRING(table_size));
|
145
|
+
VALUE instance;
|
111
146
|
|
112
147
|
fd = open(CSTRING(file), O_RDONLY);
|
113
148
|
if (fd == -1)
|
114
149
|
rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));
|
115
150
|
|
116
|
-
|
117
|
-
|
151
|
+
if (read(fd, &header, sizeof(header)) != sizeof(header)) {
|
152
|
+
close(fd);
|
153
|
+
rb_raise(rb_eIOError, "unable to read file, header corrupted\n");
|
154
|
+
}
|
155
|
+
|
156
|
+
nbits = (header.table_size + 7) / 8;
|
157
|
+
buffer = malloc(nbits);
|
118
158
|
if (!buffer) {
|
119
159
|
close(fd);
|
120
160
|
rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
|
121
161
|
}
|
122
162
|
|
123
|
-
bytes = read(fd, buffer,
|
124
|
-
if (bytes !=
|
163
|
+
bytes = read(fd, buffer, nbits);
|
164
|
+
if (bytes != nbits) {
|
125
165
|
free(buffer);
|
126
166
|
close(fd);
|
127
|
-
rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes",
|
167
|
+
rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", nbits, bytes);
|
128
168
|
}
|
129
169
|
|
130
|
-
|
131
|
-
|
170
|
+
filter = bloom_filter_new(header.table_size, string_nocase_hash, header.num_functions);
|
171
|
+
bloom_filter_load(filter, buffer);
|
172
|
+
instance = Data_Wrap_Struct(klass, 0, bloom_free, filter);
|
132
173
|
|
133
|
-
bloom_filter_load(bloom_handle(instance), buffer);
|
134
174
|
free(buffer);
|
135
175
|
close(fd);
|
136
176
|
return instance;
|
@@ -142,9 +182,9 @@ Init_bloom_filter() {
|
|
142
182
|
rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
|
143
183
|
rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
|
144
184
|
rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
|
145
|
-
rb_define_method(cBloom, "
|
185
|
+
rb_define_method(cBloom, "include?", RUBY_METHOD_FUNC(bloom_include), 1);
|
146
186
|
|
147
187
|
rb_define_alloc_func(cBloom, bloom_allocate);
|
148
|
-
rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load),
|
188
|
+
rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), 1);
|
149
189
|
rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
|
150
190
|
}
|
data/ext/version.h
CHANGED
@@ -1 +1 @@
|
|
1
|
-
#define RUBY_BLOOM_FILTER_VERSION "0.1.
|
1
|
+
#define RUBY_BLOOM_FILTER_VERSION "0.1.1"
|
data/test/test_basic.rb
CHANGED
@@ -6,18 +6,18 @@ describe 'BloomFilter primitives' do
|
|
6
6
|
end
|
7
7
|
|
8
8
|
it 'should create one with given size' do
|
9
|
-
assert BloomFilter.new(100)
|
9
|
+
assert BloomFilter.new(size: 100)
|
10
10
|
end
|
11
11
|
|
12
12
|
it 'should insert' do
|
13
|
-
assert filter = BloomFilter.new(100)
|
13
|
+
assert filter = BloomFilter.new(size: 100)
|
14
14
|
assert filter.insert("foo")
|
15
15
|
end
|
16
16
|
|
17
17
|
it 'should allow membership checks' do
|
18
|
-
assert filter = BloomFilter.new(100)
|
18
|
+
assert filter = BloomFilter.new(size: 100)
|
19
19
|
assert filter.insert("foo")
|
20
|
-
assert filter.
|
21
|
-
assert !filter.
|
20
|
+
assert filter.include?("foo")
|
21
|
+
assert !filter.include?("bar")
|
22
22
|
end
|
23
23
|
end
|
data/test/test_io.rb
CHANGED
@@ -5,11 +5,11 @@ describe 'BloomFilter load & dump' do
|
|
5
5
|
it 'should dump and load a filter' do
|
6
6
|
file = Tempfile.new("bloom-filter-test")
|
7
7
|
|
8
|
-
assert filter = BloomFilter.new(100)
|
8
|
+
assert filter = BloomFilter.new(size: 100)
|
9
9
|
assert filter.insert("foo")
|
10
10
|
assert filter.dump(file.path), "dump filter"
|
11
|
-
assert filter = BloomFilter.load(file.path
|
12
|
-
assert filter.
|
13
|
-
assert !filter.
|
11
|
+
assert filter = BloomFilter.load(file.path)
|
12
|
+
assert filter.include?("foo")
|
13
|
+
assert !filter.include?("bar")
|
14
14
|
end
|
15
15
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2012-01-
|
17
|
+
date: 2012-01-27 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -66,7 +66,6 @@ files:
|
|
66
66
|
- README.md
|
67
67
|
- CHANGELOG
|
68
68
|
- lib/bloom-filter.rb
|
69
|
-
- lib/bloom_filter.so
|
70
69
|
has_rdoc: true
|
71
70
|
homepage: http://github.com/deepfryed/bloom-filter
|
72
71
|
licenses: []
|
data/lib/bloom_filter.so
DELETED
Binary file
|