bloom-filter 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -20,7 +20,7 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
20
20
  .load
21
21
  #dump
22
22
  #insert
23
- #exists?
23
+ #include?
24
24
 
25
25
  ```
26
26
 
@@ -30,14 +30,19 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
30
30
  require 'bloom-filter'
31
31
 
32
32
  filter = BloomFilter.new
33
- filter = BloomFilter.new 100_000
33
+
34
+ # auto-calculate the optimal bitmap size and number of hash functions from the maximum number of items stored and the desired maximum error rate.
35
+ filter = BloomFilter.new size: 100_000, error_rate: 0.01
36
+
37
+ # specify bitmap size & number of hash functions explicitly.
38
+ filter = BloomFilter.new bits: 1_000_000, hashes: 4
34
39
 
35
40
  filter.insert "foo"
36
- filter.exists? "foo" #=> true
37
- filter.exists? "bar" #=> false
41
+ filter.include? "foo" #=> true
42
+ filter.include? "bar" #=> false
38
43
 
39
44
  filter.dump "/tmp/random.bloom"
40
- filter = BloomFilter.load "/tmp/random.bloom", 100_000
45
+ filter = BloomFilter.load "/tmp/random.bloom"
41
46
  ```
42
47
 
43
48
  ## See Also
@@ -5,6 +5,7 @@
5
5
  #include <errno.h>
6
6
  #include <string.h>
7
7
  #include <stdlib.h>
8
+ #include <math.h>
8
9
 
9
10
  #include "ruby/ruby.h"
10
11
  #include "bloom-filter.h"
@@ -14,6 +15,11 @@
14
15
  #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
15
16
  #define CSTRING(v) RSTRING_PTR(TO_S(v))
16
17
 
18
+ typedef struct FileHeader {
19
+ uint64_t table_size;
20
+ uint64_t num_functions;
21
+ } FileHeader;
22
+
17
23
  static void bloom_free(BloomFilter *filter) {
18
24
  if (filter)
19
25
  bloom_filter_free(filter);
@@ -33,17 +39,41 @@ BloomFilter* bloom_handle(VALUE self) {
33
39
  }
34
40
 
35
41
  VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
36
- size_t size;
37
- VALUE table_size;
42
+ double error;
43
+ size_t nbits, nhash, nmax;
44
+
45
+ VALUE max_size, error_rate, bitmap_size, hash_count, options;
38
46
  BloomFilter *filter = 0;
39
47
 
40
- rb_scan_args(argc, argv, "01", &table_size);
41
- if (NIL_P(table_size))
42
- size = 1000000;
43
- else
44
- size = atol(CSTRING(table_size));
48
+ rb_scan_args(argc, argv, "01", &options);
49
+ if (!NIL_P(options) && TYPE(options) != T_HASH)
50
+ rb_raise(rb_eArgError, "invalid options, expect hash");
45
51
 
46
- filter = bloom_filter_new(size, string_nocase_hash, 4);
52
+ if (NIL_P(options)) {
53
+ nbits = 1000000;
54
+ nhash = 4;
55
+ }
56
+ else {
57
+ max_size = rb_hash_aref(options, ID2SYM(rb_intern("size")));
58
+ error_rate = rb_hash_aref(options, ID2SYM(rb_intern("error_rate")));
59
+ bitmap_size = rb_hash_aref(options, ID2SYM(rb_intern("bits")));
60
+ hash_count = rb_hash_aref(options, ID2SYM(rb_intern("hashes")));
61
+
62
+ nhash = NIL_P(hash_count) ? 4 : NUM2ULONG(hash_count);
63
+
64
+ if (!NIL_P(bitmap_size))
65
+ nbits = NUM2ULONG(bitmap_size);
66
+ else if (!NIL_P(max_size)) {
67
+ nmax = NUM2ULONG(max_size);
68
+ error = NIL_P(error_rate) ? 0.01 : NUM2DBL(error_rate);
69
+ nbits = ceil(fabs(log(error) * (double)nmax / pow(log(2), 2)));
70
+ nhash = ceil(0.7 * (double)nbits / (double)nmax);
71
+ }
72
+ else
73
+ rb_raise(rb_eArgError, "requires either size & error_rate or bits & hashes");
74
+ }
75
+
76
+ filter = bloom_filter_new(nbits, string_nocase_hash, nhash);
47
77
 
48
78
  if (!filter)
49
79
  rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");
@@ -58,17 +88,19 @@ VALUE bloom_insert(VALUE klass, VALUE string) {
58
88
  return Qtrue;
59
89
  }
60
90
 
61
- VALUE bloom_exists(VALUE klass, VALUE string) {
91
+ VALUE bloom_include(VALUE klass, VALUE string) {
62
92
  BloomFilter *filter = bloom_handle(klass);
63
93
  return bloom_filter_query(filter, (BloomFilterValue)CSTRING(string)) ? Qtrue : Qfalse;
64
94
  }
65
95
 
66
96
  VALUE bloom_dump(VALUE klass, VALUE file) {
67
97
  int fd;
98
+ uint64_t nbits;
99
+ FileHeader header;
68
100
  BloomFilter *filter = bloom_handle(klass);
69
101
 
70
- size_t size = (filter->table_size + 7) / 8;
71
- void *buffer = malloc(size);
102
+ nbits = (filter->table_size + 7) / 8;
103
+ void *buffer = malloc(nbits);
72
104
 
73
105
  if (!buffer)
74
106
  rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
@@ -81,7 +113,16 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
81
113
  rb_raise(rb_eIOError, "unable to open file. %s", strerror(errno));
82
114
  }
83
115
 
84
- if (write(fd, buffer, size) != -1) {
116
+ header.table_size = filter->table_size;
117
+ header.num_functions = filter->num_functions;
118
+
119
+ if (write(fd, &header, sizeof(header)) == -1) {
120
+ free(buffer);
121
+ close(fd);
122
+ rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
123
+ }
124
+
125
+ if (write(fd, buffer, nbits) != -1) {
85
126
  free(buffer);
86
127
  close(fd);
87
128
  return Qtrue;
@@ -90,47 +131,46 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
90
131
  free(buffer);
91
132
  close(fd);
92
133
  rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
93
- return Qfalse; // not reachable
94
134
  }
135
+
136
+ return Qfalse; // not reachable
95
137
  }
96
138
 
97
- VALUE bloom_load(int argc, VALUE *argv, VALUE klass) {
139
+ VALUE bloom_load(VALUE klass, VALUE file) {
98
140
  int fd;
99
141
  void *buffer;
100
- size_t size, bytes;
142
+ size_t nbits, bytes;
143
+ FileHeader header;
101
144
  BloomFilter *filter;
102
- VALUE file, table_size, instance;
103
-
104
- rb_scan_args(argc, argv, "11", &file, &table_size);
105
- if (NIL_P(table_size)) {
106
- size = 1000000;
107
- table_size = INT2NUM(size);
108
- }
109
- else
110
- size = atol(CSTRING(table_size));
145
+ VALUE instance;
111
146
 
112
147
  fd = open(CSTRING(file), O_RDONLY);
113
148
  if (fd == -1)
114
149
  rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));
115
150
 
116
- size = (size + 7) / 8;
117
- buffer = malloc(size);
151
+ if (read(fd, &header, sizeof(header)) != sizeof(header)) {
152
+ close(fd);
153
+ rb_raise(rb_eIOError, "unable to read file, header corrupted\n");
154
+ }
155
+
156
+ nbits = (header.table_size + 7) / 8;
157
+ buffer = malloc(nbits);
118
158
  if (!buffer) {
119
159
  close(fd);
120
160
  rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
121
161
  }
122
162
 
123
- bytes = read(fd, buffer, size);
124
- if (bytes != size) {
163
+ bytes = read(fd, buffer, nbits);
164
+ if (bytes != nbits) {
125
165
  free(buffer);
126
166
  close(fd);
127
- rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", size, bytes);
167
+ rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", nbits, bytes);
128
168
  }
129
169
 
130
- instance = bloom_allocate(klass);
131
- bloom_initialize(1, &table_size, instance);
170
+ filter = bloom_filter_new(header.table_size, string_nocase_hash, header.num_functions);
171
+ bloom_filter_load(filter, buffer);
172
+ instance = Data_Wrap_Struct(klass, 0, bloom_free, filter);
132
173
 
133
- bloom_filter_load(bloom_handle(instance), buffer);
134
174
  free(buffer);
135
175
  close(fd);
136
176
  return instance;
@@ -142,9 +182,9 @@ Init_bloom_filter() {
142
182
  rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
143
183
  rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
144
184
  rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
145
- rb_define_method(cBloom, "exists?", RUBY_METHOD_FUNC(bloom_exists), 1);
185
+ rb_define_method(cBloom, "include?", RUBY_METHOD_FUNC(bloom_include), 1);
146
186
 
147
187
  rb_define_alloc_func(cBloom, bloom_allocate);
148
- rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), -1);
188
+ rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), 1);
149
189
  rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
150
190
  }
@@ -1 +1 @@
1
- #define RUBY_BLOOM_FILTER_VERSION "0.1.0"
1
+ #define RUBY_BLOOM_FILTER_VERSION "0.1.1"
@@ -6,18 +6,18 @@ describe 'BloomFilter primitives' do
6
6
  end
7
7
 
8
8
  it 'should create one with given size' do
9
- assert BloomFilter.new(100)
9
+ assert BloomFilter.new(size: 100)
10
10
  end
11
11
 
12
12
  it 'should insert' do
13
- assert filter = BloomFilter.new(100)
13
+ assert filter = BloomFilter.new(size: 100)
14
14
  assert filter.insert("foo")
15
15
  end
16
16
 
17
17
  it 'should allow membership checks' do
18
- assert filter = BloomFilter.new(100)
18
+ assert filter = BloomFilter.new(size: 100)
19
19
  assert filter.insert("foo")
20
- assert filter.exists?("foo")
21
- assert !filter.exists?("bar")
20
+ assert filter.include?("foo")
21
+ assert !filter.include?("bar")
22
22
  end
23
23
  end
@@ -5,11 +5,11 @@ describe 'BloomFilter load & dump' do
5
5
  it 'should dump and load a filter' do
6
6
  file = Tempfile.new("bloom-filter-test")
7
7
 
8
- assert filter = BloomFilter.new(100)
8
+ assert filter = BloomFilter.new(size: 100)
9
9
  assert filter.insert("foo")
10
10
  assert filter.dump(file.path), "dump filter"
11
- assert filter = BloomFilter.load(file.path, 100)
12
- assert filter.exists?("foo")
13
- assert !filter.exists?("bar")
11
+ assert filter = BloomFilter.load(file.path)
12
+ assert filter.include?("foo")
13
+ assert !filter.include?("bar")
14
14
  end
15
15
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 1
9
+ version: 0.1.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2012-01-25 00:00:00 +11:00
17
+ date: 2012-01-27 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -66,7 +66,6 @@ files:
66
66
  - README.md
67
67
  - CHANGELOG
68
68
  - lib/bloom-filter.rb
69
- - lib/bloom_filter.so
70
69
  has_rdoc: true
71
70
  homepage: http://github.com/deepfryed/bloom-filter
72
71
  licenses: []
Binary file