bloom-filter 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -20,7 +20,7 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
20
20
  .load
21
21
  #dump
22
22
  #insert
23
- #exists?
23
+ #include?
24
24
 
25
25
  ```
26
26
 
@@ -30,14 +30,19 @@ BloomFilter is a ruby library that implements an in-memory [Bloom Filter](http:/
30
30
  require 'bloom-filter'
31
31
 
32
32
  filter = BloomFilter.new
33
- filter = BloomFilter.new 100_000
33
+
34
+ # auto-calculate the optimum bitmap size and number of hash functions from the maximum number of items to be stored and the desired maximum error rate.
35
+ filter = BloomFilter.new size: 100_000, error_rate: 0.01
36
+
37
+ # specify bitmap size & number of hash functions explicitly.
38
+ filter = BloomFilter.new bits: 1_000_000, hashes: 4
34
39
 
35
40
  filter.insert "foo"
36
- filter.exists? "foo" #=> true
37
- filter.exists? "bar" #=> false
41
+ filter.include? "foo" #=> true
42
+ filter.include? "bar" #=> false
38
43
 
39
44
  filter.dump "/tmp/random.bloom"
40
- filter = BloomFilter.load "/tmp/random.bloom", 100_000
45
+ filter = BloomFilter.load "/tmp/random.bloom"
41
46
  ```
42
47
 
43
48
  ## See Also
@@ -5,6 +5,7 @@
5
5
  #include <errno.h>
6
6
  #include <string.h>
7
7
  #include <stdlib.h>
8
+ #include <math.h>
8
9
 
9
10
  #include "ruby/ruby.h"
10
11
  #include "bloom-filter.h"
@@ -14,6 +15,11 @@
14
15
  #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
15
16
  #define CSTRING(v) RSTRING_PTR(TO_S(v))
16
17
 
18
+ typedef struct FileHeader {
19
+ uint64_t table_size;
20
+ uint64_t num_functions;
21
+ } FileHeader;
22
+
17
23
  static void bloom_free(BloomFilter *filter) {
18
24
  if (filter)
19
25
  bloom_filter_free(filter);
@@ -33,17 +39,41 @@ BloomFilter* bloom_handle(VALUE self) {
33
39
  }
34
40
 
35
41
  VALUE bloom_initialize(int argc, VALUE *argv, VALUE self) {
36
- size_t size;
37
- VALUE table_size;
42
+ double error;
43
+ size_t nbits, nhash, nmax;
44
+
45
+ VALUE max_size, error_rate, bitmap_size, hash_count, options;
38
46
  BloomFilter *filter = 0;
39
47
 
40
- rb_scan_args(argc, argv, "01", &table_size);
41
- if (NIL_P(table_size))
42
- size = 1000000;
43
- else
44
- size = atol(CSTRING(table_size));
48
+ rb_scan_args(argc, argv, "01", &options);
49
+ if (!NIL_P(options) && TYPE(options) != T_HASH)
50
+ rb_raise(rb_eArgError, "invalid options, expect hash");
45
51
 
46
- filter = bloom_filter_new(size, string_nocase_hash, 4);
52
+ if (NIL_P(options)) {
53
+ nbits = 1000000;
54
+ nhash = 4;
55
+ }
56
+ else {
57
+ max_size = rb_hash_aref(options, ID2SYM(rb_intern("size")));
58
+ error_rate = rb_hash_aref(options, ID2SYM(rb_intern("error_rate")));
59
+ bitmap_size = rb_hash_aref(options, ID2SYM(rb_intern("bits")));
60
+ hash_count = rb_hash_aref(options, ID2SYM(rb_intern("hashes")));
61
+
62
+ nhash = NIL_P(hash_count) ? 4 : NUM2ULONG(hash_count);
63
+
64
+ if (!NIL_P(bitmap_size))
65
+ nbits = NUM2ULONG(bitmap_size);
66
+ else if (!NIL_P(max_size)) {
67
+ nmax = NUM2ULONG(max_size);
68
+ error = NIL_P(error_rate) ? 0.01 : NUM2DBL(error_rate);
69
+ nbits = ceil(fabs(log(error) * (double)nmax / pow(log(2), 2)));
70
+ nhash = ceil(0.7 * (double)nbits / (double)nmax);
71
+ }
72
+ else
73
+ rb_raise(rb_eArgError, "requires either size & error_rate or bits & hashes");
74
+ }
75
+
76
+ filter = bloom_filter_new(nbits, string_nocase_hash, nhash);
47
77
 
48
78
  if (!filter)
49
79
  rb_raise(rb_eNoMemError, "unable to allocate memory for BloomFilter");
@@ -58,17 +88,19 @@ VALUE bloom_insert(VALUE klass, VALUE string) {
58
88
  return Qtrue;
59
89
  }
60
90
 
61
- VALUE bloom_exists(VALUE klass, VALUE string) {
91
+ VALUE bloom_include(VALUE klass, VALUE string) {
62
92
  BloomFilter *filter = bloom_handle(klass);
63
93
  return bloom_filter_query(filter, (BloomFilterValue)CSTRING(string)) ? Qtrue : Qfalse;
64
94
  }
65
95
 
66
96
  VALUE bloom_dump(VALUE klass, VALUE file) {
67
97
  int fd;
98
+ uint64_t nbits;
99
+ FileHeader header;
68
100
  BloomFilter *filter = bloom_handle(klass);
69
101
 
70
- size_t size = (filter->table_size + 7) / 8;
71
- void *buffer = malloc(size);
102
+ nbits = (filter->table_size + 7) / 8;
103
+ void *buffer = malloc(nbits);
72
104
 
73
105
  if (!buffer)
74
106
  rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
@@ -81,7 +113,16 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
81
113
  rb_raise(rb_eIOError, "unable to open file. %s", strerror(errno));
82
114
  }
83
115
 
84
- if (write(fd, buffer, size) != -1) {
116
+ header.table_size = filter->table_size;
117
+ header.num_functions = filter->num_functions;
118
+
119
+ if (write(fd, &header, sizeof(header)) == -1) {
120
+ free(buffer);
121
+ close(fd);
122
+ rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
123
+ }
124
+
125
+ if (write(fd, buffer, nbits) != -1) {
85
126
  free(buffer);
86
127
  close(fd);
87
128
  return Qtrue;
@@ -90,47 +131,46 @@ VALUE bloom_dump(VALUE klass, VALUE file) {
90
131
  free(buffer);
91
132
  close(fd);
92
133
  rb_raise(rb_eIOError, "error dumping BloomFilter: %s\n", strerror(errno));
93
- return Qfalse; // not reachable
94
134
  }
135
+
136
+ return Qfalse; // not reachable
95
137
  }
96
138
 
97
- VALUE bloom_load(int argc, VALUE *argv, VALUE klass) {
139
+ VALUE bloom_load(VALUE klass, VALUE file) {
98
140
  int fd;
99
141
  void *buffer;
100
- size_t size, bytes;
142
+ size_t nbits, bytes;
143
+ FileHeader header;
101
144
  BloomFilter *filter;
102
- VALUE file, table_size, instance;
103
-
104
- rb_scan_args(argc, argv, "11", &file, &table_size);
105
- if (NIL_P(table_size)) {
106
- size = 1000000;
107
- table_size = INT2NUM(size);
108
- }
109
- else
110
- size = atol(CSTRING(table_size));
145
+ VALUE instance;
111
146
 
112
147
  fd = open(CSTRING(file), O_RDONLY);
113
148
  if (fd == -1)
114
149
  rb_raise(rb_eIOError, "unable to open file: %s", strerror(errno));
115
150
 
116
- size = (size + 7) / 8;
117
- buffer = malloc(size);
151
+ if (read(fd, &header, sizeof(header)) != sizeof(header)) {
152
+ close(fd);
153
+ rb_raise(rb_eIOError, "unable to read file, header corrupted\n");
154
+ }
155
+
156
+ nbits = (header.table_size + 7) / 8;
157
+ buffer = malloc(nbits);
118
158
  if (!buffer) {
119
159
  close(fd);
120
160
  rb_raise(rb_eNoMemError, "out of memory dumping BloomFilter");
121
161
  }
122
162
 
123
- bytes = read(fd, buffer, size);
124
- if (bytes != size) {
163
+ bytes = read(fd, buffer, nbits);
164
+ if (bytes != nbits) {
125
165
  free(buffer);
126
166
  close(fd);
127
- rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", size, bytes);
167
+ rb_raise(rb_eStandardError, "unable to load BloomFilter, expected %ld but got %ld bytes", nbits, bytes);
128
168
  }
129
169
 
130
- instance = bloom_allocate(klass);
131
- bloom_initialize(1, &table_size, instance);
170
+ filter = bloom_filter_new(header.table_size, string_nocase_hash, header.num_functions);
171
+ bloom_filter_load(filter, buffer);
172
+ instance = Data_Wrap_Struct(klass, 0, bloom_free, filter);
132
173
 
133
- bloom_filter_load(bloom_handle(instance), buffer);
134
174
  free(buffer);
135
175
  close(fd);
136
176
  return instance;
@@ -142,9 +182,9 @@ Init_bloom_filter() {
142
182
  rb_define_method(cBloom, "initialize", RUBY_METHOD_FUNC(bloom_initialize), -1);
143
183
  rb_define_method(cBloom, "dump", RUBY_METHOD_FUNC(bloom_dump), 1);
144
184
  rb_define_method(cBloom, "insert", RUBY_METHOD_FUNC(bloom_insert), 1);
145
- rb_define_method(cBloom, "exists?", RUBY_METHOD_FUNC(bloom_exists), 1);
185
+ rb_define_method(cBloom, "include?", RUBY_METHOD_FUNC(bloom_include), 1);
146
186
 
147
187
  rb_define_alloc_func(cBloom, bloom_allocate);
148
- rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), -1);
188
+ rb_define_singleton_method(cBloom, "load", RUBY_METHOD_FUNC(bloom_load), 1);
149
189
  rb_define_const(cBloom, "VERSION", rb_str_new2(RUBY_BLOOM_FILTER_VERSION));
150
190
  }
@@ -1 +1 @@
1
- #define RUBY_BLOOM_FILTER_VERSION "0.1.0"
1
+ #define RUBY_BLOOM_FILTER_VERSION "0.1.1"
@@ -6,18 +6,18 @@ describe 'BloomFilter primitives' do
6
6
  end
7
7
 
8
8
  it 'should create one with given size' do
9
- assert BloomFilter.new(100)
9
+ assert BloomFilter.new(size: 100)
10
10
  end
11
11
 
12
12
  it 'should insert' do
13
- assert filter = BloomFilter.new(100)
13
+ assert filter = BloomFilter.new(size: 100)
14
14
  assert filter.insert("foo")
15
15
  end
16
16
 
17
17
  it 'should allow membership checks' do
18
- assert filter = BloomFilter.new(100)
18
+ assert filter = BloomFilter.new(size: 100)
19
19
  assert filter.insert("foo")
20
- assert filter.exists?("foo")
21
- assert !filter.exists?("bar")
20
+ assert filter.include?("foo")
21
+ assert !filter.include?("bar")
22
22
  end
23
23
  end
@@ -5,11 +5,11 @@ describe 'BloomFilter load & dump' do
5
5
  it 'should dump and load a filter' do
6
6
  file = Tempfile.new("bloom-filter-test")
7
7
 
8
- assert filter = BloomFilter.new(100)
8
+ assert filter = BloomFilter.new(size: 100)
9
9
  assert filter.insert("foo")
10
10
  assert filter.dump(file.path), "dump filter"
11
- assert filter = BloomFilter.load(file.path, 100)
12
- assert filter.exists?("foo")
13
- assert !filter.exists?("bar")
11
+ assert filter = BloomFilter.load(file.path)
12
+ assert filter.include?("foo")
13
+ assert !filter.include?("bar")
14
14
  end
15
15
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 1
9
+ version: 0.1.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2012-01-25 00:00:00 +11:00
17
+ date: 2012-01-27 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -66,7 +66,6 @@ files:
66
66
  - README.md
67
67
  - CHANGELOG
68
68
  - lib/bloom-filter.rb
69
- - lib/bloom_filter.so
70
69
  has_rdoc: true
71
70
  homepage: http://github.com/deepfryed/bloom-filter
72
71
  licenses: []
Binary file