ewah-bitset 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1 @@
1
+ v0.0.1. First version.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2010 Josh Ferguson (josh@besquared.net)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/Manifest ADDED
@@ -0,0 +1,11 @@
1
+ CHANGELOG
2
+ LICENSE
3
+ Manifest
4
+ README.md
5
+ Rakefile
6
+ ext/boolarray.h
7
+ ext/ewah-bitset.cpp
8
+ ext/ewah.h
9
+ ext/extconf.rb
10
+ spec/ewah_bitset_spec.rb
11
+ spec/spec.opts
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # EWAH Bitsets in Ruby
2
+
3
+ Enhanced Word Aligned Hybrid (EWAH) encoding is a compression strategy for bitmaps. The compressed bitmaps can be scanned over and operated on in place which results in extremely fast processing times even while maintaining high compression rates.
4
+
5
+ This library wraps the original C++ implementation written by Daniel Lemire, one of the authors of the original paper, which is hosted here:
6
+
7
+ https://github.com/lemire/EWAHBoolArray
8
+
9
+ Examples are in the spec but the general idea goes something like this:
10
+
11
+ require 'ewahbitset'
12
+
13
+ bitset = EwahBitset.new
14
+ 0.upto(10) do |i|
15
+ bitset.set(i * 10)
16
+ end
17
+
18
+ positions = []
19
+ bitset.each do |position|
20
+ positions << position
21
+ end
22
+
23
+ puts positions.inspect
24
+
25
+ => [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
26
+
27
+ puts bitset.serialize.inspect
28
+
29
+ => "e\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\000\000\000\000\004\000\000\000\001\004\020@\000\001\004\020@\000\001\004\020\000\000\000"
30
+
31
+ # Tradeoffs
32
+
33
+ Of course, there are a few tradeoffs at work here worth mentioning. The first is that bits must be set in order: you can't set bit position 1000 and then go back and set bit position 100. The second is that there is no random access into the bitset; any time you want to check whether a bit is set, you have to do a sequential scan. Because of this, an implementation of is_set? requires a sequential scan using the each method. This may make sense for some uses and not others, so be aware!
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
# Build, packaging and test tasks for the ewah-bitset gem.
require 'echoe'
require 'rake'
require 'rspec/core/rake_task'

# Running plain `rake` runs the specs.
task :default => :spec

# Gem packaging metadata, handed to Echoe.
Echoe.new("ewah-bitset") do |gem|
  gem.author  = "Josh Ferguson"
  gem.email   = "josh@besquared.net"
  gem.project = "ewah-bitset"
  gem.summary = "A wrapper around Lemire's EWAHBoolArray from https://github.com/lemire/EWAHBoolArray"
  gem.url     = "http://www.github.com/yammer/ewah-bitset/"
end

# `rake spec`: run every *_spec.rb under spec/ with the options file.
RSpec::Core::RakeTask.new(:spec) do |spec_task|
  spec_task.pattern    = 'spec/**/*_spec.rb'
  spec_task.rspec_opts = ['--options', "\"spec/spec.opts\""]
end
@@ -0,0 +1,30 @@
1
# -*- encoding: utf-8 -*-

# Gem specification for ewah-bitset 0.0.1.
Gem::Specification.new do |spec|
  # The same sentence is used for both the description and the summary.
  blurb = "A wrapper around Lemire's EWAHBoolArray from https://github.com/lemire/EWAHBoolArray"

  spec.name = "ewah-bitset"
  spec.version = "0.0.1"

  spec.required_rubygems_version = Gem::Requirement.new(">= 1.2") if spec.respond_to? :required_rubygems_version=
  spec.authors = ["Josh Ferguson"]
  spec.date = "2012-04-21"
  spec.description = blurb
  spec.email = "josh@besquared.net"
  spec.extensions = ["ext/extconf.rb"]
  spec.extra_rdoc_files = ["CHANGELOG", "LICENSE", "README.md", "ext/boolarray.h", "ext/ewah-bitset.cpp", "ext/ewah.h", "ext/extconf.rb"]
  spec.files = ["CHANGELOG", "LICENSE", "Manifest", "README.md", "Rakefile", "ext/boolarray.h", "ext/ewah-bitset.cpp", "ext/ewah.h", "ext/extconf.rb", "spec/ewah_bitset_spec.rb", "spec/spec.opts", "ewah-bitset.gemspec"]
  spec.homepage = "http://www.github.com/yammer/ewah-bitset/"
  spec.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Ewah-bitset", "--main", "README.md"]
  spec.require_paths = ["lib", "ext"]
  spec.rubyforge_project = "ewah-bitset"
  spec.rubygems_version = "1.8.15"
  spec.summary = blurb

  # No dependencies are declared, so the only version-specific step is
  # recording the specification version where RubyGems supports it.
  spec.specification_version = 3 if spec.respond_to? :specification_version
end
data/ext/boolarray.h ADDED
@@ -0,0 +1,179 @@
1
+ #ifndef BOOLARRAY_H
2
+ #define BOOLARRAY_H
3
+
4
+ #include <cassert>
5
+ #include <iostream>
6
+ #include <vector>
7
+ #include <stdexcept>
8
+ #include <sstream>
9
+ #include <iso646.h> // mostly for Microsoft compilers
10
+
11
typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned short uword16;
typedef unsigned int uword32;
typedef unsigned long long uword64;


// NOTE(review): a using-directive in a header leaks into every includer.
// It is kept because the code below (and, presumably, other files of this
// project) spell std names such as istream/vector unqualified.
using namespace std;

/**
 * A dynamic bitset implementation (without compression).
 * This is not tremendously useful, but it is provided as a reference.
 *
 * Bits are packed into words of type `uword`; bit `pos` lives in word
 * `pos / wordinbits` at bit offset `pos % wordinbits`.
 */
template <class uword=uword32>
class BoolArray {
public:
    /** Creates a bitset of n bits; every backing word starts as initval. */
    BoolArray(const size_t n, const uword initval = 0)
        : buffer(wordsNeeded(n), initval), sizeinbits(n) { }

    /** Creates an empty bitset. */
    BoolArray() : buffer(), sizeinbits(0) {}

    BoolArray(const BoolArray & ba) : buffer(ba.buffer), sizeinbits(ba.sizeinbits) {}

    /**
     * Loads a bitset in the format produced by write(): the size in bits
     * (a raw size_t) followed by the raw words.
     */
    void read(istream & in) {
        sizeinbits = 0;
        in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
        buffer.resize(wordsNeeded(sizeinbits));
        // &buffer[0] is not valid on an empty vector, so skip the payload read.
        if(!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]), buffer.size()*sizeof(uword));
    }

    /** Loads exactly `size` raw words; the size in bits becomes size words. */
    void readBuffer(istream & in, const size_t size) {
        buffer.resize(size);
        if(!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]), buffer.size()*sizeof(uword));
        sizeinbits = size*sizeof(uword)*8;
    }

    /** Overrides the logical size in bits without touching the words. */
    void setSizeInBits(const size_t sizeib) {
        sizeinbits = sizeib;
    }


    /** Writes the size in bits followed by all words covering it. */
    void write(ostream & out) const {
        write(out, sizeinbits);
    }

    /** Writes `numberofbits` (raw size_t) followed by the words covering it. */
    void write(ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        assert(size <= buffer.size());
        out.write(reinterpret_cast<const char *>(&numberofbits), sizeof(numberofbits));
        if(size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]), size*sizeof(uword));
    }

    /** Writes only the words covering `numberofbits`, without a size header. */
    void writeBuffer(ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        assert(size <= buffer.size());
        if(size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]), size*sizeof(uword));
    }

    /** Number of bytes write() emits for the current size. */
    size_t sizeOnDisk() const {
        return sizeof(sizeinbits) + wordsNeeded(sizeinbits)*sizeof(uword);
    }


    BoolArray& operator=(const BoolArray & x) {
        this->buffer = x.buffer;
        this->sizeinbits = x.sizeinbits;
        return *this;
    }

    /** Equal when both the size in bits and every backing word match. */
    bool operator==(const BoolArray & x) const {
        if(sizeinbits != x.sizeinbits) return false;
        assert(buffer.size() == x.buffer.size());
        for(size_t k = 0; k < buffer.size(); ++k)
            if(buffer[k] != x.buffer[k]) return false;
        return true;
    }

    bool operator!=(const BoolArray & x) const {
        return ! operator==(x);
    }

    /** Overwrites the word at index pos. */
    void setWord(const size_t pos, const uword val) {
        assert(pos < buffer.size());
        buffer[pos] = val;
    }

    /**
     * Appends one whole word.
     * Throws invalid_argument when the current size is not word aligned.
     */
    void add(const uword val) {
        if(sizeinbits % wordinbits != 0) throw invalid_argument("you probably didn't want to do this");
        sizeinbits += wordinbits;
        buffer.push_back(val);
    }

    /** Returns the word at index pos. */
    uword getWord(const size_t pos) const {
        assert(pos < buffer.size());
        return buffer[pos];
    }

    /**
     * set to true (whether it was already set to true or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void set(const size_t pos) {
        assert(pos/wordinbits < buffer.size());
        buffer[pos/wordinbits] |= bitMask(pos);
    }

    /**
     * set to false (whether it was already set to false or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void unset(const size_t pos) {
        assert(pos/wordinbits < buffer.size());
        // AND with the inverted mask clears only the target bit. (OR-ing the
        // inverted mask would instead set every other bit of the word.)
        buffer[pos/wordinbits] &= static_cast<uword>(~bitMask(pos));
    }

    /**
     * true or false? (set or unset)
     */
    bool get(const size_t pos) const {
        assert(pos/wordinbits < buffer.size());
        return (buffer[pos/wordinbits] & bitMask(pos)) != 0;
    }

    /**
     * set all bits to 0
     *
     * The backing words are zeroed but stay allocated; the size in bits
     * becomes 0.
     */
    void reset() {
        // vector::assign needs no extra header and is safe on an empty buffer.
        buffer.assign(buffer.size(), static_cast<uword>(0));
        sizeinbits = 0;
    }

    size_t sizeInBits() const {
        return sizeinbits;
    }

    ~BoolArray() {}

    void logicaland(const BoolArray & ba, BoolArray & out);

    void logicalor(const BoolArray & ba, BoolArray & out);



    /** Prints every bit as 0/1 followed by a space, then a newline. */
    inline void printout(ostream &o = cout) const {
        for(size_t k = 0; k < sizeinbits; ++k)
            o << get(k) << " ";
        o << endl;
    }

    void append(const BoolArray & a);

    enum { wordinbits = sizeof(uword) * 8};

private:
    /** Number of words required to hold `bits` bits (rounded up). */
    static size_t wordsNeeded(const size_t bits) {
        return bits / wordinbits + (bits % wordinbits == 0 ? 0 : 1);
    }

    /** Single-bit mask selecting bit `pos` inside its word. */
    static uword bitMask(const size_t pos) {
        return static_cast<uword>(static_cast<uword>(1) << (pos % wordinbits));
    }

    vector<uword> buffer;
    size_t sizeinbits;

};
168
+
169
+ template <class uword>
170
+ void BoolArray<uword>::append(const BoolArray & a) {
171
+ if(sizeinbits % wordinbits == 0) {
172
+ buffer.insert(buffer.end(),a.buffer.begin(),a.buffer.end());
173
+ } else {
174
+ throw invalid_argument("Cannot append if parent does not meet boundary");
175
+ }
176
+ sizeinbits += a.sizeinbits;
177
+ }
178
+
179
+ #endif
@@ -0,0 +1,176 @@
1
+ #include <ruby.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include <errno.h>
6
+
7
+ #include "ewah.h"
8
+
9
+ typedef VALUE (ruby_method)(...);
10
+
11
+ typedef struct ewah {
12
+ EWAHBoolArray<uword64> *bits;
13
+ } EWAH;
14
+
15
+ extern "C" VALUE ewah_new(VALUE klass) {
16
+ EWAH *b = ALLOC(EWAH);
17
+ b->bits = new EWAHBoolArray<uword64>();
18
+ VALUE bitset = Data_Wrap_Struct(klass, 0, free, b);
19
+ rb_obj_call_init(bitset, 0, 0);
20
+ return bitset;
21
+ }
22
+
23
+ extern "C" VALUE ewah_init(VALUE self) {
24
+ return self;
25
+ }
26
+
27
+ /* Core API */
28
+ extern "C" VALUE ewah_set(VALUE self, VALUE position) {
29
+ if (position == Qnil) {
30
+ rb_raise(rb_eRuntimeError, "Position to set not specified");
31
+ }
32
+
33
+ EWAH *bitset;
34
+ Data_Get_Struct(self, EWAH, bitset);
35
+ bitset->bits->set(FIX2INT(position));
36
+
37
+ return self;
38
+ }
39
+
40
+ extern "C" VALUE ewah_each(VALUE self) {
41
+ EWAH *bitset;
42
+ Data_Get_Struct(self, EWAH, bitset);
43
+
44
+ for(EWAHBoolArray<uword64>::const_iterator i = bitset->bits->begin(); i != bitset->bits->end(); ++i)
45
+ rb_yield(INT2FIX(*i));
46
+
47
+ return Qnil;
48
+ }
49
+
50
+ extern "C" VALUE ewah_swap(VALUE self, VALUE other) {
51
+ EWAH *bitset;
52
+ EWAH *obitset;
53
+ Data_Get_Struct(self, EWAH, bitset);
54
+ Data_Get_Struct(other, EWAH, obitset);
55
+
56
+ bitset->bits->swap(*(obitset->bits));
57
+
58
+ return self;
59
+ }
60
+
61
+ extern "C" VALUE ewah_reset(VALUE self) {
62
+ EWAH *bitset;
63
+ Data_Get_Struct(self, EWAH, bitset);
64
+ bitset->bits->reset();
65
+ return self;
66
+ }
67
+
68
+ /* Set Operations */
69
+ extern "C" VALUE ewah_logical_or(VALUE self, VALUE other) {
70
+ EWAH *bitset;
71
+ EWAH *obitset;
72
+ Data_Get_Struct(self, EWAH, bitset);
73
+ Data_Get_Struct(other, EWAH, obitset);
74
+
75
+ VALUE newBitset = ewah_new(rb_path2class("EwahBitset"));
76
+
77
+ EWAH *newBits;
78
+ Data_Get_Struct(newBitset, EWAH, newBits);
79
+ bitset->bits->logicalor(*(obitset->bits), *(newBits->bits));
80
+
81
+ return newBitset;
82
+ }
83
+
84
+ extern "C" VALUE ewah_logical_and(VALUE self, VALUE other) {
85
+ EWAH *bitset;
86
+ EWAH *obitset;
87
+ Data_Get_Struct(self, EWAH, bitset);
88
+ Data_Get_Struct(other, EWAH, obitset);
89
+
90
+ VALUE newBitset = ewah_new(rb_path2class("EwahBitset"));
91
+
92
+ EWAH *newBits;
93
+ Data_Get_Struct(newBitset, EWAH, newBits);
94
+ bitset->bits->logicaland(*(obitset->bits), *(newBits->bits));
95
+
96
+ return newBitset;
97
+ }
98
+
99
+ extern "C" VALUE ewah_equals(VALUE self, VALUE other) {
100
+ EWAH *bitset;
101
+ EWAH *obitset;
102
+ Data_Get_Struct(self, EWAH, bitset);
103
+ Data_Get_Struct(other, EWAH, obitset);
104
+
105
+ if(*(bitset->bits) == *(obitset->bits)) {
106
+ return Qtrue;
107
+ } else {
108
+ return Qfalse;
109
+ }
110
+ }
111
+
112
+ /* Information & Serialization */
113
+ extern "C" VALUE ewah_size_in_bits(VALUE self) {
114
+ EWAH *bitset;
115
+ Data_Get_Struct(self, EWAH, bitset);
116
+ return INT2FIX(bitset->bits->sizeInBits());
117
+ }
118
+
119
+ extern "C" VALUE ewah_size_in_bytes(VALUE self) {
120
+ EWAH *bitset;
121
+ Data_Get_Struct(self, EWAH, bitset);
122
+ return INT2FIX(bitset->bits->sizeInBytes());
123
+ }
124
+
125
+ extern "C" VALUE ewah_to_binary_s(VALUE self) {
126
+ EWAH *bitset;
127
+ Data_Get_Struct(self, EWAH, bitset);
128
+
129
+ stringstream ss;
130
+ bitset->bits->printout(ss);
131
+
132
+ return rb_str_new(ss.str().c_str(), ss.str().size());
133
+ }
134
+
135
+ extern "C" VALUE ewah_serialize(VALUE self) {
136
+ EWAH *bitset;
137
+ Data_Get_Struct(self, EWAH, bitset);
138
+
139
+ stringstream ss;
140
+ bitset->bits->write(ss);
141
+
142
+ return rb_str_new(ss.str().c_str(), ss.str().size());
143
+ }
144
+
145
+ extern "C" VALUE ewah_deserialize(VALUE self, VALUE bytes) {
146
+ EWAH *bitset;
147
+ Data_Get_Struct(self, EWAH, bitset);
148
+
149
+ stringstream ss;
150
+ ss.write(RSTRING_PTR(bytes), RSTRING_LEN(bytes));
151
+ bitset->bits->read(ss, true);
152
+
153
+ return self;
154
+ }
155
+
156
+ static VALUE rb_cC;
157
+ extern "C" void Init_ewahbitset() {
158
+ rb_cC = rb_define_class("EwahBitset", rb_cObject);
159
+ rb_define_singleton_method(rb_cC, "new", (ruby_method*) &ewah_new, 0);
160
+ rb_define_method(rb_cC, "initialize", (ruby_method*) &ewah_init, 0);
161
+
162
+ rb_define_method(rb_cC, "set", (ruby_method*) &ewah_set, 1);
163
+ rb_define_method(rb_cC, "each", (ruby_method*) &ewah_each, 0);
164
+ rb_define_method(rb_cC, "swap", (ruby_method*) &ewah_swap, 1);
165
+ rb_define_method(rb_cC, "reset", (ruby_method*) &ewah_reset, 0);
166
+
167
+ rb_define_method(rb_cC, "==", (ruby_method*) &ewah_equals, 1);
168
+ rb_define_method(rb_cC, "logical_or", (ruby_method*) &ewah_logical_or, 1);
169
+ rb_define_method(rb_cC, "logical_and", (ruby_method*) &ewah_logical_and, 1);
170
+
171
+ rb_define_method(rb_cC, "to_binary_s", (ruby_method*) &ewah_to_binary_s, 0);
172
+ rb_define_method(rb_cC, "serialize", (ruby_method*) &ewah_serialize, 0);
173
+ rb_define_method(rb_cC, "deserialize", (ruby_method*) &ewah_deserialize, 1);
174
+ rb_define_method(rb_cC, "size_in_bits", (ruby_method*) ewah_size_in_bits, 0);
175
+ rb_define_method(rb_cC, "size_in_bytes", (ruby_method*) ewah_size_in_bytes, 0);
176
+ }