hyperloglog 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1 @@
1
+ v0.0.1. Initial Version.
data/Manifest ADDED
@@ -0,0 +1,14 @@
1
+ CHANGELOG
2
+ Manifest
3
+ README.md
4
+ Rakefile
5
+ ext/boolarray.h
6
+ ext/ewah.h
7
+ ext/extconf.rb
8
+ ext/hyperloglog.cpp
9
+ ext/murmur3.h
10
+ spec/data/integers.txt
11
+ spec/data/small_integers.txt
12
+ spec/data/small_integers2.txt
13
+ spec/hyperloglog_spec.rb
14
+ spec/spec.opts
data/README.md ADDED
@@ -0,0 +1,21 @@
1
+ # HyperLogLog for Ruby
2
+
3
+ # External Libraries Included
4
+
5
+ Murmur3
6
+ https://github.com/PeterScott/murmur3
7
+
8
+ EWAHBoolArray
9
+ https://github.com/lemire/EWAHBoolArray
10
+
11
+ # Example
12
+
13
+ # Build a new estimator
14
+ builder = HyperBuilder.new
15
+ 0.upto(100).each{|user_id| builder.offer(user_id)}
16
+
17
+ # Read an estimator from bytes on disk
18
+ estimator = HyperEstimator.new(File.read('bytes.txt'))
19
+
20
+ # Estimate the union of our two sources
21
+ estimate = HyperEstimator.estimate(builder.estimator, estimator)
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
# Build script: gem packaging through Echoe plus an RSpec task.
require 'echoe'
require 'rake'
require 'rspec/core/rake_task'

# Invoking rake with no task name runs the :spec task.
task :default => :spec

# Gem metadata handed to Echoe.
Echoe.new("hyperloglog") do |gem|
  gem.author = "Josh Ferguson"
  gem.email = "josh@besquared.net"
  gem.project = "hyperloglog"
  gem.summary = "An efficient implementation of the HyperLogLog cardinality estimator"
  gem.url = "http://www.github.com/besquared/hyperloglog/"
end

# :spec picks up every *_spec.rb below spec/ and passes spec/spec.opts as the options file.
RSpec::Core::RakeTask.new(:spec) do |runner|
  runner.pattern = 'spec/**/*_spec.rb'
  runner.rspec_opts = ['--options', "\"spec/spec.opts\""]
end
data/ext/boolarray.h ADDED
@@ -0,0 +1,179 @@
1
+ #ifndef BOOLARRAY_H
2
+ #define BOOLARRAY_H
3
+
4
+ #include <cassert>
5
+ #include <iostream>
6
+ #include <vector>
7
+ #include <stdexcept>
8
+ #include <sstream>
9
+ #include <iso646.h> // mostly for Microsoft compilers
10
+
11
typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned short uword16;
typedef unsigned int uword32;
typedef unsigned long long uword64;


using namespace std;

/**
 * A dynamic bitset implementation (without compression).
 * This is not tremendously useful, but it is provided as a reference.
 *
 * Bits are packed into words of type uword, least significant bit first:
 * bit number pos lives in word pos / wordinbits at bit offset pos % wordinbits.
 */
template <class uword=uword32>
class BoolArray {
public:
    /**
     * Build a bitset of n bits. Every backing word is initialised to initval.
     */
    BoolArray(const size_t n, const uword initval= 0):buffer(wordsNeeded(n),initval),sizeinbits(n) { }

    /** Build an empty bitset (no words, zero bits). */
    BoolArray():buffer(),sizeinbits(0) {}

    BoolArray(const BoolArray & ba) : buffer(ba.buffer),sizeinbits(ba.sizeinbits) {}

    /**
     * Load the bitset from a stream written by write(): first the size in
     * bits (as a raw size_t), then the backing words.
     */
    void read(istream & in) {
        sizeinbits = 0;
        in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
        buffer.resize(wordsNeeded(sizeinbits));
        // &buffer[0] is not valid on an empty vector, so skip zero-length reads
        if(!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]),buffer.size()*sizeof(uword));
    }

    /**
     * Load exactly size raw words from the stream (no size header); the size
     * in bits becomes size full words.
     */
    void readBuffer(istream & in,const size_t size) {
        buffer.resize(size);
        if(!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]),buffer.size()*sizeof(uword));
        sizeinbits = size*sizeof(uword)*8;
    }

    /** Overwrite the recorded size in bits; the backing words are untouched. */
    void setSizeInBits(const size_t sizeib) {
        sizeinbits = sizeib;
    }


    /** Write the size in bits followed by the words that hold those bits. */
    void write(ostream & out) {
        write(out,sizeinbits);
    }

    /**
     * Write numberofbits (as a raw size_t) followed by the words needed to
     * hold that many bits. The caller must not ask for more bits than the
     * buffer holds.
     */
    void write(ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        out.write(reinterpret_cast<const char *>(&numberofbits), sizeof(numberofbits));
        if(size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]),size*sizeof(uword));
    }

    /** Same as write(out, numberofbits) but without the size header. */
    void writeBuffer(ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        if(size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]),size*sizeof(uword));
    }

    /** Number of bytes write() produces for the current size in bits. */
    size_t sizeOnDisk() const {
        return sizeof(sizeinbits) + wordsNeeded(sizeinbits)*sizeof(uword);
    }


    BoolArray& operator=(const BoolArray & x) {
        this->buffer = x.buffer;
        this->sizeinbits = x.sizeinbits;
        return *this;
    }

    /**
     * Two bitsets are equal when they have the same size in bits and the same
     * backing words. Bitsets of equal size in bits are expected to have
     * buffers of equal length (checked by assert).
     */
    bool operator==(const BoolArray & x) const {
        if(sizeinbits != x.sizeinbits) return false;
        assert(buffer.size() == x.buffer.size());
        for(size_t k = 0; k < buffer.size(); ++k)
            if(buffer[k] != x.buffer[k]) return false;
        return true;
    }

    bool operator!=(const BoolArray & x) const {
        return ! operator==(x);
    }

    /** Replace the word at index pos (a word index, not a bit index). */
    void setWord(const size_t pos, const uword val) {
        assert(pos < buffer.size());
        buffer[pos] = val;
    }

    /**
     * Append one full word. Throws invalid_argument when the current size in
     * bits is not a multiple of the word size.
     */
    void add(const uword val) {
        if(sizeinbits % wordinbits != 0) throw invalid_argument("you probably didn't want to do this");
        sizeinbits += wordinbits;
        buffer.push_back(val);
    }

    /** Return the word at index pos (a word index, not a bit index). */
    uword getWord(const size_t pos) const {
        assert(pos < buffer.size());
        return buffer[pos];
    }

    /**
     * set to true (whether it was already set to true or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void set(const size_t pos) {
        assert(pos/wordinbits < buffer.size());
        buffer[pos/wordinbits] |= ( static_cast<uword>(1) << (pos % wordinbits) ) ;
    }

    /**
     * set to false (whether it was already set to false or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void unset(const size_t pos) {
        assert(pos/wordinbits < buffer.size());
        // clear only the addressed bit: AND with the inverted single-bit mask
        buffer[pos/wordinbits] &= static_cast<uword>(~( static_cast<uword>(1) << (pos % wordinbits) )) ;
    }

    /**
     * true or false? (set or unset)
     */
    bool get(const size_t pos) const {
        assert(pos/wordinbits < buffer.size());
        return (buffer[pos/wordinbits] & ( static_cast<uword>(1) << (pos % wordinbits) )) != 0;
    }

    /**
     * set all bits to 0; the recorded size in bits also drops to 0 while the
     * backing words are kept.
     */
    void reset() {
        // plain loop: safe on an empty buffer and needs no <cstring>
        for(size_t k = 0; k < buffer.size(); ++k)
            buffer[k] = 0;
        sizeinbits = 0;
    }

    size_t sizeInBits() const {
        return sizeinbits;
    }

    ~BoolArray() {}

    void logicaland(const BoolArray & ba, BoolArray & out);

    void logicalor(const BoolArray & ba, BoolArray & out);



    /** Print every bit (0 or 1) separated by spaces, then a newline. */
    inline void printout(ostream &o = cout) {
        for(size_t k = 0; k < sizeinbits; ++k)
            o << get(k) << " ";
        o << endl;
    }

    void append(const BoolArray & a);

    enum { wordinbits = sizeof(uword) * 8};

private:
    /** Number of words required to store the given number of bits (rounded up). */
    static size_t wordsNeeded(const size_t bits) {
        return bits / wordinbits + (bits % wordinbits == 0 ? 0 : 1);
    }

    vector<uword> buffer;
    size_t sizeinbits;

};

/**
 * Append the words of a after the words of this bitset.
 * Throws invalid_argument when the current size in bits is not a multiple of
 * the word size, because the appended words could not be aligned.
 */
template <class uword>
void BoolArray<uword>::append(const BoolArray & a) {
    if(sizeinbits % wordinbits == 0) {
        buffer.insert(buffer.end(),a.buffer.begin(),a.buffer.end());
    } else {
        throw invalid_argument("Cannot append if parent does not meet boundary");
    }
    sizeinbits += a.sizeinbits;
}
178
+
179
+ #endif
data/ext/ewah.h ADDED
@@ -0,0 +1,1763 @@
1
+ #ifndef EWAH_H
2
+ #define EWAH_H
3
+
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+ #include <cassert>
7
+ #include <iostream>
8
+ #include <vector>
9
+ #include <stdexcept>
10
+ #include <cstddef>
11
+ #include <iso646.h> // mostly for Microsoft compilers
12
+
13
+ #include "boolarray.h"
14
+
15
// Assertion with a custom message (idea taken from stackoverflow).
// When NDEBUG is defined the macro expands to an empty statement and neither
// argument is evaluated. Otherwise a failed condition prints the condition
// text, file, line and message to std::cerr and terminates the process with
// EXIT_FAILURE.
#ifdef NDEBUG
#   define ASSERT(condition, message) do { } while (false)
#else
#   define ASSERT(condition, message) \
    do { \
        if (! (condition)) { \
            std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \
                      << " line " << __LINE__ << ": " << message << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while (false)
#endif
28
+
29
+
30
+ using namespace std;
31
+
32
+
33
+ /**
34
+ * count the number of bits set to one (32 bit version)
35
+ */
36
+ uint countOnes(uword32 v) {
37
+ v = v - ((v >> 1) & 0x55555555);
38
+ v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
39
+ return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
40
+ }
41
+ /**
42
+ * count the number of bits set to one (64 bit version)
43
+ */
44
+ uint countOnes(uword64 v) {
45
+ return countOnes(static_cast<uword32>(v))+ countOnes(static_cast<uword32>(v>>32));
46
+ }
47
+
48
+
49
+ uint countOnes(uword16 v) {
50
+ uint c;
51
+ for ( c = 0; v; c++) {
52
+ v &= v - 1;
53
+ }
54
+ return c;
55
+ }
56
+
57
/**
 * Returns the binary representation of a word as a string of '0' and '1'
 * characters, one per bit, starting with the least significant bit.
 */
template <class uword>
inline std::string toBinaryString(const uword w) {
    const unsigned int totalbits = sizeof(uword)*8;
    std::stringstream digits;
    unsigned int bit = 0;
    while(bit < totalbits) {
        const bool isset = (w & (static_cast<uword>(1)<<bit)) != 0;
        if(isset)
            digits << "1";
        else
            digits << "0";
        ++bit;
    }
    std::string rendered;
    digits >> rendered;
    return rendered;
}
70
+
71
+
72
+ /**
73
+ * For expert users.
74
+ * This class is used to represent a special type of word storing
75
+ * a run length. It is defined by the Enhanced Word Aligned Hybrid (EWAH)
76
+ * format. You don't normally need to access this class.
77
+ */
78
+ template <class uword=uword32>
79
+ class RunningLengthWord {
80
+ public:
81
+ RunningLengthWord (uword & data) : mydata(data) { }
82
+
83
+ RunningLengthWord(const RunningLengthWord & rlw) : mydata(rlw.mydata) {}
84
+
85
+ RunningLengthWord& operator=(const RunningLengthWord & rlw) {
86
+ mydata = rlw.mydata;
87
+ return *this;
88
+ }
89
+
90
+
91
+ /**
92
+ * Which bit is being repeated?
93
+ */
94
+ bool getRunningBit() const {
95
+ return mydata & static_cast<uword>(1);
96
+ }
97
+
98
+
99
+ /**
100
+ * how many words should be filled by the running bit
101
+ */
102
+ static inline bool getRunningBit(uword data) {
103
+ return data & static_cast<uword>(1) ;
104
+ }
105
+
106
+ /**
107
+ * how many words should be filled by the running bit
108
+ */
109
+ uword getRunningLength() const {
110
+ return (mydata >> 1) & largestrunninglengthcount ;
111
+ }
112
+
113
+
114
+ /**
115
+ * followed by how many literal words?
116
+ */
117
+ static inline uword getRunningLength(uword data) {
118
+ return (data >> 1) & largestrunninglengthcount ;
119
+ }
120
+
121
+ /**
122
+ * followed by how many literal words?
123
+ */
124
+ uword getNumberOfLiteralWords() const {
125
+ return static_cast<uword>(mydata >> (1+runninglengthbits));
126
+ }
127
+
128
+ /**
129
+ * Total of getRunningLength() and getNumberOfLiteralWords()
130
+ */
131
+ uword size() const {
132
+ return getRunningLength() + getNumberOfLiteralWords();
133
+ }
134
+
135
+
136
+ /**
137
+ * followed by how many literal words?
138
+ */
139
+ static inline uword getNumberOfLiteralWords(uword data) {
140
+ return data >> (1+runninglengthbits);
141
+ }
142
+
143
+
144
+ /**
145
+ * running length of which type of bits
146
+ */
147
+ void setRunningBit(bool b) {
148
+ if(b) mydata |= static_cast<uword>(1);
149
+ else mydata &= static_cast<uword>(~1);
150
+ }
151
+
152
+
153
+
154
+ /**
155
+ * running length of which type of bits
156
+ */
157
+ static inline void setRunningBit(uword & data, bool b) {
158
+ if(b) data |= static_cast<uword>(1);
159
+ else data &= static_cast<uword>(~1);
160
+ }
161
+
162
+
163
+ /**
164
+ * running length of which type of bits
165
+ */
166
+ void discardFirstWords(uword x) {
167
+ assert(x<= size());
168
+ const uword rl ( getRunningLength() );
169
+ if(rl >= x) {
170
+ setRunningLength(rl - x);
171
+ return;
172
+ }
173
+ x -= rl;
174
+ setRunningLength(0);
175
+ setNumberOfLiteralWords(getNumberOfLiteralWords() - x);
176
+ }
177
+
178
+ void setRunningLength(uword l) {
179
+ mydata |= shiftedlargestrunninglengthcount;
180
+ mydata &= static_cast<uword>(l << 1) | notshiftedlargestrunninglengthcount;
181
+ }
182
+
183
+ // static call for people who hate objects
184
+ static inline void setRunningLength(uword & data, uword l) {
185
+ data |= shiftedlargestrunninglengthcount;
186
+ data &= static_cast<uword>(l<<1) | notshiftedlargestrunninglengthcount;
187
+ }
188
+
189
+ void setNumberOfLiteralWords(uword l) {
190
+ mydata |= notrunninglengthplusrunningbit;
191
+ mydata &= static_cast<uword>(l << (runninglengthbits +1) ) |runninglengthplusrunningbit;
192
+ }
193
+ // static call for people who hate objects
194
+ static inline void setNumberOfLiteralWords(uword & data, uword l) {
195
+ data |= notrunninglengthplusrunningbit;
196
+ data &= static_cast<uword>(l << (runninglengthbits +1) ) |runninglengthplusrunningbit;
197
+ }
198
+ static const uint runninglengthbits = sizeof(uword)*4;//16;
199
+ static const uint literalbits = sizeof(uword)*8 - 1 - runninglengthbits;
200
+ static const uword largestliteralcount = (static_cast<uword>(1)<<literalbits) - 1;
201
+ static const uword largestrunninglengthcount = (static_cast<uword>(1)<<runninglengthbits)-1;
202
+ static const uword shiftedlargestrunninglengthcount = largestrunninglengthcount<<1;
203
+ static const uword notshiftedlargestrunninglengthcount = static_cast<uword>(~shiftedlargestrunninglengthcount);
204
+ static const uword runninglengthplusrunningbit = (static_cast<uword>(1)<<(runninglengthbits+1)) - 1;
205
+ static const uword notrunninglengthplusrunningbit =static_cast<uword>(~runninglengthplusrunningbit);
206
+ static const uword notlargestrunninglengthcount =static_cast<uword>(~largestrunninglengthcount);
207
+
208
+ uword & mydata;
209
+ private:
210
+ };
211
+
212
+
213
+ /**
214
+ * Same as RunningLengthWord, except that the values cannot be modified.
215
+ */
216
+ template <class uword=uword32>
217
+ class ConstRunningLengthWord {
218
+ public:
219
+
220
+ ConstRunningLengthWord () : mydata(0) {
221
+ }
222
+
223
+ ConstRunningLengthWord (const uword data) : mydata(data) {
224
+ }
225
+
226
+ ConstRunningLengthWord(const ConstRunningLengthWord & rlw) : mydata(rlw.mydata) {}
227
+
228
+ /**
229
+ * Which bit is being repeated?
230
+ */
231
+ bool getRunningBit() const {
232
+ return mydata & static_cast<uword>(1);
233
+ }
234
+
235
+ /**
236
+ * how many words should be filled by the running bit
237
+ */
238
+ uword getRunningLength() const {
239
+ return (mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount ;
240
+ }
241
+
242
+ /**
243
+ * followed by how many literal words?
244
+ */
245
+ uword getNumberOfLiteralWords() const {
246
+ return static_cast<uword>(mydata >> (1+RunningLengthWord<uword>::runninglengthbits));
247
+ }
248
+
249
+ /**
250
+ * Total of getRunningLength() and getNumberOfLiteralWords()
251
+ */
252
+ uword size() const {
253
+ return getRunningLength() + getNumberOfLiteralWords();
254
+ }
255
+
256
+ uword mydata;
257
+ };
258
+
259
+
260
+
261
+ /**
262
+ * Same as RunningLengthWord, except that the values are buffered for quick
263
+ * access.
264
+ */
265
+ template <class uword=uword32>
266
+ class BufferedRunningLengthWord {
267
+ public:
268
+ BufferedRunningLengthWord (const uword & data) : RunningBit(data & static_cast<uword>(1)),
269
+ RunningLength((data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
270
+ NumberOfLiteralWords(static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits))) {
271
+ }
272
+ BufferedRunningLengthWord (const RunningLengthWord<uword> & p) : RunningBit(p.mydata & static_cast<uword>(1)),
273
+ RunningLength((p.mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
274
+ NumberOfLiteralWords(p.mydata >> (1+RunningLengthWord<uword>::runninglengthbits)) {
275
+ }
276
+
277
+ void read(const uword & data) {
278
+ RunningBit = data & static_cast<uword>(1);
279
+ RunningLength = (data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount;
280
+ NumberOfLiteralWords = static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits));
281
+ }
282
+
283
+ /**
284
+ * Which bit is being repeated?
285
+ */
286
+ bool getRunningBit() const {
287
+ return RunningBit;
288
+ }
289
+
290
+ void discardFirstWords(uword x) {
291
+ assert(x<= size());
292
+ if(RunningLength >= x) {
293
+ RunningLength = static_cast<uword>(RunningLength - x);
294
+ return;
295
+ }
296
+ x = static_cast<uword>( x - RunningLength);
297
+ RunningLength = 0;
298
+ NumberOfLiteralWords = static_cast<uword>(NumberOfLiteralWords - x);
299
+ }
300
+
301
+ /**
302
+ * how many words should be filled by the running bit (see previous method)
303
+ */
304
+ uword getRunningLength() const {
305
+ return RunningLength ;
306
+ }
307
+
308
+ /**
309
+ * followed by how many literal words?
310
+ */
311
+ uword getNumberOfLiteralWords() const {
312
+ return NumberOfLiteralWords;
313
+ }
314
+
315
+
316
+ /**
317
+ * Total of getRunningLength() and getNumberOfLiteralWords()
318
+ */
319
+ uword size() const {
320
+ return static_cast<uword>(RunningLength + NumberOfLiteralWords);
321
+ }
322
+ bool RunningBit;
323
+ uword RunningLength;
324
+ uword NumberOfLiteralWords;
325
+
326
+ };
327
+
328
+ template <class uword>
329
+ class EWAHBoolArray;
330
+
331
+
332
+ template <class uword>
333
+ class EWAHBoolArraySparseIterator;
334
+
335
+
336
+ /**
337
+ * Iterate over words of bits from a compressed bitmap.
338
+ */
339
+ template <class uword=uword32>
340
+ class EWAHBoolArrayIterator {
341
+ public:
342
+ /**
343
+ * is there a new word?
344
+ */
345
+ bool hasNext() const {
346
+ return pointer < myparent.size();
347
+ }
348
+
349
+ /**
350
+ * return next word.
351
+ */
352
+ uword next() {
353
+ uword returnvalue;
354
+ if(compressedwords < rl) {
355
+ ++compressedwords;
356
+ if(b)
357
+ returnvalue = notzero;
358
+ else
359
+ returnvalue = zero;
360
+ } else {
361
+ assert (literalwords < lw) ;
362
+ ++literalwords;
363
+ ++pointer;
364
+ assert(pointer <myparent.size());
365
+ returnvalue = myparent[pointer];
366
+ }
367
+ if((compressedwords == rl) && (literalwords == lw)) {
368
+ ++pointer;
369
+ if(pointer < myparent.size()) readNewRunningLengthWord();
370
+ }
371
+ return returnvalue;
372
+ }
373
+
374
+ EWAHBoolArrayIterator(const EWAHBoolArrayIterator<uword> & other):pointer(other.pointer),
375
+ myparent(other.myparent),
376
+ compressedwords(other.compressedwords),
377
+ literalwords(other.literalwords),
378
+ rl(other.rl),
379
+ lw(other.lw),
380
+ b(other.b) {}
381
+
382
+ static const uword zero = 0;
383
+ static const uword notzero=static_cast<uword>(~zero);
384
+ private:
385
+ EWAHBoolArrayIterator(const vector<uword> & parent) ;
386
+ void readNewRunningLengthWord() ;
387
+ friend class EWAHBoolArray<uword>;
388
+ friend class EWAHBoolArraySparseIterator<uword>;
389
+ size_t pointer;
390
+ const vector<uword> & myparent;
391
+ uword compressedwords;
392
+ uword literalwords;
393
+ uword rl, lw;
394
+ bool b;
395
+ };
396
+
397
+ template <class uword>
398
+ class EWAHBoolArraySparseIterator;
399
+
400
+
401
+
402
+
403
+ template <class uword>
404
+ class EWAHBoolArraySetBitForwardIterator;
405
+
406
+
407
+ class BitmapStatistics;
408
+
409
+ template <class uword>
410
+ class EWAHBoolArrayRawIterator;
411
+
412
+ /**
413
+ * This class is a compressed bitmap.
414
+ * This is where compression
415
+ * happens.
416
+ * The underlying data structure is an STL vector.
417
+ */
418
+ template <class uword=uword32>
419
+ class EWAHBoolArray {
420
+ public:
421
+ EWAHBoolArray(): buffer(1,0), sizeinbits(0), lastRLW(0) {
422
+ }
423
+
424
+ /**
425
+ * set the ith bit to true (starting at zero).
426
+ * Auto-expands the bitmap. It has constant running time complexity.
427
+ * Note that you must set the bits in increasing order:
428
+ * set(1), set(2) is ok; set(2), set(1) is not ok.
429
+ */
430
+ void set(size_t i);
431
+
432
+ /**
433
+ * Make sure the two bitmaps have the same size (padding with zeroes
434
+ * if necessary). It has constant running time complexity.
435
+ */
436
+ void makeSameSize(EWAHBoolArray & a) {
437
+ if(a.sizeinbits<sizeinbits)
438
+ a.padWithZeroes(sizeinbits);
439
+ else if(sizeinbits<a.sizeinbits)
440
+ padWithZeroes(a.sizeinbits);
441
+ }
442
+
443
+ enum {RESERVEMEMORY=true}; // for speed
444
+
445
+ typedef EWAHBoolArraySetBitForwardIterator<uword> const_iterator;
446
+
447
+
448
+ /**
449
+ * Returns an iterator that can be used to access the position of the
450
+ * set bits. The running time complexity of a full scan is proportional to the number
451
+ * of set bits: be aware that if you have long strings of 1s, this can be
452
+ * very inefficient.
453
+ */
454
+ const_iterator begin() const {
455
+ return EWAHBoolArraySetBitForwardIterator<uword>(buffer);
456
+ }
457
+
458
+
459
+ /**
460
+ * Basically a bogus iterator that can be used together with begin()
461
+ * for constructions such as for(EWAHBoolArray<uword>::iterator i = b.begin(); i!=b.end(); ++i) {}
462
+ */
463
+ const_iterator end() const {
464
+ return EWAHBoolArraySetBitForwardIterator<uword>(buffer,buffer.size());
465
+ }
466
+
467
+ /**
468
+ * computes the logical and with another compressed bitmap
469
+ * answer goes into container, though rawlogicaland is the
470
+ * default, sometimes this version is faster.
471
+ */
472
+ void sparselogicaland( EWAHBoolArray &a, EWAHBoolArray &out) ;
473
+
474
+ /**
475
+ * computes the logical and with another compressed bitmap
476
+ * answer goes into container
477
+ * Running time complexity is proportional to the sum of the compressed
478
+ * bitmap sizes.
479
+ */
480
+ void rawlogicaland( EWAHBoolArray &a, EWAHBoolArray &container) ;
481
+
482
+ /**
483
+ * computes the logical and with another compressed bitmap
484
+ * answer goes into container
485
+ * Running time complexity is proportional to the sum of the compressed
486
+ * bitmap sizes.
487
+ */
488
+ void rawlogicalor( EWAHBoolArray &a, EWAHBoolArray &container) ;
489
+
490
+
491
+ /**
492
+ * computes the logical and with another compressed bitmap
493
+ * answer goes into container
494
+ * Running time complexity is proportional to the sum of the compressed
495
+ * bitmap sizes.
496
+ * (alias for rawlogicaland)
497
+ */
498
+ void logicaland( EWAHBoolArray &a, EWAHBoolArray &container) {
499
+ rawlogicaland(a,container);
500
+ }
501
+
502
+ /**
503
+ * compute the logical and with another compressed bitmap
504
+ * answer goes into container.
505
+ * Running time complexity is proportional to the sum of the compressed
506
+ * bitmap sizes.
507
+ * (alias for rawlogicalor)
508
+ */
509
+ void logicalor( EWAHBoolArray &a, EWAHBoolArray &container) {
510
+ rawlogicalor(a,container);
511
+ }
512
+
513
+ /**
514
+ * clear the content of the bitmap. It does not
515
+ * release the memory.
516
+ */
517
+ void reset() {
518
+ buffer.clear();
519
+ buffer.push_back(0);
520
+ sizeinbits = 0;
521
+ lastRLW = 0;
522
+ }
523
+
524
+ /**
525
+ * convenience method.
526
+ *
527
+ * returns the number of words added (storage cost increase)
528
+ */
529
+ inline size_t add(const uword newdata, const uint bitsthatmatter = 8*sizeof(uword));
530
+
531
+ inline void printout(ostream &o = cout) {
532
+ toBoolArray().printout(o);
533
+ }
534
+
535
+ /**
536
+ * Prints a verbose description of the content of the compressed bitmap.
537
+ */
538
+ void debugprintout() const;
539
+
540
+ /**
541
+ * Return the size in bits of this bitmap (this refers
542
+ * to the uncompressed size in bits).
543
+ */
544
+ inline size_t sizeInBits() const {
545
+ return sizeinbits;
546
+ }
547
+
548
+ /**
549
+ * set size in bits. This does not affect the compressed size. It
550
+ * runs in constant time.
551
+ */
552
+ inline void setSizeInBits(const size_t size) {
553
+ sizeinbits = size;
554
+ }
555
+
556
+ /**
557
+ * Return the size of the buffer in bytes. This
558
+ * is equivalent to the storage cost, minus some overhead.
559
+ */
560
+ inline size_t sizeInBytes() const {
561
+ return buffer.size()*sizeof(uword);
562
+ }
563
+
564
+
565
+
566
+ /**
567
+ * same as addEmptyWord, but you can do several in one shot!
568
+ * returns the number of words added (storage cost increase)
569
+ */
570
+ size_t addStreamOfEmptyWords(const bool v, const size_t number);
571
+
572
+ /**
573
+ * add a stream of dirty words,, returns the number of words added
574
+ * (storage cost increase)
575
+ */
576
+ size_t addStreamOfDirtyWords(const uword * v, const size_t number);
577
+
578
+ /**
579
+ * make sure the size of the array is totalbits bits by padding with zeroes.
580
+ * returns the number of words added (storage cost increase)
581
+ */
582
+ inline size_t padWithZeroes(const size_t totalbits);
583
+
584
+ /**
585
+ * Compute the size on disk assuming that it was saved using
586
+ * the method "save".
587
+ */
588
+ size_t sizeOnDisk() const;
589
+
590
+
591
+ /**
592
+ * Save this bitmap to a stream. The file format is
593
+ * | sizeinbits | buffer lenth | buffer content|
594
+ * the sizeinbits part can be omitted if "savesizeinbits=false".
595
+ * Both sizeinbits and buffer length are saved using the size_t data
596
+ * type which is typically a 32-bit unsigned integer for 32-bit CPUs
597
+ * and a 64-bit unsigned integer for 64-bit CPUs.
598
+ * Note that this format is machine-specific. Note also
599
+ * that the word size is not saved. For robust persistent
600
+ * storage, you need to save this extra information elsewhere.
601
+ */
602
+ inline void write(ostream & out, const bool savesizeinbits=true) const;
603
+
604
+ /**
605
+ * This only writes the content of the buffer (see write()) method.
606
+ * It is for advanced users.
607
+ */
608
+ inline void writeBuffer(ostream & out) const;
609
+
610
+ /**
611
+ * size (in words) of the underlying STL vector.
612
+ */
613
+ inline size_t bufferSize() const {
614
+ return buffer.size();
615
+ }
616
+
617
+ /**
618
+ * this is the counterpart to the write method.
619
+ * if you set savesizeinbits=false, then you are responsible
620
+ * for setting the value fo the attribute sizeinbits (see method setSizeInBits).
621
+ */
622
+ inline void read(istream & in, const bool savesizeinbits=true);
623
+
624
+
625
+ /**
626
+ * read the buffer from a stream, see method writeBuffer.
627
+ * this is for advanced users.
628
+ */
629
+ inline void readBuffer(istream & in, const size_t buffersize);
630
+
631
+ bool operator==(const EWAHBoolArray & x) const;
632
+
633
+ bool operator!=(const EWAHBoolArray & x) const;
634
+
635
+ bool operator==(const BoolArray<uword> & x) const;
636
+
637
+ bool operator!=(const BoolArray<uword> & x) const;
638
+
639
+ /**
640
+ * Iterate over the uncompressed words.
641
+ * Can be considerably faster than begin()/end().
642
+ * Running time complexity of a full scan is proportional to the
643
+ * uncompressed size of the bitmap.
644
+ */
645
+ EWAHBoolArrayIterator<uword> uncompress() const ;
646
+
647
+ /**
648
+ * To iterate over non-zero uncompressed words.
649
+ * Can be considerably faster than begin()/end().
650
+ * Running time complexity of a fun scan is proportional to the number of
651
+ * non-zero uncompressed words.
652
+ */
653
+ EWAHBoolArraySparseIterator<uword> sparse_uncompress() const ;
654
+
655
+ /**
656
+ * To iterate over the compressed data.
657
+ * Can be faster than any other iterator.
658
+ * Running time complexity of a full scan is proportional to the
659
+ * compressed size of the bitmap.
660
+ */
661
+ EWAHBoolArrayRawIterator<uword> raw_iterator() const ;
662
+
663
+ /**
664
+ * Appends the content of some other compressed bitmap
665
+ * at the end of the current bitmap.
666
+ */
667
+ void append(const EWAHBoolArray & x);
668
+
669
+ /**
670
+ * For research purposes. This computes the number of
671
+ * dirty words and the number of compressed words.
672
+ */
673
+ BitmapStatistics computeStatistics() const;
674
+
675
+ BoolArray<uword> toBoolArray() const;
676
+
677
+ /**
678
+ * Convert to a list of positions of "set" bits.
679
+ * The recommender container is vector<size_t>.
680
+ */
681
+ template <class container>
682
+ void appendRowIDs(container & out, const size_t offset = 0) const;
683
+
684
+
685
+ /**
686
+ * Convert to a list of positions of "set" bits.
687
+ * The recommender container is vector<size_t>.
688
+ * (alias for appendRowIDs).
689
+ */
690
+ template <class container>
691
+ void appendSetBits(container & out, const size_t offset = 0) const {
692
+ return appendRowIDs(out,offset);
693
+ }
694
+
695
+ /**
696
+ * Returns the number of bits set to the value 1.
697
+ * The running time complexity is proportional to the
698
+ * compressed size of the bitmap.
699
+ */
700
+ size_t numberOfOnes();
701
+
702
+ /**
703
+ * Swap the content of this bitmap with another bitmap.
704
+ * No copying is done. (Running time complexity is constant.)
705
+ */
706
+ void swap(EWAHBoolArray & x);
707
+
708
+ const vector<uword> & getBuffer() const {
709
+ return buffer;
710
+ };
711
+ enum { wordinbits = sizeof(uword) * 8};
712
+
713
+
714
+ /**
715
+ *Please don't copy your bitmaps! The running time
716
+ * complexity of a copy is the size of the compressed bitmap.
717
+ **/
718
+ EWAHBoolArray(const EWAHBoolArray& other) :
719
+ buffer(other.buffer),
720
+ sizeinbits(other.sizeinbits),
721
+ lastRLW(other.lastRLW) {
722
+ ASSERT(buffer.size()<=1,"You are trying to copy the bitmap, a terrible idea in general, for performance reasons.");// performance assert!
723
+ }
724
+
725
+ /**
726
+ * Copies the content of one bitmap onto another. Running time complexity
727
+ * is proportional to the size of the compressed bitmap.
728
+ * please, never hard-copy this object. Use the swap method if you must.
729
+ */
730
+ EWAHBoolArray & operator=(const EWAHBoolArray & x) {
731
+ buffer = x.buffer;
732
+ sizeinbits = x.sizeinbits;
733
+ lastRLW = x.lastRLW;
734
+ return *this;
735
+ }
736
+
737
+ /**
738
+ * This is equivalent to the operator =. It is used
739
+ * to keep in mind that assignment can be expensive.
740
+ *
741
+ *if you don't care to copy the bitmap (performance-wise), use this!
742
+ */
743
+ void expensive_copy(const EWAHBoolArray & x) {
744
+ buffer = x.buffer;
745
+ sizeinbits = x.sizeinbits;
746
+ lastRLW = x.lastRLW;
747
+ }
748
+
749
+ /**
750
+ * Write the logical not of this bitmap in the provided container.
751
+ */
752
+ void logicalnot(EWAHBoolArray & x) const;
753
+
754
+ /**
755
+ * Apply the logical not operation on this bitmap.
756
+ * Running time complexity is proportional to the compressed size of the bitmap.
757
+ */
758
+ void inplace_logicalnot();
759
+
760
+
761
+ private:
762
+
763
+
764
+
765
+ // private because does not increment the size in bits
766
+ // returns the number of words added (storage cost increase)
767
+ inline size_t addLiteralWord(const uword newdata) ;
768
+
769
+ // private because does not increment the size in bits
770
+ // returns the number of words added (storage cost increase)
771
+ size_t addEmptyWord(const bool v);
772
+ // this second version "might" be faster if you hate OOP.
773
+ // in my tests, it turned out to be slower!
774
+ // private because does not increment the size in bits
775
+ //inline void addEmptyWordStaticCalls(bool v);
776
+
777
+ vector<uword> buffer;
778
+ size_t sizeinbits;
779
+ size_t lastRLW;
780
+ };
781
+
782
+
783
+
784
+ /**
785
+ * Iterator over the words of the compressed bitmap.
786
+ */
787
+ template <class uword=uword32>
788
+ class EWAHBoolArraySparseIterator {
789
+ public:
790
+ /**
791
+ * is there more words?
792
+ */
793
+ bool hasNext() const {
794
+ return i.hasNext();
795
+ }
796
+
797
+ size_t position() const {
798
+ return mPosition;
799
+ }
800
+ /**
801
+ * return next word. If the word is either 0x00 or 0x11
802
+ * the you need to call position() to know how many times it
803
+ * was repeated
804
+ */
805
+ uword next() {
806
+ uword returnvalue;
807
+ if(i.compressedwords < i.rl) {
808
+ if(i.b) {
809
+ ++mPosition;
810
+ ++i.compressedwords;
811
+ returnvalue = EWAHBoolArrayIterator<uword>::notzero;
812
+ } else {
813
+ mPosition = static_cast<size_t>(mPosition + i.rl);
814
+ i.compressedwords = i.rl;
815
+ returnvalue = EWAHBoolArrayIterator<uword>::zero;//next();
816
+ }
817
+ } else {
818
+ assert (i.literalwords < i.lw);
819
+ ++i.literalwords;
820
+ ++i.pointer;
821
+ ++mPosition;
822
+ assert(i.pointer <i.myparent.size());
823
+ returnvalue = i.myparent[i.pointer];
824
+ }
825
+ if((i.compressedwords == i.rl) && (i.literalwords == i.lw)) {
826
+ ++i.pointer;
827
+ if(i.pointer < i.myparent.size()) i.readNewRunningLengthWord();
828
+ }
829
+ return returnvalue;
830
+ }
831
+
832
+ EWAHBoolArraySparseIterator(const EWAHBoolArraySparseIterator<uword> & other):i(other.i),mPosition(other.mPosition) {}
833
+
834
+ private:
835
+ EWAHBoolArraySparseIterator(const vector<uword> & parent) : i(parent), mPosition(0) {}
836
+ EWAHBoolArrayIterator<uword> i;
837
+ size_t mPosition;
838
+ friend class EWAHBoolArray<uword>;
839
+ };
840
+
841
+
842
+ /**
843
+ * Used to go through the set bits. Not optimally fast, but convenient.
844
+ */
845
+ template <class uword>
846
+ class EWAHBoolArraySetBitForwardIterator {
847
+ public:
848
+ enum { wordinbits = sizeof(uword) * 8};
849
+ typedef forward_iterator_tag iterator_category;
850
+ typedef size_t * pointer;
851
+ typedef size_t & reference_type;
852
+ typedef size_t value_type;
853
+ typedef ptrdiff_t difference_type;
854
+ typedef EWAHBoolArraySetBitForwardIterator<uword> type_of_iterator;
855
+
856
+ /**
857
+ * Provides the location of the set bit.
858
+ */
859
+ size_t operator*() const {
860
+ return currentrunoffset+offsetofpreviousrun;
861
+ }
862
+
863
+ // this can be expensive
864
+ difference_type operator-(const type_of_iterator& o) {
865
+ type_of_iterator& smaller = *this<o ? *this : o;
866
+ type_of_iterator& bigger = *this>=o ? *this : o;
867
+ if(smaller.mpointer==smaller.buffer.size())
868
+ return 0;
869
+ difference_type absdiff = static_cast<difference_type>(0);
870
+ EWAHBoolArraySetBitForwardIterator<uword> buf(smaller);
871
+ while(buf!= bigger) {
872
+ ++absdiff;
873
+ ++buf;
874
+ }
875
+ if(*this<o)
876
+ return absdiff;
877
+ else
878
+ return - absdiff;
879
+ }
880
+
881
+ bool operator<(const type_of_iterator& o) {
882
+ if(buffer != o.buffer) return false;
883
+ if(mpointer==buffer.size()) return false;
884
+ if(o.mpointer==o.buffer.size()) return true;
885
+ if(offsetofpreviousrun<o.offsetofpreviousrun)
886
+ return true;
887
+ if(offsetofpreviousrun>o.offsetofpreviousrun)
888
+ return false;
889
+ if(currentrunoffset<o.currentrunoffset)
890
+ return true;
891
+ return false;
892
+ }
893
+ bool operator<=(const type_of_iterator& o) {
894
+ return ( (*this) < o ) || ((*this) == o);
895
+ }
896
+
897
+ bool operator>(const type_of_iterator& o) {
898
+ return ! ((*this) <= o ) ;
899
+ }
900
+
901
+ bool operator>=(const type_of_iterator& o) {
902
+ return ! ((*this) < o ) ;
903
+ }
904
+
905
+ EWAHBoolArraySetBitForwardIterator & operator++() {
906
+ ++currentrunoffset;
907
+ advanceToNextSetBit();
908
+ return *this;
909
+ }
910
+ EWAHBoolArraySetBitForwardIterator operator++(int) {
911
+ EWAHBoolArraySetBitForwardIterator old(*this);
912
+ ++currentrunoffset;
913
+ advanceToNextSetBit();
914
+ return old;
915
+ }
916
+ bool operator==(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
917
+ // if they are both over, return true
918
+ if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
919
+ return true;
920
+ return (buffer == o.buffer) && (mpointer == o.mpointer) &&
921
+ (offsetofpreviousrun == o.offsetofpreviousrun) && (currentrunoffset == o.currentrunoffset);
922
+ }
923
+
924
+ bool operator!=(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
925
+ // if they are both over, return false
926
+ if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
927
+ return false;
928
+ return (buffer != o.buffer) || (mpointer != o.mpointer) ||
929
+ (offsetofpreviousrun != o.offsetofpreviousrun) || (currentrunoffset != o.currentrunoffset);
930
+ }
931
+
932
+
933
+ EWAHBoolArraySetBitForwardIterator(const EWAHBoolArraySetBitForwardIterator & o) : buffer(o.buffer), mpointer(o.mpointer),
934
+ offsetofpreviousrun(o.offsetofpreviousrun), currentrunoffset(o.currentrunoffset), rlw(o.rlw) {}
935
+
936
+ private:
937
+
938
+ bool advanceToNextSetBit() {
939
+ if(mpointer==buffer.size()) return false;
940
+ if (currentrunoffset<static_cast<size_t>(rlw.getRunningLength() * wordinbits)) {
941
+ if(rlw.getRunningBit())
942
+ return true;// nothing to do
943
+ currentrunoffset = static_cast<size_t>(rlw.getRunningLength() * wordinbits);//skipping
944
+ }
945
+ while(true) {
946
+ const size_t indexoflitword = static_cast<size_t>( (currentrunoffset-rlw.getRunningLength() * wordinbits)/wordinbits);
947
+ if(indexoflitword>= rlw.getNumberOfLiteralWords() ) {
948
+ if(advanceToNextRun())
949
+ return advanceToNextSetBit();
950
+ else {
951
+ return false;
952
+ }
953
+ }
954
+ const uword currentword = buffer[mpointer + 1 + indexoflitword];
955
+ for(uint inwordpointer =
956
+ static_cast<uint>((currentrunoffset-rlw.getRunningLength() * wordinbits)%wordinbits);
957
+ inwordpointer<wordinbits;++inwordpointer,++currentrunoffset) {
958
+ if((currentword & (static_cast<uword>(1) << inwordpointer))!=0)
959
+ return true;
960
+ }
961
+ }
962
+ }
963
+
964
+ bool advanceToNextRun() {
965
+ offsetofpreviousrun += currentrunoffset;
966
+ currentrunoffset = 0;
967
+ mpointer += static_cast<size_t>(1 + rlw.getNumberOfLiteralWords());
968
+ if(mpointer<buffer.size()) {
969
+ rlw.mydata = buffer[mpointer];
970
+ } else {
971
+ return false;
972
+ }
973
+ return true;
974
+ }
975
+
976
+
977
+ EWAHBoolArraySetBitForwardIterator(const vector<uword> & parent, size_t startpointer = 0) : buffer(parent), mpointer(startpointer),
978
+ offsetofpreviousrun(0), currentrunoffset(0), rlw(0) {
979
+ if(mpointer<buffer.size()) {
980
+ rlw.mydata = buffer[mpointer];
981
+ advanceToNextSetBit();
982
+ }
983
+ }
984
+
985
+
986
+ const vector<uword> & buffer;
987
+ size_t mpointer;
988
+ size_t offsetofpreviousrun;
989
+ size_t currentrunoffset;
990
+ friend class EWAHBoolArray<uword>;
991
+ ConstRunningLengthWord<uword> rlw;
992
+ };
993
+
994
+
995
+
996
+ /**
997
+ * This object is returned by the compressed bitmap as a
998
+ * statistical descriptor.
999
+ */
1000
class BitmapStatistics {
public:
    // Raw counters; the accessors below only combine or expose them.
    size_t totalliteral;
    size_t totalcompressed;
    size_t runningwordmarker;
    size_t maximumofrunningcounterreached;

    /** Starts with every counter at zero. */
    BitmapStatistics()
        : totalliteral(0),
          totalcompressed(0),
          runningwordmarker(0),
          maximumofrunningcounterreached(0) {}

    /** Literal words plus marker words. */
    size_t getCompressedSize() const {
        const size_t compressed = totalliteral + runningwordmarker;
        return compressed;
    }

    /** Literal words plus the words represented by runs. */
    size_t getUncompressedSize() const {
        const size_t uncompressed = totalliteral + totalcompressed;
        return uncompressed;
    }

    size_t getNumberOfDirtyWords() const { return totalliteral; }

    size_t getNumberOfCleanWords() const { return totalcompressed; }

    size_t getNumberOfMarkers() const { return runningwordmarker; }

    size_t getOverRuns() const { return maximumofrunningcounterreached; }
};
1026
+
1027
+
1028
+ template <class uword>
1029
+ void EWAHBoolArray<uword>::set(size_t i) {
1030
+ // must I complete a word?
1031
+ if ( (sizeinbits % (8*sizeof(uword))) != 0) {
1032
+ const size_t possiblesizeinbits = (sizeinbits /(8*sizeof(uword)))*(8*sizeof(uword)) + (8*sizeof(uword));
1033
+ if(possiblesizeinbits<i+1) {
1034
+ sizeinbits = possiblesizeinbits;
1035
+ }
1036
+ }
1037
+ addStreamOfEmptyWords(false, (i/(8*sizeof(uword))) - sizeinbits/(8*sizeof(uword)));
1038
+ size_t bittoflip = i-(sizeinbits/(8*sizeof(uword)) * (8*sizeof(uword)));
1039
+ // next, we set the bit
1040
+ RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
1041
+ if(( lastRunningLengthWord.getNumberOfLiteralWords() == 0) || ((sizeinbits
1042
+ -1)/(8*sizeof(uword)) < i/(8*sizeof(uword))) ) {
1043
+ const uword newdata = static_cast<uword>(static_cast<uword>(1)<<bittoflip);
1044
+ addLiteralWord(newdata);
1045
+ } else {
1046
+ buffer[buffer.size()-1] |= static_cast<uword>(static_cast<uword>(1)<<bittoflip);
1047
+ // check if we just completed a stream of 1s
1048
+ if(buffer[buffer.size()-1] == static_cast<uword>(~0)) {
1049
+ // we remove the last dirty word
1050
+ buffer[buffer.size()-1] = 0;
1051
+ buffer.resize(buffer.size()-1);
1052
+ lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(lastRunningLengthWord.getNumberOfLiteralWords()-1));
1053
+ // next we add one clean word
1054
+ addEmptyWord(true);
1055
+ }
1056
+ }
1057
+ sizeinbits = i+1;
1058
+ }
1059
+
1060
+
1061
+
1062
+ template <class uword>
1063
+ void EWAHBoolArray<uword>::inplace_logicalnot() {
1064
+ size_t pointer(0);
1065
+ while(pointer <buffer.size()) {
1066
+ RunningLengthWord<uword> rlw(buffer[pointer]);
1067
+ if(rlw.getRunningBit())
1068
+ rlw.setRunningBit(false);
1069
+ else
1070
+ rlw.setRunningBit(true);
1071
+ ++pointer;
1072
+ for(size_t k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
1073
+ buffer[pointer] = ~buffer[pointer];
1074
+ ++pointer;
1075
+ }
1076
+ }
1077
+ }
1078
+
1079
+
1080
+ template <class uword>
1081
+ void EWAHBoolArray<uword>::logicalnot(EWAHBoolArray & x) const {
1082
+ x.reset();
1083
+ x.buffer.reserve(buffer.size());
1084
+ EWAHBoolArrayRawIterator<uword> i = this->raw_iterator();
1085
+ while(i.hasNext()) {
1086
+ BufferedRunningLengthWord<uword> & rlw = i.next();
1087
+ x.addStreamOfEmptyWords(! rlw.getRunningBit(), rlw.getRunningLength());
1088
+ if(rlw.getNumberOfLiteralWords()>0) {
1089
+ const uword * dw = i.dirtyWords();
1090
+ for(size_t k = 0 ; k <rlw.getNumberOfLiteralWords(); ++k) {
1091
+ x.addLiteralWord(~ dw[k]);
1092
+ }
1093
+ }
1094
+ }
1095
+ x.sizeinbits = this->sizeinbits;
1096
+ }
1097
+
1098
+
1099
+ template <class uword>
1100
+ size_t EWAHBoolArray<uword>::add(const uword newdata, const uint bitsthatmatter) {
1101
+ sizeinbits += bitsthatmatter;
1102
+ if(newdata == 0) {
1103
+ return addEmptyWord(0);
1104
+ } else if (newdata == static_cast<uword>(~0)) {
1105
+ return addEmptyWord(1);
1106
+ } else {
1107
+ return addLiteralWord(newdata);
1108
+ }
1109
+ }
1110
+
1111
+
1112
+ template <class uword>
1113
+ inline void EWAHBoolArray<uword>::writeBuffer(ostream & out) const {
1114
+ if(buffer.size()>0)
1115
+ out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffer.size());
1116
+ }
1117
+
1118
+
1119
+ template <class uword>
1120
+ inline void EWAHBoolArray<uword>::readBuffer(istream & in, const size_t buffersize) {
1121
+ buffer.resize(buffersize);
1122
+ if(buffersize>0)
1123
+ in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
1124
+ }
1125
+
1126
+
1127
+ template <class uword>
1128
+ void EWAHBoolArray<uword>::write(ostream & out, const bool savesizeinbits) const {
1129
+ if(savesizeinbits)out.write(reinterpret_cast<const char *>( & sizeinbits), sizeof(sizeinbits));
1130
+ const size_t buffersize = buffer.size();
1131
+ out.write(reinterpret_cast<const char *>(& buffersize),sizeof(buffersize));
1132
+ if(buffersize>0)
1133
+ out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffersize);
1134
+ }
1135
+
1136
+
1137
+ template <class uword>
1138
+ void EWAHBoolArray<uword>::read(istream & in, const bool savesizeinbits) {
1139
+ if(savesizeinbits) in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
1140
+ else sizeinbits = 0;
1141
+ size_t buffersize(0);
1142
+ in.read(reinterpret_cast<char *>(&buffersize), sizeof(buffersize));
1143
+ buffer.resize(buffersize);
1144
+ if(buffersize>0)
1145
+ in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
1146
+ }
1147
+
1148
+
1149
+ template <class uword>
1150
+ size_t EWAHBoolArray<uword>::addLiteralWord(const uword newdata) {
1151
+ RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
1152
+ uword numbersofar = lastRunningLengthWord.getNumberOfLiteralWords();
1153
+ if(numbersofar >= RunningLengthWord<uword>::largestliteralcount) {//0x7FFF) {
1154
+ buffer.push_back(0);
1155
+ lastRLW = buffer.size() - 1;
1156
+ RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
1157
+ lastRunningLengthWord2.setNumberOfLiteralWords(1);
1158
+ buffer.push_back(newdata);
1159
+ return 2;
1160
+ }
1161
+ lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(numbersofar + 1));
1162
+ assert(lastRunningLengthWord.getNumberOfLiteralWords()==numbersofar + 1);
1163
+ buffer.push_back(newdata);
1164
+ return 1;
1165
+ }
1166
+
1167
+
1168
+
1169
+
1170
+ template <class uword>
1171
+ size_t EWAHBoolArray<uword>::padWithZeroes(const size_t totalbits) {
1172
+ assert(totalbits >= sizeinbits);
1173
+ size_t missingbits = totalbits - sizeinbits;
1174
+ size_t wordsadded = addStreamOfEmptyWords(0, missingbits/wordinbits + ((missingbits % wordinbits != 0) ? 1 : 0));
1175
+ assert(sizeinbits >= totalbits);
1176
+ assert(sizeinbits <= totalbits + wordinbits);
1177
+ sizeinbits = totalbits;
1178
+ return wordsadded;
1179
+ }
1180
+
1181
+
1182
+
1183
+ /**
1184
+ * This is a low-level iterator.
1185
+ */
1186
+
1187
+ template <class uword=uword32>
1188
+ class EWAHBoolArrayRawIterator {
1189
+ public:
1190
+ EWAHBoolArrayRawIterator(const EWAHBoolArray<uword> & p) : pointer(0),
1191
+ myparent(&p.getBuffer()), rlw((*myparent)[pointer]) { //RunningLength(0), NumberOfLiteralWords(0), Bit(0) {
1192
+ if(verbose) {
1193
+ cout<<"created a new raw iterator over buffer of size "<<myparent->size()<<endl;
1194
+ }
1195
+ }
1196
+ EWAHBoolArrayRawIterator(const EWAHBoolArrayRawIterator & o) : pointer(o.pointer),
1197
+ myparent(o.myparent), rlw(o.rlw) {}
1198
+
1199
+
1200
+ bool hasNext() const {
1201
+ if(verbose)cout<<"call to hasNext, pointer is at "<<pointer<< ", parent.size()= "<<myparent->size()<<endl;
1202
+ return pointer < myparent->size();
1203
+ }
1204
+
1205
+ BufferedRunningLengthWord<uword> & next() {
1206
+ assert(pointer < myparent->size());
1207
+ rlw.read( (*myparent)[pointer]);
1208
+ pointer = static_cast<size_t>(pointer + rlw.getNumberOfLiteralWords() + 1);
1209
+ return rlw;
1210
+ }
1211
+
1212
+ const uword * dirtyWords() const {
1213
+ assert(pointer>0);
1214
+ assert(pointer>=rlw.getNumberOfLiteralWords());
1215
+ return & (myparent->at(static_cast<size_t>(pointer-rlw.getNumberOfLiteralWords())));
1216
+ }
1217
+
1218
+ EWAHBoolArrayRawIterator & operator=(const EWAHBoolArrayRawIterator & other) {
1219
+ pointer = other.pointer;
1220
+ myparent=other.myparent;
1221
+ rlw=other.rlw;
1222
+ return *this;
1223
+ }
1224
+
1225
+ enum {verbose=false};
1226
+ size_t pointer;
1227
+ const vector<uword> * myparent;
1228
+ BufferedRunningLengthWord<uword> rlw;
1229
+ private:
1230
+
1231
+ EWAHBoolArrayRawIterator();
1232
+ };
1233
+
1234
+
1235
+
1236
+
1237
+
1238
+
1239
+ template <class uword>
1240
+ EWAHBoolArrayIterator<uword> EWAHBoolArray<uword>::uncompress() const {
1241
+ return EWAHBoolArrayIterator<uword>(buffer);
1242
+ }
1243
+
1244
+ template <class uword>
1245
+ EWAHBoolArrayRawIterator<uword> EWAHBoolArray<uword>::raw_iterator() const {
1246
+ return EWAHBoolArrayRawIterator<uword>(*this);
1247
+ }
1248
+
1249
+
1250
+ template <class uword>
1251
+ EWAHBoolArraySparseIterator<uword> EWAHBoolArray<uword>::sparse_uncompress() const {
1252
+ return EWAHBoolArraySparseIterator<uword>(buffer);
1253
+ }
1254
+
1255
+ template <class uword>
1256
+ bool EWAHBoolArray<uword>::operator==(const EWAHBoolArray & x) const {
1257
+ if(sizeinbits != x.sizeinbits) return false;
1258
+ if(buffer.size() != x.buffer.size()) return false;
1259
+ for(size_t k = 0; k < buffer.size(); ++k)
1260
+ if(buffer[k] != x.buffer[k]) return false;
1261
+ return true;
1262
+ }
1263
+
1264
+ template <class uword>
1265
+ void EWAHBoolArray<uword>::swap(EWAHBoolArray & x) {
1266
+ buffer.swap(x.buffer);
1267
+ size_t tmp = x.sizeinbits;
1268
+ x.sizeinbits = sizeinbits;
1269
+ sizeinbits = tmp;
1270
+ tmp = x.lastRLW;
1271
+ x.lastRLW = lastRLW;
1272
+ lastRLW = tmp;
1273
+ }
1274
+
1275
+ template <class uword>
1276
+ void EWAHBoolArray<uword>::append(const EWAHBoolArray & x) {
1277
+ if(sizeinbits % wordinbits == 0) {
1278
+ // hoping for the best?
1279
+ sizeinbits += x.sizeinbits;
1280
+ ConstRunningLengthWord<uword> lRLW(buffer[lastRLW]);
1281
+ if( (lRLW.getRunningLength() == 0) && (lRLW.getNumberOfLiteralWords() == 0)) {
1282
+ // it could be that the running length word is empty, in such a case,
1283
+ // we want to get rid of it!
1284
+ assert(lastRLW == buffer.size()-1);
1285
+ lastRLW = x.lastRLW + buffer.size() - 1;
1286
+ buffer.resize(buffer.size()-1);
1287
+ buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
1288
+ } else {
1289
+ lastRLW = x.lastRLW + buffer.size();
1290
+ buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
1291
+ }
1292
+ } else {
1293
+ stringstream ss;
1294
+ ss<<"This should really not happen! You are trying to append to a bitmap having a fractional number of words, that is, "<<static_cast<int>(sizeinbits)<<" bits with a word size in bits of "<<static_cast<int>(wordinbits)<<". ";
1295
+ ss<<"Size of the bitmap being appended: "<<x.sizeinbits<<" bits."<<endl;
1296
+ throw invalid_argument(ss.str());
1297
+ }
1298
+ }
1299
+
1300
+ template <class uword>
1301
+ EWAHBoolArrayIterator<uword>::EWAHBoolArrayIterator(const vector<uword> & parent) :
1302
+ pointer(0),
1303
+ myparent(parent),
1304
+ compressedwords(0), literalwords(0), rl(0), lw(0), b(0) {
1305
+ if(pointer <myparent.size()) readNewRunningLengthWord();
1306
+ }
1307
+
1308
+
1309
+ template <class uword>
1310
+ void EWAHBoolArrayIterator<uword>::readNewRunningLengthWord() {
1311
+ literalwords = 0;
1312
+ compressedwords = 0;
1313
+ ConstRunningLengthWord<uword> rlw(myparent[pointer]);
1314
+ rl = rlw.getRunningLength();
1315
+ lw = rlw.getNumberOfLiteralWords();
1316
+ b = rlw.getRunningBit();
1317
+ if((rl == 0) && (lw == 0)) {
1318
+ if(pointer < myparent.size() -1) {
1319
+ ++pointer;
1320
+ readNewRunningLengthWord();
1321
+ } else {
1322
+ assert(pointer >= myparent.size()-1);
1323
+ pointer = myparent.size();
1324
+ assert(! hasNext());
1325
+ }
1326
+ }
1327
+ }
1328
+
1329
+ template <class uword>
1330
+ BoolArray<uword> EWAHBoolArray<uword>::toBoolArray() const {
1331
+ BoolArray<uword> ans(sizeinbits);
1332
+ EWAHBoolArrayIterator<uword> i = uncompress();
1333
+ int counter = 0;
1334
+ while(i.hasNext()) {
1335
+ ans.setWord(counter++,i.next());
1336
+ }
1337
+ return ans;
1338
+ }
1339
+
1340
+ template <class uword>
1341
+ size_t EWAHBoolArray<uword>::numberOfOnes() {
1342
+ size_t c (0);
1343
+ EWAHBoolArraySparseIterator<uword> i = sparse_uncompress();
1344
+ while(i.hasNext()) {
1345
+ const uword currentword = i.next();
1346
+ c += countOnes(currentword);
1347
+ /*
1348
+ for(int k = 0; k < wordinbits; ++k) {
1349
+ if ( (currentword & (static_cast<uword>(1) << k)) != 0)
1350
+ ++c;
1351
+ }*/
1352
+
1353
+ }
1354
+ return c;
1355
+
1356
+ }
1357
+
1358
+
1359
+
1360
+
1361
+ template <class uword>
1362
+ template <class container>
1363
+ void EWAHBoolArray<uword>::appendRowIDs(container & out, const size_t offset) const {
1364
+ size_t pointer(0);
1365
+ size_t currentoffset(offset);
1366
+ if(RESERVEMEMORY) out.reserve(buffer.size()+64);// trading memory for speed.
1367
+ while(pointer <buffer.size()) {
1368
+ ConstRunningLengthWord<uword> rlw(buffer[pointer]);
1369
+ if(rlw.getRunningBit()) {
1370
+ for(size_t x = 0; x< static_cast<size_t>(rlw.getRunningLength()*wordinbits); ++x) {
1371
+ out.push_back(currentoffset + x);
1372
+ }
1373
+ }
1374
+ currentoffset = static_cast<size_t>(currentoffset + rlw.getRunningLength() * wordinbits);
1375
+ ++pointer;
1376
+ for(uword k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
1377
+ const uword currentword = buffer[pointer];
1378
+ for(uint kk = 0; kk < wordinbits; ++kk) {
1379
+ if ( ( currentword & static_cast<uword>(static_cast<uword>(1) << kk)) != 0)
1380
+ out.push_back(currentoffset + kk);
1381
+ }
1382
+ currentoffset+=wordinbits;
1383
+ ++pointer;
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+
1389
+
1390
+ template <class uword>
1391
+ bool EWAHBoolArray<uword>::operator!=(const EWAHBoolArray<uword> & x) const {
1392
+ return !(*this == x);
1393
+ }
1394
+
1395
+ template <class uword>
1396
+ bool EWAHBoolArray<uword>::operator==(const BoolArray<uword> & x) const {
1397
+ // could be more efficient
1398
+ return (this->toBoolArray() == x);
1399
+ }
1400
+
1401
+ template <class uword>
1402
+ bool EWAHBoolArray<uword>::operator!=(const BoolArray<uword> & x) const {
1403
+ // could be more efficient
1404
+ return (this->toBoolArray() != x);
1405
+ }
1406
+
1407
+
1408
+ template <class uword>
1409
+ size_t EWAHBoolArray<uword>::addStreamOfEmptyWords(const bool v, const size_t number) {
1410
+ if(number == 0) return 0;
1411
+ RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
1412
+ const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
1413
+ //firts, if the last running length word is empty, we align it
1414
+ // this
1415
+ const uword runlen = lastRunningLengthWord.getRunningLength();
1416
+ if( ( noliteralword ) && ( runlen == 0 )) {
1417
+ lastRunningLengthWord.setRunningBit(v);
1418
+ }
1419
+ size_t wordsadded (0);
1420
+ if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
1421
+ // that's the easy case, we are just continuing
1422
+ uword whatwecanadd = static_cast<uword>( number < static_cast<uword>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) );
1423
+ lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+whatwecanadd));
1424
+ sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
1425
+ if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>(wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
1426
+ } else {
1427
+ buffer.push_back(0);
1428
+ ++wordsadded;
1429
+ lastRLW = buffer.size() - 1;
1430
+ RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
1431
+ uword whatwecanadd = static_cast<uword>( number < RunningLengthWord<uword>::largestrunninglengthcount ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount) );
1432
+ lastRunningLengthWord2.setRunningBit(v);
1433
+ lastRunningLengthWord2.setRunningLength(whatwecanadd);
1434
+ sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
1435
+ if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>( wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
1436
+ }
1437
+ return wordsadded;
1438
+ }
1439
+
1440
+
1441
+ template <class uword>
1442
+ size_t EWAHBoolArray<uword>::addStreamOfDirtyWords(const uword * v, const size_t number) {
1443
+ if(number == 0) return 0;
1444
+ RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
1445
+ const uword NumberOfLiteralWords = lastRunningLengthWord.getNumberOfLiteralWords();
1446
+ assert(RunningLengthWord<uword>::largestliteralcount >= NumberOfLiteralWords);
1447
+ const size_t whatwecanadd = number < static_cast<uword>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords);//0x7FFF-NumberOfLiteralWords);
1448
+ assert(NumberOfLiteralWords+whatwecanadd>=NumberOfLiteralWords);
1449
+ assert(NumberOfLiteralWords+whatwecanadd<=RunningLengthWord<uword>::largestliteralcount);
1450
+ lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(NumberOfLiteralWords+whatwecanadd));
1451
+ assert(lastRunningLengthWord.getNumberOfLiteralWords()==NumberOfLiteralWords+whatwecanadd);
1452
+ const size_t leftovernumber = number - whatwecanadd;
1453
+ // add the dirty words...
1454
+ const size_t oldsize (buffer.size());
1455
+ buffer.resize(oldsize+whatwecanadd);
1456
+ memcpy(&buffer[oldsize],v,whatwecanadd*sizeof(uword));
1457
+ size_t wordsadded(whatwecanadd);
1458
+ if(leftovernumber>0) {
1459
+ //add
1460
+ buffer.push_back(0);
1461
+ lastRLW=buffer.size() - 1;
1462
+ ++wordsadded;
1463
+ wordsadded+=addStreamOfDirtyWords(v+whatwecanadd, leftovernumber);
1464
+ }
1465
+ assert(wordsadded >= number);
1466
+ return wordsadded;
1467
+ }
1468
+
1469
+
1470
+
1471
+ template <class uword>
1472
+ size_t EWAHBoolArray<uword>::addEmptyWord(const bool v) {
1473
+ RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
1474
+ const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
1475
+ //firts, if the last running length word is empty, we align it
1476
+ // this
1477
+ uword runlen = lastRunningLengthWord.getRunningLength();
1478
+ if( ( noliteralword ) && ( runlen == 0 )) {
1479
+ lastRunningLengthWord.setRunningBit(v);
1480
+ assert(lastRunningLengthWord.getRunningBit() == v);
1481
+ }
1482
+ if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
1483
+ lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+1));
1484
+ assert(lastRunningLengthWord.getRunningLength() == runlen+1);
1485
+ return 0;
1486
+ } else {
1487
+ // we have to start anew
1488
+ buffer.push_back(0);
1489
+ lastRLW = buffer.size() - 1;
1490
+ RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
1491
+ assert(lastRunningLengthWord2.getRunningLength()==0);
1492
+ assert(lastRunningLengthWord2.getRunningBit()==0);
1493
+ assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
1494
+ lastRunningLengthWord2.setRunningBit(v);
1495
+ assert(lastRunningLengthWord2.getRunningBit() == v);
1496
+ lastRunningLengthWord2.setRunningLength(1);
1497
+ assert(lastRunningLengthWord2.getRunningLength() == 1);
1498
+ assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
1499
+ return 1;
1500
+ }
1501
+ }
1502
+
1503
+
1504
+
1505
/**
 * Compute the logical AND of this bitmap and a, writing the result into
 * container (which is reset first).
 *
 * Both operands are first brought to the same size with makeSameSize(), so a
 * (and possibly this bitmap) may be modified. The two bitmaps are walked with
 * sparse iterators; only words at positions where both iterators meet are
 * ANDed, and the gaps in between are filled with zero words.
 */
template <class uword>
void EWAHBoolArray<uword>::sparselogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
    makeSameSize(a);
    container.reset();
    // the result is presized to the larger of the two compressed buffers
    if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
    assert(sizeInBits() == a.sizeInBits());
    /**
     * This could possibly be faster if we go around
     * the uncompress calls.
     */
    EWAHBoolArraySparseIterator<uword> i = a.sparse_uncompress();
    EWAHBoolArraySparseIterator<uword> j = sparse_uncompress();
    size_t pos (0); // number of words written to the container so far
    uword x,y;      // last word read from i and from j respectively
    bool ibehindj,jbehindi;
    while(i.hasNext() and j.hasNext()) {
        x = i.next();
        y = j.next();
        ibehindj = i.position() < j.position();
        jbehindi = j.position() < i.position();
        // advance whichever iterator lags behind until the positions match
        // or the lagging iterator runs out of words
        while (( ibehindj and i.hasNext()) or (jbehindi and j.hasNext())) {
            if(ibehindj) x = i.next();
            else if(jbehindi) y = j.next();
            ibehindj = i.position() < j.position();
            jbehindi = j.position() < i.position();
        }
        // everything before the smaller of the two positions is zero in the result
        size_t nextnonzero = i.position()< j.position() ?i.position(): j.position() ;
        if(nextnonzero > pos + 1) {
            container.addStreamOfEmptyWords(0, nextnonzero-pos-1);
            pos += nextnonzero-pos-1;
        }
        // both iterators sit on the same word: this is the only case that
        // contributes an ANDed word
        if(i.position() == j.position()) {
            container.add(x & y);
            ++pos;
        }
    }
    // the result always has the same size in bits as the operands
    container.setSizeInBits(sizeInBits());
    //return answer;
}
1544
+
1545
+
1546
+
1547
/**
 * Compute the logical OR of this bitmap and a, writing the result into
 * container (which is reset first).
 *
 * Both operands are first brought to the same size with makeSameSize(), so a
 * (and possibly this bitmap) may be modified. The bitmaps are walked marker
 * by marker with raw iterators. At every step the marker covering fewer
 * words is called the "prey" and the other one the "predator": the prey is
 * consumed entirely against the front of the predator, then the prey's
 * iterator moves on to its next marker.
 */
template <class uword>
void EWAHBoolArray<uword>::rawlogicalor(EWAHBoolArray &a, EWAHBoolArray &container) {
    makeSameSize(a);
    container.reset();
    if(RESERVEMEMORY) container.buffer.reserve(buffer.size()+a.buffer.size());
    assert(sizeInBits() == a.sizeInBits());
    EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
    EWAHBoolArrayRawIterator<uword> j = raw_iterator();
    if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
        container.setSizeInBits(sizeInBits());
        return;
    }
    // at this point, this should be safe:
    // (rlwi / rlwj refer to the buffered word owned by each iterator)
    BufferedRunningLengthWord<uword> & rlwi = i.next();
    BufferedRunningLengthWord<uword> & rlwj = j.next();
    //RunningLength;
    while (true) {
        // the marker describing fewer words is the prey
        bool i_is_prey (rlwi.size()<rlwj.size());
        BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
        BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
        // step 1: consume the run part of the prey
        if(prey.getRunningBit() == 0) {
            // we have a stream of 0x00: the result is whatever the predator holds
            const uword predatorrl (predator.getRunningLength());
            const uword preyrl (prey.getRunningLength());
            if(predatorrl >= preyrl) {
                // the predator's run covers the whole prey run
                const uword tobediscarded = preyrl ;
                container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
            } else {
                // the predator's run is shorter: copy its run, then the
                // literal words of the predator that face the rest of the prey run
                const uword tobediscarded = predatorrl ;
                container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
                if(preyrl - tobediscarded>0) {
                    const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
                    container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
                }
            }
            predator.discardFirstWords(preyrl);
            prey.discardFirstWords(preyrl);
        } else {
            // we have a stream of 1x11: ORing with ones always gives ones
            const uword preyrl (prey.getRunningLength());
            predator.discardFirstWords(preyrl);
            prey.discardFirstWords(preyrl);
            container.addStreamOfEmptyWords(1, static_cast<size_t>(preyrl));
        }
        // step 2: what is left of the predator's run faces literal words of the prey
        const uword predatorrl (predator.getRunningLength());
        if(predatorrl>0) {
            if(predator.getRunningBit() == 0) {
                // zeroes in the predator: the prey's literal words pass through
                const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
                const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
                if(tobediscarded>0) {
                    const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
                    container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
                    predator.discardFirstWords(tobediscarded);
                    prey.discardFirstWords(tobediscarded);
                }
            } else {
                // ones in the predator: the result is ones whatever the prey holds
                const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
                const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
                predator.discardFirstWords(tobediscarded);
                prey.discardFirstWords(tobediscarded);
                container.addStreamOfEmptyWords(1, static_cast<size_t>(tobediscarded));
            }
        }
        assert(prey.getRunningLength() ==0);
        // step 3: all that is left to do now is to OR the dirty words
        uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
        if(nbre_dirty_prey > 0) {
            assert(predator.getRunningLength() ==0);
            const uword * idirty = i.dirtyWords();
            const uword * jdirty = j.dirtyWords();
            for(uword k = 0; k< nbre_dirty_prey; ++k) {
                container.add(idirty[k] | jdirty[k]);
            }
            // only the predator needs trimming: the prey's marker is replaced below
            predator.discardFirstWords(nbre_dirty_prey);
        }
        // the prey is exhausted: fetch its next marker, or stop when there is none
        if( i_is_prey ) {
            if(!i.hasNext()) break;
            rlwi = i.next();
        } else {
            if(!j.hasNext()) break;
            rlwj = j.next();
        }
    }
    // the result always has the same size in bits as the operands
    container.setSizeInBits(sizeInBits());
}
1632
+
1633
+
1634
// Writes the logical AND of this bitmap and `a` into `container`.
//
// Both operands are first passed through makeSameSize(a) (hence `a` is taken
// by non-const reference), and `container` is reset() before any word is
// appended. The two compressed streams are then walked in lock-step with raw
// iterators. In every round the run-length word with the smaller size() is the
// "prey" and the other one is the "predator".
//
// Parameters:
//   a         - the other operand of the AND.
//   container - receives the result; its previous content is discarded and its
//               size in bits is set to sizeInBits() of this bitmap on exit.
//
// NOTE(review): the exact semantics of size(), discardFirstWords() and
// dirtyWords() are defined outside this view; the comments below describe how
// they are used here, not how they are implemented.
+ template <class uword>
1635
+ void EWAHBoolArray<uword>::rawlogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
1636
+ makeSameSize(a);
1637
+ container.reset();
1638
// Optional pre-allocation: reserve as many words as the larger input buffer holds.
+ if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
1639
+ assert(sizeInBits() == a.sizeInBits());
1640
// i walks the argument, j walks this bitmap.
+ EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
1641
+ EWAHBoolArrayRawIterator<uword> j = raw_iterator();
1642
+ if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
1643
+ container.setSizeInBits(sizeInBits());
1644
+ return;
1645
+ }
1646
+ // at this point, this should be safe:
1647
+ BufferedRunningLengthWord<uword> & rlwi = i.next();
1648
+ BufferedRunningLengthWord<uword> & rlwj = j.next();
1649
+ //RunningLength;
1650
+ while (true) {
1651
// The side whose current word reports the smaller size() is the prey this round.
+ bool i_is_prey (rlwi.size()<rlwj.size());
1652
+ BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
1653
+ BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
1654
// Prey run of 0s: the AND is 0 whatever the predator holds, so the same number
// of words is dropped from both sides and a run of 0 words is emitted.
+ if(prey.getRunningBit() == 0) {
1655
+ const uword preyrl (prey.getRunningLength());
1656
+ predator.discardFirstWords(preyrl);
1657
+ prey.discardFirstWords(preyrl);
1658
+ container.addStreamOfEmptyWords(0, static_cast<size_t>(preyrl));
1659
+ } else {
1660
+ // we have a stream of 1x11
1661
// Prey run of 1s: the result over this span is whatever the predator holds.
// The overlapping part of the predator's run is emitted as a run with the
// predator's running bit; any remainder of the prey run is covered by copying
// the predator's literal ("dirty") words.
+ const uword predatorrl (predator.getRunningLength());
1662
+ const uword preyrl (prey.getRunningLength());
1663
+ const uword tobediscarded = (predatorrl >= preyrl) ? preyrl : predatorrl;
1664
+ container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
1665
+ if(preyrl - tobediscarded>0) {
1666
+ const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
1667
+ container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
1668
+ }
1669
+ predator.discardFirstWords(preyrl);
1670
+ prey.discardFirstWords(preyrl);
1671
+ }
1672
// Re-read the predator's running length after the discards above: any run words
// it still has are matched against the prey's literal words.
+ const uword predatorrl (predator.getRunningLength());
1673
+ if(predatorrl>0) {
1674
+ if(predator.getRunningBit() == 0) {
1675
// Predator 0s: the overlapping prey literals are dropped and 0 words are emitted.
+ const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
1676
+ const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
1677
+ predator.discardFirstWords(tobediscarded);
1678
+ prey.discardFirstWords(tobediscarded);
1679
+ container.addStreamOfEmptyWords(0, static_cast<size_t>(tobediscarded));
1680
+ } else {
1681
// Predator 1s: the overlapping prey literals are copied to the output unchanged.
+ const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
1682
+ const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
1683
+ if(tobediscarded>0) {
1684
+ const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
1685
+ container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
1686
+ predator.discardFirstWords(tobediscarded);
1687
+ prey.discardFirstWords(tobediscarded);
1688
+ }
1689
+ }
1690
+ }
1691
+ assert(prey.getRunningLength() ==0);
1692
+ // all that is left to do now is to AND the dirty words
1693
+ uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
1694
+ if(nbre_dirty_prey > 0) {
1695
+ assert(predator.getRunningLength() ==0);
1696
+ const uword * idirty = i.dirtyWords();
1697
+ const uword * jdirty = j.dirtyWords();
1698
+ for(uword k = 0; k< nbre_dirty_prey; ++k) {
1699
+ container.add(idirty[k] & jdirty[k]);
1700
+ }
1701
// Only the predator is advanced here; the prey word is replaced just below by
// the next word from its iterator (or the loop ends).
+ predator.discardFirstWords(nbre_dirty_prey);
1702
+ }
1703
// Fetch the next run-length word for the prey side and stop as soon as that
// side has none left. rlwi/rlwj are references, so these assignments write into
// the referenced object rather than rebinding the reference.
+ if( i_is_prey ) {
1704
+ if(!i.hasNext()) break;
1705
+ rlwi = i.next();
1706
+ } else {
1707
+ if(!j.hasNext()) break;
1708
+ rlwj = j.next();
1709
+ }
1710
+ }
1711
+ container.setSizeInBits(sizeInBits());
1712
+ }
1713
+
1714
+
1715
+
1716
+
1717
// Walks the compressed bitmap with a raw iterator and returns the tallies in a
// BitmapStatistics object:
//   runningwordmarker              - number of run-length words visited
//   totalliteral                   - sum of getNumberOfLiteralWords() over them
//   totalcompressed                - sum of getRunningLength() over them
//   maximumofrunningcounterreached - how many of them have a running length equal
//                                    to RunningLengthWord<uword>::largestrunninglengthcount
// NOTE(review): the counters are only incremented here, so this relies on
// BitmapStatistics starting them at zero — confirm against its constructor.
+ template <class uword>
1718
+ BitmapStatistics EWAHBoolArray<uword>::computeStatistics() const {
1719
+ //uint totalcompressed(0), totalliteral(0);
1720
+ BitmapStatistics bs;
1721
+ EWAHBoolArrayRawIterator<uword> i = raw_iterator();
1722
+ while(i.hasNext()) {
1723
+ BufferedRunningLengthWord<uword> &brlw (i.next());
1724
+ ++bs.runningwordmarker;
1725
+ bs.totalliteral += brlw.getNumberOfLiteralWords();
1726
+ bs.totalcompressed += brlw.getRunningLength();
1727
// Count the words whose run-length counter is at its largest value.
+ if(brlw.getRunningLength() == RunningLengthWord<uword>::largestrunninglengthcount) {
1728
+ ++bs.maximumofrunningcounterreached;
1729
+ }
1730
+ }
1731
+ return bs;
1732
+ }
1733
+
1734
+
1735
// Debugging aid: prints the content of the compressed buffer to cout.
// After a banner and the total number of words in `buffer`, every marker word is
// reported with its position, running bit, running length and number of literal
// words, followed by each of those literal words rendered by toBinaryString().
+ template <class uword>
1736
+ void EWAHBoolArray<uword>::debugprintout() const {
1737
+ cout << "==printing out EWAHBoolArray=="<<endl;
1738
+ cout <<"Number of compressed words: "<< buffer.size()<< endl;
1739
// `pointer` is the index in `buffer` of the marker word currently being decoded.
+ size_t pointer = 0;
1740
+ while(pointer <buffer.size()) {
1741
+ ConstRunningLengthWord<uword> rlw(buffer[pointer]);
1742
+ bool b = rlw.getRunningBit() ;
1743
+ uword rl = rlw.getRunningLength() ;
1744
+ uword lw = rlw.getNumberOfLiteralWords();
1745
+ cout << "pointer = "<<pointer<<" running bit="<<b<<" running length="<<rl<<" lit. words="<<lw<<endl;
1746
// The lw literal words sit directly after the marker word.
+ for(uword j = 0; j < lw ; ++j) {
1747
+ const uword & w = buffer[pointer+j+1];
1748
+ cout<<toBinaryString(w)<<endl;;
1749
+ }
1750
// Step over the marker word and its literal words to reach the next marker.
+ pointer += lw + 1;
1751
+ }
1752
+ cout << "==END=="<<endl;
1753
+ }
1754
+
1755
// Returns a byte count computed as
//   sizeof(sizeinbits) + sizeof(size_t) + sizeof(uword) * buffer.size().
// NOTE(review): presumably this mirrors a serialized layout of the bit size, a
// size_t word count and then the raw words — confirm against the read/write
// routines, which are outside this view.
+ template <class uword>
1756
+ size_t EWAHBoolArray<uword>::sizeOnDisk() const {
1757
+ return sizeof(sizeinbits)+sizeof(size_t)+sizeof(uword)*buffer.size();
1758
+ }
1759
+
1760
+
1761
+
1762
+
1763
+ #endif