ewah-bitset 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/LICENSE +19 -0
- data/Manifest +11 -0
- data/README.md +33 -0
- data/Rakefile +18 -0
- data/ewah-bitset.gemspec +30 -0
- data/ext/boolarray.h +179 -0
- data/ext/ewah-bitset.cpp +176 -0
- data/ext/ewah.h +1763 -0
- data/ext/extconf.rb +7 -0
- data/spec/ewah_bitset_spec.rb +102 -0
- data/spec/spec.opts +2 -0
- metadata +89 -0
data/ext/ewah.h
ADDED
@@ -0,0 +1,1763 @@
|
|
1
|
+
#ifndef EWAH_H
|
2
|
+
#define EWAH_H
|
3
|
+
|
4
|
+
#include <string.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <cassert>
|
7
|
+
#include <iostream>
|
8
|
+
#include <vector>
|
9
|
+
#include <stdexcept>
|
10
|
+
#include <cstddef>
|
11
|
+
#include <iso646.h> // mostly for Microsoft compilers
|
12
|
+
|
13
|
+
#include "boolarray.h"
|
14
|
+
|
15
|
+
// Debug assertion macro (taken from stackoverflow).
// When NDEBUG is not defined, a false `condition` prints the stringified
// condition, the file name, the line number and `message` to std::cerr and
// then ends the process with std::exit(EXIT_FAILURE).
// When NDEBUG is defined, the macro expands to an empty statement and neither
// argument is evaluated.
#ifndef NDEBUG
#   define ASSERT(condition, message) \
    do { \
        if (! (condition)) { \
            std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \
                      << " line " << __LINE__ << ": " << message << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while (false)
#else
#   define ASSERT(condition, message) do { } while (false)
#endif
|
28
|
+
|
29
|
+
|
30
|
+
using namespace std;
|
31
|
+
|
32
|
+
|
33
|
+
/**
|
34
|
+
* count the number of bits set to one (32 bit version)
|
35
|
+
*/
|
36
|
+
uint countOnes(uword32 v) {
|
37
|
+
v = v - ((v >> 1) & 0x55555555);
|
38
|
+
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
|
39
|
+
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
|
40
|
+
}
|
41
|
+
/**
|
42
|
+
* count the number of bits set to one (64 bit version)
|
43
|
+
*/
|
44
|
+
uint countOnes(uword64 v) {
|
45
|
+
return countOnes(static_cast<uword32>(v))+ countOnes(static_cast<uword32>(v>>32));
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
/**
 * Count the number of bits set to one (16 bit version).
 *
 * Each pass clears the lowest set bit of v, so the loop body runs once per
 * set bit.
 *
 * Marked inline because the function is defined in a header (avoids
 * duplicate definitions when several translation units include it).
 */
inline uint countOnes(uword16 v) {
    uint c = 0;
    while (v) {
        v &= v - 1; // drop the lowest set bit
        ++c;
    }
    return c;
}
|
56
|
+
|
57
|
+
/**
 * Returns the binary representation of a binary word.
 *
 * The result has one character per bit of uword (sizeof(uword)*8 in total),
 * least-significant bit first: character k is '1' when bit k of w is set and
 * '0' otherwise.
 */
template <class uword>
inline std::string toBinaryString(const uword w) {
    const std::size_t nbits = sizeof(uword) * 8;
    // Start with all zeroes and flip the set bits in place. This produces the
    // same text as streaming the digits through a stringstream, without
    // needing <sstream> (which this header does not include).
    std::string ans(nbits, '0');
    for (std::size_t k = 0; k < nbits; ++k) {
        if (w & (static_cast<uword>(1) << k)) ans[k] = '1';
    }
    return ans;
}
|
70
|
+
|
71
|
+
|
72
|
+
/**
 * For expert users.
 * This class is used to represent a special type of word storing
 * a run length. It is defined by the Enhanced Word Aligned Hybrid (EWAH)
 * format. You don't normally need to access this class.
 *
 * Bit layout of the wrapped word, starting at the least-significant bit (see
 * the constants at the end of the class): one running bit, then
 * runninglengthbits bits holding the running length, then the remaining
 * high bits holding the number of literal words.
 *
 * The object stores a reference to the word, so every setter writes
 * straight into the caller's storage.
 */
template <class uword=uword32>
class RunningLengthWord {
public:
    /** Wraps (does not copy) the given word. */
    RunningLengthWord (uword & data) : mydata(data) { }

    /** The copy refers to the same underlying word as rlw. */
    RunningLengthWord(const RunningLengthWord & rlw) : mydata(rlw.mydata) {}

    /**
     * Copies the value of rlw's word into the word referenced by this
     * object; the reference itself is not rebound.
     */
    RunningLengthWord& operator=(const RunningLengthWord & rlw) {
        mydata = rlw.mydata;
        return *this;
    }


    /**
     * Which bit is being repeated?
     */
    bool getRunningBit() const {
        return mydata & static_cast<uword>(1);
    }


    /**
     * Which bit is being repeated? (static version working on a raw word)
     */
    static inline bool getRunningBit(uword data) {
        return data & static_cast<uword>(1) ;
    }

    /**
     * how many words should be filled by the running bit
     */
    uword getRunningLength() const {
        return (mydata >> 1) & largestrunninglengthcount ;
    }


    /**
     * how many words should be filled by the running bit
     * (static version working on a raw word)
     */
    static inline uword getRunningLength(uword data) {
        return (data >> 1) & largestrunninglengthcount ;
    }

    /**
     * followed by how many literal words?
     */
    uword getNumberOfLiteralWords() const {
        return static_cast<uword>(mydata >> (1+runninglengthbits));
    }

    /**
     * Total of getRunningLength() and getNumberOfLiteralWords()
     */
    uword size() const {
        return getRunningLength() + getNumberOfLiteralWords();
    }


    /**
     * followed by how many literal words?
     * (static version working on a raw word)
     */
    static inline uword getNumberOfLiteralWords(uword data) {
        return data >> (1+runninglengthbits);
    }


    /**
     * running length of which type of bits
     * (sets or clears the lowest bit of the word)
     */
    void setRunningBit(bool b) {
        if(b) mydata |= static_cast<uword>(1);
        else mydata &= static_cast<uword>(~1);
    }



    /**
     * running length of which type of bits
     * (static version working on a raw word)
     */
    static inline void setRunningBit(uword & data, bool b) {
        if(b) data |= static_cast<uword>(1);
        else data &= static_cast<uword>(~1);
    }


    /**
     * Drops the first x words described by this marker: the running length
     * is consumed first, and whatever remains of x is subtracted from the
     * number of literal words. x must not exceed size() (checked by assert).
     */
    void discardFirstWords(uword x) {
        assert(x<= size());
        const uword rl ( getRunningLength() );
        if(rl >= x) {
            // the run alone covers the discarded words
            setRunningLength(rl - x);
            return;
        }
        x -= rl;
        setRunningLength(0);
        setNumberOfLiteralWords(getNumberOfLiteralWords() - x);
    }

    /**
     * Stores l in the running-length field. The field is first filled with
     * ones and then ANDed with l shifted into place; the mask keeps all bits
     * outside the field unchanged.
     */
    void setRunningLength(uword l) {
        mydata |= shiftedlargestrunninglengthcount;
        mydata &= static_cast<uword>(l << 1) | notshiftedlargestrunninglengthcount;
    }

    // static call for people who hate objects
    static inline void setRunningLength(uword & data, uword l) {
        data |= shiftedlargestrunninglengthcount;
        data &= static_cast<uword>(l<<1) | notshiftedlargestrunninglengthcount;
    }

    /**
     * Stores l in the literal-count field (the bits above the running
     * length), using the same fill-then-mask approach; the running bit and
     * the running length are left unchanged.
     */
    void setNumberOfLiteralWords(uword l) {
        mydata |= notrunninglengthplusrunningbit;
        mydata &= static_cast<uword>(l << (runninglengthbits +1) ) |runninglengthplusrunningbit;
    }
    // static call for people who hate objects
    static inline void setNumberOfLiteralWords(uword & data, uword l) {
        data |= notrunninglengthplusrunningbit;
        data &= static_cast<uword>(l << (runninglengthbits +1) ) |runninglengthplusrunningbit;
    }
    // width of the running-length field: half of the word's bits (16 for a 32-bit word)
    static const uint runninglengthbits = sizeof(uword)*4;//16;
    // width of the literal-count field: what is left after the running bit and the running length
    static const uint literalbits = sizeof(uword)*8 - 1 - runninglengthbits;
    // largest value that fits in the literal-count field
    static const uword largestliteralcount = (static_cast<uword>(1)<<literalbits) - 1;
    // largest value that fits in the running-length field
    static const uword largestrunninglengthcount = (static_cast<uword>(1)<<runninglengthbits)-1;
    // mask of the running-length field at its position inside the word
    static const uword shiftedlargestrunninglengthcount = largestrunninglengthcount<<1;
    // mask of every bit outside the running-length field
    static const uword notshiftedlargestrunninglengthcount = static_cast<uword>(~shiftedlargestrunninglengthcount);
    // mask of the running bit together with the running-length field
    static const uword runninglengthplusrunningbit = (static_cast<uword>(1)<<(runninglengthbits+1)) - 1;
    // mask of the literal-count field
    static const uword notrunninglengthplusrunningbit =static_cast<uword>(~runninglengthplusrunningbit);
    // complement of largestrunninglengthcount
    static const uword notlargestrunninglengthcount =static_cast<uword>(~largestrunninglengthcount);

    // the wrapped word (public; BufferedRunningLengthWord reads it directly)
    uword & mydata;
private:
};
|
211
|
+
|
212
|
+
|
213
|
+
/**
|
214
|
+
* Same as RunningLengthWord, except that the values cannot be modified.
|
215
|
+
*/
|
216
|
+
template <class uword=uword32>
|
217
|
+
class ConstRunningLengthWord {
|
218
|
+
public:
|
219
|
+
|
220
|
+
ConstRunningLengthWord () : mydata(0) {
|
221
|
+
}
|
222
|
+
|
223
|
+
ConstRunningLengthWord (const uword data) : mydata(data) {
|
224
|
+
}
|
225
|
+
|
226
|
+
ConstRunningLengthWord(const ConstRunningLengthWord & rlw) : mydata(rlw.mydata) {}
|
227
|
+
|
228
|
+
/**
|
229
|
+
* Which bit is being repeated?
|
230
|
+
*/
|
231
|
+
bool getRunningBit() const {
|
232
|
+
return mydata & static_cast<uword>(1);
|
233
|
+
}
|
234
|
+
|
235
|
+
/**
|
236
|
+
* how many words should be filled by the running bit
|
237
|
+
*/
|
238
|
+
uword getRunningLength() const {
|
239
|
+
return (mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount ;
|
240
|
+
}
|
241
|
+
|
242
|
+
/**
|
243
|
+
* followed by how many literal words?
|
244
|
+
*/
|
245
|
+
uword getNumberOfLiteralWords() const {
|
246
|
+
return static_cast<uword>(mydata >> (1+RunningLengthWord<uword>::runninglengthbits));
|
247
|
+
}
|
248
|
+
|
249
|
+
/**
|
250
|
+
* Total of getRunningLength() and getNumberOfLiteralWords()
|
251
|
+
*/
|
252
|
+
uword size() const {
|
253
|
+
return getRunningLength() + getNumberOfLiteralWords();
|
254
|
+
}
|
255
|
+
|
256
|
+
uword mydata;
|
257
|
+
};
|
258
|
+
|
259
|
+
|
260
|
+
|
261
|
+
/**
|
262
|
+
* Same as RunningLengthWord, except that the values are buffered for quick
|
263
|
+
* access.
|
264
|
+
*/
|
265
|
+
template <class uword=uword32>
|
266
|
+
class BufferedRunningLengthWord {
|
267
|
+
public:
|
268
|
+
BufferedRunningLengthWord (const uword & data) : RunningBit(data & static_cast<uword>(1)),
|
269
|
+
RunningLength((data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
|
270
|
+
NumberOfLiteralWords(static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits))) {
|
271
|
+
}
|
272
|
+
BufferedRunningLengthWord (const RunningLengthWord<uword> & p) : RunningBit(p.mydata & static_cast<uword>(1)),
|
273
|
+
RunningLength((p.mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
|
274
|
+
NumberOfLiteralWords(p.mydata >> (1+RunningLengthWord<uword>::runninglengthbits)) {
|
275
|
+
}
|
276
|
+
|
277
|
+
void read(const uword & data) {
|
278
|
+
RunningBit = data & static_cast<uword>(1);
|
279
|
+
RunningLength = (data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount;
|
280
|
+
NumberOfLiteralWords = static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits));
|
281
|
+
}
|
282
|
+
|
283
|
+
/**
|
284
|
+
* Which bit is being repeated?
|
285
|
+
*/
|
286
|
+
bool getRunningBit() const {
|
287
|
+
return RunningBit;
|
288
|
+
}
|
289
|
+
|
290
|
+
void discardFirstWords(uword x) {
|
291
|
+
assert(x<= size());
|
292
|
+
if(RunningLength >= x) {
|
293
|
+
RunningLength = static_cast<uword>(RunningLength - x);
|
294
|
+
return;
|
295
|
+
}
|
296
|
+
x = static_cast<uword>( x - RunningLength);
|
297
|
+
RunningLength = 0;
|
298
|
+
NumberOfLiteralWords = static_cast<uword>(NumberOfLiteralWords - x);
|
299
|
+
}
|
300
|
+
|
301
|
+
/**
|
302
|
+
* how many words should be filled by the running bit (see previous method)
|
303
|
+
*/
|
304
|
+
uword getRunningLength() const {
|
305
|
+
return RunningLength ;
|
306
|
+
}
|
307
|
+
|
308
|
+
/**
|
309
|
+
* followed by how many literal words?
|
310
|
+
*/
|
311
|
+
uword getNumberOfLiteralWords() const {
|
312
|
+
return NumberOfLiteralWords;
|
313
|
+
}
|
314
|
+
|
315
|
+
|
316
|
+
/**
|
317
|
+
* Total of getRunningLength() and getNumberOfLiteralWords()
|
318
|
+
*/
|
319
|
+
uword size() const {
|
320
|
+
return static_cast<uword>(RunningLength + NumberOfLiteralWords);
|
321
|
+
}
|
322
|
+
bool RunningBit;
|
323
|
+
uword RunningLength;
|
324
|
+
uword NumberOfLiteralWords;
|
325
|
+
|
326
|
+
};
|
327
|
+
|
328
|
+
template <class uword>
|
329
|
+
class EWAHBoolArray;
|
330
|
+
|
331
|
+
|
332
|
+
template <class uword>
|
333
|
+
class EWAHBoolArraySparseIterator;
|
334
|
+
|
335
|
+
|
336
|
+
/**
|
337
|
+
* Iterate over words of bits from a compressed bitmap.
|
338
|
+
*/
|
339
|
+
template <class uword=uword32>
|
340
|
+
class EWAHBoolArrayIterator {
|
341
|
+
public:
|
342
|
+
/**
|
343
|
+
* is there a new word?
|
344
|
+
*/
|
345
|
+
bool hasNext() const {
|
346
|
+
return pointer < myparent.size();
|
347
|
+
}
|
348
|
+
|
349
|
+
/**
|
350
|
+
* return next word.
|
351
|
+
*/
|
352
|
+
uword next() {
|
353
|
+
uword returnvalue;
|
354
|
+
if(compressedwords < rl) {
|
355
|
+
++compressedwords;
|
356
|
+
if(b)
|
357
|
+
returnvalue = notzero;
|
358
|
+
else
|
359
|
+
returnvalue = zero;
|
360
|
+
} else {
|
361
|
+
assert (literalwords < lw) ;
|
362
|
+
++literalwords;
|
363
|
+
++pointer;
|
364
|
+
assert(pointer <myparent.size());
|
365
|
+
returnvalue = myparent[pointer];
|
366
|
+
}
|
367
|
+
if((compressedwords == rl) && (literalwords == lw)) {
|
368
|
+
++pointer;
|
369
|
+
if(pointer < myparent.size()) readNewRunningLengthWord();
|
370
|
+
}
|
371
|
+
return returnvalue;
|
372
|
+
}
|
373
|
+
|
374
|
+
EWAHBoolArrayIterator(const EWAHBoolArrayIterator<uword> & other):pointer(other.pointer),
|
375
|
+
myparent(other.myparent),
|
376
|
+
compressedwords(other.compressedwords),
|
377
|
+
literalwords(other.literalwords),
|
378
|
+
rl(other.rl),
|
379
|
+
lw(other.lw),
|
380
|
+
b(other.b) {}
|
381
|
+
|
382
|
+
static const uword zero = 0;
|
383
|
+
static const uword notzero=static_cast<uword>(~zero);
|
384
|
+
private:
|
385
|
+
EWAHBoolArrayIterator(const vector<uword> & parent) ;
|
386
|
+
void readNewRunningLengthWord() ;
|
387
|
+
friend class EWAHBoolArray<uword>;
|
388
|
+
friend class EWAHBoolArraySparseIterator<uword>;
|
389
|
+
size_t pointer;
|
390
|
+
const vector<uword> & myparent;
|
391
|
+
uword compressedwords;
|
392
|
+
uword literalwords;
|
393
|
+
uword rl, lw;
|
394
|
+
bool b;
|
395
|
+
};
|
396
|
+
|
397
|
+
template <class uword>
|
398
|
+
class EWAHBoolArraySparseIterator;
|
399
|
+
|
400
|
+
|
401
|
+
|
402
|
+
|
403
|
+
template <class uword>
|
404
|
+
class EWAHBoolArraySetBitForwardIterator;
|
405
|
+
|
406
|
+
|
407
|
+
class BitmapStatistics;
|
408
|
+
|
409
|
+
template <class uword>
|
410
|
+
class EWAHBoolArrayRawIterator;
|
411
|
+
|
412
|
+
/**
 * This class is a compressed bitmap.
 * This is where compression
 * happens.
 * The underlying data structure is an STL vector.
 *
 * Most member functions are only declared here; their definitions are
 * outside this section of the header.
 */
template <class uword=uword32>
class EWAHBoolArray {
public:
    /** Creates an empty bitmap: the buffer holds a single zero word. */
    EWAHBoolArray(): buffer(1,0), sizeinbits(0), lastRLW(0) {
    }

    /**
     * set the ith bit to true (starting at zero).
     * Auto-expands the bitmap. It has constant running time complexity.
     * Note that you must set the bits in increasing order:
     * set(1), set(2) is ok; set(2), set(1) is not ok.
     */
    void set(size_t i);

    /**
     * Make sure the two bitmaps have the same size (padding with zeroes
     * if necessary). It has constant running time complexity.
     * Whichever of the two bitmaps is shorter is the one that gets padded.
     */
    void makeSameSize(EWAHBoolArray & a) {
        if(a.sizeinbits<sizeinbits)
            a.padWithZeroes(sizeinbits);
        else if(sizeinbits<a.sizeinbits)
            padWithZeroes(a.sizeinbits);
    }

    enum {RESERVEMEMORY=true}; // for speed

    typedef EWAHBoolArraySetBitForwardIterator<uword> const_iterator;


    /**
     * Returns an iterator that can be used to access the position of the
     * set bits. The running time complexity of a full scan is proportional to the number
     * of set bits: be aware that if you have long strings of 1s, this can be
     * very inefficient.
     */
    const_iterator begin() const {
        return EWAHBoolArraySetBitForwardIterator<uword>(buffer);
    }


    /**
     * Basically a bogus iterator that can be used together with begin()
     * for constructions such as for(EWAHBoolArray<uword>::const_iterator i = b.begin(); i!=b.end(); ++i) {}
     */
    const_iterator end() const {
        return EWAHBoolArraySetBitForwardIterator<uword>(buffer,buffer.size());
    }

    /**
     * computes the logical and with another compressed bitmap
     * answer goes into container, though rawlogicaland is the
     * default, sometimes this version is faster.
     */
    void sparselogicaland( EWAHBoolArray &a, EWAHBoolArray &out) ;

    /**
     * computes the logical and with another compressed bitmap
     * answer goes into container
     * Running time complexity is proportional to the sum of the compressed
     * bitmap sizes.
     */
    void rawlogicaland( EWAHBoolArray &a, EWAHBoolArray &container) ;

    /**
     * computes the logical or with another compressed bitmap
     * answer goes into container
     * Running time complexity is proportional to the sum of the compressed
     * bitmap sizes.
     */
    void rawlogicalor( EWAHBoolArray &a, EWAHBoolArray &container) ;


    /**
     * computes the logical and with another compressed bitmap
     * answer goes into container
     * Running time complexity is proportional to the sum of the compressed
     * bitmap sizes.
     * (alias for rawlogicaland)
     */
    void logicaland( EWAHBoolArray &a, EWAHBoolArray &container) {
        rawlogicaland(a,container);
    }

    /**
     * compute the logical or with another compressed bitmap
     * answer goes into container.
     * Running time complexity is proportional to the sum of the compressed
     * bitmap sizes.
     * (alias for rawlogicalor)
     */
    void logicalor( EWAHBoolArray &a, EWAHBoolArray &container) {
        rawlogicalor(a,container);
    }

    /**
     * clear the content of the bitmap. It does not
     * release the memory.
     * Afterwards the buffer holds a single zero word again, as after
     * default construction.
     */
    void reset() {
        buffer.clear();
        buffer.push_back(0);
        sizeinbits = 0;
        lastRLW = 0;
    }

    /**
     * convenience method.
     *
     * returns the number of words added (storage cost increase)
     */
    inline size_t add(const uword newdata, const uint bitsthatmatter = 8*sizeof(uword));

    /**
     * Prints the bitmap to o by converting it with toBoolArray() and
     * calling printout on the result.
     */
    inline void printout(ostream &o = cout) {
        toBoolArray().printout(o);
    }

    /**
     * Prints a verbose description of the content of the compressed bitmap.
     */
    void debugprintout() const;

    /**
     * Return the size in bits of this bitmap (this refers
     * to the uncompressed size in bits).
     */
    inline size_t sizeInBits() const {
        return sizeinbits;
    }

    /**
     * set size in bits. This does not affect the compressed size. It
     * runs in constant time.
     */
    inline void setSizeInBits(const size_t size) {
        sizeinbits = size;
    }

    /**
     * Return the size of the buffer in bytes. This
     * is equivalent to the storage cost, minus some overhead.
     */
    inline size_t sizeInBytes() const {
        return buffer.size()*sizeof(uword);
    }



    /**
     * same as addEmptyWord, but you can do several in one shot!
     * returns the number of words added (storage cost increase)
     */
    size_t addStreamOfEmptyWords(const bool v, const size_t number);

    /**
     * add a stream of dirty words, returns the number of words added
     * (storage cost increase)
     */
    size_t addStreamOfDirtyWords(const uword * v, const size_t number);

    /**
     * make sure the size of the array is totalbits bits by padding with zeroes.
     * returns the number of words added (storage cost increase)
     */
    inline size_t padWithZeroes(const size_t totalbits);

    /**
     * Compute the size on disk assuming that it was saved using
     * the method "write".
     */
    size_t sizeOnDisk() const;


    /**
     * Save this bitmap to a stream. The file format is
     * | sizeinbits | buffer length | buffer content|
     * the sizeinbits part can be omitted if "savesizeinbits=false".
     * Both sizeinbits and buffer length are saved using the size_t data
     * type which is typically a 32-bit unsigned integer for 32-bit CPUs
     * and a 64-bit unsigned integer for 64-bit CPUs.
     * Note that this format is machine-specific. Note also
     * that the word size is not saved. For robust persistent
     * storage, you need to save this extra information elsewhere.
     */
    inline void write(ostream & out, const bool savesizeinbits=true) const;

    /**
     * This only writes the content of the buffer (see the write() method).
     * It is for advanced users.
     */
    inline void writeBuffer(ostream & out) const;

    /**
     * size (in words) of the underlying STL vector.
     */
    inline size_t bufferSize() const {
        return buffer.size();
    }

    /**
     * this is the counterpart to the write method.
     * if you set savesizeinbits=false, then you are responsible
     * for setting the value of the attribute sizeinbits (see method setSizeInBits).
     */
    inline void read(istream & in, const bool savesizeinbits=true);


    /**
     * read the buffer from a stream, see method writeBuffer.
     * this is for advanced users.
     */
    inline void readBuffer(istream & in, const size_t buffersize);

    bool operator==(const EWAHBoolArray & x) const;

    bool operator!=(const EWAHBoolArray & x) const;

    bool operator==(const BoolArray<uword> & x) const;

    bool operator!=(const BoolArray<uword> & x) const;

    /**
     * Iterate over the uncompressed words.
     * Can be considerably faster than begin()/end().
     * Running time complexity of a full scan is proportional to the
     * uncompressed size of the bitmap.
     */
    EWAHBoolArrayIterator<uword> uncompress() const ;

    /**
     * To iterate over non-zero uncompressed words.
     * Can be considerably faster than begin()/end().
     * Running time complexity of a full scan is proportional to the number of
     * non-zero uncompressed words.
     */
    EWAHBoolArraySparseIterator<uword> sparse_uncompress() const ;

    /**
     * To iterate over the compressed data.
     * Can be faster than any other iterator.
     * Running time complexity of a full scan is proportional to the
     * compressed size of the bitmap.
     */
    EWAHBoolArrayRawIterator<uword> raw_iterator() const ;

    /**
     * Appends the content of some other compressed bitmap
     * at the end of the current bitmap.
     */
    void append(const EWAHBoolArray & x);

    /**
     * For research purposes. This computes the number of
     * dirty words and the number of compressed words.
     */
    BitmapStatistics computeStatistics() const;

    /** Converts this bitmap to a BoolArray (defined outside this section). */
    BoolArray<uword> toBoolArray() const;

    /**
     * Convert to a list of positions of "set" bits.
     * The recommended container is vector<size_t>.
     */
    template <class container>
    void appendRowIDs(container & out, const size_t offset = 0) const;


    /**
     * Convert to a list of positions of "set" bits.
     * The recommended container is vector<size_t>.
     * (alias for appendRowIDs).
     */
    template <class container>
    void appendSetBits(container & out, const size_t offset = 0) const {
        return appendRowIDs(out,offset);
    }

    /**
     * Returns the number of bits set to the value 1.
     * The running time complexity is proportional to the
     * compressed size of the bitmap.
     */
    size_t numberOfOnes();

    /**
     * Swap the content of this bitmap with another bitmap.
     * No copying is done. (Running time complexity is constant.)
     */
    void swap(EWAHBoolArray & x);

    /** Read-only access to the underlying vector of compressed words. */
    const vector<uword> & getBuffer() const {
        return buffer;
    };
    // number of bits in one word
    enum { wordinbits = sizeof(uword) * 8};


    /**
     * Please don't copy your bitmaps! The running time
     * complexity of a copy is the size of the compressed bitmap.
     *
     * The ASSERT below flags any copy whose buffer holds more than one word;
     * what happens on failure depends on the ASSERT macro and on NDEBUG.
     **/
    EWAHBoolArray(const EWAHBoolArray& other) :
        buffer(other.buffer),
        sizeinbits(other.sizeinbits),
        lastRLW(other.lastRLW) {
        ASSERT(buffer.size()<=1,"You are trying to copy the bitmap, a terrible idea in general, for performance reasons.");// performance assert!
    }

    /**
     * Copies the content of one bitmap onto another. Running time complexity
     * is proportional to the size of the compressed bitmap.
     * please, never hard-copy this object. Use the swap method if you must.
     */
    EWAHBoolArray & operator=(const EWAHBoolArray & x) {
        buffer = x.buffer;
        sizeinbits = x.sizeinbits;
        lastRLW = x.lastRLW;
        return *this;
    }

    /**
     * This is equivalent to the operator =. It is used
     * to keep in mind that assignment can be expensive.
     *
     * if you don't mind the cost of copying the bitmap (performance-wise), use this!
     */
    void expensive_copy(const EWAHBoolArray & x) {
        buffer = x.buffer;
        sizeinbits = x.sizeinbits;
        lastRLW = x.lastRLW;
    }

    /**
     * Write the logical not of this bitmap in the provided container.
     */
    void logicalnot(EWAHBoolArray & x) const;

    /**
     * Apply the logical not operation on this bitmap.
     * Running time complexity is proportional to the compressed size of the bitmap.
     */
    void inplace_logicalnot();


private:



    // private because does not increment the size in bits
    // returns the number of words added (storage cost increase)
    inline size_t addLiteralWord(const uword newdata) ;

    // private because does not increment the size in bits
    // returns the number of words added (storage cost increase)
    size_t addEmptyWord(const bool v);
    // this second version "might" be faster if you hate OOP.
    // in my tests, it turned out to be slower!
    // private because does not increment the size in bits
    //inline void addEmptyWordStaticCalls(bool v);

    // the compressed words
    vector<uword> buffer;
    // uncompressed size of the bitmap, in bits
    size_t sizeinbits;
    // NOTE(review): presumably the index in buffer of the most recent
    // running-length word -- confirm against the out-of-view definitions
    size_t lastRLW;
};
|
781
|
+
|
782
|
+
|
783
|
+
|
784
|
+
/**
|
785
|
+
* Iterator over the words of the compressed bitmap.
|
786
|
+
*/
|
787
|
+
template <class uword=uword32>
|
788
|
+
class EWAHBoolArraySparseIterator {
|
789
|
+
public:
|
790
|
+
/**
|
791
|
+
* is there more words?
|
792
|
+
*/
|
793
|
+
bool hasNext() const {
|
794
|
+
return i.hasNext();
|
795
|
+
}
|
796
|
+
|
797
|
+
size_t position() const {
|
798
|
+
return mPosition;
|
799
|
+
}
|
800
|
+
/**
|
801
|
+
* return next word. If the word is either 0x00 or 0x11
|
802
|
+
* the you need to call position() to know how many times it
|
803
|
+
* was repeated
|
804
|
+
*/
|
805
|
+
uword next() {
|
806
|
+
uword returnvalue;
|
807
|
+
if(i.compressedwords < i.rl) {
|
808
|
+
if(i.b) {
|
809
|
+
++mPosition;
|
810
|
+
++i.compressedwords;
|
811
|
+
returnvalue = EWAHBoolArrayIterator<uword>::notzero;
|
812
|
+
} else {
|
813
|
+
mPosition = static_cast<size_t>(mPosition + i.rl);
|
814
|
+
i.compressedwords = i.rl;
|
815
|
+
returnvalue = EWAHBoolArrayIterator<uword>::zero;//next();
|
816
|
+
}
|
817
|
+
} else {
|
818
|
+
assert (i.literalwords < i.lw);
|
819
|
+
++i.literalwords;
|
820
|
+
++i.pointer;
|
821
|
+
++mPosition;
|
822
|
+
assert(i.pointer <i.myparent.size());
|
823
|
+
returnvalue = i.myparent[i.pointer];
|
824
|
+
}
|
825
|
+
if((i.compressedwords == i.rl) && (i.literalwords == i.lw)) {
|
826
|
+
++i.pointer;
|
827
|
+
if(i.pointer < i.myparent.size()) i.readNewRunningLengthWord();
|
828
|
+
}
|
829
|
+
return returnvalue;
|
830
|
+
}
|
831
|
+
|
832
|
+
EWAHBoolArraySparseIterator(const EWAHBoolArraySparseIterator<uword> & other):i(other.i),mPosition(other.mPosition) {}
|
833
|
+
|
834
|
+
private:
|
835
|
+
EWAHBoolArraySparseIterator(const vector<uword> & parent) : i(parent), mPosition(0) {}
|
836
|
+
EWAHBoolArrayIterator<uword> i;
|
837
|
+
size_t mPosition;
|
838
|
+
friend class EWAHBoolArray<uword>;
|
839
|
+
};
|
840
|
+
|
841
|
+
|
842
|
+
/**
|
843
|
+
* Used to go through the set bits. Not optimally fast, but convenient.
|
844
|
+
*/
|
845
|
+
template <class uword>
|
846
|
+
class EWAHBoolArraySetBitForwardIterator {
|
847
|
+
public:
|
848
|
+
enum { wordinbits = sizeof(uword) * 8};
|
849
|
+
typedef forward_iterator_tag iterator_category;
|
850
|
+
typedef size_t * pointer;
|
851
|
+
typedef size_t & reference_type;
|
852
|
+
typedef size_t value_type;
|
853
|
+
typedef ptrdiff_t difference_type;
|
854
|
+
typedef EWAHBoolArraySetBitForwardIterator<uword> type_of_iterator;
|
855
|
+
|
856
|
+
/**
|
857
|
+
* Provides the location of the set bit.
|
858
|
+
*/
|
859
|
+
size_t operator*() const {
|
860
|
+
return currentrunoffset+offsetofpreviousrun;
|
861
|
+
}
|
862
|
+
|
863
|
+
// this can be expensive
|
864
|
+
difference_type operator-(const type_of_iterator& o) {
|
865
|
+
type_of_iterator& smaller = *this<o ? *this : o;
|
866
|
+
type_of_iterator& bigger = *this>=o ? *this : o;
|
867
|
+
if(smaller.mpointer==smaller.buffer.size())
|
868
|
+
return 0;
|
869
|
+
difference_type absdiff = static_cast<difference_type>(0);
|
870
|
+
EWAHBoolArraySetBitForwardIterator<uword> buf(smaller);
|
871
|
+
while(buf!= bigger) {
|
872
|
+
++absdiff;
|
873
|
+
++buf;
|
874
|
+
}
|
875
|
+
if(*this<o)
|
876
|
+
return absdiff;
|
877
|
+
else
|
878
|
+
return - absdiff;
|
879
|
+
}
|
880
|
+
|
881
|
+
bool operator<(const type_of_iterator& o) {
|
882
|
+
if(buffer != o.buffer) return false;
|
883
|
+
if(mpointer==buffer.size()) return false;
|
884
|
+
if(o.mpointer==o.buffer.size()) return true;
|
885
|
+
if(offsetofpreviousrun<o.offsetofpreviousrun)
|
886
|
+
return true;
|
887
|
+
if(offsetofpreviousrun>o.offsetofpreviousrun)
|
888
|
+
return false;
|
889
|
+
if(currentrunoffset<o.currentrunoffset)
|
890
|
+
return true;
|
891
|
+
return false;
|
892
|
+
}
|
893
|
+
bool operator<=(const type_of_iterator& o) {
|
894
|
+
return ( (*this) < o ) || ((*this) == o);
|
895
|
+
}
|
896
|
+
|
897
|
+
bool operator>(const type_of_iterator& o) {
|
898
|
+
return ! ((*this) <= o ) ;
|
899
|
+
}
|
900
|
+
|
901
|
+
bool operator>=(const type_of_iterator& o) {
|
902
|
+
return ! ((*this) < o ) ;
|
903
|
+
}
|
904
|
+
|
905
|
+
EWAHBoolArraySetBitForwardIterator & operator++() {
|
906
|
+
++currentrunoffset;
|
907
|
+
advanceToNextSetBit();
|
908
|
+
return *this;
|
909
|
+
}
|
910
|
+
EWAHBoolArraySetBitForwardIterator operator++(int) {
|
911
|
+
EWAHBoolArraySetBitForwardIterator old(*this);
|
912
|
+
++currentrunoffset;
|
913
|
+
advanceToNextSetBit();
|
914
|
+
return old;
|
915
|
+
}
|
916
|
+
bool operator==(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
|
917
|
+
// if they are both over, return true
|
918
|
+
if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
|
919
|
+
return true;
|
920
|
+
return (buffer == o.buffer) && (mpointer == o.mpointer) &&
|
921
|
+
(offsetofpreviousrun == o.offsetofpreviousrun) && (currentrunoffset == o.currentrunoffset);
|
922
|
+
}
|
923
|
+
|
924
|
+
bool operator!=(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
|
925
|
+
// if they are both over, return false
|
926
|
+
if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
|
927
|
+
return false;
|
928
|
+
return (buffer != o.buffer) || (mpointer != o.mpointer) ||
|
929
|
+
(offsetofpreviousrun != o.offsetofpreviousrun) || (currentrunoffset != o.currentrunoffset);
|
930
|
+
}
|
931
|
+
|
932
|
+
|
933
|
+
EWAHBoolArraySetBitForwardIterator(const EWAHBoolArraySetBitForwardIterator & o) : buffer(o.buffer), mpointer(o.mpointer),
|
934
|
+
offsetofpreviousrun(o.offsetofpreviousrun), currentrunoffset(o.currentrunoffset), rlw(o.rlw) {}
|
935
|
+
|
936
|
+
private:
|
937
|
+
|
938
|
+
bool advanceToNextSetBit() {
|
939
|
+
if(mpointer==buffer.size()) return false;
|
940
|
+
if (currentrunoffset<static_cast<size_t>(rlw.getRunningLength() * wordinbits)) {
|
941
|
+
if(rlw.getRunningBit())
|
942
|
+
return true;// nothing to do
|
943
|
+
currentrunoffset = static_cast<size_t>(rlw.getRunningLength() * wordinbits);//skipping
|
944
|
+
}
|
945
|
+
while(true) {
|
946
|
+
const size_t indexoflitword = static_cast<size_t>( (currentrunoffset-rlw.getRunningLength() * wordinbits)/wordinbits);
|
947
|
+
if(indexoflitword>= rlw.getNumberOfLiteralWords() ) {
|
948
|
+
if(advanceToNextRun())
|
949
|
+
return advanceToNextSetBit();
|
950
|
+
else {
|
951
|
+
return false;
|
952
|
+
}
|
953
|
+
}
|
954
|
+
const uword currentword = buffer[mpointer + 1 + indexoflitword];
|
955
|
+
for(uint inwordpointer =
|
956
|
+
static_cast<uint>((currentrunoffset-rlw.getRunningLength() * wordinbits)%wordinbits);
|
957
|
+
inwordpointer<wordinbits;++inwordpointer,++currentrunoffset) {
|
958
|
+
if((currentword & (static_cast<uword>(1) << inwordpointer))!=0)
|
959
|
+
return true;
|
960
|
+
}
|
961
|
+
}
|
962
|
+
}
|
963
|
+
|
964
|
+
bool advanceToNextRun() {
|
965
|
+
offsetofpreviousrun += currentrunoffset;
|
966
|
+
currentrunoffset = 0;
|
967
|
+
mpointer += static_cast<size_t>(1 + rlw.getNumberOfLiteralWords());
|
968
|
+
if(mpointer<buffer.size()) {
|
969
|
+
rlw.mydata = buffer[mpointer];
|
970
|
+
} else {
|
971
|
+
return false;
|
972
|
+
}
|
973
|
+
return true;
|
974
|
+
}
|
975
|
+
|
976
|
+
|
977
|
+
EWAHBoolArraySetBitForwardIterator(const vector<uword> & parent, size_t startpointer = 0) : buffer(parent), mpointer(startpointer),
|
978
|
+
offsetofpreviousrun(0), currentrunoffset(0), rlw(0) {
|
979
|
+
if(mpointer<buffer.size()) {
|
980
|
+
rlw.mydata = buffer[mpointer];
|
981
|
+
advanceToNextSetBit();
|
982
|
+
}
|
983
|
+
}
|
984
|
+
|
985
|
+
|
986
|
+
const vector<uword> & buffer;
|
987
|
+
size_t mpointer;
|
988
|
+
size_t offsetofpreviousrun;
|
989
|
+
size_t currentrunoffset;
|
990
|
+
friend class EWAHBoolArray<uword>;
|
991
|
+
ConstRunningLengthWord<uword> rlw;
|
992
|
+
};
|
993
|
+
|
994
|
+
|
995
|
+
|
996
|
+
/**
 * Statistical descriptor returned by the compressed bitmap.
 *
 * All quantities are word counts kept in public fields; the accessors only
 * combine or expose those fields.
 */
class BitmapStatistics {
public:
    size_t totalliteral;                   // number of literal (dirty) words
    size_t totalcompressed;                // number of clean words stored in runs
    size_t runningwordmarker;              // number of marker words
    size_t maximumofrunningcounterreached; // value reported by getOverRuns()

    /** Starts with every counter at zero. */
    BitmapStatistics()
        : totalliteral(0),
          totalcompressed(0),
          runningwordmarker(0),
          maximumofrunningcounterreached(0) {}

    /** Literal words plus marker words. */
    size_t getCompressedSize() const {
        const size_t stored = totalliteral + runningwordmarker;
        return stored;
    }

    /** Literal words plus clean words. */
    size_t getUncompressedSize() const {
        const size_t expanded = totalliteral + totalcompressed;
        return expanded;
    }

    size_t getNumberOfDirtyWords() const { return totalliteral; }

    size_t getNumberOfCleanWords() const { return totalcompressed; }

    size_t getNumberOfMarkers() const { return runningwordmarker; }

    size_t getOverRuns() const { return maximumofrunningcounterreached; }
};
|
1026
|
+
|
1027
|
+
|
1028
|
+
template <class uword>
|
1029
|
+
void EWAHBoolArray<uword>::set(size_t i) {
|
1030
|
+
// must I complete a word?
|
1031
|
+
if ( (sizeinbits % (8*sizeof(uword))) != 0) {
|
1032
|
+
const size_t possiblesizeinbits = (sizeinbits /(8*sizeof(uword)))*(8*sizeof(uword)) + (8*sizeof(uword));
|
1033
|
+
if(possiblesizeinbits<i+1) {
|
1034
|
+
sizeinbits = possiblesizeinbits;
|
1035
|
+
}
|
1036
|
+
}
|
1037
|
+
addStreamOfEmptyWords(false, (i/(8*sizeof(uword))) - sizeinbits/(8*sizeof(uword)));
|
1038
|
+
size_t bittoflip = i-(sizeinbits/(8*sizeof(uword)) * (8*sizeof(uword)));
|
1039
|
+
// next, we set the bit
|
1040
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1041
|
+
if(( lastRunningLengthWord.getNumberOfLiteralWords() == 0) || ((sizeinbits
|
1042
|
+
-1)/(8*sizeof(uword)) < i/(8*sizeof(uword))) ) {
|
1043
|
+
const uword newdata = static_cast<uword>(static_cast<uword>(1)<<bittoflip);
|
1044
|
+
addLiteralWord(newdata);
|
1045
|
+
} else {
|
1046
|
+
buffer[buffer.size()-1] |= static_cast<uword>(static_cast<uword>(1)<<bittoflip);
|
1047
|
+
// check if we just completed a stream of 1s
|
1048
|
+
if(buffer[buffer.size()-1] == static_cast<uword>(~0)) {
|
1049
|
+
// we remove the last dirty word
|
1050
|
+
buffer[buffer.size()-1] = 0;
|
1051
|
+
buffer.resize(buffer.size()-1);
|
1052
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(lastRunningLengthWord.getNumberOfLiteralWords()-1));
|
1053
|
+
// next we add one clean word
|
1054
|
+
addEmptyWord(true);
|
1055
|
+
}
|
1056
|
+
}
|
1057
|
+
sizeinbits = i+1;
|
1058
|
+
}
|
1059
|
+
|
1060
|
+
|
1061
|
+
|
1062
|
+
template <class uword>
|
1063
|
+
void EWAHBoolArray<uword>::inplace_logicalnot() {
|
1064
|
+
size_t pointer(0);
|
1065
|
+
while(pointer <buffer.size()) {
|
1066
|
+
RunningLengthWord<uword> rlw(buffer[pointer]);
|
1067
|
+
if(rlw.getRunningBit())
|
1068
|
+
rlw.setRunningBit(false);
|
1069
|
+
else
|
1070
|
+
rlw.setRunningBit(true);
|
1071
|
+
++pointer;
|
1072
|
+
for(size_t k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
|
1073
|
+
buffer[pointer] = ~buffer[pointer];
|
1074
|
+
++pointer;
|
1075
|
+
}
|
1076
|
+
}
|
1077
|
+
}
|
1078
|
+
|
1079
|
+
|
1080
|
+
template <class uword>
|
1081
|
+
void EWAHBoolArray<uword>::logicalnot(EWAHBoolArray & x) const {
|
1082
|
+
x.reset();
|
1083
|
+
x.buffer.reserve(buffer.size());
|
1084
|
+
EWAHBoolArrayRawIterator<uword> i = this->raw_iterator();
|
1085
|
+
while(i.hasNext()) {
|
1086
|
+
BufferedRunningLengthWord<uword> & rlw = i.next();
|
1087
|
+
x.addStreamOfEmptyWords(! rlw.getRunningBit(), rlw.getRunningLength());
|
1088
|
+
if(rlw.getNumberOfLiteralWords()>0) {
|
1089
|
+
const uword * dw = i.dirtyWords();
|
1090
|
+
for(size_t k = 0 ; k <rlw.getNumberOfLiteralWords(); ++k) {
|
1091
|
+
x.addLiteralWord(~ dw[k]);
|
1092
|
+
}
|
1093
|
+
}
|
1094
|
+
}
|
1095
|
+
x.sizeinbits = this->sizeinbits;
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
|
1099
|
+
template <class uword>
|
1100
|
+
size_t EWAHBoolArray<uword>::add(const uword newdata, const uint bitsthatmatter) {
|
1101
|
+
sizeinbits += bitsthatmatter;
|
1102
|
+
if(newdata == 0) {
|
1103
|
+
return addEmptyWord(0);
|
1104
|
+
} else if (newdata == static_cast<uword>(~0)) {
|
1105
|
+
return addEmptyWord(1);
|
1106
|
+
} else {
|
1107
|
+
return addLiteralWord(newdata);
|
1108
|
+
}
|
1109
|
+
}
|
1110
|
+
|
1111
|
+
|
1112
|
+
template <class uword>
|
1113
|
+
inline void EWAHBoolArray<uword>::writeBuffer(ostream & out) const {
|
1114
|
+
if(buffer.size()>0)
|
1115
|
+
out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffer.size());
|
1116
|
+
}
|
1117
|
+
|
1118
|
+
|
1119
|
+
template <class uword>
|
1120
|
+
inline void EWAHBoolArray<uword>::readBuffer(istream & in, const size_t buffersize) {
|
1121
|
+
buffer.resize(buffersize);
|
1122
|
+
if(buffersize>0)
|
1123
|
+
in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
|
1124
|
+
}
|
1125
|
+
|
1126
|
+
|
1127
|
+
template <class uword>
|
1128
|
+
void EWAHBoolArray<uword>::write(ostream & out, const bool savesizeinbits) const {
|
1129
|
+
if(savesizeinbits)out.write(reinterpret_cast<const char *>( & sizeinbits), sizeof(sizeinbits));
|
1130
|
+
const size_t buffersize = buffer.size();
|
1131
|
+
out.write(reinterpret_cast<const char *>(& buffersize),sizeof(buffersize));
|
1132
|
+
if(buffersize>0)
|
1133
|
+
out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffersize);
|
1134
|
+
}
|
1135
|
+
|
1136
|
+
|
1137
|
+
template <class uword>
|
1138
|
+
void EWAHBoolArray<uword>::read(istream & in, const bool savesizeinbits) {
|
1139
|
+
if(savesizeinbits) in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
|
1140
|
+
else sizeinbits = 0;
|
1141
|
+
size_t buffersize(0);
|
1142
|
+
in.read(reinterpret_cast<char *>(&buffersize), sizeof(buffersize));
|
1143
|
+
buffer.resize(buffersize);
|
1144
|
+
if(buffersize>0)
|
1145
|
+
in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
|
1146
|
+
}
|
1147
|
+
|
1148
|
+
|
1149
|
+
template <class uword>
|
1150
|
+
size_t EWAHBoolArray<uword>::addLiteralWord(const uword newdata) {
|
1151
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1152
|
+
uword numbersofar = lastRunningLengthWord.getNumberOfLiteralWords();
|
1153
|
+
if(numbersofar >= RunningLengthWord<uword>::largestliteralcount) {//0x7FFF) {
|
1154
|
+
buffer.push_back(0);
|
1155
|
+
lastRLW = buffer.size() - 1;
|
1156
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1157
|
+
lastRunningLengthWord2.setNumberOfLiteralWords(1);
|
1158
|
+
buffer.push_back(newdata);
|
1159
|
+
return 2;
|
1160
|
+
}
|
1161
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(numbersofar + 1));
|
1162
|
+
assert(lastRunningLengthWord.getNumberOfLiteralWords()==numbersofar + 1);
|
1163
|
+
buffer.push_back(newdata);
|
1164
|
+
return 1;
|
1165
|
+
}
|
1166
|
+
|
1167
|
+
|
1168
|
+
|
1169
|
+
|
1170
|
+
template <class uword>
|
1171
|
+
size_t EWAHBoolArray<uword>::padWithZeroes(const size_t totalbits) {
|
1172
|
+
assert(totalbits >= sizeinbits);
|
1173
|
+
size_t missingbits = totalbits - sizeinbits;
|
1174
|
+
size_t wordsadded = addStreamOfEmptyWords(0, missingbits/wordinbits + ((missingbits % wordinbits != 0) ? 1 : 0));
|
1175
|
+
assert(sizeinbits >= totalbits);
|
1176
|
+
assert(sizeinbits <= totalbits + wordinbits);
|
1177
|
+
sizeinbits = totalbits;
|
1178
|
+
return wordsadded;
|
1179
|
+
}
|
1180
|
+
|
1181
|
+
|
1182
|
+
|
1183
|
+
/**
|
1184
|
+
* This is a low-level iterator.
|
1185
|
+
*/
|
1186
|
+
|
1187
|
+
template <class uword=uword32>
|
1188
|
+
class EWAHBoolArrayRawIterator {
|
1189
|
+
public:
|
1190
|
+
EWAHBoolArrayRawIterator(const EWAHBoolArray<uword> & p) : pointer(0),
|
1191
|
+
myparent(&p.getBuffer()), rlw((*myparent)[pointer]) { //RunningLength(0), NumberOfLiteralWords(0), Bit(0) {
|
1192
|
+
if(verbose) {
|
1193
|
+
cout<<"created a new raw iterator over buffer of size "<<myparent->size()<<endl;
|
1194
|
+
}
|
1195
|
+
}
|
1196
|
+
EWAHBoolArrayRawIterator(const EWAHBoolArrayRawIterator & o) : pointer(o.pointer),
|
1197
|
+
myparent(o.myparent), rlw(o.rlw) {}
|
1198
|
+
|
1199
|
+
|
1200
|
+
bool hasNext() const {
|
1201
|
+
if(verbose)cout<<"call to hasNext, pointer is at "<<pointer<< ", parent.size()= "<<myparent->size()<<endl;
|
1202
|
+
return pointer < myparent->size();
|
1203
|
+
}
|
1204
|
+
|
1205
|
+
BufferedRunningLengthWord<uword> & next() {
|
1206
|
+
assert(pointer < myparent->size());
|
1207
|
+
rlw.read( (*myparent)[pointer]);
|
1208
|
+
pointer = static_cast<size_t>(pointer + rlw.getNumberOfLiteralWords() + 1);
|
1209
|
+
return rlw;
|
1210
|
+
}
|
1211
|
+
|
1212
|
+
const uword * dirtyWords() const {
|
1213
|
+
assert(pointer>0);
|
1214
|
+
assert(pointer>=rlw.getNumberOfLiteralWords());
|
1215
|
+
return & (myparent->at(static_cast<size_t>(pointer-rlw.getNumberOfLiteralWords())));
|
1216
|
+
}
|
1217
|
+
|
1218
|
+
EWAHBoolArrayRawIterator & operator=(const EWAHBoolArrayRawIterator & other) {
|
1219
|
+
pointer = other.pointer;
|
1220
|
+
myparent=other.myparent;
|
1221
|
+
rlw=other.rlw;
|
1222
|
+
return *this;
|
1223
|
+
}
|
1224
|
+
|
1225
|
+
enum {verbose=false};
|
1226
|
+
size_t pointer;
|
1227
|
+
const vector<uword> * myparent;
|
1228
|
+
BufferedRunningLengthWord<uword> rlw;
|
1229
|
+
private:
|
1230
|
+
|
1231
|
+
EWAHBoolArrayRawIterator();
|
1232
|
+
};
|
1233
|
+
|
1234
|
+
|
1235
|
+
|
1236
|
+
|
1237
|
+
|
1238
|
+
|
1239
|
+
template <class uword>
|
1240
|
+
EWAHBoolArrayIterator<uword> EWAHBoolArray<uword>::uncompress() const {
|
1241
|
+
return EWAHBoolArrayIterator<uword>(buffer);
|
1242
|
+
}
|
1243
|
+
|
1244
|
+
template <class uword>
|
1245
|
+
EWAHBoolArrayRawIterator<uword> EWAHBoolArray<uword>::raw_iterator() const {
|
1246
|
+
return EWAHBoolArrayRawIterator<uword>(*this);
|
1247
|
+
}
|
1248
|
+
|
1249
|
+
|
1250
|
+
template <class uword>
|
1251
|
+
EWAHBoolArraySparseIterator<uword> EWAHBoolArray<uword>::sparse_uncompress() const {
|
1252
|
+
return EWAHBoolArraySparseIterator<uword>(buffer);
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
template <class uword>
|
1256
|
+
bool EWAHBoolArray<uword>::operator==(const EWAHBoolArray & x) const {
|
1257
|
+
if(sizeinbits != x.sizeinbits) return false;
|
1258
|
+
if(buffer.size() != x.buffer.size()) return false;
|
1259
|
+
for(size_t k = 0; k < buffer.size(); ++k)
|
1260
|
+
if(buffer[k] != x.buffer[k]) return false;
|
1261
|
+
return true;
|
1262
|
+
}
|
1263
|
+
|
1264
|
+
template <class uword>
|
1265
|
+
void EWAHBoolArray<uword>::swap(EWAHBoolArray & x) {
|
1266
|
+
buffer.swap(x.buffer);
|
1267
|
+
size_t tmp = x.sizeinbits;
|
1268
|
+
x.sizeinbits = sizeinbits;
|
1269
|
+
sizeinbits = tmp;
|
1270
|
+
tmp = x.lastRLW;
|
1271
|
+
x.lastRLW = lastRLW;
|
1272
|
+
lastRLW = tmp;
|
1273
|
+
}
|
1274
|
+
|
1275
|
+
template <class uword>
|
1276
|
+
void EWAHBoolArray<uword>::append(const EWAHBoolArray & x) {
|
1277
|
+
if(sizeinbits % wordinbits == 0) {
|
1278
|
+
// hoping for the best?
|
1279
|
+
sizeinbits += x.sizeinbits;
|
1280
|
+
ConstRunningLengthWord<uword> lRLW(buffer[lastRLW]);
|
1281
|
+
if( (lRLW.getRunningLength() == 0) && (lRLW.getNumberOfLiteralWords() == 0)) {
|
1282
|
+
// it could be that the running length word is empty, in such a case,
|
1283
|
+
// we want to get rid of it!
|
1284
|
+
assert(lastRLW == buffer.size()-1);
|
1285
|
+
lastRLW = x.lastRLW + buffer.size() - 1;
|
1286
|
+
buffer.resize(buffer.size()-1);
|
1287
|
+
buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
|
1288
|
+
} else {
|
1289
|
+
lastRLW = x.lastRLW + buffer.size();
|
1290
|
+
buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
|
1291
|
+
}
|
1292
|
+
} else {
|
1293
|
+
stringstream ss;
|
1294
|
+
ss<<"This should really not happen! You are trying to append to a bitmap having a fractional number of words, that is, "<<static_cast<int>(sizeinbits)<<" bits with a word size in bits of "<<static_cast<int>(wordinbits)<<". ";
|
1295
|
+
ss<<"Size of the bitmap being appended: "<<x.sizeinbits<<" bits."<<endl;
|
1296
|
+
throw invalid_argument(ss.str());
|
1297
|
+
}
|
1298
|
+
}
|
1299
|
+
|
1300
|
+
template <class uword>
|
1301
|
+
EWAHBoolArrayIterator<uword>::EWAHBoolArrayIterator(const vector<uword> & parent) :
|
1302
|
+
pointer(0),
|
1303
|
+
myparent(parent),
|
1304
|
+
compressedwords(0), literalwords(0), rl(0), lw(0), b(0) {
|
1305
|
+
if(pointer <myparent.size()) readNewRunningLengthWord();
|
1306
|
+
}
|
1307
|
+
|
1308
|
+
|
1309
|
+
template <class uword>
|
1310
|
+
void EWAHBoolArrayIterator<uword>::readNewRunningLengthWord() {
|
1311
|
+
literalwords = 0;
|
1312
|
+
compressedwords = 0;
|
1313
|
+
ConstRunningLengthWord<uword> rlw(myparent[pointer]);
|
1314
|
+
rl = rlw.getRunningLength();
|
1315
|
+
lw = rlw.getNumberOfLiteralWords();
|
1316
|
+
b = rlw.getRunningBit();
|
1317
|
+
if((rl == 0) && (lw == 0)) {
|
1318
|
+
if(pointer < myparent.size() -1) {
|
1319
|
+
++pointer;
|
1320
|
+
readNewRunningLengthWord();
|
1321
|
+
} else {
|
1322
|
+
assert(pointer >= myparent.size()-1);
|
1323
|
+
pointer = myparent.size();
|
1324
|
+
assert(! hasNext());
|
1325
|
+
}
|
1326
|
+
}
|
1327
|
+
}
|
1328
|
+
|
1329
|
+
template <class uword>
|
1330
|
+
BoolArray<uword> EWAHBoolArray<uword>::toBoolArray() const {
|
1331
|
+
BoolArray<uword> ans(sizeinbits);
|
1332
|
+
EWAHBoolArrayIterator<uword> i = uncompress();
|
1333
|
+
int counter = 0;
|
1334
|
+
while(i.hasNext()) {
|
1335
|
+
ans.setWord(counter++,i.next());
|
1336
|
+
}
|
1337
|
+
return ans;
|
1338
|
+
}
|
1339
|
+
|
1340
|
+
template <class uword>
|
1341
|
+
size_t EWAHBoolArray<uword>::numberOfOnes() {
|
1342
|
+
size_t c (0);
|
1343
|
+
EWAHBoolArraySparseIterator<uword> i = sparse_uncompress();
|
1344
|
+
while(i.hasNext()) {
|
1345
|
+
const uword currentword = i.next();
|
1346
|
+
c += countOnes(currentword);
|
1347
|
+
/*
|
1348
|
+
for(int k = 0; k < wordinbits; ++k) {
|
1349
|
+
if ( (currentword & (static_cast<uword>(1) << k)) != 0)
|
1350
|
+
++c;
|
1351
|
+
}*/
|
1352
|
+
|
1353
|
+
}
|
1354
|
+
return c;
|
1355
|
+
|
1356
|
+
}
|
1357
|
+
|
1358
|
+
|
1359
|
+
|
1360
|
+
|
1361
|
+
template <class uword>
|
1362
|
+
template <class container>
|
1363
|
+
void EWAHBoolArray<uword>::appendRowIDs(container & out, const size_t offset) const {
|
1364
|
+
size_t pointer(0);
|
1365
|
+
size_t currentoffset(offset);
|
1366
|
+
if(RESERVEMEMORY) out.reserve(buffer.size()+64);// trading memory for speed.
|
1367
|
+
while(pointer <buffer.size()) {
|
1368
|
+
ConstRunningLengthWord<uword> rlw(buffer[pointer]);
|
1369
|
+
if(rlw.getRunningBit()) {
|
1370
|
+
for(size_t x = 0; x< static_cast<size_t>(rlw.getRunningLength()*wordinbits); ++x) {
|
1371
|
+
out.push_back(currentoffset + x);
|
1372
|
+
}
|
1373
|
+
}
|
1374
|
+
currentoffset = static_cast<size_t>(currentoffset + rlw.getRunningLength() * wordinbits);
|
1375
|
+
++pointer;
|
1376
|
+
for(uword k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
|
1377
|
+
const uword currentword = buffer[pointer];
|
1378
|
+
for(uint kk = 0; kk < wordinbits; ++kk) {
|
1379
|
+
if ( ( currentword & static_cast<uword>(static_cast<uword>(1) << kk)) != 0)
|
1380
|
+
out.push_back(currentoffset + kk);
|
1381
|
+
}
|
1382
|
+
currentoffset+=wordinbits;
|
1383
|
+
++pointer;
|
1384
|
+
}
|
1385
|
+
}
|
1386
|
+
}
|
1387
|
+
|
1388
|
+
|
1389
|
+
|
1390
|
+
template <class uword>
|
1391
|
+
bool EWAHBoolArray<uword>::operator!=(const EWAHBoolArray<uword> & x) const {
|
1392
|
+
return !(*this == x);
|
1393
|
+
}
|
1394
|
+
|
1395
|
+
template <class uword>
|
1396
|
+
bool EWAHBoolArray<uword>::operator==(const BoolArray<uword> & x) const {
|
1397
|
+
// could be more efficient
|
1398
|
+
return (this->toBoolArray() == x);
|
1399
|
+
}
|
1400
|
+
|
1401
|
+
template <class uword>
|
1402
|
+
bool EWAHBoolArray<uword>::operator!=(const BoolArray<uword> & x) const {
|
1403
|
+
// could be more efficient
|
1404
|
+
return (this->toBoolArray() != x);
|
1405
|
+
}
|
1406
|
+
|
1407
|
+
|
1408
|
+
template <class uword>
|
1409
|
+
size_t EWAHBoolArray<uword>::addStreamOfEmptyWords(const bool v, const size_t number) {
|
1410
|
+
if(number == 0) return 0;
|
1411
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1412
|
+
const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
|
1413
|
+
//firts, if the last running length word is empty, we align it
|
1414
|
+
// this
|
1415
|
+
const uword runlen = lastRunningLengthWord.getRunningLength();
|
1416
|
+
if( ( noliteralword ) && ( runlen == 0 )) {
|
1417
|
+
lastRunningLengthWord.setRunningBit(v);
|
1418
|
+
}
|
1419
|
+
size_t wordsadded (0);
|
1420
|
+
if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
|
1421
|
+
// that's the easy case, we are just continuing
|
1422
|
+
uword whatwecanadd = static_cast<uword>( number < static_cast<uword>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) );
|
1423
|
+
lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+whatwecanadd));
|
1424
|
+
sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
|
1425
|
+
if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>(wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
|
1426
|
+
} else {
|
1427
|
+
buffer.push_back(0);
|
1428
|
+
++wordsadded;
|
1429
|
+
lastRLW = buffer.size() - 1;
|
1430
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1431
|
+
uword whatwecanadd = static_cast<uword>( number < RunningLengthWord<uword>::largestrunninglengthcount ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount) );
|
1432
|
+
lastRunningLengthWord2.setRunningBit(v);
|
1433
|
+
lastRunningLengthWord2.setRunningLength(whatwecanadd);
|
1434
|
+
sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
|
1435
|
+
if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>( wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
|
1436
|
+
}
|
1437
|
+
return wordsadded;
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
|
1441
|
+
template <class uword>
|
1442
|
+
size_t EWAHBoolArray<uword>::addStreamOfDirtyWords(const uword * v, const size_t number) {
|
1443
|
+
if(number == 0) return 0;
|
1444
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1445
|
+
const uword NumberOfLiteralWords = lastRunningLengthWord.getNumberOfLiteralWords();
|
1446
|
+
assert(RunningLengthWord<uword>::largestliteralcount >= NumberOfLiteralWords);
|
1447
|
+
const size_t whatwecanadd = number < static_cast<uword>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords);//0x7FFF-NumberOfLiteralWords);
|
1448
|
+
assert(NumberOfLiteralWords+whatwecanadd>=NumberOfLiteralWords);
|
1449
|
+
assert(NumberOfLiteralWords+whatwecanadd<=RunningLengthWord<uword>::largestliteralcount);
|
1450
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(NumberOfLiteralWords+whatwecanadd));
|
1451
|
+
assert(lastRunningLengthWord.getNumberOfLiteralWords()==NumberOfLiteralWords+whatwecanadd);
|
1452
|
+
const size_t leftovernumber = number - whatwecanadd;
|
1453
|
+
// add the dirty words...
|
1454
|
+
const size_t oldsize (buffer.size());
|
1455
|
+
buffer.resize(oldsize+whatwecanadd);
|
1456
|
+
memcpy(&buffer[oldsize],v,whatwecanadd*sizeof(uword));
|
1457
|
+
size_t wordsadded(whatwecanadd);
|
1458
|
+
if(leftovernumber>0) {
|
1459
|
+
//add
|
1460
|
+
buffer.push_back(0);
|
1461
|
+
lastRLW=buffer.size() - 1;
|
1462
|
+
++wordsadded;
|
1463
|
+
wordsadded+=addStreamOfDirtyWords(v+whatwecanadd, leftovernumber);
|
1464
|
+
}
|
1465
|
+
assert(wordsadded >= number);
|
1466
|
+
return wordsadded;
|
1467
|
+
}
|
1468
|
+
|
1469
|
+
|
1470
|
+
|
1471
|
+
template <class uword>
|
1472
|
+
size_t EWAHBoolArray<uword>::addEmptyWord(const bool v) {
|
1473
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1474
|
+
const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
|
1475
|
+
//firts, if the last running length word is empty, we align it
|
1476
|
+
// this
|
1477
|
+
uword runlen = lastRunningLengthWord.getRunningLength();
|
1478
|
+
if( ( noliteralword ) && ( runlen == 0 )) {
|
1479
|
+
lastRunningLengthWord.setRunningBit(v);
|
1480
|
+
assert(lastRunningLengthWord.getRunningBit() == v);
|
1481
|
+
}
|
1482
|
+
if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
|
1483
|
+
lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+1));
|
1484
|
+
assert(lastRunningLengthWord.getRunningLength() == runlen+1);
|
1485
|
+
return 0;
|
1486
|
+
} else {
|
1487
|
+
// we have to start anew
|
1488
|
+
buffer.push_back(0);
|
1489
|
+
lastRLW = buffer.size() - 1;
|
1490
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1491
|
+
assert(lastRunningLengthWord2.getRunningLength()==0);
|
1492
|
+
assert(lastRunningLengthWord2.getRunningBit()==0);
|
1493
|
+
assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
|
1494
|
+
lastRunningLengthWord2.setRunningBit(v);
|
1495
|
+
assert(lastRunningLengthWord2.getRunningBit() == v);
|
1496
|
+
lastRunningLengthWord2.setRunningLength(1);
|
1497
|
+
assert(lastRunningLengthWord2.getRunningLength() == 1);
|
1498
|
+
assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
|
1499
|
+
return 1;
|
1500
|
+
}
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
|
1504
|
+
|
1505
|
+
template <class uword>
|
1506
|
+
void EWAHBoolArray<uword>::sparselogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1507
|
+
makeSameSize(a);
|
1508
|
+
container.reset();
|
1509
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
|
1510
|
+
assert(sizeInBits() == a.sizeInBits());
|
1511
|
+
/**
|
1512
|
+
* This could possibly be faster if we go around
|
1513
|
+
* the uncompress calls.
|
1514
|
+
*/
|
1515
|
+
EWAHBoolArraySparseIterator<uword> i = a.sparse_uncompress();
|
1516
|
+
EWAHBoolArraySparseIterator<uword> j = sparse_uncompress();
|
1517
|
+
size_t pos (0);
|
1518
|
+
uword x,y;
|
1519
|
+
bool ibehindj,jbehindi;
|
1520
|
+
while(i.hasNext() and j.hasNext()) {
|
1521
|
+
x = i.next();
|
1522
|
+
y = j.next();
|
1523
|
+
ibehindj = i.position() < j.position();
|
1524
|
+
jbehindi = j.position() < i.position();
|
1525
|
+
while (( ibehindj and i.hasNext()) or (jbehindi and j.hasNext())) {
|
1526
|
+
if(ibehindj) x = i.next();
|
1527
|
+
else if(jbehindi) y = j.next();
|
1528
|
+
ibehindj = i.position() < j.position();
|
1529
|
+
jbehindi = j.position() < i.position();
|
1530
|
+
}
|
1531
|
+
size_t nextnonzero = i.position()< j.position() ?i.position(): j.position() ;
|
1532
|
+
if(nextnonzero > pos + 1) {
|
1533
|
+
container.addStreamOfEmptyWords(0, nextnonzero-pos-1);
|
1534
|
+
pos += nextnonzero-pos-1;
|
1535
|
+
}
|
1536
|
+
if(i.position() == j.position()) {
|
1537
|
+
container.add(x & y);
|
1538
|
+
++pos;
|
1539
|
+
}
|
1540
|
+
}
|
1541
|
+
container.setSizeInBits(sizeInBits());
|
1542
|
+
//return answer;
|
1543
|
+
}
|
1544
|
+
|
1545
|
+
|
1546
|
+
|
1547
|
+
template <class uword>
|
1548
|
+
void EWAHBoolArray<uword>::rawlogicalor(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1549
|
+
makeSameSize(a);
|
1550
|
+
container.reset();
|
1551
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()+a.buffer.size());
|
1552
|
+
assert(sizeInBits() == a.sizeInBits());
|
1553
|
+
EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
|
1554
|
+
EWAHBoolArrayRawIterator<uword> j = raw_iterator();
|
1555
|
+
if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
|
1556
|
+
container.setSizeInBits(sizeInBits());
|
1557
|
+
return;
|
1558
|
+
}
|
1559
|
+
// at this point, this should be safe:
|
1560
|
+
BufferedRunningLengthWord<uword> & rlwi = i.next();
|
1561
|
+
BufferedRunningLengthWord<uword> & rlwj = j.next();
|
1562
|
+
//RunningLength;
|
1563
|
+
while (true) {
|
1564
|
+
bool i_is_prey (rlwi.size()<rlwj.size());
|
1565
|
+
BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
|
1566
|
+
BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
|
1567
|
+
if(prey.getRunningBit() == 0) {
|
1568
|
+
// we have a stream of 0x00
|
1569
|
+
const uword predatorrl (predator.getRunningLength());
|
1570
|
+
const uword preyrl (prey.getRunningLength());
|
1571
|
+
if(predatorrl >= preyrl) {
|
1572
|
+
const uword tobediscarded = preyrl ;
|
1573
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1574
|
+
} else {
|
1575
|
+
const uword tobediscarded = predatorrl ;
|
1576
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1577
|
+
if(preyrl - tobediscarded>0) {
|
1578
|
+
const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
|
1579
|
+
container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
|
1580
|
+
}
|
1581
|
+
}
|
1582
|
+
predator.discardFirstWords(preyrl);
|
1583
|
+
prey.discardFirstWords(preyrl);
|
1584
|
+
} else {
|
1585
|
+
// we have a stream of 1x11
|
1586
|
+
const uword preyrl (prey.getRunningLength());
|
1587
|
+
predator.discardFirstWords(preyrl);
|
1588
|
+
prey.discardFirstWords(preyrl);
|
1589
|
+
container.addStreamOfEmptyWords(1, static_cast<size_t>(preyrl));
|
1590
|
+
}
|
1591
|
+
const uword predatorrl (predator.getRunningLength());
|
1592
|
+
if(predatorrl>0) {
|
1593
|
+
if(predator.getRunningBit() == 0) {
|
1594
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1595
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1596
|
+
if(tobediscarded>0) {
|
1597
|
+
const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
|
1598
|
+
container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
|
1599
|
+
predator.discardFirstWords(tobediscarded);
|
1600
|
+
prey.discardFirstWords(tobediscarded);
|
1601
|
+
}
|
1602
|
+
} else {
|
1603
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1604
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1605
|
+
predator.discardFirstWords(tobediscarded);
|
1606
|
+
prey.discardFirstWords(tobediscarded);
|
1607
|
+
container.addStreamOfEmptyWords(1, static_cast<size_t>(tobediscarded));
|
1608
|
+
}
|
1609
|
+
}
|
1610
|
+
assert(prey.getRunningLength() ==0);
|
1611
|
+
// all that is left to do now is to AND the dirty words
|
1612
|
+
uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1613
|
+
if(nbre_dirty_prey > 0) {
|
1614
|
+
assert(predator.getRunningLength() ==0);
|
1615
|
+
const uword * idirty = i.dirtyWords();
|
1616
|
+
const uword * jdirty = j.dirtyWords();
|
1617
|
+
for(uword k = 0; k< nbre_dirty_prey; ++k) {
|
1618
|
+
container.add(idirty[k] | jdirty[k]);
|
1619
|
+
}
|
1620
|
+
predator.discardFirstWords(nbre_dirty_prey);
|
1621
|
+
}
|
1622
|
+
if( i_is_prey ) {
|
1623
|
+
if(!i.hasNext()) break;
|
1624
|
+
rlwi = i.next();
|
1625
|
+
} else {
|
1626
|
+
if(!j.hasNext()) break;
|
1627
|
+
rlwj = j.next();
|
1628
|
+
}
|
1629
|
+
}
|
1630
|
+
container.setSizeInBits(sizeInBits());
|
1631
|
+
}
|
1632
|
+
|
1633
|
+
|
1634
|
+
template <class uword>
|
1635
|
+
void EWAHBoolArray<uword>::rawlogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1636
|
+
makeSameSize(a);
|
1637
|
+
container.reset();
|
1638
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
|
1639
|
+
assert(sizeInBits() == a.sizeInBits());
|
1640
|
+
EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
|
1641
|
+
EWAHBoolArrayRawIterator<uword> j = raw_iterator();
|
1642
|
+
if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
|
1643
|
+
container.setSizeInBits(sizeInBits());
|
1644
|
+
return;
|
1645
|
+
}
|
1646
|
+
// at this point, this should be safe:
|
1647
|
+
BufferedRunningLengthWord<uword> & rlwi = i.next();
|
1648
|
+
BufferedRunningLengthWord<uword> & rlwj = j.next();
|
1649
|
+
//RunningLength;
|
1650
|
+
while (true) {
|
1651
|
+
bool i_is_prey (rlwi.size()<rlwj.size());
|
1652
|
+
BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
|
1653
|
+
BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
|
1654
|
+
if(prey.getRunningBit() == 0) {
|
1655
|
+
const uword preyrl (prey.getRunningLength());
|
1656
|
+
predator.discardFirstWords(preyrl);
|
1657
|
+
prey.discardFirstWords(preyrl);
|
1658
|
+
container.addStreamOfEmptyWords(0, static_cast<size_t>(preyrl));
|
1659
|
+
} else {
|
1660
|
+
// we have a stream of 1x11
|
1661
|
+
const uword predatorrl (predator.getRunningLength());
|
1662
|
+
const uword preyrl (prey.getRunningLength());
|
1663
|
+
const uword tobediscarded = (predatorrl >= preyrl) ? preyrl : predatorrl;
|
1664
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1665
|
+
if(preyrl - tobediscarded>0) {
|
1666
|
+
const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
|
1667
|
+
container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
|
1668
|
+
}
|
1669
|
+
predator.discardFirstWords(preyrl);
|
1670
|
+
prey.discardFirstWords(preyrl);
|
1671
|
+
}
|
1672
|
+
const uword predatorrl (predator.getRunningLength());
|
1673
|
+
if(predatorrl>0) {
|
1674
|
+
if(predator.getRunningBit() == 0) {
|
1675
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1676
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1677
|
+
predator.discardFirstWords(tobediscarded);
|
1678
|
+
prey.discardFirstWords(tobediscarded);
|
1679
|
+
container.addStreamOfEmptyWords(0, static_cast<size_t>(tobediscarded));
|
1680
|
+
} else {
|
1681
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1682
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1683
|
+
if(tobediscarded>0) {
|
1684
|
+
const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
|
1685
|
+
container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
|
1686
|
+
predator.discardFirstWords(tobediscarded);
|
1687
|
+
prey.discardFirstWords(tobediscarded);
|
1688
|
+
}
|
1689
|
+
}
|
1690
|
+
}
|
1691
|
+
assert(prey.getRunningLength() ==0);
|
1692
|
+
// all that is left to do now is to AND the dirty words
|
1693
|
+
uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1694
|
+
if(nbre_dirty_prey > 0) {
|
1695
|
+
assert(predator.getRunningLength() ==0);
|
1696
|
+
const uword * idirty = i.dirtyWords();
|
1697
|
+
const uword * jdirty = j.dirtyWords();
|
1698
|
+
for(uword k = 0; k< nbre_dirty_prey; ++k) {
|
1699
|
+
container.add(idirty[k] & jdirty[k]);
|
1700
|
+
}
|
1701
|
+
predator.discardFirstWords(nbre_dirty_prey);
|
1702
|
+
}
|
1703
|
+
if( i_is_prey ) {
|
1704
|
+
if(!i.hasNext()) break;
|
1705
|
+
rlwi = i.next();
|
1706
|
+
} else {
|
1707
|
+
if(!j.hasNext()) break;
|
1708
|
+
rlwj = j.next();
|
1709
|
+
}
|
1710
|
+
}
|
1711
|
+
container.setSizeInBits(sizeInBits());
|
1712
|
+
}
|
1713
|
+
|
1714
|
+
|
1715
|
+
|
1716
|
+
|
1717
|
+
template <class uword>
|
1718
|
+
BitmapStatistics EWAHBoolArray<uword>::computeStatistics() const {
|
1719
|
+
//uint totalcompressed(0), totalliteral(0);
|
1720
|
+
BitmapStatistics bs;
|
1721
|
+
EWAHBoolArrayRawIterator<uword> i = raw_iterator();
|
1722
|
+
while(i.hasNext()) {
|
1723
|
+
BufferedRunningLengthWord<uword> &brlw (i.next());
|
1724
|
+
++bs.runningwordmarker;
|
1725
|
+
bs.totalliteral += brlw.getNumberOfLiteralWords();
|
1726
|
+
bs.totalcompressed += brlw.getRunningLength();
|
1727
|
+
if(brlw.getRunningLength() == RunningLengthWord<uword>::largestrunninglengthcount) {
|
1728
|
+
++bs.maximumofrunningcounterreached;
|
1729
|
+
}
|
1730
|
+
}
|
1731
|
+
return bs;
|
1732
|
+
}
|
1733
|
+
|
1734
|
+
|
1735
|
+
template <class uword>
|
1736
|
+
void EWAHBoolArray<uword>::debugprintout() const {
|
1737
|
+
cout << "==printing out EWAHBoolArray=="<<endl;
|
1738
|
+
cout <<"Number of compressed words: "<< buffer.size()<< endl;
|
1739
|
+
size_t pointer = 0;
|
1740
|
+
while(pointer <buffer.size()) {
|
1741
|
+
ConstRunningLengthWord<uword> rlw(buffer[pointer]);
|
1742
|
+
bool b = rlw.getRunningBit() ;
|
1743
|
+
uword rl = rlw.getRunningLength() ;
|
1744
|
+
uword lw = rlw.getNumberOfLiteralWords();
|
1745
|
+
cout << "pointer = "<<pointer<<" running bit="<<b<<" running length="<<rl<<" lit. words="<<lw<<endl;
|
1746
|
+
for(uword j = 0; j < lw ; ++j) {
|
1747
|
+
const uword & w = buffer[pointer+j+1];
|
1748
|
+
cout<<toBinaryString(w)<<endl;;
|
1749
|
+
}
|
1750
|
+
pointer += lw + 1;
|
1751
|
+
}
|
1752
|
+
cout << "==END=="<<endl;
|
1753
|
+
}
|
1754
|
+
|
1755
|
+
template <class uword>
|
1756
|
+
size_t EWAHBoolArray<uword>::sizeOnDisk() const {
|
1757
|
+
return sizeof(sizeinbits)+sizeof(size_t)+sizeof(uword)*buffer.size();
|
1758
|
+
}
|
1759
|
+
|
1760
|
+
|
1761
|
+
|
1762
|
+
|
1763
|
+
#endif
|