hyperloglog 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +14 -0
- data/README.md +21 -0
- data/Rakefile +18 -0
- data/ext/boolarray.h +179 -0
- data/ext/ewah.h +1763 -0
- data/ext/extconf.rb +7 -0
- data/ext/hyperloglog.cpp +263 -0
- data/ext/murmur3.h +104 -0
- data/hyperloglog.gemspec +30 -0
- data/spec/data/integers.txt +62382 -0
- data/spec/data/small_integers.txt +1000 -0
- data/spec/data/small_integers2.txt +1000 -0
- data/spec/hyperloglog_spec.rb +48 -0
- data/spec/spec.opts +2 -0
- metadata +92 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.0.1. Initial Version.
|
data/Manifest
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
CHANGELOG
|
2
|
+
Manifest
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
ext/boolarray.h
|
6
|
+
ext/ewah.h
|
7
|
+
ext/extconf.rb
|
8
|
+
ext/hyperloglog.cpp
|
9
|
+
ext/murmur3.h
|
10
|
+
spec/data/integers.txt
|
11
|
+
spec/data/small_integers.txt
|
12
|
+
spec/data/small_integers2.txt
|
13
|
+
spec/hyperloglog_spec.rb
|
14
|
+
spec/spec.opts
|
data/README.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# HyperLogLog for Ruby
|
2
|
+
|
3
|
+
# External Libraries Included
|
4
|
+
|
5
|
+
Murmur3
|
6
|
+
https://github.com/PeterScott/murmur3
|
7
|
+
|
8
|
+
EWAHBoolArray
|
9
|
+
https://github.com/lemire/EWAHBoolArray
|
10
|
+
|
11
|
+
# Example
|
12
|
+
|
13
|
+
# Build a new estimator
|
14
|
+
builder = HyperBuilder.new
|
15
|
+
0.upto(100).each{|user_id| builder.offer(user_id)}
|
16
|
+
|
17
|
+
# Read an estimator from bytes on disk
|
18
|
+
estimator = HyperEstimator.new(File.read('bytes.txt'))
|
19
|
+
|
20
|
+
# Estimate the union of our two sources
|
21
|
+
estimate = HyperEstimator.estimate(builder.estimator, estimator)
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Rakefile for the hyperloglog gem.
#
# Echoe generates the packaging/release tasks from the block below, and the
# RSpec rake task runs the specs. Running plain `rake` runs the specs.
require 'echoe'
require 'rake'
require 'rspec/core/rake_task'

# `rake` with no arguments runs the spec task.
task :default => :spec

# Gem metadata consumed by Echoe.
Echoe.new("hyperloglog") do |p|
  p.author = "Josh Ferguson"
  p.email = "josh@besquared.net"
  p.project = "hyperloglog"
  p.summary = "An efficient implementation of the HyperLogLog cardinality estimator"
  p.url = "http://www.github.com/besquared/hyperloglog/"
end

# Spec runner: every *_spec.rb under spec/, with options read from spec/spec.opts.
RSpec::Core::RakeTask.new(:spec) do |t|
  t.pattern = 'spec/**/*_spec.rb'
  t.rspec_opts = ['--options', "\"spec/spec.opts\""]
end
|
data/ext/boolarray.h
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
#ifndef BOOLARRAY_H
|
2
|
+
#define BOOLARRAY_H
|
3
|
+
|
4
|
+
#include <cassert>
|
5
|
+
#include <iostream>
|
6
|
+
#include <vector>
|
7
|
+
#include <stdexcept>
|
8
|
+
#include <sstream>
|
9
|
+
#include <iso646.h> // mostly for Microsoft compilers
|
10
|
+
|
11
|
+
typedef unsigned long ulong;
|
12
|
+
typedef unsigned int uint;
|
13
|
+
typedef unsigned short uword16;
|
14
|
+
typedef unsigned int uword32;
|
15
|
+
typedef unsigned long long uword64;
|
16
|
+
|
17
|
+
|
18
|
+
using namespace std;
|
19
|
+
|
20
|
+
/**
 * A dynamic bitset implementation (without compression).
 * This is not tremendously useful, but it is provided as a reference.
 *
 * Bits are stored in a vector of `uword` words; bit `pos` lives in word
 * `pos / wordinbits` at bit offset `pos % wordinbits`. `sizeinbits` is
 * tracked separately from the vector length and can be changed on its own
 * through setSizeInBits() and reset().
 */
template <class uword=uword32>
class BoolArray {
public:
    /** Build an array of n bits, every backing word initialised to initval. */
    BoolArray(const size_t n, const uword initval = 0)
        : buffer(wordsNeeded(n), initval), sizeinbits(n) {}

    /** Build an empty array (no words, zero bits). */
    BoolArray() : buffer(), sizeinbits(0) {}

    BoolArray(const BoolArray & ba) : buffer(ba.buffer), sizeinbits(ba.sizeinbits) {}

    /**
     * Load the array from a stream in the layout produced by write():
     * the bit count as a raw size_t followed by the raw words.
     */
    void read(std::istream & in) {
        sizeinbits = 0;
        in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
        buffer.resize(wordsNeeded(sizeinbits));
        // &buffer[0] is undefined on an empty vector, so skip the payload read.
        if (!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]), buffer.size() * sizeof(uword));
    }

    /** Load exactly `size` raw words (no header); the bit count becomes size words. */
    void readBuffer(std::istream & in, const size_t size) {
        buffer.resize(size);
        if (!buffer.empty())
            in.read(reinterpret_cast<char *>(&buffer[0]), buffer.size() * sizeof(uword));
        sizeinbits = size * sizeof(uword) * 8;
    }

    /** Override the recorded bit count; the backing words are left untouched. */
    void setSizeInBits(const size_t sizeib) {
        sizeinbits = sizeib;
    }

    /** Write the bit count followed by the words covering it. */
    void write(std::ostream & out) const {
        write(out, sizeinbits);
    }

    /**
     * Write `numberofbits` (as a raw size_t) followed by the words needed to
     * hold that many bits. The format is machine-specific (raw memory dump).
     */
    void write(std::ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        assert(size <= buffer.size());
        out.write(reinterpret_cast<const char *>(&numberofbits), sizeof(numberofbits));
        if (size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]), size * sizeof(uword));
    }

    /** Same as write(out, numberofbits) but without the leading bit count. */
    void writeBuffer(std::ostream & out, const size_t numberofbits) const {
        const size_t size = wordsNeeded(numberofbits);
        assert(size <= buffer.size());
        if (size > 0)
            out.write(reinterpret_cast<const char *>(&buffer[0]), size * sizeof(uword));
    }

    /** Number of bytes write(out) emits: the size_t header plus the words. */
    size_t sizeOnDisk() const {
        return sizeof(sizeinbits) + wordsNeeded(sizeinbits) * sizeof(uword);
    }

    BoolArray& operator=(const BoolArray & x) {
        this->buffer = x.buffer;
        this->sizeinbits = x.sizeinbits;
        return *this;
    }

    /** Equal when the bit counts match and every backing word matches. */
    bool operator==(const BoolArray & x) const {
        if (sizeinbits != x.sizeinbits) return false;
        assert(buffer.size() == x.buffer.size());
        for (size_t k = 0; k < buffer.size(); ++k)
            if (buffer[k] != x.buffer[k]) return false;
        return true;
    }

    bool operator!=(const BoolArray & x) const {
        return ! operator==(x);
    }

    /** Overwrite the word at index pos. */
    void setWord(const size_t pos, const uword val) {
        assert(pos < buffer.size());
        buffer[pos] = val;
    }

    /**
     * Append one whole word.
     * Throws std::invalid_argument when the current bit count is not a
     * multiple of the word size.
     */
    void add(const uword val) {
        if (sizeinbits % wordinbits != 0)
            throw std::invalid_argument("you probably didn't want to do this");
        sizeinbits += wordinbits;
        buffer.push_back(val);
    }

    /** Return the word at index pos. */
    uword getWord(const size_t pos) const {
        assert(pos < buffer.size());
        return buffer[pos];
    }

    /**
     * set to true (whether it was already set to true or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void set(const size_t pos) {
        assert(pos / wordinbits < buffer.size());
        buffer[pos / wordinbits] |= bitMask(pos);
    }

    /**
     * set to false (whether it was already set to false or not)
     *
     * TODO this is an expensive (random access) API, you really ought to
     * prepare a new word and then append it.
     */
    void unset(const size_t pos) {
        assert(pos / wordinbits < buffer.size());
        // Clearing a bit needs AND with the inverted mask; OR-ing the inverted
        // mask (as this method used to do) sets every other bit in the word.
        buffer[pos / wordinbits] &= static_cast<uword>(~bitMask(pos));
    }

    /**
     * true or false? (set or unset)
     */
    bool get(const size_t pos) const {
        assert(pos / wordinbits < buffer.size());
        return (buffer[pos / wordinbits] & bitMask(pos)) != 0;
    }

    /**
     * set all bits to 0
     *
     * The backing words are zeroed but kept; the recorded bit count drops to 0.
     */
    void reset() {
        buffer.assign(buffer.size(), static_cast<uword>(0));
        sizeinbits = 0;
    }

    size_t sizeInBits() const {
        return sizeinbits;
    }

    ~BoolArray() {}

    // NOTE(review): no definition of logicaland/logicalor is visible in this
    // header excerpt; confirm they are defined before calling them.
    void logicaland(const BoolArray & ba, BoolArray & out);

    void logicalor(const BoolArray & ba, BoolArray & out);

    /** Print every bit as 0 or 1 followed by a space, then end the line. */
    inline void printout(std::ostream &o = std::cout) const {
        for (size_t k = 0; k < sizeinbits; ++k)
            o << get(k) << " ";
        o << std::endl;
    }

    /**
     * Append the words of another array.
     * Throws std::invalid_argument when this array's bit count is not a
     * multiple of the word size.
     */
    void append(const BoolArray & a);

    enum { wordinbits = sizeof(uword) * 8};

private:
    /** Number of words required to hold `bits` bits (rounded up). */
    static size_t wordsNeeded(const size_t bits) {
        return bits / wordinbits + (bits % wordinbits == 0 ? 0 : 1);
    }

    /** Single-bit mask selecting bit `pos` inside its word. */
    static uword bitMask(const size_t pos) {
        return static_cast<uword>(static_cast<uword>(1) << (pos % wordinbits));
    }

    std::vector<uword> buffer;
    size_t sizeinbits;

};

template <class uword>
void BoolArray<uword>::append(const BoolArray & a) {
    if (sizeinbits % wordinbits != 0)
        throw std::invalid_argument("Cannot append if parent does not meet boundary");
    if (&a == this) {
        // Inserting a vector's own range into itself is undefined behaviour,
        // so self-append goes through a temporary copy.
        const std::vector<uword> snapshot(a.buffer);
        buffer.insert(buffer.end(), snapshot.begin(), snapshot.end());
    } else {
        buffer.insert(buffer.end(), a.buffer.begin(), a.buffer.end());
    }
    sizeinbits += a.sizeinbits;
}
|
178
|
+
|
179
|
+
#endif
|
data/ext/ewah.h
ADDED
@@ -0,0 +1,1763 @@
|
|
1
|
+
#ifndef EWAH_H
|
2
|
+
#define EWAH_H
|
3
|
+
|
4
|
+
#include <string.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <cassert>
|
7
|
+
#include <iostream>
|
8
|
+
#include <vector>
|
9
|
+
#include <stdexcept>
|
10
|
+
#include <cstddef>
|
11
|
+
#include <iso646.h> // mostly for Microsoft compilers
|
12
|
+
|
13
|
+
#include "boolarray.h"
|
14
|
+
|
15
|
+
// taken from stackoverflow
//
// ASSERT(condition, message): when NDEBUG is not defined and `condition` is
// false, prints the stringified condition, file, line and `message` to
// std::cerr and terminates the process with EXIT_FAILURE. `message` is spliced
// into a stream expression, so it may itself contain `<<`. When NDEBUG is
// defined the macro expands to nothing and `condition` is not evaluated.
#ifndef NDEBUG
#   define ASSERT(condition, message) \
    do { \
        if (! (condition)) { \
            std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \
                      << " line " << __LINE__ << ": " << message << std::endl; \
            ::exit(EXIT_FAILURE); \
        } \
    } while (false)
#else
#   define ASSERT(condition, message) do { } while (false)
#endif
|
28
|
+
|
29
|
+
|
30
|
+
using namespace std;
|
31
|
+
|
32
|
+
|
33
|
+
/**
 * count the number of bits set to one (32 bit version)
 *
 * Branch-free parallel bit count: sums adjacent 1-bit, 2-bit and 4-bit
 * fields, then adds the four byte totals with one multiplication.
 *
 * Declared inline because this definition lives in a header; without it,
 * including the header from two translation units is a multiple-definition
 * link error.
 */
inline uint countOnes(uword32 v) {
    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    // '+' binds tighter than '&'; the parentheses spell out the intended grouping.
    return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

/**
 * count the number of bits set to one (64 bit version)
 *
 * Sums the counts of the low and high 32-bit halves.
 */
inline uint countOnes(uword64 v) {
    const uword32 low = static_cast<uword32>(v);
    const uword32 high = static_cast<uword32>(v >> 32);
    return countOnes(low) + countOnes(high);
}

/**
 * count the number of bits set to one (16 bit version)
 *
 * Clears the lowest set bit once per iteration, so the loop runs once per
 * set bit.
 */
inline uint countOnes(uword16 v) {
    uint c = 0;
    while (v) {
        v = static_cast<uword16>(v & (v - 1));
        ++c;
    }
    return c;
}
|
56
|
+
|
57
|
+
/**
 * Returns the binary representation of a binary word.
 *
 * The string has one character per bit of `uword` ('1' or '0'), starting
 * with the least significant bit (character k is bit k).
 */
template <class uword>
inline std::string toBinaryString(const uword w) {
    const unsigned int bits = sizeof(uword) * 8;
    // Build the result in place instead of round-tripping through a stream.
    std::string ans(bits, '0');
    for (unsigned int k = 0; k < bits; ++k) {
        if (w & (static_cast<uword>(1) << k)) ans[k] = '1';
    }
    return ans;
}
|
70
|
+
|
71
|
+
|
72
|
+
/**
 * For expert users.
 * This class is used to represent a special type of word storing
 * a run length. It is defined by the Enhanced Word Aligned Hybrid (EWAH)
 * format. You don't normally need to access this class.
 *
 * Layout of the wrapped word, from the least significant bit:
 *   bit 0                        : the running bit
 *   next `runninglengthbits` bits: the running length
 *   remaining high bits          : the number of literal words
 *
 * The object holds a reference to the word, so every setter edits the
 * caller's word in place. Copy-assignment copies the value of the other
 * word into the referenced word; it does not rebind the reference.
 */
template <class uword=uword32>
class RunningLengthWord {
public:
    RunningLengthWord (uword & data) : mydata(data) { }

    RunningLengthWord(const RunningLengthWord & rlw) : mydata(rlw.mydata) {}

    RunningLengthWord& operator=(const RunningLengthWord & rlw) {
        mydata = rlw.mydata;
        return *this;
    }

    /**
     * Which bit is being repeated?
     */
    bool getRunningBit() const {
        return (mydata & static_cast<uword>(1)) != 0;
    }

    /**
     * Which bit is being repeated? (static version working on a raw word)
     */
    static inline bool getRunningBit(uword data) {
        return (data & static_cast<uword>(1)) != 0;
    }

    /**
     * how many words should be filled by the running bit
     */
    uword getRunningLength() const {
        return (mydata >> 1) & largestrunninglengthcount;
    }

    /**
     * how many words should be filled by the running bit
     * (static version working on a raw word)
     */
    static inline uword getRunningLength(uword data) {
        return (data >> 1) & largestrunninglengthcount;
    }

    /**
     * followed by how many literal words?
     */
    uword getNumberOfLiteralWords() const {
        return static_cast<uword>(mydata >> (1 + runninglengthbits));
    }

    /**
     * Total of getRunningLength() and getNumberOfLiteralWords()
     */
    uword size() const {
        return getRunningLength() + getNumberOfLiteralWords();
    }

    /**
     * followed by how many literal words?
     * (static version working on a raw word)
     */
    static inline uword getNumberOfLiteralWords(uword data) {
        return data >> (1 + runninglengthbits);
    }

    /**
     * running length of which type of bits
     */
    void setRunningBit(bool b) {
        if (b) mydata |= static_cast<uword>(1);
        else mydata &= static_cast<uword>(~1);
    }

    /**
     * running length of which type of bits
     * (static version working on a raw word)
     */
    static inline void setRunningBit(uword & data, bool b) {
        if (b) data |= static_cast<uword>(1);
        else data &= static_cast<uword>(~1);
    }

    /**
     * Drop the first x words described by this marker: the running length
     * is consumed first, and whatever remains of x is subtracted from the
     * number of literal words. x must not exceed size().
     */
    void discardFirstWords(uword x) {
        assert(x <= size());
        const uword rl ( getRunningLength() );
        if (rl >= x) {
            setRunningLength(rl - x);
            return;
        }
        x -= rl;
        setRunningLength(0);
        setNumberOfLiteralWords(getNumberOfLiteralWords() - x);
    }

    /**
     * Store l in the running-length field, leaving the other fields alone.
     * The field is first filled with ones, then AND-ed down to l.
     */
    void setRunningLength(uword l) {
        mydata |= shiftedlargestrunninglengthcount;
        mydata &= static_cast<uword>(l << 1) | notshiftedlargestrunninglengthcount;
    }

    // static call for people who hate objects
    static inline void setRunningLength(uword & data, uword l) {
        data |= shiftedlargestrunninglengthcount;
        data &= static_cast<uword>(l << 1) | notshiftedlargestrunninglengthcount;
    }

    /**
     * Store l in the literal-word-count field, leaving the other fields alone.
     * The field is first filled with ones, then AND-ed down to l.
     */
    void setNumberOfLiteralWords(uword l) {
        mydata |= notrunninglengthplusrunningbit;
        mydata &= static_cast<uword>(l << (runninglengthbits + 1)) | runninglengthplusrunningbit;
    }

    // static call for people who hate objects
    static inline void setNumberOfLiteralWords(uword & data, uword l) {
        data |= notrunninglengthplusrunningbit;
        data &= static_cast<uword>(l << (runninglengthbits + 1)) | runninglengthplusrunningbit;
    }

    // Field widths: half of the word's bits for the running length, one bit
    // for the running bit, the rest for the literal-word count.
    static const uint runninglengthbits = sizeof(uword)*4;//16;
    static const uint literalbits = sizeof(uword)*8 - 1 - runninglengthbits;
    // Largest values each field can hold.
    static const uword largestliteralcount = (static_cast<uword>(1)<<literalbits) - 1;
    static const uword largestrunninglengthcount = (static_cast<uword>(1)<<runninglengthbits)-1;
    // Masks selecting (or excluding) the fields at their position in the word.
    static const uword shiftedlargestrunninglengthcount = largestrunninglengthcount<<1;
    static const uword notshiftedlargestrunninglengthcount = static_cast<uword>(~shiftedlargestrunninglengthcount);
    static const uword runninglengthplusrunningbit = (static_cast<uword>(1)<<(runninglengthbits+1)) - 1;
    static const uword notrunninglengthplusrunningbit =static_cast<uword>(~runninglengthplusrunningbit);
    static const uword notlargestrunninglengthcount =static_cast<uword>(~largestrunninglengthcount);

    // The word being viewed and edited (owned by the caller).
    uword & mydata;
};
|
211
|
+
|
212
|
+
|
213
|
+
/**
|
214
|
+
* Same as RunningLengthWord, except that the values cannot be modified.
|
215
|
+
*/
|
216
|
+
template <class uword=uword32>
|
217
|
+
class ConstRunningLengthWord {
|
218
|
+
public:
|
219
|
+
|
220
|
+
ConstRunningLengthWord () : mydata(0) {
|
221
|
+
}
|
222
|
+
|
223
|
+
ConstRunningLengthWord (const uword data) : mydata(data) {
|
224
|
+
}
|
225
|
+
|
226
|
+
ConstRunningLengthWord(const ConstRunningLengthWord & rlw) : mydata(rlw.mydata) {}
|
227
|
+
|
228
|
+
/**
|
229
|
+
* Which bit is being repeated?
|
230
|
+
*/
|
231
|
+
bool getRunningBit() const {
|
232
|
+
return mydata & static_cast<uword>(1);
|
233
|
+
}
|
234
|
+
|
235
|
+
/**
|
236
|
+
* how many words should be filled by the running bit
|
237
|
+
*/
|
238
|
+
uword getRunningLength() const {
|
239
|
+
return (mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount ;
|
240
|
+
}
|
241
|
+
|
242
|
+
/**
|
243
|
+
* followed by how many literal words?
|
244
|
+
*/
|
245
|
+
uword getNumberOfLiteralWords() const {
|
246
|
+
return static_cast<uword>(mydata >> (1+RunningLengthWord<uword>::runninglengthbits));
|
247
|
+
}
|
248
|
+
|
249
|
+
/**
|
250
|
+
* Total of getRunningLength() and getNumberOfLiteralWords()
|
251
|
+
*/
|
252
|
+
uword size() const {
|
253
|
+
return getRunningLength() + getNumberOfLiteralWords();
|
254
|
+
}
|
255
|
+
|
256
|
+
uword mydata;
|
257
|
+
};
|
258
|
+
|
259
|
+
|
260
|
+
|
261
|
+
/**
|
262
|
+
* Same as RunningLengthWord, except that the values are buffered for quick
|
263
|
+
* access.
|
264
|
+
*/
|
265
|
+
template <class uword=uword32>
|
266
|
+
class BufferedRunningLengthWord {
|
267
|
+
public:
|
268
|
+
BufferedRunningLengthWord (const uword & data) : RunningBit(data & static_cast<uword>(1)),
|
269
|
+
RunningLength((data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
|
270
|
+
NumberOfLiteralWords(static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits))) {
|
271
|
+
}
|
272
|
+
BufferedRunningLengthWord (const RunningLengthWord<uword> & p) : RunningBit(p.mydata & static_cast<uword>(1)),
|
273
|
+
RunningLength((p.mydata >> 1) & RunningLengthWord<uword>::largestrunninglengthcount),
|
274
|
+
NumberOfLiteralWords(p.mydata >> (1+RunningLengthWord<uword>::runninglengthbits)) {
|
275
|
+
}
|
276
|
+
|
277
|
+
void read(const uword & data) {
|
278
|
+
RunningBit = data & static_cast<uword>(1);
|
279
|
+
RunningLength = (data >> 1) & RunningLengthWord<uword>::largestrunninglengthcount;
|
280
|
+
NumberOfLiteralWords = static_cast<uword>(data >> (1+RunningLengthWord<uword>::runninglengthbits));
|
281
|
+
}
|
282
|
+
|
283
|
+
/**
|
284
|
+
* Which bit is being repeated?
|
285
|
+
*/
|
286
|
+
bool getRunningBit() const {
|
287
|
+
return RunningBit;
|
288
|
+
}
|
289
|
+
|
290
|
+
void discardFirstWords(uword x) {
|
291
|
+
assert(x<= size());
|
292
|
+
if(RunningLength >= x) {
|
293
|
+
RunningLength = static_cast<uword>(RunningLength - x);
|
294
|
+
return;
|
295
|
+
}
|
296
|
+
x = static_cast<uword>( x - RunningLength);
|
297
|
+
RunningLength = 0;
|
298
|
+
NumberOfLiteralWords = static_cast<uword>(NumberOfLiteralWords - x);
|
299
|
+
}
|
300
|
+
|
301
|
+
/**
|
302
|
+
* how many words should be filled by the running bit (see previous method)
|
303
|
+
*/
|
304
|
+
uword getRunningLength() const {
|
305
|
+
return RunningLength ;
|
306
|
+
}
|
307
|
+
|
308
|
+
/**
|
309
|
+
* followed by how many literal words?
|
310
|
+
*/
|
311
|
+
uword getNumberOfLiteralWords() const {
|
312
|
+
return NumberOfLiteralWords;
|
313
|
+
}
|
314
|
+
|
315
|
+
|
316
|
+
/**
|
317
|
+
* Total of getRunningLength() and getNumberOfLiteralWords()
|
318
|
+
*/
|
319
|
+
uword size() const {
|
320
|
+
return static_cast<uword>(RunningLength + NumberOfLiteralWords);
|
321
|
+
}
|
322
|
+
bool RunningBit;
|
323
|
+
uword RunningLength;
|
324
|
+
uword NumberOfLiteralWords;
|
325
|
+
|
326
|
+
};
|
327
|
+
|
328
|
+
template <class uword>
|
329
|
+
class EWAHBoolArray;
|
330
|
+
|
331
|
+
|
332
|
+
template <class uword>
|
333
|
+
class EWAHBoolArraySparseIterator;
|
334
|
+
|
335
|
+
|
336
|
+
/**
|
337
|
+
* Iterate over words of bits from a compressed bitmap.
|
338
|
+
*/
|
339
|
+
template <class uword=uword32>
|
340
|
+
class EWAHBoolArrayIterator {
|
341
|
+
public:
|
342
|
+
/**
|
343
|
+
* is there a new word?
|
344
|
+
*/
|
345
|
+
bool hasNext() const {
|
346
|
+
return pointer < myparent.size();
|
347
|
+
}
|
348
|
+
|
349
|
+
/**
|
350
|
+
* return next word.
|
351
|
+
*/
|
352
|
+
uword next() {
|
353
|
+
uword returnvalue;
|
354
|
+
if(compressedwords < rl) {
|
355
|
+
++compressedwords;
|
356
|
+
if(b)
|
357
|
+
returnvalue = notzero;
|
358
|
+
else
|
359
|
+
returnvalue = zero;
|
360
|
+
} else {
|
361
|
+
assert (literalwords < lw) ;
|
362
|
+
++literalwords;
|
363
|
+
++pointer;
|
364
|
+
assert(pointer <myparent.size());
|
365
|
+
returnvalue = myparent[pointer];
|
366
|
+
}
|
367
|
+
if((compressedwords == rl) && (literalwords == lw)) {
|
368
|
+
++pointer;
|
369
|
+
if(pointer < myparent.size()) readNewRunningLengthWord();
|
370
|
+
}
|
371
|
+
return returnvalue;
|
372
|
+
}
|
373
|
+
|
374
|
+
EWAHBoolArrayIterator(const EWAHBoolArrayIterator<uword> & other):pointer(other.pointer),
|
375
|
+
myparent(other.myparent),
|
376
|
+
compressedwords(other.compressedwords),
|
377
|
+
literalwords(other.literalwords),
|
378
|
+
rl(other.rl),
|
379
|
+
lw(other.lw),
|
380
|
+
b(other.b) {}
|
381
|
+
|
382
|
+
static const uword zero = 0;
|
383
|
+
static const uword notzero=static_cast<uword>(~zero);
|
384
|
+
private:
|
385
|
+
EWAHBoolArrayIterator(const vector<uword> & parent) ;
|
386
|
+
void readNewRunningLengthWord() ;
|
387
|
+
friend class EWAHBoolArray<uword>;
|
388
|
+
friend class EWAHBoolArraySparseIterator<uword>;
|
389
|
+
size_t pointer;
|
390
|
+
const vector<uword> & myparent;
|
391
|
+
uword compressedwords;
|
392
|
+
uword literalwords;
|
393
|
+
uword rl, lw;
|
394
|
+
bool b;
|
395
|
+
};
|
396
|
+
|
397
|
+
template <class uword>
|
398
|
+
class EWAHBoolArraySparseIterator;
|
399
|
+
|
400
|
+
|
401
|
+
|
402
|
+
|
403
|
+
template <class uword>
|
404
|
+
class EWAHBoolArraySetBitForwardIterator;
|
405
|
+
|
406
|
+
|
407
|
+
class BitmapStatistics;
|
408
|
+
|
409
|
+
template <class uword>
|
410
|
+
class EWAHBoolArrayRawIterator;
|
411
|
+
|
412
|
+
/**
|
413
|
+
* This class is a compressed bitmap.
|
414
|
+
* This is where compression
|
415
|
+
* happens.
|
416
|
+
* The underlying data structure is an STL vector.
|
417
|
+
*/
|
418
|
+
template <class uword=uword32>
|
419
|
+
class EWAHBoolArray {
|
420
|
+
public:
|
421
|
+
EWAHBoolArray(): buffer(1,0), sizeinbits(0), lastRLW(0) {
|
422
|
+
}
|
423
|
+
|
424
|
+
/**
|
425
|
+
* set the ith bit to true (starting at zero).
|
426
|
+
* Auto-expands the bitmap. It has constant running time complexity.
|
427
|
+
* Note that you must set the bits in increasing order:
|
428
|
+
* set(1), set(2) is ok; set(2), set(1) is not ok.
|
429
|
+
*/
|
430
|
+
void set(size_t i);
|
431
|
+
|
432
|
+
/**
|
433
|
+
* Make sure the two bitmaps have the same size (padding with zeroes
|
434
|
+
* if necessary). It has constant running time complexity.
|
435
|
+
*/
|
436
|
+
void makeSameSize(EWAHBoolArray & a) {
|
437
|
+
if(a.sizeinbits<sizeinbits)
|
438
|
+
a.padWithZeroes(sizeinbits);
|
439
|
+
else if(sizeinbits<a.sizeinbits)
|
440
|
+
padWithZeroes(a.sizeinbits);
|
441
|
+
}
|
442
|
+
|
443
|
+
enum {RESERVEMEMORY=true}; // for speed
|
444
|
+
|
445
|
+
typedef EWAHBoolArraySetBitForwardIterator<uword> const_iterator;
|
446
|
+
|
447
|
+
|
448
|
+
/**
|
449
|
+
* Returns an iterator that can be used to access the position of the
|
450
|
+
* set bits. The running time complexity of a full scan is proportional to the number
|
451
|
+
* of set bits: be aware that if you have long strings of 1s, this can be
|
452
|
+
* very inefficient.
|
453
|
+
*/
|
454
|
+
const_iterator begin() const {
|
455
|
+
return EWAHBoolArraySetBitForwardIterator<uword>(buffer);
|
456
|
+
}
|
457
|
+
|
458
|
+
|
459
|
+
/**
|
460
|
+
* Basically a bogus iterator that can be used together with begin()
|
461
|
+
* for constructions such as for(EWAHBoolArray<uword>::const_iterator i = b.begin(); i!=b.end(); ++i) {}
|
462
|
+
*/
|
463
|
+
const_iterator end() const {
|
464
|
+
return EWAHBoolArraySetBitForwardIterator<uword>(buffer,buffer.size());
|
465
|
+
}
|
466
|
+
|
467
|
+
/**
|
468
|
+
* computes the logical and with another compressed bitmap
|
469
|
+
* answer goes into container, though rawlogicaland is the
|
470
|
+
* default, sometimes this version is faster.
|
471
|
+
*/
|
472
|
+
void sparselogicaland( EWAHBoolArray &a, EWAHBoolArray &out) ;
|
473
|
+
|
474
|
+
/**
|
475
|
+
* computes the logical and with another compressed bitmap
|
476
|
+
* answer goes into container
|
477
|
+
* Running time complexity is proportional to the sum of the compressed
|
478
|
+
* bitmap sizes.
|
479
|
+
*/
|
480
|
+
void rawlogicaland( EWAHBoolArray &a, EWAHBoolArray &container) ;
|
481
|
+
|
482
|
+
/**
|
483
|
+
* computes the logical or with another compressed bitmap
|
484
|
+
* answer goes into container
|
485
|
+
* Running time complexity is proportional to the sum of the compressed
|
486
|
+
* bitmap sizes.
|
487
|
+
*/
|
488
|
+
void rawlogicalor( EWAHBoolArray &a, EWAHBoolArray &container) ;
|
489
|
+
|
490
|
+
|
491
|
+
/**
|
492
|
+
* computes the logical and with another compressed bitmap
|
493
|
+
* answer goes into container
|
494
|
+
* Running time complexity is proportional to the sum of the compressed
|
495
|
+
* bitmap sizes.
|
496
|
+
* (alias for rawlogicaland)
|
497
|
+
*/
|
498
|
+
void logicaland( EWAHBoolArray &a, EWAHBoolArray &container) {
|
499
|
+
rawlogicaland(a,container);
|
500
|
+
}
|
501
|
+
|
502
|
+
/**
|
503
|
+
* computes the logical or with another compressed bitmap
|
504
|
+
* answer goes into container.
|
505
|
+
* Running time complexity is proportional to the sum of the compressed
|
506
|
+
* bitmap sizes.
|
507
|
+
* (alias for rawlogicalor)
|
508
|
+
*/
|
509
|
+
void logicalor( EWAHBoolArray &a, EWAHBoolArray &container) {
|
510
|
+
rawlogicalor(a,container);
|
511
|
+
}
|
512
|
+
|
513
|
+
/**
|
514
|
+
* clear the content of the bitmap. It does not
|
515
|
+
* release the memory.
|
516
|
+
*/
|
517
|
+
void reset() {
|
518
|
+
buffer.clear();
|
519
|
+
buffer.push_back(0);
|
520
|
+
sizeinbits = 0;
|
521
|
+
lastRLW = 0;
|
522
|
+
}
|
523
|
+
|
524
|
+
/**
|
525
|
+
* convenience method.
|
526
|
+
*
|
527
|
+
* returns the number of words added (storage cost increase)
|
528
|
+
*/
|
529
|
+
inline size_t add(const uword newdata, const uint bitsthatmatter = 8*sizeof(uword));
|
530
|
+
|
531
|
+
inline void printout(ostream &o = cout) {
|
532
|
+
toBoolArray().printout(o);
|
533
|
+
}
|
534
|
+
|
535
|
+
/**
|
536
|
+
* Prints a verbose description of the content of the compressed bitmap.
|
537
|
+
*/
|
538
|
+
void debugprintout() const;
|
539
|
+
|
540
|
+
/**
|
541
|
+
* Return the size in bits of this bitmap (this refers
|
542
|
+
* to the uncompressed size in bits).
|
543
|
+
*/
|
544
|
+
inline size_t sizeInBits() const {
|
545
|
+
return sizeinbits;
|
546
|
+
}
|
547
|
+
|
548
|
+
/**
|
549
|
+
* set size in bits. This does not affect the compressed size. It
|
550
|
+
* runs in constant time.
|
551
|
+
*/
|
552
|
+
inline void setSizeInBits(const size_t size) {
|
553
|
+
sizeinbits = size;
|
554
|
+
}
|
555
|
+
|
556
|
+
/**
|
557
|
+
* Return the size of the buffer in bytes. This
|
558
|
+
* is equivalent to the storage cost, minus some overhead.
|
559
|
+
*/
|
560
|
+
inline size_t sizeInBytes() const {
|
561
|
+
return buffer.size()*sizeof(uword);
|
562
|
+
}
|
563
|
+
|
564
|
+
|
565
|
+
|
566
|
+
/**
|
567
|
+
* same as addEmptyWord, but you can do several in one shot!
|
568
|
+
* returns the number of words added (storage cost increase)
|
569
|
+
*/
|
570
|
+
size_t addStreamOfEmptyWords(const bool v, const size_t number);
|
571
|
+
|
572
|
+
/**
|
573
|
+
* add a stream of dirty words, returns the number of words added
|
574
|
+
* (storage cost increase)
|
575
|
+
*/
|
576
|
+
size_t addStreamOfDirtyWords(const uword * v, const size_t number);
|
577
|
+
|
578
|
+
/**
|
579
|
+
* make sure the size of the array is totalbits bits by padding with zeroes.
|
580
|
+
* returns the number of words added (storage cost increase)
|
581
|
+
*/
|
582
|
+
inline size_t padWithZeroes(const size_t totalbits);
|
583
|
+
|
584
|
+
/**
|
585
|
+
* Compute the size on disk assuming that it was saved using
|
586
|
+
* the method "save".
|
587
|
+
*/
|
588
|
+
size_t sizeOnDisk() const;
|
589
|
+
|
590
|
+
|
591
|
+
/**
|
592
|
+
* Save this bitmap to a stream. The file format is
|
593
|
+
* | sizeinbits | buffer length | buffer content |
|
594
|
+
* the sizeinbits part can be omitted if "savesizeinbits=false".
|
595
|
+
* Both sizeinbits and buffer length are saved using the size_t data
|
596
|
+
* type which is typically a 32-bit unsigned integer for 32-bit CPUs
|
597
|
+
* and a 64-bit unsigned integer for 64-bit CPUs.
|
598
|
+
* Note that this format is machine-specific. Note also
|
599
|
+
* that the word size is not saved. For robust persistent
|
600
|
+
* storage, you need to save this extra information elsewhere.
|
601
|
+
*/
|
602
|
+
inline void write(ostream & out, const bool savesizeinbits=true) const;
|
603
|
+
|
604
|
+
/**
|
605
|
+
* This only writes the content of the buffer (see write()) method.
|
606
|
+
* It is for advanced users.
|
607
|
+
*/
|
608
|
+
inline void writeBuffer(ostream & out) const;
|
609
|
+
|
610
|
+
/**
|
611
|
+
* size (in words) of the underlying STL vector.
|
612
|
+
*/
|
613
|
+
inline size_t bufferSize() const {
|
614
|
+
return buffer.size();
|
615
|
+
}
|
616
|
+
|
617
|
+
/**
|
618
|
+
* this is the counterpart to the write method.
|
619
|
+
* if you set savesizeinbits=false, then you are responsible
|
620
|
+
* for setting the value of the attribute sizeinbits (see method setSizeInBits).
|
621
|
+
*/
|
622
|
+
inline void read(istream & in, const bool savesizeinbits=true);
|
623
|
+
|
624
|
+
|
625
|
+
/**
|
626
|
+
* read the buffer from a stream, see method writeBuffer.
|
627
|
+
* this is for advanced users.
|
628
|
+
*/
|
629
|
+
inline void readBuffer(istream & in, const size_t buffersize);
|
630
|
+
|
631
|
+
bool operator==(const EWAHBoolArray & x) const;
|
632
|
+
|
633
|
+
bool operator!=(const EWAHBoolArray & x) const;
|
634
|
+
|
635
|
+
bool operator==(const BoolArray<uword> & x) const;
|
636
|
+
|
637
|
+
bool operator!=(const BoolArray<uword> & x) const;
|
638
|
+
|
639
|
+
/**
|
640
|
+
* Iterate over the uncompressed words.
|
641
|
+
* Can be considerably faster than begin()/end().
|
642
|
+
* Running time complexity of a full scan is proportional to the
|
643
|
+
* uncompressed size of the bitmap.
|
644
|
+
*/
|
645
|
+
EWAHBoolArrayIterator<uword> uncompress() const ;
|
646
|
+
|
647
|
+
/**
|
648
|
+
* To iterate over non-zero uncompressed words.
|
649
|
+
* Can be considerably faster than begin()/end().
|
650
|
+
* Running time complexity of a full scan is proportional to the number of
|
651
|
+
* non-zero uncompressed words.
|
652
|
+
*/
|
653
|
+
EWAHBoolArraySparseIterator<uword> sparse_uncompress() const ;
|
654
|
+
|
655
|
+
/**
|
656
|
+
* To iterate over the compressed data.
|
657
|
+
* Can be faster than any other iterator.
|
658
|
+
* Running time complexity of a full scan is proportional to the
|
659
|
+
* compressed size of the bitmap.
|
660
|
+
*/
|
661
|
+
EWAHBoolArrayRawIterator<uword> raw_iterator() const ;
|
662
|
+
|
663
|
+
/**
|
664
|
+
* Appends the content of some other compressed bitmap
|
665
|
+
* at the end of the current bitmap.
|
666
|
+
*/
|
667
|
+
void append(const EWAHBoolArray & x);
|
668
|
+
|
669
|
+
/**
|
670
|
+
* For research purposes. This computes the number of
|
671
|
+
* dirty words and the number of compressed words.
|
672
|
+
*/
|
673
|
+
BitmapStatistics computeStatistics() const;
|
674
|
+
|
675
|
+
BoolArray<uword> toBoolArray() const;
|
676
|
+
|
677
|
+
/**
|
678
|
+
* Convert to a list of positions of "set" bits.
|
679
|
+
* The recommended container is vector<size_t>.
|
680
|
+
*/
|
681
|
+
template <class container>
|
682
|
+
void appendRowIDs(container & out, const size_t offset = 0) const;
|
683
|
+
|
684
|
+
|
685
|
+
/**
|
686
|
+
* Convert to a list of positions of "set" bits.
|
687
|
+
* The recommended container is vector<size_t>.
|
688
|
+
* (alias for appendRowIDs).
|
689
|
+
*/
|
690
|
+
template <class container>
|
691
|
+
void appendSetBits(container & out, const size_t offset = 0) const {
|
692
|
+
return appendRowIDs(out,offset);
|
693
|
+
}
|
694
|
+
|
695
|
+
/**
|
696
|
+
* Returns the number of bits set to the value 1.
|
697
|
+
* The running time complexity is proportional to the
|
698
|
+
* compressed size of the bitmap.
|
699
|
+
*/
|
700
|
+
size_t numberOfOnes();
|
701
|
+
|
702
|
+
/**
|
703
|
+
* Swap the content of this bitmap with another bitmap.
|
704
|
+
* No copying is done. (Running time complexity is constant.)
|
705
|
+
*/
|
706
|
+
void swap(EWAHBoolArray & x);
|
707
|
+
|
708
|
+
const vector<uword> & getBuffer() const {
|
709
|
+
return buffer;
|
710
|
+
};
|
711
|
+
enum { wordinbits = sizeof(uword) * 8};
|
712
|
+
|
713
|
+
|
714
|
+
/**
|
715
|
+
*Please don't copy your bitmaps! The running time
|
716
|
+
* complexity of a copy is the size of the compressed bitmap.
|
717
|
+
**/
|
718
|
+
EWAHBoolArray(const EWAHBoolArray& other) :
|
719
|
+
buffer(other.buffer),
|
720
|
+
sizeinbits(other.sizeinbits),
|
721
|
+
lastRLW(other.lastRLW) {
|
722
|
+
ASSERT(buffer.size()<=1,"You are trying to copy the bitmap, a terrible idea in general, for performance reasons.");// performance assert!
|
723
|
+
}
|
724
|
+
|
725
|
+
/**
|
726
|
+
* Copies the content of one bitmap onto another. Running time complexity
|
727
|
+
* is proportional to the size of the compressed bitmap.
|
728
|
+
* please, never hard-copy this object. Use the swap method if you must.
|
729
|
+
*/
|
730
|
+
EWAHBoolArray & operator=(const EWAHBoolArray & x) {
|
731
|
+
buffer = x.buffer;
|
732
|
+
sizeinbits = x.sizeinbits;
|
733
|
+
lastRLW = x.lastRLW;
|
734
|
+
return *this;
|
735
|
+
}
|
736
|
+
|
737
|
+
/**
|
738
|
+
* This is equivalent to the operator =. It is used
|
739
|
+
* to keep in mind that assignment can be expensive.
|
740
|
+
*
|
741
|
+
*if you don't care to copy the bitmap (performance-wise), use this!
|
742
|
+
*/
|
743
|
+
void expensive_copy(const EWAHBoolArray & x) {
|
744
|
+
buffer = x.buffer;
|
745
|
+
sizeinbits = x.sizeinbits;
|
746
|
+
lastRLW = x.lastRLW;
|
747
|
+
}
|
748
|
+
|
749
|
+
/**
|
750
|
+
* Write the logical not of this bitmap in the provided container.
|
751
|
+
*/
|
752
|
+
void logicalnot(EWAHBoolArray & x) const;
|
753
|
+
|
754
|
+
/**
|
755
|
+
* Apply the logical not operation on this bitmap.
|
756
|
+
* Running time complexity is proportional to the compressed size of the bitmap.
|
757
|
+
*/
|
758
|
+
void inplace_logicalnot();
|
759
|
+
|
760
|
+
|
761
|
+
private:
|
762
|
+
|
763
|
+
|
764
|
+
|
765
|
+
// private because does not increment the size in bits
|
766
|
+
// returns the number of words added (storage cost increase)
|
767
|
+
inline size_t addLiteralWord(const uword newdata) ;
|
768
|
+
|
769
|
+
// private because does not increment the size in bits
|
770
|
+
// returns the number of words added (storage cost increase)
|
771
|
+
size_t addEmptyWord(const bool v);
|
772
|
+
// this second version "might" be faster if you hate OOP.
|
773
|
+
// in my tests, it turned out to be slower!
|
774
|
+
// private because does not increment the size in bits
|
775
|
+
//inline void addEmptyWordStaticCalls(bool v);
|
776
|
+
|
777
|
+
vector<uword> buffer;
|
778
|
+
size_t sizeinbits;
|
779
|
+
size_t lastRLW;
|
780
|
+
};
|
781
|
+
|
782
|
+
|
783
|
+
|
784
|
+
/**
|
785
|
+
* Iterator over the words of the compressed bitmap.
|
786
|
+
*/
|
787
|
+
template <class uword=uword32>
|
788
|
+
class EWAHBoolArraySparseIterator {
|
789
|
+
public:
|
790
|
+
/**
|
791
|
+
* is there more words?
|
792
|
+
*/
|
793
|
+
bool hasNext() const {
|
794
|
+
return i.hasNext();
|
795
|
+
}
|
796
|
+
|
797
|
+
size_t position() const {
|
798
|
+
return mPosition;
|
799
|
+
}
|
800
|
+
/**
|
801
|
+
* return next word. If the word is either 0x00 or 0x11
|
802
|
+
* the you need to call position() to know how many times it
|
803
|
+
* was repeated
|
804
|
+
*/
|
805
|
+
uword next() {
|
806
|
+
uword returnvalue;
|
807
|
+
if(i.compressedwords < i.rl) {
|
808
|
+
if(i.b) {
|
809
|
+
++mPosition;
|
810
|
+
++i.compressedwords;
|
811
|
+
returnvalue = EWAHBoolArrayIterator<uword>::notzero;
|
812
|
+
} else {
|
813
|
+
mPosition = static_cast<size_t>(mPosition + i.rl);
|
814
|
+
i.compressedwords = i.rl;
|
815
|
+
returnvalue = EWAHBoolArrayIterator<uword>::zero;//next();
|
816
|
+
}
|
817
|
+
} else {
|
818
|
+
assert (i.literalwords < i.lw);
|
819
|
+
++i.literalwords;
|
820
|
+
++i.pointer;
|
821
|
+
++mPosition;
|
822
|
+
assert(i.pointer <i.myparent.size());
|
823
|
+
returnvalue = i.myparent[i.pointer];
|
824
|
+
}
|
825
|
+
if((i.compressedwords == i.rl) && (i.literalwords == i.lw)) {
|
826
|
+
++i.pointer;
|
827
|
+
if(i.pointer < i.myparent.size()) i.readNewRunningLengthWord();
|
828
|
+
}
|
829
|
+
return returnvalue;
|
830
|
+
}
|
831
|
+
|
832
|
+
EWAHBoolArraySparseIterator(const EWAHBoolArraySparseIterator<uword> & other):i(other.i),mPosition(other.mPosition) {}
|
833
|
+
|
834
|
+
private:
|
835
|
+
EWAHBoolArraySparseIterator(const vector<uword> & parent) : i(parent), mPosition(0) {}
|
836
|
+
EWAHBoolArrayIterator<uword> i;
|
837
|
+
size_t mPosition;
|
838
|
+
friend class EWAHBoolArray<uword>;
|
839
|
+
};
|
840
|
+
|
841
|
+
|
842
|
+
/**
|
843
|
+
* Used to go through the set bits. Not optimally fast, but convenient.
|
844
|
+
*/
|
845
|
+
template <class uword>
|
846
|
+
class EWAHBoolArraySetBitForwardIterator {
|
847
|
+
public:
|
848
|
+
enum { wordinbits = sizeof(uword) * 8};
|
849
|
+
typedef forward_iterator_tag iterator_category;
|
850
|
+
typedef size_t * pointer;
|
851
|
+
typedef size_t & reference_type;
|
852
|
+
typedef size_t value_type;
|
853
|
+
typedef ptrdiff_t difference_type;
|
854
|
+
typedef EWAHBoolArraySetBitForwardIterator<uword> type_of_iterator;
|
855
|
+
|
856
|
+
/**
|
857
|
+
* Provides the location of the set bit.
|
858
|
+
*/
|
859
|
+
size_t operator*() const {
|
860
|
+
return currentrunoffset+offsetofpreviousrun;
|
861
|
+
}
|
862
|
+
|
863
|
+
// this can be expensive
|
864
|
+
difference_type operator-(const type_of_iterator& o) {
|
865
|
+
type_of_iterator& smaller = *this<o ? *this : o;
|
866
|
+
type_of_iterator& bigger = *this>=o ? *this : o;
|
867
|
+
if(smaller.mpointer==smaller.buffer.size())
|
868
|
+
return 0;
|
869
|
+
difference_type absdiff = static_cast<difference_type>(0);
|
870
|
+
EWAHBoolArraySetBitForwardIterator<uword> buf(smaller);
|
871
|
+
while(buf!= bigger) {
|
872
|
+
++absdiff;
|
873
|
+
++buf;
|
874
|
+
}
|
875
|
+
if(*this<o)
|
876
|
+
return absdiff;
|
877
|
+
else
|
878
|
+
return - absdiff;
|
879
|
+
}
|
880
|
+
|
881
|
+
bool operator<(const type_of_iterator& o) {
|
882
|
+
if(buffer != o.buffer) return false;
|
883
|
+
if(mpointer==buffer.size()) return false;
|
884
|
+
if(o.mpointer==o.buffer.size()) return true;
|
885
|
+
if(offsetofpreviousrun<o.offsetofpreviousrun)
|
886
|
+
return true;
|
887
|
+
if(offsetofpreviousrun>o.offsetofpreviousrun)
|
888
|
+
return false;
|
889
|
+
if(currentrunoffset<o.currentrunoffset)
|
890
|
+
return true;
|
891
|
+
return false;
|
892
|
+
}
|
893
|
+
bool operator<=(const type_of_iterator& o) {
|
894
|
+
return ( (*this) < o ) || ((*this) == o);
|
895
|
+
}
|
896
|
+
|
897
|
+
bool operator>(const type_of_iterator& o) {
|
898
|
+
return ! ((*this) <= o ) ;
|
899
|
+
}
|
900
|
+
|
901
|
+
bool operator>=(const type_of_iterator& o) {
|
902
|
+
return ! ((*this) < o ) ;
|
903
|
+
}
|
904
|
+
|
905
|
+
EWAHBoolArraySetBitForwardIterator & operator++() {
|
906
|
+
++currentrunoffset;
|
907
|
+
advanceToNextSetBit();
|
908
|
+
return *this;
|
909
|
+
}
|
910
|
+
EWAHBoolArraySetBitForwardIterator operator++(int) {
|
911
|
+
EWAHBoolArraySetBitForwardIterator old(*this);
|
912
|
+
++currentrunoffset;
|
913
|
+
advanceToNextSetBit();
|
914
|
+
return old;
|
915
|
+
}
|
916
|
+
bool operator==(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
|
917
|
+
// if they are both over, return true
|
918
|
+
if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
|
919
|
+
return true;
|
920
|
+
return (buffer == o.buffer) && (mpointer == o.mpointer) &&
|
921
|
+
(offsetofpreviousrun == o.offsetofpreviousrun) && (currentrunoffset == o.currentrunoffset);
|
922
|
+
}
|
923
|
+
|
924
|
+
bool operator!=(const EWAHBoolArraySetBitForwardIterator<uword> & o) {
|
925
|
+
// if they are both over, return false
|
926
|
+
if((mpointer==buffer.size()) && (o.mpointer==o.buffer.size()))
|
927
|
+
return false;
|
928
|
+
return (buffer != o.buffer) || (mpointer != o.mpointer) ||
|
929
|
+
(offsetofpreviousrun != o.offsetofpreviousrun) || (currentrunoffset != o.currentrunoffset);
|
930
|
+
}
|
931
|
+
|
932
|
+
|
933
|
+
EWAHBoolArraySetBitForwardIterator(const EWAHBoolArraySetBitForwardIterator & o) : buffer(o.buffer), mpointer(o.mpointer),
|
934
|
+
offsetofpreviousrun(o.offsetofpreviousrun), currentrunoffset(o.currentrunoffset), rlw(o.rlw) {}
|
935
|
+
|
936
|
+
private:
|
937
|
+
|
938
|
+
bool advanceToNextSetBit() {
|
939
|
+
if(mpointer==buffer.size()) return false;
|
940
|
+
if (currentrunoffset<static_cast<size_t>(rlw.getRunningLength() * wordinbits)) {
|
941
|
+
if(rlw.getRunningBit())
|
942
|
+
return true;// nothing to do
|
943
|
+
currentrunoffset = static_cast<size_t>(rlw.getRunningLength() * wordinbits);//skipping
|
944
|
+
}
|
945
|
+
while(true) {
|
946
|
+
const size_t indexoflitword = static_cast<size_t>( (currentrunoffset-rlw.getRunningLength() * wordinbits)/wordinbits);
|
947
|
+
if(indexoflitword>= rlw.getNumberOfLiteralWords() ) {
|
948
|
+
if(advanceToNextRun())
|
949
|
+
return advanceToNextSetBit();
|
950
|
+
else {
|
951
|
+
return false;
|
952
|
+
}
|
953
|
+
}
|
954
|
+
const uword currentword = buffer[mpointer + 1 + indexoflitword];
|
955
|
+
for(uint inwordpointer =
|
956
|
+
static_cast<uint>((currentrunoffset-rlw.getRunningLength() * wordinbits)%wordinbits);
|
957
|
+
inwordpointer<wordinbits;++inwordpointer,++currentrunoffset) {
|
958
|
+
if((currentword & (static_cast<uword>(1) << inwordpointer))!=0)
|
959
|
+
return true;
|
960
|
+
}
|
961
|
+
}
|
962
|
+
}
|
963
|
+
|
964
|
+
bool advanceToNextRun() {
|
965
|
+
offsetofpreviousrun += currentrunoffset;
|
966
|
+
currentrunoffset = 0;
|
967
|
+
mpointer += static_cast<size_t>(1 + rlw.getNumberOfLiteralWords());
|
968
|
+
if(mpointer<buffer.size()) {
|
969
|
+
rlw.mydata = buffer[mpointer];
|
970
|
+
} else {
|
971
|
+
return false;
|
972
|
+
}
|
973
|
+
return true;
|
974
|
+
}
|
975
|
+
|
976
|
+
|
977
|
+
EWAHBoolArraySetBitForwardIterator(const vector<uword> & parent, size_t startpointer = 0) : buffer(parent), mpointer(startpointer),
|
978
|
+
offsetofpreviousrun(0), currentrunoffset(0), rlw(0) {
|
979
|
+
if(mpointer<buffer.size()) {
|
980
|
+
rlw.mydata = buffer[mpointer];
|
981
|
+
advanceToNextSetBit();
|
982
|
+
}
|
983
|
+
}
|
984
|
+
|
985
|
+
|
986
|
+
const vector<uword> & buffer;
|
987
|
+
size_t mpointer;
|
988
|
+
size_t offsetofpreviousrun;
|
989
|
+
size_t currentrunoffset;
|
990
|
+
friend class EWAHBoolArray<uword>;
|
991
|
+
ConstRunningLengthWord<uword> rlw;
|
992
|
+
};
|
993
|
+
|
994
|
+
|
995
|
+
|
996
|
+
/**
 * Statistical descriptor returned by the compressed bitmap.
 *
 * It is a plain bag of four counters, all starting at zero, plus a few
 * read-only accessors that combine them.
 */
class BitmapStatistics {
public:
    size_t totalliteral;                    // reported by getNumberOfDirtyWords()
    size_t totalcompressed;                 // reported by getNumberOfCleanWords()
    size_t runningwordmarker;               // reported by getNumberOfMarkers()
    size_t maximumofrunningcounterreached;  // reported by getOverRuns()

    BitmapStatistics()
        : totalliteral(0),
          totalcompressed(0),
          runningwordmarker(0),
          maximumofrunningcounterreached(0) {}

    /** Literal words plus marker words. */
    size_t getCompressedSize() const {
        return totalliteral + runningwordmarker;
    }

    /** Literal words plus compressed (clean) words. */
    size_t getUncompressedSize() const {
        return totalliteral + totalcompressed;
    }

    size_t getNumberOfDirtyWords() const {
        return totalliteral;
    }

    size_t getNumberOfCleanWords() const {
        return totalcompressed;
    }

    size_t getNumberOfMarkers() const {
        return runningwordmarker;
    }

    size_t getOverRuns() const {
        return maximumofrunningcounterreached;
    }
};
|
1026
|
+
|
1027
|
+
|
1028
|
+
template <class uword>
|
1029
|
+
void EWAHBoolArray<uword>::set(size_t i) {
|
1030
|
+
// must I complete a word?
|
1031
|
+
if ( (sizeinbits % (8*sizeof(uword))) != 0) {
|
1032
|
+
const size_t possiblesizeinbits = (sizeinbits /(8*sizeof(uword)))*(8*sizeof(uword)) + (8*sizeof(uword));
|
1033
|
+
if(possiblesizeinbits<i+1) {
|
1034
|
+
sizeinbits = possiblesizeinbits;
|
1035
|
+
}
|
1036
|
+
}
|
1037
|
+
addStreamOfEmptyWords(false, (i/(8*sizeof(uword))) - sizeinbits/(8*sizeof(uword)));
|
1038
|
+
size_t bittoflip = i-(sizeinbits/(8*sizeof(uword)) * (8*sizeof(uword)));
|
1039
|
+
// next, we set the bit
|
1040
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1041
|
+
if(( lastRunningLengthWord.getNumberOfLiteralWords() == 0) || ((sizeinbits
|
1042
|
+
-1)/(8*sizeof(uword)) < i/(8*sizeof(uword))) ) {
|
1043
|
+
const uword newdata = static_cast<uword>(static_cast<uword>(1)<<bittoflip);
|
1044
|
+
addLiteralWord(newdata);
|
1045
|
+
} else {
|
1046
|
+
buffer[buffer.size()-1] |= static_cast<uword>(static_cast<uword>(1)<<bittoflip);
|
1047
|
+
// check if we just completed a stream of 1s
|
1048
|
+
if(buffer[buffer.size()-1] == static_cast<uword>(~0)) {
|
1049
|
+
// we remove the last dirty word
|
1050
|
+
buffer[buffer.size()-1] = 0;
|
1051
|
+
buffer.resize(buffer.size()-1);
|
1052
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(lastRunningLengthWord.getNumberOfLiteralWords()-1));
|
1053
|
+
// next we add one clean word
|
1054
|
+
addEmptyWord(true);
|
1055
|
+
}
|
1056
|
+
}
|
1057
|
+
sizeinbits = i+1;
|
1058
|
+
}
|
1059
|
+
|
1060
|
+
|
1061
|
+
|
1062
|
+
template <class uword>
|
1063
|
+
void EWAHBoolArray<uword>::inplace_logicalnot() {
|
1064
|
+
size_t pointer(0);
|
1065
|
+
while(pointer <buffer.size()) {
|
1066
|
+
RunningLengthWord<uword> rlw(buffer[pointer]);
|
1067
|
+
if(rlw.getRunningBit())
|
1068
|
+
rlw.setRunningBit(false);
|
1069
|
+
else
|
1070
|
+
rlw.setRunningBit(true);
|
1071
|
+
++pointer;
|
1072
|
+
for(size_t k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
|
1073
|
+
buffer[pointer] = ~buffer[pointer];
|
1074
|
+
++pointer;
|
1075
|
+
}
|
1076
|
+
}
|
1077
|
+
}
|
1078
|
+
|
1079
|
+
|
1080
|
+
template <class uword>
|
1081
|
+
void EWAHBoolArray<uword>::logicalnot(EWAHBoolArray & x) const {
|
1082
|
+
x.reset();
|
1083
|
+
x.buffer.reserve(buffer.size());
|
1084
|
+
EWAHBoolArrayRawIterator<uword> i = this->raw_iterator();
|
1085
|
+
while(i.hasNext()) {
|
1086
|
+
BufferedRunningLengthWord<uword> & rlw = i.next();
|
1087
|
+
x.addStreamOfEmptyWords(! rlw.getRunningBit(), rlw.getRunningLength());
|
1088
|
+
if(rlw.getNumberOfLiteralWords()>0) {
|
1089
|
+
const uword * dw = i.dirtyWords();
|
1090
|
+
for(size_t k = 0 ; k <rlw.getNumberOfLiteralWords(); ++k) {
|
1091
|
+
x.addLiteralWord(~ dw[k]);
|
1092
|
+
}
|
1093
|
+
}
|
1094
|
+
}
|
1095
|
+
x.sizeinbits = this->sizeinbits;
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
|
1099
|
+
template <class uword>
|
1100
|
+
size_t EWAHBoolArray<uword>::add(const uword newdata, const uint bitsthatmatter) {
|
1101
|
+
sizeinbits += bitsthatmatter;
|
1102
|
+
if(newdata == 0) {
|
1103
|
+
return addEmptyWord(0);
|
1104
|
+
} else if (newdata == static_cast<uword>(~0)) {
|
1105
|
+
return addEmptyWord(1);
|
1106
|
+
} else {
|
1107
|
+
return addLiteralWord(newdata);
|
1108
|
+
}
|
1109
|
+
}
|
1110
|
+
|
1111
|
+
|
1112
|
+
template <class uword>
|
1113
|
+
inline void EWAHBoolArray<uword>::writeBuffer(ostream & out) const {
|
1114
|
+
if(buffer.size()>0)
|
1115
|
+
out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffer.size());
|
1116
|
+
}
|
1117
|
+
|
1118
|
+
|
1119
|
+
template <class uword>
|
1120
|
+
inline void EWAHBoolArray<uword>::readBuffer(istream & in, const size_t buffersize) {
|
1121
|
+
buffer.resize(buffersize);
|
1122
|
+
if(buffersize>0)
|
1123
|
+
in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
|
1124
|
+
}
|
1125
|
+
|
1126
|
+
|
1127
|
+
template <class uword>
|
1128
|
+
void EWAHBoolArray<uword>::write(ostream & out, const bool savesizeinbits) const {
|
1129
|
+
if(savesizeinbits)out.write(reinterpret_cast<const char *>( & sizeinbits), sizeof(sizeinbits));
|
1130
|
+
const size_t buffersize = buffer.size();
|
1131
|
+
out.write(reinterpret_cast<const char *>(& buffersize),sizeof(buffersize));
|
1132
|
+
if(buffersize>0)
|
1133
|
+
out.write(reinterpret_cast<const char *>(& buffer[0]),sizeof(uword)*buffersize);
|
1134
|
+
}
|
1135
|
+
|
1136
|
+
|
1137
|
+
template <class uword>
|
1138
|
+
void EWAHBoolArray<uword>::read(istream & in, const bool savesizeinbits) {
|
1139
|
+
if(savesizeinbits) in.read(reinterpret_cast<char *>(&sizeinbits), sizeof(sizeinbits));
|
1140
|
+
else sizeinbits = 0;
|
1141
|
+
size_t buffersize(0);
|
1142
|
+
in.read(reinterpret_cast<char *>(&buffersize), sizeof(buffersize));
|
1143
|
+
buffer.resize(buffersize);
|
1144
|
+
if(buffersize>0)
|
1145
|
+
in.read(reinterpret_cast<char *>(&buffer[0]),sizeof(uword)*buffersize);
|
1146
|
+
}
|
1147
|
+
|
1148
|
+
|
1149
|
+
template <class uword>
|
1150
|
+
size_t EWAHBoolArray<uword>::addLiteralWord(const uword newdata) {
|
1151
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1152
|
+
uword numbersofar = lastRunningLengthWord.getNumberOfLiteralWords();
|
1153
|
+
if(numbersofar >= RunningLengthWord<uword>::largestliteralcount) {//0x7FFF) {
|
1154
|
+
buffer.push_back(0);
|
1155
|
+
lastRLW = buffer.size() - 1;
|
1156
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1157
|
+
lastRunningLengthWord2.setNumberOfLiteralWords(1);
|
1158
|
+
buffer.push_back(newdata);
|
1159
|
+
return 2;
|
1160
|
+
}
|
1161
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(numbersofar + 1));
|
1162
|
+
assert(lastRunningLengthWord.getNumberOfLiteralWords()==numbersofar + 1);
|
1163
|
+
buffer.push_back(newdata);
|
1164
|
+
return 1;
|
1165
|
+
}
|
1166
|
+
|
1167
|
+
|
1168
|
+
|
1169
|
+
|
1170
|
+
template <class uword>
|
1171
|
+
size_t EWAHBoolArray<uword>::padWithZeroes(const size_t totalbits) {
|
1172
|
+
assert(totalbits >= sizeinbits);
|
1173
|
+
size_t missingbits = totalbits - sizeinbits;
|
1174
|
+
size_t wordsadded = addStreamOfEmptyWords(0, missingbits/wordinbits + ((missingbits % wordinbits != 0) ? 1 : 0));
|
1175
|
+
assert(sizeinbits >= totalbits);
|
1176
|
+
assert(sizeinbits <= totalbits + wordinbits);
|
1177
|
+
sizeinbits = totalbits;
|
1178
|
+
return wordsadded;
|
1179
|
+
}
|
1180
|
+
|
1181
|
+
|
1182
|
+
|
1183
|
+
/**
|
1184
|
+
* This is a low-level iterator.
|
1185
|
+
*/
|
1186
|
+
|
1187
|
+
template <class uword=uword32>
|
1188
|
+
class EWAHBoolArrayRawIterator {
|
1189
|
+
public:
|
1190
|
+
EWAHBoolArrayRawIterator(const EWAHBoolArray<uword> & p) : pointer(0),
|
1191
|
+
myparent(&p.getBuffer()), rlw((*myparent)[pointer]) { //RunningLength(0), NumberOfLiteralWords(0), Bit(0) {
|
1192
|
+
if(verbose) {
|
1193
|
+
cout<<"created a new raw iterator over buffer of size "<<myparent->size()<<endl;
|
1194
|
+
}
|
1195
|
+
}
|
1196
|
+
EWAHBoolArrayRawIterator(const EWAHBoolArrayRawIterator & o) : pointer(o.pointer),
|
1197
|
+
myparent(o.myparent), rlw(o.rlw) {}
|
1198
|
+
|
1199
|
+
|
1200
|
+
bool hasNext() const {
|
1201
|
+
if(verbose)cout<<"call to hasNext, pointer is at "<<pointer<< ", parent.size()= "<<myparent->size()<<endl;
|
1202
|
+
return pointer < myparent->size();
|
1203
|
+
}
|
1204
|
+
|
1205
|
+
BufferedRunningLengthWord<uword> & next() {
|
1206
|
+
assert(pointer < myparent->size());
|
1207
|
+
rlw.read( (*myparent)[pointer]);
|
1208
|
+
pointer = static_cast<size_t>(pointer + rlw.getNumberOfLiteralWords() + 1);
|
1209
|
+
return rlw;
|
1210
|
+
}
|
1211
|
+
|
1212
|
+
const uword * dirtyWords() const {
|
1213
|
+
assert(pointer>0);
|
1214
|
+
assert(pointer>=rlw.getNumberOfLiteralWords());
|
1215
|
+
return & (myparent->at(static_cast<size_t>(pointer-rlw.getNumberOfLiteralWords())));
|
1216
|
+
}
|
1217
|
+
|
1218
|
+
EWAHBoolArrayRawIterator & operator=(const EWAHBoolArrayRawIterator & other) {
|
1219
|
+
pointer = other.pointer;
|
1220
|
+
myparent=other.myparent;
|
1221
|
+
rlw=other.rlw;
|
1222
|
+
return *this;
|
1223
|
+
}
|
1224
|
+
|
1225
|
+
enum {verbose=false};
|
1226
|
+
size_t pointer;
|
1227
|
+
const vector<uword> * myparent;
|
1228
|
+
BufferedRunningLengthWord<uword> rlw;
|
1229
|
+
private:
|
1230
|
+
|
1231
|
+
EWAHBoolArrayRawIterator();
|
1232
|
+
};
|
1233
|
+
|
1234
|
+
|
1235
|
+
|
1236
|
+
|
1237
|
+
|
1238
|
+
|
1239
|
+
template <class uword>
|
1240
|
+
EWAHBoolArrayIterator<uword> EWAHBoolArray<uword>::uncompress() const {
|
1241
|
+
return EWAHBoolArrayIterator<uword>(buffer);
|
1242
|
+
}
|
1243
|
+
|
1244
|
+
template <class uword>
|
1245
|
+
EWAHBoolArrayRawIterator<uword> EWAHBoolArray<uword>::raw_iterator() const {
|
1246
|
+
return EWAHBoolArrayRawIterator<uword>(*this);
|
1247
|
+
}
|
1248
|
+
|
1249
|
+
|
1250
|
+
template <class uword>
|
1251
|
+
EWAHBoolArraySparseIterator<uword> EWAHBoolArray<uword>::sparse_uncompress() const {
|
1252
|
+
return EWAHBoolArraySparseIterator<uword>(buffer);
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
template <class uword>
|
1256
|
+
bool EWAHBoolArray<uword>::operator==(const EWAHBoolArray & x) const {
|
1257
|
+
if(sizeinbits != x.sizeinbits) return false;
|
1258
|
+
if(buffer.size() != x.buffer.size()) return false;
|
1259
|
+
for(size_t k = 0; k < buffer.size(); ++k)
|
1260
|
+
if(buffer[k] != x.buffer[k]) return false;
|
1261
|
+
return true;
|
1262
|
+
}
|
1263
|
+
|
1264
|
+
template <class uword>
|
1265
|
+
void EWAHBoolArray<uword>::swap(EWAHBoolArray & x) {
|
1266
|
+
buffer.swap(x.buffer);
|
1267
|
+
size_t tmp = x.sizeinbits;
|
1268
|
+
x.sizeinbits = sizeinbits;
|
1269
|
+
sizeinbits = tmp;
|
1270
|
+
tmp = x.lastRLW;
|
1271
|
+
x.lastRLW = lastRLW;
|
1272
|
+
lastRLW = tmp;
|
1273
|
+
}
|
1274
|
+
|
1275
|
+
template <class uword>
|
1276
|
+
void EWAHBoolArray<uword>::append(const EWAHBoolArray & x) {
|
1277
|
+
if(sizeinbits % wordinbits == 0) {
|
1278
|
+
// hoping for the best?
|
1279
|
+
sizeinbits += x.sizeinbits;
|
1280
|
+
ConstRunningLengthWord<uword> lRLW(buffer[lastRLW]);
|
1281
|
+
if( (lRLW.getRunningLength() == 0) && (lRLW.getNumberOfLiteralWords() == 0)) {
|
1282
|
+
// it could be that the running length word is empty, in such a case,
|
1283
|
+
// we want to get rid of it!
|
1284
|
+
assert(lastRLW == buffer.size()-1);
|
1285
|
+
lastRLW = x.lastRLW + buffer.size() - 1;
|
1286
|
+
buffer.resize(buffer.size()-1);
|
1287
|
+
buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
|
1288
|
+
} else {
|
1289
|
+
lastRLW = x.lastRLW + buffer.size();
|
1290
|
+
buffer.insert(buffer.end(),x.buffer.begin(),x.buffer.end());
|
1291
|
+
}
|
1292
|
+
} else {
|
1293
|
+
stringstream ss;
|
1294
|
+
ss<<"This should really not happen! You are trying to append to a bitmap having a fractional number of words, that is, "<<static_cast<int>(sizeinbits)<<" bits with a word size in bits of "<<static_cast<int>(wordinbits)<<". ";
|
1295
|
+
ss<<"Size of the bitmap being appended: "<<x.sizeinbits<<" bits."<<endl;
|
1296
|
+
throw invalid_argument(ss.str());
|
1297
|
+
}
|
1298
|
+
}
|
1299
|
+
|
1300
|
+
template <class uword>
|
1301
|
+
EWAHBoolArrayIterator<uword>::EWAHBoolArrayIterator(const vector<uword> & parent) :
|
1302
|
+
pointer(0),
|
1303
|
+
myparent(parent),
|
1304
|
+
compressedwords(0), literalwords(0), rl(0), lw(0), b(0) {
|
1305
|
+
if(pointer <myparent.size()) readNewRunningLengthWord();
|
1306
|
+
}
|
1307
|
+
|
1308
|
+
|
1309
|
+
template <class uword>
|
1310
|
+
void EWAHBoolArrayIterator<uword>::readNewRunningLengthWord() {
|
1311
|
+
literalwords = 0;
|
1312
|
+
compressedwords = 0;
|
1313
|
+
ConstRunningLengthWord<uword> rlw(myparent[pointer]);
|
1314
|
+
rl = rlw.getRunningLength();
|
1315
|
+
lw = rlw.getNumberOfLiteralWords();
|
1316
|
+
b = rlw.getRunningBit();
|
1317
|
+
if((rl == 0) && (lw == 0)) {
|
1318
|
+
if(pointer < myparent.size() -1) {
|
1319
|
+
++pointer;
|
1320
|
+
readNewRunningLengthWord();
|
1321
|
+
} else {
|
1322
|
+
assert(pointer >= myparent.size()-1);
|
1323
|
+
pointer = myparent.size();
|
1324
|
+
assert(! hasNext());
|
1325
|
+
}
|
1326
|
+
}
|
1327
|
+
}
|
1328
|
+
|
1329
|
+
template <class uword>
|
1330
|
+
BoolArray<uword> EWAHBoolArray<uword>::toBoolArray() const {
|
1331
|
+
BoolArray<uword> ans(sizeinbits);
|
1332
|
+
EWAHBoolArrayIterator<uword> i = uncompress();
|
1333
|
+
int counter = 0;
|
1334
|
+
while(i.hasNext()) {
|
1335
|
+
ans.setWord(counter++,i.next());
|
1336
|
+
}
|
1337
|
+
return ans;
|
1338
|
+
}
|
1339
|
+
|
1340
|
+
template <class uword>
|
1341
|
+
size_t EWAHBoolArray<uword>::numberOfOnes() {
|
1342
|
+
size_t c (0);
|
1343
|
+
EWAHBoolArraySparseIterator<uword> i = sparse_uncompress();
|
1344
|
+
while(i.hasNext()) {
|
1345
|
+
const uword currentword = i.next();
|
1346
|
+
c += countOnes(currentword);
|
1347
|
+
/*
|
1348
|
+
for(int k = 0; k < wordinbits; ++k) {
|
1349
|
+
if ( (currentword & (static_cast<uword>(1) << k)) != 0)
|
1350
|
+
++c;
|
1351
|
+
}*/
|
1352
|
+
|
1353
|
+
}
|
1354
|
+
return c;
|
1355
|
+
|
1356
|
+
}
|
1357
|
+
|
1358
|
+
|
1359
|
+
|
1360
|
+
|
1361
|
+
template <class uword>
|
1362
|
+
template <class container>
|
1363
|
+
void EWAHBoolArray<uword>::appendRowIDs(container & out, const size_t offset) const {
|
1364
|
+
size_t pointer(0);
|
1365
|
+
size_t currentoffset(offset);
|
1366
|
+
if(RESERVEMEMORY) out.reserve(buffer.size()+64);// trading memory for speed.
|
1367
|
+
while(pointer <buffer.size()) {
|
1368
|
+
ConstRunningLengthWord<uword> rlw(buffer[pointer]);
|
1369
|
+
if(rlw.getRunningBit()) {
|
1370
|
+
for(size_t x = 0; x< static_cast<size_t>(rlw.getRunningLength()*wordinbits); ++x) {
|
1371
|
+
out.push_back(currentoffset + x);
|
1372
|
+
}
|
1373
|
+
}
|
1374
|
+
currentoffset = static_cast<size_t>(currentoffset + rlw.getRunningLength() * wordinbits);
|
1375
|
+
++pointer;
|
1376
|
+
for(uword k = 0; k<rlw.getNumberOfLiteralWords(); ++k) {
|
1377
|
+
const uword currentword = buffer[pointer];
|
1378
|
+
for(uint kk = 0; kk < wordinbits; ++kk) {
|
1379
|
+
if ( ( currentword & static_cast<uword>(static_cast<uword>(1) << kk)) != 0)
|
1380
|
+
out.push_back(currentoffset + kk);
|
1381
|
+
}
|
1382
|
+
currentoffset+=wordinbits;
|
1383
|
+
++pointer;
|
1384
|
+
}
|
1385
|
+
}
|
1386
|
+
}
|
1387
|
+
|
1388
|
+
|
1389
|
+
|
1390
|
+
template <class uword>
|
1391
|
+
bool EWAHBoolArray<uword>::operator!=(const EWAHBoolArray<uword> & x) const {
|
1392
|
+
return !(*this == x);
|
1393
|
+
}
|
1394
|
+
|
1395
|
+
template <class uword>
|
1396
|
+
bool EWAHBoolArray<uword>::operator==(const BoolArray<uword> & x) const {
|
1397
|
+
// could be more efficient
|
1398
|
+
return (this->toBoolArray() == x);
|
1399
|
+
}
|
1400
|
+
|
1401
|
+
template <class uword>
|
1402
|
+
bool EWAHBoolArray<uword>::operator!=(const BoolArray<uword> & x) const {
|
1403
|
+
// could be more efficient
|
1404
|
+
return (this->toBoolArray() != x);
|
1405
|
+
}
|
1406
|
+
|
1407
|
+
|
1408
|
+
template <class uword>
|
1409
|
+
size_t EWAHBoolArray<uword>::addStreamOfEmptyWords(const bool v, const size_t number) {
|
1410
|
+
if(number == 0) return 0;
|
1411
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1412
|
+
const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
|
1413
|
+
//firts, if the last running length word is empty, we align it
|
1414
|
+
// this
|
1415
|
+
const uword runlen = lastRunningLengthWord.getRunningLength();
|
1416
|
+
if( ( noliteralword ) && ( runlen == 0 )) {
|
1417
|
+
lastRunningLengthWord.setRunningBit(v);
|
1418
|
+
}
|
1419
|
+
size_t wordsadded (0);
|
1420
|
+
if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
|
1421
|
+
// that's the easy case, we are just continuing
|
1422
|
+
uword whatwecanadd = static_cast<uword>( number < static_cast<uword>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount-runlen) );
|
1423
|
+
lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+whatwecanadd));
|
1424
|
+
sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
|
1425
|
+
if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>(wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
|
1426
|
+
} else {
|
1427
|
+
buffer.push_back(0);
|
1428
|
+
++wordsadded;
|
1429
|
+
lastRLW = buffer.size() - 1;
|
1430
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1431
|
+
uword whatwecanadd = static_cast<uword>( number < RunningLengthWord<uword>::largestrunninglengthcount ? number : static_cast<size_t>(RunningLengthWord<uword>::largestrunninglengthcount) );
|
1432
|
+
lastRunningLengthWord2.setRunningBit(v);
|
1433
|
+
lastRunningLengthWord2.setRunningLength(whatwecanadd);
|
1434
|
+
sizeinbits = static_cast<size_t>(sizeinbits + whatwecanadd * wordinbits);
|
1435
|
+
if(number - whatwecanadd> 0 ) wordsadded = static_cast<size_t>( wordsadded + addStreamOfEmptyWords(v, static_cast<size_t>(number - whatwecanadd)));
|
1436
|
+
}
|
1437
|
+
return wordsadded;
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
|
1441
|
+
template <class uword>
|
1442
|
+
size_t EWAHBoolArray<uword>::addStreamOfDirtyWords(const uword * v, const size_t number) {
|
1443
|
+
if(number == 0) return 0;
|
1444
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1445
|
+
const uword NumberOfLiteralWords = lastRunningLengthWord.getNumberOfLiteralWords();
|
1446
|
+
assert(RunningLengthWord<uword>::largestliteralcount >= NumberOfLiteralWords);
|
1447
|
+
const size_t whatwecanadd = number < static_cast<uword>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords) ? number : static_cast<size_t>(RunningLengthWord<uword>::largestliteralcount - NumberOfLiteralWords);//0x7FFF-NumberOfLiteralWords);
|
1448
|
+
assert(NumberOfLiteralWords+whatwecanadd>=NumberOfLiteralWords);
|
1449
|
+
assert(NumberOfLiteralWords+whatwecanadd<=RunningLengthWord<uword>::largestliteralcount);
|
1450
|
+
lastRunningLengthWord.setNumberOfLiteralWords(static_cast<uword>(NumberOfLiteralWords+whatwecanadd));
|
1451
|
+
assert(lastRunningLengthWord.getNumberOfLiteralWords()==NumberOfLiteralWords+whatwecanadd);
|
1452
|
+
const size_t leftovernumber = number - whatwecanadd;
|
1453
|
+
// add the dirty words...
|
1454
|
+
const size_t oldsize (buffer.size());
|
1455
|
+
buffer.resize(oldsize+whatwecanadd);
|
1456
|
+
memcpy(&buffer[oldsize],v,whatwecanadd*sizeof(uword));
|
1457
|
+
size_t wordsadded(whatwecanadd);
|
1458
|
+
if(leftovernumber>0) {
|
1459
|
+
//add
|
1460
|
+
buffer.push_back(0);
|
1461
|
+
lastRLW=buffer.size() - 1;
|
1462
|
+
++wordsadded;
|
1463
|
+
wordsadded+=addStreamOfDirtyWords(v+whatwecanadd, leftovernumber);
|
1464
|
+
}
|
1465
|
+
assert(wordsadded >= number);
|
1466
|
+
return wordsadded;
|
1467
|
+
}
|
1468
|
+
|
1469
|
+
|
1470
|
+
|
1471
|
+
template <class uword>
|
1472
|
+
size_t EWAHBoolArray<uword>::addEmptyWord(const bool v) {
|
1473
|
+
RunningLengthWord<uword> lastRunningLengthWord(buffer[lastRLW]);
|
1474
|
+
const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() == 0);
|
1475
|
+
//firts, if the last running length word is empty, we align it
|
1476
|
+
// this
|
1477
|
+
uword runlen = lastRunningLengthWord.getRunningLength();
|
1478
|
+
if( ( noliteralword ) && ( runlen == 0 )) {
|
1479
|
+
lastRunningLengthWord.setRunningBit(v);
|
1480
|
+
assert(lastRunningLengthWord.getRunningBit() == v);
|
1481
|
+
}
|
1482
|
+
if( ( noliteralword ) && (lastRunningLengthWord.getRunningBit() == v) && (runlen < RunningLengthWord<uword>::largestrunninglengthcount) ) {
|
1483
|
+
lastRunningLengthWord.setRunningLength(static_cast<uword>(runlen+1));
|
1484
|
+
assert(lastRunningLengthWord.getRunningLength() == runlen+1);
|
1485
|
+
return 0;
|
1486
|
+
} else {
|
1487
|
+
// we have to start anew
|
1488
|
+
buffer.push_back(0);
|
1489
|
+
lastRLW = buffer.size() - 1;
|
1490
|
+
RunningLengthWord<uword> lastRunningLengthWord2(buffer[lastRLW]);
|
1491
|
+
assert(lastRunningLengthWord2.getRunningLength()==0);
|
1492
|
+
assert(lastRunningLengthWord2.getRunningBit()==0);
|
1493
|
+
assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
|
1494
|
+
lastRunningLengthWord2.setRunningBit(v);
|
1495
|
+
assert(lastRunningLengthWord2.getRunningBit() == v);
|
1496
|
+
lastRunningLengthWord2.setRunningLength(1);
|
1497
|
+
assert(lastRunningLengthWord2.getRunningLength() == 1);
|
1498
|
+
assert(lastRunningLengthWord2.getNumberOfLiteralWords()==0);
|
1499
|
+
return 1;
|
1500
|
+
}
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
|
1504
|
+
|
1505
|
+
template <class uword>
|
1506
|
+
void EWAHBoolArray<uword>::sparselogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1507
|
+
makeSameSize(a);
|
1508
|
+
container.reset();
|
1509
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
|
1510
|
+
assert(sizeInBits() == a.sizeInBits());
|
1511
|
+
/**
|
1512
|
+
* This could possibly be faster if we go around
|
1513
|
+
* the uncompress calls.
|
1514
|
+
*/
|
1515
|
+
EWAHBoolArraySparseIterator<uword> i = a.sparse_uncompress();
|
1516
|
+
EWAHBoolArraySparseIterator<uword> j = sparse_uncompress();
|
1517
|
+
size_t pos (0);
|
1518
|
+
uword x,y;
|
1519
|
+
bool ibehindj,jbehindi;
|
1520
|
+
while(i.hasNext() and j.hasNext()) {
|
1521
|
+
x = i.next();
|
1522
|
+
y = j.next();
|
1523
|
+
ibehindj = i.position() < j.position();
|
1524
|
+
jbehindi = j.position() < i.position();
|
1525
|
+
while (( ibehindj and i.hasNext()) or (jbehindi and j.hasNext())) {
|
1526
|
+
if(ibehindj) x = i.next();
|
1527
|
+
else if(jbehindi) y = j.next();
|
1528
|
+
ibehindj = i.position() < j.position();
|
1529
|
+
jbehindi = j.position() < i.position();
|
1530
|
+
}
|
1531
|
+
size_t nextnonzero = i.position()< j.position() ?i.position(): j.position() ;
|
1532
|
+
if(nextnonzero > pos + 1) {
|
1533
|
+
container.addStreamOfEmptyWords(0, nextnonzero-pos-1);
|
1534
|
+
pos += nextnonzero-pos-1;
|
1535
|
+
}
|
1536
|
+
if(i.position() == j.position()) {
|
1537
|
+
container.add(x & y);
|
1538
|
+
++pos;
|
1539
|
+
}
|
1540
|
+
}
|
1541
|
+
container.setSizeInBits(sizeInBits());
|
1542
|
+
//return answer;
|
1543
|
+
}
|
1544
|
+
|
1545
|
+
|
1546
|
+
|
1547
|
+
template <class uword>
|
1548
|
+
void EWAHBoolArray<uword>::rawlogicalor(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1549
|
+
makeSameSize(a);
|
1550
|
+
container.reset();
|
1551
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()+a.buffer.size());
|
1552
|
+
assert(sizeInBits() == a.sizeInBits());
|
1553
|
+
EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
|
1554
|
+
EWAHBoolArrayRawIterator<uword> j = raw_iterator();
|
1555
|
+
if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
|
1556
|
+
container.setSizeInBits(sizeInBits());
|
1557
|
+
return;
|
1558
|
+
}
|
1559
|
+
// at this point, this should be safe:
|
1560
|
+
BufferedRunningLengthWord<uword> & rlwi = i.next();
|
1561
|
+
BufferedRunningLengthWord<uword> & rlwj = j.next();
|
1562
|
+
//RunningLength;
|
1563
|
+
while (true) {
|
1564
|
+
bool i_is_prey (rlwi.size()<rlwj.size());
|
1565
|
+
BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
|
1566
|
+
BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
|
1567
|
+
if(prey.getRunningBit() == 0) {
|
1568
|
+
// we have a stream of 0x00
|
1569
|
+
const uword predatorrl (predator.getRunningLength());
|
1570
|
+
const uword preyrl (prey.getRunningLength());
|
1571
|
+
if(predatorrl >= preyrl) {
|
1572
|
+
const uword tobediscarded = preyrl ;
|
1573
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1574
|
+
} else {
|
1575
|
+
const uword tobediscarded = predatorrl ;
|
1576
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1577
|
+
if(preyrl - tobediscarded>0) {
|
1578
|
+
const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
|
1579
|
+
container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
|
1580
|
+
}
|
1581
|
+
}
|
1582
|
+
predator.discardFirstWords(preyrl);
|
1583
|
+
prey.discardFirstWords(preyrl);
|
1584
|
+
} else {
|
1585
|
+
// we have a stream of 1x11
|
1586
|
+
const uword preyrl (prey.getRunningLength());
|
1587
|
+
predator.discardFirstWords(preyrl);
|
1588
|
+
prey.discardFirstWords(preyrl);
|
1589
|
+
container.addStreamOfEmptyWords(1, static_cast<size_t>(preyrl));
|
1590
|
+
}
|
1591
|
+
const uword predatorrl (predator.getRunningLength());
|
1592
|
+
if(predatorrl>0) {
|
1593
|
+
if(predator.getRunningBit() == 0) {
|
1594
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1595
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1596
|
+
if(tobediscarded>0) {
|
1597
|
+
const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
|
1598
|
+
container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
|
1599
|
+
predator.discardFirstWords(tobediscarded);
|
1600
|
+
prey.discardFirstWords(tobediscarded);
|
1601
|
+
}
|
1602
|
+
} else {
|
1603
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1604
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1605
|
+
predator.discardFirstWords(tobediscarded);
|
1606
|
+
prey.discardFirstWords(tobediscarded);
|
1607
|
+
container.addStreamOfEmptyWords(1, static_cast<size_t>(tobediscarded));
|
1608
|
+
}
|
1609
|
+
}
|
1610
|
+
assert(prey.getRunningLength() ==0);
|
1611
|
+
// all that is left to do now is to AND the dirty words
|
1612
|
+
uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1613
|
+
if(nbre_dirty_prey > 0) {
|
1614
|
+
assert(predator.getRunningLength() ==0);
|
1615
|
+
const uword * idirty = i.dirtyWords();
|
1616
|
+
const uword * jdirty = j.dirtyWords();
|
1617
|
+
for(uword k = 0; k< nbre_dirty_prey; ++k) {
|
1618
|
+
container.add(idirty[k] | jdirty[k]);
|
1619
|
+
}
|
1620
|
+
predator.discardFirstWords(nbre_dirty_prey);
|
1621
|
+
}
|
1622
|
+
if( i_is_prey ) {
|
1623
|
+
if(!i.hasNext()) break;
|
1624
|
+
rlwi = i.next();
|
1625
|
+
} else {
|
1626
|
+
if(!j.hasNext()) break;
|
1627
|
+
rlwj = j.next();
|
1628
|
+
}
|
1629
|
+
}
|
1630
|
+
container.setSizeInBits(sizeInBits());
|
1631
|
+
}
|
1632
|
+
|
1633
|
+
|
1634
|
+
template <class uword>
|
1635
|
+
void EWAHBoolArray<uword>::rawlogicaland(EWAHBoolArray &a, EWAHBoolArray &container) {
|
1636
|
+
makeSameSize(a);
|
1637
|
+
container.reset();
|
1638
|
+
if(RESERVEMEMORY) container.buffer.reserve(buffer.size()>a.buffer.size()?buffer.size():a.buffer.size());
|
1639
|
+
assert(sizeInBits() == a.sizeInBits());
|
1640
|
+
EWAHBoolArrayRawIterator<uword> i = a.raw_iterator();
|
1641
|
+
EWAHBoolArrayRawIterator<uword> j = raw_iterator();
|
1642
|
+
if(!(i.hasNext() and j.hasNext())) {// hopefully this never happens...
|
1643
|
+
container.setSizeInBits(sizeInBits());
|
1644
|
+
return;
|
1645
|
+
}
|
1646
|
+
// at this point, this should be safe:
|
1647
|
+
BufferedRunningLengthWord<uword> & rlwi = i.next();
|
1648
|
+
BufferedRunningLengthWord<uword> & rlwj = j.next();
|
1649
|
+
//RunningLength;
|
1650
|
+
while (true) {
|
1651
|
+
bool i_is_prey (rlwi.size()<rlwj.size());
|
1652
|
+
BufferedRunningLengthWord<uword> & prey ( i_is_prey ? rlwi: rlwj);
|
1653
|
+
BufferedRunningLengthWord<uword> & predator (i_is_prey ? rlwj: rlwi);
|
1654
|
+
if(prey.getRunningBit() == 0) {
|
1655
|
+
const uword preyrl (prey.getRunningLength());
|
1656
|
+
predator.discardFirstWords(preyrl);
|
1657
|
+
prey.discardFirstWords(preyrl);
|
1658
|
+
container.addStreamOfEmptyWords(0, static_cast<size_t>(preyrl));
|
1659
|
+
} else {
|
1660
|
+
// we have a stream of 1x11
|
1661
|
+
const uword predatorrl (predator.getRunningLength());
|
1662
|
+
const uword preyrl (prey.getRunningLength());
|
1663
|
+
const uword tobediscarded = (predatorrl >= preyrl) ? preyrl : predatorrl;
|
1664
|
+
container.addStreamOfEmptyWords(predator.getRunningBit(), static_cast<size_t>(tobediscarded));
|
1665
|
+
if(preyrl - tobediscarded>0) {
|
1666
|
+
const uword * dw_predator (i_is_prey ? j.dirtyWords(): i.dirtyWords());
|
1667
|
+
container.addStreamOfDirtyWords(dw_predator, static_cast<size_t>(preyrl - tobediscarded));
|
1668
|
+
}
|
1669
|
+
predator.discardFirstWords(preyrl);
|
1670
|
+
prey.discardFirstWords(preyrl);
|
1671
|
+
}
|
1672
|
+
const uword predatorrl (predator.getRunningLength());
|
1673
|
+
if(predatorrl>0) {
|
1674
|
+
if(predator.getRunningBit() == 0) {
|
1675
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1676
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1677
|
+
predator.discardFirstWords(tobediscarded);
|
1678
|
+
prey.discardFirstWords(tobediscarded);
|
1679
|
+
container.addStreamOfEmptyWords(0, static_cast<size_t>(tobediscarded));
|
1680
|
+
} else {
|
1681
|
+
const uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1682
|
+
const uword tobediscarded = (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey : predatorrl;
|
1683
|
+
if(tobediscarded>0) {
|
1684
|
+
const uword * dw_prey (i_is_prey ? i.dirtyWords(): j.dirtyWords());
|
1685
|
+
container.addStreamOfDirtyWords(dw_prey, static_cast<size_t>(tobediscarded));
|
1686
|
+
predator.discardFirstWords(tobediscarded);
|
1687
|
+
prey.discardFirstWords(tobediscarded);
|
1688
|
+
}
|
1689
|
+
}
|
1690
|
+
}
|
1691
|
+
assert(prey.getRunningLength() ==0);
|
1692
|
+
// all that is left to do now is to AND the dirty words
|
1693
|
+
uword nbre_dirty_prey(prey.getNumberOfLiteralWords());
|
1694
|
+
if(nbre_dirty_prey > 0) {
|
1695
|
+
assert(predator.getRunningLength() ==0);
|
1696
|
+
const uword * idirty = i.dirtyWords();
|
1697
|
+
const uword * jdirty = j.dirtyWords();
|
1698
|
+
for(uword k = 0; k< nbre_dirty_prey; ++k) {
|
1699
|
+
container.add(idirty[k] & jdirty[k]);
|
1700
|
+
}
|
1701
|
+
predator.discardFirstWords(nbre_dirty_prey);
|
1702
|
+
}
|
1703
|
+
if( i_is_prey ) {
|
1704
|
+
if(!i.hasNext()) break;
|
1705
|
+
rlwi = i.next();
|
1706
|
+
} else {
|
1707
|
+
if(!j.hasNext()) break;
|
1708
|
+
rlwj = j.next();
|
1709
|
+
}
|
1710
|
+
}
|
1711
|
+
container.setSizeInBits(sizeInBits());
|
1712
|
+
}
|
1713
|
+
|
1714
|
+
|
1715
|
+
|
1716
|
+
|
1717
|
+
template <class uword>
|
1718
|
+
BitmapStatistics EWAHBoolArray<uword>::computeStatistics() const {
|
1719
|
+
//uint totalcompressed(0), totalliteral(0);
|
1720
|
+
BitmapStatistics bs;
|
1721
|
+
EWAHBoolArrayRawIterator<uword> i = raw_iterator();
|
1722
|
+
while(i.hasNext()) {
|
1723
|
+
BufferedRunningLengthWord<uword> &brlw (i.next());
|
1724
|
+
++bs.runningwordmarker;
|
1725
|
+
bs.totalliteral += brlw.getNumberOfLiteralWords();
|
1726
|
+
bs.totalcompressed += brlw.getRunningLength();
|
1727
|
+
if(brlw.getRunningLength() == RunningLengthWord<uword>::largestrunninglengthcount) {
|
1728
|
+
++bs.maximumofrunningcounterreached;
|
1729
|
+
}
|
1730
|
+
}
|
1731
|
+
return bs;
|
1732
|
+
}
|
1733
|
+
|
1734
|
+
|
1735
|
+
template <class uword>
|
1736
|
+
void EWAHBoolArray<uword>::debugprintout() const {
|
1737
|
+
cout << "==printing out EWAHBoolArray=="<<endl;
|
1738
|
+
cout <<"Number of compressed words: "<< buffer.size()<< endl;
|
1739
|
+
size_t pointer = 0;
|
1740
|
+
while(pointer <buffer.size()) {
|
1741
|
+
ConstRunningLengthWord<uword> rlw(buffer[pointer]);
|
1742
|
+
bool b = rlw.getRunningBit() ;
|
1743
|
+
uword rl = rlw.getRunningLength() ;
|
1744
|
+
uword lw = rlw.getNumberOfLiteralWords();
|
1745
|
+
cout << "pointer = "<<pointer<<" running bit="<<b<<" running length="<<rl<<" lit. words="<<lw<<endl;
|
1746
|
+
for(uword j = 0; j < lw ; ++j) {
|
1747
|
+
const uword & w = buffer[pointer+j+1];
|
1748
|
+
cout<<toBinaryString(w)<<endl;;
|
1749
|
+
}
|
1750
|
+
pointer += lw + 1;
|
1751
|
+
}
|
1752
|
+
cout << "==END=="<<endl;
|
1753
|
+
}
|
1754
|
+
|
1755
|
+
template <class uword>
|
1756
|
+
size_t EWAHBoolArray<uword>::sizeOnDisk() const {
|
1757
|
+
return sizeof(sizeinbits)+sizeof(size_t)+sizeof(uword)*buffer.size();
|
1758
|
+
}
|
1759
|
+
|
1760
|
+
|
1761
|
+
|
1762
|
+
|
1763
|
+
#endif
|