datasketches 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
@@ -49,12 +49,12 @@ cpc_compressor<A>::~cpc_compressor() {
|
|
49
49
|
}
|
50
50
|
|
51
51
|
template<typename A>
|
52
|
-
uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu,
|
52
|
+
uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, unsigned length) {
|
53
53
|
uint8_t* inverse = new uint8_t[length]; // use new for global initialization
|
54
|
-
for (
|
55
|
-
inverse[permu[i]] = i;
|
54
|
+
for (unsigned i = 0; i < length; i++) {
|
55
|
+
inverse[permu[i]] = static_cast<uint8_t>(i);
|
56
56
|
}
|
57
|
-
for (
|
57
|
+
for (unsigned i = 0; i < length; i++) {
|
58
58
|
if (permu[inverse[i]] != i) throw std::logic_error("inverse permutation error");
|
59
59
|
}
|
60
60
|
return inverse;
|
@@ -64,17 +64,17 @@ uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, int l
|
|
64
64
|
of length at most 12, this builds a size-4096 decoding table */
|
65
65
|
// The second argument is typically 256, but can be other values such as 65.
|
66
66
|
template<typename A>
|
67
|
-
uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table,
|
67
|
+
uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values) {
|
68
68
|
uint16_t* decoding_table = new uint16_t[4096]; // use new for global initialization
|
69
|
-
for (
|
70
|
-
const
|
71
|
-
const
|
72
|
-
const
|
73
|
-
const
|
74
|
-
const
|
75
|
-
const
|
76
|
-
for (
|
77
|
-
const
|
69
|
+
for (unsigned byte_value = 0; byte_value < num_byte_values; byte_value++) {
|
70
|
+
const uint16_t encoding_entry = encoding_table[byte_value];
|
71
|
+
const uint16_t code_value = encoding_entry & 0xfff;
|
72
|
+
const uint8_t code_length = encoding_entry >> 12;
|
73
|
+
const uint16_t decoding_entry = static_cast<uint16_t>((code_length << 8) | byte_value);
|
74
|
+
const uint8_t garbage_length = 12 - code_length;
|
75
|
+
const uint32_t num_copies = 1 << garbage_length;
|
76
|
+
for (uint32_t garbage_bits = 0; garbage_bits < num_copies; garbage_bits++) {
|
77
|
+
const uint16_t extended_code_value = static_cast<uint16_t>(code_value | (garbage_bits << code_length));
|
78
78
|
decoding_table[extended_code_value & 0xfff] = decoding_entry;
|
79
79
|
}
|
80
80
|
}
|
@@ -157,7 +157,7 @@ void cpc_compressor<A>::compress(const cpc_sketch_alloc<A>& source, compressed_s
|
|
157
157
|
}
|
158
158
|
|
159
159
|
template<typename A>
|
160
|
-
void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k,
|
160
|
+
void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
|
161
161
|
switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
|
162
162
|
case cpc_sketch_alloc<A>::flavor::EMPTY:
|
163
163
|
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
@@ -202,16 +202,17 @@ template<typename A>
|
|
202
202
|
void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
|
203
203
|
if (source.sliding_window.size() == 0) throw std::logic_error("no sliding window");
|
204
204
|
if (source.window_offset != 0) throw std::logic_error("window_offset != 0");
|
205
|
-
const
|
205
|
+
const uint32_t k = 1 << source.get_lg_k();
|
206
206
|
vector_u32<A> pairs_from_table = source.surprising_value_table.unwrapping_get_items();
|
207
|
-
|
208
|
-
|
207
|
+
const uint32_t num_pairs_from_table = static_cast<uint32_t>(pairs_from_table.size());
|
208
|
+
if (num_pairs_from_table > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, num_pairs_from_table);
|
209
|
+
const uint32_t num_pairs_from_window = source.get_num_coupons() - num_pairs_from_table; // because the window offset is zero
|
209
210
|
|
210
|
-
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window,
|
211
|
+
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
|
211
212
|
|
212
213
|
u32_table<A>::merge(
|
213
214
|
pairs_from_table.data(), 0, pairs_from_table.size(),
|
214
|
-
all_pairs.data(),
|
215
|
+
all_pairs.data(), num_pairs_from_table, num_pairs_from_window,
|
215
216
|
all_pairs.data(), 0
|
216
217
|
); // note the overlapping subarray trick
|
217
218
|
|
@@ -228,15 +229,15 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
|
|
228
229
|
// In the hybrid flavor, some of these pairs actually
|
229
230
|
// belong in the window, so we will separate them out,
|
230
231
|
// moving the "true" pairs to the bottom of the array.
|
231
|
-
const
|
232
|
+
const uint32_t k = 1 << lg_k;
|
232
233
|
target.window.resize(k, 0); // important: zero the memory
|
233
|
-
|
234
|
-
for (
|
234
|
+
uint32_t next_true_pair = 0;
|
235
|
+
for (uint32_t i = 0; i < source.table_num_entries; i++) {
|
235
236
|
const uint32_t row_col = pairs[i];
|
236
237
|
if (row_col == UINT32_MAX) throw std::logic_error("empty marker is not expected");
|
237
238
|
const uint8_t col = row_col & 63;
|
238
239
|
if (col < 8) {
|
239
|
-
const
|
240
|
+
const uint32_t row = row_col >> 6;
|
240
241
|
target.window[row] |= 1 << col; // set the window bit
|
241
242
|
} else {
|
242
243
|
pairs[next_true_pair++] = row_col; // move true pair down
|
@@ -270,7 +271,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
|
|
270
271
|
uint8_t lg_k, uint32_t num_coupons) const {
|
271
272
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
272
273
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
273
|
-
const
|
274
|
+
const uint32_t num_pairs = source.table_num_entries;
|
274
275
|
if (num_pairs == 0) {
|
275
276
|
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
276
277
|
} else {
|
@@ -278,7 +279,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
|
|
278
279
|
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
279
280
|
lg_k, source.table_data.get_allocator());
|
280
281
|
// undo the compressor's 8-column shift
|
281
|
-
for (
|
282
|
+
for (uint32_t i = 0; i < num_pairs; i++) {
|
282
283
|
if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
|
283
284
|
pairs[i] += 8;
|
284
285
|
}
|
@@ -302,7 +303,7 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
|
|
302
303
|
|
303
304
|
for (size_t i = 0; i < pairs.size(); i++) {
|
304
305
|
const uint32_t row_col = pairs[i];
|
305
|
-
const
|
306
|
+
const uint32_t row = row_col >> 6;
|
306
307
|
uint8_t col = row_col & 63;
|
307
308
|
// first rotate the columns into a canonical configuration: new = ((old - (offset+8)) + 64) mod 64
|
308
309
|
col = (col + 56 - offset) & 63;
|
@@ -322,7 +323,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
322
323
|
uint8_t lg_k, uint32_t num_coupons) const {
|
323
324
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
324
325
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
325
|
-
const
|
326
|
+
const uint32_t num_pairs = source.table_num_entries;
|
326
327
|
if (num_pairs == 0) {
|
327
328
|
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
328
329
|
} else {
|
@@ -337,9 +338,9 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
337
338
|
uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
|
338
339
|
if (offset > 56) throw std::out_of_range("offset out of range");
|
339
340
|
|
340
|
-
for (
|
341
|
+
for (uint32_t i = 0; i < num_pairs; i++) {
|
341
342
|
const uint32_t row_col = pairs[i];
|
342
|
-
const
|
343
|
+
const uint32_t row = row_col >> 6;
|
343
344
|
uint8_t col = row_col & 63;
|
344
345
|
// first undo the permutation
|
345
346
|
col = permutation[col];
|
@@ -354,25 +355,26 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
354
355
|
|
355
356
|
template<typename A>
|
356
357
|
void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const {
|
357
|
-
const
|
358
|
-
const
|
359
|
-
const
|
358
|
+
const uint32_t k = 1 << lg_k;
|
359
|
+
const uint32_t num_pairs = static_cast<uint32_t>(pairs.size());
|
360
|
+
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
361
|
+
const uint64_t table_len = safe_length_for_compressed_pair_buf(k, num_pairs, num_base_bits);
|
360
362
|
result.table_data.resize(table_len);
|
361
363
|
|
362
|
-
|
364
|
+
uint32_t csv_length = low_level_compress_pairs(pairs.data(), static_cast<uint32_t>(pairs.size()), num_base_bits, result.table_data.data());
|
363
365
|
|
364
366
|
// At this point we could free the unused portion of the compression output buffer,
|
365
367
|
// but it is not necessary if it is temporary
|
366
368
|
// Note: realloc caused strange timing spikes for lgK = 11 and 12.
|
367
369
|
|
368
370
|
result.table_data_words = csv_length;
|
369
|
-
result.table_num_entries =
|
371
|
+
result.table_num_entries = num_pairs;
|
370
372
|
}
|
371
373
|
|
372
374
|
template<typename A>
|
373
|
-
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data,
|
375
|
+
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
|
374
376
|
uint8_t lg_k, const A& allocator) const {
|
375
|
-
const
|
377
|
+
const uint32_t k = 1 << lg_k;
|
376
378
|
vector_u32<A> pairs(num_pairs, 0, allocator);
|
377
379
|
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
378
380
|
low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
|
@@ -381,7 +383,7 @@ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* da
|
|
381
383
|
|
382
384
|
template<typename A>
|
383
385
|
void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const {
|
384
|
-
const
|
386
|
+
const uint32_t k = 1 << lg_k;
|
385
387
|
const size_t window_buf_len = safe_length_for_compressed_window_buf(k);
|
386
388
|
target.window_data.resize(window_buf_len);
|
387
389
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
@@ -391,20 +393,20 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
|
|
391
393
|
// but it is not necessary if it is temporary
|
392
394
|
// Note: realloc caused strange timing spikes for lgK = 11 and 12.
|
393
395
|
|
394
|
-
target.window_data_words = data_words;
|
396
|
+
target.window_data_words = static_cast<uint32_t>(data_words);
|
395
397
|
}
|
396
398
|
|
397
399
|
template<typename A>
|
398
|
-
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data,
|
400
|
+
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window,
|
399
401
|
uint8_t lg_k, uint32_t num_coupons) const {
|
400
|
-
const
|
402
|
+
const uint32_t k = 1 << lg_k;
|
401
403
|
window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
|
402
404
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
403
405
|
low_level_uncompress_bytes(window.data(), k, decoding_tables_for_high_entropy_byte[pseudo_phase], data, data_words);
|
404
406
|
}
|
405
407
|
|
406
408
|
template<typename A>
|
407
|
-
size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(
|
409
|
+
size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits) {
|
408
410
|
// Long ybits = k + numPairs; // simpler and safer UB
|
409
411
|
// The following tighter UB on ybits is based on page 198
|
410
412
|
// of the textbook "Managing Gigabytes" by Witten, Moffat, and Bell.
|
@@ -422,14 +424,14 @@ size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint64_t k, size_t
|
|
422
424
|
// So the 12-bit lookahead is the tight constraint, but there are at least (2 + B) bits emitted,
|
423
425
|
// so we would be safe with max (0, 10 - B) bits of padding at the end of the bitstream.
|
424
426
|
template<typename A>
|
425
|
-
size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(
|
427
|
+
size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(uint32_t k) { // measured in 32-bit words
|
426
428
|
const size_t bits = 12 * k + 11; // 11 bits of padding, due to 12-bit lookahead, with 1 bit certainly present.
|
427
429
|
return divide_longs_rounding_up(bits, 32);
|
428
430
|
}
|
429
431
|
|
430
432
|
template<typename A>
|
431
|
-
uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k,
|
432
|
-
const
|
433
|
+
uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint32_t c) {
|
434
|
+
const uint32_t k = 1 << lg_k;
|
433
435
|
// This mid-range logic produces pseudo-phases. They are used to select encoding tables.
|
434
436
|
// The thresholds were chosen by hand after looking at plots of measured compression.
|
435
437
|
if (1000 * c < 2375 * k) {
|
@@ -450,7 +452,7 @@ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint64_t c) {
|
|
450
452
|
}
|
451
453
|
}
|
452
454
|
|
453
|
-
static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr,
|
455
|
+
static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr, uint32_t& wordindex) {
|
454
456
|
if (bufbits >= 32) {
|
455
457
|
wordarr[wordindex++] = bitbuf & 0xffffffff;
|
456
458
|
bitbuf = bitbuf >> 32;
|
@@ -458,7 +460,7 @@ static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32
|
|
458
460
|
}
|
459
461
|
}
|
460
462
|
|
461
|
-
static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr,
|
463
|
+
static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr, uint32_t& wordindex, uint8_t minbits) {
|
462
464
|
if (bufbits < minbits) {
|
463
465
|
bitbuf |= static_cast<uint64_t>(wordarr[wordindex++]) << bufbits;
|
464
466
|
bufbits += 32;
|
@@ -468,20 +470,20 @@ static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const u
|
|
468
470
|
// This returns the number of compressed words that were actually used.
|
469
471
|
// It is the caller's responsibility to ensure that the compressed_words array is long enough.
|
470
472
|
template<typename A>
|
471
|
-
|
473
|
+
uint32_t cpc_compressor<A>::low_level_compress_bytes(
|
472
474
|
const uint8_t* byte_array, // input
|
473
|
-
|
475
|
+
uint32_t num_bytes_to_encode,
|
474
476
|
const uint16_t* encoding_table,
|
475
477
|
uint32_t* compressed_words // output
|
476
478
|
) const {
|
477
479
|
uint64_t bitbuf = 0; // bits are packed into this first, then are flushed to compressed_words
|
478
480
|
uint8_t bufbits = 0; // number of bits currently in bitbuf; must be between 0 and 31
|
479
|
-
|
481
|
+
uint32_t next_word_index = 0;
|
480
482
|
|
481
|
-
for (
|
482
|
-
const
|
483
|
+
for (uint32_t byte_index = 0; byte_index < num_bytes_to_encode; byte_index++) {
|
484
|
+
const uint16_t code_info = encoding_table[byte_array[byte_index]];
|
483
485
|
const uint64_t code_val = code_info & 0xfff;
|
484
|
-
const
|
486
|
+
const uint8_t code_len = code_info >> 12;
|
485
487
|
bitbuf |= (code_val << bufbits);
|
486
488
|
bufbits += code_len;
|
487
489
|
maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
|
@@ -502,12 +504,12 @@ size_t cpc_compressor<A>::low_level_compress_bytes(
|
|
502
504
|
template<typename A>
|
503
505
|
void cpc_compressor<A>::low_level_uncompress_bytes(
|
504
506
|
uint8_t* byte_array, // output
|
505
|
-
|
507
|
+
uint32_t num_bytes_to_decode,
|
506
508
|
const uint16_t* decoding_table,
|
507
509
|
const uint32_t* compressed_words, // input
|
508
|
-
|
510
|
+
uint32_t num_compressed_words
|
509
511
|
) const {
|
510
|
-
|
512
|
+
uint32_t word_index = 0;
|
511
513
|
uint64_t bitbuf = 0;
|
512
514
|
uint8_t bufbits = 0;
|
513
515
|
|
@@ -515,7 +517,7 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
|
|
515
517
|
if (decoding_table == nullptr) throw std::logic_error("decoding_table == NULL");
|
516
518
|
if (compressed_words == nullptr) throw std::logic_error("compressed_words == NULL");
|
517
519
|
|
518
|
-
for (
|
520
|
+
for (uint32_t byte_index = 0; byte_index < num_bytes_to_decode; byte_index++) {
|
519
521
|
maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
|
520
522
|
|
521
523
|
const size_t peek12 = bitbuf & 0xfff; // These 12 bits will include an entire Huffman codeword.
|
@@ -533,14 +535,14 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
|
|
533
535
|
|
534
536
|
static inline uint64_t read_unary(
|
535
537
|
const uint32_t* compressed_words,
|
536
|
-
|
538
|
+
uint32_t& next_word_index,
|
537
539
|
uint64_t& bitbuf,
|
538
540
|
uint8_t& bufbits
|
539
541
|
);
|
540
542
|
|
541
543
|
static inline void write_unary(
|
542
544
|
uint32_t* compressed_words,
|
543
|
-
|
545
|
+
uint32_t& next_word_index_ptr,
|
544
546
|
uint64_t& bit_buf_ptr,
|
545
547
|
uint8_t& buf_bits_ptr,
|
546
548
|
uint64_t value
|
@@ -551,38 +553,38 @@ static inline void write_unary(
|
|
551
553
|
|
552
554
|
// returns the number of compressed_words actually used
|
553
555
|
template<typename A>
|
554
|
-
|
556
|
+
uint32_t cpc_compressor<A>::low_level_compress_pairs(
|
555
557
|
const uint32_t* pair_array, // input
|
556
|
-
|
557
|
-
|
558
|
+
uint32_t num_pairs_to_encode,
|
559
|
+
uint8_t num_base_bits,
|
558
560
|
uint32_t* compressed_words // output
|
559
561
|
) const {
|
560
562
|
uint64_t bitbuf = 0;
|
561
563
|
uint8_t bufbits = 0;
|
562
|
-
|
564
|
+
uint32_t next_word_index = 0;
|
563
565
|
const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
|
564
|
-
|
565
|
-
|
566
|
+
uint32_t predicted_row_index = 0;
|
567
|
+
uint8_t predicted_col_index = 0;
|
566
568
|
|
567
|
-
for (
|
569
|
+
for (uint32_t pair_index = 0; pair_index < num_pairs_to_encode; pair_index++) {
|
568
570
|
const uint32_t row_col = pair_array[pair_index];
|
569
|
-
const
|
570
|
-
const
|
571
|
+
const uint32_t row_index = row_col >> 6;
|
572
|
+
const uint8_t col_index = row_col & 63;
|
571
573
|
|
572
574
|
if (row_index != predicted_row_index) predicted_col_index = 0;
|
573
575
|
|
574
576
|
if (row_index < predicted_row_index) throw std::logic_error("row_index < predicted_row_index");
|
575
577
|
if (col_index < predicted_col_index) throw std::logic_error("col_index < predicted_col_index");
|
576
578
|
|
577
|
-
const
|
578
|
-
const
|
579
|
+
const uint32_t y_delta = row_index - predicted_row_index;
|
580
|
+
const uint8_t x_delta = col_index - predicted_col_index;
|
579
581
|
|
580
582
|
predicted_row_index = row_index;
|
581
583
|
predicted_col_index = col_index + 1;
|
582
584
|
|
583
|
-
const
|
585
|
+
const uint16_t code_info = length_limited_unary_encoding_table65[x_delta];
|
584
586
|
const uint64_t code_val = code_info & 0xfff;
|
585
|
-
const uint8_t code_len = code_info >> 12;
|
587
|
+
const uint8_t code_len = static_cast<uint8_t>(code_info >> 12);
|
586
588
|
bitbuf |= code_val << bufbits;
|
587
589
|
bufbits += code_len;
|
588
590
|
maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
|
@@ -614,29 +616,29 @@ size_t cpc_compressor<A>::low_level_compress_pairs(
|
|
614
616
|
template<typename A>
|
615
617
|
void cpc_compressor<A>::low_level_uncompress_pairs(
|
616
618
|
uint32_t* pair_array, // output
|
617
|
-
|
618
|
-
|
619
|
+
uint32_t num_pairs_to_decode,
|
620
|
+
uint8_t num_base_bits,
|
619
621
|
const uint32_t* compressed_words, // input
|
620
|
-
|
622
|
+
uint32_t num_compressed_words
|
621
623
|
) const {
|
622
|
-
|
624
|
+
uint32_t word_index = 0;
|
623
625
|
uint64_t bitbuf = 0;
|
624
626
|
uint8_t bufbits = 0;
|
625
627
|
const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
|
626
|
-
|
627
|
-
|
628
|
+
uint32_t predicted_row_index = 0;
|
629
|
+
uint8_t predicted_col_index = 0;
|
628
630
|
|
629
631
|
// for each pair we need to read:
|
630
632
|
// x_delta (12-bit length-limited unary)
|
631
633
|
// y_delta_hi (unary)
|
632
634
|
// y_delta_lo (basebits)
|
633
635
|
|
634
|
-
for (
|
636
|
+
for (uint32_t pair_index = 0; pair_index < num_pairs_to_decode; pair_index++) {
|
635
637
|
maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
|
636
638
|
const size_t peek12 = bitbuf & 0xfff;
|
637
639
|
const uint16_t lookup = length_limited_unary_decoding_table65[peek12];
|
638
|
-
const
|
639
|
-
const
|
640
|
+
const uint8_t code_word_length = lookup >> 8;
|
641
|
+
const int8_t x_delta = lookup & 0xff;
|
640
642
|
bitbuf >>= code_word_length;
|
641
643
|
bufbits -= code_word_length;
|
642
644
|
|
@@ -650,8 +652,8 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
|
|
650
652
|
|
651
653
|
// Now that we have x_delta and y_delta, we can compute the pair's row and column
|
652
654
|
if (y_delta > 0) predicted_col_index = 0;
|
653
|
-
const
|
654
|
-
const
|
655
|
+
const uint32_t row_index = static_cast<uint32_t>(predicted_row_index + y_delta);
|
656
|
+
const uint8_t col_index = predicted_col_index + x_delta;
|
655
657
|
const uint32_t row_col = (row_index << 6) | col_index;
|
656
658
|
pair_array[pair_index] = row_col;
|
657
659
|
predicted_row_index = row_index;
|
@@ -662,7 +664,7 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
|
|
662
664
|
|
663
665
|
uint64_t read_unary(
|
664
666
|
const uint32_t* compressed_words,
|
665
|
-
|
667
|
+
uint32_t& next_word_index,
|
666
668
|
uint64_t& bitbuf,
|
667
669
|
uint8_t& bufbits
|
668
670
|
) {
|
@@ -689,7 +691,7 @@ uint64_t read_unary(
|
|
689
691
|
|
690
692
|
void write_unary(
|
691
693
|
uint32_t* compressed_words,
|
692
|
-
|
694
|
+
uint32_t& next_word_index,
|
693
695
|
uint64_t& bitbuf,
|
694
696
|
uint8_t& bufbits,
|
695
697
|
uint64_t value
|
@@ -709,9 +711,9 @@ void write_unary(
|
|
709
711
|
|
710
712
|
if (remaining > 15) throw std::out_of_range("remaining out of range");
|
711
713
|
|
712
|
-
const uint64_t the_unary_code =
|
714
|
+
const uint64_t the_unary_code = 1ULL << remaining;
|
713
715
|
bitbuf |= the_unary_code << bufbits;
|
714
|
-
bufbits +=
|
716
|
+
bufbits += static_cast<uint8_t>(remaining + 1);
|
715
717
|
maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
|
716
718
|
}
|
717
719
|
|
@@ -738,12 +740,12 @@ vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* win
|
|
738
740
|
// returns an integer that is between
|
739
741
|
// zero and ceiling(log_2(k)) - 1, inclusive
|
740
742
|
template<typename A>
|
741
|
-
|
743
|
+
uint8_t cpc_compressor<A>::golomb_choose_number_of_base_bits(uint32_t k, uint64_t count) {
|
742
744
|
if (k < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: k < 1");
|
743
745
|
if (count < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: count < 1");
|
744
746
|
const uint64_t quotient = (k - count) / count; // integer division
|
745
747
|
if (quotient == 0) return 0;
|
746
|
-
else return
|
748
|
+
else return floor_log2_of_long(quotient);
|
747
749
|
}
|
748
750
|
|
749
751
|
} /* namespace datasketches */
|