datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
#ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
|
|
21
21
|
#define COMPACT_THETA_SKETCH_PARSER_HPP_
|
|
22
22
|
|
|
23
|
-
#include <
|
|
23
|
+
#include <cstdint>
|
|
24
24
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
@@ -33,7 +33,8 @@ public:
|
|
|
33
33
|
uint16_t seed_hash;
|
|
34
34
|
uint32_t num_entries;
|
|
35
35
|
uint64_t theta;
|
|
36
|
-
const
|
|
36
|
+
const void* entries_start_ptr;
|
|
37
|
+
uint8_t entry_bits;
|
|
37
38
|
};
|
|
38
39
|
|
|
39
40
|
static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
|
|
@@ -45,18 +46,23 @@ private:
|
|
|
45
46
|
static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
|
|
46
47
|
static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
|
|
47
48
|
static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
|
|
48
|
-
static const size_t
|
|
49
|
-
static const size_t
|
|
50
|
-
static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
|
|
51
|
-
static const size_t
|
|
52
|
-
static const size_t
|
|
49
|
+
static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1; // ver 3
|
|
50
|
+
static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2; // ver 1-3
|
|
51
|
+
static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2; // ver 1-3
|
|
52
|
+
static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3; // ver 1-3
|
|
53
|
+
static const size_t COMPACT_SKETCH_THETA_U64 = 2; // ver 1-3
|
|
54
|
+
static const size_t COMPACT_SKETCH_V4_ENTRY_BITS_BYTE = 3;
|
|
55
|
+
static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE = 4;
|
|
56
|
+
static const size_t COMPACT_SKETCH_V4_THETA_U64 = 1;
|
|
57
|
+
static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE = 8;
|
|
58
|
+
static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE = 16;
|
|
53
59
|
|
|
54
60
|
static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
|
|
55
61
|
static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
|
|
56
62
|
|
|
57
|
-
static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
|
|
58
63
|
static const uint8_t COMPACT_SKETCH_TYPE = 3;
|
|
59
64
|
|
|
65
|
+
static void check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error);
|
|
60
66
|
static std::string hex_dump(const uint8_t* ptr, size_t size);
|
|
61
67
|
};
|
|
62
68
|
|
|
@@ -26,106 +26,120 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
+
template<typename T>
|
|
30
|
+
T whole_bytes_to_hold_bits(T bits) {
|
|
31
|
+
static_assert(std::is_integral<T>::value, "integral type expected");
|
|
32
|
+
return (bits >> 3) + ((bits & 7) > 0);
|
|
33
|
+
}
|
|
34
|
+
|
|
29
35
|
template<bool dummy>
|
|
30
36
|
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
37
|
+
check_memory_size(ptr, size, 8, dump_on_error);
|
|
38
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
34
39
|
uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
|
|
35
|
-
|
|
36
40
|
switch(serial_version) {
|
|
37
|
-
case
|
|
38
|
-
|
|
41
|
+
case 4: {
|
|
42
|
+
// version 4 sketches are ordered and always have entries (single item in exact mode is v3)
|
|
43
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
44
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
45
|
+
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 1;
|
|
46
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
47
|
+
if (has_theta) {
|
|
48
|
+
check_memory_size(ptr, size, 16, dump_on_error);
|
|
49
|
+
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
|
|
50
|
+
}
|
|
51
|
+
const uint8_t num_entries_bytes = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE];
|
|
52
|
+
size_t data_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE;
|
|
53
|
+
check_memory_size(ptr, size, data_offset_bytes + num_entries_bytes, dump_on_error);
|
|
54
|
+
uint32_t num_entries = 0;
|
|
55
|
+
const uint8_t* num_entries_ptr = reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes;
|
|
56
|
+
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
|
57
|
+
num_entries |= (*num_entries_ptr++) << (i << 3);
|
|
58
|
+
}
|
|
59
|
+
data_offset_bytes += num_entries_bytes;
|
|
60
|
+
const uint8_t entry_bits = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_ENTRY_BITS_BYTE];
|
|
61
|
+
const size_t expected_bits = entry_bits * num_entries;
|
|
62
|
+
const size_t expected_size_bytes = data_offset_bytes + whole_bytes_to_hold_bits(expected_bits);
|
|
63
|
+
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
|
|
64
|
+
return {false, true, seed_hash, num_entries, theta,
|
|
65
|
+
reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes, entry_bits};
|
|
66
|
+
}
|
|
67
|
+
case 3: {
|
|
39
68
|
uint64_t theta = theta_constants::MAX_THETA;
|
|
40
69
|
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
41
70
|
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
|
|
42
|
-
return {true, true, seed_hash, 0, theta, nullptr};
|
|
71
|
+
return {true, true, seed_hash, 0, theta, nullptr, 64};
|
|
43
72
|
}
|
|
44
73
|
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
45
74
|
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
|
|
46
75
|
if (has_theta) {
|
|
47
|
-
|
|
76
|
+
check_memory_size(ptr, size, (COMPACT_SKETCH_THETA_U64 + 1) * sizeof(uint64_t), dump_on_error);
|
|
48
77
|
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
49
78
|
}
|
|
50
79
|
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
|
|
51
|
-
|
|
52
|
-
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
|
|
80
|
+
check_memory_size(ptr, size, 16, dump_on_error);
|
|
81
|
+
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64, 64};
|
|
53
82
|
}
|
|
54
83
|
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
55
84
|
const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
56
85
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
|
|
57
86
|
const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
|
|
58
|
-
|
|
59
|
-
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
60
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
61
|
-
}
|
|
87
|
+
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
|
|
62
88
|
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
|
63
|
-
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
|
89
|
+
return {false, is_ordered, seed_hash, num_entries, theta, entries, 64};
|
|
64
90
|
}
|
|
65
91
|
case 1: {
|
|
66
92
|
uint16_t seed_hash = compute_seed_hash(seed);
|
|
67
|
-
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
68
93
|
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
69
94
|
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
70
95
|
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
71
|
-
if (is_empty) {
|
|
72
|
-
return {true, true, seed_hash, 0, theta, nullptr};
|
|
73
|
-
}
|
|
96
|
+
if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
|
|
74
97
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
75
98
|
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
79
|
-
}
|
|
80
|
-
return {false, true, seed_hash, num_entries, theta, entries};
|
|
99
|
+
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
|
|
100
|
+
return {false, true, seed_hash, num_entries, theta, entries, 64};
|
|
81
101
|
}
|
|
82
102
|
case 2: {
|
|
83
|
-
uint8_t preamble_size =
|
|
84
|
-
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
103
|
+
uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
|
|
85
104
|
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
86
105
|
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
87
106
|
if (preamble_size == 1) {
|
|
88
|
-
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
107
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
|
|
89
108
|
} else if (preamble_size == 2) {
|
|
90
109
|
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
91
110
|
if (num_entries == 0) {
|
|
92
|
-
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
111
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
|
|
93
112
|
} else {
|
|
94
113
|
const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
|
|
95
|
-
|
|
96
|
-
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
97
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
98
|
-
}
|
|
114
|
+
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
|
|
99
115
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
100
|
-
return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
|
|
116
|
+
return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries, 64};
|
|
101
117
|
}
|
|
102
118
|
} else if (preamble_size == 3) {
|
|
103
119
|
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
104
120
|
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
105
121
|
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
106
|
-
if (is_empty) {
|
|
107
|
-
return {true, true, seed_hash, 0, theta, nullptr};
|
|
108
|
-
}
|
|
122
|
+
if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
|
|
109
123
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
110
124
|
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
114
|
-
}
|
|
115
|
-
return {false, true, seed_hash, num_entries, theta, entries};
|
|
125
|
+
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
|
|
126
|
+
return {false, true, seed_hash, num_entries, theta, entries, 64};
|
|
116
127
|
} else {
|
|
117
128
|
throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
|
|
118
129
|
}
|
|
119
130
|
}
|
|
120
131
|
default:
|
|
121
|
-
|
|
122
|
-
checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
|
|
123
|
-
// this throw is never reached, because check_serial_version will throw an informative exception.
|
|
124
|
-
// This is only here to avoid a compiler warning about a path without a return value.
|
|
125
|
-
throw std::invalid_argument("unexpected sketch serialization version");
|
|
132
|
+
throw std::invalid_argument("unsupported serial version " + std::to_string(serial_version));
|
|
126
133
|
}
|
|
127
134
|
}
|
|
128
135
|
|
|
136
|
+
template<bool dummy>
|
|
137
|
+
void compact_theta_sketch_parser<dummy>::check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error) {
|
|
138
|
+
if (actual_bytes < expected_bytes) throw std::out_of_range("at least " + std::to_string(expected_bytes)
|
|
139
|
+
+ " bytes expected, actual " + std::to_string(actual_bytes)
|
|
140
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), actual_bytes)) : ""));
|
|
141
|
+
}
|
|
142
|
+
|
|
129
143
|
template<bool dummy>
|
|
130
144
|
std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
|
|
131
145
|
std::stringstream s;
|
|
@@ -20,8 +20,10 @@
|
|
|
20
20
|
#ifndef THETA_HELPERS_HPP_
|
|
21
21
|
#define THETA_HELPERS_HPP_
|
|
22
22
|
|
|
23
|
-
#include <string>
|
|
24
23
|
#include <stdexcept>
|
|
24
|
+
#include <string>
|
|
25
|
+
|
|
26
|
+
#include "theta_constants.hpp"
|
|
25
27
|
|
|
26
28
|
namespace datasketches {
|
|
27
29
|
|
|
@@ -55,7 +57,7 @@ public:
|
|
|
55
57
|
// consistent way of initializing theta from p
|
|
56
58
|
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
|
57
59
|
static uint64_t starting_theta_from_p(float p) {
|
|
58
|
-
if (p < 1) return static_cast<
|
|
60
|
+
if (p < 1) return static_cast<float>(theta_constants::MAX_THETA) * p;
|
|
59
61
|
return theta_constants::MAX_THETA;
|
|
60
62
|
}
|
|
61
63
|
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
#define THETA_SKETCH_HPP_
|
|
22
22
|
|
|
23
23
|
#include "theta_update_sketch_base.hpp"
|
|
24
|
+
#include "compact_theta_sketch_parser.hpp"
|
|
24
25
|
|
|
25
26
|
namespace datasketches {
|
|
26
27
|
|
|
@@ -317,7 +318,8 @@ public:
|
|
|
317
318
|
using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
|
|
318
319
|
using vector_bytes = std::vector<uint8_t, AllocBytes>;
|
|
319
320
|
|
|
320
|
-
static const uint8_t
|
|
321
|
+
static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
|
|
322
|
+
static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
|
|
321
323
|
static const uint8_t SKETCH_TYPE = 3;
|
|
322
324
|
|
|
323
325
|
// Instances of this type can be obtained:
|
|
@@ -355,6 +357,25 @@ public:
|
|
|
355
357
|
*/
|
|
356
358
|
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
357
359
|
|
|
360
|
+
/**
|
|
361
|
+
* This method serializes the sketch into a given stream in a compressed binary form.
|
|
362
|
+
* Compression is applied to ordered sketches except empty and single item.
|
|
363
|
+
* For unordered, empty and single item sketches this method is equivalent to serialize()
|
|
364
|
+
* @param os output stream
|
|
365
|
+
*/
|
|
366
|
+
void serialize_compressed(std::ostream& os) const;
|
|
367
|
+
|
|
368
|
+
/**
|
|
369
|
+
* This method serializes the sketch as a vector of bytes.
|
|
370
|
+
* An optional header can be reserved in front of the sketch.
|
|
371
|
+
* It is an uninitialized space of a given size.
|
|
372
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
373
|
+
* Compression is applied to ordered sketches except empty and single item.
|
|
374
|
+
* For unordered, empty and single item sketches this method is equivalent to serialize()
|
|
375
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
376
|
+
*/
|
|
377
|
+
vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
|
|
378
|
+
|
|
358
379
|
virtual iterator begin();
|
|
359
380
|
virtual iterator end();
|
|
360
381
|
virtual const_iterator begin() const;
|
|
@@ -391,6 +412,16 @@ private:
|
|
|
391
412
|
uint64_t theta_;
|
|
392
413
|
std::vector<uint64_t, Allocator> entries_;
|
|
393
414
|
|
|
415
|
+
bool is_suitable_for_compression() const;
|
|
416
|
+
uint8_t compute_min_leading_zeros() const;
|
|
417
|
+
void serialize_version_4(std::ostream& os) const;
|
|
418
|
+
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
|
419
|
+
|
|
420
|
+
static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
|
421
|
+
static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
|
422
|
+
static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
|
423
|
+
static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
|
424
|
+
|
|
394
425
|
virtual void print_specifics(std::ostringstream& os) const;
|
|
395
426
|
};
|
|
396
427
|
|
|
@@ -407,7 +438,7 @@ public:
|
|
|
407
438
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
408
439
|
class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
|
|
409
440
|
public:
|
|
410
|
-
|
|
441
|
+
class const_iterator;
|
|
411
442
|
|
|
412
443
|
Allocator get_allocator() const;
|
|
413
444
|
bool is_empty() const;
|
|
@@ -433,15 +464,32 @@ protected:
|
|
|
433
464
|
virtual void print_items(std::ostringstream& os) const;
|
|
434
465
|
|
|
435
466
|
private:
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
uint16_t seed_hash_;
|
|
439
|
-
uint32_t num_entries_;
|
|
440
|
-
uint64_t theta_;
|
|
441
|
-
const uint64_t* entries_;
|
|
467
|
+
using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
|
|
468
|
+
data_type data_;
|
|
442
469
|
|
|
443
|
-
wrapped_compact_theta_sketch_alloc(
|
|
444
|
-
|
|
470
|
+
wrapped_compact_theta_sketch_alloc(const data_type& data);
|
|
471
|
+
};
|
|
472
|
+
|
|
473
|
+
template<typename Allocator>
|
|
474
|
+
class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
|
|
475
|
+
public:
|
|
476
|
+
const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
|
|
477
|
+
const_iterator& operator++();
|
|
478
|
+
const_iterator operator++(int);
|
|
479
|
+
bool operator==(const const_iterator& other) const;
|
|
480
|
+
bool operator!=(const const_iterator& other) const;
|
|
481
|
+
const uint64_t& operator*() const;
|
|
482
|
+
const uint64_t* operator->() const;
|
|
483
|
+
private:
|
|
484
|
+
const void* ptr_;
|
|
485
|
+
uint8_t entry_bits_;
|
|
486
|
+
uint32_t num_entries_;
|
|
487
|
+
uint32_t index_;
|
|
488
|
+
uint64_t previous_;
|
|
489
|
+
bool is_block_mode_;
|
|
490
|
+
uint8_t buf_i_;
|
|
491
|
+
uint8_t offset_;
|
|
492
|
+
uint64_t buffer_[8];
|
|
445
493
|
};
|
|
446
494
|
|
|
447
495
|
// aliases with default allocator for convenience
|