datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -20,35 +20,23 @@
|
|
|
20
20
|
#ifndef THETA_SKETCH_IMPL_HPP_
|
|
21
21
|
#define THETA_SKETCH_IMPL_HPP_
|
|
22
22
|
|
|
23
|
-
#include <algorithm>
|
|
24
|
-
#include <cmath>
|
|
25
|
-
#include <memory>
|
|
26
|
-
#include <functional>
|
|
27
|
-
#include <istream>
|
|
28
|
-
#include <ostream>
|
|
29
23
|
#include <sstream>
|
|
24
|
+
#include <vector>
|
|
30
25
|
|
|
31
|
-
#include "MurmurHash3.h"
|
|
32
26
|
#include "serde.hpp"
|
|
33
27
|
#include "binomial_bounds.hpp"
|
|
34
|
-
#include "
|
|
28
|
+
#include "theta_helpers.hpp"
|
|
35
29
|
|
|
36
30
|
namespace datasketches {
|
|
37
31
|
|
|
38
|
-
/*
|
|
39
|
-
* author Alexander Saydakov
|
|
40
|
-
* author Lee Rhodes
|
|
41
|
-
* author Kevin Lang
|
|
42
|
-
*/
|
|
43
|
-
|
|
44
32
|
template<typename A>
|
|
45
|
-
theta_sketch_alloc<A>::
|
|
46
|
-
|
|
47
|
-
|
|
33
|
+
bool theta_sketch_alloc<A>::is_estimation_mode() const {
|
|
34
|
+
return get_theta64() < theta_constants::MAX_THETA && !is_empty();
|
|
35
|
+
}
|
|
48
36
|
|
|
49
37
|
template<typename A>
|
|
50
|
-
|
|
51
|
-
return
|
|
38
|
+
double theta_sketch_alloc<A>::get_theta() const {
|
|
39
|
+
return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
|
|
52
40
|
}
|
|
53
41
|
|
|
54
42
|
template<typename A>
|
|
@@ -69,182 +57,47 @@ double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
|
|
|
69
57
|
}
|
|
70
58
|
|
|
71
59
|
template<typename A>
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
is.read((char*)&type, sizeof(type));
|
|
94
|
-
uint8_t lg_nom_size;
|
|
95
|
-
is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
|
|
96
|
-
uint8_t lg_cur_size;
|
|
97
|
-
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
98
|
-
uint8_t flags_byte;
|
|
99
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
100
|
-
uint16_t seed_hash;
|
|
101
|
-
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
102
|
-
|
|
103
|
-
check_serial_version(serial_version, SERIAL_VERSION);
|
|
104
|
-
|
|
105
|
-
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
106
|
-
check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
107
|
-
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
|
|
108
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
|
|
109
|
-
return unique_ptr(
|
|
110
|
-
static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(update_theta_sketch_alloc<A>::internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed))),
|
|
111
|
-
[](theta_sketch_alloc<A>* ptr) {
|
|
112
|
-
ptr->~theta_sketch_alloc();
|
|
113
|
-
AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
|
|
114
|
-
}
|
|
115
|
-
);
|
|
116
|
-
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
117
|
-
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
118
|
-
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
119
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
|
|
120
|
-
return unique_ptr(
|
|
121
|
-
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
|
|
122
|
-
[](theta_sketch_alloc<A>* ptr) {
|
|
123
|
-
ptr->~theta_sketch_alloc();
|
|
124
|
-
AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
|
|
125
|
-
}
|
|
126
|
-
);
|
|
127
|
-
}
|
|
128
|
-
throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
template<typename A>
|
|
132
|
-
typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
|
133
|
-
ensure_minimum_memory(size, static_cast<size_t>(8));
|
|
134
|
-
const char* ptr = static_cast<const char*>(bytes);
|
|
135
|
-
uint8_t preamble_longs;
|
|
136
|
-
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
137
|
-
uint8_t serial_version;
|
|
138
|
-
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
139
|
-
uint8_t type;
|
|
140
|
-
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
141
|
-
uint8_t lg_nom_size;
|
|
142
|
-
ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
|
|
143
|
-
uint8_t lg_cur_size;
|
|
144
|
-
ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
|
|
145
|
-
uint8_t flags_byte;
|
|
146
|
-
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
147
|
-
uint16_t seed_hash;
|
|
148
|
-
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
149
|
-
|
|
150
|
-
check_serial_version(serial_version, SERIAL_VERSION);
|
|
151
|
-
|
|
152
|
-
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
153
|
-
check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
154
|
-
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
|
|
155
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
|
|
156
|
-
return unique_ptr(
|
|
157
|
-
static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(
|
|
158
|
-
update_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed))
|
|
159
|
-
),
|
|
160
|
-
[](theta_sketch_alloc<A>* ptr) {
|
|
161
|
-
ptr->~theta_sketch_alloc();
|
|
162
|
-
AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
|
|
163
|
-
}
|
|
164
|
-
);
|
|
165
|
-
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
166
|
-
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
167
|
-
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
168
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
|
|
169
|
-
return unique_ptr(
|
|
170
|
-
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
|
|
171
|
-
compact_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash))
|
|
172
|
-
),
|
|
173
|
-
[](theta_sketch_alloc<A>* ptr) {
|
|
174
|
-
ptr->~theta_sketch_alloc();
|
|
175
|
-
AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
|
|
176
|
-
}
|
|
177
|
-
);
|
|
178
|
-
}
|
|
179
|
-
throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
template<typename A>
|
|
183
|
-
uint16_t theta_sketch_alloc<A>::get_seed_hash(uint64_t seed) {
|
|
184
|
-
HashState hashes;
|
|
185
|
-
MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
|
|
186
|
-
return hashes.h1;
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
template<typename A>
|
|
190
|
-
void theta_sketch_alloc<A>::check_sketch_type(uint8_t actual, uint8_t expected) {
|
|
191
|
-
if (actual != expected) {
|
|
192
|
-
throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
template<typename A>
|
|
197
|
-
void theta_sketch_alloc<A>::check_serial_version(uint8_t actual, uint8_t expected) {
|
|
198
|
-
if (actual != expected) {
|
|
199
|
-
throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
template<typename A>
|
|
204
|
-
void theta_sketch_alloc<A>::check_seed_hash(uint16_t actual, uint16_t expected) {
|
|
205
|
-
if (actual != expected) {
|
|
206
|
-
throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
|
|
60
|
+
string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
|
|
61
|
+
ostrstream os;
|
|
62
|
+
os << "### Theta sketch summary:" << std::endl;
|
|
63
|
+
os << " num retained entries : " << get_num_retained() << std::endl;
|
|
64
|
+
os << " seed hash : " << get_seed_hash() << std::endl;
|
|
65
|
+
os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
|
|
66
|
+
os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
|
|
67
|
+
os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
|
68
|
+
os << " theta (fraction) : " << get_theta() << std::endl;
|
|
69
|
+
os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
|
|
70
|
+
os << " estimate : " << this->get_estimate() << std::endl;
|
|
71
|
+
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
72
|
+
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
73
|
+
print_specifics(os);
|
|
74
|
+
os << "### End sketch summary" << std::endl;
|
|
75
|
+
if (detail) {
|
|
76
|
+
os << "### Retained entries" << std::endl;
|
|
77
|
+
for (const auto& hash: *this) {
|
|
78
|
+
os << hash << std::endl;
|
|
79
|
+
}
|
|
80
|
+
os << "### End retained entries" << std::endl;
|
|
207
81
|
}
|
|
82
|
+
return os.str();
|
|
208
83
|
}
|
|
209
84
|
|
|
210
85
|
// update sketch
|
|
211
86
|
|
|
212
87
|
template<typename A>
|
|
213
|
-
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
lg_nom_size_(lg_nom_size),
|
|
217
|
-
keys_(1 << lg_cur_size_, 0),
|
|
218
|
-
num_keys_(0),
|
|
219
|
-
rf_(rf),
|
|
220
|
-
p_(p),
|
|
221
|
-
seed_(seed),
|
|
222
|
-
capacity_(get_capacity(lg_cur_size, lg_nom_size))
|
|
223
|
-
{
|
|
224
|
-
if (p < 1) this->theta_ *= p;
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
template<typename A>
|
|
228
|
-
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed):
|
|
229
|
-
theta_sketch_alloc<A>(is_empty, theta),
|
|
230
|
-
lg_cur_size_(lg_cur_size),
|
|
231
|
-
lg_nom_size_(lg_nom_size),
|
|
232
|
-
keys_(std::move(keys)),
|
|
233
|
-
num_keys_(num_keys),
|
|
234
|
-
rf_(rf),
|
|
235
|
-
p_(p),
|
|
236
|
-
seed_(seed),
|
|
237
|
-
capacity_(get_capacity(lg_cur_size, lg_nom_size))
|
|
88
|
+
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
89
|
+
uint64_t theta, uint64_t seed, const A& allocator):
|
|
90
|
+
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
|
|
238
91
|
{}
|
|
239
92
|
|
|
240
93
|
template<typename A>
|
|
241
|
-
|
|
242
|
-
return
|
|
94
|
+
A update_theta_sketch_alloc<A>::get_allocator() const {
|
|
95
|
+
return table_.allocator_;
|
|
243
96
|
}
|
|
244
97
|
|
|
245
98
|
template<typename A>
|
|
246
|
-
|
|
247
|
-
return
|
|
99
|
+
bool update_theta_sketch_alloc<A>::is_empty() const {
|
|
100
|
+
return table_.is_empty_;
|
|
248
101
|
}
|
|
249
102
|
|
|
250
103
|
template<typename A>
|
|
@@ -253,169 +106,28 @@ bool update_theta_sketch_alloc<A>::is_ordered() const {
|
|
|
253
106
|
}
|
|
254
107
|
|
|
255
108
|
template<typename A>
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
os << "### Update Theta sketch summary:" << std::endl;
|
|
259
|
-
os << " lg nominal size : " << (int) lg_nom_size_ << std::endl;
|
|
260
|
-
os << " lg current size : " << (int) lg_cur_size_ << std::endl;
|
|
261
|
-
os << " num retained keys : " << num_keys_ << std::endl;
|
|
262
|
-
os << " resize factor : " << (1 << rf_) << std::endl;
|
|
263
|
-
os << " sampling probability : " << p_ << std::endl;
|
|
264
|
-
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
|
265
|
-
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
|
266
|
-
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
|
267
|
-
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
|
268
|
-
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
|
269
|
-
os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
|
|
270
|
-
os << " estimate : " << this->get_estimate() << std::endl;
|
|
271
|
-
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
272
|
-
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
273
|
-
os << "### End sketch summary" << std::endl;
|
|
274
|
-
if (print_items) {
|
|
275
|
-
os << "### Retained keys" << std::endl;
|
|
276
|
-
for (auto key: *this) os << " " << key << std::endl;
|
|
277
|
-
os << "### End retained keys" << std::endl;
|
|
278
|
-
}
|
|
279
|
-
return os.str();
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
template<typename A>
|
|
283
|
-
void update_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
284
|
-
const uint8_t preamble_longs_and_rf = 3 | (rf_ << 6);
|
|
285
|
-
os.write((char*)&preamble_longs_and_rf, sizeof(preamble_longs_and_rf));
|
|
286
|
-
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
287
|
-
os.write((char*)&serial_version, sizeof(serial_version));
|
|
288
|
-
const uint8_t type = SKETCH_TYPE;
|
|
289
|
-
os.write((char*)&type, sizeof(type));
|
|
290
|
-
os.write((char*)&lg_nom_size_, sizeof(lg_nom_size_));
|
|
291
|
-
os.write((char*)&lg_cur_size_, sizeof(lg_cur_size_));
|
|
292
|
-
const uint8_t flags_byte(
|
|
293
|
-
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
|
|
294
|
-
);
|
|
295
|
-
os.write((char*)&flags_byte, sizeof(flags_byte));
|
|
296
|
-
const uint16_t seed_hash = get_seed_hash();
|
|
297
|
-
os.write((char*)&seed_hash, sizeof(seed_hash));
|
|
298
|
-
os.write((char*)&num_keys_, sizeof(num_keys_));
|
|
299
|
-
os.write((char*)&p_, sizeof(p_));
|
|
300
|
-
os.write((char*)&(this->theta_), sizeof(uint64_t));
|
|
301
|
-
os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
template<typename A>
|
|
305
|
-
vector_u8<A> update_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
306
|
-
const uint8_t preamble_longs = 3;
|
|
307
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
|
|
308
|
-
vector_u8<A> bytes(size);
|
|
309
|
-
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
310
|
-
|
|
311
|
-
const uint8_t preamble_longs_and_rf = preamble_longs | (rf_ << 6);
|
|
312
|
-
ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
|
|
313
|
-
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
314
|
-
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
315
|
-
const uint8_t type = SKETCH_TYPE;
|
|
316
|
-
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
317
|
-
ptr += copy_to_mem(&lg_nom_size_, ptr, sizeof(lg_nom_size_));
|
|
318
|
-
ptr += copy_to_mem(&lg_cur_size_, ptr, sizeof(lg_cur_size_));
|
|
319
|
-
const uint8_t flags_byte(
|
|
320
|
-
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
|
|
321
|
-
);
|
|
322
|
-
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
323
|
-
const uint16_t seed_hash = get_seed_hash();
|
|
324
|
-
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
325
|
-
ptr += copy_to_mem(&num_keys_, ptr, sizeof(num_keys_));
|
|
326
|
-
ptr += copy_to_mem(&p_, ptr, sizeof(p_));
|
|
327
|
-
ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
|
|
328
|
-
ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
|
|
329
|
-
|
|
330
|
-
return bytes;
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
template<typename A>
|
|
334
|
-
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
|
335
|
-
uint8_t preamble_longs;
|
|
336
|
-
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
337
|
-
resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
|
|
338
|
-
preamble_longs &= 0x3f; // remove resize factor
|
|
339
|
-
uint8_t serial_version;
|
|
340
|
-
is.read((char*)&serial_version, sizeof(serial_version));
|
|
341
|
-
uint8_t type;
|
|
342
|
-
is.read((char*)&type, sizeof(type));
|
|
343
|
-
uint8_t lg_nom_size;
|
|
344
|
-
is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
|
|
345
|
-
uint8_t lg_cur_size;
|
|
346
|
-
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
347
|
-
uint8_t flags_byte;
|
|
348
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
349
|
-
uint16_t seed_hash;
|
|
350
|
-
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
351
|
-
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
352
|
-
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
353
|
-
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
354
|
-
return internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed);
|
|
109
|
+
uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
|
|
110
|
+
return table_.theta_;
|
|
355
111
|
}
|
|
356
112
|
|
|
357
113
|
template<typename A>
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
is.read((char*)&num_keys, sizeof(num_keys));
|
|
361
|
-
float p;
|
|
362
|
-
is.read((char*)&p, sizeof(p));
|
|
363
|
-
uint64_t theta;
|
|
364
|
-
is.read((char*)&theta, sizeof(theta));
|
|
365
|
-
vector_u64<A> keys(1 << lg_cur_size);
|
|
366
|
-
is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
|
|
367
|
-
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
368
|
-
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
369
|
-
return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
|
|
114
|
+
uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
|
|
115
|
+
return table_.num_entries_;
|
|
370
116
|
}
|
|
371
117
|
|
|
372
118
|
template<typename A>
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
const char* ptr = static_cast<const char*>(bytes);
|
|
376
|
-
uint8_t preamble_longs;
|
|
377
|
-
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
378
|
-
resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
|
|
379
|
-
preamble_longs &= 0x3f; // remove resize factor
|
|
380
|
-
uint8_t serial_version;
|
|
381
|
-
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
382
|
-
uint8_t type;
|
|
383
|
-
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
384
|
-
uint8_t lg_nom_size;
|
|
385
|
-
ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
|
|
386
|
-
uint8_t lg_cur_size;
|
|
387
|
-
ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
|
|
388
|
-
uint8_t flags_byte;
|
|
389
|
-
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
390
|
-
uint16_t seed_hash;
|
|
391
|
-
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
392
|
-
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
393
|
-
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
394
|
-
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
395
|
-
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed);
|
|
119
|
+
uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
120
|
+
return compute_seed_hash(table_.seed_);
|
|
396
121
|
}
|
|
397
122
|
|
|
398
123
|
template<typename A>
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
ensure_minimum_memory(size, 16 + sizeof(uint64_t) * table_size);
|
|
402
|
-
const char* ptr = static_cast<const char*>(bytes);
|
|
403
|
-
uint32_t num_keys;
|
|
404
|
-
ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
|
|
405
|
-
float p;
|
|
406
|
-
ptr += copy_from_mem(ptr, &p, sizeof(p));
|
|
407
|
-
uint64_t theta;
|
|
408
|
-
ptr += copy_from_mem(ptr, &theta, sizeof(theta));
|
|
409
|
-
vector_u64<A> keys(table_size);
|
|
410
|
-
ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * table_size);
|
|
411
|
-
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
412
|
-
return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
|
|
124
|
+
uint8_t update_theta_sketch_alloc<A>::get_lg_k() const {
|
|
125
|
+
return table_.lg_nom_size_;
|
|
413
126
|
}
|
|
414
127
|
|
|
415
128
|
template<typename A>
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
update(value.c_str(), value.length());
|
|
129
|
+
auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
|
|
130
|
+
return table_.rf_;
|
|
419
131
|
}
|
|
420
132
|
|
|
421
133
|
template<typename A>
|
|
@@ -460,19 +172,7 @@ void update_theta_sketch_alloc<A>::update(int8_t value) {
|
|
|
460
172
|
|
|
461
173
|
template<typename A>
|
|
462
174
|
void update_theta_sketch_alloc<A>::update(double value) {
|
|
463
|
-
|
|
464
|
-
int64_t long_value;
|
|
465
|
-
double double_value;
|
|
466
|
-
} long_double_union;
|
|
467
|
-
|
|
468
|
-
if (value == 0.0) {
|
|
469
|
-
long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
|
|
470
|
-
} else if (std::isnan(value)) {
|
|
471
|
-
long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
|
472
|
-
} else {
|
|
473
|
-
long_double_union.double_value = value;
|
|
474
|
-
}
|
|
475
|
-
update(&long_double_union, sizeof(long_double_union));
|
|
175
|
+
update(canonical_double(value));
|
|
476
176
|
}
|
|
477
177
|
|
|
478
178
|
template<typename A>
|
|
@@ -481,157 +181,116 @@ void update_theta_sketch_alloc<A>::update(float value) {
|
|
|
481
181
|
}
|
|
482
182
|
|
|
483
183
|
template<typename A>
|
|
484
|
-
void update_theta_sketch_alloc<A>::update(const
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
const uint64_t hash = hashes.h1 >> 1; // Java implementation does logical shift >>> to make values positive
|
|
488
|
-
internal_update(hash);
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
template<typename A>
|
|
492
|
-
compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
|
|
493
|
-
return compact_theta_sketch_alloc<A>(*this, ordered);
|
|
184
|
+
void update_theta_sketch_alloc<A>::update(const std::string& value) {
|
|
185
|
+
if (value.empty()) return;
|
|
186
|
+
update(value.c_str(), value.length());
|
|
494
187
|
}
|
|
495
188
|
|
|
496
189
|
template<typename A>
|
|
497
|
-
void update_theta_sketch_alloc<A>::
|
|
498
|
-
|
|
499
|
-
if (hash
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
if (lg_cur_size_ <= lg_nom_size_) {
|
|
504
|
-
resize();
|
|
505
|
-
} else {
|
|
506
|
-
rebuild();
|
|
507
|
-
}
|
|
508
|
-
}
|
|
190
|
+
void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
|
|
191
|
+
const uint64_t hash = table_.hash_and_screen(data, length);
|
|
192
|
+
if (hash == 0) return;
|
|
193
|
+
auto result = table_.find(hash);
|
|
194
|
+
if (!result.second) {
|
|
195
|
+
table_.insert(result.first, hash);
|
|
509
196
|
}
|
|
510
197
|
}
|
|
511
198
|
|
|
512
199
|
template<typename A>
|
|
513
200
|
void update_theta_sketch_alloc<A>::trim() {
|
|
514
|
-
|
|
201
|
+
table_.trim();
|
|
515
202
|
}
|
|
516
203
|
|
|
517
204
|
template<typename A>
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
|
|
521
|
-
const uint8_t lg_new_size = lg_cur_size_ + factor;
|
|
522
|
-
const uint32_t new_size = 1 << lg_new_size;
|
|
523
|
-
vector_u64<A> new_keys(new_size, 0);
|
|
524
|
-
for (uint32_t i = 0; i < keys_.size(); i++) {
|
|
525
|
-
if (keys_[i] != 0) {
|
|
526
|
-
hash_search_or_insert(keys_[i], new_keys.data(), lg_new_size); // TODO hash_insert
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
keys_ = std::move(new_keys);
|
|
530
|
-
lg_cur_size_ += factor;
|
|
531
|
-
capacity_ = get_capacity(lg_cur_size_, lg_nom_size_);
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
template<typename A>
|
|
535
|
-
void update_theta_sketch_alloc<A>::rebuild() {
|
|
536
|
-
const uint32_t pivot = (1 << lg_nom_size_) + keys_.size() - num_keys_;
|
|
537
|
-
std::nth_element(keys_.begin(), keys_.begin() + pivot, keys_.end());
|
|
538
|
-
this->theta_ = keys_[pivot];
|
|
539
|
-
vector_u64<A> new_keys(keys_.size(), 0);
|
|
540
|
-
num_keys_ = 0;
|
|
541
|
-
for (uint32_t i = 0; i < keys_.size(); i++) {
|
|
542
|
-
if (keys_[i] != 0 && keys_[i] < this->theta_) {
|
|
543
|
-
hash_search_or_insert(keys_[i], new_keys.data(), lg_cur_size_); // TODO hash_insert
|
|
544
|
-
num_keys_++;
|
|
545
|
-
}
|
|
546
|
-
}
|
|
547
|
-
keys_ = std::move(new_keys);
|
|
205
|
+
auto update_theta_sketch_alloc<A>::begin() -> iterator {
|
|
206
|
+
return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
|
548
207
|
}
|
|
549
208
|
|
|
550
209
|
template<typename A>
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
return std::floor(fraction * (1 << lg_cur_size));
|
|
210
|
+
auto update_theta_sketch_alloc<A>::end() -> iterator {
|
|
211
|
+
return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
|
|
554
212
|
}
|
|
555
213
|
|
|
556
214
|
template<typename A>
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
return (2 * static_cast<uint32_t>((hash >> lg_size) & STRIDE_MASK)) + 1;
|
|
215
|
+
auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
|
|
216
|
+
return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
|
560
217
|
}
|
|
561
218
|
|
|
562
219
|
template<typename A>
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
|
|
220
|
+
auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
221
|
+
return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
|
|
222
|
+
}
|
|
567
223
|
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
const uint64_t value = table[cur_probe];
|
|
572
|
-
if (value == 0) {
|
|
573
|
-
table[cur_probe] = hash; // insert value
|
|
574
|
-
return true;
|
|
575
|
-
} else if (value == hash) {
|
|
576
|
-
return false; // found a duplicate
|
|
577
|
-
}
|
|
578
|
-
cur_probe = (cur_probe + stride) & mask;
|
|
579
|
-
} while (cur_probe != loop_index);
|
|
580
|
-
throw std::logic_error("key not found and no empty slots!");
|
|
581
|
-
}
|
|
582
|
-
|
|
583
|
-
template<typename A>
|
|
584
|
-
bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size) {
|
|
585
|
-
const uint32_t mask = (1 << lg_size) - 1;
|
|
586
|
-
const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash, lg_size);
|
|
587
|
-
uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
|
|
588
|
-
const uint32_t loop_index = cur_probe;
|
|
589
|
-
do {
|
|
590
|
-
const uint64_t value = table[cur_probe];
|
|
591
|
-
if (value == 0) {
|
|
592
|
-
return false;
|
|
593
|
-
} else if (value == hash) {
|
|
594
|
-
return true;
|
|
595
|
-
}
|
|
596
|
-
cur_probe = (cur_probe + stride) & mask;
|
|
597
|
-
} while (cur_probe != loop_index);
|
|
598
|
-
throw std::logic_error("key not found and search wrapped");
|
|
224
|
+
template<typename A>
|
|
225
|
+
compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
|
|
226
|
+
return compact_theta_sketch_alloc<A>(*this, ordered);
|
|
599
227
|
}
|
|
600
228
|
|
|
601
229
|
template<typename A>
|
|
602
|
-
|
|
603
|
-
|
|
230
|
+
void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
|
|
231
|
+
os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
|
|
232
|
+
os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
|
|
233
|
+
os << " resize factor : " << (1 << table_.rf_) << std::endl;
|
|
604
234
|
}
|
|
605
235
|
|
|
236
|
+
// builder
|
|
237
|
+
|
|
606
238
|
template<typename A>
|
|
607
|
-
|
|
608
|
-
|
|
239
|
+
update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
|
240
|
+
|
|
241
|
+
template<typename A>
|
|
242
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
|
243
|
+
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
|
609
244
|
}
|
|
610
245
|
|
|
611
246
|
// compact sketch
|
|
612
247
|
|
|
613
248
|
template<typename A>
|
|
614
|
-
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(
|
|
615
|
-
|
|
616
|
-
|
|
249
|
+
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
|
|
250
|
+
is_empty_(other.is_empty()),
|
|
251
|
+
is_ordered_(other.is_ordered() || ordered),
|
|
252
|
+
seed_hash_(other.get_seed_hash()),
|
|
253
|
+
theta_(other.get_theta64()),
|
|
254
|
+
entries_(other.get_allocator())
|
|
255
|
+
{
|
|
256
|
+
entries_.reserve(other.get_num_retained());
|
|
257
|
+
std::copy(other.begin(), other.end(), std::back_inserter(entries_));
|
|
258
|
+
if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
template<typename A>
|
|
262
|
+
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
|
|
263
|
+
std::vector<uint64_t, A>&& entries):
|
|
264
|
+
is_empty_(is_empty),
|
|
265
|
+
is_ordered_(is_ordered),
|
|
617
266
|
seed_hash_(seed_hash),
|
|
618
|
-
|
|
267
|
+
theta_(theta),
|
|
268
|
+
entries_(std::move(entries))
|
|
619
269
|
{}
|
|
620
270
|
|
|
621
271
|
template<typename A>
|
|
622
|
-
compact_theta_sketch_alloc<A>::
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
{
|
|
628
|
-
|
|
629
|
-
|
|
272
|
+
A compact_theta_sketch_alloc<A>::get_allocator() const {
|
|
273
|
+
return entries_.get_allocator();
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
template<typename A>
|
|
277
|
+
bool compact_theta_sketch_alloc<A>::is_empty() const {
|
|
278
|
+
return is_empty_;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
template<typename A>
|
|
282
|
+
bool compact_theta_sketch_alloc<A>::is_ordered() const {
|
|
283
|
+
return is_ordered_;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
template<typename A>
|
|
287
|
+
uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
|
|
288
|
+
return theta_;
|
|
630
289
|
}
|
|
631
290
|
|
|
632
291
|
template<typename A>
|
|
633
292
|
uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
|
|
634
|
-
return
|
|
293
|
+
return entries_.size();
|
|
635
294
|
}
|
|
636
295
|
|
|
637
296
|
template<typename A>
|
|
@@ -640,158 +299,148 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
|
640
299
|
}
|
|
641
300
|
|
|
642
301
|
template<typename A>
|
|
643
|
-
|
|
644
|
-
return
|
|
302
|
+
auto compact_theta_sketch_alloc<A>::begin() -> iterator {
|
|
303
|
+
return iterator(entries_.data(), entries_.size(), 0);
|
|
645
304
|
}
|
|
646
305
|
|
|
647
306
|
template<typename A>
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
os << "### Compact Theta sketch summary:" << std::endl;
|
|
651
|
-
os << " num retained keys : " << keys_.size() << std::endl;
|
|
652
|
-
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
|
653
|
-
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
|
654
|
-
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
|
655
|
-
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
|
656
|
-
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
|
657
|
-
os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
|
|
658
|
-
os << " estimate : " << this->get_estimate() << std::endl;
|
|
659
|
-
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
660
|
-
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
661
|
-
os << "### End sketch summary" << std::endl;
|
|
662
|
-
if (print_items) {
|
|
663
|
-
os << "### Retained keys" << std::endl;
|
|
664
|
-
for (auto key: *this) os << " " << key << std::endl;
|
|
665
|
-
os << "### End retained keys" << std::endl;
|
|
666
|
-
}
|
|
667
|
-
return os.str();
|
|
307
|
+
auto compact_theta_sketch_alloc<A>::end() -> iterator {
|
|
308
|
+
return iterator(nullptr, 0, entries_.size());
|
|
668
309
|
}
|
|
669
310
|
|
|
311
|
+
template<typename A>
|
|
312
|
+
auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
|
|
313
|
+
return const_iterator(entries_.data(), entries_.size(), 0);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
template<typename A>
|
|
317
|
+
auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
318
|
+
return const_iterator(nullptr, 0, entries_.size());
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
template<typename A>
|
|
322
|
+
void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
|
|
323
|
+
|
|
670
324
|
template<typename A>
|
|
671
325
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
672
|
-
const bool is_single_item =
|
|
326
|
+
const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
|
|
673
327
|
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
674
328
|
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
|
|
675
|
-
const uint8_t serial_version =
|
|
329
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
676
330
|
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
|
677
331
|
const uint8_t type = SKETCH_TYPE;
|
|
678
332
|
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
|
|
679
333
|
const uint16_t unused16 = 0;
|
|
680
334
|
os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
|
|
681
335
|
const uint8_t flags_byte(
|
|
682
|
-
(1 <<
|
|
683
|
-
(1 <<
|
|
684
|
-
(this->is_empty() ? 1 <<
|
|
685
|
-
(this->is_ordered() ? 1 <<
|
|
336
|
+
(1 << flags::IS_COMPACT) |
|
|
337
|
+
(1 << flags::IS_READ_ONLY) |
|
|
338
|
+
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
339
|
+
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
686
340
|
);
|
|
687
341
|
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
|
688
342
|
const uint16_t seed_hash = get_seed_hash();
|
|
689
|
-
os.write((
|
|
343
|
+
os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
|
|
690
344
|
if (!this->is_empty()) {
|
|
691
345
|
if (!is_single_item) {
|
|
692
|
-
const uint32_t
|
|
693
|
-
os.write((
|
|
346
|
+
const uint32_t num_entries = entries_.size();
|
|
347
|
+
os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
|
|
694
348
|
const uint32_t unused32 = 0;
|
|
695
|
-
os.write((
|
|
349
|
+
os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
|
|
696
350
|
if (this->is_estimation_mode()) {
|
|
697
|
-
os.write((
|
|
351
|
+
os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
|
|
698
352
|
}
|
|
699
353
|
}
|
|
700
|
-
os.write((
|
|
354
|
+
os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
|
|
701
355
|
}
|
|
702
356
|
}
|
|
703
357
|
|
|
704
358
|
template<typename A>
|
|
705
|
-
|
|
706
|
-
const bool is_single_item =
|
|
359
|
+
auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
360
|
+
const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
|
|
707
361
|
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
708
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
|
709
|
-
|
|
362
|
+
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
|
363
|
+
+ sizeof(uint64_t) * entries_.size();
|
|
364
|
+
vector_bytes bytes(size, 0, entries_.get_allocator());
|
|
710
365
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
711
366
|
|
|
712
367
|
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
|
|
713
|
-
const uint8_t serial_version =
|
|
368
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
714
369
|
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
715
370
|
const uint8_t type = SKETCH_TYPE;
|
|
716
371
|
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
717
372
|
const uint16_t unused16 = 0;
|
|
718
373
|
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
|
|
719
374
|
const uint8_t flags_byte(
|
|
720
|
-
(1 <<
|
|
721
|
-
(1 <<
|
|
722
|
-
(this->is_empty() ? 1 <<
|
|
723
|
-
(this->is_ordered() ? 1 <<
|
|
375
|
+
(1 << flags::IS_COMPACT) |
|
|
376
|
+
(1 << flags::IS_READ_ONLY) |
|
|
377
|
+
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
378
|
+
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
724
379
|
);
|
|
725
380
|
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
726
381
|
const uint16_t seed_hash = get_seed_hash();
|
|
727
382
|
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
728
383
|
if (!this->is_empty()) {
|
|
729
384
|
if (!is_single_item) {
|
|
730
|
-
const uint32_t
|
|
731
|
-
ptr += copy_to_mem(&
|
|
385
|
+
const uint32_t num_entries = entries_.size();
|
|
386
|
+
ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
|
|
732
387
|
const uint32_t unused32 = 0;
|
|
733
388
|
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
734
389
|
if (this->is_estimation_mode()) {
|
|
735
|
-
ptr += copy_to_mem(&
|
|
390
|
+
ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
|
|
736
391
|
}
|
|
737
392
|
}
|
|
738
|
-
ptr += copy_to_mem(
|
|
393
|
+
ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
|
|
739
394
|
}
|
|
740
|
-
|
|
741
395
|
return bytes;
|
|
742
396
|
}
|
|
743
397
|
|
|
744
398
|
template<typename A>
|
|
745
|
-
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
|
399
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
746
400
|
uint8_t preamble_longs;
|
|
747
|
-
is.read((
|
|
401
|
+
is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
|
|
748
402
|
uint8_t serial_version;
|
|
749
|
-
is.read((
|
|
403
|
+
is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
|
|
750
404
|
uint8_t type;
|
|
751
|
-
is.read((
|
|
405
|
+
is.read(reinterpret_cast<char*>(&type), sizeof(type));
|
|
752
406
|
uint16_t unused16;
|
|
753
|
-
is.read((
|
|
407
|
+
is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
|
|
754
408
|
uint8_t flags_byte;
|
|
755
|
-
is.read((
|
|
409
|
+
is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
|
|
756
410
|
uint16_t seed_hash;
|
|
757
|
-
is.read((
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
const bool is_empty = flags_byte & (1 <<
|
|
761
|
-
if (!is_empty)
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
template<typename A>
|
|
766
|
-
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
|
|
767
|
-
uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
|
|
768
|
-
uint32_t num_keys = 0;
|
|
769
|
-
|
|
770
|
-
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
411
|
+
is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
|
|
412
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
413
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
414
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
415
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
416
|
+
|
|
417
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
418
|
+
uint32_t num_entries = 0;
|
|
771
419
|
if (!is_empty) {
|
|
772
420
|
if (preamble_longs == 1) {
|
|
773
|
-
|
|
421
|
+
num_entries = 1;
|
|
774
422
|
} else {
|
|
775
|
-
is.read((
|
|
423
|
+
is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
|
|
776
424
|
uint32_t unused32;
|
|
777
|
-
is.read((
|
|
425
|
+
is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
|
|
778
426
|
if (preamble_longs > 2) {
|
|
779
|
-
is.read((
|
|
427
|
+
is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
|
|
780
428
|
}
|
|
781
429
|
}
|
|
782
430
|
}
|
|
783
|
-
|
|
784
|
-
if (!is_empty) is.read((
|
|
431
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
432
|
+
if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
|
|
785
433
|
|
|
786
|
-
const bool is_ordered = flags_byte & (1 <<
|
|
787
|
-
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
788
|
-
return compact_theta_sketch_alloc
|
|
434
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
435
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
436
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
789
437
|
}
|
|
790
438
|
|
|
791
439
|
template<typename A>
|
|
792
|
-
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
|
440
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
|
793
441
|
ensure_minimum_memory(size, 8);
|
|
794
442
|
const char* ptr = static_cast<const char*>(bytes);
|
|
443
|
+
const char* base = ptr;
|
|
795
444
|
uint8_t preamble_longs;
|
|
796
445
|
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
797
446
|
uint8_t serial_version;
|
|
@@ -804,28 +453,19 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
|
|
|
804
453
|
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
805
454
|
uint16_t seed_hash;
|
|
806
455
|
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
const bool is_empty = flags_byte & (1 <<
|
|
810
|
-
if (!is_empty)
|
|
811
|
-
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
|
|
812
|
-
}
|
|
813
|
-
|
|
814
|
-
template<typename A>
|
|
815
|
-
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
|
|
816
|
-
const char* ptr = static_cast<const char*>(bytes);
|
|
817
|
-
const char* base = ptr;
|
|
818
|
-
|
|
819
|
-
uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
|
|
820
|
-
uint32_t num_keys = 0;
|
|
456
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
457
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
458
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
459
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
821
460
|
|
|
822
|
-
|
|
461
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
462
|
+
uint32_t num_entries = 0;
|
|
823
463
|
if (!is_empty) {
|
|
824
464
|
if (preamble_longs == 1) {
|
|
825
|
-
|
|
465
|
+
num_entries = 1;
|
|
826
466
|
} else {
|
|
827
467
|
ensure_minimum_memory(size, 8); // read the first prelong before this method
|
|
828
|
-
ptr += copy_from_mem(ptr, &
|
|
468
|
+
ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
|
|
829
469
|
uint32_t unused32;
|
|
830
470
|
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
831
471
|
if (preamble_longs > 2) {
|
|
@@ -834,106 +474,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserializ
|
|
|
834
474
|
}
|
|
835
475
|
}
|
|
836
476
|
}
|
|
837
|
-
const size_t
|
|
838
|
-
check_memory_size(ptr - base +
|
|
839
|
-
|
|
840
|
-
if (!is_empty) ptr += copy_from_mem(ptr,
|
|
841
|
-
|
|
842
|
-
const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
|
|
843
|
-
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
|
|
844
|
-
}
|
|
845
|
-
|
|
846
|
-
template<typename A>
|
|
847
|
-
typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::begin() const {
|
|
848
|
-
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
template<typename A>
|
|
852
|
-
typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::end() const {
|
|
853
|
-
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
// builder
|
|
857
|
-
|
|
858
|
-
template<typename A>
|
|
859
|
-
update_theta_sketch_alloc<A>::builder::builder():
|
|
860
|
-
lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
|
|
861
|
-
|
|
862
|
-
template<typename A>
|
|
863
|
-
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
|
|
864
|
-
if (lg_k < MIN_LG_K) {
|
|
865
|
-
throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
|
|
866
|
-
}
|
|
867
|
-
lg_k_ = lg_k;
|
|
868
|
-
return *this;
|
|
869
|
-
}
|
|
870
|
-
|
|
871
|
-
template<typename A>
|
|
872
|
-
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_resize_factor(resize_factor rf) {
|
|
873
|
-
rf_ = rf;
|
|
874
|
-
return *this;
|
|
875
|
-
}
|
|
477
|
+
const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
|
|
478
|
+
check_memory_size(ptr - base + entries_size_bytes, size);
|
|
479
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
480
|
+
if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
|
|
876
481
|
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
p_ = p;
|
|
880
|
-
return *this;
|
|
881
|
-
}
|
|
882
|
-
|
|
883
|
-
template<typename A>
|
|
884
|
-
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_seed(uint64_t seed) {
|
|
885
|
-
seed_ = seed;
|
|
886
|
-
return *this;
|
|
887
|
-
}
|
|
888
|
-
|
|
889
|
-
template<typename A>
|
|
890
|
-
uint8_t update_theta_sketch_alloc<A>::builder::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
|
891
|
-
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
|
892
|
-
}
|
|
893
|
-
|
|
894
|
-
template<typename A>
|
|
895
|
-
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
|
896
|
-
return update_theta_sketch_alloc<A>(starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_)), lg_k_, rf_, p_, seed_);
|
|
897
|
-
}
|
|
898
|
-
|
|
899
|
-
// iterator
|
|
900
|
-
|
|
901
|
-
template<typename A>
|
|
902
|
-
theta_sketch_alloc<A>::const_iterator::const_iterator(const uint64_t* keys, uint32_t size, uint32_t index):
|
|
903
|
-
keys_(keys), size_(size), index_(index) {
|
|
904
|
-
while (index_ < size_ && keys_[index_] == 0) ++index_;
|
|
905
|
-
}
|
|
906
|
-
|
|
907
|
-
template<typename A>
|
|
908
|
-
typename theta_sketch_alloc<A>::const_iterator& theta_sketch_alloc<A>::const_iterator::operator++() {
|
|
909
|
-
do {
|
|
910
|
-
++index_;
|
|
911
|
-
} while (index_ < size_ && keys_[index_] == 0);
|
|
912
|
-
return *this;
|
|
913
|
-
}
|
|
914
|
-
|
|
915
|
-
template<typename A>
|
|
916
|
-
typename theta_sketch_alloc<A>::const_iterator theta_sketch_alloc<A>::const_iterator::operator++(int) {
|
|
917
|
-
const_iterator tmp(*this);
|
|
918
|
-
operator++();
|
|
919
|
-
return tmp;
|
|
920
|
-
}
|
|
921
|
-
|
|
922
|
-
template<typename A>
|
|
923
|
-
bool theta_sketch_alloc<A>::const_iterator::operator==(const const_iterator& other) const {
|
|
924
|
-
return index_ == other.index_;
|
|
925
|
-
}
|
|
926
|
-
|
|
927
|
-
template<typename A>
|
|
928
|
-
bool theta_sketch_alloc<A>::const_iterator::operator!=(const const_iterator& other) const {
|
|
929
|
-
return index_ != other.index_;
|
|
930
|
-
}
|
|
931
|
-
|
|
932
|
-
template<typename A>
|
|
933
|
-
uint64_t theta_sketch_alloc<A>::const_iterator::operator*() const {
|
|
934
|
-
return keys_[index_];
|
|
482
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
483
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
935
484
|
}
|
|
936
485
|
|
|
937
486
|
} /* namespace datasketches */
|
|
938
487
|
|
|
939
488
|
#endif
|
|
489
|
+
|