datasketches 0.1.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
- data/vendor/datasketches-cpp/python/README.md +52 -49
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
- data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
- data/vendor/datasketches-cpp/setup.py +11 -6
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
- metadata +51 -36
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -33,10 +33,14 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
33
33
|
const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
|
34
34
|
|
35
35
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
36
|
-
frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size):
|
36
|
+
frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
|
37
37
|
total_weight(0),
|
38
38
|
offset(0),
|
39
|
-
map(
|
39
|
+
map(
|
40
|
+
std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
|
41
|
+
std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
|
42
|
+
allocator
|
43
|
+
)
|
40
44
|
{
|
41
45
|
if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
|
42
46
|
}
|
@@ -61,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
61
65
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
|
62
66
|
if (other.is_empty()) return;
|
63
67
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
64
|
-
for (auto
|
68
|
+
for (auto it: other.map) {
|
65
69
|
update(it.first, it.second);
|
66
70
|
}
|
67
71
|
offset += other.offset;
|
@@ -72,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
72
76
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
|
73
77
|
if (other.is_empty()) return;
|
74
78
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
75
|
-
for (auto
|
79
|
+
for (auto it: other.map) {
|
76
80
|
update(std::move(it.first), it.second);
|
77
81
|
}
|
78
82
|
offset += other.offset;
|
@@ -142,8 +146,8 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
|
|
142
146
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
143
147
|
typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
|
144
148
|
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
|
145
|
-
vector_row items;
|
146
|
-
for (auto
|
149
|
+
vector_row items(map.get_allocator());
|
150
|
+
for (auto it: map) {
|
147
151
|
const W lb = it.second;
|
148
152
|
const W ub = it.second + offset;
|
149
153
|
if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
|
@@ -158,43 +162,45 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
|
|
158
162
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
159
163
|
void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
|
160
164
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
161
|
-
|
165
|
+
write(os, preamble_longs);
|
162
166
|
const uint8_t serial_version = SERIAL_VERSION;
|
163
|
-
|
167
|
+
write(os, serial_version);
|
164
168
|
const uint8_t family = FAMILY_ID;
|
165
|
-
|
169
|
+
write(os, family);
|
166
170
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
167
|
-
|
171
|
+
write(os, lg_max_size);
|
168
172
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
169
|
-
|
173
|
+
write(os, lg_cur_size);
|
170
174
|
const uint8_t flags_byte(
|
171
175
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
172
176
|
);
|
173
|
-
|
177
|
+
write(os, flags_byte);
|
174
178
|
const uint16_t unused16 = 0;
|
175
|
-
|
179
|
+
write(os, unused16);
|
176
180
|
if (!is_empty()) {
|
177
181
|
const uint32_t num_items = map.get_num_active();
|
178
|
-
|
182
|
+
write(os, num_items);
|
179
183
|
const uint32_t unused32 = 0;
|
180
|
-
|
181
|
-
|
182
|
-
|
184
|
+
write(os, unused32);
|
185
|
+
write(os, total_weight);
|
186
|
+
write(os, offset);
|
183
187
|
|
184
188
|
// copy active items and their weights to use batch serialization
|
185
|
-
|
186
|
-
|
187
|
-
|
189
|
+
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
190
|
+
AllocW aw(map.get_allocator());
|
191
|
+
W* weights = aw.allocate(num_items);
|
192
|
+
A alloc(map.get_allocator());
|
193
|
+
T* items = alloc.allocate(num_items);
|
188
194
|
uint32_t i = 0;
|
189
|
-
for (auto
|
195
|
+
for (auto it: map) {
|
190
196
|
new (&items[i]) T(it.first);
|
191
197
|
weights[i++] = it.second;
|
192
198
|
}
|
193
|
-
|
194
|
-
|
199
|
+
write(os, weights, sizeof(W) * num_items);
|
200
|
+
aw.deallocate(weights, num_items);
|
195
201
|
S().serialize(os, items, num_items);
|
196
|
-
for (
|
197
|
-
|
202
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
203
|
+
alloc.deallocate(items, num_items);
|
198
204
|
}
|
199
205
|
}
|
200
206
|
|
@@ -202,56 +208,56 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
202
208
|
size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
|
203
209
|
if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
|
204
210
|
size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
|
205
|
-
for (auto
|
211
|
+
for (auto it: map) size += S().size_of_item(it.first);
|
206
212
|
return size;
|
207
213
|
}
|
208
214
|
|
209
215
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
210
|
-
|
216
|
+
auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
211
217
|
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
212
|
-
|
218
|
+
vector_bytes bytes(size, 0, map.get_allocator());
|
213
219
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
214
220
|
uint8_t* end_ptr = ptr + size;
|
215
221
|
|
216
222
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
217
|
-
ptr += copy_to_mem(
|
223
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
218
224
|
const uint8_t serial_version = SERIAL_VERSION;
|
219
|
-
ptr += copy_to_mem(
|
225
|
+
ptr += copy_to_mem(serial_version, ptr);
|
220
226
|
const uint8_t family = FAMILY_ID;
|
221
|
-
ptr += copy_to_mem(
|
227
|
+
ptr += copy_to_mem(family, ptr);
|
222
228
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
223
|
-
ptr += copy_to_mem(
|
229
|
+
ptr += copy_to_mem(lg_max_size, ptr);
|
224
230
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
225
|
-
ptr += copy_to_mem(
|
231
|
+
ptr += copy_to_mem(lg_cur_size, ptr);
|
226
232
|
const uint8_t flags_byte(
|
227
233
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
228
234
|
);
|
229
|
-
ptr += copy_to_mem(
|
230
|
-
|
231
|
-
ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
|
235
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
236
|
+
ptr += sizeof(uint16_t); // unused
|
232
237
|
if (!is_empty()) {
|
233
238
|
const uint32_t num_items = map.get_num_active();
|
234
|
-
ptr += copy_to_mem(
|
235
|
-
|
236
|
-
ptr += copy_to_mem(
|
237
|
-
ptr += copy_to_mem(
|
238
|
-
ptr += copy_to_mem(&offset, ptr, sizeof(offset));
|
239
|
+
ptr += copy_to_mem(num_items, ptr);
|
240
|
+
ptr += sizeof(uint32_t); // unused
|
241
|
+
ptr += copy_to_mem(total_weight, ptr);
|
242
|
+
ptr += copy_to_mem(offset, ptr);
|
239
243
|
|
240
244
|
// copy active items and their weights to use batch serialization
|
241
|
-
|
242
|
-
|
243
|
-
|
245
|
+
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
246
|
+
AllocW aw(map.get_allocator());
|
247
|
+
W* weights = aw.allocate(num_items);
|
248
|
+
A alloc(map.get_allocator());
|
249
|
+
T* items = alloc.allocate(num_items);
|
244
250
|
uint32_t i = 0;
|
245
|
-
for (auto
|
251
|
+
for (auto it: map) {
|
246
252
|
new (&items[i]) T(it.first);
|
247
253
|
weights[i++] = it.second;
|
248
254
|
}
|
249
255
|
ptr += copy_to_mem(weights, ptr, sizeof(W) * num_items);
|
250
|
-
|
256
|
+
aw.deallocate(weights, num_items);
|
251
257
|
const size_t bytes_remaining = end_ptr - ptr;
|
252
258
|
ptr += S().serialize(ptr, bytes_remaining, items, num_items);
|
253
|
-
for (
|
254
|
-
|
259
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
260
|
+
alloc.deallocate(items, num_items);
|
255
261
|
}
|
256
262
|
return bytes;
|
257
263
|
}
|
@@ -259,37 +265,32 @@ vector_u8<A> frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_
|
|
259
265
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
260
266
|
class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
|
261
267
|
public:
|
262
|
-
items_deleter(uint32_t num, bool destroy
|
263
|
-
|
264
|
-
void
|
268
|
+
items_deleter(uint32_t num, bool destroy, const A& allocator):
|
269
|
+
allocator_(allocator), num_(num), destroy_(destroy) {}
|
270
|
+
void set_destroy(bool destroy) { destroy_ = destroy; }
|
271
|
+
void operator() (T* ptr) {
|
265
272
|
if (ptr != nullptr) {
|
266
|
-
if (
|
267
|
-
for (uint32_t i = 0; i <
|
273
|
+
if (destroy_) {
|
274
|
+
for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
|
268
275
|
}
|
269
|
-
|
276
|
+
allocator_.deallocate(ptr, num_);
|
270
277
|
}
|
271
278
|
}
|
272
279
|
private:
|
273
|
-
|
274
|
-
|
280
|
+
A allocator_;
|
281
|
+
uint32_t num_;
|
282
|
+
bool destroy_;
|
275
283
|
};
|
276
284
|
|
277
285
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
278
|
-
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is) {
|
279
|
-
|
280
|
-
|
281
|
-
uint8_t
|
282
|
-
|
283
|
-
uint8_t
|
284
|
-
|
285
|
-
|
286
|
-
is.read((char*)&lg_max_size, sizeof(lg_max_size));
|
287
|
-
uint8_t lg_cur_size;
|
288
|
-
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
289
|
-
uint8_t flags_byte;
|
290
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
291
|
-
uint16_t unused16;
|
292
|
-
is.read((char*)&unused16, sizeof(unused16));
|
286
|
+
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
|
287
|
+
const auto preamble_longs = read<uint8_t>(is);
|
288
|
+
const auto serial_version = read<uint8_t>(is);
|
289
|
+
const auto family_id = read<uint8_t>(is);
|
290
|
+
const auto lg_max_size = read<uint8_t>(is);
|
291
|
+
const auto lg_cur_size = read<uint8_t>(is);
|
292
|
+
const auto flags_byte = read<uint8_t>(is);
|
293
|
+
read<uint16_t>(is); // unused
|
293
294
|
|
294
295
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
295
296
|
|
@@ -298,22 +299,19 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
298
299
|
check_family_id(family_id);
|
299
300
|
check_size(lg_cur_size, lg_max_size);
|
300
301
|
|
301
|
-
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
|
302
|
+
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
302
303
|
if (!is_empty) {
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
W total_weight;
|
308
|
-
is.read((char*)&total_weight, sizeof(total_weight));
|
309
|
-
W offset;
|
310
|
-
is.read((char*)&offset, sizeof(offset));
|
304
|
+
const auto num_items = read<uint32_t>(is);
|
305
|
+
read<uint32_t>(is); // unused
|
306
|
+
const auto total_weight = read<W>(is);
|
307
|
+
const auto offset = read<W>(is);
|
311
308
|
|
312
309
|
// batch deserialization with intermediate array of items and weights
|
313
|
-
|
314
|
-
std::vector<W, AllocW> weights(num_items);
|
315
|
-
|
316
|
-
|
310
|
+
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
311
|
+
std::vector<W, AllocW> weights(num_items, 0, allocator);
|
312
|
+
read(is, weights.data(), sizeof(W) * num_items);
|
313
|
+
A alloc(allocator);
|
314
|
+
std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
|
317
315
|
S().deserialize(is, items.get(), num_items);
|
318
316
|
items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
|
319
317
|
for (uint32_t i = 0; i < num_items; i++) {
|
@@ -328,24 +326,23 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
328
326
|
}
|
329
327
|
|
330
328
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
331
|
-
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size) {
|
329
|
+
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
332
330
|
ensure_minimum_memory(size, 8);
|
333
331
|
const char* ptr = static_cast<const char*>(bytes);
|
334
332
|
const char* base = static_cast<const char*>(bytes);
|
335
333
|
uint8_t preamble_longs;
|
336
|
-
ptr += copy_from_mem(ptr,
|
334
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
337
335
|
uint8_t serial_version;
|
338
|
-
ptr += copy_from_mem(ptr,
|
336
|
+
ptr += copy_from_mem(ptr, serial_version);
|
339
337
|
uint8_t family_id;
|
340
|
-
ptr += copy_from_mem(ptr,
|
338
|
+
ptr += copy_from_mem(ptr, family_id);
|
341
339
|
uint8_t lg_max_size;
|
342
|
-
ptr += copy_from_mem(ptr,
|
340
|
+
ptr += copy_from_mem(ptr, lg_max_size);
|
343
341
|
uint8_t lg_cur_size;
|
344
|
-
ptr += copy_from_mem(ptr,
|
342
|
+
ptr += copy_from_mem(ptr, lg_cur_size);
|
345
343
|
uint8_t flags_byte;
|
346
|
-
ptr += copy_from_mem(ptr,
|
347
|
-
uint16_t
|
348
|
-
ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
|
344
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
345
|
+
ptr += sizeof(uint16_t); // unused
|
349
346
|
|
350
347
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
351
348
|
|
@@ -353,25 +350,25 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
353
350
|
check_serial_version(serial_version);
|
354
351
|
check_family_id(family_id);
|
355
352
|
check_size(lg_cur_size, lg_max_size);
|
356
|
-
ensure_minimum_memory(size,
|
353
|
+
ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
|
357
354
|
|
358
|
-
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
|
355
|
+
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
359
356
|
if (!is_empty) {
|
360
357
|
uint32_t num_items;
|
361
|
-
ptr += copy_from_mem(ptr,
|
362
|
-
uint32_t
|
363
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
|
358
|
+
ptr += copy_from_mem(ptr, num_items);
|
359
|
+
ptr += sizeof(uint32_t); // unused
|
364
360
|
W total_weight;
|
365
|
-
ptr += copy_from_mem(ptr,
|
361
|
+
ptr += copy_from_mem(ptr, total_weight);
|
366
362
|
W offset;
|
367
|
-
ptr += copy_from_mem(ptr,
|
363
|
+
ptr += copy_from_mem(ptr, offset);
|
368
364
|
|
369
365
|
ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
|
370
366
|
// batch deserialization with intermediate array of items and weights
|
371
|
-
|
372
|
-
std::vector<W, AllocW> weights(num_items);
|
367
|
+
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
368
|
+
std::vector<W, AllocW> weights(num_items, 0, allocator);
|
373
369
|
ptr += copy_from_mem(ptr, weights.data(), sizeof(W) * num_items);
|
374
|
-
|
370
|
+
A alloc(allocator);
|
371
|
+
std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
|
375
372
|
const size_t bytes_remaining = size - (ptr - base);
|
376
373
|
ptr += S().deserialize(ptr, bytes_remaining, items.get(), num_items);
|
377
374
|
items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
|
@@ -434,14 +431,14 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
|
|
434
431
|
os << "### End sketch summary" << std::endl;
|
435
432
|
if (print_items) {
|
436
433
|
vector_row items;
|
437
|
-
for (auto
|
434
|
+
for (auto it: map) {
|
438
435
|
items.push_back(row(&it.first, it.second, offset));
|
439
436
|
}
|
440
437
|
// sort by estimate in descending order
|
441
438
|
std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
|
442
439
|
os << "### Items in descending order by estimate" << std::endl;
|
443
440
|
os << " item, estimate, lower bound, upper bound" << std::endl;
|
444
|
-
for (auto
|
441
|
+
for (auto it: items) {
|
445
442
|
os << " " << it.get_item() << ", " << it.get_estimate() << ", "
|
446
443
|
<< it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
|
447
444
|
}
|
@@ -39,33 +39,39 @@ public:
|
|
39
39
|
using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
|
40
40
|
using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
|
41
41
|
|
42
|
-
reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size);
|
42
|
+
reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
|
43
43
|
reverse_purge_hash_map(const reverse_purge_hash_map& other);
|
44
44
|
reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
|
45
45
|
~reverse_purge_hash_map();
|
46
46
|
reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
|
47
47
|
reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
|
48
|
-
|
49
|
-
|
48
|
+
|
49
|
+
template<typename FwdK>
|
50
|
+
V adjust_or_insert(FwdK&& key, V value);
|
51
|
+
|
50
52
|
V get(const K& key) const;
|
51
53
|
uint8_t get_lg_cur_size() const;
|
52
54
|
uint8_t get_lg_max_size() const;
|
53
55
|
uint32_t get_capacity() const;
|
54
56
|
uint32_t get_num_active() const;
|
57
|
+
const A& get_allocator() const;
|
58
|
+
|
55
59
|
class iterator;
|
56
60
|
iterator begin() const;
|
57
61
|
iterator end() const;
|
62
|
+
|
58
63
|
private:
|
59
64
|
static constexpr double LOAD_FACTOR = 0.75;
|
60
65
|
static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
|
61
66
|
static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
|
62
67
|
|
63
|
-
|
64
|
-
uint8_t
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
68
|
+
A allocator_;
|
69
|
+
uint8_t lg_cur_size_;
|
70
|
+
uint8_t lg_max_size_;
|
71
|
+
uint32_t num_active_;
|
72
|
+
K* keys_;
|
73
|
+
V* values_;
|
74
|
+
uint16_t* states_;
|
69
75
|
|
70
76
|
inline bool is_active(uint32_t probe) const;
|
71
77
|
void subtract_and_keep_positive_only(V amount);
|
@@ -83,8 +89,8 @@ public:
|
|
83
89
|
friend class reverse_purge_hash_map<K, V, H, E, A>;
|
84
90
|
iterator& operator++() {
|
85
91
|
++count;
|
86
|
-
if (count < map->
|
87
|
-
const uint32_t mask = (1 << map->
|
92
|
+
if (count < map->num_active_) {
|
93
|
+
const uint32_t mask = (1 << map->lg_cur_size_) - 1;
|
88
94
|
do {
|
89
95
|
index = (index + stride) & mask;
|
90
96
|
} while (!map->is_active(index));
|
@@ -95,7 +101,7 @@ public:
|
|
95
101
|
bool operator==(const iterator& rhs) const { return count == rhs.count; }
|
96
102
|
bool operator!=(const iterator& rhs) const { return count != rhs.count; }
|
97
103
|
const std::pair<K&, V> operator*() const {
|
98
|
-
return std::pair<K&, V>(map->
|
104
|
+
return std::pair<K&, V>(map->keys_[index], map->values_[index]);
|
99
105
|
}
|
100
106
|
private:
|
101
107
|
static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
|
@@ -104,7 +110,7 @@ private:
|
|
104
110
|
uint32_t count;
|
105
111
|
uint32_t stride;
|
106
112
|
iterator(const reverse_purge_hash_map<K, V, H, E, A>* map, uint32_t index, uint32_t count):
|
107
|
-
map(map), index(index), count(count), stride(static_cast<uint32_t>((1 << map->
|
113
|
+
map(map), index(index), count(count), stride(static_cast<uint32_t>((1 << map->lg_cur_size_) * GOLDEN_RATIO_RECIPROCAL) | 1) {}
|
108
114
|
};
|
109
115
|
|
110
116
|
} /* namespace datasketches */
|