datasketches 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +3 -3
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
- data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
- data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +61 -79
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -26,22 +26,12 @@
|
|
26
26
|
#include <iterator>
|
27
27
|
#include <vector>
|
28
28
|
|
29
|
-
|
30
|
-
/**
|
31
|
-
* This sketch samples data from a stream of items, designed for optimal (minimum) variance when
|
32
|
-
* querying the sketch to estimate subset sums of items matchng a provided predicate. Variance
|
33
|
-
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
|
34
|
-
* subset sum estimation.
|
35
|
-
*
|
36
|
-
* author Kevin Lang
|
37
|
-
* author Jon Malkin
|
38
|
-
*/
|
39
29
|
namespace datasketches {
|
40
30
|
|
41
31
|
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
42
32
|
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
43
33
|
|
44
|
-
|
34
|
+
/*
|
45
35
|
* A struct to hold the result of subset sum queries
|
46
36
|
*/
|
47
37
|
struct subset_summary {
|
@@ -53,11 +43,23 @@ struct subset_summary {
|
|
53
43
|
|
54
44
|
template <typename T, typename A> class var_opt_union; // forward declaration
|
55
45
|
|
46
|
+
/// VarOpt sketch constants
|
56
47
|
namespace var_opt_constants {
|
57
|
-
|
58
|
-
|
48
|
+
/// default resize factor
|
49
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
50
|
+
/// maximum value of parameter K
|
51
|
+
const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
59
52
|
}
|
60
53
|
|
54
|
+
/**
|
55
|
+
* This sketch samples data from a stream of items. Designed for optimal (minimum) variance when
|
56
|
+
* querying the sketch to estimate subset sums of items matching a provided predicate. Variance
|
57
|
+
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
|
58
|
+
* subset sum estimation.
|
59
|
+
*
|
60
|
+
* author Kevin Lang
|
61
|
+
* author Jon Malkin
|
62
|
+
*/
|
61
63
|
template<
|
62
64
|
typename T,
|
63
65
|
typename A = std::allocator<T>
|
@@ -68,15 +70,42 @@ class var_opt_sketch {
|
|
68
70
|
static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
|
69
71
|
static const uint32_t MAX_K = var_opt_constants::MAX_K;
|
70
72
|
|
73
|
+
/**
|
74
|
+
* Constructor
|
75
|
+
* @param k sketch size
|
76
|
+
* @param rf resize factor
|
77
|
+
* @param allocator instance of an allocator
|
78
|
+
*/
|
71
79
|
explicit var_opt_sketch(uint32_t k,
|
72
80
|
resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR,
|
73
81
|
const A& allocator = A());
|
82
|
+
|
83
|
+
/**
|
84
|
+
* Copy constructor
|
85
|
+
* @param other sketch to be copied
|
86
|
+
*/
|
74
87
|
var_opt_sketch(const var_opt_sketch& other);
|
88
|
+
|
89
|
+
/**
|
90
|
+
* Move constructor
|
91
|
+
* @param other sketch to be moved
|
92
|
+
*/
|
75
93
|
var_opt_sketch(var_opt_sketch&& other) noexcept;
|
76
94
|
|
77
95
|
~var_opt_sketch();
|
78
96
|
|
97
|
+
/**
|
98
|
+
* Copy assignment
|
99
|
+
* @param other sketch to be copied
|
100
|
+
* @return reference to this sketch
|
101
|
+
*/
|
79
102
|
var_opt_sketch& operator=(const var_opt_sketch& other);
|
103
|
+
|
104
|
+
/**
|
105
|
+
* Move assignment
|
106
|
+
* @param other sketch to be moved
|
107
|
+
* @return reference to this sketch
|
108
|
+
*/
|
80
109
|
var_opt_sketch& operator=(var_opt_sketch&& other);
|
81
110
|
|
82
111
|
/**
|
@@ -85,7 +114,7 @@ class var_opt_sketch {
|
|
85
114
|
* @param item an item from a stream of items
|
86
115
|
* @param weight the weight of the item
|
87
116
|
*/
|
88
|
-
void update(const T& item, double weight=1.0);
|
117
|
+
void update(const T& item, double weight = 1.0);
|
89
118
|
|
90
119
|
/**
|
91
120
|
* Updates this sketch with the given data item with the given weight.
|
@@ -93,7 +122,7 @@ class var_opt_sketch {
|
|
93
122
|
* @param item an item from a stream of items
|
94
123
|
* @param weight the weight of the item
|
95
124
|
*/
|
96
|
-
void update(T&& item, double weight=1.0);
|
125
|
+
void update(T&& item, double weight = 1.0);
|
97
126
|
|
98
127
|
/**
|
99
128
|
* Returns the configured maximum sample size.
|
@@ -117,7 +146,7 @@ class var_opt_sketch {
|
|
117
146
|
* Computes an estimated subset sum from the entire stream for objects matching a given
|
118
147
|
* predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
|
119
148
|
* deviations. This is technically a heuristic method and tries to err on the conservative side.
|
120
|
-
* @param
|
149
|
+
* @param predicate a predicate function
|
121
150
|
* @return a subset_summary item with estimate, upper and lower bounds,
|
122
151
|
* and total sketch weight
|
123
152
|
*/
|
@@ -138,7 +167,7 @@ class var_opt_sketch {
|
|
138
167
|
/**
|
139
168
|
* Computes size needed to serialize the current state of the sketch.
|
140
169
|
* This version is for fixed-size arithmetic types (integral and floating point).
|
141
|
-
* @param instance of a SerDe
|
170
|
+
* @param sd instance of a SerDe
|
142
171
|
* @return size in bytes needed to serialize this sketch
|
143
172
|
*/
|
144
173
|
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
@@ -147,7 +176,7 @@ class var_opt_sketch {
|
|
147
176
|
/**
|
148
177
|
* Computes size needed to serialize the current state of the sketch.
|
149
178
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
150
|
-
* @param instance of a SerDe
|
179
|
+
* @param sd instance of a SerDe
|
151
180
|
* @return size in bytes needed to serialize this sketch
|
152
181
|
*/
|
153
182
|
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
@@ -155,7 +184,7 @@ class var_opt_sketch {
|
|
155
184
|
|
156
185
|
// This is a convenience alias for users
|
157
186
|
// The type returned by the following serialize method
|
158
|
-
|
187
|
+
using vector_bytes = vector_u8<A>;
|
159
188
|
|
160
189
|
/**
|
161
190
|
* This method serializes the sketch as a vector of bytes.
|
@@ -163,7 +192,7 @@ class var_opt_sketch {
|
|
163
192
|
* It is a blank space of a given size.
|
164
193
|
* This header is used in Datasketches PostgreSQL extension.
|
165
194
|
* @param header_size_bytes space to reserve in front of the sketch
|
166
|
-
* @param instance of a SerDe
|
195
|
+
* @param sd instance of a SerDe
|
167
196
|
*/
|
168
197
|
template<typename SerDe = serde<T>>
|
169
198
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
@@ -171,7 +200,7 @@ class var_opt_sketch {
|
|
171
200
|
/**
|
172
201
|
* This method serializes the sketch into a given stream in a binary form
|
173
202
|
* @param os output stream
|
174
|
-
* @param instance of a SerDe
|
203
|
+
* @param sd instance of a SerDe
|
175
204
|
*/
|
176
205
|
template<typename SerDe = serde<T>>
|
177
206
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
@@ -179,8 +208,8 @@ class var_opt_sketch {
|
|
179
208
|
/**
|
180
209
|
* This method deserializes a sketch from a given stream.
|
181
210
|
* @param is input stream
|
182
|
-
* @param instance of a SerDe
|
183
|
-
* @param instance of an
|
211
|
+
* @param sd instance of a SerDe
|
212
|
+
* @param allocator instance of an allocator
|
184
213
|
* @return an instance of a sketch
|
185
214
|
*/
|
186
215
|
template<typename SerDe = serde<T>>
|
@@ -190,8 +219,8 @@ class var_opt_sketch {
|
|
190
219
|
* This method deserializes a sketch from a given array of bytes.
|
191
220
|
* @param bytes pointer to the array of bytes
|
192
221
|
* @param size the size of the array
|
193
|
-
* @param instance of a SerDe
|
194
|
-
* @param instance of an
|
222
|
+
* @param sd instance of a SerDe
|
223
|
+
* @param allocator instance of an allocator
|
195
224
|
* @return an instance of a sketch
|
196
225
|
*/
|
197
226
|
template<typename SerDe = serde<T>>
|
@@ -205,7 +234,8 @@ class var_opt_sketch {
|
|
205
234
|
|
206
235
|
/**
|
207
236
|
* Prints the raw sketch items to a string. Calls items_to_stream() internally.
|
208
|
-
* Only works for type T with a defined
|
237
|
+
* Only works for type T with a defined
|
238
|
+
* std::ostream& operator<<(std::ostream&, const T&) and
|
209
239
|
* kept separate from to_string() to allow compilation even if
|
210
240
|
* T does not have such an operator defined.
|
211
241
|
* @return a string with the sketch items
|
@@ -213,7 +243,20 @@ class var_opt_sketch {
|
|
213
243
|
string<A> items_to_string() const;
|
214
244
|
|
215
245
|
class const_iterator;
|
246
|
+
|
247
|
+
/**
|
248
|
+
* Iterator pointing to the first item in the sketch.
|
249
|
+
* If the sketch is empty, the returned iterator must not be dereferenced or incremented.
|
250
|
+
* @return iterator pointing to the first item in the sketch
|
251
|
+
*/
|
216
252
|
const_iterator begin() const;
|
253
|
+
|
254
|
+
/**
|
255
|
+
* Iterator pointing to the past-the-end item in the sketch.
|
256
|
+
* The past-the-end item is the hypothetical item that would follow the last item.
|
257
|
+
* It does not point to any item, and must not be dereferenced or incremented.
|
258
|
+
* @return iterator pointing to the past-the-end item in the sketch
|
259
|
+
*/
|
217
260
|
const_iterator end() const;
|
218
261
|
|
219
262
|
private:
|
@@ -346,14 +389,21 @@ class var_opt_sketch {
|
|
346
389
|
};
|
347
390
|
|
348
391
|
template<typename T, typename A>
|
349
|
-
class var_opt_sketch<T, A>::const_iterator
|
392
|
+
class var_opt_sketch<T, A>::const_iterator {
|
350
393
|
public:
|
394
|
+
using iterator_category = std::input_iterator_tag;
|
395
|
+
using value_type = std::pair<const T&, const double>;
|
396
|
+
using difference_type = void;
|
397
|
+
using pointer = const return_value_holder<value_type>;
|
398
|
+
using reference = const value_type;
|
399
|
+
|
351
400
|
const_iterator(const const_iterator& other);
|
352
401
|
const_iterator& operator++();
|
353
402
|
const_iterator& operator++(int);
|
354
403
|
bool operator==(const const_iterator& other) const;
|
355
404
|
bool operator!=(const const_iterator& other) const;
|
356
|
-
|
405
|
+
reference operator*() const;
|
406
|
+
pointer operator->() const;
|
357
407
|
|
358
408
|
private:
|
359
409
|
friend class var_opt_sketch<T, A>;
|
@@ -362,8 +412,8 @@ private:
|
|
362
412
|
// default iterator over full sketch
|
363
413
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
|
364
414
|
|
365
|
-
// iterates over only one of the H or R
|
366
|
-
//
|
415
|
+
// iterates over only one of the H or R regions
|
416
|
+
// does not apply weight correction
|
367
417
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
|
368
418
|
|
369
419
|
bool get_mark() const;
|
@@ -377,14 +427,21 @@ private:
|
|
377
427
|
|
378
428
|
// non-const iterator for internal use
|
379
429
|
template<typename T, typename A>
|
380
|
-
class var_opt_sketch<T, A>::iterator
|
430
|
+
class var_opt_sketch<T, A>::iterator {
|
381
431
|
public:
|
432
|
+
using iterator_category = std::input_iterator_tag;
|
433
|
+
using value_type = std::pair<T&, double>;
|
434
|
+
using difference_type = void;
|
435
|
+
using pointer = return_value_holder<value_type>;
|
436
|
+
using reference = value_type;
|
437
|
+
|
382
438
|
iterator(const iterator& other);
|
383
439
|
iterator& operator++();
|
384
440
|
iterator& operator++(int);
|
385
441
|
bool operator==(const iterator& other) const;
|
386
442
|
bool operator!=(const iterator& other) const;
|
387
|
-
|
443
|
+
reference operator*();
|
444
|
+
pointer operator->();
|
388
445
|
|
389
446
|
private:
|
390
447
|
friend class var_opt_sketch<T, A>;
|
@@ -36,7 +36,7 @@
|
|
36
36
|
|
37
37
|
namespace datasketches {
|
38
38
|
|
39
|
-
|
39
|
+
/*
|
40
40
|
* Implementation code for the VarOpt sketch.
|
41
41
|
*
|
42
42
|
* author Kevin Lang
|
@@ -189,16 +189,16 @@ var_opt_sketch<T, A>::~var_opt_sketch() {
|
|
189
189
|
// destroy everything
|
190
190
|
const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
|
191
191
|
for (size_t i = 0; i < num_to_destroy; ++i) {
|
192
|
-
|
192
|
+
data_[i].~T();
|
193
193
|
}
|
194
194
|
} else {
|
195
195
|
// skip gap or anything unused at the end
|
196
196
|
for (size_t i = 0; i < h_; ++i) {
|
197
|
-
|
197
|
+
data_[i].~T();
|
198
198
|
}
|
199
199
|
|
200
200
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
|
201
|
-
|
201
|
+
data_[i].~T();
|
202
202
|
}
|
203
203
|
}
|
204
204
|
allocator_.deallocate(data_, curr_items_alloc_);
|
@@ -658,14 +658,14 @@ void var_opt_sketch<T, A>::reset() {
|
|
658
658
|
// destroy everything
|
659
659
|
const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
|
660
660
|
for (size_t i = 0; i < num_to_destroy; ++i)
|
661
|
-
|
661
|
+
data_[i].~T();
|
662
662
|
} else {
|
663
663
|
// skip gap or anything unused at the end
|
664
664
|
for (size_t i = 0; i < h_; ++i)
|
665
|
-
|
665
|
+
data_[i].~T();
|
666
666
|
|
667
667
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
668
|
-
|
668
|
+
data_[i].~T();
|
669
669
|
}
|
670
670
|
|
671
671
|
if (curr_items_alloc_ < prev_alloc) {
|
@@ -754,10 +754,10 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
|
|
754
754
|
const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
|
755
755
|
for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
|
756
756
|
if (i == h_ && print_gap) {
|
757
|
-
os <<
|
757
|
+
os << display_idx << ": GAP" << std::endl;
|
758
758
|
++display_idx;
|
759
759
|
} else {
|
760
|
-
os <<
|
760
|
+
os << display_idx << ": " << data_[i] << "\twt = ";
|
761
761
|
if (weights_[i] == -1.0) {
|
762
762
|
os << get_tau() << "\t(-1.0)" << std::endl;
|
763
763
|
} else {
|
@@ -895,7 +895,7 @@ void var_opt_sketch<T, A>::update_heavy_r_eq1(O&& item, double weight, bool mark
|
|
895
895
|
grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
|
896
896
|
}
|
897
897
|
|
898
|
-
|
898
|
+
/*
|
899
899
|
* Decreases sketch's value of k by 1, updating stored values as needed.
|
900
900
|
*
|
901
901
|
* <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
|
@@ -990,7 +990,7 @@ void var_opt_sketch<T, A>::grow_data_arrays() {
|
|
990
990
|
|
991
991
|
for (uint32_t i = 0; i < prev_size; ++i) {
|
992
992
|
new (&tmp_data[i]) T(std::move(data_[i]));
|
993
|
-
|
993
|
+
data_[i].~T();
|
994
994
|
tmp_weights[i] = weights_[i];
|
995
995
|
}
|
996
996
|
|
@@ -1531,7 +1531,6 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const var_opt_sketch& sk, b
|
|
1531
1531
|
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1532
1532
|
}
|
1533
1533
|
|
1534
|
-
|
1535
1534
|
template<typename T, typename A>
|
1536
1535
|
var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other) :
|
1537
1536
|
sk_(other.sk_),
|
@@ -1543,6 +1542,9 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other
|
|
1543
1542
|
|
1544
1543
|
template<typename T, typename A>
|
1545
1544
|
typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_iterator::operator++() {
|
1545
|
+
// accumulate weight already visited
|
1546
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1547
|
+
|
1546
1548
|
++idx_;
|
1547
1549
|
|
1548
1550
|
if (idx_ == final_idx_) {
|
@@ -1551,7 +1553,6 @@ typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_itera
|
|
1551
1553
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1552
1554
|
++idx_;
|
1553
1555
|
}
|
1554
|
-
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1555
1556
|
return *this;
|
1556
1557
|
}
|
1557
1558
|
|
@@ -1575,14 +1576,19 @@ bool var_opt_sketch<T, A>::const_iterator::operator!=(const const_iterator& othe
|
|
1575
1576
|
}
|
1576
1577
|
|
1577
1578
|
template<typename T, typename A>
|
1578
|
-
|
1579
|
+
auto var_opt_sketch<T, A>::const_iterator::operator*() const -> reference {
|
1579
1580
|
double wt;
|
1580
1581
|
if (idx_ < sk_->h_) {
|
1581
1582
|
wt = sk_->weights_[idx_];
|
1582
1583
|
} else {
|
1583
1584
|
wt = r_item_wt_;
|
1584
1585
|
}
|
1585
|
-
return
|
1586
|
+
return value_type(sk_->data_[idx_], wt);
|
1587
|
+
}
|
1588
|
+
|
1589
|
+
template<typename T, typename A>
|
1590
|
+
auto var_opt_sketch<T, A>::const_iterator::operator->() const -> pointer {
|
1591
|
+
return **this;
|
1586
1592
|
}
|
1587
1593
|
|
1588
1594
|
template<typename T, typename A>
|
@@ -1622,6 +1628,9 @@ var_opt_sketch<T, A>::iterator::iterator(const iterator& other) :
|
|
1622
1628
|
|
1623
1629
|
template<typename T, typename A>
|
1624
1630
|
typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operator++() {
|
1631
|
+
// accumulate weight already visited
|
1632
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1633
|
+
|
1625
1634
|
++idx_;
|
1626
1635
|
|
1627
1636
|
if (idx_ == final_idx_) {
|
@@ -1630,7 +1639,7 @@ typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operato
|
|
1630
1639
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1631
1640
|
++idx_;
|
1632
1641
|
}
|
1633
|
-
|
1642
|
+
|
1634
1643
|
return *this;
|
1635
1644
|
}
|
1636
1645
|
|
@@ -1654,7 +1663,7 @@ bool var_opt_sketch<T, A>::iterator::operator!=(const iterator& other) const {
|
|
1654
1663
|
}
|
1655
1664
|
|
1656
1665
|
template<typename T, typename A>
|
1657
|
-
|
1666
|
+
auto var_opt_sketch<T, A>::iterator::operator*() -> reference {
|
1658
1667
|
double wt;
|
1659
1668
|
if (idx_ < sk_->h_) {
|
1660
1669
|
wt = sk_->weights_[idx_];
|
@@ -1663,7 +1672,12 @@ std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
|
|
1663
1672
|
} else {
|
1664
1673
|
wt = r_item_wt_;
|
1665
1674
|
}
|
1666
|
-
return
|
1675
|
+
return value_type(sk_->data_[idx_], wt);
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
template<typename T, typename A>
|
1679
|
+
auto var_opt_sketch<T, A>::iterator::operator->() -> pointer {
|
1680
|
+
return **this;
|
1667
1681
|
}
|
1668
1682
|
|
1669
1683
|
template<typename T, typename A>
|
@@ -1671,7 +1685,7 @@ bool var_opt_sketch<T, A>::iterator::get_mark() const {
|
|
1671
1685
|
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
|
1672
1686
|
}
|
1673
1687
|
|
1674
|
-
|
1688
|
+
/*
|
1675
1689
|
* Checks if target sampling allocation is more than 50% of max sampling size.
|
1676
1690
|
* If so, returns max sampling size, otherwise passes through target size.
|
1677
1691
|
*/
|
@@ -52,7 +52,6 @@ template<
|
|
52
52
|
class var_opt_union {
|
53
53
|
|
54
54
|
public:
|
55
|
-
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
56
55
|
|
57
56
|
explicit var_opt_union(uint32_t max_k, const A& allocator = A());
|
58
57
|
var_opt_union(const var_opt_union& other);
|
@@ -91,7 +90,7 @@ public:
|
|
91
90
|
/**
|
92
91
|
* Computes size needed to serialize the current state of the union.
|
93
92
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
94
|
-
* @param instance of a SerDe
|
93
|
+
* @param sd instance of a SerDe
|
95
94
|
* @return size in bytes needed to serialize this sketch
|
96
95
|
*/
|
97
96
|
template<typename SerDe = serde<T>>
|
@@ -108,7 +107,7 @@ public:
|
|
108
107
|
* It is a blank space of a given size.
|
109
108
|
* This header is used in Datasketches PostgreSQL extension.
|
110
109
|
* @param header_size_bytes space to reserve in front of the sketch
|
111
|
-
* @param instance of a SerDe
|
110
|
+
* @param sd instance of a SerDe
|
112
111
|
*/
|
113
112
|
template<typename SerDe = serde<T>>
|
114
113
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
@@ -117,7 +116,7 @@ public:
|
|
117
116
|
* NOTE: This method may be deprecated in a future version.
|
118
117
|
* This method serializes the sketch into a given stream in a binary form
|
119
118
|
* @param os output stream
|
120
|
-
* @param instance of a SerDe
|
119
|
+
* @param sd instance of a SerDe
|
121
120
|
*/
|
122
121
|
template<typename SerDe = serde<T>>
|
123
122
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
@@ -126,8 +125,8 @@ public:
|
|
126
125
|
* NOTE: This method may be deprecated in a future version.
|
127
126
|
* This method deserializes a union from a given stream.
|
128
127
|
* @param is input stream
|
129
|
-
* @param instance of a SerDe
|
130
|
-
* @param instance of an Allocator
|
128
|
+
* @param sd instance of a SerDe
|
129
|
+
* @param allocator instance of an Allocator
|
131
130
|
* @return an instance of a union
|
132
131
|
*/
|
133
132
|
template<typename SerDe = serde<T>>
|
@@ -138,8 +137,8 @@ public:
|
|
138
137
|
* This method deserializes a union from a given array of bytes.
|
139
138
|
* @param bytes pointer to the array of bytes
|
140
139
|
* @param size the size of the array
|
141
|
-
* @param instance of a SerDe
|
142
|
-
* @param instance of an Allocator
|
140
|
+
* @param sd instance of a SerDe
|
141
|
+
* @param allocator instance of an Allocator
|
143
142
|
* @return an instance of a union
|
144
143
|
*/
|
145
144
|
template<typename SerDe = serde<T>>
|
@@ -152,7 +151,9 @@ public:
|
|
152
151
|
string<A> to_string() const;
|
153
152
|
|
154
153
|
private:
|
155
|
-
|
154
|
+
using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
|
155
|
+
using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
|
156
|
+
using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
|
156
157
|
|
157
158
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
158
159
|
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
@@ -170,10 +171,12 @@ private:
|
|
170
171
|
|
171
172
|
uint32_t max_k_;
|
172
173
|
|
174
|
+
A allocator_;
|
175
|
+
|
173
176
|
var_opt_sketch<T, A> gadget_;
|
174
177
|
|
175
178
|
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
176
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget);
|
179
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
|
177
180
|
|
178
181
|
/*
|
179
182
|
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
@@ -34,6 +34,7 @@ var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
34
34
|
outer_tau_numer_(0.0),
|
35
35
|
outer_tau_denom_(0),
|
36
36
|
max_k_(max_k),
|
37
|
+
allocator_(allocator),
|
37
38
|
gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
38
39
|
{}
|
39
40
|
|
@@ -43,6 +44,7 @@ var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
|
|
43
44
|
outer_tau_numer_(other.outer_tau_numer_),
|
44
45
|
outer_tau_denom_(other.outer_tau_denom_),
|
45
46
|
max_k_(other.max_k_),
|
47
|
+
allocator_(other.allocator_),
|
46
48
|
gadget_(other.gadget_)
|
47
49
|
{}
|
48
50
|
|
@@ -52,16 +54,18 @@ var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
|
|
52
54
|
outer_tau_numer_(other.outer_tau_numer_),
|
53
55
|
outer_tau_denom_(other.outer_tau_denom_),
|
54
56
|
max_k_(other.max_k_),
|
57
|
+
allocator_(other.allocator_),
|
55
58
|
gadget_(std::move(other.gadget_))
|
56
59
|
{}
|
57
60
|
|
58
61
|
template<typename T, typename A>
|
59
62
|
var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
60
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
|
63
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator) :
|
61
64
|
n_(n),
|
62
65
|
outer_tau_numer_(outer_tau_numer),
|
63
66
|
outer_tau_denom_(outer_tau_denom),
|
64
67
|
max_k_(max_k),
|
68
|
+
allocator_(allocator),
|
65
69
|
gadget_(gadget)
|
66
70
|
{}
|
67
71
|
|
@@ -75,6 +79,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other)
|
|
75
79
|
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
76
80
|
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
77
81
|
std::swap(max_k_, union_copy.max_k_);
|
82
|
+
std::swap(allocator_, other.allocator_);
|
78
83
|
std::swap(gadget_, union_copy.gadget_);
|
79
84
|
return *this;
|
80
85
|
}
|
@@ -85,6 +90,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
|
|
85
90
|
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
86
91
|
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
87
92
|
std::swap(max_k_, other.max_k_);
|
93
|
+
std::swap(allocator_, other.allocator_);
|
88
94
|
std::swap(gadget_, other.gadget_);
|
89
95
|
return *this;
|
90
96
|
}
|
@@ -140,7 +146,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
|
|
140
146
|
check_preamble_longs(preamble_longs, flags);
|
141
147
|
check_family_and_serialization_version(family_id, serial_version);
|
142
148
|
|
143
|
-
if (max_k == 0 || max_k > MAX_K) {
|
149
|
+
if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
|
144
150
|
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
145
151
|
}
|
146
152
|
|
@@ -162,7 +168,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
|
|
162
168
|
if (!is.good())
|
163
169
|
throw std::runtime_error("error reading from std::istream");
|
164
170
|
|
165
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
171
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
166
172
|
}
|
167
173
|
|
168
174
|
template<typename T, typename A>
|
@@ -184,7 +190,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
|
|
184
190
|
check_preamble_longs(preamble_longs, flags);
|
185
191
|
check_family_and_serialization_version(family_id, serial_version);
|
186
192
|
|
187
|
-
if (max_k == 0 || max_k > MAX_K) {
|
193
|
+
if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
|
188
194
|
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
189
195
|
}
|
190
196
|
|
@@ -204,7 +210,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
|
|
204
210
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
205
211
|
var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
|
206
212
|
|
207
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
213
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
208
214
|
}
|
209
215
|
|
210
216
|
template<typename T, typename A>
|
@@ -442,7 +448,7 @@ var_opt_sketch<T, A> var_opt_union<T, A>::get_result() const {
|
|
442
448
|
|
443
449
|
/**
|
444
450
|
* When there are no marked items in H, the gadget is mathematically equivalent to a valid
|
445
|
-
* varopt sketch. This method simply returns a copy (without
|
451
|
+
* varopt sketch. This method simply returns a copy (without preserving marks).
|
446
452
|
*
|
447
453
|
* @return A shallow copy of the gadget as a valid varopt sketch
|
448
454
|
*/
|
@@ -508,9 +514,8 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
508
514
|
uint32_t result_r = 0;
|
509
515
|
size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
|
510
516
|
|
511
|
-
|
512
|
-
|
513
|
-
T* data = A().allocate(result_k + 1);
|
517
|
+
double* wts = AllocDouble(allocator_).allocate(result_k + 1);
|
518
|
+
T* data = A(allocator_).allocate(result_k + 1);
|
514
519
|
|
515
520
|
// insert R region items, ignoring weights
|
516
521
|
// Currently (May 2017) this next block is unreachable; this coercer is used only in the
|
@@ -519,7 +524,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
519
524
|
// Addendum (Jan 2020): Cleanup at end of method assumes R count is 0
|
520
525
|
const size_t final_idx = gadget_.get_num_samples();
|
521
526
|
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
|
522
|
-
|
527
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
523
528
|
wts[next_r_pos] = gadget_.weights_[idx];
|
524
529
|
++result_r;
|
525
530
|
--next_r_pos;
|
@@ -530,21 +535,21 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
530
535
|
// insert H region items
|
531
536
|
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
|
532
537
|
if (gadget_.marks_[idx]) {
|
533
|
-
|
538
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
534
539
|
wts[next_r_pos] = -1.0;
|
535
540
|
transferred_weight += gadget_.weights_[idx];
|
536
541
|
++result_r;
|
537
542
|
--next_r_pos;
|
538
543
|
} else {
|
539
|
-
|
544
|
+
new (&data[result_h]) T(gadget_.data_[idx]);
|
540
545
|
wts[result_h] = gadget_.weights_[idx];
|
541
546
|
++result_h;
|
542
547
|
}
|
543
548
|
}
|
544
549
|
|
545
550
|
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
546
|
-
if (
|
547
|
-
throw std::logic_error("
|
551
|
+
if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
552
|
+
throw std::logic_error("unexpected mismatch in transferred weight");
|
548
553
|
}
|
549
554
|
|
550
555
|
const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
|
@@ -554,11 +559,10 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
554
559
|
wts[result_h] = -1.0;
|
555
560
|
|
556
561
|
// clean up arrays in input sketch, replace with new values
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
A().deallocate(sk.data_, sk.curr_items_alloc_);
|
562
|
+
AllocBool(allocator_).deallocate(sk.marks_, sk.curr_items_alloc_);
|
563
|
+
AllocDouble(allocator_).deallocate(sk.weights_, sk.curr_items_alloc_);
|
564
|
+
for (size_t i = 0; i < result_k; ++i) { sk.data_[i].~T(); } // assumes everything in H region, no gap
|
565
|
+
A(allocator_).deallocate(sk.data_, sk.curr_items_alloc_);
|
562
566
|
|
563
567
|
sk.data_ = data;
|
564
568
|
sk.weights_ = wts;
|