datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -26,22 +26,12 @@
|
|
|
26
26
|
#include <iterator>
|
|
27
27
|
#include <vector>
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* This sketch samples data from a stream of items, designed for optimal (minimum) variance when
|
|
32
|
-
* querying the sketch to estimate subset sums of items matchng a provided predicate. Variance
|
|
33
|
-
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
|
|
34
|
-
* subset sum estimation.
|
|
35
|
-
*
|
|
36
|
-
* author Kevin Lang
|
|
37
|
-
* author Jon Malkin
|
|
38
|
-
*/
|
|
39
29
|
namespace datasketches {
|
|
40
30
|
|
|
41
31
|
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
42
32
|
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
|
43
33
|
|
|
44
|
-
|
|
34
|
+
/*
|
|
45
35
|
* A struct to hold the result of subset sum queries
|
|
46
36
|
*/
|
|
47
37
|
struct subset_summary {
|
|
@@ -53,11 +43,23 @@ struct subset_summary {
|
|
|
53
43
|
|
|
54
44
|
template <typename T, typename A> class var_opt_union; // forward declaration
|
|
55
45
|
|
|
46
|
+
/// VarOpt sketch constants
|
|
56
47
|
namespace var_opt_constants {
|
|
57
|
-
|
|
58
|
-
|
|
48
|
+
/// default resize factor
|
|
49
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
|
50
|
+
/// maximum value of parameter K
|
|
51
|
+
const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
|
59
52
|
}
|
|
60
53
|
|
|
54
|
+
/**
|
|
55
|
+
* This sketch samples data from a stream of items. Designed for optimal (minimum) variance when
|
|
56
|
+
* querying the sketch to estimate subset sums of items matching a provided predicate. Variance
|
|
57
|
+
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
|
|
58
|
+
* subset sum estimation.
|
|
59
|
+
*
|
|
60
|
+
* author Kevin Lang
|
|
61
|
+
* author Jon Malkin
|
|
62
|
+
*/
|
|
61
63
|
template<
|
|
62
64
|
typename T,
|
|
63
65
|
typename A = std::allocator<T>
|
|
@@ -68,15 +70,42 @@ class var_opt_sketch {
|
|
|
68
70
|
static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
|
|
69
71
|
static const uint32_t MAX_K = var_opt_constants::MAX_K;
|
|
70
72
|
|
|
73
|
+
/**
|
|
74
|
+
* Constructor
|
|
75
|
+
* @param k sketch size
|
|
76
|
+
* @param rf resize factor
|
|
77
|
+
* @param allocator instance of an allocator
|
|
78
|
+
*/
|
|
71
79
|
explicit var_opt_sketch(uint32_t k,
|
|
72
80
|
resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR,
|
|
73
81
|
const A& allocator = A());
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Copy constructor
|
|
85
|
+
* @param other sketch to be copied
|
|
86
|
+
*/
|
|
74
87
|
var_opt_sketch(const var_opt_sketch& other);
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Move constructor
|
|
91
|
+
* @param other sketch to be moved
|
|
92
|
+
*/
|
|
75
93
|
var_opt_sketch(var_opt_sketch&& other) noexcept;
|
|
76
94
|
|
|
77
95
|
~var_opt_sketch();
|
|
78
96
|
|
|
97
|
+
/**
|
|
98
|
+
* Copy assignment
|
|
99
|
+
* @param other sketch to be copied
|
|
100
|
+
* @return reference to this sketch
|
|
101
|
+
*/
|
|
79
102
|
var_opt_sketch& operator=(const var_opt_sketch& other);
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Move assignment
|
|
106
|
+
* @param other sketch to be moved
|
|
107
|
+
* @return reference to this sketch
|
|
108
|
+
*/
|
|
80
109
|
var_opt_sketch& operator=(var_opt_sketch&& other);
|
|
81
110
|
|
|
82
111
|
/**
|
|
@@ -85,7 +114,7 @@ class var_opt_sketch {
|
|
|
85
114
|
* @param item an item from a stream of items
|
|
86
115
|
* @param weight the weight of the item
|
|
87
116
|
*/
|
|
88
|
-
void update(const T& item, double weight=1.0);
|
|
117
|
+
void update(const T& item, double weight = 1.0);
|
|
89
118
|
|
|
90
119
|
/**
|
|
91
120
|
* Updates this sketch with the given data item with the given weight.
|
|
@@ -93,7 +122,7 @@ class var_opt_sketch {
|
|
|
93
122
|
* @param item an item from a stream of items
|
|
94
123
|
* @param weight the weight of the item
|
|
95
124
|
*/
|
|
96
|
-
void update(T&& item, double weight=1.0);
|
|
125
|
+
void update(T&& item, double weight = 1.0);
|
|
97
126
|
|
|
98
127
|
/**
|
|
99
128
|
* Returns the configured maximum sample size.
|
|
@@ -117,7 +146,7 @@ class var_opt_sketch {
|
|
|
117
146
|
* Computes an estimated subset sum from the entire stream for objects matching a given
|
|
118
147
|
* predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
|
|
119
148
|
* deviations. This is technically a heuristic method and tries to err on the conservative side.
|
|
120
|
-
* @param
|
|
149
|
+
* @param predicate a predicate function
|
|
121
150
|
* @return a subset_summary item with estimate, upper and lower bounds,
|
|
122
151
|
* and total sketch weight
|
|
123
152
|
*/
|
|
@@ -138,7 +167,7 @@ class var_opt_sketch {
|
|
|
138
167
|
/**
|
|
139
168
|
* Computes size needed to serialize the current state of the sketch.
|
|
140
169
|
* This version is for fixed-size arithmetic types (integral and floating point).
|
|
141
|
-
* @param instance of a SerDe
|
|
170
|
+
* @param sd instance of a SerDe
|
|
142
171
|
* @return size in bytes needed to serialize this sketch
|
|
143
172
|
*/
|
|
144
173
|
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
@@ -147,7 +176,7 @@ class var_opt_sketch {
|
|
|
147
176
|
/**
|
|
148
177
|
* Computes size needed to serialize the current state of the sketch.
|
|
149
178
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
|
150
|
-
* @param instance of a SerDe
|
|
179
|
+
* @param sd instance of a SerDe
|
|
151
180
|
* @return size in bytes needed to serialize this sketch
|
|
152
181
|
*/
|
|
153
182
|
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
@@ -155,7 +184,7 @@ class var_opt_sketch {
|
|
|
155
184
|
|
|
156
185
|
// This is a convenience alias for users
|
|
157
186
|
// The type returned by the following serialize method
|
|
158
|
-
|
|
187
|
+
using vector_bytes = vector_u8<A>;
|
|
159
188
|
|
|
160
189
|
/**
|
|
161
190
|
* This method serializes the sketch as a vector of bytes.
|
|
@@ -163,7 +192,7 @@ class var_opt_sketch {
|
|
|
163
192
|
* It is a blank space of a given size.
|
|
164
193
|
* This header is used in Datasketches PostgreSQL extension.
|
|
165
194
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
166
|
-
* @param instance of a SerDe
|
|
195
|
+
* @param sd instance of a SerDe
|
|
167
196
|
*/
|
|
168
197
|
template<typename SerDe = serde<T>>
|
|
169
198
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
|
@@ -171,7 +200,7 @@ class var_opt_sketch {
|
|
|
171
200
|
/**
|
|
172
201
|
* This method serializes the sketch into a given stream in a binary form
|
|
173
202
|
* @param os output stream
|
|
174
|
-
* @param instance of a SerDe
|
|
203
|
+
* @param sd instance of a SerDe
|
|
175
204
|
*/
|
|
176
205
|
template<typename SerDe = serde<T>>
|
|
177
206
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
|
@@ -179,8 +208,8 @@ class var_opt_sketch {
|
|
|
179
208
|
/**
|
|
180
209
|
* This method deserializes a sketch from a given stream.
|
|
181
210
|
* @param is input stream
|
|
182
|
-
* @param instance of a SerDe
|
|
183
|
-
* @param instance of an
|
|
211
|
+
* @param sd instance of a SerDe
|
|
212
|
+
* @param allocator instance of an allocator
|
|
184
213
|
* @return an instance of a sketch
|
|
185
214
|
*/
|
|
186
215
|
template<typename SerDe = serde<T>>
|
|
@@ -190,8 +219,8 @@ class var_opt_sketch {
|
|
|
190
219
|
* This method deserializes a sketch from a given array of bytes.
|
|
191
220
|
* @param bytes pointer to the array of bytes
|
|
192
221
|
* @param size the size of the array
|
|
193
|
-
* @param instance of a SerDe
|
|
194
|
-
* @param instance of an
|
|
222
|
+
* @param sd instance of a SerDe
|
|
223
|
+
* @param allocator instance of an allocator
|
|
195
224
|
* @return an instance of a sketch
|
|
196
225
|
*/
|
|
197
226
|
template<typename SerDe = serde<T>>
|
|
@@ -205,7 +234,8 @@ class var_opt_sketch {
|
|
|
205
234
|
|
|
206
235
|
/**
|
|
207
236
|
* Prints the raw sketch items to a string. Calls items_to_stream() internally.
|
|
208
|
-
* Only works for type T with a defined
|
|
237
|
+
* Only works for type T with a defined
|
|
238
|
+
* std::ostream& operator<<(std::ostream&, const T&) and
|
|
209
239
|
* kept separate from to_string() to allow compilation even if
|
|
210
240
|
* T does not have such an operator defined.
|
|
211
241
|
* @return a string with the sketch items
|
|
@@ -213,7 +243,20 @@ class var_opt_sketch {
|
|
|
213
243
|
string<A> items_to_string() const;
|
|
214
244
|
|
|
215
245
|
class const_iterator;
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Iterator pointing to the first item in the sketch.
|
|
249
|
+
* If the sketch is empty, the returned iterator must not be dereferenced or incremented.
|
|
250
|
+
* @return iterator pointing to the first item in the sketch
|
|
251
|
+
*/
|
|
216
252
|
const_iterator begin() const;
|
|
253
|
+
|
|
254
|
+
/**
|
|
255
|
+
* Iterator pointing to the past-the-end item in the sketch.
|
|
256
|
+
* The past-the-end item is the hypothetical item that would follow the last item.
|
|
257
|
+
* It does not point to any item, and must not be dereferenced or incremented.
|
|
258
|
+
* @return iterator pointing to the past-the-end item in the sketch
|
|
259
|
+
*/
|
|
217
260
|
const_iterator end() const;
|
|
218
261
|
|
|
219
262
|
private:
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
|
|
37
37
|
namespace datasketches {
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
/*
|
|
40
40
|
* Implementation code for the VarOpt sketch.
|
|
41
41
|
*
|
|
42
42
|
* author Kevin Lang
|
|
@@ -895,7 +895,7 @@ void var_opt_sketch<T, A>::update_heavy_r_eq1(O&& item, double weight, bool mark
|
|
|
895
895
|
grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
|
|
896
896
|
}
|
|
897
897
|
|
|
898
|
-
|
|
898
|
+
/*
|
|
899
899
|
* Decreases sketch's value of k by 1, updating stored values as needed.
|
|
900
900
|
*
|
|
901
901
|
* <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
|
|
@@ -1685,7 +1685,7 @@ bool var_opt_sketch<T, A>::iterator::get_mark() const {
|
|
|
1685
1685
|
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
|
|
1686
1686
|
}
|
|
1687
1687
|
|
|
1688
|
-
|
|
1688
|
+
/*
|
|
1689
1689
|
* Checks if target sampling allocation is more than 50% of max sampling size.
|
|
1690
1690
|
* If so, returns max sampling size, otherwise passes through target size.
|
|
1691
1691
|
*/
|
|
@@ -52,7 +52,6 @@ template<
|
|
|
52
52
|
class var_opt_union {
|
|
53
53
|
|
|
54
54
|
public:
|
|
55
|
-
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
|
56
55
|
|
|
57
56
|
explicit var_opt_union(uint32_t max_k, const A& allocator = A());
|
|
58
57
|
var_opt_union(const var_opt_union& other);
|
|
@@ -91,7 +90,7 @@ public:
|
|
|
91
90
|
/**
|
|
92
91
|
* Computes size needed to serialize the current state of the union.
|
|
93
92
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
|
94
|
-
* @param instance of a SerDe
|
|
93
|
+
* @param sd instance of a SerDe
|
|
95
94
|
* @return size in bytes needed to serialize this sketch
|
|
96
95
|
*/
|
|
97
96
|
template<typename SerDe = serde<T>>
|
|
@@ -108,7 +107,7 @@ public:
|
|
|
108
107
|
* It is a blank space of a given size.
|
|
109
108
|
* This header is used in Datasketches PostgreSQL extension.
|
|
110
109
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
111
|
-
* @param instance of a SerDe
|
|
110
|
+
* @param sd instance of a SerDe
|
|
112
111
|
*/
|
|
113
112
|
template<typename SerDe = serde<T>>
|
|
114
113
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
|
@@ -117,7 +116,7 @@ public:
|
|
|
117
116
|
* NOTE: This method may be deprecated in a future version.
|
|
118
117
|
* This method serializes the sketch into a given stream in a binary form
|
|
119
118
|
* @param os output stream
|
|
120
|
-
* @param instance of a SerDe
|
|
119
|
+
* @param sd instance of a SerDe
|
|
121
120
|
*/
|
|
122
121
|
template<typename SerDe = serde<T>>
|
|
123
122
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
|
@@ -126,8 +125,8 @@ public:
|
|
|
126
125
|
* NOTE: This method may be deprecated in a future version.
|
|
127
126
|
* This method deserializes a union from a given stream.
|
|
128
127
|
* @param is input stream
|
|
129
|
-
* @param instance of a SerDe
|
|
130
|
-
* @param instance of an Allocator
|
|
128
|
+
* @param sd instance of a SerDe
|
|
129
|
+
* @param allocator instance of an Allocator
|
|
131
130
|
* @return an instance of a union
|
|
132
131
|
*/
|
|
133
132
|
template<typename SerDe = serde<T>>
|
|
@@ -138,8 +137,8 @@ public:
|
|
|
138
137
|
* This method deserializes a union from a given array of bytes.
|
|
139
138
|
* @param bytes pointer to the array of bytes
|
|
140
139
|
* @param size the size of the array
|
|
141
|
-
* @param instance of a SerDe
|
|
142
|
-
* @param instance of an Allocator
|
|
140
|
+
* @param sd instance of a SerDe
|
|
141
|
+
* @param allocator instance of an Allocator
|
|
143
142
|
* @return an instance of a union
|
|
144
143
|
*/
|
|
145
144
|
template<typename SerDe = serde<T>>
|
|
@@ -152,9 +151,9 @@ public:
|
|
|
152
151
|
string<A> to_string() const;
|
|
153
152
|
|
|
154
153
|
private:
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
154
|
+
using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
|
|
155
|
+
using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
|
|
156
|
+
using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
|
|
158
157
|
|
|
159
158
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
|
160
159
|
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
|
@@ -146,7 +146,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
|
|
|
146
146
|
check_preamble_longs(preamble_longs, flags);
|
|
147
147
|
check_family_and_serialization_version(family_id, serial_version);
|
|
148
148
|
|
|
149
|
-
if (max_k == 0 || max_k > MAX_K) {
|
|
149
|
+
if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
|
|
150
150
|
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
|
151
151
|
}
|
|
152
152
|
|
|
@@ -190,7 +190,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
|
|
|
190
190
|
check_preamble_longs(preamble_longs, flags);
|
|
191
191
|
check_family_and_serialization_version(family_id, serial_version);
|
|
192
192
|
|
|
193
|
-
if (max_k == 0 || max_k > MAX_K) {
|
|
193
|
+
if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
|
|
194
194
|
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
|
195
195
|
}
|
|
196
196
|
|
|
@@ -448,7 +448,7 @@ var_opt_sketch<T, A> var_opt_union<T, A>::get_result() const {
|
|
|
448
448
|
|
|
449
449
|
/**
|
|
450
450
|
* When there are no marked items in H, the gadget is mathematically equivalent to a valid
|
|
451
|
-
* varopt sketch. This method simply returns a copy (without
|
|
451
|
+
* varopt sketch. This method simply returns a copy (without preserving marks).
|
|
452
452
|
*
|
|
453
453
|
* @return A shallow copy of the gadget as a valid varopt sketch
|
|
454
454
|
*/
|
|
@@ -549,7 +549,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
|
549
549
|
|
|
550
550
|
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
|
551
551
|
if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
|
552
|
-
throw std::logic_error("
|
|
552
|
+
throw std::logic_error("unexpected mismatch in transferred weight");
|
|
553
553
|
}
|
|
554
554
|
|
|
555
555
|
const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
|
|
@@ -15,30 +15,77 @@
|
|
|
15
15
|
# specific language governing permissions and limitations
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
# separate executables for var_opt and ebpps sampling
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
# VAR OPT SAMPLING
|
|
21
|
+
add_executable(var_opt_sampling_test)
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
target_link_libraries(var_opt_sampling_test sampling common_test_lib)
|
|
24
|
+
|
|
25
|
+
set_target_properties(var_opt_sampling_test PROPERTIES
|
|
24
26
|
CXX_STANDARD_REQUIRED YES
|
|
25
27
|
)
|
|
26
28
|
|
|
27
29
|
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" SAMPLING_TEST_BINARY_PATH)
|
|
28
30
|
string(APPEND SAMPLING_TEST_BINARY_PATH "/")
|
|
29
|
-
target_compile_definitions(
|
|
31
|
+
target_compile_definitions(var_opt_sampling_test
|
|
30
32
|
PRIVATE
|
|
31
33
|
TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
|
|
32
34
|
)
|
|
33
35
|
|
|
34
36
|
add_test(
|
|
35
|
-
NAME
|
|
36
|
-
COMMAND
|
|
37
|
+
NAME var_opt_sampling_test
|
|
38
|
+
COMMAND var_opt_sampling_test
|
|
37
39
|
)
|
|
38
40
|
|
|
39
|
-
target_sources(
|
|
41
|
+
target_sources(var_opt_sampling_test
|
|
40
42
|
PRIVATE
|
|
41
43
|
var_opt_sketch_test.cpp
|
|
42
44
|
var_opt_union_test.cpp
|
|
43
45
|
var_opt_allocation_test.cpp
|
|
44
46
|
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# EBPPS SAMPLING
|
|
50
|
+
add_executable(ebpps_sampling_test)
|
|
51
|
+
|
|
52
|
+
target_link_libraries(ebpps_sampling_test sampling common_test_lib)
|
|
53
|
+
|
|
54
|
+
set_target_properties(ebpps_sampling_test PROPERTIES
|
|
55
|
+
CXX_STANDARD_REQUIRED YES
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
target_compile_definitions(ebpps_sampling_test
|
|
59
|
+
PRIVATE
|
|
60
|
+
TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
add_test(
|
|
64
|
+
NAME ebpps_sampling_test
|
|
65
|
+
COMMAND ebpps_sampling_test
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
target_sources(ebpps_sampling_test
|
|
69
|
+
PRIVATE
|
|
70
|
+
ebpps_sample_test.cpp
|
|
71
|
+
ebpps_sketch_test.cpp
|
|
72
|
+
ebpps_allocation_test.cpp
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Compatibility
|
|
77
|
+
if (SERDE_COMPAT)
|
|
78
|
+
target_sources(var_opt_sampling_test
|
|
79
|
+
PRIVATE
|
|
80
|
+
var_opt_sketch_deserialize_from_java_test.cpp
|
|
81
|
+
var_opt_union_deserialize_from_java_test.cpp
|
|
82
|
+
)
|
|
83
|
+
endif()
|
|
84
|
+
|
|
85
|
+
if (GENERATE)
|
|
86
|
+
target_sources(var_opt_sampling_test
|
|
87
|
+
PRIVATE
|
|
88
|
+
var_opt_sketch_serialize_for_java.cpp
|
|
89
|
+
var_opt_union_serialize_for_java.cpp
|
|
90
|
+
)
|
|
91
|
+
endif()
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <ebpps_sketch.hpp>
|
|
21
|
+
#include <test_type.hpp>
|
|
22
|
+
#include <test_allocator.hpp>
|
|
23
|
+
|
|
24
|
+
#include <catch2/catch.hpp>
|
|
25
|
+
|
|
26
|
+
#include <sstream>
|
|
27
|
+
|
|
28
|
+
namespace datasketches {
|
|
29
|
+
|
|
30
|
+
using ebpps_test_sketch = ebpps_sketch<test_type, test_allocator<test_type>>;
|
|
31
|
+
using alloc = test_allocator<test_type>;
|
|
32
|
+
|
|
33
|
+
TEST_CASE("ebpps allocation test", "[ebpps_sketch][test_type]") {
|
|
34
|
+
test_allocator_total_bytes = 0;
|
|
35
|
+
test_allocator_net_allocations = 0;
|
|
36
|
+
{
|
|
37
|
+
ebpps_test_sketch sk1(10, 0);
|
|
38
|
+
for (int i = 0; i < 100; ++i)
|
|
39
|
+
sk1.update(i);
|
|
40
|
+
auto bytes1 = sk1.serialize(0, test_type_serde());
|
|
41
|
+
auto sk2 = ebpps_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
|
|
42
|
+
|
|
43
|
+
std::stringstream ss;
|
|
44
|
+
sk1.serialize(ss, test_type_serde());
|
|
45
|
+
auto sk3 = ebpps_test_sketch::deserialize(ss, test_type_serde(), alloc(0));
|
|
46
|
+
|
|
47
|
+
sk1.merge(sk2); // same size into sk1
|
|
48
|
+
sk3.merge(sk1); // larger into sk3
|
|
49
|
+
|
|
50
|
+
auto bytes2 = sk1.serialize(0, test_type_serde());
|
|
51
|
+
auto sk4 = ebpps_test_sketch::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
|
|
52
|
+
}
|
|
53
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
54
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
TEST_CASE( "ebpps merge", "[ebpps_sketch][test_type]") {
|
|
58
|
+
test_allocator_total_bytes = 0;
|
|
59
|
+
test_allocator_net_allocations = 0;
|
|
60
|
+
{
|
|
61
|
+
uint32_t n = 20;
|
|
62
|
+
uint32_t k = 5;
|
|
63
|
+
ebpps_test_sketch sk1(k, 0);
|
|
64
|
+
ebpps_test_sketch sk2(k, 0);
|
|
65
|
+
|
|
66
|
+
// move updates
|
|
67
|
+
for (int i = 0; i < (int) n; ++i) {
|
|
68
|
+
sk1.update(i);
|
|
69
|
+
sk2.update(-i);
|
|
70
|
+
sk1.update(n + i); // sk1 heavier than sk2
|
|
71
|
+
}
|
|
72
|
+
REQUIRE(sk1.get_n() == 2 * n);
|
|
73
|
+
REQUIRE(sk2.get_n() == n);
|
|
74
|
+
|
|
75
|
+
// move merge -- lighter into heavier
|
|
76
|
+
sk1.merge(std::move(sk2));
|
|
77
|
+
REQUIRE(sk1.get_n() == 3 * n);
|
|
78
|
+
|
|
79
|
+
// move constructor
|
|
80
|
+
ebpps_test_sketch sk3(std::move(sk1));
|
|
81
|
+
REQUIRE(sk3.get_n() == 3 * n);
|
|
82
|
+
|
|
83
|
+
// move assignment
|
|
84
|
+
ebpps_test_sketch sk4(k, 0);
|
|
85
|
+
sk4 = std::move(sk2);
|
|
86
|
+
REQUIRE(sk4.get_n() == n);
|
|
87
|
+
|
|
88
|
+
// copy merge (lvalue argument) -- heavier into lighter
|
|
89
|
+
sk4.merge(sk3);
|
|
90
|
+
REQUIRE(sk4.get_n() == 4 * n);
|
|
91
|
+
}
|
|
92
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
93
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <ebpps_sample.hpp>
|
|
21
|
+
|
|
22
|
+
#include <catch2/catch.hpp>
|
|
23
|
+
|
|
24
|
+
#include <vector>
|
|
25
|
+
#include <string>
|
|
26
|
+
#include <sstream>
|
|
27
|
+
#include <fstream>
|
|
28
|
+
#include <cmath>
|
|
29
|
+
#include <random>
|
|
30
|
+
#include <stdexcept>
|
|
31
|
+
|
|
32
|
+
namespace datasketches {
|
|
33
|
+
|
|
34
|
+
static constexpr double EPS = 1e-15;
|
|
35
|
+
|
|
36
|
+
TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
|
|
37
|
+
ebpps_sample<int> sample = ebpps_sample<int>(0);
|
|
38
|
+
REQUIRE(sample.get_c() == 0.0);
|
|
39
|
+
REQUIRE(sample.get_num_retained_items() == 0);
|
|
40
|
+
REQUIRE(sample.get_sample().size() == 0);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
|
|
44
|
+
double theta = 1.0;
|
|
45
|
+
ebpps_sample<int> sample = ebpps_sample<int>(-1, theta);
|
|
46
|
+
REQUIRE(sample.get_c() == theta);
|
|
47
|
+
REQUIRE(sample.get_num_retained_items() == 1);
|
|
48
|
+
REQUIRE(sample.get_sample().size() == 1);
|
|
49
|
+
REQUIRE(sample.has_partial_item() == false);
|
|
50
|
+
|
|
51
|
+
theta = 1e-300;
|
|
52
|
+
sample = ebpps_sample<int>(-1, theta);
|
|
53
|
+
REQUIRE(sample.get_c() == theta);
|
|
54
|
+
REQUIRE(sample.get_num_retained_items() == 1);
|
|
55
|
+
REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
|
|
56
|
+
REQUIRE(sample.has_partial_item());
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
|
|
60
|
+
ebpps_sample<char> sample = ebpps_sample<char>('a', 1.0);
|
|
61
|
+
|
|
62
|
+
sample.downsample(2.0); // no-op
|
|
63
|
+
REQUIRE(sample.get_c() == 1.0);
|
|
64
|
+
REQUIRE(sample.get_num_retained_items() == 1);
|
|
65
|
+
REQUIRE(sample.has_partial_item() == false);
|
|
66
|
+
|
|
67
|
+
// downsample and result in an empty sample
|
|
68
|
+
random_utils::override_seed(12);
|
|
69
|
+
std::vector<char> items = {'a', 'b'};
|
|
70
|
+
optional<char> opt; // empty
|
|
71
|
+
sample = ebpps_sample<char>(std::move(items), std::move(opt), 1.8);
|
|
72
|
+
sample.downsample(0.5);
|
|
73
|
+
REQUIRE(sample.get_c() == 0.9);
|
|
74
|
+
REQUIRE(sample.get_num_retained_items() == 0);
|
|
75
|
+
REQUIRE(sample.has_partial_item() == false);
|
|
76
|
+
|
|
77
|
+
// downsample and result in a sample with a partial item
|
|
78
|
+
items = {'a', 'b'};
|
|
79
|
+
opt.reset();
|
|
80
|
+
sample = ebpps_sample<char>(std::move(items), std::move(opt), 1.5);
|
|
81
|
+
sample.downsample(0.5);
|
|
82
|
+
REQUIRE(sample.get_c() == 0.75);
|
|
83
|
+
REQUIRE(sample.get_num_retained_items() == 1);
|
|
84
|
+
REQUIRE(sample.has_partial_item() == true);
|
|
85
|
+
for (char c : sample) {
|
|
86
|
+
REQUIRE((c == 'a' || c == 'b'));
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// downsample to an exact integer c (7.5 * 0.8 = 6.0)
|
|
90
|
+
items = {'a', 'b', 'c', 'd', 'e', 'f', 'g'};
|
|
91
|
+
opt.emplace('h');
|
|
92
|
+
auto ref_items = items; // copy to check contents
|
|
93
|
+
ref_items.emplace_back('h'); // include partial item
|
|
94
|
+
sample = ebpps_sample<char>(std::move(items), std::move(opt), 7.5);
|
|
95
|
+
sample.downsample(0.8);
|
|
96
|
+
REQUIRE(sample.get_c() == 6.0);
|
|
97
|
+
REQUIRE(sample.get_num_retained_items() == 6);
|
|
98
|
+
REQUIRE(sample.has_partial_item() == false);
|
|
99
|
+
for (char c : sample) {
|
|
100
|
+
REQUIRE(std::find(ref_items.begin(), ref_items.end(), c) != ref_items.end());
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// downsample to c > 1 with partial item
|
|
104
|
+
items = ref_items; // includes previous optional item
|
|
105
|
+
opt.emplace('i');
|
|
106
|
+
sample = ebpps_sample<char>(std::move(items), std::move(opt), 8.5);
|
|
107
|
+
REQUIRE(sample.get_partial_item() == 'i');
|
|
108
|
+
sample.downsample(0.8);
|
|
109
|
+
REQUIRE(sample.get_c() == Approx(6.8).margin(EPS));
|
|
110
|
+
REQUIRE(sample.get_num_retained_items() == 7);
|
|
111
|
+
REQUIRE(sample.has_partial_item() == true);
|
|
112
|
+
ref_items.emplace_back('i');
|
|
113
|
+
for (char c : sample) {
|
|
114
|
+
REQUIRE(std::find(ref_items.begin(), ref_items.end(), c) != ref_items.end());
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
random_utils::override_seed(random_utils::rd());
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
|
|
121
|
+
uint32_t k = 8;
|
|
122
|
+
ebpps_sample<int> sample = ebpps_sample<int>(k);
|
|
123
|
+
|
|
124
|
+
for (uint32_t i = 1; i <= k; ++i) {
|
|
125
|
+
ebpps_sample<int> s = ebpps_sample<int>(i, 1.0);
|
|
126
|
+
sample.merge(s);
|
|
127
|
+
REQUIRE(sample.get_c() == static_cast<double>(i));
|
|
128
|
+
REQUIRE(sample.get_num_retained_items() == i);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
sample.reset();
|
|
132
|
+
REQUIRE(sample.get_c() == 0);
|
|
133
|
+
REQUIRE(sample.get_num_retained_items() == 0);
|
|
134
|
+
REQUIRE(sample.has_partial_item() == false);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
} // namespace datasketches
|