datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -37,7 +37,7 @@ kll_sketch<T, C, A>::kll_sketch(uint16_t k, const C& comparator, const A& alloca
|
|
|
37
37
|
comparator_(comparator),
|
|
38
38
|
allocator_(allocator),
|
|
39
39
|
k_(k),
|
|
40
|
-
m_(DEFAULT_M),
|
|
40
|
+
m_(kll_constants::DEFAULT_M),
|
|
41
41
|
min_k_(k),
|
|
42
42
|
num_levels_(1),
|
|
43
43
|
is_level_zero_sorted_(false),
|
|
@@ -45,12 +45,13 @@ n_(0),
|
|
|
45
45
|
levels_(2, 0, allocator),
|
|
46
46
|
items_(nullptr),
|
|
47
47
|
items_size_(k_),
|
|
48
|
-
min_item_(
|
|
49
|
-
max_item_(
|
|
48
|
+
min_item_(),
|
|
49
|
+
max_item_(),
|
|
50
50
|
sorted_view_(nullptr)
|
|
51
51
|
{
|
|
52
|
-
if (k < MIN_K || k > MAX_K) {
|
|
53
|
-
throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= "
|
|
52
|
+
if (k < kll_constants::MIN_K || k > kll_constants::MAX_K) {
|
|
53
|
+
throw std::invalid_argument("K must be >= " + std::to_string(kll_constants::MIN_K) + " and <= "
|
|
54
|
+
+ std::to_string(kll_constants::MAX_K) + ": " + std::to_string(k));
|
|
54
55
|
}
|
|
55
56
|
levels_[0] = levels_[1] = k;
|
|
56
57
|
items_ = allocator_.allocate(items_size_);
|
|
@@ -69,14 +70,12 @@ n_(other.n_),
|
|
|
69
70
|
levels_(other.levels_),
|
|
70
71
|
items_(nullptr),
|
|
71
72
|
items_size_(other.items_size_),
|
|
72
|
-
min_item_(
|
|
73
|
-
max_item_(
|
|
73
|
+
min_item_(other.min_item_),
|
|
74
|
+
max_item_(other.max_item_),
|
|
74
75
|
sorted_view_(nullptr)
|
|
75
76
|
{
|
|
76
77
|
items_ = allocator_.allocate(items_size_);
|
|
77
78
|
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
|
78
|
-
if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
|
|
79
|
-
if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
|
|
80
79
|
}
|
|
81
80
|
|
|
82
81
|
template<typename T, typename C, typename A>
|
|
@@ -92,13 +91,11 @@ n_(other.n_),
|
|
|
92
91
|
levels_(std::move(other.levels_)),
|
|
93
92
|
items_(other.items_),
|
|
94
93
|
items_size_(other.items_size_),
|
|
95
|
-
min_item_(other.min_item_),
|
|
96
|
-
max_item_(other.max_item_),
|
|
94
|
+
min_item_(std::move(other.min_item_)),
|
|
95
|
+
max_item_(std::move(other.max_item_)),
|
|
97
96
|
sorted_view_(nullptr)
|
|
98
97
|
{
|
|
99
98
|
other.items_ = nullptr;
|
|
100
|
-
other.min_item_ = nullptr;
|
|
101
|
-
other.max_item_ = nullptr;
|
|
102
99
|
}
|
|
103
100
|
|
|
104
101
|
template<typename T, typename C, typename A>
|
|
@@ -148,14 +145,6 @@ kll_sketch<T, C, A>::~kll_sketch() {
|
|
|
148
145
|
for (uint32_t i = begin; i < end; i++) items_[i].~T();
|
|
149
146
|
allocator_.deallocate(items_, items_size_);
|
|
150
147
|
}
|
|
151
|
-
if (min_item_ != nullptr) {
|
|
152
|
-
min_item_->~T();
|
|
153
|
-
allocator_.deallocate(min_item_, 1);
|
|
154
|
-
}
|
|
155
|
-
if (max_item_ != nullptr) {
|
|
156
|
-
max_item_->~T();
|
|
157
|
-
allocator_.deallocate(max_item_, 1);
|
|
158
|
-
}
|
|
159
148
|
reset_sorted_view();
|
|
160
149
|
}
|
|
161
150
|
|
|
@@ -173,8 +162,8 @@ n_(other.n_),
|
|
|
173
162
|
levels_(other.levels_, allocator_),
|
|
174
163
|
items_(nullptr),
|
|
175
164
|
items_size_(other.items_size_),
|
|
176
|
-
min_item_(
|
|
177
|
-
max_item_(
|
|
165
|
+
min_item_(other.min_item_),
|
|
166
|
+
max_item_(other.max_item_),
|
|
178
167
|
sorted_view_(nullptr)
|
|
179
168
|
{
|
|
180
169
|
static_assert(
|
|
@@ -183,8 +172,6 @@ sorted_view_(nullptr)
|
|
|
183
172
|
);
|
|
184
173
|
items_ = allocator_.allocate(items_size_);
|
|
185
174
|
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
|
186
|
-
if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
|
|
187
|
-
if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
|
|
188
175
|
check_sorting();
|
|
189
176
|
}
|
|
190
177
|
|
|
@@ -192,7 +179,7 @@ template<typename T, typename C, typename A>
|
|
|
192
179
|
template<typename FwdT>
|
|
193
180
|
void kll_sketch<T, C, A>::update(FwdT&& item) {
|
|
194
181
|
if (!check_update_item(item)) { return; }
|
|
195
|
-
update_min_max(item);
|
|
182
|
+
update_min_max(static_cast<const T&>(item)); // min and max are always copies
|
|
196
183
|
const uint32_t index = internal_update();
|
|
197
184
|
new (&items_[index]) T(std::forward<FwdT>(item));
|
|
198
185
|
reset_sorted_view();
|
|
@@ -201,8 +188,8 @@ void kll_sketch<T, C, A>::update(FwdT&& item) {
|
|
|
201
188
|
template<typename T, typename C, typename A>
|
|
202
189
|
void kll_sketch<T, C, A>::update_min_max(const T& item) {
|
|
203
190
|
if (is_empty()) {
|
|
204
|
-
min_item_
|
|
205
|
-
max_item_
|
|
191
|
+
min_item_.emplace(item);
|
|
192
|
+
max_item_.emplace(item);
|
|
206
193
|
} else {
|
|
207
194
|
if (comparator_(item, *min_item_)) *min_item_ = item;
|
|
208
195
|
if (comparator_(*max_item_, item)) *max_item_ = item;
|
|
@@ -225,8 +212,8 @@ void kll_sketch<T, C, A>::merge(FwdSk&& other) {
|
|
|
225
212
|
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
|
226
213
|
}
|
|
227
214
|
if (is_empty()) {
|
|
228
|
-
min_item_
|
|
229
|
-
max_item_
|
|
215
|
+
min_item_.emplace(conditional_forward<FwdSk>(*other.min_item_));
|
|
216
|
+
max_item_.emplace(conditional_forward<FwdSk>(*other.max_item_));
|
|
230
217
|
} else {
|
|
231
218
|
if (comparator_(*other.min_item_, *min_item_)) *min_item_ = conditional_forward<FwdSk>(*other.min_item_);
|
|
232
219
|
if (comparator_(*max_item_, *other.max_item_)) *max_item_ = conditional_forward<FwdSk>(*other.max_item_);
|
|
@@ -322,42 +309,6 @@ auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> qua
|
|
|
322
309
|
return sorted_view_->get_quantile(rank, inclusive);
|
|
323
310
|
}
|
|
324
311
|
|
|
325
|
-
template<typename T, typename C, typename A>
|
|
326
|
-
std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size, bool inclusive) const {
|
|
327
|
-
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
|
328
|
-
std::vector<T, A> quantiles(allocator_);
|
|
329
|
-
quantiles.reserve(size);
|
|
330
|
-
|
|
331
|
-
// may have a side effect of sorting level zero if needed
|
|
332
|
-
setup_sorted_view();
|
|
333
|
-
|
|
334
|
-
for (uint32_t i = 0; i < size; i++) {
|
|
335
|
-
const double rank = ranks[i];
|
|
336
|
-
if ((rank < 0.0) || (rank > 1.0)) {
|
|
337
|
-
throw std::invalid_argument("normalized rank cannot be less than 0 or greater than 1");
|
|
338
|
-
}
|
|
339
|
-
quantiles.push_back(sorted_view_->get_quantile(rank, inclusive));
|
|
340
|
-
}
|
|
341
|
-
return quantiles;
|
|
342
|
-
}
|
|
343
|
-
|
|
344
|
-
template<typename T, typename C, typename A>
|
|
345
|
-
std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(uint32_t num, bool inclusive) const {
|
|
346
|
-
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
|
347
|
-
if (num == 0) {
|
|
348
|
-
throw std::invalid_argument("num must be > 0");
|
|
349
|
-
}
|
|
350
|
-
vector_double ranks(num, 0, allocator_);
|
|
351
|
-
ranks[0] = 0.0;
|
|
352
|
-
for (size_t i = 1; i < num; i++) {
|
|
353
|
-
ranks[i] = static_cast<double>(i) / (num - 1);
|
|
354
|
-
}
|
|
355
|
-
if (num > 1) {
|
|
356
|
-
ranks[num - 1] = 1.0;
|
|
357
|
-
}
|
|
358
|
-
return get_quantiles(ranks.data(), num, inclusive);
|
|
359
|
-
}
|
|
360
|
-
|
|
361
312
|
template<typename T, typename C, typename A>
|
|
362
313
|
double kll_sketch<T, C, A>::get_normalized_rank_error(bool pmf) const {
|
|
363
314
|
return get_normalized_rank_error(min_k_, pmf);
|
|
@@ -396,7 +347,7 @@ template<typename T, typename C, typename A>
|
|
|
396
347
|
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
397
348
|
size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
|
|
398
349
|
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
399
|
-
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
350
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
|
|
400
351
|
// the last integer in the levels_ array is not serialized because it can be derived
|
|
401
352
|
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
|
|
402
353
|
}
|
|
@@ -406,7 +357,7 @@ template<typename T, typename C, typename A>
|
|
|
406
357
|
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
407
358
|
size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
|
|
408
359
|
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
409
|
-
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
360
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
|
|
410
361
|
// the last integer in the levels_ array is not serialized because it can be derived
|
|
411
362
|
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
|
|
412
363
|
}
|
|
@@ -438,8 +389,8 @@ void kll_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
|
438
389
|
write(os, num_levels_);
|
|
439
390
|
write(os, unused);
|
|
440
391
|
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
|
441
|
-
sd.serialize(os, min_item_, 1);
|
|
442
|
-
sd.serialize(os, max_item_, 1);
|
|
392
|
+
sd.serialize(os, &*min_item_, 1);
|
|
393
|
+
sd.serialize(os, &*max_item_, 1);
|
|
443
394
|
}
|
|
444
395
|
sd.serialize(os, &items_[levels_[0]], get_num_retained());
|
|
445
396
|
}
|
|
@@ -474,8 +425,8 @@ auto kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& sd)
|
|
|
474
425
|
ptr += copy_to_mem(num_levels_, ptr);
|
|
475
426
|
ptr += sizeof(uint8_t); // unused
|
|
476
427
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
|
477
|
-
ptr += sd.serialize(ptr, end_ptr - ptr, min_item_, 1);
|
|
478
|
-
ptr += sd.serialize(ptr, end_ptr - ptr, max_item_, 1);
|
|
428
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, &*min_item_, 1);
|
|
429
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, &*max_item_, 1);
|
|
479
430
|
}
|
|
480
431
|
const size_t bytes_remaining = end_ptr - ptr;
|
|
481
432
|
ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
|
|
@@ -530,20 +481,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
|
|
|
530
481
|
read(is, levels.data(), sizeof(levels[0]) * num_levels);
|
|
531
482
|
}
|
|
532
483
|
levels[num_levels] = capacity;
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
537
|
-
std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
|
|
538
|
-
std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
|
|
484
|
+
optional<T> tmp; // space to deserialize min and max
|
|
485
|
+
optional<T> min_item;
|
|
486
|
+
optional<T> max_item;
|
|
539
487
|
if (!is_single_item) {
|
|
540
|
-
sd.deserialize(is,
|
|
541
|
-
// serde call did not throw, repackage
|
|
542
|
-
min_item
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
488
|
+
sd.deserialize(is, &*tmp, 1);
|
|
489
|
+
// serde call did not throw, repackage and cleanup
|
|
490
|
+
min_item.emplace(*tmp);
|
|
491
|
+
(*tmp).~T();
|
|
492
|
+
sd.deserialize(is, &*tmp, 1);
|
|
493
|
+
// serde call did not throw, repackage and cleanup
|
|
494
|
+
max_item.emplace(*tmp);
|
|
495
|
+
(*tmp).~T();
|
|
546
496
|
}
|
|
497
|
+
A alloc(allocator);
|
|
547
498
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
|
548
499
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
|
549
500
|
const auto num_items = levels[num_levels] - levels[0];
|
|
@@ -552,12 +503,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
|
|
|
552
503
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
|
553
504
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
|
554
505
|
if (is_single_item) {
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
|
|
558
|
-
new (max_item_buffer.get()) T(items.get()[levels[0]]);
|
|
559
|
-
// copy did not throw, repackage with destrtuctor
|
|
560
|
-
max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
|
|
506
|
+
min_item.emplace(items.get()[levels[0]]);
|
|
507
|
+
max_item.emplace(items.get()[levels[0]]);
|
|
561
508
|
}
|
|
562
509
|
if (!is.good())
|
|
563
510
|
throw std::runtime_error("error reading from std::istream");
|
|
@@ -618,20 +565,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
|
|
|
618
565
|
ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
|
|
619
566
|
}
|
|
620
567
|
levels[num_levels] = capacity;
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
625
|
-
std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
|
|
626
|
-
std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
|
|
568
|
+
optional<T> tmp; // space to deserialize min and max
|
|
569
|
+
optional<T> min_item;
|
|
570
|
+
optional<T> max_item;
|
|
627
571
|
if (!is_single_item) {
|
|
628
|
-
ptr += sd.deserialize(ptr, end_ptr - ptr,
|
|
629
|
-
// serde call did not throw, repackage
|
|
630
|
-
min_item
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
572
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
|
|
573
|
+
// serde call did not throw, repackage and cleanup
|
|
574
|
+
min_item.emplace(*tmp);
|
|
575
|
+
(*tmp).~T();
|
|
576
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
|
|
577
|
+
// serde call did not throw, repackage and cleanup
|
|
578
|
+
max_item.emplace(*tmp);
|
|
579
|
+
(*tmp).~T();
|
|
634
580
|
}
|
|
581
|
+
A alloc(allocator);
|
|
635
582
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
|
636
583
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
|
637
584
|
const auto num_items = levels[num_levels] - levels[0];
|
|
@@ -642,12 +589,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
|
|
|
642
589
|
if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
|
643
590
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
|
644
591
|
if (is_single_item) {
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
|
|
648
|
-
new (max_item_buffer.get()) T(items.get()[levels[0]]);
|
|
649
|
-
// copy did not throw, repackage with destrtuctor
|
|
650
|
-
max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
|
|
592
|
+
min_item.emplace(items.get()[levels[0]]);
|
|
593
|
+
max_item.emplace(items.get()[levels[0]]);
|
|
651
594
|
}
|
|
652
595
|
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
|
|
653
596
|
std::move(min_item), std::move(max_item), is_level_zero_sorted, comparator);
|
|
@@ -670,12 +613,12 @@ double kll_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
670
613
|
// for deserialization
|
|
671
614
|
template<typename T, typename C, typename A>
|
|
672
615
|
kll_sketch<T, C, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
|
|
673
|
-
std::unique_ptr<T, items_deleter> items, uint32_t items_size,
|
|
674
|
-
|
|
616
|
+
std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
|
|
617
|
+
optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator):
|
|
675
618
|
comparator_(comparator),
|
|
676
619
|
allocator_(levels.get_allocator()),
|
|
677
620
|
k_(k),
|
|
678
|
-
m_(DEFAULT_M),
|
|
621
|
+
m_(kll_constants::DEFAULT_M),
|
|
679
622
|
min_k_(min_k),
|
|
680
623
|
num_levels_(num_levels),
|
|
681
624
|
is_level_zero_sorted_(is_level_zero_sorted),
|
|
@@ -683,8 +626,8 @@ n_(n),
|
|
|
683
626
|
levels_(std::move(levels)),
|
|
684
627
|
items_(items.release()),
|
|
685
628
|
items_size_(items_size),
|
|
686
|
-
min_item_(min_item
|
|
687
|
-
max_item_(max_item
|
|
629
|
+
min_item_(std::move(min_item)),
|
|
630
|
+
max_item_(std::move(max_item)),
|
|
688
631
|
sorted_view_(nullptr)
|
|
689
632
|
{}
|
|
690
633
|
|
|
@@ -820,7 +763,7 @@ quantiles_sorted_view<T, C, A> kll_sketch<T, C, A>::get_sorted_view() const {
|
|
|
820
763
|
for (uint8_t level = 0; level < num_levels_; ++level) {
|
|
821
764
|
const auto from = items_ + levels_[level];
|
|
822
765
|
const auto to = items_ + levels_[level + 1]; // exclusive
|
|
823
|
-
view.add(from, to,
|
|
766
|
+
view.add(from, to, 1ULL << level);
|
|
824
767
|
}
|
|
825
768
|
view.convert_to_cummulative();
|
|
826
769
|
return view;
|
|
@@ -917,8 +860,8 @@ uint32_t kll_sketch<T, C, A>::get_num_retained_above_level_zero() const {
|
|
|
917
860
|
|
|
918
861
|
template<typename T, typename C, typename A>
|
|
919
862
|
void kll_sketch<T, C, A>::check_m(uint8_t m) {
|
|
920
|
-
if (m != DEFAULT_M) {
|
|
921
|
-
throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
|
|
863
|
+
if (m != kll_constants::DEFAULT_M) {
|
|
864
|
+
throw std::invalid_argument("Possible corruption: M must be " + std::to_string(kll_constants::DEFAULT_M)
|
|
922
865
|
+ ": " + std::to_string(m));
|
|
923
866
|
}
|
|
924
867
|
}
|
|
@@ -1019,20 +962,6 @@ typename kll_sketch<T, C, A>::const_iterator kll_sketch<T, C, A>::end() const {
|
|
|
1019
962
|
return kll_sketch<T, C, A>::const_iterator(nullptr, levels_.data(), num_levels_);
|
|
1020
963
|
}
|
|
1021
964
|
|
|
1022
|
-
template<typename T, typename C, typename A>
|
|
1023
|
-
class kll_sketch<T, C, A>::item_deleter {
|
|
1024
|
-
public:
|
|
1025
|
-
item_deleter(const A& allocator): allocator_(allocator) {}
|
|
1026
|
-
void operator() (T* ptr) {
|
|
1027
|
-
if (ptr != nullptr) {
|
|
1028
|
-
ptr->~T();
|
|
1029
|
-
allocator_.deallocate(ptr, 1);
|
|
1030
|
-
}
|
|
1031
|
-
}
|
|
1032
|
-
private:
|
|
1033
|
-
A allocator_;
|
|
1034
|
-
};
|
|
1035
|
-
|
|
1036
965
|
template<typename T, typename C, typename A>
|
|
1037
966
|
class kll_sketch<T, C, A>::items_deleter {
|
|
1038
967
|
public:
|
|
@@ -20,7 +20,6 @@ add_executable(kll_test)
|
|
|
20
20
|
target_link_libraries(kll_test kll common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(kll_test PROPERTIES
|
|
23
|
-
CXX_STANDARD 11
|
|
24
23
|
CXX_STANDARD_REQUIRED YES
|
|
25
24
|
)
|
|
26
25
|
|
|
@@ -43,3 +42,17 @@ target_sources(kll_test
|
|
|
43
42
|
kll_sketch_validation.cpp
|
|
44
43
|
kolmogorov_smirnov_test.cpp
|
|
45
44
|
)
|
|
45
|
+
|
|
46
|
+
if (SERDE_COMPAT)
|
|
47
|
+
target_sources(kll_test
|
|
48
|
+
PRIVATE
|
|
49
|
+
kll_sketch_deserialize_from_java_test.cpp
|
|
50
|
+
)
|
|
51
|
+
endif()
|
|
52
|
+
|
|
53
|
+
if (GENERATE)
|
|
54
|
+
target_sources(kll_test
|
|
55
|
+
PRIVATE
|
|
56
|
+
kll_sketch_serialize_for_java.cpp
|
|
57
|
+
)
|
|
58
|
+
endif()
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <kll_sketch.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
|
27
|
+
// in the subdirectory called "java" in the root directory of this project
|
|
28
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
|
29
|
+
|
|
30
|
+
TEST_CASE("kll float", "[serde_compat]") {
|
|
31
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
32
|
+
for (const unsigned n: n_arr) {
|
|
33
|
+
std::ifstream is;
|
|
34
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
35
|
+
is.open(testBinaryInputPath + "kll_float_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
36
|
+
const auto sketch = kll_sketch<float>::deserialize(is);
|
|
37
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
38
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
|
39
|
+
REQUIRE(sketch.get_n() == n);
|
|
40
|
+
if (n > 0) {
|
|
41
|
+
REQUIRE(sketch.get_min_item() == 1.0f);
|
|
42
|
+
REQUIRE(sketch.get_max_item() == static_cast<float>(n));
|
|
43
|
+
uint64_t weight = 0;
|
|
44
|
+
for (const auto pair: sketch) {
|
|
45
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
|
46
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
|
47
|
+
weight += pair.second;
|
|
48
|
+
}
|
|
49
|
+
REQUIRE(weight == sketch.get_n());
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
TEST_CASE("kll double", "[serde_compat]") {
|
|
55
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
56
|
+
for (const unsigned n: n_arr) {
|
|
57
|
+
std::ifstream is;
|
|
58
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
59
|
+
is.open(testBinaryInputPath + "kll_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
60
|
+
const auto sketch = kll_sketch<double>::deserialize(is);
|
|
61
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
62
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
|
63
|
+
REQUIRE(sketch.get_n() == n);
|
|
64
|
+
if (n > 0) {
|
|
65
|
+
REQUIRE(sketch.get_min_item() == 1.0);
|
|
66
|
+
REQUIRE(sketch.get_max_item() == static_cast<double>(n));
|
|
67
|
+
uint64_t weight = 0;
|
|
68
|
+
for (const auto pair: sketch) {
|
|
69
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
|
70
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
|
71
|
+
weight += pair.second;
|
|
72
|
+
}
|
|
73
|
+
REQUIRE(weight == sketch.get_n());
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// numbers are padded with leading spaces so that natural order works
|
|
79
|
+
TEST_CASE("kll string", "[serde_compat]") {
|
|
80
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
81
|
+
for (const unsigned n: n_arr) {
|
|
82
|
+
std::ifstream is;
|
|
83
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
84
|
+
is.open(testBinaryInputPath + "kll_string_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
85
|
+
const auto sketch = kll_sketch<std::string>::deserialize(is);
|
|
86
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
87
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
|
88
|
+
REQUIRE(sketch.get_n() == n);
|
|
89
|
+
if (n > 0) {
|
|
90
|
+
REQUIRE(std::stoul(sketch.get_min_item()) == 1);
|
|
91
|
+
REQUIRE(std::stoul(sketch.get_max_item()) == n);
|
|
92
|
+
uint64_t weight = 0;
|
|
93
|
+
for (const auto pair: sketch) {
|
|
94
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
|
95
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
|
96
|
+
weight += pair.second;
|
|
97
|
+
}
|
|
98
|
+
REQUIRE(weight == sketch.get_n());
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <kll_sketch.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
TEST_CASE("kll sketch float generate", "[serialize_for_java]") {
|
|
27
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
28
|
+
for (const unsigned n: n_arr) {
|
|
29
|
+
kll_sketch<float> sketch;
|
|
30
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
|
31
|
+
std::ofstream os("kll_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
32
|
+
sketch.serialize(os);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
TEST_CASE("kll sketch double generate", "[serialize_for_java]") {
|
|
37
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
38
|
+
for (const unsigned n: n_arr) {
|
|
39
|
+
kll_sketch<double> sketch;
|
|
40
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
|
41
|
+
std::ofstream os("kll_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
42
|
+
sketch.serialize(os);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
struct compare_as_number {
|
|
47
|
+
bool operator()(const std::string& a, const std::string& b) const {
|
|
48
|
+
return std::stoi(a) < std::stoi(b);
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
TEST_CASE("kll sketch string generate", "[serialize_for_java]") {
|
|
53
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
54
|
+
for (const unsigned n: n_arr) {
|
|
55
|
+
kll_sketch<std::string, compare_as_number> sketch;
|
|
56
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
|
|
57
|
+
std::ofstream os("kll_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
58
|
+
sketch.serialize(os);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
} /* namespace datasketches */
|
|
@@ -49,9 +49,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
49
49
|
test_allocator_total_bytes = 0;
|
|
50
50
|
|
|
51
51
|
SECTION("k limits") {
|
|
52
|
-
kll_float_sketch sketch1(
|
|
53
|
-
kll_float_sketch sketch2(
|
|
54
|
-
REQUIRE_THROWS_AS(new kll_float_sketch(
|
|
52
|
+
kll_float_sketch sketch1(kll_constants::MIN_K, std::less<float>(), 0); // this should work
|
|
53
|
+
kll_float_sketch sketch2(kll_constants::MAX_K, std::less<float>(), 0); // this should work
|
|
54
|
+
REQUIRE_THROWS_AS(new kll_float_sketch(kll_constants::MIN_K - 1, std::less<float>(), 0), std::invalid_argument);
|
|
55
55
|
// MAX_K + 1 makes no sense because k is uint16_t
|
|
56
56
|
//std::cout << "sizeof(kll_sketch<float>)=" << sizeof(kll_sketch<float>) << "\n";
|
|
57
57
|
//std::cout << "sizeof(kll_sketch<double>)=" << sizeof(kll_sketch<double>) << "\n";
|
|
@@ -67,8 +67,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
67
67
|
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
|
|
68
68
|
REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
|
|
69
69
|
REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
|
|
70
|
-
const double ranks[3] {0, 0.5, 1};
|
|
71
|
-
REQUIRE_THROWS_AS(sketch.get_quantiles(ranks, 3), std::runtime_error);
|
|
72
70
|
const float split_points[1] {0};
|
|
73
71
|
REQUIRE_THROWS_AS(sketch.get_PMF(split_points, 1), std::runtime_error);
|
|
74
72
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::runtime_error);
|
|
@@ -99,12 +97,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
99
97
|
REQUIRE(sketch.get_min_item() == 1.0);
|
|
100
98
|
REQUIRE(sketch.get_max_item() == 1.0);
|
|
101
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
|
102
|
-
const double ranks[3] {0, 0.5, 1};
|
|
103
|
-
auto quantiles = sketch.get_quantiles(ranks, 3);
|
|
104
|
-
REQUIRE(quantiles.size() == 3);
|
|
105
|
-
REQUIRE(quantiles[0] == 1.0);
|
|
106
|
-
REQUIRE(quantiles[1] == 1.0);
|
|
107
|
-
REQUIRE(quantiles[2] == 1.0);
|
|
108
100
|
|
|
109
101
|
int count = 0;
|
|
110
102
|
for (auto pair: sketch) {
|
|
@@ -144,20 +136,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
144
136
|
REQUIRE(sketch.get_max_item() == n);
|
|
145
137
|
REQUIRE(sketch.get_quantile(1) == n);
|
|
146
138
|
|
|
147
|
-
const double ranks[3] {0, 0.5, 1};
|
|
148
|
-
auto quantiles = sketch.get_quantiles(ranks, 3);
|
|
149
|
-
REQUIRE(quantiles.size() == 3);
|
|
150
|
-
REQUIRE(quantiles[0] == 1);
|
|
151
|
-
REQUIRE(quantiles[1] == n / 2);
|
|
152
|
-
REQUIRE(quantiles[2] == n);
|
|
153
|
-
|
|
154
|
-
// alternative method must produce the same result
|
|
155
|
-
auto quantiles2 = sketch.get_quantiles(3);
|
|
156
|
-
REQUIRE(quantiles2.size() == 3);
|
|
157
|
-
REQUIRE(quantiles[0] == quantiles2[0]);
|
|
158
|
-
REQUIRE(quantiles[1] == quantiles2[1]);
|
|
159
|
-
REQUIRE(quantiles[2] == quantiles2[2]);
|
|
160
|
-
|
|
161
139
|
for (uint32_t i = 1; i <= n; i++) {
|
|
162
140
|
const double true_rank_inclusive = static_cast<double>(i) / n;
|
|
163
141
|
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank_inclusive);
|
|
@@ -264,19 +242,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
264
242
|
}
|
|
265
243
|
}
|
|
266
244
|
|
|
267
|
-
SECTION("deserialize from java") {
|
|
268
|
-
std::ifstream is;
|
|
269
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
270
|
-
is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
|
|
271
|
-
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), std::less<float>(), 0);
|
|
272
|
-
REQUIRE_FALSE(sketch.is_empty());
|
|
273
|
-
REQUIRE(sketch.is_estimation_mode());
|
|
274
|
-
REQUIRE(sketch.get_n() == 1000000);
|
|
275
|
-
REQUIRE(sketch.get_num_retained() == 614);
|
|
276
|
-
REQUIRE(sketch.get_min_item() == 0.0);
|
|
277
|
-
REQUIRE(sketch.get_max_item() == 999999.0);
|
|
278
|
-
}
|
|
279
|
-
|
|
280
245
|
SECTION("stream serialize deserialize empty") {
|
|
281
246
|
kll_float_sketch sketch(200, std::less<float>(), 0);
|
|
282
247
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|