datasketches 0.3.2 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +539 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -37,7 +37,7 @@ kll_sketch<T, C, A>::kll_sketch(uint16_t k, const C& comparator, const A& alloca
|
|
37
37
|
comparator_(comparator),
|
38
38
|
allocator_(allocator),
|
39
39
|
k_(k),
|
40
|
-
m_(DEFAULT_M),
|
40
|
+
m_(kll_constants::DEFAULT_M),
|
41
41
|
min_k_(k),
|
42
42
|
num_levels_(1),
|
43
43
|
is_level_zero_sorted_(false),
|
@@ -45,12 +45,13 @@ n_(0),
|
|
45
45
|
levels_(2, 0, allocator),
|
46
46
|
items_(nullptr),
|
47
47
|
items_size_(k_),
|
48
|
-
min_item_(
|
49
|
-
max_item_(
|
48
|
+
min_item_(),
|
49
|
+
max_item_(),
|
50
50
|
sorted_view_(nullptr)
|
51
51
|
{
|
52
|
-
if (k < MIN_K || k > MAX_K) {
|
53
|
-
throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= "
|
52
|
+
if (k < kll_constants::MIN_K || k > kll_constants::MAX_K) {
|
53
|
+
throw std::invalid_argument("K must be >= " + std::to_string(kll_constants::MIN_K) + " and <= "
|
54
|
+
+ std::to_string(kll_constants::MAX_K) + ": " + std::to_string(k));
|
54
55
|
}
|
55
56
|
levels_[0] = levels_[1] = k;
|
56
57
|
items_ = allocator_.allocate(items_size_);
|
@@ -69,14 +70,12 @@ n_(other.n_),
|
|
69
70
|
levels_(other.levels_),
|
70
71
|
items_(nullptr),
|
71
72
|
items_size_(other.items_size_),
|
72
|
-
min_item_(
|
73
|
-
max_item_(
|
73
|
+
min_item_(other.min_item_),
|
74
|
+
max_item_(other.max_item_),
|
74
75
|
sorted_view_(nullptr)
|
75
76
|
{
|
76
77
|
items_ = allocator_.allocate(items_size_);
|
77
78
|
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
78
|
-
if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
|
79
|
-
if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
|
80
79
|
}
|
81
80
|
|
82
81
|
template<typename T, typename C, typename A>
|
@@ -92,13 +91,11 @@ n_(other.n_),
|
|
92
91
|
levels_(std::move(other.levels_)),
|
93
92
|
items_(other.items_),
|
94
93
|
items_size_(other.items_size_),
|
95
|
-
min_item_(other.min_item_),
|
96
|
-
max_item_(other.max_item_),
|
94
|
+
min_item_(std::move(other.min_item_)),
|
95
|
+
max_item_(std::move(other.max_item_)),
|
97
96
|
sorted_view_(nullptr)
|
98
97
|
{
|
99
98
|
other.items_ = nullptr;
|
100
|
-
other.min_item_ = nullptr;
|
101
|
-
other.max_item_ = nullptr;
|
102
99
|
}
|
103
100
|
|
104
101
|
template<typename T, typename C, typename A>
|
@@ -148,14 +145,6 @@ kll_sketch<T, C, A>::~kll_sketch() {
|
|
148
145
|
for (uint32_t i = begin; i < end; i++) items_[i].~T();
|
149
146
|
allocator_.deallocate(items_, items_size_);
|
150
147
|
}
|
151
|
-
if (min_item_ != nullptr) {
|
152
|
-
min_item_->~T();
|
153
|
-
allocator_.deallocate(min_item_, 1);
|
154
|
-
}
|
155
|
-
if (max_item_ != nullptr) {
|
156
|
-
max_item_->~T();
|
157
|
-
allocator_.deallocate(max_item_, 1);
|
158
|
-
}
|
159
148
|
reset_sorted_view();
|
160
149
|
}
|
161
150
|
|
@@ -173,8 +162,8 @@ n_(other.n_),
|
|
173
162
|
levels_(other.levels_, allocator_),
|
174
163
|
items_(nullptr),
|
175
164
|
items_size_(other.items_size_),
|
176
|
-
min_item_(
|
177
|
-
max_item_(
|
165
|
+
min_item_(other.min_item_),
|
166
|
+
max_item_(other.max_item_),
|
178
167
|
sorted_view_(nullptr)
|
179
168
|
{
|
180
169
|
static_assert(
|
@@ -183,8 +172,6 @@ sorted_view_(nullptr)
|
|
183
172
|
);
|
184
173
|
items_ = allocator_.allocate(items_size_);
|
185
174
|
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
186
|
-
if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
|
187
|
-
if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
|
188
175
|
check_sorting();
|
189
176
|
}
|
190
177
|
|
@@ -192,7 +179,7 @@ template<typename T, typename C, typename A>
|
|
192
179
|
template<typename FwdT>
|
193
180
|
void kll_sketch<T, C, A>::update(FwdT&& item) {
|
194
181
|
if (!check_update_item(item)) { return; }
|
195
|
-
update_min_max(item);
|
182
|
+
update_min_max(static_cast<const T&>(item)); // min and max are always copies
|
196
183
|
const uint32_t index = internal_update();
|
197
184
|
new (&items_[index]) T(std::forward<FwdT>(item));
|
198
185
|
reset_sorted_view();
|
@@ -201,8 +188,8 @@ void kll_sketch<T, C, A>::update(FwdT&& item) {
|
|
201
188
|
template<typename T, typename C, typename A>
|
202
189
|
void kll_sketch<T, C, A>::update_min_max(const T& item) {
|
203
190
|
if (is_empty()) {
|
204
|
-
min_item_
|
205
|
-
max_item_
|
191
|
+
min_item_.emplace(item);
|
192
|
+
max_item_.emplace(item);
|
206
193
|
} else {
|
207
194
|
if (comparator_(item, *min_item_)) *min_item_ = item;
|
208
195
|
if (comparator_(*max_item_, item)) *max_item_ = item;
|
@@ -225,8 +212,8 @@ void kll_sketch<T, C, A>::merge(FwdSk&& other) {
|
|
225
212
|
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
226
213
|
}
|
227
214
|
if (is_empty()) {
|
228
|
-
min_item_
|
229
|
-
max_item_
|
215
|
+
min_item_.emplace(conditional_forward<FwdSk>(*other.min_item_));
|
216
|
+
max_item_.emplace(conditional_forward<FwdSk>(*other.max_item_));
|
230
217
|
} else {
|
231
218
|
if (comparator_(*other.min_item_, *min_item_)) *min_item_ = conditional_forward<FwdSk>(*other.min_item_);
|
232
219
|
if (comparator_(*max_item_, *other.max_item_)) *max_item_ = conditional_forward<FwdSk>(*other.max_item_);
|
@@ -322,42 +309,6 @@ auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> qua
|
|
322
309
|
return sorted_view_->get_quantile(rank, inclusive);
|
323
310
|
}
|
324
311
|
|
325
|
-
template<typename T, typename C, typename A>
|
326
|
-
std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size, bool inclusive) const {
|
327
|
-
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
328
|
-
std::vector<T, A> quantiles(allocator_);
|
329
|
-
quantiles.reserve(size);
|
330
|
-
|
331
|
-
// may have a side effect of sorting level zero if needed
|
332
|
-
setup_sorted_view();
|
333
|
-
|
334
|
-
for (uint32_t i = 0; i < size; i++) {
|
335
|
-
const double rank = ranks[i];
|
336
|
-
if ((rank < 0.0) || (rank > 1.0)) {
|
337
|
-
throw std::invalid_argument("normalized rank cannot be less than 0 or greater than 1");
|
338
|
-
}
|
339
|
-
quantiles.push_back(sorted_view_->get_quantile(rank, inclusive));
|
340
|
-
}
|
341
|
-
return quantiles;
|
342
|
-
}
|
343
|
-
|
344
|
-
template<typename T, typename C, typename A>
|
345
|
-
std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(uint32_t num, bool inclusive) const {
|
346
|
-
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
347
|
-
if (num == 0) {
|
348
|
-
throw std::invalid_argument("num must be > 0");
|
349
|
-
}
|
350
|
-
vector_double ranks(num, 0, allocator_);
|
351
|
-
ranks[0] = 0.0;
|
352
|
-
for (size_t i = 1; i < num; i++) {
|
353
|
-
ranks[i] = static_cast<double>(i) / (num - 1);
|
354
|
-
}
|
355
|
-
if (num > 1) {
|
356
|
-
ranks[num - 1] = 1.0;
|
357
|
-
}
|
358
|
-
return get_quantiles(ranks.data(), num, inclusive);
|
359
|
-
}
|
360
|
-
|
361
312
|
template<typename T, typename C, typename A>
|
362
313
|
double kll_sketch<T, C, A>::get_normalized_rank_error(bool pmf) const {
|
363
314
|
return get_normalized_rank_error(min_k_, pmf);
|
@@ -396,7 +347,7 @@ template<typename T, typename C, typename A>
|
|
396
347
|
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
397
348
|
size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
|
398
349
|
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
399
|
-
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
350
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
|
400
351
|
// the last integer in the levels_ array is not serialized because it can be derived
|
401
352
|
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
|
402
353
|
}
|
@@ -406,7 +357,7 @@ template<typename T, typename C, typename A>
|
|
406
357
|
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
407
358
|
size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
|
408
359
|
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
409
|
-
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
360
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
|
410
361
|
// the last integer in the levels_ array is not serialized because it can be derived
|
411
362
|
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
|
412
363
|
}
|
@@ -438,8 +389,8 @@ void kll_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
438
389
|
write(os, num_levels_);
|
439
390
|
write(os, unused);
|
440
391
|
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
441
|
-
sd.serialize(os, min_item_, 1);
|
442
|
-
sd.serialize(os, max_item_, 1);
|
392
|
+
sd.serialize(os, &*min_item_, 1);
|
393
|
+
sd.serialize(os, &*max_item_, 1);
|
443
394
|
}
|
444
395
|
sd.serialize(os, &items_[levels_[0]], get_num_retained());
|
445
396
|
}
|
@@ -474,8 +425,8 @@ auto kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& sd)
|
|
474
425
|
ptr += copy_to_mem(num_levels_, ptr);
|
475
426
|
ptr += sizeof(uint8_t); // unused
|
476
427
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
477
|
-
ptr += sd.serialize(ptr, end_ptr - ptr, min_item_, 1);
|
478
|
-
ptr += sd.serialize(ptr, end_ptr - ptr, max_item_, 1);
|
428
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, &*min_item_, 1);
|
429
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, &*max_item_, 1);
|
479
430
|
}
|
480
431
|
const size_t bytes_remaining = end_ptr - ptr;
|
481
432
|
ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
|
@@ -530,20 +481,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
|
|
530
481
|
read(is, levels.data(), sizeof(levels[0]) * num_levels);
|
531
482
|
}
|
532
483
|
levels[num_levels] = capacity;
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
|
537
|
-
std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
|
538
|
-
std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
|
484
|
+
optional<T> tmp; // space to deserialize min and max
|
485
|
+
optional<T> min_item;
|
486
|
+
optional<T> max_item;
|
539
487
|
if (!is_single_item) {
|
540
|
-
sd.deserialize(is,
|
541
|
-
// serde call did not throw, repackage
|
542
|
-
min_item
|
543
|
-
|
544
|
-
|
545
|
-
|
488
|
+
sd.deserialize(is, &*tmp, 1);
|
489
|
+
// serde call did not throw, repackage and cleanup
|
490
|
+
min_item.emplace(*tmp);
|
491
|
+
(*tmp).~T();
|
492
|
+
sd.deserialize(is, &*tmp, 1);
|
493
|
+
// serde call did not throw, repackage and cleanup
|
494
|
+
max_item.emplace(*tmp);
|
495
|
+
(*tmp).~T();
|
546
496
|
}
|
497
|
+
A alloc(allocator);
|
547
498
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
548
499
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
549
500
|
const auto num_items = levels[num_levels] - levels[0];
|
@@ -552,12 +503,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
|
|
552
503
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
553
504
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
554
505
|
if (is_single_item) {
|
555
|
-
|
556
|
-
|
557
|
-
min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
|
558
|
-
new (max_item_buffer.get()) T(items.get()[levels[0]]);
|
559
|
-
// copy did not throw, repackage with destrtuctor
|
560
|
-
max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
|
506
|
+
min_item.emplace(items.get()[levels[0]]);
|
507
|
+
max_item.emplace(items.get()[levels[0]]);
|
561
508
|
}
|
562
509
|
if (!is.good())
|
563
510
|
throw std::runtime_error("error reading from std::istream");
|
@@ -618,20 +565,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
|
|
618
565
|
ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
|
619
566
|
}
|
620
567
|
levels[num_levels] = capacity;
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
|
625
|
-
std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
|
626
|
-
std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
|
568
|
+
optional<T> tmp; // space to deserialize min and max
|
569
|
+
optional<T> min_item;
|
570
|
+
optional<T> max_item;
|
627
571
|
if (!is_single_item) {
|
628
|
-
ptr += sd.deserialize(ptr, end_ptr - ptr,
|
629
|
-
// serde call did not throw, repackage
|
630
|
-
min_item
|
631
|
-
|
632
|
-
|
633
|
-
|
572
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
|
573
|
+
// serde call did not throw, repackage and cleanup
|
574
|
+
min_item.emplace(*tmp);
|
575
|
+
(*tmp).~T();
|
576
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
|
577
|
+
// serde call did not throw, repackage and cleanup
|
578
|
+
max_item.emplace(*tmp);
|
579
|
+
(*tmp).~T();
|
634
580
|
}
|
581
|
+
A alloc(allocator);
|
635
582
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
636
583
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
637
584
|
const auto num_items = levels[num_levels] - levels[0];
|
@@ -642,12 +589,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
|
|
642
589
|
if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
643
590
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
644
591
|
if (is_single_item) {
|
645
|
-
|
646
|
-
|
647
|
-
min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
|
648
|
-
new (max_item_buffer.get()) T(items.get()[levels[0]]);
|
649
|
-
// copy did not throw, repackage with destrtuctor
|
650
|
-
max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
|
592
|
+
min_item.emplace(items.get()[levels[0]]);
|
593
|
+
max_item.emplace(items.get()[levels[0]]);
|
651
594
|
}
|
652
595
|
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
|
653
596
|
std::move(min_item), std::move(max_item), is_level_zero_sorted, comparator);
|
@@ -670,12 +613,12 @@ double kll_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
|
|
670
613
|
// for deserialization
|
671
614
|
template<typename T, typename C, typename A>
|
672
615
|
kll_sketch<T, C, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
|
673
|
-
std::unique_ptr<T, items_deleter> items, uint32_t items_size,
|
674
|
-
|
616
|
+
std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
|
617
|
+
optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator):
|
675
618
|
comparator_(comparator),
|
676
619
|
allocator_(levels.get_allocator()),
|
677
620
|
k_(k),
|
678
|
-
m_(DEFAULT_M),
|
621
|
+
m_(kll_constants::DEFAULT_M),
|
679
622
|
min_k_(min_k),
|
680
623
|
num_levels_(num_levels),
|
681
624
|
is_level_zero_sorted_(is_level_zero_sorted),
|
@@ -683,8 +626,8 @@ n_(n),
|
|
683
626
|
levels_(std::move(levels)),
|
684
627
|
items_(items.release()),
|
685
628
|
items_size_(items_size),
|
686
|
-
min_item_(min_item
|
687
|
-
max_item_(max_item
|
629
|
+
min_item_(std::move(min_item)),
|
630
|
+
max_item_(std::move(max_item)),
|
688
631
|
sorted_view_(nullptr)
|
689
632
|
{}
|
690
633
|
|
@@ -820,7 +763,7 @@ quantiles_sorted_view<T, C, A> kll_sketch<T, C, A>::get_sorted_view() const {
|
|
820
763
|
for (uint8_t level = 0; level < num_levels_; ++level) {
|
821
764
|
const auto from = items_ + levels_[level];
|
822
765
|
const auto to = items_ + levels_[level + 1]; // exclusive
|
823
|
-
view.add(from, to,
|
766
|
+
view.add(from, to, 1ULL << level);
|
824
767
|
}
|
825
768
|
view.convert_to_cummulative();
|
826
769
|
return view;
|
@@ -917,8 +860,8 @@ uint32_t kll_sketch<T, C, A>::get_num_retained_above_level_zero() const {
|
|
917
860
|
|
918
861
|
template<typename T, typename C, typename A>
|
919
862
|
void kll_sketch<T, C, A>::check_m(uint8_t m) {
|
920
|
-
if (m != DEFAULT_M) {
|
921
|
-
throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
|
863
|
+
if (m != kll_constants::DEFAULT_M) {
|
864
|
+
throw std::invalid_argument("Possible corruption: M must be " + std::to_string(kll_constants::DEFAULT_M)
|
922
865
|
+ ": " + std::to_string(m));
|
923
866
|
}
|
924
867
|
}
|
@@ -1019,20 +962,6 @@ typename kll_sketch<T, C, A>::const_iterator kll_sketch<T, C, A>::end() const {
|
|
1019
962
|
return kll_sketch<T, C, A>::const_iterator(nullptr, levels_.data(), num_levels_);
|
1020
963
|
}
|
1021
964
|
|
1022
|
-
template<typename T, typename C, typename A>
|
1023
|
-
class kll_sketch<T, C, A>::item_deleter {
|
1024
|
-
public:
|
1025
|
-
item_deleter(const A& allocator): allocator_(allocator) {}
|
1026
|
-
void operator() (T* ptr) {
|
1027
|
-
if (ptr != nullptr) {
|
1028
|
-
ptr->~T();
|
1029
|
-
allocator_.deallocate(ptr, 1);
|
1030
|
-
}
|
1031
|
-
}
|
1032
|
-
private:
|
1033
|
-
A allocator_;
|
1034
|
-
};
|
1035
|
-
|
1036
965
|
template<typename T, typename C, typename A>
|
1037
966
|
class kll_sketch<T, C, A>::items_deleter {
|
1038
967
|
public:
|
@@ -20,7 +20,6 @@ add_executable(kll_test)
|
|
20
20
|
target_link_libraries(kll_test kll common_test_lib)
|
21
21
|
|
22
22
|
set_target_properties(kll_test PROPERTIES
|
23
|
-
CXX_STANDARD 11
|
24
23
|
CXX_STANDARD_REQUIRED YES
|
25
24
|
)
|
26
25
|
|
@@ -43,3 +42,17 @@ target_sources(kll_test
|
|
43
42
|
kll_sketch_validation.cpp
|
44
43
|
kolmogorov_smirnov_test.cpp
|
45
44
|
)
|
45
|
+
|
46
|
+
if (SERDE_COMPAT)
|
47
|
+
target_sources(kll_test
|
48
|
+
PRIVATE
|
49
|
+
kll_sketch_deserialize_from_java_test.cpp
|
50
|
+
)
|
51
|
+
endif()
|
52
|
+
|
53
|
+
if (GENERATE)
|
54
|
+
target_sources(kll_test
|
55
|
+
PRIVATE
|
56
|
+
kll_sketch_serialize_for_java.cpp
|
57
|
+
)
|
58
|
+
endif()
|
@@ -0,0 +1,103 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
#include <kll_sketch.hpp>
|
23
|
+
|
24
|
+
namespace datasketches {
|
25
|
+
|
26
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
27
|
+
// in the subdirectory called "java" in the root directory of this project
|
28
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
29
|
+
|
30
|
+
TEST_CASE("kll float", "[serde_compat]") {
|
31
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
32
|
+
for (const unsigned n: n_arr) {
|
33
|
+
std::ifstream is;
|
34
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
35
|
+
is.open(testBinaryInputPath + "kll_float_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
36
|
+
const auto sketch = kll_sketch<float>::deserialize(is);
|
37
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
38
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
39
|
+
REQUIRE(sketch.get_n() == n);
|
40
|
+
if (n > 0) {
|
41
|
+
REQUIRE(sketch.get_min_item() == 1.0f);
|
42
|
+
REQUIRE(sketch.get_max_item() == static_cast<float>(n));
|
43
|
+
uint64_t weight = 0;
|
44
|
+
for (const auto pair: sketch) {
|
45
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
46
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
47
|
+
weight += pair.second;
|
48
|
+
}
|
49
|
+
REQUIRE(weight == sketch.get_n());
|
50
|
+
}
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
TEST_CASE("kll double", "[serde_compat]") {
|
55
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
56
|
+
for (const unsigned n: n_arr) {
|
57
|
+
std::ifstream is;
|
58
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
59
|
+
is.open(testBinaryInputPath + "kll_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
60
|
+
const auto sketch = kll_sketch<double>::deserialize(is);
|
61
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
62
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
63
|
+
REQUIRE(sketch.get_n() == n);
|
64
|
+
if (n > 0) {
|
65
|
+
REQUIRE(sketch.get_min_item() == 1.0);
|
66
|
+
REQUIRE(sketch.get_max_item() == static_cast<double>(n));
|
67
|
+
uint64_t weight = 0;
|
68
|
+
for (const auto pair: sketch) {
|
69
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
70
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
71
|
+
weight += pair.second;
|
72
|
+
}
|
73
|
+
REQUIRE(weight == sketch.get_n());
|
74
|
+
}
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
// numbers are padded with leading spaces so that natural order works
|
79
|
+
TEST_CASE("kll string", "[serde_compat]") {
|
80
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
81
|
+
for (const unsigned n: n_arr) {
|
82
|
+
std::ifstream is;
|
83
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
84
|
+
is.open(testBinaryInputPath + "kll_string_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
85
|
+
const auto sketch = kll_sketch<std::string>::deserialize(is);
|
86
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
87
|
+
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
|
88
|
+
REQUIRE(sketch.get_n() == n);
|
89
|
+
if (n > 0) {
|
90
|
+
REQUIRE(std::stoul(sketch.get_min_item()) == 1);
|
91
|
+
REQUIRE(std::stoul(sketch.get_max_item()) == n);
|
92
|
+
uint64_t weight = 0;
|
93
|
+
for (const auto pair: sketch) {
|
94
|
+
REQUIRE(pair.first >= sketch.get_min_item());
|
95
|
+
REQUIRE(pair.first <= sketch.get_max_item());
|
96
|
+
weight += pair.second;
|
97
|
+
}
|
98
|
+
REQUIRE(weight == sketch.get_n());
|
99
|
+
}
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
103
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,62 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
#include <kll_sketch.hpp>
|
23
|
+
|
24
|
+
namespace datasketches {
|
25
|
+
|
26
|
+
TEST_CASE("kll sketch float generate", "[serialize_for_java]") {
|
27
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
28
|
+
for (const unsigned n: n_arr) {
|
29
|
+
kll_sketch<float> sketch;
|
30
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
31
|
+
std::ofstream os("kll_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
32
|
+
sketch.serialize(os);
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
TEST_CASE("kll sketch double generate", "[serialize_for_java]") {
|
37
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
38
|
+
for (const unsigned n: n_arr) {
|
39
|
+
kll_sketch<double> sketch;
|
40
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
41
|
+
std::ofstream os("kll_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
42
|
+
sketch.serialize(os);
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
struct compare_as_number {
|
47
|
+
bool operator()(const std::string& a, const std::string& b) const {
|
48
|
+
return std::stoi(a) < std::stoi(b);
|
49
|
+
}
|
50
|
+
};
|
51
|
+
|
52
|
+
TEST_CASE("kll sketch string generate", "[serialize_for_java]") {
|
53
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
54
|
+
for (const unsigned n: n_arr) {
|
55
|
+
kll_sketch<std::string, compare_as_number> sketch;
|
56
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
|
57
|
+
std::ofstream os("kll_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
58
|
+
sketch.serialize(os);
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
} /* namespace datasketches */
|
@@ -49,9 +49,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
49
49
|
test_allocator_total_bytes = 0;
|
50
50
|
|
51
51
|
SECTION("k limits") {
|
52
|
-
kll_float_sketch sketch1(
|
53
|
-
kll_float_sketch sketch2(
|
54
|
-
REQUIRE_THROWS_AS(new kll_float_sketch(
|
52
|
+
kll_float_sketch sketch1(kll_constants::MIN_K, std::less<float>(), 0); // this should work
|
53
|
+
kll_float_sketch sketch2(kll_constants::MAX_K, std::less<float>(), 0); // this should work
|
54
|
+
REQUIRE_THROWS_AS(new kll_float_sketch(kll_constants::MIN_K - 1, std::less<float>(), 0), std::invalid_argument);
|
55
55
|
// MAX_K + 1 makes no sense because k is uint16_t
|
56
56
|
//std::cout << "sizeof(kll_sketch<float>)=" << sizeof(kll_sketch<float>) << "\n";
|
57
57
|
//std::cout << "sizeof(kll_sketch<double>)=" << sizeof(kll_sketch<double>) << "\n";
|
@@ -67,8 +67,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
67
67
|
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
|
68
68
|
REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
|
69
69
|
REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
|
70
|
-
const double ranks[3] {0, 0.5, 1};
|
71
|
-
REQUIRE_THROWS_AS(sketch.get_quantiles(ranks, 3), std::runtime_error);
|
72
70
|
const float split_points[1] {0};
|
73
71
|
REQUIRE_THROWS_AS(sketch.get_PMF(split_points, 1), std::runtime_error);
|
74
72
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::runtime_error);
|
@@ -99,12 +97,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
99
97
|
REQUIRE(sketch.get_min_item() == 1.0);
|
100
98
|
REQUIRE(sketch.get_max_item() == 1.0);
|
101
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
102
|
-
const double ranks[3] {0, 0.5, 1};
|
103
|
-
auto quantiles = sketch.get_quantiles(ranks, 3);
|
104
|
-
REQUIRE(quantiles.size() == 3);
|
105
|
-
REQUIRE(quantiles[0] == 1.0);
|
106
|
-
REQUIRE(quantiles[1] == 1.0);
|
107
|
-
REQUIRE(quantiles[2] == 1.0);
|
108
100
|
|
109
101
|
int count = 0;
|
110
102
|
for (auto pair: sketch) {
|
@@ -144,20 +136,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
144
136
|
REQUIRE(sketch.get_max_item() == n);
|
145
137
|
REQUIRE(sketch.get_quantile(1) == n);
|
146
138
|
|
147
|
-
const double ranks[3] {0, 0.5, 1};
|
148
|
-
auto quantiles = sketch.get_quantiles(ranks, 3);
|
149
|
-
REQUIRE(quantiles.size() == 3);
|
150
|
-
REQUIRE(quantiles[0] == 1);
|
151
|
-
REQUIRE(quantiles[1] == n / 2);
|
152
|
-
REQUIRE(quantiles[2] == n);
|
153
|
-
|
154
|
-
// alternative method must produce the same result
|
155
|
-
auto quantiles2 = sketch.get_quantiles(3);
|
156
|
-
REQUIRE(quantiles2.size() == 3);
|
157
|
-
REQUIRE(quantiles[0] == quantiles2[0]);
|
158
|
-
REQUIRE(quantiles[1] == quantiles2[1]);
|
159
|
-
REQUIRE(quantiles[2] == quantiles2[2]);
|
160
|
-
|
161
139
|
for (uint32_t i = 1; i <= n; i++) {
|
162
140
|
const double true_rank_inclusive = static_cast<double>(i) / n;
|
163
141
|
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank_inclusive);
|
@@ -264,19 +242,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
264
242
|
}
|
265
243
|
}
|
266
244
|
|
267
|
-
SECTION("deserialize from java") {
|
268
|
-
std::ifstream is;
|
269
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
270
|
-
is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
|
271
|
-
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), std::less<float>(), 0);
|
272
|
-
REQUIRE_FALSE(sketch.is_empty());
|
273
|
-
REQUIRE(sketch.is_estimation_mode());
|
274
|
-
REQUIRE(sketch.get_n() == 1000000);
|
275
|
-
REQUIRE(sketch.get_num_retained() == 614);
|
276
|
-
REQUIRE(sketch.get_min_item() == 0.0);
|
277
|
-
REQUIRE(sketch.get_max_item() == 999999.0);
|
278
|
-
}
|
279
|
-
|
280
245
|
SECTION("stream serialize deserialize empty") {
|
281
246
|
kll_float_sketch sketch(200, std::less<float>(), 0);
|
282
247
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|