datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -151,8 +151,8 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
|
151
151
|
const uint32_t old_size = 1 << lg_size;
|
|
152
152
|
const uint32_t new_size = 1 << new_lg_size;
|
|
153
153
|
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
|
154
|
-
vector_u32
|
|
155
|
-
slots = vector_u32
|
|
154
|
+
vector_u32 old_slots = std::move(slots);
|
|
155
|
+
slots = vector_u32(new_size, UINT32_MAX, old_slots.get_allocator());
|
|
156
156
|
lg_size = new_lg_size;
|
|
157
157
|
for (uint32_t i = 0; i < old_size; i++) {
|
|
158
158
|
if (old_slots[i] != UINT32_MAX) {
|
|
@@ -168,10 +168,10 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
|
168
168
|
// and even then the subsequent sort would fix things up.
|
|
169
169
|
// The result is nearly sorted, so make sure to use an efficient sort for that case
|
|
170
170
|
template<typename A>
|
|
171
|
-
|
|
172
|
-
if (num_items == 0) return vector_u32
|
|
171
|
+
auto u32_table<A>::unwrapping_get_items() const -> vector_u32 {
|
|
172
|
+
if (num_items == 0) return vector_u32(slots.get_allocator());
|
|
173
173
|
const uint32_t table_size = 1 << lg_size;
|
|
174
|
-
vector_u32
|
|
174
|
+
vector_u32 result(num_items, 0, slots.get_allocator());
|
|
175
175
|
size_t i = 0;
|
|
176
176
|
size_t l = 0;
|
|
177
177
|
size_t r = num_items - 1;
|
|
@@ -20,16 +20,15 @@ add_executable(cpc_test)
|
|
|
20
20
|
target_link_libraries(cpc_test cpc common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(cpc_test PROPERTIES
|
|
23
|
-
CXX_STANDARD 11
|
|
24
23
|
CXX_STANDARD_REQUIRED YES
|
|
25
24
|
)
|
|
26
25
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
26
|
+
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" CPC_TEST_BINARY_PATH)
|
|
27
|
+
string(APPEND CPC_TEST_BINARY_PATH "/")
|
|
28
|
+
target_compile_definitions(cpc_test
|
|
29
|
+
PRIVATE
|
|
30
|
+
TEST_BINARY_INPUT_PATH="${CPC_TEST_BINARY_PATH}"
|
|
31
|
+
)
|
|
33
32
|
|
|
34
33
|
add_test(
|
|
35
34
|
NAME cpc_test
|
|
@@ -43,3 +42,17 @@ target_sources(cpc_test
|
|
|
43
42
|
compression_test.cpp
|
|
44
43
|
cpc_sketch_allocation_test.cpp
|
|
45
44
|
)
|
|
45
|
+
|
|
46
|
+
if (SERDE_COMPAT)
|
|
47
|
+
target_sources(cpc_test
|
|
48
|
+
PRIVATE
|
|
49
|
+
cpc_sketch_deserialize_from_java_test.cpp
|
|
50
|
+
)
|
|
51
|
+
endif()
|
|
52
|
+
|
|
53
|
+
if (GENERATE)
|
|
54
|
+
target_sources(cpc_test
|
|
55
|
+
PRIVATE
|
|
56
|
+
cpc_sketch_serialize_for_java.cpp
|
|
57
|
+
)
|
|
58
|
+
endif()
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <cpc_sketch.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
|
27
|
+
// in the subdirectory called "java" in the root directory of this project
|
|
28
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
|
29
|
+
|
|
30
|
+
TEST_CASE("cpc sketch", "[serde_compat]") {
|
|
31
|
+
const unsigned n_arr[] = {0, 100, 200, 2000, 20000};
|
|
32
|
+
for (const unsigned n: n_arr) {
|
|
33
|
+
std::ifstream is;
|
|
34
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
35
|
+
is.open(testBinaryInputPath + "cpc_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
36
|
+
const auto sketch = cpc_sketch::deserialize(is);
|
|
37
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
38
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
TEST_CASE("cpc sketch negative one", "[serde_compat]") {
|
|
43
|
+
std::ifstream is;
|
|
44
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
45
|
+
is.open(testBinaryInputPath + "cpc_negative_one_java.sk", std::ios::binary);
|
|
46
|
+
auto sketch = cpc_sketch::deserialize(is);
|
|
47
|
+
REQUIRE_FALSE(sketch.is_empty());
|
|
48
|
+
REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));
|
|
49
|
+
sketch.update((uint64_t) -1);
|
|
50
|
+
sketch.update((int64_t) -1);
|
|
51
|
+
sketch.update((uint32_t) -1);
|
|
52
|
+
sketch.update((int32_t) -1);
|
|
53
|
+
sketch.update((uint16_t) -1);
|
|
54
|
+
sketch.update((int16_t) -1);
|
|
55
|
+
sketch.update((uint8_t) -1);
|
|
56
|
+
sketch.update((int8_t) -1);
|
|
57
|
+
REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
} /* namespace datasketches */
|
|
@@ -17,21 +17,22 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#
|
|
21
|
-
#
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <cpc_sketch.hpp>
|
|
22
23
|
|
|
23
|
-
|
|
24
|
+
namespace datasketches {
|
|
24
25
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
26
|
+
TEST_CASE("cpc sketch generate", "[serialize_for_java]") {
|
|
27
|
+
const unsigned n_arr[] = {0, 100, 200, 2000, 20000};
|
|
28
|
+
for (const unsigned n: n_arr) {
|
|
29
|
+
cpc_sketch sketch;
|
|
30
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
|
31
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
32
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
|
|
33
|
+
std::ofstream os("cpc_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
34
|
+
sketch.serialize(os);
|
|
34
35
|
}
|
|
35
|
-
}
|
|
36
|
+
}
|
|
36
37
|
|
|
37
|
-
|
|
38
|
+
} /* namespace datasketches */
|
|
@@ -32,10 +32,10 @@ namespace datasketches {
|
|
|
32
32
|
static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
|
|
33
33
|
|
|
34
34
|
TEST_CASE("cpc sketch: lg k limits", "[cpc_sketch]") {
|
|
35
|
-
cpc_sketch s1(
|
|
36
|
-
cpc_sketch s2(
|
|
37
|
-
REQUIRE_THROWS_AS(cpc_sketch(
|
|
38
|
-
REQUIRE_THROWS_AS(cpc_sketch(
|
|
35
|
+
cpc_sketch s1(cpc_constants::MIN_LG_K); // this should work
|
|
36
|
+
cpc_sketch s2(cpc_constants::MAX_LG_K); // this should work
|
|
37
|
+
REQUIRE_THROWS_AS(cpc_sketch(cpc_constants::MIN_LG_K - 1), std::invalid_argument);
|
|
38
|
+
REQUIRE_THROWS_AS(cpc_sketch(cpc_constants::MAX_LG_K + 1), std::invalid_argument);
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
TEST_CASE("cpc sketch: empty", "[cpc_sketch]") {
|
|
@@ -88,9 +88,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty", "[cpc_sketch]") {
|
|
|
88
88
|
REQUIRE(deserialized.is_empty() == sketch.is_empty());
|
|
89
89
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
90
90
|
REQUIRE(deserialized.validate());
|
|
91
|
-
|
|
92
|
-
std::ofstream os("cpc-empty.bin");
|
|
93
|
-
sketch.serialize(os);
|
|
94
91
|
}
|
|
95
92
|
|
|
96
93
|
TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
|
|
@@ -108,9 +105,6 @@ TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
|
|
|
108
105
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
109
106
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
110
107
|
REQUIRE(deserialized.validate());
|
|
111
|
-
|
|
112
|
-
std::ofstream os("cpc-sparse.bin");
|
|
113
|
-
sketch.serialize(os);
|
|
114
108
|
}
|
|
115
109
|
|
|
116
110
|
TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
|
|
@@ -128,9 +122,6 @@ TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
|
|
|
128
122
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
129
123
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
130
124
|
REQUIRE(deserialized.validate());
|
|
131
|
-
|
|
132
|
-
std::ofstream os("cpc-hybrid.bin");
|
|
133
|
-
sketch.serialize(os);
|
|
134
125
|
}
|
|
135
126
|
|
|
136
127
|
TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
|
|
@@ -148,9 +139,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
|
|
|
148
139
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
149
140
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
150
141
|
REQUIRE(deserialized.validate());
|
|
151
|
-
|
|
152
|
-
std::ofstream os("cpc-pinned.bin");
|
|
153
|
-
sketch.serialize(os);
|
|
154
142
|
}
|
|
155
143
|
|
|
156
144
|
TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
|
|
@@ -168,9 +156,6 @@ TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
|
|
|
168
156
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
169
157
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
170
158
|
REQUIRE(deserialized.validate());
|
|
171
|
-
|
|
172
|
-
std::ofstream os("cpc-sliding.bin");
|
|
173
|
-
sketch.serialize(os);
|
|
174
159
|
}
|
|
175
160
|
|
|
176
161
|
TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
|
|
@@ -188,9 +173,6 @@ TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
|
|
|
188
173
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
189
174
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
190
175
|
REQUIRE(deserialized.validate());
|
|
191
|
-
|
|
192
|
-
std::ofstream os("cpc-sliding-large.bin");
|
|
193
|
-
sketch.serialize(os);
|
|
194
176
|
}
|
|
195
177
|
|
|
196
178
|
TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
|
|
@@ -201,9 +183,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
|
|
|
201
183
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
202
184
|
REQUIRE(deserialized.validate());
|
|
203
185
|
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
204
|
-
|
|
205
|
-
std::ofstream os("cpc-empty.bin");
|
|
206
|
-
sketch.serialize(os);
|
|
207
186
|
}
|
|
208
187
|
|
|
209
188
|
TEST_CASE("cpc sketch: serialize deserialize sparse, bytes", "[cpc_sketch]") {
|
|
@@ -261,8 +240,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned, bytes", "[cpc_sketch]") {
|
|
|
261
240
|
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
262
241
|
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
263
242
|
REQUIRE(deserialized.validate());
|
|
264
|
-
|
|
265
|
-
std::cout << sketch.to_string();
|
|
266
243
|
}
|
|
267
244
|
|
|
268
245
|
TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
|
|
@@ -380,8 +357,6 @@ TEST_CASE("cpc sketch: update int equivalence", "[cpc_sketch]") {
|
|
|
380
357
|
sketch.update((uint8_t) -1);
|
|
381
358
|
sketch.update((int8_t) -1);
|
|
382
359
|
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
|
|
383
|
-
std::ofstream os("cpc-negative-one.bin"); // to compare with Java
|
|
384
|
-
sketch.serialize(os);
|
|
385
360
|
}
|
|
386
361
|
|
|
387
362
|
TEST_CASE("cpc sketch: update float equivalence", "[cpc_sketch]") {
|
|
@@ -28,10 +28,10 @@ namespace datasketches {
|
|
|
28
28
|
static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
|
|
29
29
|
|
|
30
30
|
TEST_CASE("cpc union: lg k limits", "[cpc_union]") {
|
|
31
|
-
cpc_union u1(
|
|
32
|
-
cpc_union u2(
|
|
33
|
-
REQUIRE_THROWS_AS(cpc_union(
|
|
34
|
-
REQUIRE_THROWS_AS(cpc_union(
|
|
31
|
+
cpc_union u1(cpc_constants::MIN_LG_K); // this should work
|
|
32
|
+
cpc_union u2(cpc_constants::MAX_LG_K); // this should work
|
|
33
|
+
REQUIRE_THROWS_AS(cpc_union(cpc_constants::MIN_LG_K - 1), std::invalid_argument);
|
|
34
|
+
REQUIRE_THROWS_AS(cpc_union(cpc_constants::MAX_LG_K + 1), std::invalid_argument);
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
TEST_CASE("cpc union: empty", "[cpc_union]") {
|
|
@@ -28,15 +28,6 @@
|
|
|
28
28
|
|
|
29
29
|
#include "common_defs.hpp"
|
|
30
30
|
|
|
31
|
-
/*
|
|
32
|
-
* Based on the following paper:
|
|
33
|
-
* Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
|
|
34
|
-
* https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
|
|
35
|
-
*
|
|
36
|
-
* Inspired by the following implementation:
|
|
37
|
-
* https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
|
|
38
|
-
*/
|
|
39
|
-
|
|
40
31
|
namespace datasketches {
|
|
41
32
|
|
|
42
33
|
template<typename T>
|
|
@@ -46,6 +37,18 @@ struct gaussian_kernel {
|
|
|
46
37
|
}
|
|
47
38
|
};
|
|
48
39
|
|
|
40
|
+
/**
|
|
41
|
+
* Density sketch.
|
|
42
|
+
*
|
|
43
|
+
* Builds a coreset from the given set of input points. Provides density estimate at a given point.
|
|
44
|
+
*
|
|
45
|
+
* Based on the following paper:
|
|
46
|
+
* Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
|
|
47
|
+
* https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
|
|
48
|
+
*
|
|
49
|
+
* Inspired by the following implementation:
|
|
50
|
+
* https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
|
|
51
|
+
*/
|
|
49
52
|
template<
|
|
50
53
|
typename T,
|
|
51
54
|
typename Kernel = gaussian_kernel<T>,
|
|
@@ -118,6 +121,10 @@ public:
|
|
|
118
121
|
template<typename FwdSketch>
|
|
119
122
|
void merge(FwdSketch&& other);
|
|
120
123
|
|
|
124
|
+
/**
|
|
125
|
+
* Density estimate at a given point
|
|
126
|
+
* @return density estimate at a given point
|
|
127
|
+
*/
|
|
121
128
|
T get_estimate(const std::vector<T>& point) const;
|
|
122
129
|
|
|
123
130
|
/**
|
|
@@ -172,7 +179,20 @@ public:
|
|
|
172
179
|
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
|
|
173
180
|
|
|
174
181
|
class const_iterator;
|
|
182
|
+
|
|
183
|
+
/**
|
|
184
|
+
* Iterator pointing to the first item in the sketch.
|
|
185
|
+
* If the sketch is empty, the returned iterator must not be dereferenced or incremented.
|
|
186
|
+
* @return iterator pointing to the first item in the sketch
|
|
187
|
+
*/
|
|
175
188
|
const_iterator begin() const;
|
|
189
|
+
|
|
190
|
+
/**
|
|
191
|
+
* Iterator pointing to the past-the-end item in the sketch.
|
|
192
|
+
* The past-the-end item is the hypothetical item that would follow the last item.
|
|
193
|
+
* It does not point to any item, and must not be dereferenced or incremented.
|
|
194
|
+
* @return iterator pointing to the past-the-end item in the sketch
|
|
195
|
+
*/
|
|
176
196
|
const_iterator end() const;
|
|
177
197
|
|
|
178
198
|
private:
|
|
@@ -143,7 +143,7 @@ template<typename T, typename K, typename A>
|
|
|
143
143
|
void density_sketch<T, K, A>::compact_level(unsigned height) {
|
|
144
144
|
auto& level = levels_[height];
|
|
145
145
|
std::vector<bool> bits(level.size());
|
|
146
|
-
bits[0] = random_bit();
|
|
146
|
+
bits[0] = random_utils::random_bit();
|
|
147
147
|
std::random_shuffle(level.begin(), level.end());
|
|
148
148
|
for (unsigned i = 1; i < level.size(); ++i) {
|
|
149
149
|
T delta = 0;
|
|
@@ -32,15 +32,19 @@
|
|
|
32
32
|
|
|
33
33
|
namespace datasketches {
|
|
34
34
|
|
|
35
|
-
|
|
35
|
+
/// Frequent items error type
|
|
36
|
+
enum frequent_items_error_type {
|
|
37
|
+
NO_FALSE_POSITIVES, ///< include an item in the result list if get_lower_bound(item) > threshold
|
|
38
|
+
NO_FALSE_NEGATIVES ///< include an item in the result list if get_upper_bound(item) > threshold
|
|
39
|
+
};
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Frequent Items sketch.
|
|
43
|
+
*
|
|
36
44
|
* Based on Java implementation here:
|
|
37
45
|
* https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
|
|
38
|
-
* author Alexander Saydakov
|
|
46
|
+
* @author Alexander Saydakov
|
|
39
47
|
*/
|
|
40
|
-
|
|
41
|
-
enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
|
|
42
|
-
|
|
43
|
-
// type W for weight must be an arithmetic type (integral or floating point)
|
|
44
48
|
template<
|
|
45
49
|
typename T,
|
|
46
50
|
typename W = uint64_t,
|
|
@@ -49,6 +53,7 @@ template<
|
|
|
49
53
|
typename A = std::allocator<T>
|
|
50
54
|
>
|
|
51
55
|
class frequent_items_sketch {
|
|
56
|
+
static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
|
|
52
57
|
public:
|
|
53
58
|
|
|
54
59
|
static const uint8_t LG_MIN_MAP_SIZE = 3;
|
|
@@ -194,7 +199,7 @@ public:
|
|
|
194
199
|
* There may be items omitted from the set with true frequencies greater than the
|
|
195
200
|
* threshold (false negatives).</p>
|
|
196
201
|
*
|
|
197
|
-
* @param
|
|
202
|
+
* @param err_type determines whether no false positives or no false negatives are desired.
|
|
198
203
|
* @return an array of frequent items
|
|
199
204
|
*/
|
|
200
205
|
vector_row get_frequent_items(frequent_items_error_type err_type) const;
|
|
@@ -217,7 +222,7 @@ public:
|
|
|
217
222
|
* There may be items omitted from the set with true frequencies greater than the
|
|
218
223
|
* threshold (false negatives).</p>
|
|
219
224
|
*
|
|
220
|
-
* @param
|
|
225
|
+
* @param err_type determines whether no false positives or no false negatives are desired.
|
|
221
226
|
* @param threshold to include items in the result list
|
|
222
227
|
* @return an array of frequent items
|
|
223
228
|
*/
|
|
@@ -293,7 +298,9 @@ private:
|
|
|
293
298
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
|
294
299
|
static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
|
|
295
300
|
static constexpr double EPSILON_FACTOR = 3.5;
|
|
296
|
-
|
|
301
|
+
// due to a mistake different bits were used in C++ and Java to indicate empty sketch
|
|
302
|
+
// therefore both are set and checked for compatibility with historical binary format
|
|
303
|
+
enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
|
|
297
304
|
W total_weight;
|
|
298
305
|
W offset;
|
|
299
306
|
reverse_purge_hash_map<T, W, H, E, A> map;
|
|
@@ -318,14 +325,19 @@ private:
|
|
|
318
325
|
class items_deleter;
|
|
319
326
|
};
|
|
320
327
|
|
|
328
|
+
/// Row in the output from #get_frequent_items
|
|
321
329
|
template<typename T, typename W, typename H, typename E, typename A>
|
|
322
330
|
class frequent_items_sketch<T, W, H, E, A>::row {
|
|
323
331
|
public:
|
|
324
332
|
row(const T* item, W weight, W offset):
|
|
325
333
|
item(item), weight(weight), offset(offset) {}
|
|
334
|
+
/// @return item
|
|
326
335
|
const T& get_item() const { return *item; }
|
|
336
|
+
/// @return frequency (weight) estimate
|
|
327
337
|
W get_estimate() const { return weight + offset; }
|
|
338
|
+
/// @return estimate lower bound
|
|
328
339
|
W get_lower_bound() const { return weight; }
|
|
340
|
+
/// @return estimate upper bound
|
|
329
341
|
W get_upper_bound() const { return weight + offset; }
|
|
330
342
|
private:
|
|
331
343
|
const T* item;
|
|
@@ -174,7 +174,8 @@ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const Ser
|
|
|
174
174
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
175
175
|
write(os, lg_cur_size);
|
|
176
176
|
const uint8_t flags_byte(
|
|
177
|
-
|
|
177
|
+
(is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
|
|
178
|
+
| (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
|
|
178
179
|
);
|
|
179
180
|
write(os, flags_byte);
|
|
180
181
|
const uint16_t unused16 = 0;
|
|
@@ -234,7 +235,8 @@ auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes,
|
|
|
234
235
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
235
236
|
ptr += copy_to_mem(lg_cur_size, ptr);
|
|
236
237
|
const uint8_t flags_byte(
|
|
237
|
-
|
|
238
|
+
(is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
|
|
239
|
+
| (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
|
|
238
240
|
);
|
|
239
241
|
ptr += copy_to_mem(flags_byte, ptr);
|
|
240
242
|
ptr += sizeof(uint16_t); // unused
|
|
@@ -298,7 +300,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
|
|
|
298
300
|
const auto flags_byte = read<uint8_t>(is);
|
|
299
301
|
read<uint16_t>(is); // unused
|
|
300
302
|
|
|
301
|
-
const bool is_empty = flags_byte & (1 << flags::
|
|
303
|
+
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));
|
|
302
304
|
|
|
303
305
|
check_preamble_longs(preamble_longs, is_empty);
|
|
304
306
|
check_serial_version(serial_version);
|
|
@@ -352,7 +354,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
|
|
|
352
354
|
ptr += copy_from_mem(ptr, flags_byte);
|
|
353
355
|
ptr += sizeof(uint16_t); // unused
|
|
354
356
|
|
|
355
|
-
const bool is_empty = flags_byte & (1 << flags::
|
|
357
|
+
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));
|
|
356
358
|
|
|
357
359
|
check_preamble_longs(preamble_longs, is_empty);
|
|
358
360
|
check_serial_version(serial_version);
|
|
@@ -20,7 +20,6 @@ add_executable(fi_test)
|
|
|
20
20
|
target_link_libraries(fi_test fi common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(fi_test PROPERTIES
|
|
23
|
-
CXX_STANDARD 11
|
|
24
23
|
CXX_STANDARD_REQUIRED YES
|
|
25
24
|
)
|
|
26
25
|
|
|
@@ -42,3 +41,17 @@ target_sources(fi_test
|
|
|
42
41
|
frequent_items_sketch_test.cpp
|
|
43
42
|
frequent_items_sketch_custom_type_test.cpp
|
|
44
43
|
)
|
|
44
|
+
|
|
45
|
+
if (SERDE_COMPAT)
|
|
46
|
+
target_sources(fi_test
|
|
47
|
+
PRIVATE
|
|
48
|
+
frequent_items_sketch_deserialize_from_java_test.cpp
|
|
49
|
+
)
|
|
50
|
+
endif()
|
|
51
|
+
|
|
52
|
+
if (GENERATE)
|
|
53
|
+
target_sources(fi_test
|
|
54
|
+
PRIVATE
|
|
55
|
+
frequent_items_sketch_serialize_for_java.cpp
|
|
56
|
+
)
|
|
57
|
+
endif()
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <frequent_items_sketch.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
|
27
|
+
// in the subdirectory called "java" in the root directory of this project
|
|
28
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
|
29
|
+
|
|
30
|
+
TEST_CASE("frequent longs", "[serde_compat]") {
|
|
31
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
32
|
+
for (const unsigned n: n_arr) {
|
|
33
|
+
std::ifstream is;
|
|
34
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
35
|
+
is.open(testBinaryInputPath + "frequent_long_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
36
|
+
const auto sketch = frequent_items_sketch<int64_t>::deserialize(is);
|
|
37
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
38
|
+
if (n > 10) {
|
|
39
|
+
REQUIRE(sketch.get_maximum_error() > 0);
|
|
40
|
+
} else {
|
|
41
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
42
|
+
}
|
|
43
|
+
REQUIRE(sketch.get_total_weight() == n);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
TEST_CASE("frequent strings", "[serde_compat]") {
|
|
48
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
49
|
+
for (const unsigned n: n_arr) {
|
|
50
|
+
std::ifstream is;
|
|
51
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
52
|
+
is.open(testBinaryInputPath + "frequent_string_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
53
|
+
const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
|
|
54
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
55
|
+
if (n > 10) {
|
|
56
|
+
REQUIRE(sketch.get_maximum_error() > 0);
|
|
57
|
+
} else {
|
|
58
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
59
|
+
}
|
|
60
|
+
REQUIRE(sketch.get_total_weight() == n);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
TEST_CASE("frequent strings ascii", "[serde_compat]") {
|
|
65
|
+
std::ifstream is;
|
|
66
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
67
|
+
is.open(testBinaryInputPath + "frequent_string_ascii_java.sk", std::ios::binary);
|
|
68
|
+
const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
|
|
69
|
+
REQUIRE_FALSE(sketch.is_empty());
|
|
70
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
71
|
+
REQUIRE(sketch.get_total_weight() == 10);
|
|
72
|
+
REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
|
|
73
|
+
REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 2);
|
|
74
|
+
REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 3);
|
|
75
|
+
REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 4);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
TEST_CASE("frequent strings utf8", "[serde_compat]") {
|
|
79
|
+
std::ifstream is;
|
|
80
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
81
|
+
is.open(testBinaryInputPath + "frequent_string_utf8_java.sk", std::ios::binary);
|
|
82
|
+
const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
|
|
83
|
+
REQUIRE_FALSE(sketch.is_empty());
|
|
84
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
85
|
+
REQUIRE(sketch.get_total_weight() == 28);
|
|
86
|
+
REQUIRE(sketch.get_estimate("абвгд") == 1);
|
|
87
|
+
REQUIRE(sketch.get_estimate("еёжзи") == 2);
|
|
88
|
+
REQUIRE(sketch.get_estimate("йклмн") == 3);
|
|
89
|
+
REQUIRE(sketch.get_estimate("опрст") == 4);
|
|
90
|
+
REQUIRE(sketch.get_estimate("уфхцч") == 5);
|
|
91
|
+
REQUIRE(sketch.get_estimate("шщъыь") == 6);
|
|
92
|
+
REQUIRE(sketch.get_estimate("эюя") == 7);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
} /* namespace datasketches */
|