datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
namespace datasketches {
|
|
30
30
|
|
|
31
31
|
/**
|
|
32
|
+
* Bounds on ratios in sampled sets.
|
|
32
33
|
* This class is used to compute the bounds on the estimate of the ratio <i>|B| / |A|</i>, where:
|
|
33
34
|
* <ul>
|
|
34
35
|
* <li><i>|A|</i> is the unknown size of a set <i>A</i> of unique identifiers.</li>
|
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
namespace datasketches {
|
|
29
29
|
|
|
30
30
|
/**
|
|
31
|
+
* Bounds on ratios in Theta sketched sets.
|
|
31
32
|
* This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where:
|
|
32
33
|
* <ul>
|
|
33
34
|
* <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li>
|
|
@@ -50,8 +51,8 @@ class bounds_on_ratios_in_theta_sketched_sets {
|
|
|
50
51
|
public:
|
|
51
52
|
/**
|
|
52
53
|
* Gets the approximate lower bound for B over A based on a 95% confidence interval
|
|
53
|
-
* @param
|
|
54
|
-
* @param
|
|
54
|
+
* @param sketch_a the sketch A
|
|
55
|
+
* @param sketch_b the sketch B
|
|
55
56
|
* @return the approximate lower bound for B over A
|
|
56
57
|
*/
|
|
57
58
|
template<typename SketchA, typename SketchB>
|
|
@@ -72,8 +73,8 @@ public:
|
|
|
72
73
|
|
|
73
74
|
/**
|
|
74
75
|
* Gets the approximate upper bound for B over A based on a 95% confidence interval
|
|
75
|
-
* @param
|
|
76
|
-
* @param
|
|
76
|
+
* @param sketch_a the sketch A
|
|
77
|
+
* @param sketch_b the sketch B
|
|
77
78
|
* @return the approximate upper bound for B over A
|
|
78
79
|
*/
|
|
79
80
|
template<typename SketchA, typename SketchB>
|
|
@@ -94,8 +95,8 @@ public:
|
|
|
94
95
|
|
|
95
96
|
/**
|
|
96
97
|
* Gets the estimate for B over A
|
|
97
|
-
* @param
|
|
98
|
-
* @param
|
|
98
|
+
* @param sketch_a the sketch A
|
|
99
|
+
* @param sketch_b the sketch B
|
|
99
100
|
* @return the estimate for B over A
|
|
100
101
|
*/
|
|
101
102
|
template<typename SketchA, typename SketchB>
|
|
@@ -25,6 +25,16 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
+
// forward declaration
|
|
29
|
+
template<typename A> class theta_a_not_b_alloc;
|
|
30
|
+
|
|
31
|
+
// alias with default allocator for convenience
|
|
32
|
+
using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Theta A-not-B (set difference).
|
|
36
|
+
* Computes set difference of Theta sketches.
|
|
37
|
+
*/
|
|
28
38
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
29
39
|
class theta_a_not_b_alloc {
|
|
30
40
|
public:
|
|
@@ -33,11 +43,19 @@ public:
|
|
|
33
43
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
34
44
|
using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
|
|
35
45
|
|
|
46
|
+
/**
|
|
47
|
+
* Constructor
|
|
48
|
+
* @param seed for the hash function that was used to create the sketch
|
|
49
|
+
* @param allocator to use for allocating and deallocating memory
|
|
50
|
+
*/
|
|
36
51
|
explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
37
52
|
|
|
38
53
|
/**
|
|
39
|
-
* Computes the
|
|
40
|
-
* @
|
|
54
|
+
* Computes the A-not-B set operation given two sketches.
|
|
55
|
+
* @param a sketch A
|
|
56
|
+
* @param b sketch B
|
|
57
|
+
* @param ordered optional flag to specify if an ordered sketch should be produced
|
|
58
|
+
* @return the result of A-not-B as a compact sketch
|
|
41
59
|
*/
|
|
42
60
|
template<typename FwdSketch, typename Sketch>
|
|
43
61
|
CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
|
|
@@ -46,9 +64,6 @@ private:
|
|
|
46
64
|
State state_;
|
|
47
65
|
};
|
|
48
66
|
|
|
49
|
-
// alias with default allocator for convenience
|
|
50
|
-
using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
|
|
51
|
-
|
|
52
67
|
} /* namespace datasketches */
|
|
53
68
|
|
|
54
69
|
#include "theta_a_not_b_impl.hpp"
|
|
@@ -25,15 +25,21 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
+
/// Theta constants
|
|
28
29
|
namespace theta_constants {
|
|
30
|
+
/// hash table resize factor
|
|
29
31
|
using resize_factor = datasketches::resize_factor;
|
|
30
|
-
|
|
31
|
-
const
|
|
32
|
+
/// default resize factor
|
|
33
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
|
34
|
+
|
|
35
|
+
/// max theta - signed max for compatibility with Java
|
|
36
|
+
const uint64_t MAX_THETA = LLONG_MAX;
|
|
37
|
+
/// min log2 of K
|
|
32
38
|
const uint8_t MIN_LG_K = 5;
|
|
39
|
+
/// max log2 of K
|
|
33
40
|
const uint8_t MAX_LG_K = 26;
|
|
34
|
-
|
|
41
|
+
/// default log2 of K
|
|
35
42
|
const uint8_t DEFAULT_LG_K = 12;
|
|
36
|
-
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
|
37
43
|
}
|
|
38
44
|
|
|
39
45
|
} /* namespace datasketches */
|
|
@@ -57,7 +57,7 @@ public:
|
|
|
57
57
|
// consistent way of initializing theta from p
|
|
58
58
|
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
|
59
59
|
static uint64_t starting_theta_from_p(float p) {
|
|
60
|
-
if (p < 1) return static_cast<
|
|
60
|
+
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
|
61
61
|
return theta_constants::MAX_THETA;
|
|
62
62
|
}
|
|
63
63
|
|
|
@@ -25,6 +25,16 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
+
// forward declaration
|
|
29
|
+
template<typename A> class theta_intersection_alloc;
|
|
30
|
+
|
|
31
|
+
// alias with default allocator for convenience
|
|
32
|
+
using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Theta intersection.
|
|
36
|
+
* Computes intersection of Theta sketches.
|
|
37
|
+
*/
|
|
28
38
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
29
39
|
class theta_intersection_alloc {
|
|
30
40
|
public:
|
|
@@ -33,6 +43,7 @@ public:
|
|
|
33
43
|
using Sketch = theta_sketch_alloc<Allocator>;
|
|
34
44
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
35
45
|
|
|
46
|
+
// there is no payload in Theta sketch entry
|
|
36
47
|
struct nop_policy {
|
|
37
48
|
void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
38
49
|
unused(incoming_entry);
|
|
@@ -41,7 +52,7 @@ public:
|
|
|
41
52
|
};
|
|
42
53
|
using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
|
|
43
54
|
|
|
44
|
-
|
|
55
|
+
/**
|
|
45
56
|
* Constructor
|
|
46
57
|
* @param seed for the hash function that was used to create the sketch
|
|
47
58
|
* @param allocator to use for allocating and deallocating memory
|
|
@@ -61,7 +72,7 @@ public:
|
|
|
61
72
|
* Produces a copy of the current state of the intersection.
|
|
62
73
|
* If update() was not called, the state is the infinite "universe",
|
|
63
74
|
* which is considered an undefined state, and throws an exception.
|
|
64
|
-
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
75
|
+
* @param ordered optional flag to specify if an ordered sketch should be produced
|
|
65
76
|
* @return the result of the intersection
|
|
66
77
|
*/
|
|
67
78
|
CompactSketch get_result(bool ordered = true) const;
|
|
@@ -76,9 +87,6 @@ private:
|
|
|
76
87
|
State state_;
|
|
77
88
|
};
|
|
78
89
|
|
|
79
|
-
// alias with default allocator for convenience
|
|
80
|
-
using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
|
|
81
|
-
|
|
82
90
|
} /* namespace datasketches */
|
|
83
91
|
|
|
84
92
|
#include "theta_intersection_impl.hpp"
|
|
@@ -49,8 +49,8 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
49
49
|
if (!is_valid_) { // first update, copy or move incoming sketch
|
|
50
50
|
is_valid_ = true;
|
|
51
51
|
const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
52
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
53
|
-
for (auto
|
|
52
|
+
table_ = hash_table(lg_size, lg_size - 1, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
53
|
+
for (auto&& entry: sketch) {
|
|
54
54
|
auto result = table_.find(EK()(entry));
|
|
55
55
|
if (result.second) {
|
|
56
56
|
throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
|
|
@@ -64,7 +64,7 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
64
64
|
matched_entries.reserve(max_matches);
|
|
65
65
|
uint32_t match_count = 0;
|
|
66
66
|
uint32_t count = 0;
|
|
67
|
-
for (auto
|
|
67
|
+
for (auto&& entry: sketch) {
|
|
68
68
|
if (EK()(entry) < table_.theta_) {
|
|
69
69
|
auto result = table_.find(EK()(entry));
|
|
70
70
|
if (result.second) {
|
|
@@ -88,8 +88,8 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
88
88
|
if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
|
|
89
89
|
} else {
|
|
90
90
|
const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
91
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
92
|
-
for (uint32_t i = 0; i < match_count; i
|
|
91
|
+
table_ = hash_table(lg_size, lg_size - 1, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
92
|
+
for (uint32_t i = 0; i < match_count; ++i) {
|
|
93
93
|
auto result = table_.find(EK()(matched_entries[i]));
|
|
94
94
|
table_.insert(result.first, std::move(matched_entries[i]));
|
|
95
95
|
}
|
|
@@ -28,9 +28,9 @@ state_(seed, nop_policy(), allocator)
|
|
|
28
28
|
{}
|
|
29
29
|
|
|
30
30
|
template<typename A>
|
|
31
|
-
template<typename
|
|
32
|
-
void theta_intersection_alloc<A>::update(
|
|
33
|
-
state_.update(std::forward<
|
|
31
|
+
template<typename FwdSketch>
|
|
32
|
+
void theta_intersection_alloc<A>::update(FwdSketch&& sketch) {
|
|
33
|
+
state_.update(std::forward<FwdSketch>(sketch));
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
template<typename A>
|
|
@@ -26,10 +26,11 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
+
/// Theta Jaccard similarity alias
|
|
29
30
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
30
31
|
using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
|
|
31
32
|
|
|
32
|
-
|
|
33
|
+
/// Theta Jaccard similarity alias with default allocator
|
|
33
34
|
using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
|
|
34
35
|
|
|
35
36
|
} /* namespace datasketches */
|
|
@@ -25,6 +25,22 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
+
// forward declarations
|
|
29
|
+
template<typename A> class theta_sketch_alloc;
|
|
30
|
+
template<typename A> class update_theta_sketch_alloc;
|
|
31
|
+
template<typename A> class compact_theta_sketch_alloc;
|
|
32
|
+
template<typename A> class wrapped_compact_theta_sketch_alloc;
|
|
33
|
+
|
|
34
|
+
/// Theta sketch alias with default allocator
|
|
35
|
+
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
36
|
+
/// Update Theta sketch alias with default allocator
|
|
37
|
+
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
38
|
+
/// Compact Theta sketch alias with default allocator
|
|
39
|
+
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
40
|
+
/// Wrapped Compact Theta sketch alias with default allocator
|
|
41
|
+
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
42
|
+
|
|
43
|
+
/// Abstract base class for Theta sketch
|
|
28
44
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
29
45
|
class base_theta_sketch_alloc {
|
|
30
46
|
public:
|
|
@@ -106,6 +122,7 @@ protected:
|
|
|
106
122
|
virtual void print_items(std::ostringstream& os) const = 0;
|
|
107
123
|
};
|
|
108
124
|
|
|
125
|
+
/// Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
|
|
109
126
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
110
127
|
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
|
111
128
|
public:
|
|
@@ -149,6 +166,11 @@ protected:
|
|
|
149
166
|
// forward declaration
|
|
150
167
|
template<typename A> class compact_theta_sketch_alloc;
|
|
151
168
|
|
|
169
|
+
/**
|
|
170
|
+
* Update Theta sketch.
|
|
171
|
+
* The purpose of this class is to build a Theta sketch from input data via the update() methods.
|
|
172
|
+
* There is no constructor. Use builder instead.
|
|
173
|
+
*/
|
|
152
174
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
153
175
|
class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
|
154
176
|
public:
|
|
@@ -163,11 +185,33 @@ public:
|
|
|
163
185
|
// No constructor here. Use builder instead.
|
|
164
186
|
class builder;
|
|
165
187
|
|
|
166
|
-
|
|
167
|
-
|
|
188
|
+
/**
|
|
189
|
+
* Copy constructor
|
|
190
|
+
* @param other sketch to be copied
|
|
191
|
+
*/
|
|
192
|
+
update_theta_sketch_alloc(const update_theta_sketch_alloc& other) = default;
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Move constructor
|
|
196
|
+
* @param other sketch to be moved
|
|
197
|
+
*/
|
|
198
|
+
update_theta_sketch_alloc(update_theta_sketch_alloc&& other) noexcept = default;
|
|
199
|
+
|
|
168
200
|
virtual ~update_theta_sketch_alloc() = default;
|
|
169
|
-
|
|
170
|
-
|
|
201
|
+
|
|
202
|
+
/**
|
|
203
|
+
* Copy assignment
|
|
204
|
+
* @param other sketch to be copied
|
|
205
|
+
* @return reference to this sketch
|
|
206
|
+
*/
|
|
207
|
+
update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc& other) = default;
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Move assignment
|
|
211
|
+
* @param other sketch to be moved
|
|
212
|
+
* @return reference to this sketch
|
|
213
|
+
*/
|
|
214
|
+
update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&& other) = default;
|
|
171
215
|
|
|
172
216
|
virtual Allocator get_allocator() const;
|
|
173
217
|
virtual bool is_empty() const;
|
|
@@ -287,7 +331,7 @@ public:
|
|
|
287
331
|
|
|
288
332
|
/**
|
|
289
333
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
|
290
|
-
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
334
|
+
* @param ordered optional flag to specify if an ordered sketch should be produced
|
|
291
335
|
* @return compact sketch
|
|
292
336
|
*/
|
|
293
337
|
compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
|
|
@@ -307,8 +351,10 @@ private:
|
|
|
307
351
|
virtual void print_specifics(std::ostringstream& os) const;
|
|
308
352
|
};
|
|
309
353
|
|
|
310
|
-
|
|
311
|
-
|
|
354
|
+
/**
|
|
355
|
+
* Compact Theta sketch.
|
|
356
|
+
* This is an immutable form of the Theta sketch, the form that can be serialized and deserialized.
|
|
357
|
+
*/
|
|
312
358
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
313
359
|
class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
|
314
360
|
public:
|
|
@@ -327,13 +373,42 @@ public:
|
|
|
327
373
|
// - as a result of a set operation
|
|
328
374
|
// - by deserializing a previously serialized compact sketch
|
|
329
375
|
|
|
376
|
+
/**
|
|
377
|
+
* Converting constructor.
|
|
378
|
+
* Constructs a compact sketch from any other type of Theta sketch
|
|
379
|
+
* @param other sketch to be constructed from
|
|
380
|
+
* @param ordered if true make the resulting sketch ordered
|
|
381
|
+
*/
|
|
330
382
|
template<typename Other>
|
|
331
383
|
compact_theta_sketch_alloc(const Other& other, bool ordered);
|
|
332
|
-
|
|
333
|
-
|
|
384
|
+
|
|
385
|
+
/**
|
|
386
|
+
* Copy constructor
|
|
387
|
+
* @param other sketch to be copied
|
|
388
|
+
*/
|
|
389
|
+
compact_theta_sketch_alloc(const compact_theta_sketch_alloc& other) = default;
|
|
390
|
+
|
|
391
|
+
/**
|
|
392
|
+
* Move constructor
|
|
393
|
+
* @param other sketch to be moved
|
|
394
|
+
*/
|
|
395
|
+
compact_theta_sketch_alloc(compact_theta_sketch_alloc&& other) noexcept = default;
|
|
396
|
+
|
|
334
397
|
virtual ~compact_theta_sketch_alloc() = default;
|
|
335
|
-
|
|
336
|
-
|
|
398
|
+
|
|
399
|
+
/**
|
|
400
|
+
* Copy assignment
|
|
401
|
+
* @param other sketch to be copied
|
|
402
|
+
* @return reference to this sketch
|
|
403
|
+
*/
|
|
404
|
+
compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc& other) = default;
|
|
405
|
+
|
|
406
|
+
/**
|
|
407
|
+
* Move assignment
|
|
408
|
+
* @param other sketch to be moved
|
|
409
|
+
* @return reference to this sketch
|
|
410
|
+
*/
|
|
411
|
+
compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&& other) = default;
|
|
337
412
|
|
|
338
413
|
virtual Allocator get_allocator() const;
|
|
339
414
|
virtual bool is_empty() const;
|
|
@@ -385,6 +460,7 @@ public:
|
|
|
385
460
|
* This method deserializes a sketch from a given stream.
|
|
386
461
|
* @param is input stream
|
|
387
462
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
463
|
+
* @param allocator instance of an Allocator
|
|
388
464
|
* @return an instance of the sketch
|
|
389
465
|
*/
|
|
390
466
|
static compact_theta_sketch_alloc deserialize(std::istream& is,
|
|
@@ -395,14 +471,12 @@ public:
|
|
|
395
471
|
* @param bytes pointer to the array of bytes
|
|
396
472
|
* @param size the size of the array
|
|
397
473
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
474
|
+
* @param allocator instance of an Allocator
|
|
398
475
|
* @return an instance of the sketch
|
|
399
476
|
*/
|
|
400
477
|
static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
|
|
401
478
|
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
402
479
|
|
|
403
|
-
// for internal use
|
|
404
|
-
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
|
405
|
-
|
|
406
480
|
private:
|
|
407
481
|
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
|
|
408
482
|
|
|
@@ -423,20 +497,33 @@ private:
|
|
|
423
497
|
static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
|
424
498
|
|
|
425
499
|
virtual void print_specifics(std::ostringstream& os) const;
|
|
500
|
+
|
|
501
|
+
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
|
|
502
|
+
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
|
|
503
|
+
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
|
|
504
|
+
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
|
426
505
|
};
|
|
427
506
|
|
|
507
|
+
/// Update Theta sketch builder
|
|
428
508
|
template<typename Allocator>
|
|
429
509
|
class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
|
|
430
510
|
public:
|
|
511
|
+
/**
|
|
512
|
+
* Constructor
|
|
513
|
+
* @param allocator instance of an Allocator
|
|
514
|
+
*/
|
|
431
515
|
builder(const Allocator& allocator = Allocator());
|
|
516
|
+
/// @return instance of Update Theta sketch
|
|
432
517
|
update_theta_sketch_alloc build() const;
|
|
433
518
|
};
|
|
434
519
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
520
|
+
/**
|
|
521
|
+
* Wrapped Compact Theta sketch.
|
|
522
|
+
* This wraps a buffer containing a serialized compact sketch so that it can be used in a set operation, avoiding some of the cost of deserialization.
|
|
523
|
+
* It does not take ownership of the buffer.
|
|
524
|
+
*/
|
|
438
525
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
439
|
-
class wrapped_compact_theta_sketch_alloc
|
|
526
|
+
class wrapped_compact_theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
|
440
527
|
public:
|
|
441
528
|
class const_iterator;
|
|
442
529
|
|
|
@@ -447,7 +534,17 @@ public:
|
|
|
447
534
|
uint32_t get_num_retained() const;
|
|
448
535
|
uint16_t get_seed_hash() const;
|
|
449
536
|
|
|
537
|
+
/**
|
|
538
|
+
* Const iterator over hash values in this sketch.
|
|
539
|
+
* @return begin iterator
|
|
540
|
+
*/
|
|
450
541
|
const_iterator begin() const;
|
|
542
|
+
|
|
543
|
+
/**
|
|
544
|
+
* Const iterator pointing past the valid range.
|
|
545
|
+
* Not to be incremented or dereferenced.
|
|
546
|
+
* @return end iterator
|
|
547
|
+
*/
|
|
451
548
|
const_iterator end() const;
|
|
452
549
|
|
|
453
550
|
/**
|
|
@@ -455,6 +552,7 @@ public:
|
|
|
455
552
|
* @param bytes pointer to the array of bytes
|
|
456
553
|
* @param size the size of the array
|
|
457
554
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
555
|
+
* @param dump_on_error if true prints hex dump of the input
|
|
458
556
|
* @return an instance of the sketch
|
|
459
557
|
*/
|
|
460
558
|
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
|
@@ -471,15 +569,22 @@ private:
|
|
|
471
569
|
};
|
|
472
570
|
|
|
473
571
|
template<typename Allocator>
|
|
474
|
-
class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator
|
|
572
|
+
class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
|
|
475
573
|
public:
|
|
574
|
+
using iterator_category = std::input_iterator_tag;
|
|
575
|
+
using value_type = const uint64_t;
|
|
576
|
+
using difference_type = void;
|
|
577
|
+
using pointer = value_type*;
|
|
578
|
+
using reference = uint64_t;
|
|
579
|
+
|
|
476
580
|
const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
|
|
477
581
|
const_iterator& operator++();
|
|
478
582
|
const_iterator operator++(int);
|
|
479
583
|
bool operator==(const const_iterator& other) const;
|
|
480
584
|
bool operator!=(const const_iterator& other) const;
|
|
481
|
-
|
|
482
|
-
|
|
585
|
+
reference operator*() const;
|
|
586
|
+
pointer operator->() const;
|
|
587
|
+
|
|
483
588
|
private:
|
|
484
589
|
const void* ptr_;
|
|
485
590
|
uint8_t entry_bits_;
|
|
@@ -492,12 +597,6 @@ private:
|
|
|
492
597
|
uint64_t buffer_[8];
|
|
493
598
|
};
|
|
494
599
|
|
|
495
|
-
// aliases with default allocator for convenience
|
|
496
|
-
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
497
|
-
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
498
|
-
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
499
|
-
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
500
|
-
|
|
501
600
|
} /* namespace datasketches */
|
|
502
601
|
|
|
503
602
|
#include "theta_sketch_impl.hpp"
|
|
@@ -357,7 +357,7 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
|
357
357
|
write(os, flags_byte);
|
|
358
358
|
write(os, get_seed_hash());
|
|
359
359
|
if (preamble_longs > 1) {
|
|
360
|
-
write<uint32_t>(
|
|
360
|
+
write(os, static_cast<uint32_t>(entries_.size()));
|
|
361
361
|
write<uint32_t>(os, 0); // unused
|
|
362
362
|
}
|
|
363
363
|
if (this->is_estimation_mode()) write(os, this->theta_);
|
|
@@ -385,7 +385,7 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
|
385
385
|
*ptr++ = flags_byte;
|
|
386
386
|
ptr += copy_to_mem(get_seed_hash(), ptr);
|
|
387
387
|
if (preamble_longs > 1) {
|
|
388
|
-
ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
|
|
388
|
+
ptr += copy_to_mem(static_cast<uint32_t>(entries_.size()), ptr);
|
|
389
389
|
ptr += sizeof(uint32_t); // unused
|
|
390
390
|
}
|
|
391
391
|
if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
|
|
@@ -432,7 +432,7 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
|
|
|
432
432
|
const uint8_t entry_bits = 64 - compute_min_leading_zeros();
|
|
433
433
|
|
|
434
434
|
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
|
435
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
|
|
435
|
+
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
|
436
436
|
|
|
437
437
|
write(os, preamble_longs);
|
|
438
438
|
write(os, COMPRESSED_SERIAL_VERSION);
|
|
@@ -447,7 +447,7 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
|
|
|
447
447
|
write(os, flags_byte);
|
|
448
448
|
write(os, get_seed_hash());
|
|
449
449
|
if (this->is_estimation_mode()) write(os, this->theta_);
|
|
450
|
-
uint32_t num_entries = entries_.size();
|
|
450
|
+
uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
451
451
|
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
|
452
452
|
write<uint8_t>(os, num_entries & 0xff);
|
|
453
453
|
num_entries >>= 8;
|
|
@@ -488,7 +488,7 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_byt
|
|
|
488
488
|
const size_t compressed_bits = entry_bits * entries_.size();
|
|
489
489
|
|
|
490
490
|
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
|
491
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
|
|
491
|
+
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
|
492
492
|
|
|
493
493
|
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
|
|
494
494
|
+ whole_bytes_to_hold_bits(compressed_bits);
|
|
@@ -510,7 +510,7 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_byt
|
|
|
510
510
|
if (this->is_estimation_mode()) {
|
|
511
511
|
ptr += copy_to_mem(theta_, ptr);
|
|
512
512
|
}
|
|
513
|
-
uint32_t num_entries = entries_.size();
|
|
513
|
+
uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
514
514
|
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
|
515
515
|
*ptr++ = num_entries & 0xff;
|
|
516
516
|
num_entries >>= 8;
|
|
@@ -869,13 +869,13 @@ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(c
|
|
|
869
869
|
}
|
|
870
870
|
|
|
871
871
|
template<typename Allocator>
|
|
872
|
-
|
|
872
|
+
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
|
|
873
873
|
if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
|
|
874
874
|
return buffer_[buf_i_];
|
|
875
875
|
}
|
|
876
876
|
|
|
877
877
|
template<typename Allocator>
|
|
878
|
-
|
|
878
|
+
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
|
|
879
879
|
if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
|
|
880
880
|
return buffer_ + buf_i_;
|
|
881
881
|
}
|