datasketches 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +3 -3
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
- data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
- data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +61 -79
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -21,9 +21,26 @@
|
|
21
21
|
#define THETA_SKETCH_HPP_
|
22
22
|
|
23
23
|
#include "theta_update_sketch_base.hpp"
|
24
|
+
#include "compact_theta_sketch_parser.hpp"
|
24
25
|
|
25
26
|
namespace datasketches {
|
26
27
|
|
28
|
+
// forward declarations
|
29
|
+
template<typename A> class theta_sketch_alloc;
|
30
|
+
template<typename A> class update_theta_sketch_alloc;
|
31
|
+
template<typename A> class compact_theta_sketch_alloc;
|
32
|
+
template<typename A> class wrapped_compact_theta_sketch_alloc;
|
33
|
+
|
34
|
+
/// Theta sketch alias with default allocator
|
35
|
+
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
36
|
+
/// Update Theta sketch alias with default allocator
|
37
|
+
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
38
|
+
/// Compact Theta sketch alias with default allocator
|
39
|
+
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
40
|
+
/// Wrapped Compact Theta sketch alias with default allocator
|
41
|
+
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
42
|
+
|
43
|
+
/// Abstract base class for Theta sketch
|
27
44
|
template<typename Allocator = std::allocator<uint64_t>>
|
28
45
|
class base_theta_sketch_alloc {
|
29
46
|
public:
|
@@ -105,6 +122,7 @@ protected:
|
|
105
122
|
virtual void print_items(std::ostringstream& os) const = 0;
|
106
123
|
};
|
107
124
|
|
125
|
+
/// Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
|
108
126
|
template<typename Allocator = std::allocator<uint64_t>>
|
109
127
|
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
110
128
|
public:
|
@@ -148,6 +166,11 @@ protected:
|
|
148
166
|
// forward declaration
|
149
167
|
template<typename A> class compact_theta_sketch_alloc;
|
150
168
|
|
169
|
+
/**
|
170
|
+
* Update Theta sketch.
|
171
|
+
* The purpose of this class is to build a Theta sketch from input data via the update() methods.
|
172
|
+
* There is no constructor. Use builder instead.
|
173
|
+
*/
|
151
174
|
template<typename Allocator = std::allocator<uint64_t>>
|
152
175
|
class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
153
176
|
public:
|
@@ -162,11 +185,33 @@ public:
|
|
162
185
|
// No constructor here. Use builder instead.
|
163
186
|
class builder;
|
164
187
|
|
165
|
-
|
166
|
-
|
188
|
+
/**
|
189
|
+
* Copy constructor
|
190
|
+
* @param other sketch to be copied
|
191
|
+
*/
|
192
|
+
update_theta_sketch_alloc(const update_theta_sketch_alloc& other) = default;
|
193
|
+
|
194
|
+
/**
|
195
|
+
* Move constructor
|
196
|
+
* @param other sketch to be moved
|
197
|
+
*/
|
198
|
+
update_theta_sketch_alloc(update_theta_sketch_alloc&& other) noexcept = default;
|
199
|
+
|
167
200
|
virtual ~update_theta_sketch_alloc() = default;
|
168
|
-
|
169
|
-
|
201
|
+
|
202
|
+
/**
|
203
|
+
* Copy assignment
|
204
|
+
* @param other sketch to be copied
|
205
|
+
* @return reference to this sketch
|
206
|
+
*/
|
207
|
+
update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc& other) = default;
|
208
|
+
|
209
|
+
/**
|
210
|
+
* Move assignment
|
211
|
+
* @param other sketch to be moved
|
212
|
+
* @return reference to this sketch
|
213
|
+
*/
|
214
|
+
update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&& other) = default;
|
170
215
|
|
171
216
|
virtual Allocator get_allocator() const;
|
172
217
|
virtual bool is_empty() const;
|
@@ -286,7 +331,7 @@ public:
|
|
286
331
|
|
287
332
|
/**
|
288
333
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
289
|
-
* @param ordered optional flag to specify if ordered sketch should be produced
|
334
|
+
* @param ordered optional flag to specify if an ordered sketch should be produced
|
290
335
|
* @return compact sketch
|
291
336
|
*/
|
292
337
|
compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
|
@@ -306,8 +351,10 @@ private:
|
|
306
351
|
virtual void print_specifics(std::ostringstream& os) const;
|
307
352
|
};
|
308
353
|
|
309
|
-
|
310
|
-
|
354
|
+
/**
|
355
|
+
* Compact Theta sketch.
|
356
|
+
* This is an immutable form of the Theta sketch, the form that can be serialized and deserialized.
|
357
|
+
*/
|
311
358
|
template<typename Allocator = std::allocator<uint64_t>>
|
312
359
|
class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
313
360
|
public:
|
@@ -317,7 +364,8 @@ public:
|
|
317
364
|
using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
|
318
365
|
using vector_bytes = std::vector<uint8_t, AllocBytes>;
|
319
366
|
|
320
|
-
static const uint8_t
|
367
|
+
static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
|
368
|
+
static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
|
321
369
|
static const uint8_t SKETCH_TYPE = 3;
|
322
370
|
|
323
371
|
// Instances of this type can be obtained:
|
@@ -325,13 +373,42 @@ public:
|
|
325
373
|
// - as a result of a set operation
|
326
374
|
// - by deserializing a previously serialized compact sketch
|
327
375
|
|
376
|
+
/**
|
377
|
+
* Converting constructor.
|
378
|
+
* Constructs a compact sketch from any other type of Theta sketch
|
379
|
+
* @param other sketch to be constructed from
|
380
|
+
* @param ordered if true make the resulting sketch ordered
|
381
|
+
*/
|
328
382
|
template<typename Other>
|
329
383
|
compact_theta_sketch_alloc(const Other& other, bool ordered);
|
330
|
-
|
331
|
-
|
384
|
+
|
385
|
+
/**
|
386
|
+
* Copy constructor
|
387
|
+
* @param other sketch to be copied
|
388
|
+
*/
|
389
|
+
compact_theta_sketch_alloc(const compact_theta_sketch_alloc& other) = default;
|
390
|
+
|
391
|
+
/**
|
392
|
+
* Move constructor
|
393
|
+
* @param other sketch to be moved
|
394
|
+
*/
|
395
|
+
compact_theta_sketch_alloc(compact_theta_sketch_alloc&& other) noexcept = default;
|
396
|
+
|
332
397
|
virtual ~compact_theta_sketch_alloc() = default;
|
333
|
-
|
334
|
-
|
398
|
+
|
399
|
+
/**
|
400
|
+
* Copy assignment
|
401
|
+
* @param other sketch to be copied
|
402
|
+
* @return reference to this sketch
|
403
|
+
*/
|
404
|
+
compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc& other) = default;
|
405
|
+
|
406
|
+
/**
|
407
|
+
* Move assignment
|
408
|
+
* @param other sketch to be moved
|
409
|
+
* @return reference to this sketch
|
410
|
+
*/
|
411
|
+
compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&& other) = default;
|
335
412
|
|
336
413
|
virtual Allocator get_allocator() const;
|
337
414
|
virtual bool is_empty() const;
|
@@ -355,6 +432,25 @@ public:
|
|
355
432
|
*/
|
356
433
|
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
357
434
|
|
435
|
+
/**
|
436
|
+
* This method serializes the sketch into a given stream in a compressed binary form.
|
437
|
+
* Compression is applied to ordered sketches, except empty and single-item sketches.
|
438
|
+
* For unordered, empty and single item sketches this method is equivalent to serialize()
|
439
|
+
* @param os output stream
|
440
|
+
*/
|
441
|
+
void serialize_compressed(std::ostream& os) const;
|
442
|
+
|
443
|
+
/**
|
444
|
+
* This method serializes the sketch as a vector of bytes.
|
445
|
+
* An optional header can be reserved in front of the sketch.
|
446
|
+
* It is an uninitialized space of a given size.
|
447
|
+
* This header is used in the DataSketches PostgreSQL extension.
|
448
|
+
* Compression is applied to ordered sketches, except empty and single-item sketches.
|
449
|
+
* For unordered, empty and single item sketches this method is equivalent to serialize()
|
450
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
451
|
+
*/
|
452
|
+
vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
|
453
|
+
|
358
454
|
virtual iterator begin();
|
359
455
|
virtual iterator end();
|
360
456
|
virtual const_iterator begin() const;
|
@@ -364,6 +460,7 @@ public:
|
|
364
460
|
* This method deserializes a sketch from a given stream.
|
365
461
|
* @param is input stream
|
366
462
|
* @param seed the seed for the hash function that was used to create the sketch
|
463
|
+
* @param allocator instance of an Allocator
|
367
464
|
* @return an instance of the sketch
|
368
465
|
*/
|
369
466
|
static compact_theta_sketch_alloc deserialize(std::istream& is,
|
@@ -374,14 +471,12 @@ public:
|
|
374
471
|
* @param bytes pointer to the array of bytes
|
375
472
|
* @param size the size of the array
|
376
473
|
* @param seed the seed for the hash function that was used to create the sketch
|
474
|
+
* @param allocator instance of an Allocator
|
377
475
|
* @return an instance of the sketch
|
378
476
|
*/
|
379
477
|
static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
|
380
478
|
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
381
479
|
|
382
|
-
// for internal use
|
383
|
-
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
384
|
-
|
385
480
|
private:
|
386
481
|
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
|
387
482
|
|
@@ -391,23 +486,46 @@ private:
|
|
391
486
|
uint64_t theta_;
|
392
487
|
std::vector<uint64_t, Allocator> entries_;
|
393
488
|
|
489
|
+
bool is_suitable_for_compression() const;
|
490
|
+
uint8_t compute_min_leading_zeros() const;
|
491
|
+
void serialize_version_4(std::ostream& os) const;
|
492
|
+
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
493
|
+
|
494
|
+
static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
495
|
+
static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
496
|
+
static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
497
|
+
static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
|
498
|
+
|
394
499
|
virtual void print_specifics(std::ostringstream& os) const;
|
500
|
+
|
501
|
+
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
|
502
|
+
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
|
503
|
+
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
|
504
|
+
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
395
505
|
};
|
396
506
|
|
507
|
+
/// Update Theta sketch builder
|
397
508
|
template<typename Allocator>
|
398
509
|
class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
|
399
510
|
public:
|
511
|
+
/**
|
512
|
+
* Constructor
|
513
|
+
* @param allocator instance of an Allocator
|
514
|
+
*/
|
400
515
|
builder(const Allocator& allocator = Allocator());
|
516
|
+
/// @return instance of Update Theta sketch
|
401
517
|
update_theta_sketch_alloc build() const;
|
402
518
|
};
|
403
519
|
|
404
|
-
|
405
|
-
|
406
|
-
|
520
|
+
/**
|
521
|
+
* Wrapped Compact Theta sketch.
|
522
|
+
* This wraps a buffer containing a serialized compact sketch so that it can be used in a set operation, avoiding some of the cost of deserialization.
|
523
|
+
* It does not take the ownership of the buffer.
|
524
|
+
*/
|
407
525
|
template<typename Allocator = std::allocator<uint64_t>>
|
408
|
-
class wrapped_compact_theta_sketch_alloc
|
526
|
+
class wrapped_compact_theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
409
527
|
public:
|
410
|
-
|
528
|
+
class const_iterator;
|
411
529
|
|
412
530
|
Allocator get_allocator() const;
|
413
531
|
bool is_empty() const;
|
@@ -416,7 +534,17 @@ public:
|
|
416
534
|
uint32_t get_num_retained() const;
|
417
535
|
uint16_t get_seed_hash() const;
|
418
536
|
|
537
|
+
/**
|
538
|
+
* Const iterator over hash values in this sketch.
|
539
|
+
* @return begin iterator
|
540
|
+
*/
|
419
541
|
const_iterator begin() const;
|
542
|
+
|
543
|
+
/**
|
544
|
+
* Const iterator pointing past the valid range.
|
545
|
+
* Not to be incremented or dereferenced.
|
546
|
+
* @return end iterator
|
547
|
+
*/
|
420
548
|
const_iterator end() const;
|
421
549
|
|
422
550
|
/**
|
@@ -424,6 +552,7 @@ public:
|
|
424
552
|
* @param bytes pointer to the array of bytes
|
425
553
|
* @param size the size of the array
|
426
554
|
* @param seed the seed for the hash function that was used to create the sketch
|
555
|
+
* @param dump_on_error if true prints hex dump of the input
|
427
556
|
* @return an instance of the sketch
|
428
557
|
*/
|
429
558
|
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
@@ -433,22 +562,40 @@ protected:
|
|
433
562
|
virtual void print_items(std::ostringstream& os) const;
|
434
563
|
|
435
564
|
private:
|
436
|
-
|
437
|
-
|
438
|
-
uint16_t seed_hash_;
|
439
|
-
uint32_t num_entries_;
|
440
|
-
uint64_t theta_;
|
441
|
-
const uint64_t* entries_;
|
565
|
+
using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
|
566
|
+
data_type data_;
|
442
567
|
|
443
|
-
wrapped_compact_theta_sketch_alloc(
|
444
|
-
uint64_t theta, const uint64_t* entries);
|
568
|
+
wrapped_compact_theta_sketch_alloc(const data_type& data);
|
445
569
|
};
|
446
570
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
using
|
451
|
-
using
|
571
|
+
template<typename Allocator>
|
572
|
+
class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
|
573
|
+
public:
|
574
|
+
using iterator_category = std::input_iterator_tag;
|
575
|
+
using value_type = const uint64_t;
|
576
|
+
using difference_type = void;
|
577
|
+
using pointer = value_type*;
|
578
|
+
using reference = uint64_t;
|
579
|
+
|
580
|
+
const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
|
581
|
+
const_iterator& operator++();
|
582
|
+
const_iterator operator++(int);
|
583
|
+
bool operator==(const const_iterator& other) const;
|
584
|
+
bool operator!=(const const_iterator& other) const;
|
585
|
+
reference operator*() const;
|
586
|
+
pointer operator->() const;
|
587
|
+
|
588
|
+
private:
|
589
|
+
const void* ptr_;
|
590
|
+
uint8_t entry_bits_;
|
591
|
+
uint32_t num_entries_;
|
592
|
+
uint32_t index_;
|
593
|
+
uint64_t previous_;
|
594
|
+
bool is_block_mode_;
|
595
|
+
uint8_t buf_i_;
|
596
|
+
uint8_t offset_;
|
597
|
+
uint64_t buffer_[8];
|
598
|
+
};
|
452
599
|
|
453
600
|
} /* namespace datasketches */
|
454
601
|
|