datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -25,31 +25,32 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
28
|
+
/**
|
|
29
|
+
* C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
|
|
30
|
+
* [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
|
|
31
|
+
* The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
|
|
32
|
+
* not the type of the input items themselves.
|
|
33
|
+
* @author Charlie Dickens
|
|
34
|
+
*/
|
|
36
35
|
template <typename W,
|
|
37
36
|
typename Allocator = std::allocator<W>>
|
|
38
37
|
class count_min_sketch{
|
|
39
38
|
static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
|
|
40
39
|
public:
|
|
41
40
|
using allocator_type = Allocator;
|
|
41
|
+
using const_iterator = typename std::vector<W, Allocator>::const_iterator;
|
|
42
42
|
|
|
43
43
|
/**
|
|
44
44
|
* Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`.
|
|
45
|
-
* @param num_hashes
|
|
46
|
-
* @param num_buckets
|
|
45
|
+
* @param num_hashes number of hash functions in the sketch. Equivalently the number of rows in the array
|
|
46
|
+
* @param num_buckets number of buckets that hash functions map into. Equivalently the number of columns in the array
|
|
47
47
|
* @param seed for hash function
|
|
48
|
+
* @param allocator to acquire and release memory
|
|
48
49
|
*
|
|
49
50
|
* The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash.
|
|
50
51
|
* Only update and estimate methods are added for uint64_t and string types.
|
|
51
52
|
*/
|
|
52
|
-
count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator())
|
|
53
|
+
count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
53
54
|
|
|
54
55
|
/**
|
|
55
56
|
* @return configured _num_hashes of this sketch
|
|
@@ -67,59 +68,61 @@ public:
|
|
|
67
68
|
uint64_t get_seed() const;
|
|
68
69
|
|
|
69
70
|
/**
|
|
70
|
-
* @return epsilon
|
|
71
|
+
* @return epsilon
|
|
71
72
|
* The maximum permissible error for any frequency estimate query.
|
|
72
73
|
* epsilon = ceil(e / _num_buckets)
|
|
73
74
|
*/
|
|
74
75
|
double get_relative_error() const;
|
|
75
76
|
|
|
76
77
|
/**
|
|
77
|
-
* @return _total_weight
|
|
78
|
+
* @return _total_weight
|
|
78
79
|
* The total weight currently inserted into the stream.
|
|
79
80
|
*/
|
|
80
81
|
W get_total_weight() const;
|
|
81
82
|
|
|
82
|
-
|
|
83
|
-
*
|
|
83
|
+
/**
|
|
84
|
+
* Suggests the number of buckets required to achieve the given relative error
|
|
85
|
+
* @param relative_error the desired accuracy within which estimates should lie.
|
|
84
86
|
* For example, when relative_error = 0.05, then the returned frequency estimates satisfy the
|
|
85
87
|
* `relative_error` guarantee that never overestimates the weights but may underestimate the weights
|
|
86
88
|
* by 5% of the total weight in the sketch.
|
|
87
|
-
* @return
|
|
89
|
+
* @return the number of hash buckets at every level of the
|
|
88
90
|
* sketch required in order to obtain the specified relative error.
|
|
89
91
|
* [1] - Section 3 ``Data Structure'', page 6.
|
|
90
92
|
*/
|
|
91
|
-
static uint32_t suggest_num_buckets(double relative_error)
|
|
93
|
+
static uint32_t suggest_num_buckets(double relative_error);
|
|
92
94
|
|
|
93
|
-
|
|
94
|
-
*
|
|
95
|
+
/**
|
|
96
|
+
* Suggests the number of hash functions required to achieve the given confidence
|
|
97
|
+
* @param confidence the desired confidence with which estimates should be correct.
|
|
95
98
|
* For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee.
|
|
96
|
-
* @return
|
|
99
|
+
* @return the number of hash functions that are required in
|
|
97
100
|
* order to achieve the specified confidence of the sketch.
|
|
98
101
|
* confidence = 1 - delta, with delta denoting the sketch failure probability in the literature.
|
|
99
102
|
* [1] - Section 3 ``Data Structure'', page 6.
|
|
100
103
|
*/
|
|
101
|
-
static uint8_t suggest_num_hashes(double confidence)
|
|
104
|
+
static uint8_t suggest_num_hashes(double confidence);
|
|
102
105
|
|
|
103
106
|
/**
|
|
104
107
|
* Specific get_estimate function for uint64_t type
|
|
105
108
|
* see generic get_estimate function
|
|
106
|
-
* @param item
|
|
109
|
+
* @param item uint64_t type.
|
|
107
110
|
* @return an estimate of the item's frequency.
|
|
108
111
|
*/
|
|
109
|
-
W get_estimate(uint64_t item) const
|
|
112
|
+
W get_estimate(uint64_t item) const;
|
|
110
113
|
|
|
111
114
|
/**
|
|
112
115
|
* Specific get_estimate function for int64_t type
|
|
113
116
|
* see generic get_estimate function
|
|
114
|
-
* @param item
|
|
117
|
+
* @param item int64_t type.
|
|
115
118
|
* @return an estimate of the item's frequency.
|
|
116
119
|
*/
|
|
117
|
-
W get_estimate(int64_t item) const
|
|
120
|
+
W get_estimate(int64_t item) const;
|
|
118
121
|
|
|
119
122
|
/**
|
|
120
123
|
* Specific get_estimate function for std::string type
|
|
121
124
|
* see generic get_estimate function
|
|
122
|
-
* @param item
|
|
125
|
+
* @param item std::string type
|
|
123
126
|
* @return an estimate of the item's frequency.
|
|
124
127
|
*/
|
|
125
128
|
W get_estimate(const std::string& item) const;
|
|
@@ -127,69 +130,115 @@ public:
|
|
|
127
130
|
/**
|
|
128
131
|
* This is the generic estimate query function for any of the given datatypes.
|
|
129
132
|
* Query the sketch for the estimate of a given item.
|
|
130
|
-
* @param item
|
|
131
|
-
* @param size
|
|
133
|
+
* @param item pointer to the data item to be queried in the sketch.
|
|
134
|
+
* @param size size of the item in bytes
|
|
132
135
|
* @return the estimated frequency of the item denoted f_est satisfying
|
|
133
136
|
* f_true - relative_error*_total_weight <= f_est <= f_true
|
|
134
137
|
*/
|
|
135
|
-
W get_estimate(const void* item, size_t size) const
|
|
138
|
+
W get_estimate(const void* item, size_t size) const;
|
|
136
139
|
|
|
137
140
|
/**
|
|
138
141
|
* Query the sketch for the upper bound of a given item.
|
|
139
|
-
* @param item
|
|
142
|
+
* @param item to query
|
|
143
|
+
* @param size of the item in bytes
|
|
140
144
|
* @return the upper bound on the true frequency of the item
|
|
141
145
|
* f_true <= f_est + relative_error*_total_weight
|
|
142
146
|
*/
|
|
143
147
|
W get_upper_bound(const void* item, size_t size) const;
|
|
144
|
-
|
|
145
|
-
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Query the sketch for the upper bound of a given item.
|
|
151
|
+
* @param item to query
|
|
152
|
+
* @return the upper bound on the true frequency of the item
|
|
153
|
+
* f_true <= f_est + relative_error*_total_weight
|
|
154
|
+
*/
|
|
155
|
+
W get_upper_bound(int64_t item) const;
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Query the sketch for the upper bound of a given item.
|
|
159
|
+
* @param item to query
|
|
160
|
+
* @return the upper bound on the true frequency of the item
|
|
161
|
+
* f_true <= f_est + relative_error*_total_weight
|
|
162
|
+
*/
|
|
163
|
+
W get_upper_bound(uint64_t item) const;
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Query the sketch for the upper bound of a given item.
|
|
167
|
+
* @param item to query
|
|
168
|
+
* @return the upper bound on the true frequency of the item
|
|
169
|
+
* f_true <= f_est + relative_error*_total_weight
|
|
170
|
+
*/
|
|
146
171
|
W get_upper_bound(const std::string& item) const;
|
|
147
172
|
|
|
148
173
|
/**
|
|
149
174
|
* Query the sketch for the lower bound of a given item.
|
|
150
|
-
* @param item
|
|
175
|
+
* @param item to query
|
|
176
|
+
* @param size of the item in bytes
|
|
151
177
|
* @return the lower bound for the query result, f_est, on the true frequency, f_true, of the item
|
|
152
178
|
* f_true - relative_error*_total_weight <= f_est
|
|
153
179
|
*/
|
|
154
|
-
W get_lower_bound(const void* item, size_t size) const
|
|
155
|
-
W get_lower_bound(int64_t) const ;
|
|
156
|
-
W get_lower_bound(uint64_t) const ;
|
|
157
|
-
W get_lower_bound(const std::string& item) const ;
|
|
180
|
+
W get_lower_bound(const void* item, size_t size) const;
|
|
158
181
|
|
|
159
|
-
|
|
182
|
+
/**
|
|
183
|
+
* Query the sketch for the lower bound of a given item.
|
|
184
|
+
* @param item to query
|
|
185
|
+
* @return the lower bound for the query result, f_est, on the true frequency, f_true, of the item
|
|
186
|
+
* f_true - relative_error*_total_weight <= f_est
|
|
187
|
+
*/
|
|
188
|
+
W get_lower_bound(int64_t item) const;
|
|
189
|
+
|
|
190
|
+
/**
|
|
191
|
+
* Query the sketch for the lower bound of a given item.
|
|
192
|
+
* @param item to query
|
|
193
|
+
* @return the lower bound for the query result, f_est, on the true frequency, f_true, of the item
|
|
194
|
+
* f_true - relative_error*_total_weight <= f_est
|
|
195
|
+
*/
|
|
196
|
+
W get_lower_bound(uint64_t item) const;
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Query the sketch for the lower bound of a given item.
|
|
200
|
+
* @param item to query
|
|
201
|
+
* @return the lower bound for the query result, f_est, on the true frequency, f_true, of the item
|
|
202
|
+
* f_true - relative_error*_total_weight <= f_est
|
|
203
|
+
*/
|
|
204
|
+
W get_lower_bound(const std::string& item) const;
|
|
205
|
+
|
|
206
|
+
/**
|
|
160
207
|
* Update this sketch with given data of any type.
|
|
161
|
-
* This is a "universal" update that covers all cases
|
|
162
|
-
* but may produce different hashes.
|
|
208
|
+
* This is a "universal" update that covers all cases,
|
|
209
|
+
* but may produce different hashes compared to specialized update methods.
|
|
163
210
|
* @param item pointer to the data item to be inserted into the sketch.
|
|
164
211
|
* @param size of the data in bytes
|
|
165
|
-
* @
|
|
212
|
+
* @param weight arithmetic type
|
|
166
213
|
*/
|
|
167
|
-
void update(const void* item, size_t size, W weight)
|
|
214
|
+
void update(const void* item, size_t size, W weight);
|
|
168
215
|
|
|
169
216
|
/**
|
|
170
|
-
* Update this sketch with a given
|
|
171
|
-
* @param item
|
|
172
|
-
* @param weight
|
|
173
|
-
* void function which inserts an item of type uint64_t into the sketch
|
|
217
|
+
* Update this sketch with a given item.
|
|
218
|
+
* @param item to update the sketch with
|
|
219
|
+
* @param weight arithmetic type
|
|
174
220
|
*/
|
|
175
|
-
void update(uint64_t item, W weight)
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
221
|
+
void update(uint64_t item, W weight = 1);
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Update this sketch with a given item.
|
|
225
|
+
* @param item to update the sketch with
|
|
226
|
+
* @param weight arithmetic type
|
|
227
|
+
*/
|
|
228
|
+
void update(int64_t item, W weight = 1);
|
|
179
229
|
|
|
180
230
|
/**
|
|
181
231
|
* Update this sketch with a given string.
|
|
182
|
-
* @param item
|
|
183
|
-
* @param weight
|
|
184
|
-
* void function which inserts an item of type std::string into the sketch
|
|
232
|
+
* @param item string to update the sketch with
|
|
233
|
+
* @param weight arithmetic type
|
|
185
234
|
*/
|
|
186
|
-
void update(const std::string& item, W weight)
|
|
187
|
-
void update(const std::string& item) ;
|
|
235
|
+
void update(const std::string& item, W weight = 1);
|
|
188
236
|
|
|
189
|
-
|
|
190
|
-
*
|
|
237
|
+
/**
|
|
238
|
+
* Merges another count_min_sketch into this count_min_sketch.
|
|
239
|
+
* @param other_sketch
|
|
191
240
|
*/
|
|
192
|
-
void merge(const count_min_sketch
|
|
241
|
+
void merge(const count_min_sketch& other_sketch);
|
|
193
242
|
|
|
194
243
|
/**
|
|
195
244
|
* Returns true if this sketch is empty.
|
|
@@ -197,7 +246,7 @@ public:
|
|
|
197
246
|
* This can only ever happen if all items inserted to the sketch have weights that cancel each other out.
|
|
198
247
|
* @return empty flag
|
|
199
248
|
*/
|
|
200
|
-
bool is_empty() const
|
|
249
|
+
bool is_empty() const;
|
|
201
250
|
|
|
202
251
|
/**
|
|
203
252
|
* @brief Returns a string describing the sketch
|
|
@@ -205,15 +254,23 @@ public:
|
|
|
205
254
|
*/
|
|
206
255
|
string<Allocator> to_string() const;
|
|
207
256
|
|
|
208
|
-
|
|
209
|
-
|
|
257
|
+
/**
|
|
258
|
+
* Iterator pointing to the first item in the sketch.
|
|
259
|
+
* If the sketch is empty, the returned iterator must not be dereferenced or incremented.
|
|
260
|
+
* @return iterator pointing to the first item in the sketch
|
|
261
|
+
*/
|
|
210
262
|
const_iterator begin() const;
|
|
211
|
-
const_iterator end() const;
|
|
212
263
|
|
|
213
264
|
/**
|
|
214
|
-
*
|
|
215
|
-
*
|
|
216
|
-
*
|
|
265
|
+
* Iterator pointing to the past-the-end item in the sketch.
|
|
266
|
+
* The past-the-end item is the hypothetical item that would follow the last item.
|
|
267
|
+
* It does not point to any item, and must not be dereferenced or incremented.
|
|
268
|
+
* @return iterator pointing to the past-the-end item in the sketch
|
|
269
|
+
*/
|
|
270
|
+
const_iterator end() const;
|
|
271
|
+
|
|
272
|
+
/*
|
|
273
|
+
* The serialized sketch binary form has the following structure
|
|
217
274
|
* Byte 0:
|
|
218
275
|
* 1 - if and only if the sketch is empty
|
|
219
276
|
* 0 - otherwise
|
|
@@ -254,8 +311,6 @@ public:
|
|
|
254
311
|
||---------------------------- sketch entries ---------------------------|
|
|
255
312
|
...
|
|
256
313
|
|
|
257
|
-
*
|
|
258
|
-
*
|
|
259
314
|
*/
|
|
260
315
|
|
|
261
316
|
|
|
@@ -266,7 +321,8 @@ public:
|
|
|
266
321
|
size_t get_serialized_size_bytes() const;
|
|
267
322
|
|
|
268
323
|
/**
|
|
269
|
-
* This method serializes
|
|
324
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
325
|
+
* @param os output stream
|
|
270
326
|
*/
|
|
271
327
|
void serialize(std::ostream& os) const;
|
|
272
328
|
|
|
@@ -287,6 +343,7 @@ public:
|
|
|
287
343
|
* This method deserializes a sketch from a given stream.
|
|
288
344
|
* @param is input stream
|
|
289
345
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
346
|
+
* @param allocator instance of an Allocator
|
|
290
347
|
* @return an instance of a sketch
|
|
291
348
|
*/
|
|
292
349
|
static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
@@ -296,24 +353,24 @@ public:
|
|
|
296
353
|
* @param bytes pointer to the array of bytes
|
|
297
354
|
* @param size the size of the array
|
|
298
355
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
356
|
+
* @param allocator instance of an Allocator
|
|
299
357
|
* @return an instance of the sketch
|
|
300
358
|
*/
|
|
301
359
|
static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
302
360
|
|
|
303
361
|
/**
|
|
304
|
-
* Returns the allocator for this sketch.
|
|
305
362
|
* @return allocator
|
|
306
363
|
*/
|
|
307
364
|
allocator_type get_allocator() const;
|
|
308
365
|
|
|
309
366
|
private:
|
|
310
367
|
Allocator _allocator;
|
|
311
|
-
uint8_t _num_hashes
|
|
312
|
-
uint32_t _num_buckets
|
|
313
|
-
std::vector<W, Allocator> _sketch_array
|
|
314
|
-
uint64_t _seed
|
|
315
|
-
W _total_weight
|
|
316
|
-
std::vector<uint64_t> hash_seeds
|
|
368
|
+
uint8_t _num_hashes;
|
|
369
|
+
uint32_t _num_buckets;
|
|
370
|
+
std::vector<W, Allocator> _sketch_array; // the array stored by the sketch
|
|
371
|
+
uint64_t _seed;
|
|
372
|
+
W _total_weight;
|
|
373
|
+
std::vector<uint64_t> hash_seeds;
|
|
317
374
|
|
|
318
375
|
enum flags {IS_EMPTY};
|
|
319
376
|
static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
|
|
@@ -331,9 +388,6 @@ private:
|
|
|
331
388
|
*/
|
|
332
389
|
static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
|
|
333
390
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
391
|
/*
|
|
338
392
|
* Obtain the hash values when inserting an item into the sketch.
|
|
339
393
|
* @param item pointer to the data item to be inserted into the sketch.
|