datasketches 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -0,0 +1,215 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <memory>
|
21
|
+
#include <pybind11/pybind11.h>
|
22
|
+
#include <pybind11/stl.h>
|
23
|
+
|
24
|
+
#include "theta_sketch.hpp"
|
25
|
+
#include "tuple_sketch.hpp"
|
26
|
+
#include "tuple_union.hpp"
|
27
|
+
#include "tuple_intersection.hpp"
|
28
|
+
#include "tuple_a_not_b.hpp"
|
29
|
+
#include "theta_jaccard_similarity_base.hpp"
|
30
|
+
#include "common_defs.hpp"
|
31
|
+
|
32
|
+
#include "py_serde.hpp"
|
33
|
+
#include "tuple_policy.hpp"
|
34
|
+
|
35
|
+
namespace py = pybind11;
|
36
|
+
|
37
|
+
void init_tuple(py::module &m) {
|
38
|
+
using namespace datasketches;
|
39
|
+
|
40
|
+
// generic tuple_policy:
|
41
|
+
// * update sketch policy uses create_summary and update_summary
|
42
|
+
// * set operation policies all use __call__
|
43
|
+
py::class_<tuple_policy, TuplePolicy, std::shared_ptr<tuple_policy>>(m, "TuplePolicy")
|
44
|
+
.def(py::init())
|
45
|
+
.def("create_summary", &tuple_policy::create_summary)
|
46
|
+
.def("update_summary", &tuple_policy::update_summary, py::arg("summary"), py::arg("update"))
|
47
|
+
.def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update"))
|
48
|
+
;
|
49
|
+
|
50
|
+
// potentially useful for debugging but not needed as a permanent
|
51
|
+
// object type in the library
|
52
|
+
/*
|
53
|
+
py::class_<tuple_policy_holder>(m, "TuplePolicyHolder")
|
54
|
+
.def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy"))
|
55
|
+
.def("create", &tuple_policy_holder::create, "Creates a new Summary object")
|
56
|
+
.def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"),
|
57
|
+
"Updates the provided summary using the data in update")
|
58
|
+
;
|
59
|
+
*/
|
60
|
+
|
61
|
+
using py_tuple_sketch = tuple_sketch<py::object>;
|
62
|
+
using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>;
|
63
|
+
using py_compact_tuple = compact_tuple_sketch<py::object>;
|
64
|
+
using py_tuple_union = tuple_union<py::object, tuple_policy_holder>;
|
65
|
+
using py_tuple_intersection = tuple_intersection<py::object, tuple_policy_holder>;
|
66
|
+
using py_tuple_a_not_b = tuple_a_not_b<py::object>;
|
67
|
+
using py_tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<py::object, dummy_jaccard_policy>, tuple_intersection<py::object, dummy_jaccard_policy>, pair_extract_key<uint64_t, py::object>>;
|
68
|
+
|
69
|
+
py::class_<py_tuple_sketch>(m, "_tuple_sketch")
|
70
|
+
.def("__str__", &py_tuple_sketch::to_string, py::arg("print_items")=false,
|
71
|
+
"Produces a string summary of the sketch")
|
72
|
+
.def("to_string", &py_tuple_sketch::to_string, py::arg("print_items")=false,
|
73
|
+
"Produces a string summary of the sketch")
|
74
|
+
.def("is_empty", &py_tuple_sketch::is_empty,
|
75
|
+
"Returns True if the sketch is empty, otherwise False")
|
76
|
+
.def("get_estimate", &py_tuple_sketch::get_estimate,
|
77
|
+
"Estimate of the distinct count of the input stream")
|
78
|
+
.def("get_upper_bound", static_cast<double (py_tuple_sketch::*)(uint8_t) const>(&py_tuple_sketch::get_upper_bound), py::arg("num_std_devs"),
|
79
|
+
"Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
|
80
|
+
.def("get_lower_bound", static_cast<double (py_tuple_sketch::*)(uint8_t) const>(&py_tuple_sketch::get_lower_bound), py::arg("num_std_devs"),
|
81
|
+
"Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
|
82
|
+
.def("is_estimation_mode", &py_tuple_sketch::is_estimation_mode,
|
83
|
+
"Returns True if sketch is in estimation mode, otherwise False")
|
84
|
+
.def("get_theta", &py_tuple_sketch::get_theta,
|
85
|
+
"Returns theta (effective sampling rate) as a fraction from 0 to 1")
|
86
|
+
.def("get_theta64", &py_tuple_sketch::get_theta64,
|
87
|
+
"Returns theta as 64-bit value")
|
88
|
+
.def("get_num_retained", &py_tuple_sketch::get_num_retained,
|
89
|
+
"Returns the number of items currently in the sketch")
|
90
|
+
.def("get_seed_hash", [](const py_tuple_sketch& sk) { return sk.get_seed_hash(); }, // why does regular call not work??
|
91
|
+
"Returns a hash of the seed used in the sketch")
|
92
|
+
.def("is_ordered", &py_tuple_sketch::is_ordered,
|
93
|
+
"Returns True if the sketch entries are sorted, otherwise False")
|
94
|
+
.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
|
95
|
+
.def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; });
|
96
|
+
;
|
97
|
+
|
98
|
+
py::class_<py_compact_tuple, py_tuple_sketch>(m, "_compact_tuple_sketch")
|
99
|
+
.def(py::init<const py_compact_tuple&>(), py::arg("other"))
|
100
|
+
.def(py::init<const py_tuple_sketch&, bool>(), py::arg("other"), py::arg("ordered")=true)
|
101
|
+
.def(py::init<const theta_sketch&, py::object&>(), py::arg("other"), py::arg("summary"),
|
102
|
+
"Creates a compact tuple sketch from a theta sketch using a fixed summary value.")
|
103
|
+
.def(
|
104
|
+
"serialize",
|
105
|
+
[](const py_compact_tuple& sk, py_object_serde& serde) {
|
106
|
+
auto bytes = sk.serialize(0, serde);
|
107
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
108
|
+
}, py::arg("serde"),
|
109
|
+
"Serializes the sketch into a bytes object"
|
110
|
+
)
|
111
|
+
.def_static(
|
112
|
+
"deserialize",
|
113
|
+
[](const std::string& bytes, py_object_serde& serde, uint64_t seed) {
|
114
|
+
return py_compact_tuple::deserialize(bytes.data(), bytes.size(), seed, serde);
|
115
|
+
},
|
116
|
+
py::arg("bytes"), py::arg("serde"), py::arg("seed")=DEFAULT_SEED,
|
117
|
+
"Reads a bytes object and returns the corresponding compact_tuple_sketch"
|
118
|
+
);
|
119
|
+
|
120
|
+
py::class_<py_update_tuple, py_tuple_sketch>(m, "_update_tuple_sketch")
|
121
|
+
.def(
|
122
|
+
py::init([](std::shared_ptr<tuple_policy> policy, uint8_t lg_k, double p, uint64_t seed) {
|
123
|
+
tuple_policy_holder holder(policy);
|
124
|
+
return py_update_tuple::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build();
|
125
|
+
}),
|
126
|
+
py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
|
127
|
+
)
|
128
|
+
.def(py::init<const py_update_tuple&>())
|
129
|
+
.def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update),
|
130
|
+
py::arg("datum"), py::arg("value"),
|
131
|
+
"Updates the sketch with the given integral item and summary value")
|
132
|
+
.def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update),
|
133
|
+
py::arg("datum"), py::arg("value"),
|
134
|
+
"Updates the sketch with the given floating point item and summary value")
|
135
|
+
.def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update),
|
136
|
+
py::arg("datum"), py::arg("value"),
|
137
|
+
"Updates the sketch with the given string item and summary value")
|
138
|
+
.def("compact", &py_update_tuple::compact, py::arg("ordered")=true,
|
139
|
+
"Returns a compacted form of the sketch, optionally sorting it")
|
140
|
+
.def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
|
141
|
+
;
|
142
|
+
|
143
|
+
py::class_<py_tuple_union>(m, "_tuple_union")
|
144
|
+
.def(
|
145
|
+
py::init([](std::shared_ptr<tuple_policy> policy, uint8_t lg_k, double p, uint64_t seed) {
|
146
|
+
tuple_policy_holder holder(policy);
|
147
|
+
return py_tuple_union::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build();
|
148
|
+
}),
|
149
|
+
py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
|
150
|
+
)
|
151
|
+
.def("update", &py_tuple_union::update<const py_tuple_sketch&>, py::arg("sketch"),
|
152
|
+
"Updates the union with the given sketch")
|
153
|
+
.def("get_result", &py_tuple_union::get_result, py::arg("ordered")=true,
|
154
|
+
"Returns the sketch corresponding to the union result")
|
155
|
+
.def("reset", &py_tuple_union::reset,
|
156
|
+
"Resets the sketch to the initial empty")
|
157
|
+
;
|
158
|
+
|
159
|
+
py::class_<py_tuple_intersection>(m, "_tuple_intersection")
|
160
|
+
.def(
|
161
|
+
py::init([](std::shared_ptr<tuple_policy> policy, uint64_t seed) {
|
162
|
+
tuple_policy_holder holder(policy);
|
163
|
+
return py_tuple_intersection(seed, holder);
|
164
|
+
}),
|
165
|
+
py::arg("policy"), py::arg("seed")=DEFAULT_SEED)
|
166
|
+
.def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"),
|
167
|
+
"Intersects the provided sketch with the current intersection state")
|
168
|
+
.def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true,
|
169
|
+
"Returns the sketch corresponding to the intersection result")
|
170
|
+
.def("has_result", &py_tuple_intersection::has_result,
|
171
|
+
"Returns True if the intersection has a valid result, otherwise False")
|
172
|
+
;
|
173
|
+
|
174
|
+
py::class_<py_tuple_a_not_b>(m, "_tuple_a_not_b")
|
175
|
+
.def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
|
176
|
+
.def(
|
177
|
+
"compute",
|
178
|
+
&py_tuple_a_not_b::compute<const py_tuple_sketch&, const py_tuple_sketch&>,
|
179
|
+
py::arg("a"), py::arg("b"), py::arg("ordered")=true,
|
180
|
+
"Returns a sketch with the result of applying the A-not-B operation on the given inputs"
|
181
|
+
)
|
182
|
+
;
|
183
|
+
|
184
|
+
py::class_<py_tuple_jaccard_similarity>(m, "_tuple_jaccard_similarity")
|
185
|
+
.def_static(
|
186
|
+
"jaccard",
|
187
|
+
[](const py_tuple_sketch& sketch_a, const py_tuple_sketch& sketch_b, uint64_t seed) {
|
188
|
+
return py_tuple_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
|
189
|
+
},
|
190
|
+
py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
|
191
|
+
"Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches"
|
192
|
+
)
|
193
|
+
.def_static(
|
194
|
+
"exactly_equal",
|
195
|
+
&py_tuple_jaccard_similarity::exactly_equal<const py_tuple_sketch&, const py_tuple_sketch&>,
|
196
|
+
py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
|
197
|
+
"Returns True if sketch_a and sketch_b are equivalent, otherwise False"
|
198
|
+
)
|
199
|
+
.def_static(
|
200
|
+
"similarity_test",
|
201
|
+
&py_tuple_jaccard_similarity::similarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
|
202
|
+
py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
|
203
|
+
"Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
|
204
|
+
"index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
|
205
|
+
"to be similar with a confidence of 97.7% and returns True, otherwise False.")
|
206
|
+
.def_static(
|
207
|
+
"dissimilarity_test",
|
208
|
+
&py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
|
209
|
+
py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
|
210
|
+
"Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
|
211
|
+
"index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
|
212
|
+
"to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
|
213
|
+
)
|
214
|
+
;
|
215
|
+
}
|
@@ -140,7 +140,7 @@ void bind_vo_sketch(py::module &m, const char* name) {
|
|
140
140
|
.def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
|
141
141
|
.def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
142
142
|
"Constructs a var opt sketch from the given bytes using the provided serde")
|
143
|
-
;
|
143
|
+
.def("__iter__", [](const var_opt_sketch<T>& sk) { return py::make_iterator(sk.begin(), sk.end()); });
|
144
144
|
}
|
145
145
|
|
146
146
|
template<typename T>
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
import unittest
|
19
|
+
from datasketches import count_min_sketch
|
20
|
+
|
21
|
+
class CountMinTest(unittest.TestCase):
|
22
|
+
def test_count_min_example(self):
|
23
|
+
# we'll define target confidence and relative error and use the built-in
|
24
|
+
# methods to determine how many hashes and buckets to use
|
25
|
+
confidence = 0.95
|
26
|
+
num_hashes = count_min_sketch.suggest_num_hashes(confidence)
|
27
|
+
relative_error = 0.01
|
28
|
+
num_buckets = count_min_sketch.suggest_num_buckets(relative_error)
|
29
|
+
|
30
|
+
# now we can create a few empty sketches
|
31
|
+
cm = count_min_sketch(num_hashes, num_buckets)
|
32
|
+
cm2 = count_min_sketch(num_hashes, num_buckets)
|
33
|
+
self.assertTrue(cm.is_empty())
|
34
|
+
|
35
|
+
# we'll use a moderate number of distinct items with
|
36
|
+
# increasing weights, with each item's weight being
|
37
|
+
# equal to its value
|
38
|
+
n = 1000
|
39
|
+
total_wt = 0
|
40
|
+
for i in range(1, n+1):
|
41
|
+
cm.update(i, i)
|
42
|
+
total_wt += i
|
43
|
+
self.assertFalse(cm.is_empty())
|
44
|
+
self.assertEqual(cm.get_total_weight(), total_wt)
|
45
|
+
|
46
|
+
# querying the items, each of them should
|
47
|
+
# have a non-zero count. the estimate should
|
48
|
+
# be at least i with appropriately behaved bounds.
|
49
|
+
for i in range(1, n+1):
|
50
|
+
val = cm.get_estimate(i)
|
51
|
+
self.assertGreaterEqual(val, i)
|
52
|
+
self.assertGreaterEqual(val, cm.get_lower_bound(i))
|
53
|
+
self.assertGreater(cm.get_upper_bound(i), val)
|
54
|
+
|
55
|
+
# values not in the sketch should have lower estimates, but
|
56
|
+
# are not guaranteed to be zero and will succeed
|
57
|
+
self.assertIsNotNone(cm.get_estimate("not in set"))
|
58
|
+
|
59
|
+
# we can create another sketch with partial overlap
|
60
|
+
# and merge them
|
61
|
+
for i in range(int(n / 2), int(3 * n / 2)):
|
62
|
+
cm2.update(i, i)
|
63
|
+
cm.merge(cm2)
|
64
|
+
|
65
|
+
# and the estimated weight for the overlapped merged values
|
66
|
+
# (n/2 to n) should now be at least 2x the value
|
67
|
+
self.assertGreaterEqual(cm.get_estimate(n), 2 * n)
|
68
|
+
|
69
|
+
# finally, serialize and reconstruct
|
70
|
+
cm_bytes = cm.serialize()
|
71
|
+
self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes))
|
72
|
+
new_cm = count_min_sketch.deserialize(cm_bytes)
|
73
|
+
|
74
|
+
# and now interrogate the sketch
|
75
|
+
self.assertFalse(new_cm.is_empty())
|
76
|
+
self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes())
|
77
|
+
self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets())
|
78
|
+
self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight())
|
79
|
+
|
80
|
+
# we can also iterate through values in and out of the sketch to ensure
|
81
|
+
# the estimates match
|
82
|
+
for i in range(0, 2 * n):
|
83
|
+
self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))
|
84
|
+
|
85
|
+
if __name__ == '__main__':
|
86
|
+
unittest.main()
|
@@ -14,26 +14,26 @@
|
|
14
14
|
# KIND, either express or implied. See the License for the
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
|
-
|
17
|
+
|
18
18
|
import unittest
|
19
19
|
from datasketches import cpc_sketch, cpc_union
|
20
20
|
|
21
21
|
class CpcTest(unittest.TestCase):
|
22
22
|
def test_cpc_example(self):
|
23
|
-
|
24
|
-
n = 1 << 18 # ~256k
|
23
|
+
lgk = 12 # k = 2^lgk = 4096 rows in the table
|
24
|
+
n = 1 << 18 # ~256k distinct values
|
25
25
|
|
26
26
|
# create a couple sketches and inject some values
|
27
27
|
# we'll have 1/4 of the values overlap
|
28
|
-
cpc = cpc_sketch(
|
29
|
-
cpc2 = cpc_sketch(
|
28
|
+
cpc = cpc_sketch(lgk)
|
29
|
+
cpc2 = cpc_sketch(lgk)
|
30
30
|
offset = int(3 * n / 4) # it's a float w/o cast
|
31
31
|
# because we hash on the bits, not an abstract numeric value,
|
32
32
|
# cpc.update(1) and cpc.update(1.0) give different results.
|
33
33
|
for i in range(0, n):
|
34
34
|
cpc.update(i)
|
35
35
|
cpc2.update(i + offset)
|
36
|
-
|
36
|
+
|
37
37
|
# although we provide get_composite_estimate() and get_estimate(),
|
38
38
|
# the latter will always give the best available estimate. we
|
39
39
|
# recommend using get_estimate().
|
@@ -42,9 +42,9 @@ class CpcTest(unittest.TestCase):
|
|
42
42
|
self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate())
|
43
43
|
self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate())
|
44
44
|
|
45
|
-
#
|
46
|
-
#
|
47
|
-
union = cpc_union(
|
45
|
+
# union is a separate class, so we need to get_result()
|
46
|
+
# to query the unioned sketches
|
47
|
+
union = cpc_union(lgk)
|
48
48
|
union.update(cpc)
|
49
49
|
union.update(cpc2)
|
50
50
|
result = union.get_result()
|
@@ -54,7 +54,7 @@ class CpcTest(unittest.TestCase):
|
|
54
54
|
# answer is within one standard deviation of the estimate
|
55
55
|
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
|
56
56
|
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
|
57
|
-
|
57
|
+
|
58
58
|
# serialize for storage and reconstruct
|
59
59
|
sk_bytes = result.serialize()
|
60
60
|
new_cpc = cpc_sketch.deserialize(sk_bytes)
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
import unittest
|
19
|
+
from datasketches import density_sketch, KernelFunction
|
20
|
+
import numpy as np
|
21
|
+
|
22
|
+
class UnitSphereKernel(KernelFunction):
|
23
|
+
def __call__(self, a: np.array, b: np.array) -> float:
|
24
|
+
if np.linalg.norm(a - b) < 1.0:
|
25
|
+
return 1.0
|
26
|
+
else:
|
27
|
+
return 0.0
|
28
|
+
|
29
|
+
class densityTest(unittest.TestCase):
|
30
|
+
def test_density_sketch(self):
|
31
|
+
k = 10
|
32
|
+
dim = 3
|
33
|
+
n = 1000
|
34
|
+
|
35
|
+
sketch = density_sketch(k, dim)
|
36
|
+
|
37
|
+
self.assertEqual(sketch.get_k(), k)
|
38
|
+
self.assertEqual(sketch.get_dim(), dim)
|
39
|
+
self.assertTrue(sketch.is_empty())
|
40
|
+
self.assertFalse(sketch.is_estimation_mode())
|
41
|
+
self.assertEqual(sketch.get_n(), 0)
|
42
|
+
self.assertEqual(sketch.get_num_retained(), 0)
|
43
|
+
|
44
|
+
for i in range(n):
|
45
|
+
sketch.update([i, i, i])
|
46
|
+
|
47
|
+
self.assertFalse(sketch.is_empty())
|
48
|
+
self.assertTrue(sketch.is_estimation_mode())
|
49
|
+
self.assertEqual(sketch.get_n(), n)
|
50
|
+
self.assertGreater(sketch.get_num_retained(), k)
|
51
|
+
self.assertLess(sketch.get_num_retained(), n)
|
52
|
+
self.assertGreater(sketch.get_estimate([n - 1, n - 1, n - 1]), 0)
|
53
|
+
|
54
|
+
for tuple in sketch:
|
55
|
+
vector = tuple[0]
|
56
|
+
weight = tuple[1]
|
57
|
+
self.assertEqual(len(vector), dim)
|
58
|
+
self.assertGreaterEqual(weight, 1)
|
59
|
+
|
60
|
+
sk_bytes = sketch.serialize()
|
61
|
+
sketch2 = density_sketch.deserialize(sk_bytes)
|
62
|
+
self.assertEqual(sketch.get_estimate([1.5, 2.5, 3.5]), sketch2.get_estimate([1.5, 2.5, 3.5]))
|
63
|
+
|
64
|
+
def test_density_merge(self):
|
65
|
+
sketch1 = density_sketch(10, 2)
|
66
|
+
sketch1.update([0, 0])
|
67
|
+
sketch2 = density_sketch(10, 2)
|
68
|
+
sketch2.update([0, 1])
|
69
|
+
sketch1.merge(sketch2)
|
70
|
+
self.assertEqual(sketch1.get_n(), 2)
|
71
|
+
self.assertEqual(sketch1.get_num_retained(), 2)
|
72
|
+
|
73
|
+
def test_custom_kernel(self):
|
74
|
+
gaussianSketch = density_sketch(10, 2) # default kernel
|
75
|
+
sphericalSketch = density_sketch(10, 2, UnitSphereKernel())
|
76
|
+
|
77
|
+
p = [1, 1]
|
78
|
+
gaussianSketch.update(p)
|
79
|
+
sphericalSketch.update(p)
|
80
|
+
|
81
|
+
# Spherical kernel should return 1.0 for a nearby point, 0 farther
|
82
|
+
# Gaussian kernel should return something nonzero when farther away
|
83
|
+
self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), 1.0)
|
84
|
+
self.assertEqual(sphericalSketch.get_estimate([2, 2]), 0.0)
|
85
|
+
self.assertGreater(gaussianSketch.get_estimate([2, 2]), 0.0)
|
86
|
+
|
87
|
+
# We can also use a custom kernel when deserializing
|
88
|
+
sk_bytes = sphericalSketch.serialize()
|
89
|
+
sphericalRebuilt = density_sketch.deserialize(sk_bytes, UnitSphereKernel())
|
90
|
+
self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), sphericalRebuilt.get_estimate([1.001, 1]))
|
91
|
+
|
92
|
+
if __name__ == '__main__':
|
93
|
+
unittest.main()
|
@@ -16,10 +16,11 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import frequent_strings_sketch,
|
19
|
+
from datasketches import frequent_strings_sketch, frequent_items_sketch
|
20
|
+
from datasketches import frequent_items_error_type, PyIntsSerDe
|
20
21
|
|
21
22
|
class FiTest(unittest.TestCase):
|
22
|
-
def
|
23
|
+
def test_fi_strings_example(self):
|
23
24
|
k = 3 # a small value so we can easily fill the sketch
|
24
25
|
fi = frequent_strings_sketch(k)
|
25
26
|
|
@@ -93,6 +94,44 @@ class FiTest(unittest.TestCase):
|
|
93
94
|
self.assertGreater(new_fi.get_num_active_items(), 0)
|
94
95
|
self.assertEqual(5 * wt, new_fi.get_total_weight())
|
95
96
|
|
97
|
+
# This example uses generic objects but is otherwise identical
|
98
|
+
def test_fi_items_example(self):
|
99
|
+
k = 3 # a small value so we can easily fill the sketch
|
100
|
+
fi = frequent_items_sketch(k)
|
101
|
+
|
102
|
+
# as above, but in this case inserting ints
|
103
|
+
n = 8
|
104
|
+
for i in range(0, n):
|
105
|
+
fi.update(i, 2 ** (n - i))
|
106
|
+
|
107
|
+
# everything else works identically, so let's jump straight
|
108
|
+
# to merging and serialization
|
109
|
+
|
110
|
+
# now create a second sketch with a lot of unique
|
111
|
+
# values but all with equal weight (of 1) such that
|
112
|
+
# the total weight is much larger than the first sketch
|
113
|
+
fi2 = frequent_items_sketch(k)
|
114
|
+
wt = fi.get_total_weight()
|
115
|
+
for i in range(0, 4*wt):
|
116
|
+
fi2.update(i)
|
117
|
+
|
118
|
+
# merge the second sketch into the first
|
119
|
+
fi.merge(fi2)
|
120
|
+
|
121
|
+
# we can see that the weight is much larger
|
122
|
+
self.assertEqual(5 * wt, fi.get_total_weight())
|
123
|
+
|
124
|
+
# finally, serialize and reconstruct -- now we need a serde to tell
|
125
|
+
# (de)serialization how to interpret the objects
|
126
|
+
fi_bytes = fi.serialize(PyIntsSerDe())
|
127
|
+
self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes(PyIntsSerDe()))
|
128
|
+
new_fi = frequent_items_sketch.deserialize(fi_bytes, PyIntsSerDe())
|
129
|
+
|
130
|
+
# and again interrogate the sketch to check that it's what we serialized
|
131
|
+
self.assertFalse(new_fi.is_empty())
|
132
|
+
self.assertGreater(new_fi.get_num_active_items(), 0)
|
133
|
+
self.assertEqual(5 * wt, new_fi.get_total_weight())
|
134
|
+
|
96
135
|
|
97
136
|
def test_fi_sketch(self):
|
98
137
|
# only testing a few things not used in the above example
|
@@ -14,34 +14,34 @@
|
|
14
14
|
# KIND, either express or implied. See the License for the
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
|
-
|
17
|
+
|
18
18
|
import unittest
|
19
19
|
from datasketches import hll_sketch, hll_union, tgt_hll_type
|
20
20
|
|
21
21
|
class HllTest(unittest.TestCase):
|
22
22
|
def test_hll_example(self):
|
23
|
-
|
23
|
+
lgk = 12 # 2^k = 4096 rows in the table
|
24
24
|
n = 1 << 18 # ~256k unique values
|
25
25
|
|
26
26
|
# create a couple sketches and inject some values
|
27
27
|
# we'll have 1/4 of the values overlap
|
28
|
-
hll = hll_sketch(
|
29
|
-
hll2 = hll_sketch(
|
28
|
+
hll = hll_sketch(lgk, tgt_hll_type.HLL_8)
|
29
|
+
hll2 = hll_sketch(lgk, tgt_hll_type.HLL_6)
|
30
30
|
offset = int(3 * n / 4) # it's a float w/o cast
|
31
31
|
# because we hash on the bits, not an abstract numeric value,
|
32
32
|
# hll.update(1) and hll.update(1.0) give different results.
|
33
33
|
for i in range(0, n):
|
34
34
|
hll.update(i)
|
35
35
|
hll2.update(i + offset)
|
36
|
-
|
36
|
+
|
37
37
|
# we can check that the upper and lower bounds bracket the
|
38
38
|
# estimate, without needing to know the exact value.
|
39
39
|
self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
|
40
40
|
self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
|
41
41
|
|
42
|
-
#
|
42
|
+
# union is a separate class, and we can either get a result
|
43
43
|
# sketch or query the union object directly
|
44
|
-
union = hll_union(
|
44
|
+
union = hll_union(lgk)
|
45
45
|
union.update(hll)
|
46
46
|
union.update(hll2)
|
47
47
|
result = union.get_result()
|
@@ -59,7 +59,7 @@ class HllTest(unittest.TestCase):
|
|
59
59
|
new_hll = hll_sketch.deserialize(sk_bytes)
|
60
60
|
|
61
61
|
# the sketch can self-report its configuration and status
|
62
|
-
self.assertEqual(new_hll.lg_config_k,
|
62
|
+
self.assertEqual(new_hll.lg_config_k, lgk)
|
63
63
|
self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
|
64
64
|
self.assertFalse(new_hll.is_empty())
|
65
65
|
|
@@ -68,16 +68,16 @@ class HllTest(unittest.TestCase):
|
|
68
68
|
self.assertTrue(new_hll.is_empty())
|
69
69
|
|
70
70
|
def test_hll_sketch(self):
|
71
|
-
|
71
|
+
lgk = 8
|
72
72
|
n = 117
|
73
|
-
hll = self.generate_sketch(n,
|
73
|
+
hll = self.generate_sketch(n, lgk, tgt_hll_type.HLL_6)
|
74
74
|
hll.update('string data')
|
75
75
|
hll.update(3.14159) # double data
|
76
76
|
|
77
77
|
self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
|
78
78
|
self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
|
79
79
|
|
80
|
-
self.assertEqual(hll.lg_config_k,
|
80
|
+
self.assertEqual(hll.lg_config_k, lgk)
|
81
81
|
self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)
|
82
82
|
|
83
83
|
bytes_compact = hll.serialize_compact()
|
@@ -98,13 +98,13 @@ class HllTest(unittest.TestCase):
|
|
98
98
|
self.assertTrue(hll.is_empty())
|
99
99
|
|
100
100
|
def test_hll_union(self):
|
101
|
-
|
101
|
+
lgk = 7
|
102
102
|
n = 53
|
103
|
-
union = hll_union(
|
103
|
+
union = hll_union(lgk)
|
104
104
|
|
105
|
-
sk = self.generate_sketch(n,
|
105
|
+
sk = self.generate_sketch(n, lgk, tgt_hll_type.HLL_4, 0)
|
106
106
|
union.update(sk)
|
107
|
-
sk = self.generate_sketch(3 * n,
|
107
|
+
sk = self.generate_sketch(3 * n, lgk, tgt_hll_type.HLL_4, n)
|
108
108
|
union.update(sk)
|
109
109
|
union.update('string data')
|
110
110
|
union.update(1.4142136)
|
@@ -112,19 +112,18 @@ class HllTest(unittest.TestCase):
|
|
112
112
|
self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
|
113
113
|
self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())
|
114
114
|
|
115
|
-
self.assertEqual(union.lg_config_k,
|
115
|
+
self.assertEqual(union.lg_config_k, lgk)
|
116
116
|
self.assertFalse(union.is_empty())
|
117
117
|
|
118
118
|
sk = union.get_result()
|
119
119
|
self.assertTrue(isinstance(sk, hll_sketch))
|
120
120
|
self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)
|
121
121
|
|
122
|
-
def generate_sketch(self, n,
|
123
|
-
sk = hll_sketch(
|
122
|
+
def generate_sketch(self, n, lgk, sk_type=tgt_hll_type.HLL_4, st_idx=0):
|
123
|
+
sk = hll_sketch(lgk, sk_type)
|
124
124
|
for i in range(st_idx, st_idx + n):
|
125
125
|
sk.update(i)
|
126
126
|
return sk
|
127
|
-
|
128
|
-
|
127
|
+
|
129
128
|
if __name__ == '__main__':
|
130
129
|
unittest.main()
|