datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,215 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <memory>
21
+ #include <pybind11/pybind11.h>
22
+ #include <pybind11/stl.h>
23
+
24
+ #include "theta_sketch.hpp"
25
+ #include "tuple_sketch.hpp"
26
+ #include "tuple_union.hpp"
27
+ #include "tuple_intersection.hpp"
28
+ #include "tuple_a_not_b.hpp"
29
+ #include "theta_jaccard_similarity_base.hpp"
30
+ #include "common_defs.hpp"
31
+
32
+ #include "py_serde.hpp"
33
+ #include "tuple_policy.hpp"
34
+
35
+ namespace py = pybind11;
36
+
37
+ void init_tuple(py::module &m) {
38
+ using namespace datasketches;
39
+
40
+ // generic tuple_policy:
41
+ // * update sketch policy uses create_summary and update_summary
42
+ // * set operation policies all use __call__
43
+ py::class_<tuple_policy, TuplePolicy, std::shared_ptr<tuple_policy>>(m, "TuplePolicy")
44
+ .def(py::init())
45
+ .def("create_summary", &tuple_policy::create_summary)
46
+ .def("update_summary", &tuple_policy::update_summary, py::arg("summary"), py::arg("update"))
47
+ .def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update"))
48
+ ;
49
+
50
+ // potentially useful for debugging but not needed as a permanent
51
+ // object type in the library
52
+ /*
53
+ py::class_<tuple_policy_holder>(m, "TuplePolicyHolder")
54
+ .def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy"))
55
+ .def("create", &tuple_policy_holder::create, "Creates a new Summary object")
56
+ .def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"),
57
+ "Updates the provided summary using the data in update")
58
+ ;
59
+ */
60
+
61
+ using py_tuple_sketch = tuple_sketch<py::object>;
62
+ using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>;
63
+ using py_compact_tuple = compact_tuple_sketch<py::object>;
64
+ using py_tuple_union = tuple_union<py::object, tuple_policy_holder>;
65
+ using py_tuple_intersection = tuple_intersection<py::object, tuple_policy_holder>;
66
+ using py_tuple_a_not_b = tuple_a_not_b<py::object>;
67
+ using py_tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<py::object, dummy_jaccard_policy>, tuple_intersection<py::object, dummy_jaccard_policy>, pair_extract_key<uint64_t, py::object>>;
68
+
69
+ py::class_<py_tuple_sketch>(m, "_tuple_sketch")
70
+ .def("__str__", &py_tuple_sketch::to_string, py::arg("print_items")=false,
71
+ "Produces a string summary of the sketch")
72
+ .def("to_string", &py_tuple_sketch::to_string, py::arg("print_items")=false,
73
+ "Produces a string summary of the sketch")
74
+ .def("is_empty", &py_tuple_sketch::is_empty,
75
+ "Returns True if the sketch is empty, otherwise False")
76
+ .def("get_estimate", &py_tuple_sketch::get_estimate,
77
+ "Estimate of the distinct count of the input stream")
78
+ .def("get_upper_bound", static_cast<double (py_tuple_sketch::*)(uint8_t) const>(&py_tuple_sketch::get_upper_bound), py::arg("num_std_devs"),
79
+ "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
80
+ .def("get_lower_bound", static_cast<double (py_tuple_sketch::*)(uint8_t) const>(&py_tuple_sketch::get_lower_bound), py::arg("num_std_devs"),
81
+ "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
82
+ .def("is_estimation_mode", &py_tuple_sketch::is_estimation_mode,
83
+ "Returns True if sketch is in estimation mode, otherwise False")
84
+ .def("get_theta", &py_tuple_sketch::get_theta,
85
+ "Returns theta (effective sampling rate) as a fraction from 0 to 1")
86
+ .def("get_theta64", &py_tuple_sketch::get_theta64,
87
+ "Returns theta as 64-bit value")
88
+ .def("get_num_retained", &py_tuple_sketch::get_num_retained,
89
+ "Returns the number of items currently in the sketch")
90
+ .def("get_seed_hash", [](const py_tuple_sketch& sk) { return sk.get_seed_hash(); }, // why does regular call not work??
91
+ "Returns a hash of the seed used in the sketch")
92
+ .def("is_ordered", &py_tuple_sketch::is_ordered,
93
+ "Returns True if the sketch entries are sorted, otherwise False")
94
+ .def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
95
+ .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; });
96
+ ;
97
+
98
+ py::class_<py_compact_tuple, py_tuple_sketch>(m, "_compact_tuple_sketch")
99
+ .def(py::init<const py_compact_tuple&>(), py::arg("other"))
100
+ .def(py::init<const py_tuple_sketch&, bool>(), py::arg("other"), py::arg("ordered")=true)
101
+ .def(py::init<const theta_sketch&, py::object&>(), py::arg("other"), py::arg("summary"),
102
+ "Creates a compact tuple sketch from a theta sketch using a fixed summary value.")
103
+ .def(
104
+ "serialize",
105
+ [](const py_compact_tuple& sk, py_object_serde& serde) {
106
+ auto bytes = sk.serialize(0, serde);
107
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
108
+ }, py::arg("serde"),
109
+ "Serializes the sketch into a bytes object"
110
+ )
111
+ .def_static(
112
+ "deserialize",
113
+ [](const std::string& bytes, py_object_serde& serde, uint64_t seed) {
114
+ return py_compact_tuple::deserialize(bytes.data(), bytes.size(), seed, serde);
115
+ },
116
+ py::arg("bytes"), py::arg("serde"), py::arg("seed")=DEFAULT_SEED,
117
+ "Reads a bytes object and returns the corresponding compact_tuple_sketch"
118
+ );
119
+
120
+ py::class_<py_update_tuple, py_tuple_sketch>(m, "_update_tuple_sketch")
121
+ .def(
122
+ py::init([](std::shared_ptr<tuple_policy> policy, uint8_t lg_k, double p, uint64_t seed) {
123
+ tuple_policy_holder holder(policy);
124
+ return py_update_tuple::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build();
125
+ }),
126
+ py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
127
+ )
128
+ .def(py::init<const py_update_tuple&>())
129
+ .def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update),
130
+ py::arg("datum"), py::arg("value"),
131
+ "Updates the sketch with the given integral item and summary value")
132
+ .def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update),
133
+ py::arg("datum"), py::arg("value"),
134
+ "Updates the sketch with the given floating point item and summary value")
135
+ .def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update),
136
+ py::arg("datum"), py::arg("value"),
137
+ "Updates the sketch with the given string item and summary value")
138
+ .def("compact", &py_update_tuple::compact, py::arg("ordered")=true,
139
+ "Returns a compacted form of the sketch, optionally sorting it")
140
+ .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
141
+ ;
142
+
143
+ py::class_<py_tuple_union>(m, "_tuple_union")
144
+ .def(
145
+ py::init([](std::shared_ptr<tuple_policy> policy, uint8_t lg_k, double p, uint64_t seed) {
146
+ tuple_policy_holder holder(policy);
147
+ return py_tuple_union::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build();
148
+ }),
149
+ py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
150
+ )
151
+ .def("update", &py_tuple_union::update<const py_tuple_sketch&>, py::arg("sketch"),
152
+ "Updates the union with the given sketch")
153
+ .def("get_result", &py_tuple_union::get_result, py::arg("ordered")=true,
154
+ "Returns the sketch corresponding to the union result")
155
+ .def("reset", &py_tuple_union::reset,
156
+ "Resets the sketch to the initial empty")
157
+ ;
158
+
159
+ py::class_<py_tuple_intersection>(m, "_tuple_intersection")
160
+ .def(
161
+ py::init([](std::shared_ptr<tuple_policy> policy, uint64_t seed) {
162
+ tuple_policy_holder holder(policy);
163
+ return py_tuple_intersection(seed, holder);
164
+ }),
165
+ py::arg("policy"), py::arg("seed")=DEFAULT_SEED)
166
+ .def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"),
167
+ "Intersects the provided sketch with the current intersection state")
168
+ .def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true,
169
+ "Returns the sketch corresponding to the intersection result")
170
+ .def("has_result", &py_tuple_intersection::has_result,
171
+ "Returns True if the intersection has a valid result, otherwise False")
172
+ ;
173
+
174
+ py::class_<py_tuple_a_not_b>(m, "_tuple_a_not_b")
175
+ .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
176
+ .def(
177
+ "compute",
178
+ &py_tuple_a_not_b::compute<const py_tuple_sketch&, const py_tuple_sketch&>,
179
+ py::arg("a"), py::arg("b"), py::arg("ordered")=true,
180
+ "Returns a sketch with the result of applying the A-not-B operation on the given inputs"
181
+ )
182
+ ;
183
+
184
+ py::class_<py_tuple_jaccard_similarity>(m, "_tuple_jaccard_similarity")
185
+ .def_static(
186
+ "jaccard",
187
+ [](const py_tuple_sketch& sketch_a, const py_tuple_sketch& sketch_b, uint64_t seed) {
188
+ return py_tuple_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
189
+ },
190
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
191
+ "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches"
192
+ )
193
+ .def_static(
194
+ "exactly_equal",
195
+ &py_tuple_jaccard_similarity::exactly_equal<const py_tuple_sketch&, const py_tuple_sketch&>,
196
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
197
+ "Returns True if sketch_a and sketch_b are equivalent, otherwise False"
198
+ )
199
+ .def_static(
200
+ "similarity_test",
201
+ &py_tuple_jaccard_similarity::similarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
202
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
203
+ "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
204
+ "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
205
+ "to be similar with a confidence of 97.7% and returns True, otherwise False.")
206
+ .def_static(
207
+ "dissimilarity_test",
208
+ &py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
209
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
210
+ "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
211
+ "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
212
+ "to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
213
+ )
214
+ ;
215
+ }
@@ -140,7 +140,7 @@ void bind_vo_sketch(py::module &m, const char* name) {
140
140
  .def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
141
141
  .def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
142
142
  "Constructs a var opt sketch from the given bytes using the provided serde")
143
- ;
143
+ .def("__iter__", [](const var_opt_sketch<T>& sk) { return py::make_iterator(sk.begin(), sk.end()); });
144
144
  }
145
145
 
146
146
  template<typename T>
@@ -0,0 +1,86 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import count_min_sketch
20
+
21
+ class CountMinTest(unittest.TestCase):
22
+ def test_count_min_example(self):
23
+ # we'll define target confidence and relative error and use the built-in
24
+ # methods to determine how many hashes and buckets to use
25
+ confidence = 0.95
26
+ num_hashes = count_min_sketch.suggest_num_hashes(confidence)
27
+ relative_error = 0.01
28
+ num_buckets = count_min_sketch.suggest_num_buckets(relative_error)
29
+
30
+ # now we can create a few empty sketches
31
+ cm = count_min_sketch(num_hashes, num_buckets)
32
+ cm2 = count_min_sketch(num_hashes, num_buckets)
33
+ self.assertTrue(cm.is_empty())
34
+
35
+ # we'll use a moderate number of distinct items with
36
+ # increasing weights, with each item's weight being
37
+ # equal to its value
38
+ n = 1000
39
+ total_wt = 0
40
+ for i in range(1, n+1):
41
+ cm.update(i, i)
42
+ total_wt += i
43
+ self.assertFalse(cm.is_empty())
44
+ self.assertEqual(cm.get_total_weight(), total_wt)
45
+
46
+ # querying the items, each of them should
47
+ # have a non-zero count. the estimate should
48
+ # be at least i with appropriately behaved bounds.
49
+ for i in range(1, n+1):
50
+ val = cm.get_estimate(i)
51
+ self.assertGreaterEqual(val, i)
52
+ self.assertGreaterEqual(val, cm.get_lower_bound(i))
53
+ self.assertGreater(cm.get_upper_bound(i), val)
54
+
55
+ # values not in the sketch should have lower estimates, but
56
+ # are not guaranteed to be zero and will succeed
57
+ self.assertIsNotNone(cm.get_estimate("not in set"))
58
+
59
+ # we can create another sketch with partial overlap
60
+ # and merge them
61
+ for i in range(int(n / 2), int(3 * n / 2)):
62
+ cm2.update(i, i)
63
+ cm.merge(cm2)
64
+
65
+ # and the estimated weight for the overlapped merged values
66
+ # (n/2 to n) should now be at least 2x the value
67
+ self.assertGreaterEqual(cm.get_estimate(n), 2 * n)
68
+
69
+ # finally, serialize and reconstruct
70
+ cm_bytes = cm.serialize()
71
+ self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes))
72
+ new_cm = count_min_sketch.deserialize(cm_bytes)
73
+
74
+ # and now interrogate the sketch
75
+ self.assertFalse(new_cm.is_empty())
76
+ self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes())
77
+ self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets())
78
+ self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight())
79
+
80
+ # we can also iterate through values in and out of the sketch to ensure
81
+ # the estimates match
82
+ for i in range(0, 2 * n):
83
+ self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))
84
+
85
+ if __name__ == '__main__':
86
+ unittest.main()
@@ -14,26 +14,26 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
-
17
+
18
18
  import unittest
19
19
  from datasketches import cpc_sketch, cpc_union
20
20
 
21
21
  class CpcTest(unittest.TestCase):
22
22
  def test_cpc_example(self):
23
- k = 12 # 2^k = 4096 rows in the table
24
- n = 1 << 18 # ~256k unique values
23
+ lgk = 12 # 2^lgk = 4096 rows in the table
24
+ n = 1 << 18 # ~256k distinct values
25
25
 
26
26
  # create a couple sketches and inject some values
27
27
  # we'll have 1/4 of the values overlap
28
- cpc = cpc_sketch(k)
29
- cpc2 = cpc_sketch(k)
28
+ cpc = cpc_sketch(lgk)
29
+ cpc2 = cpc_sketch(lgk)
30
30
  offset = int(3 * n / 4) # it's a float w/o cast
31
31
  # because we hash on the bits, not an abstract numeric value,
32
32
  # cpc.update(1) and cpc.update(1.0) give different results.
33
33
  for i in range(0, n):
34
34
  cpc.update(i)
35
35
  cpc2.update(i + offset)
36
-
36
+
37
37
  # although we provide get_composite_estimate() and get_estimate(),
38
38
  # the latter will always give the best available estimate. we
39
39
  # recommend using get_estimate().
@@ -42,9 +42,9 @@ class CpcTest(unittest.TestCase):
42
42
  self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate())
43
43
  self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate())
44
44
 
45
- # unioning uses a separate class, but we need to get_result()
46
- # tp query the unioned sketches
47
- union = cpc_union(k)
45
+ # union is a separate class, so we need to get_result()
46
+ # to query the unioned sketches
47
+ union = cpc_union(lgk)
48
48
  union.update(cpc)
49
49
  union.update(cpc2)
50
50
  result = union.get_result()
@@ -54,7 +54,7 @@ class CpcTest(unittest.TestCase):
54
54
  # answer is within one standard deviation of the estimate
55
55
  self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
56
56
  self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
57
-
57
+
58
58
  # serialize for storage and reconstruct
59
59
  sk_bytes = result.serialize()
60
60
  new_cpc = cpc_sketch.deserialize(sk_bytes)
@@ -0,0 +1,93 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import density_sketch, KernelFunction
20
+ import numpy as np
21
+
22
+ class UnitSphereKernel(KernelFunction):
23
+ def __call__(self, a: np.array, b: np.array) -> float:
24
+ if np.linalg.norm(a - b) < 1.0:
25
+ return 1.0
26
+ else:
27
+ return 0.0
28
+
29
+ class densityTest(unittest.TestCase):
30
+ def test_density_sketch(self):
31
+ k = 10
32
+ dim = 3
33
+ n = 1000
34
+
35
+ sketch = density_sketch(k, dim)
36
+
37
+ self.assertEqual(sketch.get_k(), k)
38
+ self.assertEqual(sketch.get_dim(), dim)
39
+ self.assertTrue(sketch.is_empty())
40
+ self.assertFalse(sketch.is_estimation_mode())
41
+ self.assertEqual(sketch.get_n(), 0)
42
+ self.assertEqual(sketch.get_num_retained(), 0)
43
+
44
+ for i in range(n):
45
+ sketch.update([i, i, i])
46
+
47
+ self.assertFalse(sketch.is_empty())
48
+ self.assertTrue(sketch.is_estimation_mode())
49
+ self.assertEqual(sketch.get_n(), n)
50
+ self.assertGreater(sketch.get_num_retained(), k)
51
+ self.assertLess(sketch.get_num_retained(), n)
52
+ self.assertGreater(sketch.get_estimate([n - 1, n - 1, n - 1]), 0)
53
+
54
+ for tuple in sketch:
55
+ vector = tuple[0]
56
+ weight = tuple[1]
57
+ self.assertEqual(len(vector), dim)
58
+ self.assertGreaterEqual(weight, 1)
59
+
60
+ sk_bytes = sketch.serialize()
61
+ sketch2 = density_sketch.deserialize(sk_bytes)
62
+ self.assertEqual(sketch.get_estimate([1.5, 2.5, 3.5]), sketch2.get_estimate([1.5, 2.5, 3.5]))
63
+
64
+ def test_density_merge(self):
65
+ sketch1 = density_sketch(10, 2)
66
+ sketch1.update([0, 0])
67
+ sketch2 = density_sketch(10, 2)
68
+ sketch2.update([0, 1])
69
+ sketch1.merge(sketch2)
70
+ self.assertEqual(sketch1.get_n(), 2)
71
+ self.assertEqual(sketch1.get_num_retained(), 2)
72
+
73
+ def test_custom_kernel(self):
74
+ gaussianSketch = density_sketch(10, 2) # default kernel
75
+ sphericalSketch = density_sketch(10, 2, UnitSphereKernel())
76
+
77
+ p = [1, 1]
78
+ gaussianSketch.update(p)
79
+ sphericalSketch.update(p)
80
+
81
+ # Spherical kernel should return 1.0 for a nearby point, 0 farther
82
+ # Gaussian kernel should return something nonzero when farther away
83
+ self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), 1.0)
84
+ self.assertEqual(sphericalSketch.get_estimate([2, 2]), 0.0)
85
+ self.assertGreater(gaussianSketch.get_estimate([2, 2]), 0.0)
86
+
87
+ # We can also use a custom kernel when deserializing
88
+ sk_bytes = sphericalSketch.serialize()
89
+ sphericalRebuilt = density_sketch.deserialize(sk_bytes, UnitSphereKernel())
90
+ self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), sphericalRebuilt.get_estimate([1.001, 1]))
91
+
92
+ if __name__ == '__main__':
93
+ unittest.main()
@@ -16,10 +16,11 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import frequent_strings_sketch, frequent_items_error_type
19
+ from datasketches import frequent_strings_sketch, frequent_items_sketch
20
+ from datasketches import frequent_items_error_type, PyIntsSerDe
20
21
 
21
22
  class FiTest(unittest.TestCase):
22
- def test_fi_example(self):
23
+ def test_fi_strings_example(self):
23
24
  k = 3 # a small value so we can easily fill the sketch
24
25
  fi = frequent_strings_sketch(k)
25
26
 
@@ -93,6 +94,44 @@ class FiTest(unittest.TestCase):
93
94
  self.assertGreater(new_fi.get_num_active_items(), 0)
94
95
  self.assertEqual(5 * wt, new_fi.get_total_weight())
95
96
 
97
+ # This example uses generic objects but is otherwise identical
98
+ def test_fi_items_example(self):
99
+ k = 3 # a small value so we can easily fill the sketch
100
+ fi = frequent_items_sketch(k)
101
+
102
+ # as above, but in this case inserting ints
103
+ n = 8
104
+ for i in range(0, n):
105
+ fi.update(i, 2 ** (n - i))
106
+
107
+ # everything else works identically, so let's jump straight
108
+ # to merging and serialization
109
+
110
+ # now create a second sketch with a lot of unique
111
+ # values but all with equal weight (of 1) such that
112
+ # the total weight is much larger than the first sketch
113
+ fi2 = frequent_items_sketch(k)
114
+ wt = fi.get_total_weight()
115
+ for i in range(0, 4*wt):
116
+ fi2.update(i)
117
+
118
+ # merge the second sketch into the first
119
+ fi.merge(fi2)
120
+
121
+ # we can see that the weight is much larger
122
+ self.assertEqual(5 * wt, fi.get_total_weight())
123
+
124
+ # finally, serialize and reconstruct -- now we need a serde to tell
125
+ # (de)serialization how to interpret the objects
126
+ fi_bytes = fi.serialize(PyIntsSerDe())
127
+ self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes(PyIntsSerDe()))
128
+ new_fi = frequent_items_sketch.deserialize(fi_bytes, PyIntsSerDe())
129
+
130
+ # and again interrogate the sketch to check that it's what we serialized
131
+ self.assertFalse(new_fi.is_empty())
132
+ self.assertGreater(new_fi.get_num_active_items(), 0)
133
+ self.assertEqual(5 * wt, new_fi.get_total_weight())
134
+
96
135
 
97
136
  def test_fi_sketch(self):
98
137
  # only testing a few things not used in the above example
@@ -14,34 +14,34 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
-
17
+
18
18
  import unittest
19
19
  from datasketches import hll_sketch, hll_union, tgt_hll_type
20
20
 
21
21
  class HllTest(unittest.TestCase):
22
22
  def test_hll_example(self):
23
- k = 12 # 2^k = 4096 rows in the table
23
+ lgk = 12 # 2^lgk = 4096 rows in the table
24
24
  n = 1 << 18 # ~256k unique values
25
25
 
26
26
  # create a couple sketches and inject some values
27
27
  # we'll have 1/4 of the values overlap
28
- hll = hll_sketch(k, tgt_hll_type.HLL_8)
29
- hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
28
+ hll = hll_sketch(lgk, tgt_hll_type.HLL_8)
29
+ hll2 = hll_sketch(lgk, tgt_hll_type.HLL_6)
30
30
  offset = int(3 * n / 4) # it's a float w/o cast
31
31
  # because we hash on the bits, not an abstract numeric value,
32
32
  # hll.update(1) and hll.update(1.0) give different results.
33
33
  for i in range(0, n):
34
34
  hll.update(i)
35
35
  hll2.update(i + offset)
36
-
36
+
37
37
  # we can check that the upper and lower bounds bracket the
38
38
  # estimate, without needing to know the exact value.
39
39
  self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
40
40
  self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
41
41
 
42
- # unioning uses a separate class, and we can either get a result
42
+ # union is a separate class, and we can either get a result
43
43
  # sketch or query the union object directly
44
- union = hll_union(k)
44
+ union = hll_union(lgk)
45
45
  union.update(hll)
46
46
  union.update(hll2)
47
47
  result = union.get_result()
@@ -59,7 +59,7 @@ class HllTest(unittest.TestCase):
59
59
  new_hll = hll_sketch.deserialize(sk_bytes)
60
60
 
61
61
  # the sketch can self-report its configuration and status
62
- self.assertEqual(new_hll.lg_config_k, k)
62
+ self.assertEqual(new_hll.lg_config_k, lgk)
63
63
  self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
64
64
  self.assertFalse(new_hll.is_empty())
65
65
 
@@ -68,16 +68,16 @@ class HllTest(unittest.TestCase):
68
68
  self.assertTrue(new_hll.is_empty())
69
69
 
70
70
  def test_hll_sketch(self):
71
- k = 8
71
+ lgk = 8
72
72
  n = 117
73
- hll = self.generate_sketch(n, k, tgt_hll_type.HLL_6)
73
+ hll = self.generate_sketch(n, lgk, tgt_hll_type.HLL_6)
74
74
  hll.update('string data')
75
75
  hll.update(3.14159) # double data
76
76
 
77
77
  self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
78
78
  self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
79
79
 
80
- self.assertEqual(hll.lg_config_k, k)
80
+ self.assertEqual(hll.lg_config_k, lgk)
81
81
  self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)
82
82
 
83
83
  bytes_compact = hll.serialize_compact()
@@ -98,13 +98,13 @@ class HllTest(unittest.TestCase):
98
98
  self.assertTrue(hll.is_empty())
99
99
 
100
100
  def test_hll_union(self):
101
- k = 7
101
+ lgk = 7
102
102
  n = 53
103
- union = hll_union(k)
103
+ union = hll_union(lgk)
104
104
 
105
- sk = self.generate_sketch(n, k, tgt_hll_type.HLL_4, 0)
105
+ sk = self.generate_sketch(n, lgk, tgt_hll_type.HLL_4, 0)
106
106
  union.update(sk)
107
- sk = self.generate_sketch(3 * n, k, tgt_hll_type.HLL_4, n)
107
+ sk = self.generate_sketch(3 * n, lgk, tgt_hll_type.HLL_4, n)
108
108
  union.update(sk)
109
109
  union.update('string data')
110
110
  union.update(1.4142136)
@@ -112,19 +112,18 @@ class HllTest(unittest.TestCase):
112
112
  self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
113
113
  self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())
114
114
 
115
- self.assertEqual(union.lg_config_k, k)
115
+ self.assertEqual(union.lg_config_k, lgk)
116
116
  self.assertFalse(union.is_empty())
117
117
 
118
118
  sk = union.get_result()
119
119
  self.assertTrue(isinstance(sk, hll_sketch))
120
120
  self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)
121
121
 
122
- def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
123
- sk = hll_sketch(k, sk_type)
122
+ def generate_sketch(self, n, lgk, sk_type=tgt_hll_type.HLL_4, st_idx=0):
123
+ sk = hll_sketch(lgk, sk_type)
124
124
  for i in range(st_idx, st_idx + n):
125
125
  sk.update(i)
126
126
  return sk
127
-
128
-
127
+
129
128
  if __name__ == '__main__':
130
129
  unittest.main()