datasketches 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -0,0 +1,246 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include "req_sketch.hpp"
21
+
22
+ #include <pybind11/pybind11.h>
23
+ #include <pybind11/stl.h>
24
+ #include <pybind11/numpy.h>
25
+ #include <sstream>
26
+ #include <vector>
27
+
28
+ namespace py = pybind11;
29
+
30
+ namespace datasketches {
31
+
32
+ namespace python {
33
+
34
+ template<typename T>
35
+ req_sketch<T> req_sketch_deserialize(py::bytes sk_bytes) {
36
+ std::string sk_str = sk_bytes; // implicit cast
37
+ return req_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
38
+ }
39
+
40
+ template<typename T>
41
+ py::object req_sketch_serialize(const req_sketch<T>& sk) {
42
+ auto ser_result = sk.serialize();
43
+ return py::bytes((char*)ser_result.data(), ser_result.size());
44
+ }
45
+
46
+ // maybe possible to disambiguate the static vs method rank error calls, but
47
+ // this is easier for now
48
+ template<typename T>
49
+ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
50
+ return req_sketch<T>::get_normalized_rank_error(k, pmf);
51
+ }
52
+
53
+ template<typename T>
54
+ double req_sketch_get_rank(const req_sketch<T>& sk,
55
+ const T& item,
56
+ bool inclusive) {
57
+ if (inclusive)
58
+ return sk.template get_rank<true>(item);
59
+ else
60
+ return sk.template get_rank<false>(item);
61
+ }
62
+
63
+ template<typename T>
64
+ T req_sketch_get_quantile(const req_sketch<T>& sk,
65
+ double rank,
66
+ bool inclusive) {
67
+ if (inclusive)
68
+ return T(sk.template get_quantile<true>(rank));
69
+ else
70
+ return T(sk.template get_quantile<false>(rank));
71
+ }
72
+
73
+ template<typename T>
74
+ py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
75
+ std::vector<double>& fractions,
76
+ bool inclusive) {
77
+ size_t n_quantiles = fractions.size();
78
+ auto result = inclusive
79
+ ? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
80
+ : sk.template get_quantiles<false>(&fractions[0], n_quantiles);
81
+
82
+ // returning as std::vector<> would copy values to a list anyway
83
+ py::list list(n_quantiles);
84
+ for (size_t i = 0; i < n_quantiles; ++i) {
85
+ list[i] = result[i];
86
+ }
87
+
88
+ return list;
89
+ }
90
+
91
+ template<typename T>
92
+ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
93
+ std::vector<T>& split_points,
94
+ bool inclusive) {
95
+ size_t n_points = split_points.size();
96
+ auto result = inclusive
97
+ ? sk.template get_PMF<true>(&split_points[0], n_points)
98
+ : sk.template get_PMF<false>(&split_points[0], n_points);
99
+
100
+ py::list list(n_points + 1);
101
+ for (size_t i = 0; i <= n_points; ++i) {
102
+ list[i] = result[i];
103
+ }
104
+
105
+ return list;
106
+ }
107
+
108
+ template<typename T>
109
+ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
110
+ std::vector<T>& split_points,
111
+ bool inclusive) {
112
+ size_t n_points = split_points.size();
113
+ auto result = inclusive
114
+ ? sk.template get_CDF<true>(&split_points[0], n_points)
115
+ : sk.template get_CDF<false>(&split_points[0], n_points);
116
+
117
+ py::list list(n_points + 1);
118
+ for (size_t i = 0; i <= n_points; ++i) {
119
+ list[i] = result[i];
120
+ }
121
+
122
+ return list;
123
+ }
124
+
125
+ template<typename T>
126
+ void req_sketch_update(req_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
127
+ if (items.ndim() != 1) {
128
+ throw std::invalid_argument("input data must have only one dimension. Found: "
129
+ + std::to_string(items.ndim()));
130
+ }
131
+
132
+ auto data = items.template unchecked<1>();
133
+ for (uint32_t i = 0; i < data.size(); ++i) {
134
+ sk.update(data(i));
135
+ }
136
+ }
137
+
138
+ }
139
+ }
140
+
141
+ namespace dspy = datasketches::python;
142
+
143
+ template<typename T>
144
+ void bind_req_sketch(py::module &m, const char* name) {
145
+ using namespace datasketches;
146
+
147
+ py::class_<req_sketch<T>>(m, name)
148
+ .def(py::init<uint16_t, bool>(), py::arg("k")=12, py::arg("is_hra")=true)
149
+ .def(py::init<const req_sketch<T>&>())
150
+ .def("update", (void (req_sketch<T>::*)(const T&)) &req_sketch<T>::update, py::arg("item"),
151
+ "Updates the sketch with the given value")
152
+ .def("update", &dspy::req_sketch_update<T>, py::arg("array"),
153
+ "Updates the sketch with the values in the given array")
154
+ .def("merge", (void (req_sketch<T>::*)(const req_sketch<T>&)) &req_sketch<T>::merge, py::arg("sketch"),
155
+ "Merges the provided sketch into the this one")
156
+ .def("__str__", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
157
+ "Produces a string summary of the sketch")
158
+ .def("to_string", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
159
+ "Produces a string summary of the sketch")
160
+ .def("is_hra", &req_sketch<T>::is_HRA,
161
+ "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
162
+ .def("is_empty", &req_sketch<T>::is_empty,
163
+ "Returns True if the sketch is empty, otherwise False")
164
+ .def("get_k", &req_sketch<T>::get_k,
165
+ "Returns the configured parameter k")
166
+ .def("get_n", &req_sketch<T>::get_n,
167
+ "Returns the length of the input stream")
168
+ .def("get_num_retained", &req_sketch<T>::get_num_retained,
169
+ "Returns the number of retained items (samples) in the sketch")
170
+ .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
171
+ "Returns True if the sketch is in estimation mode, otherwise False")
172
+ .def("get_min_value", &req_sketch<T>::get_min_value,
173
+ "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
174
+ .def("get_max_value", &req_sketch<T>::get_max_value,
175
+ "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
176
+ .def("get_quantile", &dspy::req_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
177
+ "Returns an approximation to the value of the data item "
178
+ "that would be preceded by the given fraction of a hypothetical sorted "
179
+ "version of the input stream so far.\n"
180
+ "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
181
+ "so it should not be called multiple times to get different quantiles from the same "
182
+ "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
183
+ "For req_floats_sketch: if the sketch is empty this returns nan. "
184
+ "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
185
+ .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
186
+ "This is a more efficient multiple-query version of get_quantile().\n"
187
+ "This returns an array that could have been generated by using get_quantile() for each "
188
+ "fractional rank separately, but would be very inefficient. "
189
+ "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
190
+ "a single query. It is strongly recommend that this method be used instead of multiple calls "
191
+ "to get_quantile().\n"
192
+ "If the sketch is empty this returns an empty vector.")
193
+ .def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
194
+ "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
195
+ "The resulting approximation has a probabilistic guarantee that can be obtained from the "
196
+ "get_normalized_rank_error(False) function.\n"
197
+ "If the sketch is empty this returns nan.")
198
+ .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
199
+ "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
200
+ "given a set of split points (values).\n"
201
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
202
+ "get_normalized_rank_error(True) function.\n"
203
+ "If the sketch is empty this returns an empty vector.\n"
204
+ "split_points is an array of m unique, monotonically increasing float values "
205
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
206
+ "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
207
+ "exclusive of the right split point, with the exception that the last interval will include "
208
+ "the maximum value.\n"
209
+ "It is not necessary to include either the min or max values in these split points.")
210
+ .def("get_cdf", &dspy::req_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
211
+ "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
212
+ "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
213
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
214
+ "get_normalized_rank_error(True) function.\n"
215
+ "If the sketch is empty this returns an empty vector.\n"
216
+ "split_points is an array of m unique, monotonically increasing float values "
217
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
218
+ "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
219
+ "exclusive of the right split point, with the exception that the last interval will include "
220
+ "the maximum value.\n"
221
+ "It is not necessary to include either the min or max values in these split points.")
222
+ .def("get_rank_lower_bound", &req_sketch<T>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
223
+ "Returns an approximate lower bound on the given normalized rank.\n"
224
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
225
+ "the number of standard deviations must be 1, 2, or 3.")
226
+ .def("get_rank_upper_bound", &req_sketch<T>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
227
+ "Returns an approximate upper bound on the given normalized rank.\n"
228
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
229
+ "the number of standard deviations must be 1, 2, or 3.")
230
+ .def_static("get_RSE", &req_sketch<T>::get_RSE,
231
+ py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
232
+ "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
233
+ "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
234
+ "modified based on empirical measurements, for a given value of parameter k.\n"
235
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
236
+ "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
237
+ "provided to the sketch.")
238
+ .def("serialize", &dspy::req_sketch_serialize<T>, "Serializes the sketch into a bytes object")
239
+ .def_static("deserialize", &dspy::req_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
240
+ ;
241
+ }
242
+
243
+ void init_req(py::module &m) {
244
+ bind_req_sketch<int>(m, "req_ints_sketch");
245
+ bind_req_sketch<float>(m, "req_floats_sketch");
246
+ }
@@ -19,11 +19,13 @@
19
19
 
20
20
  #include <sstream>
21
21
  #include <pybind11/pybind11.h>
22
+ #include <pybind11/stl.h>
22
23
 
23
24
  #include "theta_sketch.hpp"
24
25
  #include "theta_union.hpp"
25
26
  #include "theta_intersection.hpp"
26
27
  #include "theta_a_not_b.hpp"
28
+ #include "theta_jaccard_similarity.hpp"
27
29
  #include "common_defs.hpp"
28
30
 
29
31
 
@@ -48,23 +50,13 @@ theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
48
50
  return builder.build();
49
51
  }
50
52
 
51
- theta_sketch* theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
52
- std::string skStr = skBytes; // implicit cast
53
- return theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed).release();
54
- }
55
-
56
- py::object theta_sketch_serialize(const theta_sketch& sk) {
57
- auto serResult = sk.serialize();
58
- return py::bytes((char*)serResult.data(), serResult.size());
59
- }
60
-
61
53
  uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
62
54
  return sk.get_seed_hash();
63
55
  }
64
56
 
65
- update_theta_sketch update_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
66
- std::string skStr = skBytes; // implicit cast
67
- return update_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
57
+ py::object compact_theta_sketch_serialize(const compact_theta_sketch& sk) {
58
+ auto serResult = sk.serialize();
59
+ return py::bytes((char*)serResult.data(), serResult.size());
68
60
  }
69
61
 
70
62
  compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
@@ -72,6 +64,10 @@ compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, uint64_
72
64
  return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
73
65
  }
74
66
 
67
+ py::list theta_jaccard_sim_computation(const theta_sketch& sketch_a, const theta_sketch& sketch_b) {
68
+ return py::cast(theta_jaccard_similarity::jaccard(sketch_a, sketch_b));
69
+ }
70
+
75
71
  }
76
72
  }
77
73
 
@@ -81,16 +77,12 @@ void init_theta(py::module &m) {
81
77
  using namespace datasketches;
82
78
 
83
79
  py::class_<theta_sketch>(m, "theta_sketch")
84
- .def("serialize", &dspy::theta_sketch_serialize,
85
- "Serializes the sketch into a bytes object")
86
- .def_static("deserialize", &dspy::theta_sketch_deserialize, py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
87
- "Reads a bytes object and returns the corresponding cpc_sketch")
88
80
  .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false,
89
81
  "Produces a string summary of the sketch")
90
82
  .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false,
91
83
  "Produces a string summary of the sketch")
92
84
  .def("is_empty", &theta_sketch::is_empty,
93
- "Returns True if the sketch is empty, otherwise Dalse")
85
+ "Returns True if the sketch is empty, otherwise False")
94
86
  .def("get_estimate", &theta_sketch::get_estimate,
95
87
  "Estimate of the distinct count of the input stream")
96
88
  .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"),
@@ -121,23 +113,22 @@ void init_theta(py::module &m) {
121
113
  "Updates the sketch with the given string")
122
114
  .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true,
123
115
  "Returns a compacted form of the sketch, optionally sorting it")
124
- .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
125
- py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
126
- "Reads a bytes object and returns the corresponding update_theta_sketch")
127
116
  ;
128
117
 
129
118
  py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
130
119
  .def(py::init<const compact_theta_sketch&>())
131
120
  .def(py::init<const theta_sketch&, bool>())
121
+ .def("serialize", &dspy::compact_theta_sketch_serialize,
122
+ "Serializes the sketch into a bytes object")
132
123
  .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
133
124
  py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
134
- "Reads a bytes object and returns the corresponding update_theta_sketch")
125
+ "Reads a bytes object and returns the corresponding compact_theta_sketch")
135
126
  ;
136
127
 
137
128
  py::class_<theta_union>(m, "theta_union")
138
129
  .def(py::init(&dspy::theta_union_factory),
139
130
  py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
140
- .def("update", &theta_union::update, py::arg("sketch"),
131
+ .def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
141
132
  "Updates the union with the given sketch")
142
133
  .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
143
134
  "Returns the sketch corresponding to the union result")
@@ -146,17 +137,36 @@ void init_theta(py::module &m) {
146
137
  py::class_<theta_intersection>(m, "theta_intersection")
147
138
  .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
148
139
  .def(py::init<const theta_intersection&>())
149
- .def("update", &theta_intersection::update, py::arg("sketch"),
140
+ .def("update", &theta_intersection::update<const theta_sketch&>, py::arg("sketch"),
150
141
  "Intersections the provided sketch with the current intersection state")
151
142
  .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true,
152
143
  "Returns the sketch corresponding to the intersection result")
153
144
  .def("has_result", &theta_intersection::has_result,
154
- "Returns True if the intersection has a valid result, otherwisel False")
145
+ "Returns True if the intersection has a valid result, otherwise False")
155
146
  ;
156
147
 
157
148
  py::class_<theta_a_not_b>(m, "theta_a_not_b")
158
149
  .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
159
- .def("compute", &theta_a_not_b::compute, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
150
+ .def("compute", &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
160
151
  "Returns a sketch with the reuslt of appying the A-not-B operation on the given inputs")
161
152
  ;
153
+
154
+ py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
155
+ .def_static("jaccard", &dspy::theta_jaccard_sim_computation,
156
+ py::arg("sketch_a"), py::arg("sketch_b"),
157
+ "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches")
158
+ .def_static("exactly_equal", &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const theta_sketch&>,
159
+ py::arg("sketch_a"), py::arg("sketch_b"),
160
+ "Returns True if sketch_a and sketch_b are equivalent, otherwise False")
161
+ .def_static("similarity_test", &theta_jaccard_similarity::similarity_test<const theta_sketch&, const theta_sketch&>,
162
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"),
163
+ "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
164
+ "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
165
+ "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
166
+ .def_static("dissimilarity_test", &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
167
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"),
168
+ "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
169
+ "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
170
+ "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False.")
171
+ ;
162
172
  }
@@ -113,7 +113,6 @@ class HllTest(unittest.TestCase):
113
113
  self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())
114
114
 
115
115
  self.assertEqual(union.lg_config_k, k)
116
- self.assertFalse(union.is_compact())
117
116
  self.assertFalse(union.is_empty())
118
117
 
119
118
  sk = union.get_result()
@@ -16,9 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import (kll_ints_sketch, kll_floats_sketch,
20
- vector_of_kll_ints_sketches,
21
- vector_of_kll_floats_sketches)
19
+ from datasketches import kll_ints_sketch, kll_floats_sketch
22
20
  import numpy as np
23
21
 
24
22
  class KllTest(unittest.TestCase):
@@ -59,6 +57,7 @@ class KllTest(unittest.TestCase):
59
57
  self.assertFalse(kll.is_empty())
60
58
  self.assertTrue(kll.is_estimation_mode())
61
59
  self.assertEqual(kll.get_n(), n)
60
+ self.assertEqual(kll.get_k(), k)
62
61
  self.assertLess(kll.get_num_retained(), n)
63
62
 
64
63
  # merging itself will double the number of items the sketch has seen
@@ -86,6 +85,7 @@ class KllTest(unittest.TestCase):
86
85
  self.assertEqual(kll.get_n(), n)
87
86
  self.assertFalse(kll.is_empty())
88
87
  self.assertFalse(kll.is_estimation_mode()) # n < k
88
+ self.assertEqual(kll.get_k(), k)
89
89
 
90
90
  pmf = kll.get_pmf([round(n/2)])
91
91
  self.assertIsNotNone(pmf)
@@ -0,0 +1,126 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import req_ints_sketch, req_floats_sketch
20
+ import numpy as np
21
+
22
+ class reqTest(unittest.TestCase):
23
+ def test_req_example(self):
24
+ k = 12
25
+ n = 2 ** 20
26
+
27
+ # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
28
+ req = req_floats_sketch(k, True) # high rank accuracy
29
+ req.update(np.random.normal(size=n-1))
30
+ req.update(0.0)
31
+
32
+ # 0 should be near the median
33
+ self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.03)
34
+
35
+ # the median should be near 0
36
+ self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.03)
37
+
38
+ # we also track the min/max independently from the rest of the data
39
+ # which lets us know the full observed data range
40
+ self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
41
+ self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
42
+ self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
43
+ self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))
44
+
45
+ # we can also extract a list of values at a time,
46
+ # here the values should give us something close to [-2, -1, 0, 1, 2].
47
+ # then get the CDF, which will return something close to
48
+ # the original values used in get_quantiles()
49
+ # finally, can check the normalized rank error bound
50
+ pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
51
+ cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
52
+ self.assertEqual(len(cdf), len(pts)+1)
53
+
54
+ # For relative error quantiles, the error depends on the actual rank
55
+ # so we need to use that to determine the bounds
56
+ est = req.get_rank(0.999, True)
57
+ lb = req.get_rank_lower_bound(est, 1)
58
+ ub = req.get_rank_upper_bound(est, 1)
59
+ self.assertLessEqual(lb, est)
60
+ self.assertLessEqual(est, ub)
61
+
62
+ # and a few basic queries about the sketch
63
+ self.assertFalse(req.is_empty())
64
+ self.assertTrue(req.is_estimation_mode())
65
+ self.assertEqual(req.get_n(), n)
66
+ self.assertLess(req.get_num_retained(), n)
67
+ self.assertEqual(req.get_k(), k)
68
+
69
+ # merging itself will double the number of items the sketch has seen
70
+ req.merge(req)
71
+ self.assertEqual(req.get_n(), 2*n)
72
+
73
+ # we can then serialize and reconstruct the sketch
74
+ req_bytes = req.serialize()
75
+ new_req = req.deserialize(req_bytes)
76
+ self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
77
+ self.assertEqual(req.get_min_value(), new_req.get_min_value())
78
+ self.assertEqual(req.get_max_value(), new_req.get_max_value())
79
+ self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
80
+ self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
81
+
82
+ def test_req_ints_sketch(self):
83
+ k = 100
84
+ n = 10
85
+ req = req_ints_sketch(k)
86
+ for i in range(0, n):
87
+ req.update(i)
88
+
89
+ self.assertEqual(req.get_min_value(), 0)
90
+ self.assertEqual(req.get_max_value(), n-1)
91
+ self.assertEqual(req.get_n(), n)
92
+ self.assertFalse(req.is_empty())
93
+ self.assertFalse(req.is_estimation_mode()) # n < k
94
+ self.assertEqual(req.get_k(), k)
95
+
96
+ pmf = req.get_pmf([round(n/2)])
97
+ self.assertIsNotNone(pmf)
98
+ self.assertEqual(len(pmf), 2)
99
+
100
+ cdf = req.get_cdf([round(n/2)])
101
+ self.assertIsNotNone(cdf)
102
+ self.assertEqual(len(cdf), 2)
103
+
104
+ self.assertEqual(req.get_quantile(0.5), round(n/2))
105
+ quants = req.get_quantiles([0.25, 0.5, 0.75])
106
+ self.assertIsNotNone(quants)
107
+ self.assertEqual(len(quants), 3)
108
+
109
+ self.assertEqual(req.get_rank(round(n/2)), 0.5)
110
+
111
+ # merge self
112
+ req.merge(req)
113
+ self.assertEqual(req.get_n(), 2 * n)
114
+
115
+ sk_bytes = req.serialize()
116
+ self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
117
+
118
+ def test_req_floats_sketch(self):
119
+ # already tested ints and it's templatized, so just make sure it instantiates properly
120
+ k = 75
121
+ req = req_floats_sketch(k, False) # low rank accuracy
122
+ self.assertTrue(req.is_empty())
123
+ self.assertFalse(req.is_hra())
124
+
125
+ if __name__ == '__main__':
126
+ unittest.main()