datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -17,105 +17,158 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+
21
+ #include "py_serde.hpp"
22
+ #include "py_object_ostream.hpp"
20
23
  #include "frequent_items_sketch.hpp"
21
24
 
22
25
  #include <pybind11/pybind11.h>
23
- #include <sstream>
24
-
25
- namespace py = pybind11;
26
-
27
- namespace datasketches {
28
- namespace python {
29
-
30
- template<typename T>
31
- frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
32
- std::string skStr = skBytes; // implicit cast
33
- return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
34
- }
35
26
 
36
- template<typename T>
37
- py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
38
- auto serResult = sk.serialize();
39
- return py::bytes((char*)serResult.data(), serResult.size());
40
- }
41
-
42
- // maybe possible to disambiguate the static vs method get_epsilon calls, but
43
- // this is easier for now
44
- template<typename T>
45
- double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
46
- return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
47
- }
48
-
49
- template<typename T>
50
- py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
51
- frequent_items_error_type err_type,
52
- uint64_t threshold = 0) {
53
- if (threshold == 0) { threshold = sk.get_maximum_error(); }
54
-
55
- py::list list;
56
- auto items = sk.get_frequent_items(err_type, threshold);
57
- for (auto iter = items.begin(); iter != items.end(); ++iter) {
58
- py::tuple t = py::make_tuple(iter->get_item(),
59
- iter->get_estimate(),
60
- iter->get_lower_bound(),
61
- iter->get_upper_bound());
62
- list.append(t);
63
- }
64
- return list;
65
- }
27
+ #include <ostream>
66
28
 
67
- template<typename T>
68
- size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
69
- return sk.get_serialized_size_bytes();
70
- }
29
+ namespace py = pybind11;
71
30
 
72
- }
73
- }
31
+ // forward declarations
32
+ // std::string and arithmetic types, where we don't need a separate serde
33
+ template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
34
+ void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
74
35
 
75
- namespace dspy = datasketches::python;
36
+ // py::object and other types where the caller must provide a serde
37
+ template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
38
+ void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
76
39
 
77
- template<typename T>
40
+ template<typename T, typename W, typename H, typename E>
78
41
  void bind_fi_sketch(py::module &m, const char* name) {
79
42
  using namespace datasketches;
80
43
 
81
- py::class_<frequent_items_sketch<T>>(m, name)
44
+ auto fi_class = py::class_<frequent_items_sketch<T, W, H, E>>(m, name)
82
45
  .def(py::init<uint8_t>(), py::arg("lg_max_k"))
83
- .def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
46
+ .def("__str__", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
84
47
  "Produces a string summary of the sketch")
85
- .def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
48
+ .def("to_string", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
86
49
  "Produces a string summary of the sketch")
87
- .def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
50
+ .def("update", (void (frequent_items_sketch<T, W, H, E>::*)(const T&, uint64_t)) &frequent_items_sketch<T, W, H, E>::update, py::arg("item"), py::arg("weight")=1,
88
51
  "Updates the sketch with the given string and, optionally, a weight")
89
- .def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
90
- .def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
52
+ .def("merge", (void (frequent_items_sketch<T, W, H, E>::*)(const frequent_items_sketch<T, W, H, E>&)) &frequent_items_sketch<T, W, H, E>::merge,
91
53
  "Merges the given sketch into this one")
92
- .def("is_empty", &frequent_items_sketch<T>::is_empty,
54
+ .def("is_empty", &frequent_items_sketch<T, W, H, E>::is_empty,
93
55
  "Returns True if the sketch is empty, otherwise False")
94
- .def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
56
+ .def("get_num_active_items", &frequent_items_sketch<T, W, H, E>::get_num_active_items,
95
57
  "Returns the number of active items in the sketch")
96
- .def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
58
+ .def("get_total_weight", &frequent_items_sketch<T, W, H, E>::get_total_weight,
97
59
  "Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
98
- .def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
60
+ .def("get_estimate", &frequent_items_sketch<T, W, H, E>::get_estimate, py::arg("item"),
99
61
  "Returns the estimate of the weight (frequency) of the given item.\n"
100
62
  "Note: The true frequency of a item would be the sum of the counts as a result of the "
101
63
  "two update functions.")
102
- .def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
64
+ .def("get_lower_bound", &frequent_items_sketch<T, W, H, E>::get_lower_bound, py::arg("item"),
103
65
  "Returns the guaranteed lower bound weight (frequency) of the given item.")
104
- .def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
66
+ .def("get_upper_bound", &frequent_items_sketch<T, W, H, E>::get_upper_bound, py::arg("item"),
105
67
  "Returns the guaranteed upper bound weight (frequency) of the given item.")
106
- .def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
68
+ .def("get_sketch_epsilon", (double (frequent_items_sketch<T, W, H, E>::*)(void) const) &frequent_items_sketch<T, W, H, E>::get_epsilon,
107
69
  "Returns the epsilon value used by the sketch to compute error")
108
- .def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
109
- "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
110
- .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
111
- "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
112
- .def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
113
- "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
114
- .def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
115
- .def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
116
- ;
70
+ .def(
71
+ "get_frequent_items",
72
+ [](const frequent_items_sketch<T, W, H, E>& sk, frequent_items_error_type err_type, uint64_t threshold) {
73
+ if (threshold == 0) threshold = sk.get_maximum_error();
74
+ py::list list;
75
+ auto rows = sk.get_frequent_items(err_type, threshold);
76
+ for (auto row: rows) {
77
+ list.append(py::make_tuple(
78
+ row.get_item(),
79
+ row.get_estimate(),
80
+ row.get_lower_bound(),
81
+ row.get_upper_bound())
82
+ );
83
+ }
84
+ return list;
85
+ },
86
+ py::arg("err_type"), py::arg("threshold")=0
87
+ )
88
+ .def_static(
89
+ "get_epsilon_for_lg_size",
90
+ [](uint8_t lg_max_map_size) { return frequent_items_sketch<T, W, H, E>::get_epsilon(lg_max_map_size); },
91
+ py::arg("lg_max_map_size"),
92
+ "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)"
93
+ )
94
+ .def_static(
95
+ "get_apriori_error",
96
+ &frequent_items_sketch<T, W, H, E>::get_apriori_error,
97
+ py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
98
+ "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight."
99
+ );
100
+
101
+ // serialization may need a caller-provided serde depending on the sketch type, so
102
+ // we use a separate method to handle that appropriately based on type T.
103
+ add_serialization(fi_class);
104
+ }
105
+
106
+ // std::string or arithmetic types, for which we have a built-in serde
107
+ template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type>
108
+ void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
109
+ using namespace datasketches;
110
+ clazz.def(
111
+ "get_serialized_size_bytes",
112
+ [](const frequent_items_sketch<T, W, H, E>& sk) { return sk.get_serialized_size_bytes(); },
113
+ "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at."
114
+ )
115
+ .def(
116
+ "serialize",
117
+ [](const frequent_items_sketch<T, W, H, E>& sk) {
118
+ auto bytes = sk.serialize();
119
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
120
+ },
121
+ "Serializes the sketch into a bytes object."
122
+ )
123
+ .def_static(
124
+ "deserialize",
125
+ [](const std::string& bytes) { return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size()); },
126
+ py::arg("bytes"),
127
+ "Reads a bytes object and returns the corresponding frequent_strings_sketch."
128
+ );
117
129
  }
118
130
 
131
+ // py::object or any other type that requires a provided serde
132
+ template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type>
133
+ void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
134
+ using namespace datasketches;
135
+ clazz.def(
136
+ "get_serialized_size_bytes",
137
+ [](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) { return sk.get_serialized_size_bytes(serde); },
138
+ py::arg("serde"),
139
+ "Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at."
140
+ )
141
+ .def(
142
+ "serialize",
143
+ [](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) {
144
+ auto bytes = sk.serialize(0, serde);
145
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
146
+ }, py::arg("serde"),
147
+ "Serializes the sketch into a bytes object using the provided serde."
148
+ )
149
+ .def_static(
150
+ "deserialize",
151
+ [](const std::string& bytes, py_object_serde& serde) {
152
+ return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size(), serde);
153
+ }, py::arg("bytes"), py::arg("serde"),
154
+ "Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch."
155
+ );
156
+ }
157
+
158
+ // calls class __hash__ method
159
+ struct py_hash_caller {
160
+ size_t operator()(const py::object& a) const {
161
+ return py::hash(a);
162
+ }
163
+ };
164
+
165
+ // calls class __eq__ method
166
+ struct py_equal_caller {
167
+ bool operator()(const py::object& a, const py::object& b) const {
168
+ return a.equal(b);
169
+ }
170
+ };
171
+
119
172
  void init_fi(py::module &m) {
120
173
  using namespace datasketches;
121
174
 
@@ -124,5 +177,6 @@ void init_fi(py::module &m) {
124
177
  .value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES)
125
178
  .export_values();
126
179
 
127
- bind_fi_sketch<std::string>(m, "frequent_strings_sketch");
180
+ bind_fi_sketch<std::string, uint64_t, std::hash<std::string>, std::equal_to<std::string>>(m, "frequent_strings_sketch");
181
+ bind_fi_sketch<py::object, uint64_t, py_hash_caller, py_equal_caller>(m, "frequent_items_sketch");
128
182
  }
@@ -17,34 +17,11 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include "hll.hpp"
21
-
22
20
  #include <pybind11/pybind11.h>
23
21
 
24
- namespace py = pybind11;
25
-
26
- namespace datasketches {
27
- namespace python {
28
-
29
- hll_sketch hll_sketch_deserialize(py::bytes skBytes) {
30
- std::string skStr = skBytes; // implicit cast
31
- return hll_sketch::deserialize(skStr.c_str(), skStr.length());
32
- }
33
-
34
- py::object hll_sketch_serialize_compact(const hll_sketch& sk) {
35
- auto serResult = sk.serialize_compact();
36
- return py::bytes((char*)serResult.data(), serResult.size());
37
- }
38
-
39
- py::object hll_sketch_serialize_updatable(const hll_sketch& sk) {
40
- auto serResult = sk.serialize_updatable();
41
- return py::bytes((char*)serResult.data(), serResult.size());
42
- }
43
-
44
- }
45
- }
22
+ #include "hll.hpp"
46
23
 
47
- namespace dspy = datasketches::python;
24
+ namespace py = pybind11;
48
25
 
49
26
  void init_hll(py::module &m) {
50
27
  using namespace datasketches;
@@ -59,12 +36,6 @@ void init_hll(py::module &m) {
59
36
  .def(py::init<uint8_t>(), py::arg("lg_k"))
60
37
  .def(py::init<uint8_t, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
61
38
  .def(py::init<uint8_t, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
62
- .def_static("deserialize", &dspy::hll_sketch_deserialize,
63
- "Reads a bytes object and returns the corresponding hll_sketch")
64
- .def("serialize_compact", &dspy::hll_sketch_serialize_compact,
65
- "Serializes the sketch into a bytes object, compressiong the exception table if HLL_4")
66
- .def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
67
- "Serializes the sketch into a bytes object")
68
39
  .def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
69
40
  py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
70
41
  "Produces a string summary of the sketch")
@@ -88,7 +59,7 @@ void init_hll(py::module &m) {
88
59
  .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
89
60
  "Returns the size of the serialized sketch when compressing the exception table if HLL_4")
90
61
  .def("reset", &hll_sketch::reset,
91
- "Resets the sketch to the empty state in coupon colleciton mode")
62
+ "Resets the sketch to the empty state in coupon collection mode")
92
63
  .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
93
64
  "Updates the sketch with the given integral value")
94
65
  .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
@@ -97,11 +68,32 @@ void init_hll(py::module &m) {
97
68
  "Updates the sketch with the given string value")
98
69
  .def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
99
70
  py::arg("lg_k"), py::arg("tgt_type"),
100
- "Provides a likely upper bound on serialization size for the given paramters")
71
+ "Provides a likely upper bound on serialization size for the given parameters")
101
72
  .def_static("get_rel_err", &hll_sketch::get_rel_err,
102
73
  py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
103
- "Retuns the a priori relative error bound for the given parameters")
104
- ;
74
+ "Returns the a priori relative error bound for the given parameters")
75
+ .def(
76
+ "serialize_compact",
77
+ [](const hll_sketch& sk) {
78
+ auto bytes = sk.serialize_compact();
79
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
80
+ },
81
+ "Serializes the sketch into a bytes object, compressing the exception table if HLL_4"
82
+ )
83
+ .def(
84
+ "serialize_updatable",
85
+ [](const hll_sketch& sk) {
86
+ auto bytes = sk.serialize_updatable();
87
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
88
+ },
89
+ "Serializes the sketch into a bytes object"
90
+ )
91
+ .def_static(
92
+ "deserialize",
93
+ [](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); },
94
+ py::arg("bytes"),
95
+ "Reads a bytes object and returns the corresponding hll_sketch"
96
+ );
105
97
 
106
98
  py::class_<hll_union>(m, "hll_union")
107
99
  .def(py::init<uint8_t>(), py::arg("lg_max_k"))
@@ -129,6 +121,6 @@ void init_hll(py::module &m) {
129
121
  "Updates the union with the given string value")
130
122
  .def_static("get_rel_err", &hll_union::get_rel_err,
131
123
  py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
132
- "Retuns the a priori relative error bound for the given parameters")
124
+ "Returns the a priori relative error bound for the given parameters")
133
125
  ;
134
126
  }