datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -22,14 +22,11 @@
22
22
  #include <kll_sketch.hpp>
23
23
  #include <kll_helper.hpp>
24
24
 
25
- #include <assert.h>
26
-
27
25
  #ifdef KLL_VALIDATION
28
26
 
29
27
  // This is to make sure the implementation matches exactly the reference implementation in OCaml.
30
- // Conditional compilation is used because the implementation needs a few modifications:
31
- // - switch from random choice to deterministic
32
- // - a few methods to expose internals of the sketch
28
+ // Conditional compilation is used because the implementation needs
29
+ // to switch from random choice to deterministic
33
30
 
34
31
  namespace datasketches {
35
32
 
@@ -154,11 +151,11 @@ const int64_t correct_results[num_tests * 7] = {
154
151
  113, 200, 8311133, 6554171, 16, 637, 121111429906734123
155
152
  };
156
153
 
157
- static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
158
- assert (kll_helper::is_odd(stride));
159
- unsigned mask((1 << 23) - 1); // because library items are single-precision floats at the moment
160
- unsigned cur(0);
161
- std::unique_ptr<int[]> arr(new int[n]);
154
+ static std::vector<int> make_input_array(unsigned n, unsigned stride) {
155
+ if (!kll_helper::is_odd(stride)) throw std::logic_error("stride must be odd");
156
+ unsigned mask = (1 << 23) - 1; // because items are single-precision floats at the moment
157
+ unsigned cur = 0;
158
+ std::vector<int> arr(n, 0);
162
159
  for (unsigned i = 0; i < n; i++) {
163
160
  cur += stride;
164
161
  cur &= mask;
@@ -167,50 +164,63 @@ static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
167
164
  return arr;
168
165
  }
169
166
 
170
- static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
171
- int64_t multiplier(738219921); // an arbitrary odd 30-bit number
172
- int64_t mask60((1ULL << 60) - 1ULL);
173
- int64_t accum(0);
174
- for (unsigned i = start; i < start + length; i++) {
175
- accum += (int64_t) arr[i];
167
/**
 * Folds the (item, weight) pairs in [from, to) into a 60-bit hash of the items
 * and derives the number of levels from the weights.
 *
 * The arithmetic is carried out in uint64_t: the reference hash relies on the
 * multiplication wrapping around, which is undefined behavior for int64_t but
 * well defined (and bit-identical in the low 64 bits) for unsigned integers.
 *
 * @param from iterator to the first pair; (*it).first is the item and
 *             (*it).second its weight
 * @param to   iterator one past the last pair
 * @return pair of (hash of the items, number of levels); the number of levels
 *         is at least 1, even for an empty range
 */
template<typename It>
std::pair<int64_t, uint8_t> hash_samples_and_count_levels(It from, It to) {
  const uint64_t multiplier = 738219921; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  uint64_t accum = 0;
  uint8_t num_levels = 1;
  for (auto it = from; it != to; ++it) {
    // convert the item to a signed integer first so the result matches the signed reference arithmetic
    accum += static_cast<uint64_t>(static_cast<int64_t>((*it).first));
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
    // the number of trailing zero bits of the weight is taken as the level of the item
    // NOTE(review): presumably weights are powers of two (2^level) -- confirm against kll_sketch
    const uint8_t level = count_trailing_zeros_in_u64((*it).second);
    if (num_levels <= level) num_levels = static_cast<uint8_t>(level + 1);
  }
  // accum is below 2^60 here, so the conversion back to int64_t is lossless
  return std::pair<int64_t, uint8_t>(static_cast<int64_t>(accum), num_levels);
}
182
183
 
183
184
  TEST_CASE("kll validation", "[kll_sketch][validation]") {
184
185
  for (unsigned i = 0; i < num_tests; i++) {
185
- assert (correct_results[7 * i] == i);
186
- unsigned k(correct_results[7 * i + 1]);
187
- unsigned n(correct_results[7 * i + 2]);
188
- unsigned stride(correct_results[7 * i + 3]);
189
- std::unique_ptr<int[]> input_array = make_input_array(n, stride);
186
+ if (correct_results[7 * i] != i) throw std::logic_error("test number mismatch");
187
+ unsigned k = correct_results[7 * i + 1];
188
+ unsigned n = correct_results[7 * i + 2];
189
+ unsigned stride = correct_results[7 * i + 3];
190
+ auto input_array = make_input_array(n, stride);
190
191
  kll_sketch<float> sketch(k);
191
192
  kll_next_offset = 0;
192
193
  for (unsigned j = 0; j < n; j++) {
193
194
  sketch.update(input_array[j]);
194
195
  }
195
- unsigned num_levels = sketch.get_num_levels();
196
196
  unsigned num_samples = sketch.get_num_retained();
197
- int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);
197
+ auto p = hash_samples_and_count_levels(sketch.begin(), sketch.end());
198
198
  std::cout << i;
199
- REQUIRE(correct_results[7 * i + 4] == num_levels);
199
+ REQUIRE(correct_results[7 * i + 4] == p.second);
200
200
  REQUIRE(correct_results[7 * i + 5] == num_samples);
201
- if (correct_results[7 * i + 6] == hashed_samples) {
201
+ if (correct_results[7 * i + 6] == p.first) {
202
202
  std::cout << " pass" << std::endl;
203
203
  } else {
204
- std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
205
- sketch.to_stream(std::cout);
204
+ std::cout << " " << (correct_results[7 * i + 6]) << " != " << p.first << "\n";
205
+ std::cout << sketch.to_string();
206
206
  FAIL();
207
207
  }
208
208
  }
209
209
  }
210
210
 
211
- TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
212
- float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
213
- REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
211
+ TEST_CASE("kll validation: test hash and num levels", "[kll_sketch][validaiton]") {
212
+ std::pair<float, uint64_t> array[] = {
213
+ std::make_pair(907500, 1),
214
+ std::make_pair(944104, 1),
215
+ std::make_pair(807020, 2),
216
+ std::make_pair(219921, 2),
217
+ std::make_pair(678370, 2),
218
+ std::make_pair(955217, 4),
219
+ std::make_pair(426885, 8)
220
+ };
221
+ auto hash_and_num_levels = hash_samples_and_count_levels(array + 1, array + 6);
222
+ REQUIRE(hash_and_num_levels.first == 1141543353991880193LL);
223
+ REQUIRE(hash_and_num_levels.second == 3);
214
224
  }
215
225
 
216
226
  TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
@@ -1,18 +1,23 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
1
18
  [build-system]
2
19
  requires = ["wheel",
3
20
  "setuptools >= 30.3.0",
4
21
  "cmake >= 3.16",
5
22
  "pybind11[global] >= 2.6.0"]
6
23
  build-backend = "setuptools.build_meta"
7
-
8
- [tool.tox]
9
- legacy_tox_ini = """
10
- [tox]
11
- envlist = py3
12
-
13
- [testenv]
14
- deps = pytest
15
- numpy
16
- changedir = python/tests
17
- commands = pytest
18
- """
@@ -50,7 +50,13 @@ target_link_libraries(python
50
50
 
51
51
  set_target_properties(python PROPERTIES
52
52
  PREFIX ""
53
- OUTPUT_NAME datasketches
53
+ OUTPUT_NAME _datasketches
54
+ )
55
+
56
+ target_include_directories(python
57
+ PUBLIC
58
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
59
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
54
60
  )
55
61
 
56
62
  # ensure we make a .so on Mac rather than .dylib
@@ -71,4 +77,5 @@ target_sources(python
71
77
  src/quantiles_wrapper.cpp
72
78
  src/ks_wrapper.cpp
73
79
  src/vector_of_kll.cpp
80
+ src/py_serde.cpp
74
81
  )
@@ -0,0 +1,104 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ from _datasketches import PyObjectSerDe
19
+
20
+ import struct
21
+
22
+ # This file provides several Python SerDe implementation examples.
23
+ #
24
+ # Each implementation must extend the PyObjectSerDe class and define
25
+ # three methods:
26
+ # * get_size(item) returns an int of the number of bytes needed to
27
+ # serialize the given item
28
+ # * to_bytes(item) returns a bytes object representing a serialized
29
+ # version of the given item
30
+ # * from_bytes(data, offset) takes a bytes object (data) and an offset
31
+ # indicating where in the data array to start reading. The method
32
+ # returns a tuple with the newly reconstructed object and the
33
+ # total number of bytes beyond the offset read from the input data.
34
+
35
# Implements a simple string-encoding scheme where a string is
# written as <num_bytes> <string_contents>, with no null termination.
# <num_bytes> is a 4-byte little-endian length and <string_contents> is
# the UTF-8 encoding of the string. This format allows pre-allocating
# each string, at the cost of additional storage. Using this format,
# the serialized string consumes 4 + len(item.encode('utf-8')) bytes,
# which is 4 + len(item) for pure-ASCII strings.
class PyStringsSerDe(PyObjectSerDe):
    def get_size(self, item):
        # Size of the length prefix plus the encoded contents, in bytes.
        return int(4 + len(item.encode('utf-8')))

    def to_bytes(self, item: str):
        # Encode first: the length prefix must count bytes, not characters,
        # so that from_bytes() can decode exactly what was written.
        encoded = item.encode('utf-8')
        b = bytearray()
        b.extend(len(encoded).to_bytes(4, 'little'))
        b.extend(encoded)
        return bytes(b)

    def from_bytes(self, data: bytes, offset: int):
        # Returns (string, bytes consumed starting at offset).
        # Raises IndexError if the buffer is too short for the length
        # prefix or for the contents the prefix announces.
        contents_start = offset + 4
        if contents_start > len(data):
            raise IndexError(f'buffer too short to read a 4-byte length at offset {offset}')
        num_chars = int.from_bytes(data[offset:contents_start], 'little')
        if num_chars > len(data) - contents_start:
            raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}')
        item = data[contents_start:contents_start + num_chars].decode('utf-8')
        return (item, 4 + num_chars)
56
+
57
+ # Implements an integer-encoding scheme where each integer is written
58
+ # as a 32-bit (4 byte) little-endian value.
59
class PyIntsSerDe(PyObjectSerDe):
    """SerDe for integers stored as 32-bit (4 byte) little-endian values.

    The struct format is '<i' rather than 'i' so that the byte order is
    little-endian on every platform instead of following the host.
    """

    def get_size(self, item):
        # Every item takes exactly 4 bytes.
        return int(4)

    def to_bytes(self, item):
        # Raises struct.error if item does not fit in a signed 32-bit integer.
        return struct.pack('<i', item)

    def from_bytes(self, data: bytes, offset: int):
        # Returns (value, bytes consumed starting at offset).
        val = struct.unpack_from('<i', data, offset)[0]
        return (val, 4)
69
+
70
+
71
class PyLongsSerDe(PyObjectSerDe):
    """SerDe for integers stored as 64-bit (8 byte) little-endian values.

    The struct format is '<q': the native 'l' format is only 4 bytes wide
    on some platforms (e.g. Windows), which disagrees with the 8 bytes
    reported by get_size() and from_bytes().
    """

    def get_size(self, item):
        # Every item takes exactly 8 bytes.
        return int(8)

    def to_bytes(self, item):
        # Raises struct.error if item does not fit in a signed 64-bit integer.
        return struct.pack('<q', item)

    def from_bytes(self, data: bytes, offset: int):
        # Returns (value, bytes consumed starting at offset).
        val = struct.unpack_from('<q', data, offset)[0]
        return (val, 8)
81
+
82
+
83
class PyFloatsSerDe(PyObjectSerDe):
    """SerDe for floats stored as 32-bit (4 byte) little-endian IEEE 754 values.

    The struct format is '<f' so that the byte order does not depend on
    the host platform. Values are rounded to single precision.
    """

    def get_size(self, item):
        # Every item takes exactly 4 bytes.
        return int(4)

    def to_bytes(self, item):
        return struct.pack('<f', item)

    def from_bytes(self, data: bytes, offset: int):
        # Returns (value, bytes consumed starting at offset).
        val = struct.unpack_from('<f', data, offset)[0]
        return (val, 4)
93
+
94
+
95
class PyDoublesSerDe(PyObjectSerDe):
    """SerDe for floats stored as 64-bit (8 byte) little-endian IEEE 754 values.

    The struct format is '<d' so that the byte order does not depend on
    the host platform.
    """

    def get_size(self, item):
        # Every item takes exactly 8 bytes.
        return int(8)

    def to_bytes(self, item):
        return struct.pack('<d', item)

    def from_bytes(self, data: bytes, offset: int):
        # Returns (value, bytes consumed starting at offset).
        val = struct.unpack_from('<d', data, offset)[0]
        return (val, 8)
@@ -0,0 +1,22 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ name = 'datasketches'
19
+
20
+ from .PySerDe import *
21
+
22
+ from _datasketches import *
@@ -0,0 +1,113 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <pybind11/pybind11.h>
21
+ #include <pybind11/functional.h>
22
+ #include <sstream>
23
+
24
+ #ifndef _PY_SERDE_HPP_
25
+ #define _PY_SERDE_HPP_
26
+
27
+ namespace py = pybind11;
28
+
29
+ namespace datasketches {
30
+
31
+ /**
32
+ * @brief The py_object_serde is an abstract class that implements the
33
+ * datasketches serde interface, and is used to allow custom Python
34
+ * serialization of items wrapped as generic py::object types. The actual
35
+ * Python implementation classes must extend the PyObjectSerDe class.
36
+ */
37
+ struct py_object_serde {
38
+ /**
39
+ * @brief Get the serialized size of an object, in bytes
40
+ *
41
+ * @param item A provided item
42
+ * @return int64_t The serialized size of the item, in bytes
43
+ */
44
+ virtual int64_t get_size(const py::object& item) const = 0;
45
+
46
+ /**
47
+ * @brief Serializes an item to a bytes object
48
+ *
49
+ * @param item A provided item
50
+ * @return The serialized image of the item as a Python bytes object
51
+ */
52
+ virtual py::bytes to_bytes(const py::object& item) const = 0;
53
+
54
+ /**
55
+ * @brief Constructs an object from a serialized image, reading the
56
+ * incoming buffer starting at the specified offset.
57
+ *
58
+ * @param bytes A buffer containing items from a serialized sketch
59
+ * @param offset The starting offset into the bytes buffer
60
+ * @return A Python tuple of the reconstructed item and the total number of bytes read
61
+ */
62
+ virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0;
63
+
64
+ virtual ~py_object_serde() = default;
65
+
66
+ // these methods are required by the serde interface; see common/include/serde.hpp for
67
+ // default implementations for C++ std::string and numeric types.
68
+ size_t size_of_item(const py::object& item) const;
69
+ size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const;
70
+ size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const;
71
+ };
72
+
73
+ /**
74
+ * @brief The PyObjectSerDe class provides a concrete base class
75
+ * that pybind11 uses as a "trampoline" to pass calls through to
76
+ * the abstract py_object_serde class. Custom Python serde implementations
77
+ * must extend this class.
78
+ */
79
+ struct PyObjectSerDe : public py_object_serde {
80
+ using py_object_serde::py_object_serde;
81
+
82
+ // trampoline definitions -- need one for each virtual function
83
+ int64_t get_size(const py::object& item) const override {
84
+ PYBIND11_OVERRIDE_PURE(
85
+ int64_t, // Return type
86
+ py_object_serde, // Parent class
87
+ get_size, // Name of function in C++ (must match Python name)
88
+ item // Argument(s)
89
+ );
90
+ }
91
+
92
+ py::bytes to_bytes(const py::object& item) const override {
93
+ PYBIND11_OVERRIDE_PURE(
94
+ py::bytes, // Return type
95
+ py_object_serde, // Parent class
96
+ to_bytes, // Name of function in C++ (must match Python name)
97
+ item // Argument(s)
98
+ );
99
+ }
100
+
101
+ py::tuple from_bytes(py::bytes& bytes, size_t offset) const override {
102
+ PYBIND11_OVERRIDE_PURE(
103
+ py::tuple, // Return type
104
+ py_object_serde, // Parent class
105
+ from_bytes, // Name of function in C++ (must match Python name)
106
+ bytes, offset // Argument(s)
107
+ );
108
+ }
109
+ };
110
+
111
+ }
112
+
113
+ #endif // _PY_SERDE_HPP_
@@ -40,20 +40,20 @@
40
40
  "name": "stdout",
41
41
  "output_type": "stream",
42
42
  "text": [
43
- "### Update Theta sketch summary:\n",
44
- " lg nominal size : 12\n",
45
- " lg current size : 13\n",
46
- " num retained keys : 6560\n",
47
- " resize factor : 8\n",
48
- " sampling probability : 1\n",
43
+ "### Theta sketch summary:\n",
44
+ " num retained entries : 6560\n",
49
45
  " seed hash : 37836\n",
46
+ " empty? : false\n",
50
47
  " ordered? : false\n",
48
+ " estimation mode? : true\n",
51
49
  " theta (fraction) : 0.00654224\n",
52
50
  " theta (raw 64-bit) : 60341508738660257\n",
53
- " estimation mode? : true\n",
54
51
  " estimate : 1.00271e+06\n",
55
52
  " lower bound 95% conf : 978261\n",
56
53
  " upper bound 95% conf : 1.02778e+06\n",
54
+ " lg nominal size : 12\n",
55
+ " lg current size : 13\n",
56
+ " resize factor : 8\n",
57
57
  "### End sketch summary\n",
58
58
  "\n"
59
59
  ]
@@ -100,7 +100,7 @@
100
100
  "cell_type": "markdown",
101
101
  "metadata": {},
102
102
  "source": [
103
- "We can serialize and reconstruct the sketch. If we compact the sketch prior to serialization, we can still query the rebuilt sketch but cannot update it further."
103
+ "We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but cannot be updated directly."
104
104
  ]
105
105
  },
106
106
  {
@@ -139,7 +139,7 @@
139
139
  }
140
140
  ],
141
141
  "source": [
142
- "new_sk1 = theta_sketch.deserialize(sk1_bytes)\n",
142
+ "new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n",
143
143
  "print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n",
144
144
  "print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())"
145
145
  ]
@@ -169,20 +169,20 @@
169
169
  "name": "stdout",
170
170
  "output_type": "stream",
171
171
  "text": [
172
- "### Update Theta sketch summary:\n",
173
- " lg nominal size : 13\n",
174
- " lg current size : 14\n",
175
- " num retained keys : 12488\n",
176
- " resize factor : 8\n",
177
- " sampling probability : 1\n",
172
+ "### Theta sketch summary:\n",
173
+ " num retained entries : 12488\n",
178
174
  " seed hash : 37836\n",
175
+ " empty? : false\n",
179
176
  " ordered? : false\n",
177
+ " estimation mode? : true\n",
180
178
  " theta (fraction) : 0.0123336\n",
181
179
  " theta (raw 64-bit) : 113757656857900725\n",
182
- " estimation mode? : true\n",
183
180
  " estimate : 1.01252e+06\n",
184
181
  " lower bound 95% conf : 994626\n",
185
182
  " upper bound 95% conf : 1.03073e+06\n",
183
+ " lg nominal size : 13\n",
184
+ " lg current size : 14\n",
185
+ " resize factor : 8\n",
186
186
  "### End sketch summary\n",
187
187
  "\n"
188
188
  ]
@@ -255,13 +255,14 @@
255
255
  "output_type": "stream",
256
256
  "text": [
257
257
  "Has result: True\n",
258
- "### Compact Theta sketch summary:\n",
259
- " num retained keys : 1668\n",
258
+ "### Theta sketch summary:\n",
259
+ " num retained entries : 1668\n",
260
260
  " seed hash : 37836\n",
261
+ " empty? : false\n",
261
262
  " ordered? : true\n",
263
+ " estimation mode? : true\n",
262
264
  " theta (fraction) : 0.00654224\n",
263
265
  " theta (raw 64-bit) : 60341508738660257\n",
264
- " estimation mode? : true\n",
265
266
  " estimate : 254959\n",
266
267
  " lower bound 95% conf : 242739\n",
267
268
  " upper bound 95% conf : 267789\n",
@@ -326,13 +327,14 @@
326
327
  "name": "stdout",
327
328
  "output_type": "stream",
328
329
  "text": [
329
- "### Compact Theta sketch summary:\n",
330
- " num retained keys : 4892\n",
330
+ "### Theta sketch summary:\n",
331
+ " num retained entries : 4892\n",
331
332
  " seed hash : 37836\n",
333
+ " empty? : false\n",
332
334
  " ordered? : true\n",
335
+ " estimation mode? : true\n",
333
336
  " theta (fraction) : 0.00654224\n",
334
337
  " theta (raw 64-bit) : 60341508738660257\n",
335
- " estimation mode? : true\n",
336
338
  " estimate : 747756\n",
337
339
  " lower bound 95% conf : 726670\n",
338
340
  " upper bound 95% conf : 769452\n",
@@ -374,7 +376,7 @@
374
376
  ],
375
377
  "metadata": {
376
378
  "kernelspec": {
377
- "display_name": "Python 3",
379
+ "display_name": "Python 3.10.6 64-bit",
378
380
  "language": "python",
379
381
  "name": "python3"
380
382
  },
@@ -388,7 +390,12 @@
388
390
  "name": "python",
389
391
  "nbconvert_exporter": "python",
390
392
  "pygments_lexer": "ipython3",
391
- "version": "3.7.0"
393
+ "version": "3.10.6"
394
+ },
395
+ "vscode": {
396
+ "interpreter": {
397
+ "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
398
+ }
392
399
  }
393
400
  },
394
401
  "nbformat": 4,
@@ -1,3 +1,21 @@
1
+ :: Licensed to the Apache Software Foundation (ASF) under one
2
+ :: or more contributor license agreements. See the NOTICE file
3
+ :: distributed with this work for additional information
4
+ :: regarding copyright ownership. The ASF licenses this file
5
+ :: to you under the Apache License, Version 2.0 (the
6
+ :: "License"); you may not use this file except in compliance
7
+ :: with the License. You may obtain a copy of the License at
8
+ ::
9
+ :: http://www.apache.org/licenses/LICENSE-2.0
10
+ ::
11
+ :: Unless required by applicable law or agreed to in writing,
12
+ :: software distributed under the License is distributed on an
13
+ :: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ :: KIND, either express or implied. See the License for the
15
+ :: specific language governing permissions and limitations
16
+ :: under the License.
17
+
18
+
1
19
  @echo off
2
20
  :: Takes path to the Python interpreter and returns the path to pybind11
3
21
  %1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
@@ -1,2 +1,18 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
1
18
  name = "datasketches"
2
-
@@ -21,6 +21,7 @@
21
21
 
22
22
  namespace py = pybind11;
23
23
 
24
+ // sketches
24
25
  void init_hll(py::module& m);
25
26
  void init_kll(py::module& m);
26
27
  void init_fi(py::module& m);
@@ -29,10 +30,13 @@ void init_theta(py::module& m);
29
30
  void init_vo(py::module& m);
30
31
  void init_req(py::module& m);
31
32
  void init_quantiles(py::module& m);
32
- void init_kolmogorov_smirnov(py::module& m);
33
33
  void init_vector_of_kll(py::module& m);
34
34
 
35
- PYBIND11_MODULE(datasketches, m) {
35
+ // supporting objects
36
+ void init_kolmogorov_smirnov(py::module& m);
37
+ void init_serde(py::module& m);
38
+
39
+ PYBIND11_MODULE(_datasketches, m) {
36
40
  init_hll(m);
37
41
  init_kll(m);
38
42
  init_fi(m);
@@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) {
41
45
  init_vo(m);
42
46
  init_req(m);
43
47
  init_quantiles(m);
44
- init_kolmogorov_smirnov(m);
45
48
  init_vector_of_kll(m);
49
+
50
+ init_kolmogorov_smirnov(m);
51
+ init_serde(m);
46
52
  }