datasketches 0.2.7 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/NOTICE +1 -1
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  10. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  11. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  12. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  13. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  15. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  16. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  17. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
  19. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  22. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  23. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  24. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  27. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  28. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  29. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  30. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  31. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  32. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  34. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  36. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  37. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  38. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  39. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  40. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  41. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  42. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  43. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  44. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  45. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  46. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  47. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  48. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  49. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +63 -68
  50. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  51. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  52. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  53. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  54. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  55. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  56. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  57. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  58. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  59. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  60. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  61. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  62. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  63. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  64. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  65. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  68. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  69. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  70. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  72. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  73. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  74. data/vendor/datasketches-cpp/setup.py +14 -2
  75. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  76. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  77. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  78. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  79. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  80. data/vendor/datasketches-cpp/tox.ini +26 -0
  81. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  82. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  83. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  84. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  85. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  86. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  87. metadata +15 -6
  88. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -22,14 +22,11 @@
22
22
  #include <kll_sketch.hpp>
23
23
  #include <kll_helper.hpp>
24
24
 
25
- #include <assert.h>
26
-
27
25
  #ifdef KLL_VALIDATION
28
26
 
29
27
  // This is to make sure the implementation matches exactly the reference implementation in OCaml.
30
- // Conditional compilation is used because the implementation needs a few modifications:
31
- // - switch from random choice to deterministic
32
- // - a few methods to expose internals of the sketch
28
+ // Conditional compilation is used because the implementation needs
29
+ // to switch from random choice to deterministic
33
30
 
34
31
  namespace datasketches {
35
32
 
@@ -154,11 +151,11 @@ const int64_t correct_results[num_tests * 7] = {
154
151
  113, 200, 8311133, 6554171, 16, 637, 121111429906734123
155
152
  };
156
153
 
157
- static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
158
- assert (kll_helper::is_odd(stride));
159
- unsigned mask((1 << 23) - 1); // because library items are single-precision floats at the moment
160
- unsigned cur(0);
161
- std::unique_ptr<int[]> arr(new int[n]);
154
+ static std::vector<int> make_input_array(unsigned n, unsigned stride) {
155
+ if (!kll_helper::is_odd(stride)) throw std::logic_error("stride must be odd");
156
+ unsigned mask = (1 << 23) - 1; // because items are single-precision floats at the moment
157
+ unsigned cur = 0;
158
+ std::vector<int> arr(n, 0);
162
159
  for (unsigned i = 0; i < n; i++) {
163
160
  cur += stride;
164
161
  cur &= mask;
@@ -167,50 +164,63 @@ static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
167
164
  return arr;
168
165
  }
169
166
 
170
- static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
171
- int64_t multiplier(738219921); // an arbitrary odd 30-bit number
172
- int64_t mask60((1ULL << 60) - 1ULL);
173
- int64_t accum(0);
174
- for (unsigned i = start; i < start + length; i++) {
175
- accum += (int64_t) arr[i];
167
+ template<typename It>
168
+ std::pair<int64_t, uint8_t> hash_samples_and_count_levels(It from, It to) {
169
+ int64_t multiplier = 738219921; // an arbitrary odd 30-bit number
170
+ int64_t mask60 = (1ULL << 60) - 1ULL;
171
+ int64_t accum = 0;
172
+ uint8_t num_levels = 1;
173
+ for (auto it = from; it != to; ++it) {
174
+ accum += static_cast<int64_t>((*it).first);
176
175
  accum *= multiplier;
177
176
  accum &= mask60;
178
177
  accum ^= accum >> 30;
178
+ const uint8_t level = count_trailing_zeros_in_u64((*it).second);
179
+ if (num_levels <= level) num_levels = level + 1;
179
180
  }
180
- return accum;
181
+ return std::pair<uint64_t, uint8_t>(accum, num_levels);
181
182
  }
182
183
 
183
184
  TEST_CASE("kll validation", "[kll_sketch][validation]") {
184
185
  for (unsigned i = 0; i < num_tests; i++) {
185
- assert (correct_results[7 * i] == i);
186
- unsigned k(correct_results[7 * i + 1]);
187
- unsigned n(correct_results[7 * i + 2]);
188
- unsigned stride(correct_results[7 * i + 3]);
189
- std::unique_ptr<int[]> input_array = make_input_array(n, stride);
186
+ if (correct_results[7 * i] != i) throw std::logic_error("test number mismatch");
187
+ unsigned k = correct_results[7 * i + 1];
188
+ unsigned n = correct_results[7 * i + 2];
189
+ unsigned stride = correct_results[7 * i + 3];
190
+ auto input_array = make_input_array(n, stride);
190
191
  kll_sketch<float> sketch(k);
191
192
  kll_next_offset = 0;
192
193
  for (unsigned j = 0; j < n; j++) {
193
194
  sketch.update(input_array[j]);
194
195
  }
195
- unsigned num_levels = sketch.get_num_levels();
196
196
  unsigned num_samples = sketch.get_num_retained();
197
- int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);
197
+ auto p = hash_samples_and_count_levels(sketch.begin(), sketch.end());
198
198
  std::cout << i;
199
- REQUIRE(correct_results[7 * i + 4] == num_levels);
199
+ REQUIRE(correct_results[7 * i + 4] == p.second);
200
200
  REQUIRE(correct_results[7 * i + 5] == num_samples);
201
- if (correct_results[7 * i + 6] == hashed_samples) {
201
+ if (correct_results[7 * i + 6] == p.first) {
202
202
  std::cout << " pass" << std::endl;
203
203
  } else {
204
- std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
205
- sketch.to_stream(std::cout);
204
+ std::cout << " " << (correct_results[7 * i + 6]) << " != " << p.first << "\n";
205
+ std::cout << sketch.to_string();
206
206
  FAIL();
207
207
  }
208
208
  }
209
209
  }
210
210
 
211
- TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
212
- float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
213
- REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
211
+ TEST_CASE("kll validation: test hash and num levels", "[kll_sketch][validaiton]") {
212
+ std::pair<float, uint64_t> array[] = {
213
+ std::make_pair(907500, 1),
214
+ std::make_pair(944104, 1),
215
+ std::make_pair(807020, 2),
216
+ std::make_pair(219921, 2),
217
+ std::make_pair(678370, 2),
218
+ std::make_pair(955217, 4),
219
+ std::make_pair(426885, 8)
220
+ };
221
+ auto hash_and_num_levels = hash_samples_and_count_levels(array + 1, array + 6);
222
+ REQUIRE(hash_and_num_levels.first == 1141543353991880193LL);
223
+ REQUIRE(hash_and_num_levels.second == 3);
214
224
  }
215
225
 
216
226
  TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
@@ -1,18 +1,23 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
1
18
  [build-system]
2
19
  requires = ["wheel",
3
20
  "setuptools >= 30.3.0",
4
21
  "cmake >= 3.16",
5
22
  "pybind11[global] >= 2.6.0"]
6
23
  build-backend = "setuptools.build_meta"
7
-
8
- [tool.tox]
9
- legacy_tox_ini = """
10
- [tox]
11
- envlist = py3
12
-
13
- [testenv]
14
- deps = pytest
15
- numpy
16
- changedir = python/tests
17
- commands = pytest
18
- """
@@ -50,7 +50,13 @@ target_link_libraries(python
50
50
 
51
51
  set_target_properties(python PROPERTIES
52
52
  PREFIX ""
53
- OUTPUT_NAME datasketches
53
+ OUTPUT_NAME _datasketches
54
+ )
55
+
56
+ target_include_directories(python
57
+ PUBLIC
58
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
59
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
54
60
  )
55
61
 
56
62
  # ensure we make a .so on Mac rather than .dylib
@@ -71,4 +77,5 @@ target_sources(python
71
77
  src/quantiles_wrapper.cpp
72
78
  src/ks_wrapper.cpp
73
79
  src/vector_of_kll.cpp
80
+ src/py_serde.cpp
74
81
  )
@@ -0,0 +1,104 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ from _datasketches import PyObjectSerDe
19
+
20
+ import struct
21
+
22
+ # This file provides several Python SerDe implementation examples.
23
+ #
24
+ # Each implementation must extend the PyObjectSerDe class and define
25
+ # three methods:
26
+ # * get_size(item) returns an int of the number of bytes needed to
27
+ # serialize the given item
28
+ # * to_bytes(item) returns a bytes object representing a serialized
29
+ # version of the given item
30
+ # * from_bytes(data, offset) takes a bytes object (data) and an offset
31
+ # indicating where in the data array to start reading. The method
32
+ # returns a tuple with the newly reconstructed object and the
33
+ # total number of bytes beyond the offset read from the input data.
34
+
35
# Implements a simple string-encoding scheme where a string is written as
# <num_bytes> <string_contents>, with no null termination: a 4-byte
# little-endian byte count followed by the UTF-8 encoding of the string.
# This format allows pre-allocating each string, at the cost of
# additional storage. Using this format, the serialized string consumes
# 4 + len(item.encode('utf-8')) bytes, which equals 4 + len(item) for
# pure-ASCII strings.
class PyStringsSerDe(PyObjectSerDe):
    def get_size(self, item):
        """Return the number of bytes that to_bytes(item) produces."""
        # Count encoded bytes, not characters: a non-ASCII character
        # occupies more than one byte in UTF-8.
        return int(4 + len(item.encode('utf-8')))

    def to_bytes(self, item: str):
        """Serialize item as a 4-byte little-endian length plus UTF-8 payload."""
        payload = item.encode('utf-8')
        return len(payload).to_bytes(4, 'little') + payload

    def from_bytes(self, data: bytes, offset: int):
        """Reconstruct one string from data, starting at offset.

        Returns a tuple of (string, total number of bytes consumed beyond
        offset). Raises IndexError if the length prefix or the payload it
        announces does not fit inside data.
        """
        if offset < 0 or offset + 4 > len(data):
            raise IndexError(f'cannot read a 4-byte length prefix at offset {offset}')
        # The prefix is 4 bytes wide, so all 4 bytes must be decoded.
        num_bytes = int.from_bytes(data[offset:offset+4], 'little')
        end = offset + 4 + num_bytes
        if end > len(data):
            raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_bytes}')
        return (data[offset+4:end].decode('utf-8'), 4 + num_bytes)
56
+
57
# Implements an integer-encoding scheme where each integer is written
# as a 32-bit (4 byte) little-endian value.
class PyIntsSerDe(PyObjectSerDe):
    def get_size(self, item):
        """Every item serializes to exactly 4 bytes."""
        return int(4)

    def to_bytes(self, item):
        """Pack item as a signed 32-bit little-endian integer."""
        # '<' pins the byte order and size; a bare 'i' would follow the
        # host's native byte order instead of the documented little-endian.
        return struct.pack('<i', item)

    def from_bytes(self, data: bytes, offset: int):
        """Read one signed 32-bit value at offset; returns (value, 4)."""
        val = struct.unpack_from('<i', data, offset)[0]
        return (val, 4)
69
+
70
+
71
# Implements an integer-encoding scheme where each integer is written
# as a 64-bit (8 byte) little-endian value.
class PyLongsSerDe(PyObjectSerDe):
    def get_size(self, item):
        """Every item serializes to exactly 8 bytes."""
        return int(8)

    def to_bytes(self, item):
        """Pack item as a signed 64-bit little-endian integer."""
        # '<q' is always 8 bytes. A native 'l' is only 4 bytes on some
        # platforms, which would disagree with get_size() and from_bytes().
        return struct.pack('<q', item)

    def from_bytes(self, data: bytes, offset: int):
        """Read one signed 64-bit value at offset; returns (value, 8)."""
        val = struct.unpack_from('<q', data, offset)[0]
        return (val, 8)
81
+
82
+
83
# Implements a floating-point encoding scheme where each value is written
# as a 32-bit (4 byte) little-endian IEEE 754 single-precision float.
class PyFloatsSerDe(PyObjectSerDe):
    def get_size(self, item):
        """Every item serializes to exactly 4 bytes."""
        return int(4)

    def to_bytes(self, item):
        """Pack item as a little-endian single-precision float."""
        # '<' fixes the byte order so the output does not depend on the host.
        return struct.pack('<f', item)

    def from_bytes(self, data: bytes, offset: int):
        """Read one single-precision float at offset; returns (value, 4)."""
        val = struct.unpack_from('<f', data, offset)[0]
        return (val, 4)
93
+
94
+
95
# Implements a floating-point encoding scheme where each value is written
# as a 64-bit (8 byte) little-endian IEEE 754 double-precision float.
class PyDoublesSerDe(PyObjectSerDe):
    def get_size(self, item):
        """Every item serializes to exactly 8 bytes."""
        return int(8)

    def to_bytes(self, item):
        """Pack item as a little-endian double-precision float."""
        # '<' fixes the byte order so the output does not depend on the host.
        return struct.pack('<d', item)

    def from_bytes(self, data: bytes, offset: int):
        """Read one double-precision float at offset; returns (value, 8)."""
        val = struct.unpack_from('<d', data, offset)[0]
        return (val, 8)
@@ -0,0 +1,22 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ name = 'datasketches'
19
+
20
+ from .PySerDe import *
21
+
22
+ from _datasketches import *
@@ -0,0 +1,113 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <pybind11/pybind11.h>
21
+ #include <pybind11/functional.h>
22
+ #include <sstream>
23
+
24
+ #ifndef _PY_SERDE_HPP_
25
+ #define _PY_SERDE_HPP_
26
+
27
+ namespace py = pybind11;
28
+
29
+ namespace datasketches {
30
+
31
+ /**
32
+ * @brief The py_object_serde is an abstract class that implements the
33
+ * datasketches serde interface, and is used to allow custom Python
34
+ * serialization of items wrapped as generic py::object types. The actual
35
+ * Python implementation classes must extend the PyObjectSerDe class.
36
+ */
37
+ struct py_object_serde {
38
+ /**
39
+ * @brief Get the serialized size of an object, in bytes
40
+ *
41
+ * @param item A provided item
42
+ * @return int64_t The serialized size of the item, in bytes
43
+ */
44
+ virtual int64_t get_size(const py::object& item) const = 0;
45
+
46
+ /**
47
+ * @brief Serializes an item to a bytes object
48
+ *
49
+ * @param item A provided item
50
+ * @return The serialized image of the item as a Python bytes object
51
+ */
52
+ virtual py::bytes to_bytes(const py::object& item) const = 0;
53
+
54
+ /**
55
+ * @brief Constructs an object from a serialized image, reading the
56
+ * incoming buffer starting at the specified offset.
57
+ *
58
+ * @param bytes A buffer containing items from a serialized sketch
59
+ * @param offset The starting offset into the bytes buffer
60
+ * @return A Python tuple of the reconstructed item and the total number of bytes read
61
+ */
62
+ virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0;
63
+
64
+ virtual ~py_object_serde() = default;
65
+
66
+ // these methods are required by the serde interface; see common/include/serde.hpp for
67
+ // default implementations for C++ std::string and numeric types.
68
+ size_t size_of_item(const py::object& item) const;
69
+ size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const;
70
+ size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const;
71
+ };
72
+
73
+ /**
74
+ * @brief The PyObjectSerDe class provides a concrete base class
75
+ * that pybind11 uses as a "trampoline" to pass calls through to
76
+ * the abstract py_object_serde class. Custom Python serde implementations
77
+ * must extend this class.
78
+ */
79
+ struct PyObjectSerDe : public py_object_serde {
80
+ using py_object_serde::py_object_serde;
81
+
82
+ // trampoline definitions -- need one for each virtual function
83
+ int64_t get_size(const py::object& item) const override {
84
+ PYBIND11_OVERRIDE_PURE(
85
+ int64_t, // Return type
86
+ py_object_serde, // Parent class
87
+ get_size, // Name of function in C++ (must match Python name)
88
+ item // Argument(s)
89
+ );
90
+ }
91
+
92
+ py::bytes to_bytes(const py::object& item) const override {
93
+ PYBIND11_OVERRIDE_PURE(
94
+ py::bytes, // Return type
95
+ py_object_serde, // Parent class
96
+ to_bytes, // Name of function in C++ (must match Python name)
97
+ item // Argument(s)
98
+ );
99
+ }
100
+
101
+ py::tuple from_bytes(py::bytes& bytes, size_t offset) const override {
102
+ PYBIND11_OVERRIDE_PURE(
103
+ py::tuple, // Return type
104
+ py_object_serde, // Parent class
105
+ from_bytes, // Name of function in C++ (must match Python name)
106
+ bytes, offset // Argument(s)
107
+ );
108
+ }
109
+ };
110
+
111
+ }
112
+
113
+ #endif // _PY_SERDE_HPP_
@@ -40,20 +40,20 @@
40
40
  "name": "stdout",
41
41
  "output_type": "stream",
42
42
  "text": [
43
- "### Update Theta sketch summary:\n",
44
- " lg nominal size : 12\n",
45
- " lg current size : 13\n",
46
- " num retained keys : 6560\n",
47
- " resize factor : 8\n",
48
- " sampling probability : 1\n",
43
+ "### Theta sketch summary:\n",
44
+ " num retained entries : 6560\n",
49
45
  " seed hash : 37836\n",
46
+ " empty? : false\n",
50
47
  " ordered? : false\n",
48
+ " estimation mode? : true\n",
51
49
  " theta (fraction) : 0.00654224\n",
52
50
  " theta (raw 64-bit) : 60341508738660257\n",
53
- " estimation mode? : true\n",
54
51
  " estimate : 1.00271e+06\n",
55
52
  " lower bound 95% conf : 978261\n",
56
53
  " upper bound 95% conf : 1.02778e+06\n",
54
+ " lg nominal size : 12\n",
55
+ " lg current size : 13\n",
56
+ " resize factor : 8\n",
57
57
  "### End sketch summary\n",
58
58
  "\n"
59
59
  ]
@@ -100,7 +100,7 @@
100
100
  "cell_type": "markdown",
101
101
  "metadata": {},
102
102
  "source": [
103
- "We can serialize and reconstruct the sketch. If we compact the sketch prior to serialization, we can still query the rebuilt sketch but cannot update it further."
103
+ "We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but can not be updated directly."
104
104
  ]
105
105
  },
106
106
  {
@@ -139,7 +139,7 @@
139
139
  }
140
140
  ],
141
141
  "source": [
142
- "new_sk1 = theta_sketch.deserialize(sk1_bytes)\n",
142
+ "new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n",
143
143
  "print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n",
144
144
  "print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())"
145
145
  ]
@@ -169,20 +169,20 @@
169
169
  "name": "stdout",
170
170
  "output_type": "stream",
171
171
  "text": [
172
- "### Update Theta sketch summary:\n",
173
- " lg nominal size : 13\n",
174
- " lg current size : 14\n",
175
- " num retained keys : 12488\n",
176
- " resize factor : 8\n",
177
- " sampling probability : 1\n",
172
+ "### Theta sketch summary:\n",
173
+ " num retained entries : 12488\n",
178
174
  " seed hash : 37836\n",
175
+ " empty? : false\n",
179
176
  " ordered? : false\n",
177
+ " estimation mode? : true\n",
180
178
  " theta (fraction) : 0.0123336\n",
181
179
  " theta (raw 64-bit) : 113757656857900725\n",
182
- " estimation mode? : true\n",
183
180
  " estimate : 1.01252e+06\n",
184
181
  " lower bound 95% conf : 994626\n",
185
182
  " upper bound 95% conf : 1.03073e+06\n",
183
+ " lg nominal size : 13\n",
184
+ " lg current size : 14\n",
185
+ " resize factor : 8\n",
186
186
  "### End sketch summary\n",
187
187
  "\n"
188
188
  ]
@@ -255,13 +255,14 @@
255
255
  "output_type": "stream",
256
256
  "text": [
257
257
  "Has result: True\n",
258
- "### Compact Theta sketch summary:\n",
259
- " num retained keys : 1668\n",
258
+ "### Theta sketch summary:\n",
259
+ " num retained entries : 1668\n",
260
260
  " seed hash : 37836\n",
261
+ " empty? : false\n",
261
262
  " ordered? : true\n",
263
+ " estimation mode? : true\n",
262
264
  " theta (fraction) : 0.00654224\n",
263
265
  " theta (raw 64-bit) : 60341508738660257\n",
264
- " estimation mode? : true\n",
265
266
  " estimate : 254959\n",
266
267
  " lower bound 95% conf : 242739\n",
267
268
  " upper bound 95% conf : 267789\n",
@@ -326,13 +327,14 @@
326
327
  "name": "stdout",
327
328
  "output_type": "stream",
328
329
  "text": [
329
- "### Compact Theta sketch summary:\n",
330
- " num retained keys : 4892\n",
330
+ "### Theta sketch summary:\n",
331
+ " num retained entries : 4892\n",
331
332
  " seed hash : 37836\n",
333
+ " empty? : false\n",
332
334
  " ordered? : true\n",
335
+ " estimation mode? : true\n",
333
336
  " theta (fraction) : 0.00654224\n",
334
337
  " theta (raw 64-bit) : 60341508738660257\n",
335
- " estimation mode? : true\n",
336
338
  " estimate : 747756\n",
337
339
  " lower bound 95% conf : 726670\n",
338
340
  " upper bound 95% conf : 769452\n",
@@ -374,7 +376,7 @@
374
376
  ],
375
377
  "metadata": {
376
378
  "kernelspec": {
377
- "display_name": "Python 3",
379
+ "display_name": "Python 3.10.6 64-bit",
378
380
  "language": "python",
379
381
  "name": "python3"
380
382
  },
@@ -388,7 +390,12 @@
388
390
  "name": "python",
389
391
  "nbconvert_exporter": "python",
390
392
  "pygments_lexer": "ipython3",
391
- "version": "3.7.0"
393
+ "version": "3.10.6"
394
+ },
395
+ "vscode": {
396
+ "interpreter": {
397
+ "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
398
+ }
392
399
  }
393
400
  },
394
401
  "nbformat": 4,
@@ -1,3 +1,21 @@
1
+ :: Licensed to the Apache Software Foundation (ASF) under one
2
+ :: or more contributor license agreements. See the NOTICE file
3
+ :: distributed with this work for additional information
4
+ :: regarding copyright ownership. The ASF licenses this file
5
+ :: to you under the Apache License, Version 2.0 (the
6
+ :: "License"); you may not use this file except in compliance
7
+ :: with the License. You may obtain a copy of the License at
8
+ ::
9
+ :: http://www.apache.org/licenses/LICENSE-2.0
10
+ ::
11
+ :: Unless required by applicable law or agreed to in writing,
12
+ :: software distributed under the License is distributed on an
13
+ :: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ :: KIND, either express or implied. See the License for the
15
+ :: specific language governing permissions and limitations
16
+ :: under the License.
17
+
18
+
1
19
  @echo off
2
20
  :: Takes path to the Python interpreter and returns the path to pybind11
3
21
  %1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
@@ -1,2 +1,18 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
1
18
  name = "datasketches"
2
-
@@ -21,6 +21,7 @@
21
21
 
22
22
  namespace py = pybind11;
23
23
 
24
+ // sketches
24
25
  void init_hll(py::module& m);
25
26
  void init_kll(py::module& m);
26
27
  void init_fi(py::module& m);
@@ -29,10 +30,13 @@ void init_theta(py::module& m);
29
30
  void init_vo(py::module& m);
30
31
  void init_req(py::module& m);
31
32
  void init_quantiles(py::module& m);
32
- void init_kolmogorov_smirnov(py::module& m);
33
33
  void init_vector_of_kll(py::module& m);
34
34
 
35
- PYBIND11_MODULE(datasketches, m) {
35
+ // supporting objects
36
+ void init_kolmogorov_smirnov(py::module& m);
37
+ void init_serde(py::module& m);
38
+
39
+ PYBIND11_MODULE(_datasketches, m) {
36
40
  init_hll(m);
37
41
  init_kll(m);
38
42
  init_fi(m);
@@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) {
41
45
  init_vo(m);
42
46
  init_req(m);
43
47
  init_quantiles(m);
44
- init_kolmogorov_smirnov(m);
45
48
  init_vector_of_kll(m);
49
+
50
+ init_kolmogorov_smirnov(m);
51
+ init_serde(m);
46
52
  }