datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -22,14 +22,11 @@
|
|
|
22
22
|
#include <kll_sketch.hpp>
|
|
23
23
|
#include <kll_helper.hpp>
|
|
24
24
|
|
|
25
|
-
#include <assert.h>
|
|
26
|
-
|
|
27
25
|
#ifdef KLL_VALIDATION
|
|
28
26
|
|
|
29
27
|
// This is to make sure the implementation matches exactly the reference implementation in OCaml.
|
|
30
|
-
// Conditional compilation is used because the implementation needs
|
|
31
|
-
//
|
|
32
|
-
// - a few methods to expose internals of the sketch
|
|
28
|
+
// Conditional compilation is used because the implementation needs
|
|
29
|
+
// to switch from random choice to deterministic
|
|
33
30
|
|
|
34
31
|
namespace datasketches {
|
|
35
32
|
|
|
@@ -154,11 +151,11 @@ const int64_t correct_results[num_tests * 7] = {
|
|
|
154
151
|
113, 200, 8311133, 6554171, 16, 637, 121111429906734123
|
|
155
152
|
};
|
|
156
153
|
|
|
157
|
-
static std::
|
|
158
|
-
|
|
159
|
-
unsigned mask(
|
|
160
|
-
unsigned cur
|
|
161
|
-
std::
|
|
154
|
+
static std::vector<int> make_input_array(unsigned n, unsigned stride) {
|
|
155
|
+
if (!kll_helper::is_odd(stride)) throw std::logic_error("stride must be odd");
|
|
156
|
+
unsigned mask = (1 << 23) - 1; // because items are single-precision floats at the moment
|
|
157
|
+
unsigned cur = 0;
|
|
158
|
+
std::vector<int> arr(n, 0);
|
|
162
159
|
for (unsigned i = 0; i < n; i++) {
|
|
163
160
|
cur += stride;
|
|
164
161
|
cur &= mask;
|
|
@@ -167,50 +164,63 @@ static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
|
|
|
167
164
|
return arr;
|
|
168
165
|
}
|
|
169
166
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
int64_t
|
|
173
|
-
int64_t
|
|
174
|
-
|
|
175
|
-
|
|
167
|
+
template<typename It>
|
|
168
|
+
std::pair<int64_t, uint8_t> hash_samples_and_count_levels(It from, It to) {
|
|
169
|
+
int64_t multiplier = 738219921; // an arbitrary odd 30-bit number
|
|
170
|
+
int64_t mask60 = (1ULL << 60) - 1ULL;
|
|
171
|
+
int64_t accum = 0;
|
|
172
|
+
uint8_t num_levels = 1;
|
|
173
|
+
for (auto it = from; it != to; ++it) {
|
|
174
|
+
accum += static_cast<int64_t>((*it).first);
|
|
176
175
|
accum *= multiplier;
|
|
177
176
|
accum &= mask60;
|
|
178
177
|
accum ^= accum >> 30;
|
|
178
|
+
const uint8_t level = count_trailing_zeros_in_u64((*it).second);
|
|
179
|
+
if (num_levels <= level) num_levels = level + 1;
|
|
179
180
|
}
|
|
180
|
-
return accum;
|
|
181
|
+
return std::pair<uint64_t, uint8_t>(accum, num_levels);
|
|
181
182
|
}
|
|
182
183
|
|
|
183
184
|
TEST_CASE("kll validation", "[kll_sketch][validation]") {
|
|
184
185
|
for (unsigned i = 0; i < num_tests; i++) {
|
|
185
|
-
|
|
186
|
-
unsigned k
|
|
187
|
-
unsigned n
|
|
188
|
-
unsigned stride
|
|
189
|
-
|
|
186
|
+
if (correct_results[7 * i] != i) throw std::logic_error("test number mismatch");
|
|
187
|
+
unsigned k = correct_results[7 * i + 1];
|
|
188
|
+
unsigned n = correct_results[7 * i + 2];
|
|
189
|
+
unsigned stride = correct_results[7 * i + 3];
|
|
190
|
+
auto input_array = make_input_array(n, stride);
|
|
190
191
|
kll_sketch<float> sketch(k);
|
|
191
192
|
kll_next_offset = 0;
|
|
192
193
|
for (unsigned j = 0; j < n; j++) {
|
|
193
194
|
sketch.update(input_array[j]);
|
|
194
195
|
}
|
|
195
|
-
unsigned num_levels = sketch.get_num_levels();
|
|
196
196
|
unsigned num_samples = sketch.get_num_retained();
|
|
197
|
-
|
|
197
|
+
auto p = hash_samples_and_count_levels(sketch.begin(), sketch.end());
|
|
198
198
|
std::cout << i;
|
|
199
|
-
REQUIRE(correct_results[7 * i + 4] ==
|
|
199
|
+
REQUIRE(correct_results[7 * i + 4] == p.second);
|
|
200
200
|
REQUIRE(correct_results[7 * i + 5] == num_samples);
|
|
201
|
-
if (correct_results[7 * i + 6] ==
|
|
201
|
+
if (correct_results[7 * i + 6] == p.first) {
|
|
202
202
|
std::cout << " pass" << std::endl;
|
|
203
203
|
} else {
|
|
204
|
-
std::cout << " " << (correct_results[7 * i + 6]) << " != " <<
|
|
205
|
-
sketch.
|
|
204
|
+
std::cout << " " << (correct_results[7 * i + 6]) << " != " << p.first << "\n";
|
|
205
|
+
std::cout << sketch.to_string();
|
|
206
206
|
FAIL();
|
|
207
207
|
}
|
|
208
208
|
}
|
|
209
209
|
}
|
|
210
210
|
|
|
211
|
-
TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
|
|
212
|
-
float array[] = {
|
|
213
|
-
|
|
211
|
+
TEST_CASE("kll validation: test hash and num levels", "[kll_sketch][validaiton]") {
|
|
212
|
+
std::pair<float, uint64_t> array[] = {
|
|
213
|
+
std::make_pair(907500, 1),
|
|
214
|
+
std::make_pair(944104, 1),
|
|
215
|
+
std::make_pair(807020, 2),
|
|
216
|
+
std::make_pair(219921, 2),
|
|
217
|
+
std::make_pair(678370, 2),
|
|
218
|
+
std::make_pair(955217, 4),
|
|
219
|
+
std::make_pair(426885, 8)
|
|
220
|
+
};
|
|
221
|
+
auto hash_and_num_levels = hash_samples_and_count_levels(array + 1, array + 6);
|
|
222
|
+
REQUIRE(hash_and_num_levels.first == 1141543353991880193LL);
|
|
223
|
+
REQUIRE(hash_and_num_levels.second == 3);
|
|
214
224
|
}
|
|
215
225
|
|
|
216
226
|
TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
|
|
@@ -1,18 +1,23 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
1
18
|
[build-system]
|
|
2
19
|
requires = ["wheel",
|
|
3
20
|
"setuptools >= 30.3.0",
|
|
4
21
|
"cmake >= 3.16",
|
|
5
22
|
"pybind11[global] >= 2.6.0"]
|
|
6
23
|
build-backend = "setuptools.build_meta"
|
|
7
|
-
|
|
8
|
-
[tool.tox]
|
|
9
|
-
legacy_tox_ini = """
|
|
10
|
-
[tox]
|
|
11
|
-
envlist = py3
|
|
12
|
-
|
|
13
|
-
[testenv]
|
|
14
|
-
deps = pytest
|
|
15
|
-
numpy
|
|
16
|
-
changedir = python/tests
|
|
17
|
-
commands = pytest
|
|
18
|
-
"""
|
|
@@ -50,7 +50,13 @@ target_link_libraries(python
|
|
|
50
50
|
|
|
51
51
|
set_target_properties(python PROPERTIES
|
|
52
52
|
PREFIX ""
|
|
53
|
-
OUTPUT_NAME
|
|
53
|
+
OUTPUT_NAME _datasketches
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
target_include_directories(python
|
|
57
|
+
PUBLIC
|
|
58
|
+
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
|
59
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
|
54
60
|
)
|
|
55
61
|
|
|
56
62
|
# ensure we make a .so on Mac rather than .dylib
|
|
@@ -71,4 +77,5 @@ target_sources(python
|
|
|
71
77
|
src/quantiles_wrapper.cpp
|
|
72
78
|
src/ks_wrapper.cpp
|
|
73
79
|
src/vector_of_kll.cpp
|
|
80
|
+
src/py_serde.cpp
|
|
74
81
|
)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
from _datasketches import PyObjectSerDe
|
|
19
|
+
|
|
20
|
+
import struct
|
|
21
|
+
|
|
22
|
+
# This file provides several Python SerDe implementation examples.
|
|
23
|
+
#
|
|
24
|
+
# Each implementation must extend the PyObjectSerDe class and define
|
|
25
|
+
# three methods:
|
|
26
|
+
# * get_size(item) returns an int of the number of bytes needed to
|
|
27
|
+
# serialize the given item
|
|
28
|
+
# * to_bytes(item) returns a bytes object representing a serialized
|
|
29
|
+
# version of the given item
|
|
30
|
+
# * from_bytes(data, offset) takes a bytes object (data) and an offset
|
|
31
|
+
# indicating where in the data array to start reading. The method
|
|
32
|
+
# returns a tuple with the newly reconstructed object and the
|
|
33
|
+
# total number of bytes beyond the offset read from the input data.
|
|
34
|
+
|
|
35
|
+
# Implements a simple string-encoding scheme where a string is
|
|
36
|
+
# written as <num_bytes> <string_contents>, with no null termination.
|
|
37
|
+
# This format allows pre-allocating each string, at the cost of
|
|
38
|
+
# additional storage. Using this format, the serialized string consumes
|
|
39
|
+
# 4 + len(item) bytes.
|
|
40
|
+
class PyStringsSerDe(PyObjectSerDe):
|
|
41
|
+
def get_size(self, item):
|
|
42
|
+
return int(4 + len(item))
|
|
43
|
+
|
|
44
|
+
def to_bytes(self, item: str):
|
|
45
|
+
b = bytearray()
|
|
46
|
+
b.extend(len(item).to_bytes(4, 'little'))
|
|
47
|
+
b.extend(map(ord,item))
|
|
48
|
+
return bytes(b)
|
|
49
|
+
|
|
50
|
+
def from_bytes(self, data: bytes, offset: int):
|
|
51
|
+
num_chars = int.from_bytes(data[offset:offset+3], 'little')
|
|
52
|
+
if (num_chars < 0 or num_chars > offset + len(data)):
|
|
53
|
+
raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}')
|
|
54
|
+
str = data[offset+4:offset+4+num_chars].decode()
|
|
55
|
+
return (str, 4+num_chars)
|
|
56
|
+
|
|
57
|
+
# Implements an integer-encoding scheme where each integer is written
|
|
58
|
+
# as a 32-bit (4 byte) little-endian value.
|
|
59
|
+
class PyIntsSerDe(PyObjectSerDe):
|
|
60
|
+
def get_size(self, item):
|
|
61
|
+
return int(4)
|
|
62
|
+
|
|
63
|
+
def to_bytes(self, item):
|
|
64
|
+
return struct.pack('i', item)
|
|
65
|
+
|
|
66
|
+
def from_bytes(self, data: bytes, offset: int):
|
|
67
|
+
val = struct.unpack_from('i', data, offset)[0]
|
|
68
|
+
return (val, 4)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class PyLongsSerDe(PyObjectSerDe):
|
|
72
|
+
def get_size(self, item):
|
|
73
|
+
return int(8)
|
|
74
|
+
|
|
75
|
+
def to_bytes(self, item):
|
|
76
|
+
return struct.pack('l', item)
|
|
77
|
+
|
|
78
|
+
def from_bytes(self, data: bytes, offset: int):
|
|
79
|
+
val = struct.unpack_from('l', data, offset)[0]
|
|
80
|
+
return (val, 8)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class PyFloatsSerDe(PyObjectSerDe):
|
|
84
|
+
def get_size(self, item):
|
|
85
|
+
return int(4)
|
|
86
|
+
|
|
87
|
+
def to_bytes(self, item):
|
|
88
|
+
return struct.pack('f', item)
|
|
89
|
+
|
|
90
|
+
def from_bytes(self, data: bytes, offset: int):
|
|
91
|
+
val = struct.unpack_from('f', data, offset)[0]
|
|
92
|
+
return (val, 4)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class PyDoublesSerDe(PyObjectSerDe):
|
|
96
|
+
def get_size(self, item):
|
|
97
|
+
return int(8)
|
|
98
|
+
|
|
99
|
+
def to_bytes(self, item):
|
|
100
|
+
return struct.pack('d', item)
|
|
101
|
+
|
|
102
|
+
def from_bytes(self, data: bytes, offset: int):
|
|
103
|
+
val = struct.unpack_from('d', data, offset)[0]
|
|
104
|
+
return (val, 8)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
name = 'datasketches'
|
|
19
|
+
|
|
20
|
+
from .PySerDe import *
|
|
21
|
+
|
|
22
|
+
from _datasketches import *
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <pybind11/pybind11.h>
|
|
21
|
+
#include <pybind11/functional.h>
|
|
22
|
+
#include <sstream>
|
|
23
|
+
|
|
24
|
+
#ifndef _PY_SERDE_HPP_
|
|
25
|
+
#define _PY_SERDE_HPP_
|
|
26
|
+
|
|
27
|
+
namespace py = pybind11;
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* @brief The py_object_serde is an abstract class that implements the
|
|
33
|
+
* datasketches serde interface, and is used to allow custom Python
|
|
34
|
+
* serialization of items wrapped as generic py::object types. The actual
|
|
35
|
+
* Python implementation classes must extend the PyObjectSerDe class.
|
|
36
|
+
*/
|
|
37
|
+
struct py_object_serde {
|
|
38
|
+
/**
|
|
39
|
+
* @brief Get the serialized size of an object, in bytes
|
|
40
|
+
*
|
|
41
|
+
* @param item A provided item
|
|
42
|
+
* @return int64_t The serialized size of the item, in bytes
|
|
43
|
+
*/
|
|
44
|
+
virtual int64_t get_size(const py::object& item) const = 0;
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* @brief Serializes an item to a bytes object
|
|
48
|
+
*
|
|
49
|
+
* @param item A provided item
|
|
50
|
+
* @return The serialized image of the item as a Python bytes object
|
|
51
|
+
*/
|
|
52
|
+
virtual py::bytes to_bytes(const py::object& item) const = 0;
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* @brief Constructs an object from a serialized image, reading the
|
|
56
|
+
* incoming buffer starting at the specified offset.
|
|
57
|
+
*
|
|
58
|
+
* @param bytes A buffer containing items from a serialized sketch
|
|
59
|
+
* @param offset The starting offset into the bytes buffer
|
|
60
|
+
* @return A Python tuple of the reconstructed item and the total number of bytes read
|
|
61
|
+
*/
|
|
62
|
+
virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0;
|
|
63
|
+
|
|
64
|
+
virtual ~py_object_serde() = default;
|
|
65
|
+
|
|
66
|
+
// these methods are required by the serde interface; see common/include/serde.hpp for
|
|
67
|
+
// default implementations for C++ std::string and numeric types.
|
|
68
|
+
size_t size_of_item(const py::object& item) const;
|
|
69
|
+
size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const;
|
|
70
|
+
size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const;
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* @brief The PyObjectSerDe class provides a concrete base class
|
|
75
|
+
* that pybind11 uses as a "trampoline" to pass calls through to
|
|
76
|
+
* the abstract py_object_serde class. Custom Python serde implementations
|
|
77
|
+
* must extend this class.
|
|
78
|
+
*/
|
|
79
|
+
struct PyObjectSerDe : public py_object_serde {
|
|
80
|
+
using py_object_serde::py_object_serde;
|
|
81
|
+
|
|
82
|
+
// trampoline definitions -- need one for each virtual function
|
|
83
|
+
int64_t get_size(const py::object& item) const override {
|
|
84
|
+
PYBIND11_OVERRIDE_PURE(
|
|
85
|
+
int64_t, // Return type
|
|
86
|
+
py_object_serde, // Parent class
|
|
87
|
+
get_size, // Name of function in C++ (must match Python name)
|
|
88
|
+
item // Argument(s)
|
|
89
|
+
);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
py::bytes to_bytes(const py::object& item) const override {
|
|
93
|
+
PYBIND11_OVERRIDE_PURE(
|
|
94
|
+
py::bytes, // Return type
|
|
95
|
+
py_object_serde, // Parent class
|
|
96
|
+
to_bytes, // Name of function in C++ (must match Python name)
|
|
97
|
+
item // Argument(s)
|
|
98
|
+
);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
py::tuple from_bytes(py::bytes& bytes, size_t offset) const override {
|
|
102
|
+
PYBIND11_OVERRIDE_PURE(
|
|
103
|
+
py::tuple, // Return type
|
|
104
|
+
py_object_serde, // Parent class
|
|
105
|
+
from_bytes, // Name of function in C++ (must match Python name)
|
|
106
|
+
bytes, offset // Argument(s)
|
|
107
|
+
);
|
|
108
|
+
}
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
#endif // _PY_SERDE_HPP_
|
|
@@ -40,20 +40,20 @@
|
|
|
40
40
|
"name": "stdout",
|
|
41
41
|
"output_type": "stream",
|
|
42
42
|
"text": [
|
|
43
|
-
"###
|
|
44
|
-
"
|
|
45
|
-
" lg current size : 13\n",
|
|
46
|
-
" num retained keys : 6560\n",
|
|
47
|
-
" resize factor : 8\n",
|
|
48
|
-
" sampling probability : 1\n",
|
|
43
|
+
"### Theta sketch summary:\n",
|
|
44
|
+
" num retained entries : 6560\n",
|
|
49
45
|
" seed hash : 37836\n",
|
|
46
|
+
" empty? : false\n",
|
|
50
47
|
" ordered? : false\n",
|
|
48
|
+
" estimation mode? : true\n",
|
|
51
49
|
" theta (fraction) : 0.00654224\n",
|
|
52
50
|
" theta (raw 64-bit) : 60341508738660257\n",
|
|
53
|
-
" estimation mode? : true\n",
|
|
54
51
|
" estimate : 1.00271e+06\n",
|
|
55
52
|
" lower bound 95% conf : 978261\n",
|
|
56
53
|
" upper bound 95% conf : 1.02778e+06\n",
|
|
54
|
+
" lg nominal size : 12\n",
|
|
55
|
+
" lg current size : 13\n",
|
|
56
|
+
" resize factor : 8\n",
|
|
57
57
|
"### End sketch summary\n",
|
|
58
58
|
"\n"
|
|
59
59
|
]
|
|
@@ -100,7 +100,7 @@
|
|
|
100
100
|
"cell_type": "markdown",
|
|
101
101
|
"metadata": {},
|
|
102
102
|
"source": [
|
|
103
|
-
"We can serialize and reconstruct the sketch.
|
|
103
|
+
"We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but can not be updated directly."
|
|
104
104
|
]
|
|
105
105
|
},
|
|
106
106
|
{
|
|
@@ -139,7 +139,7 @@
|
|
|
139
139
|
}
|
|
140
140
|
],
|
|
141
141
|
"source": [
|
|
142
|
-
"new_sk1 =
|
|
142
|
+
"new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n",
|
|
143
143
|
"print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n",
|
|
144
144
|
"print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())"
|
|
145
145
|
]
|
|
@@ -169,20 +169,20 @@
|
|
|
169
169
|
"name": "stdout",
|
|
170
170
|
"output_type": "stream",
|
|
171
171
|
"text": [
|
|
172
|
-
"###
|
|
173
|
-
"
|
|
174
|
-
" lg current size : 14\n",
|
|
175
|
-
" num retained keys : 12488\n",
|
|
176
|
-
" resize factor : 8\n",
|
|
177
|
-
" sampling probability : 1\n",
|
|
172
|
+
"### Theta sketch summary:\n",
|
|
173
|
+
" num retained entries : 12488\n",
|
|
178
174
|
" seed hash : 37836\n",
|
|
175
|
+
" empty? : false\n",
|
|
179
176
|
" ordered? : false\n",
|
|
177
|
+
" estimation mode? : true\n",
|
|
180
178
|
" theta (fraction) : 0.0123336\n",
|
|
181
179
|
" theta (raw 64-bit) : 113757656857900725\n",
|
|
182
|
-
" estimation mode? : true\n",
|
|
183
180
|
" estimate : 1.01252e+06\n",
|
|
184
181
|
" lower bound 95% conf : 994626\n",
|
|
185
182
|
" upper bound 95% conf : 1.03073e+06\n",
|
|
183
|
+
" lg nominal size : 13\n",
|
|
184
|
+
" lg current size : 14\n",
|
|
185
|
+
" resize factor : 8\n",
|
|
186
186
|
"### End sketch summary\n",
|
|
187
187
|
"\n"
|
|
188
188
|
]
|
|
@@ -255,13 +255,14 @@
|
|
|
255
255
|
"output_type": "stream",
|
|
256
256
|
"text": [
|
|
257
257
|
"Has result: True\n",
|
|
258
|
-
"###
|
|
259
|
-
" num retained
|
|
258
|
+
"### Theta sketch summary:\n",
|
|
259
|
+
" num retained entries : 1668\n",
|
|
260
260
|
" seed hash : 37836\n",
|
|
261
|
+
" empty? : false\n",
|
|
261
262
|
" ordered? : true\n",
|
|
263
|
+
" estimation mode? : true\n",
|
|
262
264
|
" theta (fraction) : 0.00654224\n",
|
|
263
265
|
" theta (raw 64-bit) : 60341508738660257\n",
|
|
264
|
-
" estimation mode? : true\n",
|
|
265
266
|
" estimate : 254959\n",
|
|
266
267
|
" lower bound 95% conf : 242739\n",
|
|
267
268
|
" upper bound 95% conf : 267789\n",
|
|
@@ -326,13 +327,14 @@
|
|
|
326
327
|
"name": "stdout",
|
|
327
328
|
"output_type": "stream",
|
|
328
329
|
"text": [
|
|
329
|
-
"###
|
|
330
|
-
" num retained
|
|
330
|
+
"### Theta sketch summary:\n",
|
|
331
|
+
" num retained entries : 4892\n",
|
|
331
332
|
" seed hash : 37836\n",
|
|
333
|
+
" empty? : false\n",
|
|
332
334
|
" ordered? : true\n",
|
|
335
|
+
" estimation mode? : true\n",
|
|
333
336
|
" theta (fraction) : 0.00654224\n",
|
|
334
337
|
" theta (raw 64-bit) : 60341508738660257\n",
|
|
335
|
-
" estimation mode? : true\n",
|
|
336
338
|
" estimate : 747756\n",
|
|
337
339
|
" lower bound 95% conf : 726670\n",
|
|
338
340
|
" upper bound 95% conf : 769452\n",
|
|
@@ -374,7 +376,7 @@
|
|
|
374
376
|
],
|
|
375
377
|
"metadata": {
|
|
376
378
|
"kernelspec": {
|
|
377
|
-
"display_name": "Python 3",
|
|
379
|
+
"display_name": "Python 3.10.6 64-bit",
|
|
378
380
|
"language": "python",
|
|
379
381
|
"name": "python3"
|
|
380
382
|
},
|
|
@@ -388,7 +390,12 @@
|
|
|
388
390
|
"name": "python",
|
|
389
391
|
"nbconvert_exporter": "python",
|
|
390
392
|
"pygments_lexer": "ipython3",
|
|
391
|
-
"version": "3.
|
|
393
|
+
"version": "3.10.6"
|
|
394
|
+
},
|
|
395
|
+
"vscode": {
|
|
396
|
+
"interpreter": {
|
|
397
|
+
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
|
|
398
|
+
}
|
|
392
399
|
}
|
|
393
400
|
},
|
|
394
401
|
"nbformat": 4,
|
|
@@ -1,3 +1,21 @@
|
|
|
1
|
+
:: Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
:: or more contributor license agreements. See the NOTICE file
|
|
3
|
+
:: distributed with this work for additional information
|
|
4
|
+
:: regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
:: to you under the Apache License, Version 2.0 (the
|
|
6
|
+
:: "License"); you may not use this file except in compliance
|
|
7
|
+
:: with the License. You may obtain a copy of the License at
|
|
8
|
+
::
|
|
9
|
+
:: http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
::
|
|
11
|
+
:: Unless required by applicable law or agreed to in writing,
|
|
12
|
+
:: software distributed under the License is distributed on an
|
|
13
|
+
:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
:: KIND, either express or implied. See the License for the
|
|
15
|
+
:: specific language governing permissions and limitations
|
|
16
|
+
:: under the License.
|
|
17
|
+
|
|
18
|
+
|
|
1
19
|
@echo off
|
|
2
20
|
:: Takes path to the Python interpreter and returns the path to pybind11
|
|
3
21
|
%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
|
|
@@ -1,2 +1,18 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
1
18
|
name = "datasketches"
|
|
2
|
-
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
|
|
22
22
|
namespace py = pybind11;
|
|
23
23
|
|
|
24
|
+
// sketches
|
|
24
25
|
void init_hll(py::module& m);
|
|
25
26
|
void init_kll(py::module& m);
|
|
26
27
|
void init_fi(py::module& m);
|
|
@@ -29,10 +30,13 @@ void init_theta(py::module& m);
|
|
|
29
30
|
void init_vo(py::module& m);
|
|
30
31
|
void init_req(py::module& m);
|
|
31
32
|
void init_quantiles(py::module& m);
|
|
32
|
-
void init_kolmogorov_smirnov(py::module& m);
|
|
33
33
|
void init_vector_of_kll(py::module& m);
|
|
34
34
|
|
|
35
|
-
|
|
35
|
+
// supporting objects
|
|
36
|
+
void init_kolmogorov_smirnov(py::module& m);
|
|
37
|
+
void init_serde(py::module& m);
|
|
38
|
+
|
|
39
|
+
PYBIND11_MODULE(_datasketches, m) {
|
|
36
40
|
init_hll(m);
|
|
37
41
|
init_kll(m);
|
|
38
42
|
init_fi(m);
|
|
@@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) {
|
|
|
41
45
|
init_vo(m);
|
|
42
46
|
init_req(m);
|
|
43
47
|
init_quantiles(m);
|
|
44
|
-
init_kolmogorov_smirnov(m);
|
|
45
48
|
init_vector_of_kll(m);
|
|
49
|
+
|
|
50
|
+
init_kolmogorov_smirnov(m);
|
|
51
|
+
init_serde(m);
|
|
46
52
|
}
|