datasketches 0.2.7 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +63 -68
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +15 -6
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -22,14 +22,11 @@
|
|
22
22
|
#include <kll_sketch.hpp>
|
23
23
|
#include <kll_helper.hpp>
|
24
24
|
|
25
|
-
#include <assert.h>
|
26
|
-
|
27
25
|
#ifdef KLL_VALIDATION
|
28
26
|
|
29
27
|
// This is to make sure the implementation matches exactly the reference implementation in OCaml.
|
30
|
-
// Conditional compilation is used because the implementation needs
|
31
|
-
//
|
32
|
-
// - a few methods to expose internals of the sketch
|
28
|
+
// Conditional compilation is used because the implementation needs
|
29
|
+
// to switch from random choice to deterministic
|
33
30
|
|
34
31
|
namespace datasketches {
|
35
32
|
|
@@ -154,11 +151,11 @@ const int64_t correct_results[num_tests * 7] = {
|
|
154
151
|
113, 200, 8311133, 6554171, 16, 637, 121111429906734123
|
155
152
|
};
|
156
153
|
|
157
|
-
static std::
|
158
|
-
|
159
|
-
unsigned mask(
|
160
|
-
unsigned cur
|
161
|
-
std::
|
154
|
+
static std::vector<int> make_input_array(unsigned n, unsigned stride) {
|
155
|
+
if (!kll_helper::is_odd(stride)) throw std::logic_error("stride must be odd");
|
156
|
+
unsigned mask = (1 << 23) - 1; // because items are single-precision floats at the moment
|
157
|
+
unsigned cur = 0;
|
158
|
+
std::vector<int> arr(n, 0);
|
162
159
|
for (unsigned i = 0; i < n; i++) {
|
163
160
|
cur += stride;
|
164
161
|
cur &= mask;
|
@@ -167,50 +164,63 @@ static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
|
|
167
164
|
return arr;
|
168
165
|
}
|
169
166
|
|
170
|
-
|
171
|
-
|
172
|
-
int64_t
|
173
|
-
int64_t
|
174
|
-
|
175
|
-
|
167
|
+
/**
 * Folds a range of (item, weight) pairs into a 60-bit hash and derives a
 * level count from the weights.
 *
 * For every element the first member is added to the accumulator, which is
 * then multiplied by a fixed odd constant, masked to 60 bits and xor-folded.
 * The level of an element is the number of trailing zero bits of its second
 * member; the returned level count is the highest level seen plus one
 * (and at least 1, also for an empty range).
 *
 * The arithmetic is carried out on unsigned 64-bit integers: the product of a
 * 60-bit accumulator and a 30-bit multiplier does not fit into int64_t, and
 * signed overflow is undefined behaviour. Unsigned wrap-around is well defined
 * and yields the same low 60 bits.
 *
 * @param from iterator to the first (item, weight) pair
 * @param to   iterator one past the last pair
 * @return pair of (hash, number of levels)
 */
template<typename It>
std::pair<int64_t, uint8_t> hash_samples_and_count_levels(It from, It to) {
  const uint64_t multiplier = 738219921; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  uint64_t accum = 0;
  uint8_t num_levels = 1;
  for (auto it = from; it != to; ++it) {
    // convert through int64_t first so the item is truncated exactly as before
    accum += static_cast<uint64_t>(static_cast<int64_t>((*it).first));
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
    const uint8_t level = count_trailing_zeros_in_u64((*it).second);
    if (num_levels <= level) num_levels = static_cast<uint8_t>(level + 1);
  }
  // accum is below 2^60 here, so the conversion to int64_t is value-preserving
  return std::pair<int64_t, uint8_t>(static_cast<int64_t>(accum), num_levels);
}
|
182
183
|
|
183
184
|
// Replays every row of correct_results: builds the input for the row, feeds it
// to a fresh sketch with deterministic offsets, and compares the number of
// levels, the number of retained samples and the sample hash with the
// reference values stored in the row.
TEST_CASE("kll validation", "[kll_sketch][validation]") {
  for (unsigned i = 0; i < num_tests; i++) {
    // each row holds: test number, k, n, stride, num levels, num samples, hash
    const int64_t* expected = correct_results + 7 * i;
    if (expected[0] != i) throw std::logic_error("test number mismatch");
    unsigned k = expected[1];
    unsigned n = expected[2];
    unsigned stride = expected[3];
    auto input_array = make_input_array(n, stride);

    kll_sketch<float> sketch(k);
    kll_next_offset = 0; // restart the deterministic offset sequence for this run
    for (unsigned j = 0; j != n; ++j) sketch.update(input_array[j]);

    unsigned num_samples = sketch.get_num_retained();
    auto p = hash_samples_and_count_levels(sketch.begin(), sketch.end());

    std::cout << i;
    REQUIRE(expected[4] == p.second);
    REQUIRE(expected[5] == num_samples);
    if (expected[6] == p.first) {
      std::cout << " pass" << std::endl;
      continue;
    }
    // hash mismatch: show both values and the sketch before failing
    std::cout << " " << (expected[6]) << " != " << p.first << "\n";
    std::cout << sketch.to_string();
    FAIL();
  }
}
|
210
210
|
|
211
|
-
TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
|
212
|
-
float array[] = {
|
213
|
-
|
211
|
+
// Checks hash_samples_and_count_levels() against a fixed reference value using
// a sub-range of a small hand-written array of (item, weight) pairs.
// The tag is spelled "[validation]" so that tag filters select this test too.
TEST_CASE("kll validation: test hash and num levels", "[kll_sketch][validation]") {
  std::pair<float, uint64_t> array[] = {
    std::make_pair(907500, 1),
    std::make_pair(944104, 1),
    std::make_pair(807020, 2),
    std::make_pair(219921, 2),
    std::make_pair(678370, 2),
    std::make_pair(955217, 4),
    std::make_pair(426885, 8)
  };
  // only elements 1..5 are hashed; the largest weight in that range is 4,
  // which has two trailing zero bits, hence three levels
  auto hash_and_num_levels = hash_samples_and_count_levels(array + 1, array + 6);
  REQUIRE(hash_and_num_levels.first == 1141543353991880193LL);
  REQUIRE(hash_and_num_levels.second == 3);
}
|
215
225
|
|
216
226
|
TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
|
@@ -1,18 +1,23 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
1
18
|
[build-system]
|
2
19
|
requires = ["wheel",
|
3
20
|
"setuptools >= 30.3.0",
|
4
21
|
"cmake >= 3.16",
|
5
22
|
"pybind11[global] >= 2.6.0"]
|
6
23
|
build-backend = "setuptools.build_meta"
|
7
|
-
|
8
|
-
[tool.tox]
|
9
|
-
legacy_tox_ini = """
|
10
|
-
[tox]
|
11
|
-
envlist = py3
|
12
|
-
|
13
|
-
[testenv]
|
14
|
-
deps = pytest
|
15
|
-
numpy
|
16
|
-
changedir = python/tests
|
17
|
-
commands = pytest
|
18
|
-
"""
|
@@ -50,7 +50,13 @@ target_link_libraries(python
|
|
50
50
|
|
51
51
|
set_target_properties(python PROPERTIES
|
52
52
|
PREFIX ""
|
53
|
-
OUTPUT_NAME
|
53
|
+
OUTPUT_NAME _datasketches
|
54
|
+
)
|
55
|
+
|
56
|
+
target_include_directories(python
|
57
|
+
PUBLIC
|
58
|
+
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
59
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
54
60
|
)
|
55
61
|
|
56
62
|
# ensure we make a .so on Mac rather than .dylib
|
@@ -71,4 +77,5 @@ target_sources(python
|
|
71
77
|
src/quantiles_wrapper.cpp
|
72
78
|
src/ks_wrapper.cpp
|
73
79
|
src/vector_of_kll.cpp
|
80
|
+
src/py_serde.cpp
|
74
81
|
)
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
from _datasketches import PyObjectSerDe
|
19
|
+
|
20
|
+
import struct
|
21
|
+
|
22
|
+
# This file provides several Python SerDe implementation examples.
|
23
|
+
#
|
24
|
+
# Each implementation must extend the PyObjectSerDe class and define
|
25
|
+
# three methods:
|
26
|
+
# * get_size(item) returns an int of the number of bytes needed to
|
27
|
+
# serialize the given item
|
28
|
+
# * to_bytes(item) returns a bytes object representing a serialized
|
29
|
+
# version of the given item
|
30
|
+
# * from_bytes(data, offset) takes a bytes object (data) and an offset
|
31
|
+
# indicating where in the data array to start reading. The method
|
32
|
+
# returns a tuple with the newly reconstructed object and the
|
33
|
+
# total number of bytes beyond the offset read from the input data.
|
34
|
+
|
35
|
+
# Implements a simple string-encoding scheme where a string is
# written as <num_bytes> <string_contents>, with no null termination.
# <num_bytes> is a 4-byte little-endian count of the UTF-8 encoded
# bytes that follow. This format allows pre-allocating each string,
# at the cost of additional storage. Using this format, the serialized
# string consumes 4 + len(item.encode('utf-8')) bytes.
class PyStringsSerDe(PyObjectSerDe):
  def get_size(self, item):
    # Size of the length prefix plus the encoded payload. The payload is
    # measured in bytes, not characters, so non-ASCII strings are sized
    # correctly.
    return int(4 + len(item.encode('utf-8')))

  def to_bytes(self, item: str):
    # Encode once so the prefix always matches the bytes actually written.
    encoded = item.encode('utf-8')
    return len(encoded).to_bytes(4, 'little') + encoded

  def from_bytes(self, data: bytes, offset: int):
    # Returns (string, total bytes consumed starting at offset).
    # Raises IndexError if the buffer is too short for the prefix or payload.
    start = offset + 4
    if start > len(data):
      raise IndexError(f'buffer too small to read a 4-byte length prefix at offset {offset}')
    # The prefix is 4 bytes wide, matching what to_bytes() writes.
    num_chars = int.from_bytes(data[offset:start], 'little')
    if (num_chars < 0 or start + num_chars > len(data)):
      raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}')
    text = data[start:start+num_chars].decode('utf-8')
    return (text, 4+num_chars)
|
56
|
+
|
57
|
+
# Implements an integer-encoding scheme where each integer is written
# as a 32-bit (4 byte) little-endian value.
class PyIntsSerDe(PyObjectSerDe):
  def get_size(self, item):
    # Every item occupies exactly 4 bytes, regardless of its value.
    return int(4)

  def to_bytes(self, item):
    # '<i' pins the byte order to little-endian and the size to 4 bytes,
    # independent of the host platform.
    return struct.pack('<i', item)

  def from_bytes(self, data: bytes, offset: int):
    # Returns (value, bytes consumed).
    val = struct.unpack_from('<i', data, offset)[0]
    return (val, 4)
|
69
|
+
|
70
|
+
|
71
|
+
# Implements an integer-encoding scheme where each integer is written
# as a 64-bit (8 byte) little-endian value.
class PyLongsSerDe(PyObjectSerDe):
  def get_size(self, item):
    # Every item occupies exactly 8 bytes, regardless of its value.
    return int(8)

  def to_bytes(self, item):
    # '<q' is always 8 bytes. The native 'l' code is only 4 bytes on some
    # platforms (e.g. Windows), which would contradict get_size().
    return struct.pack('<q', item)

  def from_bytes(self, data: bytes, offset: int):
    # Returns (value, bytes consumed).
    val = struct.unpack_from('<q', data, offset)[0]
    return (val, 8)
|
81
|
+
|
82
|
+
|
83
|
+
# Serializes each item as a single-precision float (struct code 'f'),
# which always occupies 4 bytes.
class PyFloatsSerDe(PyObjectSerDe):
  # Precompiled codec shared by all instances.
  _CODEC = struct.Struct('f')

  def get_size(self, item):
    # Fixed width: the item itself is not inspected.
    return 4

  def to_bytes(self, item):
    return self._CODEC.pack(item)

  def from_bytes(self, data: bytes, offset: int):
    # Returns (value, bytes consumed).
    (value,) = self._CODEC.unpack_from(data, offset)
    return (value, 4)
|
93
|
+
|
94
|
+
|
95
|
+
# Serializes each item as a double-precision float (struct code 'd'),
# which always occupies 8 bytes.
class PyDoublesSerDe(PyObjectSerDe):
  # Precompiled codec shared by all instances.
  _CODEC = struct.Struct('d')

  def get_size(self, item):
    # Fixed width: the item itself is not inspected.
    return 8

  def to_bytes(self, item):
    return self._CODEC.pack(item)

  def from_bytes(self, data: bytes, offset: int):
    # Returns (value, bytes consumed).
    (value,) = self._CODEC.unpack_from(data, offset)
    return (value, 8)
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
name = 'datasketches'
|
19
|
+
|
20
|
+
from .PySerDe import *
|
21
|
+
|
22
|
+
from _datasketches import *
|
@@ -0,0 +1,113 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <pybind11/pybind11.h>
|
21
|
+
#include <pybind11/functional.h>
|
22
|
+
#include <sstream>
|
23
|
+
|
24
|
+
#ifndef _PY_SERDE_HPP_
|
25
|
+
#define _PY_SERDE_HPP_
|
26
|
+
|
27
|
+
namespace py = pybind11;
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
/**
|
32
|
+
* @brief The py_object_serde is an abstract class that implements the
|
33
|
+
* datasketches serde interface, and is used to allow custom Python
|
34
|
+
* serialization of items wrapped as generic py::object types. The actual
|
35
|
+
* Python implementation classes must extend the PyObjectSerDe class.
|
36
|
+
*/
|
37
|
+
struct py_object_serde {
|
38
|
+
/**
|
39
|
+
* @brief Get the serialized size of an object, in bytes
|
40
|
+
*
|
41
|
+
* @param item A provided item
|
42
|
+
* @return int64_t The serialized size of the item, in bytes
|
43
|
+
*/
|
44
|
+
virtual int64_t get_size(const py::object& item) const = 0;
|
45
|
+
|
46
|
+
/**
|
47
|
+
* @brief Serializes an item to a bytes object
|
48
|
+
*
|
49
|
+
* @param item A provided item
|
50
|
+
* @return The serialized image of the item as a Python bytes object
|
51
|
+
*/
|
52
|
+
virtual py::bytes to_bytes(const py::object& item) const = 0;
|
53
|
+
|
54
|
+
/**
|
55
|
+
* @brief Constructs an object from a serialized image, reading the
|
56
|
+
* incoming buffer starting at the specified offset.
|
57
|
+
*
|
58
|
+
* @param bytes A buffer containing items from a serialized sketch
|
59
|
+
* @param offset The starting offset into the bytes buffer
|
60
|
+
* @return A Python tuple of the reconstructed item and the total number of bytes read
|
61
|
+
*/
|
62
|
+
virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0;
|
63
|
+
|
64
|
+
virtual ~py_object_serde() = default;
|
65
|
+
|
66
|
+
// these methods are required by the serde interface; see common/include/serde.hpp for
|
67
|
+
// default implementations for C++ std::string and numeric types.
|
68
|
+
size_t size_of_item(const py::object& item) const;
|
69
|
+
size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const;
|
70
|
+
size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const;
|
71
|
+
};
|
72
|
+
|
73
|
+
/**
|
74
|
+
* @brief The PyObjectSerDe class provides a concrete base class
|
75
|
+
* that pybind11 uses as a "trampoline" to pass calls through to
|
76
|
+
* the abstract py_object_serde class. Custom Python serde implementations
|
77
|
+
* must extend this class.
|
78
|
+
*/
|
79
|
+
struct PyObjectSerDe : public py_object_serde {
|
80
|
+
using py_object_serde::py_object_serde;
|
81
|
+
|
82
|
+
// trampoline definitions -- need one for each virtual function
|
83
|
+
int64_t get_size(const py::object& item) const override {
|
84
|
+
PYBIND11_OVERRIDE_PURE(
|
85
|
+
int64_t, // Return type
|
86
|
+
py_object_serde, // Parent class
|
87
|
+
get_size, // Name of function in C++ (must match Python name)
|
88
|
+
item // Argument(s)
|
89
|
+
);
|
90
|
+
}
|
91
|
+
|
92
|
+
py::bytes to_bytes(const py::object& item) const override {
|
93
|
+
PYBIND11_OVERRIDE_PURE(
|
94
|
+
py::bytes, // Return type
|
95
|
+
py_object_serde, // Parent class
|
96
|
+
to_bytes, // Name of function in C++ (must match Python name)
|
97
|
+
item // Argument(s)
|
98
|
+
);
|
99
|
+
}
|
100
|
+
|
101
|
+
py::tuple from_bytes(py::bytes& bytes, size_t offset) const override {
|
102
|
+
PYBIND11_OVERRIDE_PURE(
|
103
|
+
py::tuple, // Return type
|
104
|
+
py_object_serde, // Parent class
|
105
|
+
from_bytes, // Name of function in C++ (must match Python name)
|
106
|
+
bytes, offset // Argument(s)
|
107
|
+
);
|
108
|
+
}
|
109
|
+
};
|
110
|
+
|
111
|
+
}
|
112
|
+
|
113
|
+
#endif // _PY_SERDE_HPP_
|
@@ -40,20 +40,20 @@
|
|
40
40
|
"name": "stdout",
|
41
41
|
"output_type": "stream",
|
42
42
|
"text": [
|
43
|
-
"###
|
44
|
-
"
|
45
|
-
" lg current size : 13\n",
|
46
|
-
" num retained keys : 6560\n",
|
47
|
-
" resize factor : 8\n",
|
48
|
-
" sampling probability : 1\n",
|
43
|
+
"### Theta sketch summary:\n",
|
44
|
+
" num retained entries : 6560\n",
|
49
45
|
" seed hash : 37836\n",
|
46
|
+
" empty? : false\n",
|
50
47
|
" ordered? : false\n",
|
48
|
+
" estimation mode? : true\n",
|
51
49
|
" theta (fraction) : 0.00654224\n",
|
52
50
|
" theta (raw 64-bit) : 60341508738660257\n",
|
53
|
-
" estimation mode? : true\n",
|
54
51
|
" estimate : 1.00271e+06\n",
|
55
52
|
" lower bound 95% conf : 978261\n",
|
56
53
|
" upper bound 95% conf : 1.02778e+06\n",
|
54
|
+
" lg nominal size : 12\n",
|
55
|
+
" lg current size : 13\n",
|
56
|
+
" resize factor : 8\n",
|
57
57
|
"### End sketch summary\n",
|
58
58
|
"\n"
|
59
59
|
]
|
@@ -100,7 +100,7 @@
|
|
100
100
|
"cell_type": "markdown",
|
101
101
|
"metadata": {},
|
102
102
|
"source": [
|
103
|
-
"We can serialize and reconstruct the sketch.
|
103
|
+
"We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but can not be updated directly."
|
104
104
|
]
|
105
105
|
},
|
106
106
|
{
|
@@ -139,7 +139,7 @@
|
|
139
139
|
}
|
140
140
|
],
|
141
141
|
"source": [
|
142
|
-
"new_sk1 =
|
142
|
+
"new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n",
|
143
143
|
"print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n",
|
144
144
|
"print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())"
|
145
145
|
]
|
@@ -169,20 +169,20 @@
|
|
169
169
|
"name": "stdout",
|
170
170
|
"output_type": "stream",
|
171
171
|
"text": [
|
172
|
-
"###
|
173
|
-
"
|
174
|
-
" lg current size : 14\n",
|
175
|
-
" num retained keys : 12488\n",
|
176
|
-
" resize factor : 8\n",
|
177
|
-
" sampling probability : 1\n",
|
172
|
+
"### Theta sketch summary:\n",
|
173
|
+
" num retained entries : 12488\n",
|
178
174
|
" seed hash : 37836\n",
|
175
|
+
" empty? : false\n",
|
179
176
|
" ordered? : false\n",
|
177
|
+
" estimation mode? : true\n",
|
180
178
|
" theta (fraction) : 0.0123336\n",
|
181
179
|
" theta (raw 64-bit) : 113757656857900725\n",
|
182
|
-
" estimation mode? : true\n",
|
183
180
|
" estimate : 1.01252e+06\n",
|
184
181
|
" lower bound 95% conf : 994626\n",
|
185
182
|
" upper bound 95% conf : 1.03073e+06\n",
|
183
|
+
" lg nominal size : 13\n",
|
184
|
+
" lg current size : 14\n",
|
185
|
+
" resize factor : 8\n",
|
186
186
|
"### End sketch summary\n",
|
187
187
|
"\n"
|
188
188
|
]
|
@@ -255,13 +255,14 @@
|
|
255
255
|
"output_type": "stream",
|
256
256
|
"text": [
|
257
257
|
"Has result: True\n",
|
258
|
-
"###
|
259
|
-
" num retained
|
258
|
+
"### Theta sketch summary:\n",
|
259
|
+
" num retained entries : 1668\n",
|
260
260
|
" seed hash : 37836\n",
|
261
|
+
" empty? : false\n",
|
261
262
|
" ordered? : true\n",
|
263
|
+
" estimation mode? : true\n",
|
262
264
|
" theta (fraction) : 0.00654224\n",
|
263
265
|
" theta (raw 64-bit) : 60341508738660257\n",
|
264
|
-
" estimation mode? : true\n",
|
265
266
|
" estimate : 254959\n",
|
266
267
|
" lower bound 95% conf : 242739\n",
|
267
268
|
" upper bound 95% conf : 267789\n",
|
@@ -326,13 +327,14 @@
|
|
326
327
|
"name": "stdout",
|
327
328
|
"output_type": "stream",
|
328
329
|
"text": [
|
329
|
-
"###
|
330
|
-
" num retained
|
330
|
+
"### Theta sketch summary:\n",
|
331
|
+
" num retained entries : 4892\n",
|
331
332
|
" seed hash : 37836\n",
|
333
|
+
" empty? : false\n",
|
332
334
|
" ordered? : true\n",
|
335
|
+
" estimation mode? : true\n",
|
333
336
|
" theta (fraction) : 0.00654224\n",
|
334
337
|
" theta (raw 64-bit) : 60341508738660257\n",
|
335
|
-
" estimation mode? : true\n",
|
336
338
|
" estimate : 747756\n",
|
337
339
|
" lower bound 95% conf : 726670\n",
|
338
340
|
" upper bound 95% conf : 769452\n",
|
@@ -374,7 +376,7 @@
|
|
374
376
|
],
|
375
377
|
"metadata": {
|
376
378
|
"kernelspec": {
|
377
|
-
"display_name": "Python 3",
|
379
|
+
"display_name": "Python 3.10.6 64-bit",
|
378
380
|
"language": "python",
|
379
381
|
"name": "python3"
|
380
382
|
},
|
@@ -388,7 +390,12 @@
|
|
388
390
|
"name": "python",
|
389
391
|
"nbconvert_exporter": "python",
|
390
392
|
"pygments_lexer": "ipython3",
|
391
|
-
"version": "3.
|
393
|
+
"version": "3.10.6"
|
394
|
+
},
|
395
|
+
"vscode": {
|
396
|
+
"interpreter": {
|
397
|
+
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
|
398
|
+
}
|
392
399
|
}
|
393
400
|
},
|
394
401
|
"nbformat": 4,
|
@@ -1,3 +1,21 @@
|
|
1
|
+
:: Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
:: or more contributor license agreements. See the NOTICE file
|
3
|
+
:: distributed with this work for additional information
|
4
|
+
:: regarding copyright ownership. The ASF licenses this file
|
5
|
+
:: to you under the Apache License, Version 2.0 (the
|
6
|
+
:: "License"); you may not use this file except in compliance
|
7
|
+
:: with the License. You may obtain a copy of the License at
|
8
|
+
::
|
9
|
+
:: http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
::
|
11
|
+
:: Unless required by applicable law or agreed to in writing,
|
12
|
+
:: software distributed under the License is distributed on an
|
13
|
+
:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
:: KIND, either express or implied. See the License for the
|
15
|
+
:: specific language governing permissions and limitations
|
16
|
+
:: under the License.
|
17
|
+
|
18
|
+
|
1
19
|
@echo off
|
2
20
|
:: Takes path to the Python interpreter and returns the path to pybind11
|
3
21
|
%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
|
@@ -1,2 +1,18 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
1
18
|
name = "datasketches"
|
2
|
-
|
@@ -21,6 +21,7 @@
|
|
21
21
|
|
22
22
|
namespace py = pybind11;
|
23
23
|
|
24
|
+
// sketches
|
24
25
|
void init_hll(py::module& m);
|
25
26
|
void init_kll(py::module& m);
|
26
27
|
void init_fi(py::module& m);
|
@@ -29,10 +30,13 @@ void init_theta(py::module& m);
|
|
29
30
|
void init_vo(py::module& m);
|
30
31
|
void init_req(py::module& m);
|
31
32
|
void init_quantiles(py::module& m);
|
32
|
-
void init_kolmogorov_smirnov(py::module& m);
|
33
33
|
void init_vector_of_kll(py::module& m);
|
34
34
|
|
35
|
-
|
35
|
+
// supporting objects
|
36
|
+
void init_kolmogorov_smirnov(py::module& m);
|
37
|
+
void init_serde(py::module& m);
|
38
|
+
|
39
|
+
PYBIND11_MODULE(_datasketches, m) {
|
36
40
|
init_hll(m);
|
37
41
|
init_kll(m);
|
38
42
|
init_fi(m);
|
@@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) {
|
|
41
45
|
init_vo(m);
|
42
46
|
init_req(m);
|
43
47
|
init_quantiles(m);
|
44
|
-
init_kolmogorov_smirnov(m);
|
45
48
|
init_vector_of_kll(m);
|
49
|
+
|
50
|
+
init_kolmogorov_smirnov(m);
|
51
|
+
init_serde(m);
|
46
52
|
}
|