datasketches 0.2.2 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +8 -8
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
- data/vendor/datasketches-cpp/python/README.md +57 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
- data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
- metadata +34 -12
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -22,6 +22,7 @@
|
|
22
22
|
#include <cstring>
|
23
23
|
#include <sstream>
|
24
24
|
#include <fstream>
|
25
|
+
#include <stdexcept>
|
25
26
|
|
26
27
|
#include <kll_sketch.hpp>
|
27
28
|
#include <test_allocator.hpp>
|
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
90
91
|
REQUIRE(sketch.get_n() == 1);
|
91
92
|
REQUIRE(sketch.get_num_retained() == 1);
|
92
93
|
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
94
|
+
REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
|
93
95
|
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
96
|
+
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
|
94
97
|
REQUIRE(sketch.get_min_value() == 1.0);
|
95
98
|
REQUIRE(sketch.get_max_value() == 1.0);
|
96
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
142
145
|
REQUIRE(quantiles[2] == n - 1 );
|
143
146
|
|
144
147
|
for (uint32_t i = 0; i < n; i++) {
|
145
|
-
const double
|
146
|
-
REQUIRE(sketch.get_rank(static_cast<float>(i)) ==
|
148
|
+
const double true_rank = (double) i / n;
|
149
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
|
150
|
+
const double true_rank_inclusive = (double) (i + 1) / n;
|
151
|
+
REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
|
147
152
|
}
|
148
153
|
|
149
154
|
// the alternative method must produce the same result
|
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
241
246
|
sketch.update(static_cast<float>(i));
|
242
247
|
values[i] = static_cast<float>(i);
|
243
248
|
}
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
249
|
+
{ // inclusive=false (default)
|
250
|
+
const auto ranks(sketch.get_CDF(values, n));
|
251
|
+
const auto pmf(sketch.get_PMF(values, n));
|
252
|
+
|
253
|
+
double subtotal_pmf = 0;
|
254
|
+
for (int i = 0; i < n; i++) {
|
255
|
+
if (sketch.get_rank(values[i]) != ranks[i]) {
|
256
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
257
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
258
|
+
}
|
259
|
+
subtotal_pmf += pmf[i];
|
260
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
261
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
262
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
263
|
+
}
|
253
264
|
}
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
265
|
+
}
|
266
|
+
{ // inclusive=true
|
267
|
+
const auto ranks(sketch.get_CDF<true>(values, n));
|
268
|
+
const auto pmf(sketch.get_PMF<true>(values, n));
|
269
|
+
|
270
|
+
double subtotal_pmf = 0;
|
271
|
+
for (int i = 0; i < n; i++) {
|
272
|
+
if (sketch.get_rank<true>(values[i]) != ranks[i]) {
|
273
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
274
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
275
|
+
}
|
276
|
+
subtotal_pmf += pmf[i];
|
277
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
278
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
279
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
280
|
+
}
|
258
281
|
}
|
259
282
|
}
|
260
283
|
}
|
@@ -279,6 +302,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
279
302
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
280
303
|
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
281
304
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
305
|
+
REQUIRE(s.tellg() == s.tellp());
|
282
306
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
283
307
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
284
308
|
REQUIRE(sketch2.get_n() == sketch.get_n());
|
@@ -292,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
292
316
|
SECTION("bytes serialize deserialize empty") {
|
293
317
|
kll_float_sketch sketch(200, 0);
|
294
318
|
auto bytes = sketch.serialize();
|
295
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
319
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
296
320
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
297
321
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
298
322
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -304,13 +328,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
304
328
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
305
329
|
}
|
306
330
|
|
307
|
-
SECTION("serialize deserialize one item") {
|
331
|
+
SECTION("stream serialize deserialize one item") {
|
308
332
|
kll_float_sketch sketch(200, 0);
|
309
333
|
sketch.update(1.0f);
|
310
334
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
311
335
|
sketch.serialize(s);
|
312
336
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
313
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
337
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
314
338
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
315
339
|
REQUIRE(s.tellg() == s.tellp());
|
316
340
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -324,11 +348,29 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
324
348
|
REQUIRE(sketch2.get_rank(2) == 1.0);
|
325
349
|
}
|
326
350
|
|
351
|
+
SECTION("bytes serialize deserialize one item") {
|
352
|
+
kll_float_sketch sketch(200, 0);
|
353
|
+
sketch.update(1.0f);
|
354
|
+
auto bytes = sketch.serialize();
|
355
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
356
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
357
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
358
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
359
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
360
|
+
REQUIRE(sketch2.get_n() == 1);
|
361
|
+
REQUIRE(sketch2.get_num_retained() == 1);
|
362
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
363
|
+
REQUIRE(sketch2.get_max_value() == 1.0);
|
364
|
+
REQUIRE(sketch2.get_quantile(0.5) == 1.0);
|
365
|
+
REQUIRE(sketch2.get_rank(1) == 0.0);
|
366
|
+
REQUIRE(sketch2.get_rank(2) == 1.0);
|
367
|
+
}
|
368
|
+
|
327
369
|
SECTION("deserialize one item v1") {
|
328
370
|
std::ifstream is;
|
329
371
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
330
372
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
331
|
-
auto sketch = kll_float_sketch::deserialize(is,
|
373
|
+
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
|
332
374
|
REQUIRE_FALSE(sketch.is_empty());
|
333
375
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
334
376
|
REQUIRE(sketch.get_n() == 1);
|
@@ -337,6 +379,42 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
337
379
|
REQUIRE(sketch.get_max_value() == 1.0);
|
338
380
|
}
|
339
381
|
|
382
|
+
SECTION("stream serialize deserialize three items") {
|
383
|
+
kll_float_sketch sketch(200, 0);
|
384
|
+
sketch.update(1.0f);
|
385
|
+
sketch.update(2.0f);
|
386
|
+
sketch.update(3.0f);
|
387
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
388
|
+
sketch.serialize(s);
|
389
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
390
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
391
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
392
|
+
REQUIRE(s.tellg() == s.tellp());
|
393
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
394
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
395
|
+
REQUIRE(sketch2.get_n() == 3);
|
396
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
397
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
398
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
399
|
+
}
|
400
|
+
|
401
|
+
SECTION("bytes serialize deserialize three items") {
|
402
|
+
kll_float_sketch sketch(200, 0);
|
403
|
+
sketch.update(1.0f);
|
404
|
+
sketch.update(2.0f);
|
405
|
+
sketch.update(3.0f);
|
406
|
+
auto bytes = sketch.serialize();
|
407
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
408
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
409
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
410
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
411
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
412
|
+
REQUIRE(sketch2.get_n() == 3);
|
413
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
414
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
415
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
416
|
+
}
|
417
|
+
|
340
418
|
SECTION("stream serialize deserialize many floats") {
|
341
419
|
kll_float_sketch sketch(200, 0);
|
342
420
|
const int n = 1000;
|
@@ -344,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
344
422
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
345
423
|
sketch.serialize(s);
|
346
424
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
347
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
425
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
348
426
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
349
427
|
REQUIRE(s.tellg() == s.tellp());
|
350
428
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
@@ -366,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
366
444
|
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
367
445
|
auto bytes = sketch.serialize();
|
368
446
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
369
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
447
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
370
448
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
371
449
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
372
450
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -623,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
623
701
|
|
624
702
|
auto bytes = sketch1.serialize();
|
625
703
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
626
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
704
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
627
705
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
628
706
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
629
707
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
@@ -644,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
644
722
|
sketch1.update("a");
|
645
723
|
auto bytes = sketch1.serialize();
|
646
724
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
647
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
725
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
648
726
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
649
727
|
}
|
650
728
|
|
@@ -702,6 +780,61 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
702
780
|
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
|
703
781
|
}
|
704
782
|
|
783
|
+
SECTION("issue #236") {
|
784
|
+
kll_sketch<int8_t> kll;
|
785
|
+
kll.update(1);
|
786
|
+
kll.update(2);
|
787
|
+
kll.update(3);
|
788
|
+
auto blob = kll.serialize();
|
789
|
+
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
790
|
+
}
|
791
|
+
|
792
|
+
SECTION("sorted view") {
|
793
|
+
kll_sketch<int> kll;
|
794
|
+
kll.update(2);
|
795
|
+
kll.update(3);
|
796
|
+
kll.update(1);
|
797
|
+
|
798
|
+
{ // non-cumulative, using operator->
|
799
|
+
auto view = kll.get_sorted_view(false);
|
800
|
+
REQUIRE(view.size() == 3);
|
801
|
+
auto it = view.begin();
|
802
|
+
REQUIRE(it->first == 1);
|
803
|
+
REQUIRE(it->second == 1);
|
804
|
+
++it;
|
805
|
+
REQUIRE(it->first == 2);
|
806
|
+
REQUIRE(it->second == 1);
|
807
|
+
++it;
|
808
|
+
REQUIRE(it->first == 3);
|
809
|
+
REQUIRE(it->second == 1);
|
810
|
+
}
|
811
|
+
{ // cumulative, non-inclusive, using operator->
|
812
|
+
auto view = kll.get_sorted_view(true);
|
813
|
+
REQUIRE(view.size() == 3);
|
814
|
+
auto it = view.begin();
|
815
|
+
REQUIRE(it->first == 1);
|
816
|
+
REQUIRE(it->second == 0);
|
817
|
+
++it;
|
818
|
+
REQUIRE(it->first == 2);
|
819
|
+
REQUIRE(it->second == 1);
|
820
|
+
++it;
|
821
|
+
REQUIRE(it->first == 3);
|
822
|
+
REQUIRE(it->second == 2);
|
823
|
+
}
|
824
|
+
{ // cumulative, inclusive, using operator*
|
825
|
+
auto view = kll.get_sorted_view<true>(true);
|
826
|
+
REQUIRE(view.size() == 3);
|
827
|
+
auto it = view.begin();
|
828
|
+
REQUIRE((*it).first == 1);
|
829
|
+
REQUIRE((*it).second == 1);
|
830
|
+
++it;
|
831
|
+
REQUIRE((*it).first == 2);
|
832
|
+
REQUIRE((*it).second == 2);
|
833
|
+
++it;
|
834
|
+
REQUIRE((*it).first == 3);
|
835
|
+
REQUIRE((*it).second == 3);
|
836
|
+
}
|
837
|
+
}
|
705
838
|
// cleanup
|
706
839
|
if (test_allocator_total_bytes != 0) {
|
707
840
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
|
|
46
46
|
sketch1.update(x);
|
47
47
|
sketch2.update(x);
|
48
48
|
}
|
49
|
-
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.
|
49
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
|
50
50
|
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
51
51
|
}
|
52
52
|
|
@@ -15,16 +15,24 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
set(PYBIND11_CPP_STANDARD /std:c++11)
|
18
|
+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
|
19
|
+
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
|
21
20
|
else()
|
22
|
-
|
21
|
+
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
|
22
|
+
endif()
|
23
|
+
|
24
|
+
# only Windows+MSVC seems to have trouble locating pybind11
|
25
|
+
if (MSVC)
|
26
|
+
execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}"
|
27
|
+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
28
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
29
|
+
OUTPUT_VARIABLE EXTRA_PACKAGE_PATH)
|
30
|
+
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH})
|
23
31
|
endif()
|
24
32
|
|
25
|
-
|
33
|
+
find_package(pybind11 CONFIG REQUIRED)
|
26
34
|
|
27
|
-
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL
|
35
|
+
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO)
|
28
36
|
|
29
37
|
target_link_libraries(python
|
30
38
|
PRIVATE
|
@@ -36,6 +44,7 @@ target_link_libraries(python
|
|
36
44
|
theta
|
37
45
|
sampling
|
38
46
|
req
|
47
|
+
quantiles
|
39
48
|
pybind11::module
|
40
49
|
)
|
41
50
|
|
@@ -59,5 +68,7 @@ target_sources(python
|
|
59
68
|
src/theta_wrapper.cpp
|
60
69
|
src/vo_wrapper.cpp
|
61
70
|
src/req_wrapper.cpp
|
71
|
+
src/quantiles_wrapper.cpp
|
72
|
+
src/ks_wrapper.cpp
|
62
73
|
src/vector_of_kll.cpp
|
63
74
|
)
|
@@ -1,76 +1,64 @@
|
|
1
|
-
|
1
|
+
<img src="https://raw.githubusercontent.com/apache/datasketches-website/master/logos/svg/datasketches-HorizontalColor-TM.svg" width="75%" alt="Apache DataSketches Logo">
|
2
2
|
|
3
|
-
|
3
|
+
# The Apache DataSketches Library for Python
|
4
4
|
|
5
|
-
|
6
|
-
from a relase package, you must ensure that the pybind11 directory points to a local copy of pybind11.
|
5
|
+
This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library.
|
7
6
|
|
8
|
-
|
7
|
+
In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis.
|
9
8
|
|
10
|
-
If
|
11
|
-
```pip install git+https://github.com/apache/datasketches-cpp.git```
|
9
|
+
If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms or sketches, that can produce results orders of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution.
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
### Building
|
16
|
-
|
17
|
-
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
|
18
|
-
```
|
19
|
-
git clone --recursive https://github.com/apache/datasketches-cpp.git
|
20
|
-
cd datasketches-cpp
|
21
|
-
python -m pip install --upgrade pip setuptools wheel numpy
|
22
|
-
python setup.py build
|
23
|
-
```
|
11
|
+
This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness).
|
24
12
|
|
25
|
-
|
13
|
+
## Building and Installation
|
26
14
|
|
27
|
-
|
15
|
+
Once cloned, the library can be installed by running `python -m pip install .` in the project root directory, which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
|
28
16
|
|
29
|
-
|
30
|
-
line of the build command with `python setup.py install`.
|
17
|
+
If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
31
18
|
|
32
|
-
|
33
|
-
|
34
|
-
The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
35
|
-
```
|
36
|
-
python -m pip install --upgrade pip setuptools wheel numpy tox
|
37
|
-
tox
|
38
|
-
```
|
19
|
+
The library is also available from PyPI via `python -m pip install datasketches`.
|
39
20
|
|
40
21
|
## Usage
|
41
22
|
|
42
|
-
Having installed the library, loading the Apache Datasketches
|
23
|
+
Having installed the library, loading the Apache Datasketches Library in Python is simple: `import datasketches`.
|
43
24
|
|
44
25
|
## Available Sketch Classes
|
45
26
|
|
46
27
|
- KLL (Absolute Error Quantiles)
|
47
|
-
|
48
|
-
|
28
|
+
- `kll_ints_sketch`
|
29
|
+
- `kll_floats_sketch`
|
30
|
+
- `kll_doubles_sketch`
|
31
|
+
- Quantiles (Absolute Error Quantiles, inferior algorithm)
|
32
|
+
- `quantiles_ints_sketch`
|
33
|
+
- `quantiles_floats_sketch`
|
34
|
+
- `quantiles_doubles_sketch`
|
49
35
|
- REQ (Relative Error Quantiles)
|
50
|
-
|
51
|
-
|
36
|
+
- `req_ints_sketch`
|
37
|
+
- `req_floats_sketch`
|
52
38
|
- Frequent Items
|
53
|
-
|
54
|
-
|
39
|
+
- `frequent_strings_sketch`
|
40
|
+
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
55
41
|
- Theta
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
42
|
+
- `update_theta_sketch`
|
43
|
+
- `compact_theta_sketch` (cannot be instantiated directly)
|
44
|
+
- `theta_union`
|
45
|
+
- `theta_intersection`
|
46
|
+
- `theta_a_not_b`
|
61
47
|
- HLL
|
62
|
-
|
63
|
-
|
64
|
-
|
48
|
+
- `hll_sketch`
|
49
|
+
- `hll_union`
|
50
|
+
- Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
|
65
51
|
- CPC
|
66
|
-
|
67
|
-
|
52
|
+
- `cpc_sketch`
|
53
|
+
- `cpc_union`
|
68
54
|
- VarOpt Sampling
|
69
|
-
|
70
|
-
|
55
|
+
- `var_opt_sketch`
|
56
|
+
- `var_opt_union`
|
71
57
|
- Vector of KLL
|
72
|
-
|
73
|
-
|
58
|
+
- `vector_of_kll_ints_sketches`
|
59
|
+
- `vector_of_kll_floats_sketches`
|
60
|
+
- Kolmogorov-Smirnov Test
|
61
|
+
- `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
|
74
62
|
|
75
63
|
## Known Differences from C++
|
76
64
|
|
@@ -79,3 +67,22 @@ The Python API largely mirrors the C++ API, with a few minor exceptions: The pri
|
|
79
67
|
The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
|
80
68
|
|
81
69
|
We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
|
70
|
+
|
71
|
+
## Developer Instructions
|
72
|
+
|
73
|
+
The only developer-specific instructions relate to running unit tests.
|
74
|
+
|
75
|
+
### Unit tests
|
76
|
+
|
77
|
+
The Python unit tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
78
|
+
|
79
|
+
```bash
|
80
|
+
python -m pip install --upgrade tox
|
81
|
+
tox
|
82
|
+
```
|
83
|
+
|
84
|
+
## License
|
85
|
+
|
86
|
+
The Apache DataSketches Library is distributed under an Apache 2.0 License.
|
87
|
+
|
88
|
+
Precompiled binaries, which may be provided as a convenience and distributed through PyPI via [https://pypi.org/project/datasketches/](https://pypi.org/project/datasketches/), contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license.
|
@@ -53,7 +53,7 @@ void init_cpc(py::module &m) {
|
|
53
53
|
using namespace datasketches;
|
54
54
|
|
55
55
|
py::class_<cpc_sketch>(m, "cpc_sketch")
|
56
|
-
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=
|
56
|
+
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
|
57
57
|
.def(py::init<const cpc_sketch&>())
|
58
58
|
.def("__str__", &cpc_sketch::to_string,
|
59
59
|
"Produces a string summary of the sketch")
|
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
|
|
28
28
|
void init_theta(py::module& m);
|
29
29
|
void init_vo(py::module& m);
|
30
30
|
void init_req(py::module& m);
|
31
|
+
void init_quantiles(py::module& m);
|
32
|
+
void init_kolmogorov_smirnov(py::module& m);
|
31
33
|
void init_vector_of_kll(py::module& m);
|
32
34
|
|
33
35
|
PYBIND11_MODULE(datasketches, m) {
|
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
|
|
38
40
|
init_theta(m);
|
39
41
|
init_vo(m);
|
40
42
|
init_req(m);
|
43
|
+
init_quantiles(m);
|
44
|
+
init_kolmogorov_smirnov(m);
|
41
45
|
init_vector_of_kll(m);
|
42
46
|
}
|
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
64
64
|
return list;
|
65
65
|
}
|
66
66
|
|
67
|
+
template<typename T>
|
68
|
+
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
69
|
+
return sk.get_serialized_size_bytes();
|
70
|
+
}
|
71
|
+
|
67
72
|
}
|
68
73
|
}
|
69
74
|
|
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
|
|
104
109
|
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
105
110
|
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
106
111
|
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
107
|
-
.def("get_serialized_size_bytes", &
|
112
|
+
.def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
|
108
113
|
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
109
114
|
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
110
115
|
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|