datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -341,8 +341,7 @@ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {
|
|
|
341
341
|
|
|
342
342
|
template<typename A>
|
|
343
343
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
344
|
-
const
|
|
345
|
-
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
344
|
+
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
|
346
345
|
write(os, preamble_longs);
|
|
347
346
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
348
347
|
write(os, serial_version);
|
|
@@ -359,24 +358,19 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
|
359
358
|
write(os, flags_byte);
|
|
360
359
|
const uint16_t seed_hash = get_seed_hash();
|
|
361
360
|
write(os, seed_hash);
|
|
362
|
-
if (
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
write(os, unused32);
|
|
368
|
-
if (this->is_estimation_mode()) {
|
|
369
|
-
write(os, this->theta_);
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
|
|
361
|
+
if (preamble_longs > 1) {
|
|
362
|
+
const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
363
|
+
write(os, num_entries);
|
|
364
|
+
const uint32_t unused32 = 0;
|
|
365
|
+
write(os, unused32);
|
|
373
366
|
}
|
|
367
|
+
if (this->is_estimation_mode()) write(os, this->theta_);
|
|
368
|
+
if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
|
|
374
369
|
}
|
|
375
370
|
|
|
376
371
|
template<typename A>
|
|
377
372
|
auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
378
|
-
const
|
|
379
|
-
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
373
|
+
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
|
380
374
|
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
|
381
375
|
+ sizeof(uint64_t) * entries_.size();
|
|
382
376
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
|
@@ -397,17 +391,13 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
|
397
391
|
ptr += copy_to_mem(flags_byte, ptr);
|
|
398
392
|
const uint16_t seed_hash = get_seed_hash();
|
|
399
393
|
ptr += copy_to_mem(seed_hash, ptr);
|
|
400
|
-
if (
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
ptr += sizeof(uint32_t);
|
|
405
|
-
if (this->is_estimation_mode()) {
|
|
406
|
-
ptr += copy_to_mem(theta_, ptr);
|
|
407
|
-
}
|
|
408
|
-
}
|
|
409
|
-
ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
|
|
394
|
+
if (preamble_longs > 1) {
|
|
395
|
+
const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
396
|
+
ptr += copy_to_mem(num_entries, ptr);
|
|
397
|
+
ptr += sizeof(uint32_t); // unused
|
|
410
398
|
}
|
|
399
|
+
if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
|
|
400
|
+
if (entries_.size() > 0) ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
|
|
411
401
|
return bytes;
|
|
412
402
|
}
|
|
413
403
|
|
|
@@ -96,15 +96,6 @@ struct theta_update_sketch_base {
|
|
|
96
96
|
template<typename Derived, typename Allocator>
|
|
97
97
|
class theta_base_builder {
|
|
98
98
|
public:
|
|
99
|
-
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
100
|
-
using resize_factor = theta_constants::resize_factor;
|
|
101
|
-
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
|
102
|
-
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
|
103
|
-
// TODO: The following defaults are redundant and deprecated. Will be removed in the
|
|
104
|
-
// next major version release
|
|
105
|
-
static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
|
|
106
|
-
static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
|
|
107
|
-
|
|
108
99
|
/**
|
|
109
100
|
* Creates an instance of the builder with default parameters.
|
|
110
101
|
*/
|
|
@@ -310,11 +310,11 @@ seed_(DEFAULT_SEED) {}
|
|
|
310
310
|
|
|
311
311
|
template<typename Derived, typename Allocator>
|
|
312
312
|
Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
|
|
313
|
-
if (lg_k < MIN_LG_K) {
|
|
314
|
-
throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
|
|
313
|
+
if (lg_k < theta_constants::MIN_LG_K) {
|
|
314
|
+
throw std::invalid_argument("lg_k must not be less than " + std::to_string(theta_constants::MIN_LG_K) + ": " + std::to_string(lg_k));
|
|
315
315
|
}
|
|
316
|
-
if (lg_k > MAX_LG_K) {
|
|
317
|
-
throw std::invalid_argument("lg_k must not be greater than " + std::to_string(MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
316
|
+
if (lg_k > theta_constants::MAX_LG_K) {
|
|
317
|
+
throw std::invalid_argument("lg_k must not be greater than " + std::to_string(theta_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
318
318
|
}
|
|
319
319
|
lg_k_ = lg_k;
|
|
320
320
|
return static_cast<Derived&>(*this);
|
|
@@ -346,7 +346,7 @@ uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
|
|
346
346
|
|
|
347
347
|
template<typename Derived, typename Allocator>
|
|
348
348
|
uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
|
|
349
|
-
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
|
349
|
+
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
|
|
350
350
|
}
|
|
351
351
|
|
|
352
352
|
// iterator
|
|
@@ -152,7 +152,7 @@ TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
|
|
|
152
152
|
REQUIRE(update_sketch.get_lower_bound(1) < n);
|
|
153
153
|
REQUIRE(update_sketch.get_upper_bound(1) > n);
|
|
154
154
|
|
|
155
|
-
const uint32_t k = 1 <<
|
|
155
|
+
const uint32_t k = 1 << theta_constants::DEFAULT_LG_K;
|
|
156
156
|
REQUIRE(update_sketch.get_num_retained() >= k);
|
|
157
157
|
update_sketch.trim();
|
|
158
158
|
REQUIRE(update_sketch.get_num_retained() == k);
|
|
@@ -398,6 +398,7 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
|
|
|
398
398
|
TEST_CASE("theta sketch: deserialize empty buffer overrun", "[theta_sketch]") {
|
|
399
399
|
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
400
400
|
auto bytes = update_sketch.compact().serialize();
|
|
401
|
+
REQUIRE(bytes.size() == 8);
|
|
401
402
|
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
402
403
|
}
|
|
403
404
|
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
[tox]
|
|
19
|
+
envlist = py3
|
|
20
|
+
isolated_build = true
|
|
21
|
+
|
|
22
|
+
[testenv]
|
|
23
|
+
deps = pytest
|
|
24
|
+
numpy
|
|
25
|
+
changedir = python/tests
|
|
26
|
+
commands = pytest
|
|
@@ -72,21 +72,45 @@ public:
|
|
|
72
72
|
double get_estimate() const;
|
|
73
73
|
|
|
74
74
|
/**
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
75
|
+
* Returns the approximate lower error bound given a number of standard deviations over an arbitrary number of
|
|
76
|
+
* items stored in the sketch.
|
|
77
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
78
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
79
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
80
|
+
* @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
|
|
81
|
+
* @return the lower bound
|
|
82
|
+
*/
|
|
83
|
+
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
|
|
84
|
+
|
|
85
|
+
/**
|
|
86
|
+
* Returns the approximate lower error bound given a number of standard deviations.
|
|
87
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
88
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
89
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
90
|
+
* @return the lower bound
|
|
91
|
+
*/
|
|
81
92
|
double get_lower_bound(uint8_t num_std_devs) const;
|
|
82
93
|
|
|
94
|
+
|
|
83
95
|
/**
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
96
|
+
* Returns the approximate upper error bound given a number of standard deviations over an arbitrary number of
|
|
97
|
+
* items stored in the sketch.
|
|
98
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
99
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
100
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
101
|
+
* @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
|
|
102
|
+
* @return the upper bound
|
|
103
|
+
*/
|
|
104
|
+
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
/**
|
|
108
|
+
* Returns the approximate upper error bound given a number of standard deviations.
|
|
109
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
110
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
111
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
112
|
+
* @return the upper bound
|
|
113
|
+
*/
|
|
90
114
|
double get_upper_bound(uint8_t num_std_devs) const;
|
|
91
115
|
|
|
92
116
|
/**
|
|
@@ -40,16 +40,28 @@ double tuple_sketch<S, A>::get_estimate() const {
|
|
|
40
40
|
return get_num_retained() / get_theta();
|
|
41
41
|
}
|
|
42
42
|
|
|
43
|
+
template<typename S, typename A>
|
|
44
|
+
double tuple_sketch<S, A>::get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const {
|
|
45
|
+
num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
|
|
46
|
+
if (!is_estimation_mode()) return num_subset_entries;
|
|
47
|
+
return binomial_bounds::get_lower_bound(num_subset_entries, get_theta(), num_std_devs);
|
|
48
|
+
}
|
|
49
|
+
|
|
43
50
|
template<typename S, typename A>
|
|
44
51
|
double tuple_sketch<S, A>::get_lower_bound(uint8_t num_std_devs) const {
|
|
45
|
-
|
|
46
|
-
|
|
52
|
+
return get_lower_bound(num_std_devs, get_num_retained()) ;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
template<typename S, typename A>
|
|
56
|
+
double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const {
|
|
57
|
+
num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
|
|
58
|
+
if (!is_estimation_mode()) return num_subset_entries;
|
|
59
|
+
return binomial_bounds::get_upper_bound(num_subset_entries, get_theta(), num_std_devs);
|
|
47
60
|
}
|
|
48
61
|
|
|
49
62
|
template<typename S, typename A>
|
|
50
63
|
double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs) const {
|
|
51
|
-
|
|
52
|
-
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
64
|
+
return get_upper_bound(num_std_devs, get_num_retained()) ;
|
|
53
65
|
}
|
|
54
66
|
|
|
55
67
|
template<typename S, typename A>
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
|
|
18
18
|
add_executable(tuple_test)
|
|
19
19
|
|
|
20
|
-
target_link_libraries(tuple_test tuple
|
|
20
|
+
target_link_libraries(tuple_test tuple common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(tuple_test PROPERTIES
|
|
23
23
|
CXX_STANDARD 11
|
|
@@ -45,4 +45,5 @@ target_sources(tuple_test
|
|
|
45
45
|
tuple_a_not_b_test.cpp
|
|
46
46
|
tuple_jaccard_similarity_test.cpp
|
|
47
47
|
array_of_doubles_sketch_test.cpp
|
|
48
|
+
engagement_test.cpp
|
|
48
49
|
)
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <iostream>
|
|
21
|
+
#include <iomanip>
|
|
22
|
+
#include <set>
|
|
23
|
+
#include <catch2/catch.hpp>
|
|
24
|
+
#include <tuple_sketch.hpp>
|
|
25
|
+
#include <tuple_union.hpp>
|
|
26
|
+
#include <stdexcept>
|
|
27
|
+
|
|
28
|
+
template<typename T>
|
|
29
|
+
class max_value_policy {
|
|
30
|
+
public:
|
|
31
|
+
max_value_policy(const T& initial_value): initial_value(initial_value) {}
|
|
32
|
+
T create() const { return initial_value; }
|
|
33
|
+
void update(T& summary, const T& update) const { summary = std::max(summary, update); }
|
|
34
|
+
private:
|
|
35
|
+
T initial_value;
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
using max_float_update_tuple_sketch = datasketches::update_tuple_sketch<float, float, max_value_policy<float>>;
|
|
39
|
+
|
|
40
|
+
template<typename T>
|
|
41
|
+
class always_one_policy {
|
|
42
|
+
public:
|
|
43
|
+
always_one_policy(): initial_value(1) {}
|
|
44
|
+
T create() const { return 1; }
|
|
45
|
+
void update(T&, const T&) const { }
|
|
46
|
+
private:
|
|
47
|
+
T initial_value;
|
|
48
|
+
};
|
|
49
|
+
using always_one_tuple_sketch = datasketches::update_tuple_sketch<int, int, always_one_policy<int>> ;
|
|
50
|
+
|
|
51
|
+
template<typename T>
|
|
52
|
+
class update_sum_value_policy {
|
|
53
|
+
public:
|
|
54
|
+
update_sum_value_policy(): initial_value(0) {}
|
|
55
|
+
T create() const { return initial_value; }
|
|
56
|
+
void update(T& summary, const T& update) const { summary += update; }
|
|
57
|
+
private:
|
|
58
|
+
T initial_value;
|
|
59
|
+
};
|
|
60
|
+
using sum_update_tuple_sketch = datasketches::update_tuple_sketch<int, int, update_sum_value_policy<int>>;
|
|
61
|
+
|
|
62
|
+
template<typename Summary>
|
|
63
|
+
struct union_sum_value_policy {
|
|
64
|
+
void operator()(Summary& summary, const Summary& other) const {
|
|
65
|
+
summary += other;
|
|
66
|
+
}
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
using sum_union_tuple_sketch = datasketches::tuple_union<int, union_sum_value_policy<int>> ;
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class EngagementTest{
|
|
73
|
+
public:
|
|
74
|
+
int num_std_dev = 2 ;
|
|
75
|
+
void test_always_one_update(){
|
|
76
|
+
/*
|
|
77
|
+
* Tests that updates into an update_tuple_sketch only keep a 1 in the column for stored values.
|
|
78
|
+
*/
|
|
79
|
+
int lgK = 8 ;
|
|
80
|
+
std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
|
|
81
|
+
|
|
82
|
+
auto always_one_sketch = always_one_tuple_sketch::builder(always_one_policy<int>()).set_lg_k(lgK).build() ;
|
|
83
|
+
|
|
84
|
+
always_one_sketch.update(1, 1);
|
|
85
|
+
always_one_sketch.update(1, 2);
|
|
86
|
+
always_one_sketch.update(2, 1);
|
|
87
|
+
always_one_sketch.update(3, 3);
|
|
88
|
+
always_one_sketch.update(3, 7);
|
|
89
|
+
|
|
90
|
+
int num_retained = 0;
|
|
91
|
+
int sum = 0;
|
|
92
|
+
for (const auto& entry: always_one_sketch) {
|
|
93
|
+
sum += entry.second;
|
|
94
|
+
++num_retained;
|
|
95
|
+
}
|
|
96
|
+
REQUIRE(num_retained == 3);
|
|
97
|
+
REQUIRE(sum == 3); // we only keep 1 for every stored key.
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
void test_sum_update_policy(){
|
|
101
|
+
/*
|
|
102
|
+
* Tests that updates into a sum_update_tuple_sketch sum the stored values on updates.
|
|
103
|
+
*/
|
|
104
|
+
int lgK = 8 ;
|
|
105
|
+
auto sum_sketch = sum_update_tuple_sketch::builder().set_lg_k(lgK).build() ;
|
|
106
|
+
|
|
107
|
+
sum_sketch.update(1, 1);
|
|
108
|
+
sum_sketch.update(1, 2);
|
|
109
|
+
sum_sketch.update(2, 1);
|
|
110
|
+
sum_sketch.update(3, 3);
|
|
111
|
+
sum_sketch.update(3, 7);
|
|
112
|
+
int num_retained = 0;
|
|
113
|
+
int sum = 0;
|
|
114
|
+
for (const auto& entry: sum_sketch) {
|
|
115
|
+
sum += entry.second;
|
|
116
|
+
++num_retained;
|
|
117
|
+
}
|
|
118
|
+
REQUIRE(num_retained == 3);
|
|
119
|
+
REQUIRE(sum == 14); // (1+2) + 1 + (3 + 7) = 14
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
void test_sum_union_policy(){
|
|
123
|
+
/*
|
|
124
|
+
* Tests that updates into two sketches of sum_update_tuple_sketch flavour, which have been unioned,
|
|
125
|
+
* cause the stored values of two of the same keys to be summed.
|
|
126
|
+
*/
|
|
127
|
+
auto sketch1 = sum_update_tuple_sketch::builder().build() ;
|
|
128
|
+
auto sketch2 = sum_update_tuple_sketch::builder().build() ;
|
|
129
|
+
|
|
130
|
+
sketch1.update(1, 1);
|
|
131
|
+
sketch1.update(2, 1);
|
|
132
|
+
sketch1.update(3, 3);
|
|
133
|
+
|
|
134
|
+
sketch2.update(1, 2);
|
|
135
|
+
sketch2.update(2, 1);
|
|
136
|
+
sketch2.update(3, 7);
|
|
137
|
+
|
|
138
|
+
auto union_sketch = sum_union_tuple_sketch::builder().build() ;
|
|
139
|
+
union_sketch.update(sketch1) ;
|
|
140
|
+
union_sketch.update(sketch2) ;
|
|
141
|
+
auto union_result = union_sketch.get_result() ;
|
|
142
|
+
|
|
143
|
+
int num_retained = 0;
|
|
144
|
+
int sum = 0;
|
|
145
|
+
for (const auto& entry: union_result) {
|
|
146
|
+
sum += entry.second;
|
|
147
|
+
++num_retained;
|
|
148
|
+
}
|
|
149
|
+
REQUIRE(num_retained == 3);
|
|
150
|
+
REQUIRE(sum == 15); // 1:(1+2) + 2:(1+1) + 3:(3+7) = 15
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
void compute_engagement_histogram(){
|
|
154
|
+
/*
|
|
155
|
+
* Returns the estimated histogram from the synthetic data.
|
|
156
|
+
* On inspection one can verify this agrees with the
|
|
157
|
+
* https://github.com/apache/datasketches-java/blob/master/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
|
|
158
|
+
*/
|
|
159
|
+
int lgK = 8 ;
|
|
160
|
+
const int days = 30 ;
|
|
161
|
+
int v = 0 ;
|
|
162
|
+
std::set<int> set_array[days];
|
|
163
|
+
std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
for(int i=0; i<days ; i++){
|
|
167
|
+
auto builder = always_one_tuple_sketch::builder(always_one_policy<int>()) ;
|
|
168
|
+
builder.set_lg_k(lgK) ;
|
|
169
|
+
auto sketch = builder.build() ;
|
|
170
|
+
sketch_array.push_back(sketch);
|
|
171
|
+
}
|
|
172
|
+
REQUIRE(sketch_array.size() == days) ;
|
|
173
|
+
|
|
174
|
+
for(int i=0; i<=days; i++){
|
|
175
|
+
int32_t num_ids = get_num_ids(days, i) ;
|
|
176
|
+
int32_t num_days = get_num_days(days, i) ;
|
|
177
|
+
|
|
178
|
+
int my_v = v++ ;
|
|
179
|
+
for(int d=0 ; d<num_days; d++){
|
|
180
|
+
for(int id = 0; id < num_ids; id++){
|
|
181
|
+
set_array[d].insert(my_v + id) ;
|
|
182
|
+
sketch_array[d].update(my_v + id, 1) ;
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
v += num_ids ;
|
|
186
|
+
}
|
|
187
|
+
union_ops(lgK, sketch_array) ;
|
|
188
|
+
}
|
|
189
|
+
private:
|
|
190
|
+
int32_t get_num_ids(int total_days, int index){
|
|
191
|
+
/*
|
|
192
|
+
* Generates power law distributed synthetic data
|
|
193
|
+
*/
|
|
194
|
+
double d = total_days ;
|
|
195
|
+
double i = index ;
|
|
196
|
+
return int(round(exp(i * log(d) / d))) ;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
int32_t get_num_days(int total_days, int index){
|
|
200
|
+
double d = total_days ;
|
|
201
|
+
double i = index ;
|
|
202
|
+
return int(round(exp( (d-i) * log(d) / d ))) ;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
int32_t round_double_to_int(double x){
|
|
206
|
+
return int(std::round(x)) ;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
void union_ops(int lgk, std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketches){
|
|
210
|
+
int num_sketches = sketches.size() ;
|
|
211
|
+
auto u = sum_union_tuple_sketch::builder().set_lg_k(lgk).build() ;
|
|
212
|
+
|
|
213
|
+
for(auto sk:sketches){
|
|
214
|
+
u.update(sk) ;
|
|
215
|
+
}
|
|
216
|
+
auto union_result = u.get_result() ;
|
|
217
|
+
std::vector<uint64_t> num_days_arr(num_sketches+1) ;
|
|
218
|
+
|
|
219
|
+
for (const auto& entry: union_result) {
|
|
220
|
+
int num_days_visited = entry.second ;
|
|
221
|
+
num_days_arr[num_days_visited]++;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
int sum_visits = 0;
|
|
225
|
+
double theta = union_result.get_theta();
|
|
226
|
+
std::cout <<"\t\tEngagement Histogram.\t\t\t\n" ;
|
|
227
|
+
std::cout << "Number of Unique Visitors by Number of Days Visited" << std::endl ;
|
|
228
|
+
std::cout << "---------------------------------------------------" << std::endl ;
|
|
229
|
+
|
|
230
|
+
std::cout << std::setw(12) << "Days Visited"
|
|
231
|
+
<< std::setw(12) << "Estimate"
|
|
232
|
+
<< std::setw(12) << "LB"
|
|
233
|
+
<< std::setw(12) << "UB"
|
|
234
|
+
<< std:: endl ;
|
|
235
|
+
|
|
236
|
+
for (uint64_t i = 0; i < num_days_arr.size(); i++) {
|
|
237
|
+
int visitors_at_days_visited = num_days_arr[i] ;
|
|
238
|
+
if(visitors_at_days_visited == 0){ continue; }
|
|
239
|
+
sum_visits += visitors_at_days_visited * i ;
|
|
240
|
+
|
|
241
|
+
double est_visitors_at_days_visited = visitors_at_days_visited / theta ;
|
|
242
|
+
double lower_bound_at_days_visited = union_result.get_lower_bound(num_std_dev, visitors_at_days_visited);
|
|
243
|
+
double upper_bound_at_days_visited = union_result.get_upper_bound(num_std_dev, visitors_at_days_visited);
|
|
244
|
+
|
|
245
|
+
std::cout << std::setw(12) << i
|
|
246
|
+
<< std::setw(12) << est_visitors_at_days_visited
|
|
247
|
+
<< std::setw(12) << lower_bound_at_days_visited
|
|
248
|
+
<< std::setw(12) << upper_bound_at_days_visited
|
|
249
|
+
<< std:: endl ;
|
|
250
|
+
|
|
251
|
+
}
|
|
252
|
+
std::cout << std::endl << std::endl ;
|
|
253
|
+
std::cout << std::setw(12) << "Totals"
|
|
254
|
+
<< std::setw(12) << "Estimate"
|
|
255
|
+
<< std::setw(12) << "LB"
|
|
256
|
+
<< std::setw(12) << "UB"
|
|
257
|
+
<< std:: endl ;
|
|
258
|
+
std::cout << "---------------------------------------------------" << std::endl ;
|
|
259
|
+
|
|
260
|
+
const double total_visitors = union_result.get_estimate() ;
|
|
261
|
+
const double lb_visitors = union_result.get_lower_bound(num_std_dev) ;
|
|
262
|
+
const double ub_visitors = union_result.get_upper_bound(num_std_dev) ;
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
std::cout << std::setw(12) << "Visitors"
|
|
266
|
+
<< std::setw(12) << total_visitors
|
|
267
|
+
<< std::setw(12) << lb_visitors
|
|
268
|
+
<< std::setw(12) << ub_visitors
|
|
269
|
+
<< std:: endl ;
|
|
270
|
+
|
|
271
|
+
// The total number of visits, however, is a scaled metric and takes advantage of the fact that
|
|
272
|
+
// the retained entries in the sketch are a uniform random sample of all unique visitors, and
|
|
273
|
+
// the rest of the unique users will likely behave in the same way.
|
|
274
|
+
const double est_visits = sum_visits / theta;
|
|
275
|
+
const double lb_visits = est_visits * lb_visitors / total_visitors;
|
|
276
|
+
const double ub_visits = est_visits * ub_visitors / total_visitors;
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
std::cout << std::setw(12) << "Visits"
|
|
280
|
+
<< std::setw(12) << est_visits
|
|
281
|
+
<< std::setw(12) << lb_visits
|
|
282
|
+
<< std::setw(12) << ub_visits
|
|
283
|
+
<< std:: endl ;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
};
|
|
287
|
+
|
|
288
|
+
namespace datasketches {
|
|
289
|
+
|
|
290
|
+
TEST_CASE("engagement", "[engagement]") {
|
|
291
|
+
EngagementTest E ;
|
|
292
|
+
E.test_always_one_update() ;
|
|
293
|
+
E.test_sum_update_policy() ;
|
|
294
|
+
E.test_sum_union_policy() ;
|
|
295
|
+
E.compute_engagement_histogram() ;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
} /* namespace datasketches */
|
|
@@ -56,7 +56,13 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") {
|
|
|
56
56
|
REQUIRE(!update_sketch.is_estimation_mode());
|
|
57
57
|
REQUIRE(update_sketch.get_estimate() == 0);
|
|
58
58
|
REQUIRE(update_sketch.get_lower_bound(1) == 0);
|
|
59
|
+
REQUIRE(update_sketch.get_lower_bound(1, 1) == 0);
|
|
60
|
+
REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 0);
|
|
61
|
+
REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()+1) == 0);
|
|
59
62
|
REQUIRE(update_sketch.get_upper_bound(1) == 0);
|
|
63
|
+
REQUIRE(update_sketch.get_upper_bound(1, 1) == 0);
|
|
64
|
+
REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 0);
|
|
65
|
+
REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()+1) == 0);
|
|
60
66
|
REQUIRE(update_sketch.get_theta() == 1);
|
|
61
67
|
REQUIRE(update_sketch.get_num_retained() == 0);
|
|
62
68
|
REQUIRE(update_sketch.is_ordered());
|
|
@@ -67,7 +73,11 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") {
|
|
|
67
73
|
REQUIRE(!compact_sketch.is_estimation_mode());
|
|
68
74
|
REQUIRE(compact_sketch.get_estimate() == 0);
|
|
69
75
|
REQUIRE(compact_sketch.get_lower_bound(1) == 0);
|
|
76
|
+
REQUIRE(compact_sketch.get_lower_bound(1, 1) == 0);
|
|
77
|
+
REQUIRE(compact_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 0);
|
|
70
78
|
REQUIRE(compact_sketch.get_upper_bound(1) == 0);
|
|
79
|
+
REQUIRE(compact_sketch.get_upper_bound(1, 1) == 0);
|
|
80
|
+
REQUIRE(compact_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 0);
|
|
71
81
|
REQUIRE(compact_sketch.get_theta() == 1);
|
|
72
82
|
REQUIRE(compact_sketch.get_num_retained() == 0);
|
|
73
83
|
REQUIRE(compact_sketch.is_ordered());
|
|
@@ -110,7 +120,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
|
|
|
110
120
|
REQUIRE_FALSE(update_sketch.is_estimation_mode());
|
|
111
121
|
REQUIRE(update_sketch.get_estimate() == 2);
|
|
112
122
|
REQUIRE(update_sketch.get_lower_bound(1) == 2);
|
|
123
|
+
REQUIRE(update_sketch.get_lower_bound(1, 1) == 1);
|
|
124
|
+
REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 2);
|
|
113
125
|
REQUIRE(update_sketch.get_upper_bound(1) == 2);
|
|
126
|
+
REQUIRE(update_sketch.get_upper_bound(1, 1) == 1);
|
|
127
|
+
REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 2);
|
|
114
128
|
REQUIRE(update_sketch.get_theta() == 1);
|
|
115
129
|
REQUIRE(update_sketch.get_num_retained() == 2);
|
|
116
130
|
REQUIRE_FALSE(update_sketch.is_ordered());
|
|
@@ -127,7 +141,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
|
|
|
127
141
|
REQUIRE_FALSE(compact_sketch.is_estimation_mode());
|
|
128
142
|
REQUIRE(compact_sketch.get_estimate() == 2);
|
|
129
143
|
REQUIRE(compact_sketch.get_lower_bound(1) == 2);
|
|
144
|
+
REQUIRE(compact_sketch.get_lower_bound(1, 1) == 1);
|
|
145
|
+
REQUIRE(compact_sketch.get_lower_bound(1, compact_sketch.get_num_retained()) == 2);
|
|
130
146
|
REQUIRE(compact_sketch.get_upper_bound(1) == 2);
|
|
147
|
+
REQUIRE(compact_sketch.get_upper_bound(1, 1) == 1);
|
|
148
|
+
REQUIRE(compact_sketch.get_upper_bound(1, compact_sketch.get_num_retained()) == 2);
|
|
131
149
|
REQUIRE(compact_sketch.get_theta() == 1);
|
|
132
150
|
REQUIRE(compact_sketch.get_num_retained() == 2);
|
|
133
151
|
REQUIRE(compact_sketch.is_ordered());
|
|
@@ -146,7 +164,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
|
|
|
146
164
|
REQUIRE(!deserialized_sketch.is_estimation_mode());
|
|
147
165
|
REQUIRE(deserialized_sketch.get_estimate() == 2);
|
|
148
166
|
REQUIRE(deserialized_sketch.get_lower_bound(1) == 2);
|
|
167
|
+
REQUIRE(deserialized_sketch.get_lower_bound(1, 1) == 1);
|
|
168
|
+
REQUIRE(deserialized_sketch.get_lower_bound(1, deserialized_sketch.get_num_retained()) == 2);
|
|
149
169
|
REQUIRE(deserialized_sketch.get_upper_bound(1) == 2);
|
|
170
|
+
REQUIRE(deserialized_sketch.get_upper_bound(1, 1) == 1);
|
|
171
|
+
REQUIRE(deserialized_sketch.get_upper_bound(1, deserialized_sketch.get_num_retained()) == 2);
|
|
150
172
|
REQUIRE(deserialized_sketch.get_theta() == 1);
|
|
151
173
|
REQUIRE(deserialized_sketch.get_num_retained() == 2);
|
|
152
174
|
REQUIRE(deserialized_sketch.is_ordered());
|
|
@@ -160,7 +182,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
|
|
|
160
182
|
REQUIRE(!deserialized_sketch.is_estimation_mode());
|
|
161
183
|
REQUIRE(deserialized_sketch.get_estimate() == 2);
|
|
162
184
|
REQUIRE(deserialized_sketch.get_lower_bound(1) == 2);
|
|
185
|
+
REQUIRE(deserialized_sketch.get_lower_bound(1, 1) == 1);
|
|
186
|
+
REQUIRE(deserialized_sketch.get_lower_bound(1, deserialized_sketch.get_num_retained()) == 2);
|
|
163
187
|
REQUIRE(deserialized_sketch.get_upper_bound(1) == 2);
|
|
188
|
+
REQUIRE(deserialized_sketch.get_upper_bound(1, 1) == 1);
|
|
189
|
+
REQUIRE(deserialized_sketch.get_upper_bound(1, deserialized_sketch.get_num_retained()) == 2);
|
|
164
190
|
REQUIRE(deserialized_sketch.get_theta() == 1);
|
|
165
191
|
REQUIRE(deserialized_sketch.get_num_retained() == 2);
|
|
166
192
|
REQUIRE(deserialized_sketch.is_ordered());
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
4.0.0
|