datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -341,8 +341,7 @@ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {
341
341
 
342
342
  template<typename A>
343
343
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
344
- const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
345
- const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
344
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
346
345
  write(os, preamble_longs);
347
346
  const uint8_t serial_version = SERIAL_VERSION;
348
347
  write(os, serial_version);
@@ -359,24 +358,19 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
359
358
  write(os, flags_byte);
360
359
  const uint16_t seed_hash = get_seed_hash();
361
360
  write(os, seed_hash);
362
- if (!this->is_empty()) {
363
- if (!is_single_item) {
364
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
365
- write(os, num_entries);
366
- const uint32_t unused32 = 0;
367
- write(os, unused32);
368
- if (this->is_estimation_mode()) {
369
- write(os, this->theta_);
370
- }
371
- }
372
- write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
361
+ if (preamble_longs > 1) {
362
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
363
+ write(os, num_entries);
364
+ const uint32_t unused32 = 0;
365
+ write(os, unused32);
373
366
  }
367
+ if (this->is_estimation_mode()) write(os, this->theta_);
368
+ if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
374
369
  }
375
370
 
376
371
  template<typename A>
377
372
  auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
378
- const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
379
- const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
373
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
380
374
  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
381
375
  + sizeof(uint64_t) * entries_.size();
382
376
  vector_bytes bytes(size, 0, entries_.get_allocator());
@@ -397,17 +391,13 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
397
391
  ptr += copy_to_mem(flags_byte, ptr);
398
392
  const uint16_t seed_hash = get_seed_hash();
399
393
  ptr += copy_to_mem(seed_hash, ptr);
400
- if (!this->is_empty()) {
401
- if (!is_single_item) {
402
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
403
- ptr += copy_to_mem(num_entries, ptr);
404
- ptr += sizeof(uint32_t);
405
- if (this->is_estimation_mode()) {
406
- ptr += copy_to_mem(theta_, ptr);
407
- }
408
- }
409
- ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
394
+ if (preamble_longs > 1) {
395
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
396
+ ptr += copy_to_mem(num_entries, ptr);
397
+ ptr += sizeof(uint32_t); // unused
410
398
  }
399
+ if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
400
+ if (entries_.size() > 0) ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
411
401
  return bytes;
412
402
  }
413
403
 
@@ -96,15 +96,6 @@ struct theta_update_sketch_base {
96
96
  template<typename Derived, typename Allocator>
97
97
  class theta_base_builder {
98
98
  public:
99
- // TODO: Redundant and deprecated. Will be removed in next major version release.
100
- using resize_factor = theta_constants::resize_factor;
101
- static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
102
- static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
103
- // TODO: The following defaults are redundant and deprecated. Will be removed in the
104
- // next major version release
105
- static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
106
- static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
107
-
108
99
  /**
109
100
  * Creates an instance of the builder with default parameters.
110
101
  */
@@ -310,11 +310,11 @@ seed_(DEFAULT_SEED) {}
310
310
 
311
311
  template<typename Derived, typename Allocator>
312
312
  Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
313
- if (lg_k < MIN_LG_K) {
314
- throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
313
+ if (lg_k < theta_constants::MIN_LG_K) {
314
+ throw std::invalid_argument("lg_k must not be less than " + std::to_string(theta_constants::MIN_LG_K) + ": " + std::to_string(lg_k));
315
315
  }
316
- if (lg_k > MAX_LG_K) {
317
- throw std::invalid_argument("lg_k must not be greater than " + std::to_string(MAX_LG_K) + ": " + std::to_string(lg_k));
316
+ if (lg_k > theta_constants::MAX_LG_K) {
317
+ throw std::invalid_argument("lg_k must not be greater than " + std::to_string(theta_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
318
318
  }
319
319
  lg_k_ = lg_k;
320
320
  return static_cast<Derived&>(*this);
@@ -346,7 +346,7 @@ uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
346
346
 
347
347
  template<typename Derived, typename Allocator>
348
348
  uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
349
- return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
349
+ return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
350
350
  }
351
351
 
352
352
  // iterator
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(theta_test)
19
19
 
20
- target_link_libraries(theta_test theta common_test)
20
+ target_link_libraries(theta_test theta common_test_lib)
21
21
 
22
22
  set_target_properties(theta_test PROPERTIES
23
23
  CXX_STANDARD 11
@@ -152,7 +152,7 @@ TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
152
152
  REQUIRE(update_sketch.get_lower_bound(1) < n);
153
153
  REQUIRE(update_sketch.get_upper_bound(1) > n);
154
154
 
155
- const uint32_t k = 1 << update_theta_sketch::builder::DEFAULT_LG_K;
155
+ const uint32_t k = 1 << theta_constants::DEFAULT_LG_K;
156
156
  REQUIRE(update_sketch.get_num_retained() >= k);
157
157
  update_sketch.trim();
158
158
  REQUIRE(update_sketch.get_num_retained() == k);
@@ -398,6 +398,7 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
398
398
  TEST_CASE("theta sketch: deserialize empty buffer overrun", "[theta_sketch]") {
399
399
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
400
400
  auto bytes = update_sketch.compact().serialize();
401
+ REQUIRE(bytes.size() == 8);
401
402
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
402
403
  }
403
404
 
@@ -0,0 +1,26 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ [tox]
19
+ envlist = py3
20
+ isolated_build = true
21
+
22
+ [testenv]
23
+ deps = pytest
24
+ numpy
25
+ changedir = python/tests
26
+ commands = pytest
@@ -72,21 +72,45 @@ public:
72
72
  double get_estimate() const;
73
73
 
74
74
  /**
75
- * Returns the approximate lower error bound given a number of standard deviations.
76
- * This parameter is similar to the number of standard deviations of the normal distribution
77
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
78
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
79
- * @return the lower bound
80
- */
75
+ * Returns the approximate lower error bound given a number of standard deviations over an arbitrary number of
76
+ * items stored in the sketch.
77
+ * This parameter is similar to the number of standard deviations of the normal distribution
78
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
79
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
80
+ * @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
81
+ * @return the lower bound
82
+ */
83
+ double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
84
+
85
+ /**
86
+ * Returns the approximate lower error bound given a number of standard deviations.
87
+ * This parameter is similar to the number of standard deviations of the normal distribution
88
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
89
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
90
+ * @return the lower bound
91
+ */
81
92
  double get_lower_bound(uint8_t num_std_devs) const;
82
93
 
94
+
83
95
  /**
84
- * Returns the approximate upper error bound given a number of standard deviations.
85
- * This parameter is similar to the number of standard deviations of the normal distribution
86
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
87
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
88
- * @return the upper bound
89
- */
96
+ * Returns the approximate upper error bound given a number of standard deviations over an arbitrary number of
97
+ * items stored in the sketch.
98
+ * This parameter is similar to the number of standard deviations of the normal distribution
99
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
100
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
101
+ * @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
102
+ * @return the upper bound
103
+ */
104
+ double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
105
+
106
+
107
+ /**
108
+ * Returns the approximate upper error bound given a number of standard deviations.
109
+ * This parameter is similar to the number of standard deviations of the normal distribution
110
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
111
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
112
+ * @return the upper bound
113
+ */
90
114
  double get_upper_bound(uint8_t num_std_devs) const;
91
115
 
92
116
  /**
@@ -40,16 +40,28 @@ double tuple_sketch<S, A>::get_estimate() const {
40
40
  return get_num_retained() / get_theta();
41
41
  }
42
42
 
43
+ template<typename S, typename A>
44
+ double tuple_sketch<S, A>::get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const {
45
+ num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
46
+ if (!is_estimation_mode()) return num_subset_entries;
47
+ return binomial_bounds::get_lower_bound(num_subset_entries, get_theta(), num_std_devs);
48
+ }
49
+
43
50
  template<typename S, typename A>
44
51
  double tuple_sketch<S, A>::get_lower_bound(uint8_t num_std_devs) const {
45
- if (!is_estimation_mode()) return get_num_retained();
46
- return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
52
+ return get_lower_bound(num_std_devs, get_num_retained()) ;
53
+ }
54
+
55
+ template<typename S, typename A>
56
+ double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const {
57
+ num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
58
+ if (!is_estimation_mode()) return num_subset_entries;
59
+ return binomial_bounds::get_upper_bound(num_subset_entries, get_theta(), num_std_devs);
47
60
  }
48
61
 
49
62
  template<typename S, typename A>
50
63
  double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs) const {
51
- if (!is_estimation_mode()) return get_num_retained();
52
- return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
64
+ return get_upper_bound(num_std_devs, get_num_retained()) ;
53
65
  }
54
66
 
55
67
  template<typename S, typename A>
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(tuple_test)
19
19
 
20
- target_link_libraries(tuple_test tuple common_test)
20
+ target_link_libraries(tuple_test tuple common_test_lib)
21
21
 
22
22
  set_target_properties(tuple_test PROPERTIES
23
23
  CXX_STANDARD 11
@@ -45,4 +45,5 @@ target_sources(tuple_test
45
45
  tuple_a_not_b_test.cpp
46
46
  tuple_jaccard_similarity_test.cpp
47
47
  array_of_doubles_sketch_test.cpp
48
+ engagement_test.cpp
48
49
  )
@@ -0,0 +1,299 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <iostream>
21
+ #include <iomanip>
22
+ #include <set>
23
+ #include <catch2/catch.hpp>
24
+ #include <tuple_sketch.hpp>
25
+ #include <tuple_union.hpp>
26
+ #include <stdexcept>
27
+
28
+ template<typename T>
29
+ class max_value_policy {
30
+ public:
31
+ max_value_policy(const T& initial_value): initial_value(initial_value) {}
32
+ T create() const { return initial_value; }
33
+ void update(T& summary, const T& update) const { summary = std::max(summary, update); }
34
+ private:
35
+ T initial_value;
36
+ };
37
+
38
+ using max_float_update_tuple_sketch = datasketches::update_tuple_sketch<float, float, max_value_policy<float>>;
39
+
40
+ template<typename T>
41
+ class always_one_policy {
42
+ public:
43
+ always_one_policy(): initial_value(1) {}
44
+ T create() const { return 1; }
45
+ void update(T&, const T&) const { }
46
+ private:
47
+ T initial_value;
48
+ };
49
+ using always_one_tuple_sketch = datasketches::update_tuple_sketch<int, int, always_one_policy<int>> ;
50
+
51
+ template<typename T>
52
+ class update_sum_value_policy {
53
+ public:
54
+ update_sum_value_policy(): initial_value(0) {}
55
+ T create() const { return initial_value; }
56
+ void update(T& summary, const T& update) const { summary += update; }
57
+ private:
58
+ T initial_value;
59
+ };
60
+ using sum_update_tuple_sketch = datasketches::update_tuple_sketch<int, int, update_sum_value_policy<int>>;
61
+
62
+ template<typename Summary>
63
+ struct union_sum_value_policy {
64
+ void operator()(Summary& summary, const Summary& other) const {
65
+ summary += other;
66
+ }
67
+ };
68
+
69
+ using sum_union_tuple_sketch = datasketches::tuple_union<int, union_sum_value_policy<int>> ;
70
+
71
+
72
+ class EngagementTest{
73
+ public:
74
+ int num_std_dev = 2 ;
75
+ void test_always_one_update(){
76
+ /*
77
+ * Tests that updates into an update_tuple_sketch only keep a 1 in the column for stored values.
78
+ */
79
+ int lgK = 8 ;
80
+ std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
81
+
82
+ auto always_one_sketch = always_one_tuple_sketch::builder(always_one_policy<int>()).set_lg_k(lgK).build() ;
83
+
84
+ always_one_sketch.update(1, 1);
85
+ always_one_sketch.update(1, 2);
86
+ always_one_sketch.update(2, 1);
87
+ always_one_sketch.update(3, 3);
88
+ always_one_sketch.update(3, 7);
89
+
90
+ int num_retained = 0;
91
+ int sum = 0;
92
+ for (const auto& entry: always_one_sketch) {
93
+ sum += entry.second;
94
+ ++num_retained;
95
+ }
96
+ REQUIRE(num_retained == 3);
97
+ REQUIRE(sum == 3); // we only keep 1 for every stored key.
98
+ }
99
+
100
+ void test_sum_update_policy(){
101
+ /*
102
+ * Tests that updates into a sum_update_tuple_sketch sum the stored values on updates.
103
+ */
104
+ int lgK = 8 ;
105
+ auto sum_sketch = sum_update_tuple_sketch::builder().set_lg_k(lgK).build() ;
106
+
107
+ sum_sketch.update(1, 1);
108
+ sum_sketch.update(1, 2);
109
+ sum_sketch.update(2, 1);
110
+ sum_sketch.update(3, 3);
111
+ sum_sketch.update(3, 7);
112
+ int num_retained = 0;
113
+ int sum = 0;
114
+ for (const auto& entry: sum_sketch) {
115
+ sum += entry.second;
116
+ ++num_retained;
117
+ }
118
+ REQUIRE(num_retained == 3);
119
+ REQUIRE(sum == 14); // (1+2) + 1 + (3 + 7) = 14
120
+ }
121
+
122
+ void test_sum_union_policy(){
123
+ /*
124
+ * Tests that updates into two sketches of sum_update_tuple_sketch flavour, which have been unioned,
125
+ * cause the stored values of two of the same keys to be summed.
126
+ */
127
+ auto sketch1 = sum_update_tuple_sketch::builder().build() ;
128
+ auto sketch2 = sum_update_tuple_sketch::builder().build() ;
129
+
130
+ sketch1.update(1, 1);
131
+ sketch1.update(2, 1);
132
+ sketch1.update(3, 3);
133
+
134
+ sketch2.update(1, 2);
135
+ sketch2.update(2, 1);
136
+ sketch2.update(3, 7);
137
+
138
+ auto union_sketch = sum_union_tuple_sketch::builder().build() ;
139
+ union_sketch.update(sketch1) ;
140
+ union_sketch.update(sketch2) ;
141
+ auto union_result = union_sketch.get_result() ;
142
+
143
+ int num_retained = 0;
144
+ int sum = 0;
145
+ for (const auto& entry: union_result) {
146
+ sum += entry.second;
147
+ ++num_retained;
148
+ }
149
+ REQUIRE(num_retained == 3);
150
+ REQUIRE(sum == 15); // 1:(1+2) + 2:(1+1) + 3:(3+7) = 15
151
+ }
152
+
153
+ void compute_engagement_histogram(){
154
+ /*
155
+ * Computes and prints the estimated histogram from the synthetic data.
156
+ * On inspection one can verify this agrees with the
157
+ * https://github.com/apache/datasketches-java/blob/master/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
158
+ */
159
+ int lgK = 8 ;
160
+ const int days = 30 ;
161
+ int v = 0 ;
162
+ std::set<int> set_array[days];
163
+ std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
164
+
165
+
166
+ for(int i=0; i<days ; i++){
167
+ auto builder = always_one_tuple_sketch::builder(always_one_policy<int>()) ;
168
+ builder.set_lg_k(lgK) ;
169
+ auto sketch = builder.build() ;
170
+ sketch_array.push_back(sketch);
171
+ }
172
+ REQUIRE(sketch_array.size() == days) ;
173
+
174
+ for(int i=0; i<=days; i++){
175
+ int32_t num_ids = get_num_ids(days, i) ;
176
+ int32_t num_days = get_num_days(days, i) ;
177
+
178
+ int my_v = v++ ;
179
+ for(int d=0 ; d<num_days; d++){
180
+ for(int id = 0; id < num_ids; id++){
181
+ set_array[d].insert(my_v + id) ;
182
+ sketch_array[d].update(my_v + id, 1) ;
183
+ }
184
+ }
185
+ v += num_ids ;
186
+ }
187
+ union_ops(lgK, sketch_array) ;
188
+ }
189
+ private:
190
+ int32_t get_num_ids(int total_days, int index){
191
+ /*
192
+ * Generates power law distributed synthetic data
193
+ */
194
+ double d = total_days ;
195
+ double i = index ;
196
+ return int(round(exp(i * log(d) / d))) ;
197
+ }
198
+
199
+ int32_t get_num_days(int total_days, int index){
200
+ double d = total_days ;
201
+ double i = index ;
202
+ return int(round(exp( (d-i) * log(d) / d ))) ;
203
+ }
204
+
205
+ int32_t round_double_to_int(double x){
206
+ return int(std::round(x)) ;
207
+ }
208
+
209
+ void union_ops(int lgk, std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketches){
210
+ int num_sketches = sketches.size() ;
211
+ auto u = sum_union_tuple_sketch::builder().set_lg_k(lgk).build() ;
212
+
213
+ for(auto sk:sketches){
214
+ u.update(sk) ;
215
+ }
216
+ auto union_result = u.get_result() ;
217
+ std::vector<uint64_t> num_days_arr(num_sketches+1) ;
218
+
219
+ for (const auto& entry: union_result) {
220
+ int num_days_visited = entry.second ;
221
+ num_days_arr[num_days_visited]++;
222
+ }
223
+
224
+ int sum_visits = 0;
225
+ double theta = union_result.get_theta();
226
+ std::cout <<"\t\tEngagement Histogram.\t\t\t\n" ;
227
+ std::cout << "Number of Unique Visitors by Number of Days Visited" << std::endl ;
228
+ std::cout << "---------------------------------------------------" << std::endl ;
229
+
230
+ std::cout << std::setw(12) << "Days Visited"
231
+ << std::setw(12) << "Estimate"
232
+ << std::setw(12) << "LB"
233
+ << std::setw(12) << "UB"
234
+ << std:: endl ;
235
+
236
+ for (uint64_t i = 0; i < num_days_arr.size(); i++) {
237
+ int visitors_at_days_visited = num_days_arr[i] ;
238
+ if(visitors_at_days_visited == 0){ continue; }
239
+ sum_visits += visitors_at_days_visited * i ;
240
+
241
+ double est_visitors_at_days_visited = visitors_at_days_visited / theta ;
242
+ double lower_bound_at_days_visited = union_result.get_lower_bound(num_std_dev, visitors_at_days_visited);
243
+ double upper_bound_at_days_visited = union_result.get_upper_bound(num_std_dev, visitors_at_days_visited);
244
+
245
+ std::cout << std::setw(12) << i
246
+ << std::setw(12) << est_visitors_at_days_visited
247
+ << std::setw(12) << lower_bound_at_days_visited
248
+ << std::setw(12) << upper_bound_at_days_visited
249
+ << std:: endl ;
250
+
251
+ }
252
+ std::cout << std::endl << std::endl ;
253
+ std::cout << std::setw(12) << "Totals"
254
+ << std::setw(12) << "Estimate"
255
+ << std::setw(12) << "LB"
256
+ << std::setw(12) << "UB"
257
+ << std:: endl ;
258
+ std::cout << "---------------------------------------------------" << std::endl ;
259
+
260
+ const double total_visitors = union_result.get_estimate() ;
261
+ const double lb_visitors = union_result.get_lower_bound(num_std_dev) ;
262
+ const double ub_visitors = union_result.get_upper_bound(num_std_dev) ;
263
+
264
+
265
+ std::cout << std::setw(12) << "Visitors"
266
+ << std::setw(12) << total_visitors
267
+ << std::setw(12) << lb_visitors
268
+ << std::setw(12) << ub_visitors
269
+ << std:: endl ;
270
+
271
+ // The total number of visits, however, is a scaled metric and takes advantage of the fact that
272
+ // the retained entries in the sketch are a uniform random sample of all unique visitors, and
273
+ // the rest of the unique users will likely behave in the same way.
274
+ const double est_visits = sum_visits / theta;
275
+ const double lb_visits = est_visits * lb_visitors / total_visitors;
276
+ const double ub_visits = est_visits * ub_visitors / total_visitors;
277
+
278
+
279
+ std::cout << std::setw(12) << "Visits"
280
+ << std::setw(12) << est_visits
281
+ << std::setw(12) << lb_visits
282
+ << std::setw(12) << ub_visits
283
+ << std:: endl ;
284
+ }
285
+
286
+ };
287
+
288
+ namespace datasketches {
289
+
290
+ TEST_CASE("engagement", "[engagement]") {
291
+ EngagementTest E ;
292
+ E.test_always_one_update() ;
293
+ E.test_sum_update_policy() ;
294
+ E.test_sum_union_policy() ;
295
+ E.compute_engagement_histogram() ;
296
+ }
297
+
298
+
299
+ } /* namespace datasketches */
@@ -56,7 +56,13 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") {
56
56
  REQUIRE(!update_sketch.is_estimation_mode());
57
57
  REQUIRE(update_sketch.get_estimate() == 0);
58
58
  REQUIRE(update_sketch.get_lower_bound(1) == 0);
59
+ REQUIRE(update_sketch.get_lower_bound(1, 1) == 0);
60
+ REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 0);
61
+ REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()+1) == 0);
59
62
  REQUIRE(update_sketch.get_upper_bound(1) == 0);
63
+ REQUIRE(update_sketch.get_upper_bound(1, 1) == 0);
64
+ REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 0);
65
+ REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()+1) == 0);
60
66
  REQUIRE(update_sketch.get_theta() == 1);
61
67
  REQUIRE(update_sketch.get_num_retained() == 0);
62
68
  REQUIRE(update_sketch.is_ordered());
@@ -67,7 +73,11 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") {
67
73
  REQUIRE(!compact_sketch.is_estimation_mode());
68
74
  REQUIRE(compact_sketch.get_estimate() == 0);
69
75
  REQUIRE(compact_sketch.get_lower_bound(1) == 0);
76
+ REQUIRE(compact_sketch.get_lower_bound(1, 1) == 0);
77
+ REQUIRE(compact_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 0);
70
78
  REQUIRE(compact_sketch.get_upper_bound(1) == 0);
79
+ REQUIRE(compact_sketch.get_upper_bound(1, 1) == 0);
80
+ REQUIRE(compact_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 0);
71
81
  REQUIRE(compact_sketch.get_theta() == 1);
72
82
  REQUIRE(compact_sketch.get_num_retained() == 0);
73
83
  REQUIRE(compact_sketch.is_ordered());
@@ -110,7 +120,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
110
120
  REQUIRE_FALSE(update_sketch.is_estimation_mode());
111
121
  REQUIRE(update_sketch.get_estimate() == 2);
112
122
  REQUIRE(update_sketch.get_lower_bound(1) == 2);
123
+ REQUIRE(update_sketch.get_lower_bound(1, 1) == 1);
124
+ REQUIRE(update_sketch.get_lower_bound(1, update_sketch.get_num_retained()) == 2);
113
125
  REQUIRE(update_sketch.get_upper_bound(1) == 2);
126
+ REQUIRE(update_sketch.get_upper_bound(1, 1) == 1);
127
+ REQUIRE(update_sketch.get_upper_bound(1, update_sketch.get_num_retained()) == 2);
114
128
  REQUIRE(update_sketch.get_theta() == 1);
115
129
  REQUIRE(update_sketch.get_num_retained() == 2);
116
130
  REQUIRE_FALSE(update_sketch.is_ordered());
@@ -127,7 +141,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
127
141
  REQUIRE_FALSE(compact_sketch.is_estimation_mode());
128
142
  REQUIRE(compact_sketch.get_estimate() == 2);
129
143
  REQUIRE(compact_sketch.get_lower_bound(1) == 2);
144
+ REQUIRE(compact_sketch.get_lower_bound(1, 1) == 1);
145
+ REQUIRE(compact_sketch.get_lower_bound(1, compact_sketch.get_num_retained()) == 2);
130
146
  REQUIRE(compact_sketch.get_upper_bound(1) == 2);
147
+ REQUIRE(compact_sketch.get_upper_bound(1, 1) == 1);
148
+ REQUIRE(compact_sketch.get_upper_bound(1, compact_sketch.get_num_retained()) == 2);
131
149
  REQUIRE(compact_sketch.get_theta() == 1);
132
150
  REQUIRE(compact_sketch.get_num_retained() == 2);
133
151
  REQUIRE(compact_sketch.is_ordered());
@@ -146,7 +164,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
146
164
  REQUIRE(!deserialized_sketch.is_estimation_mode());
147
165
  REQUIRE(deserialized_sketch.get_estimate() == 2);
148
166
  REQUIRE(deserialized_sketch.get_lower_bound(1) == 2);
167
+ REQUIRE(deserialized_sketch.get_lower_bound(1, 1) == 1);
168
+ REQUIRE(deserialized_sketch.get_lower_bound(1, deserialized_sketch.get_num_retained()) == 2);
149
169
  REQUIRE(deserialized_sketch.get_upper_bound(1) == 2);
170
+ REQUIRE(deserialized_sketch.get_upper_bound(1, 1) == 1);
171
+ REQUIRE(deserialized_sketch.get_upper_bound(1, deserialized_sketch.get_num_retained()) == 2);
150
172
  REQUIRE(deserialized_sketch.get_theta() == 1);
151
173
  REQUIRE(deserialized_sketch.get_num_retained() == 2);
152
174
  REQUIRE(deserialized_sketch.is_ordered());
@@ -160,7 +182,11 @@ TEST_CASE("tuple sketch float: exact mode", "[tuple_sketch]") {
160
182
  REQUIRE(!deserialized_sketch.is_estimation_mode());
161
183
  REQUIRE(deserialized_sketch.get_estimate() == 2);
162
184
  REQUIRE(deserialized_sketch.get_lower_bound(1) == 2);
185
+ REQUIRE(deserialized_sketch.get_lower_bound(1, 1) == 1);
186
+ REQUIRE(deserialized_sketch.get_lower_bound(1, deserialized_sketch.get_num_retained()) == 2);
163
187
  REQUIRE(deserialized_sketch.get_upper_bound(1) == 2);
188
+ REQUIRE(deserialized_sketch.get_upper_bound(1, 1) == 1);
189
+ REQUIRE(deserialized_sketch.get_upper_bound(1, deserialized_sketch.get_num_retained()) == 2);
164
190
  REQUIRE(deserialized_sketch.get_theta() == 1);
165
191
  REQUIRE(deserialized_sketch.get_num_retained() == 2);
166
192
  REQUIRE(deserialized_sketch.is_ordered());
@@ -0,0 +1 @@
1
+ 4.0.0