datasketches 0.3.2 → 0.4.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -26,22 +26,12 @@
26
26
  #include <iterator>
27
27
  #include <vector>
28
28
 
29
-
30
- /**
31
- * This sketch samples data from a stream of items, designed for optimal (minimum) variance when
32
- * querying the sketch to estimate subset sums of items matchng a provided predicate. Variance
33
- * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
34
- * subset sum estimation.
35
- *
36
- * author Kevin Lang
37
- * author Jon Malkin
38
- */
39
29
  namespace datasketches {
40
30
 
41
31
  template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
42
32
  template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
43
33
 
44
- /**
34
+ /*
45
35
  * A struct to hold the result of subset sum queries
46
36
  */
47
37
  struct subset_summary {
@@ -53,11 +43,23 @@ struct subset_summary {
53
43
 
54
44
  template <typename T, typename A> class var_opt_union; // forward declaration
55
45
 
46
+ /// VarOpt sketch constants
56
47
  namespace var_opt_constants {
57
- const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
58
- const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
48
+ /// default resize factor
49
+ const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
50
+ /// maximum value of parameter K
51
+ const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
59
52
  }
60
53
 
54
+ /**
55
+ * This sketch samples data from a stream of items. Designed for optimal (minimum) variance when
56
+ * querying the sketch to estimate subset sums of items matching a provided predicate. Variance
57
+ * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
58
+ * subset sum estimation.
59
+ *
60
+ * author Kevin Lang
61
+ * author Jon Malkin
62
+ */
61
63
  template<
62
64
  typename T,
63
65
  typename A = std::allocator<T>
@@ -68,15 +70,42 @@ class var_opt_sketch {
68
70
  static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
69
71
  static const uint32_t MAX_K = var_opt_constants::MAX_K;
70
72
 
73
+ /**
74
+ * Constructor
75
+ * @param k sketch size
76
+ * @param rf resize factor
77
+ * @param allocator instance of an allocator
78
+ */
71
79
  explicit var_opt_sketch(uint32_t k,
72
80
  resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR,
73
81
  const A& allocator = A());
82
+
83
+ /**
84
+ * Copy constructor
85
+ * @param other sketch to be copied
86
+ */
74
87
  var_opt_sketch(const var_opt_sketch& other);
88
+
89
+ /**
90
+ * Move constructor
91
+ * @param other sketch to be moved
92
+ */
75
93
  var_opt_sketch(var_opt_sketch&& other) noexcept;
76
94
 
77
95
  ~var_opt_sketch();
78
96
 
97
+ /**
98
+ * Copy assignment
99
+ * @param other sketch to be copied
100
+ * @return reference to this sketch
101
+ */
79
102
  var_opt_sketch& operator=(const var_opt_sketch& other);
103
+
104
+ /**
105
+ * Move assignment
106
+ * @param other sketch to be moved
107
+ * @return reference to this sketch
108
+ */
80
109
  var_opt_sketch& operator=(var_opt_sketch&& other);
81
110
 
82
111
  /**
@@ -85,7 +114,7 @@ class var_opt_sketch {
85
114
  * @param item an item from a stream of items
86
115
  * @param weight the weight of the item
87
116
  */
88
- void update(const T& item, double weight=1.0);
117
+ void update(const T& item, double weight = 1.0);
89
118
 
90
119
  /**
91
120
  * Updates this sketch with the given data item with the given weight.
@@ -93,7 +122,7 @@ class var_opt_sketch {
93
122
  * @param item an item from a stream of items
94
123
  * @param weight the weight of the item
95
124
  */
96
- void update(T&& item, double weight=1.0);
125
+ void update(T&& item, double weight = 1.0);
97
126
 
98
127
  /**
99
128
  * Returns the configured maximum sample size.
@@ -117,7 +146,7 @@ class var_opt_sketch {
117
146
  * Computes an estimated subset sum from the entire stream for objects matching a given
118
147
  * predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
119
148
  * deviations. This is technically a heuristic method and tries to err on the conservative side.
120
- * @param P a predicate function
149
+ * @param predicate a predicate function
121
150
  * @return a subset_summary item with estimate, upper and lower bounds,
122
151
  * and total sketch weight
123
152
  */
@@ -138,7 +167,7 @@ class var_opt_sketch {
138
167
  /**
139
168
  * Computes size needed to serialize the current state of the sketch.
140
169
  * This version is for fixed-size arithmetic types (integral and floating point).
141
- * @param instance of a SerDe
170
+ * @param sd instance of a SerDe
142
171
  * @return size in bytes needed to serialize this sketch
143
172
  */
144
173
  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
@@ -147,7 +176,7 @@ class var_opt_sketch {
147
176
  /**
148
177
  * Computes size needed to serialize the current state of the sketch.
149
178
  * This version is for all other types and can be expensive since every item needs to be looked at.
150
- * @param instance of a SerDe
179
+ * @param sd instance of a SerDe
151
180
  * @return size in bytes needed to serialize this sketch
152
181
  */
153
182
  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
@@ -155,7 +184,7 @@ class var_opt_sketch {
155
184
 
156
185
  // This is a convenience alias for users
157
186
  // The type returned by the following serialize method
158
- typedef vector_u8<A> vector_bytes;
187
+ using vector_bytes = vector_u8<A>;
159
188
 
160
189
  /**
161
190
  * This method serializes the sketch as a vector of bytes.
@@ -163,7 +192,7 @@ class var_opt_sketch {
163
192
  * It is a blank space of a given size.
164
193
  * This header is used in Datasketches PostgreSQL extension.
165
194
  * @param header_size_bytes space to reserve in front of the sketch
166
- * @param instance of a SerDe
195
+ * @param sd instance of a SerDe
167
196
  */
168
197
  template<typename SerDe = serde<T>>
169
198
  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
@@ -171,7 +200,7 @@ class var_opt_sketch {
171
200
  /**
172
201
  * This method serializes the sketch into a given stream in a binary form
173
202
  * @param os output stream
174
- * @param instance of a SerDe
203
+ * @param sd instance of a SerDe
175
204
  */
176
205
  template<typename SerDe = serde<T>>
177
206
  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -179,8 +208,8 @@ class var_opt_sketch {
179
208
  /**
180
209
  * This method deserializes a sketch from a given stream.
181
210
  * @param is input stream
182
- * @param instance of a SerDe
183
- * @param instance of an Allocator
211
+ * @param sd instance of a SerDe
212
+ * @param allocator instance of an allocator
184
213
  * @return an instance of a sketch
185
214
  */
186
215
  template<typename SerDe = serde<T>>
@@ -190,8 +219,8 @@ class var_opt_sketch {
190
219
  * This method deserializes a sketch from a given array of bytes.
191
220
  * @param bytes pointer to the array of bytes
192
221
  * @param size the size of the array
193
- * @param instance of a SerDe
194
- * @param instance of an Allocator
222
+ * @param sd instance of a SerDe
223
+ * @param allocator instance of an allocator
195
224
  * @return an instance of a sketch
196
225
  */
197
226
  template<typename SerDe = serde<T>>
@@ -205,7 +234,8 @@ class var_opt_sketch {
205
234
 
206
235
  /**
207
236
  * Prints the raw sketch items to a string. Calls items_to_stream() internally.
208
- * Only works for type T with a defined operator<<() and
237
+ * Only works for type T with a defined
238
+ * std::ostream& operator<<(std::ostream&, const T&) and
209
239
  * kept separate from to_string() to allow compilation even if
210
240
  * T does not have such an operator defined.
211
241
  * @return a string with the sketch items
@@ -213,7 +243,20 @@ class var_opt_sketch {
213
243
  string<A> items_to_string() const;
214
244
 
215
245
  class const_iterator;
246
+
247
+ /**
248
+ * Iterator pointing to the first item in the sketch.
249
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
250
+ * @return iterator pointing to the first item in the sketch
251
+ */
216
252
  const_iterator begin() const;
253
+
254
+ /**
255
+ * Iterator pointing to the past-the-end item in the sketch.
256
+ * The past-the-end item is the hypothetical item that would follow the last item.
257
+ * It does not point to any item, and must not be dereferenced or incremented.
258
+ * @return iterator pointing to the past-the-end item in the sketch
259
+ */
217
260
  const_iterator end() const;
218
261
 
219
262
  private:
@@ -36,7 +36,7 @@
36
36
 
37
37
  namespace datasketches {
38
38
 
39
- /**
39
+ /*
40
40
  * Implementation code for the VarOpt sketch.
41
41
  *
42
42
  * author Kevin Lang
@@ -895,7 +895,7 @@ void var_opt_sketch<T, A>::update_heavy_r_eq1(O&& item, double weight, bool mark
895
895
  grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
896
896
  }
897
897
 
898
- /**
898
+ /*
899
899
  * Decreases sketch's value of k by 1, updating stored values as needed.
900
900
  *
901
901
  * <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
@@ -1685,7 +1685,7 @@ bool var_opt_sketch<T, A>::iterator::get_mark() const {
1685
1685
  return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1686
1686
  }
1687
1687
 
1688
- /**
1688
+ /*
1689
1689
  * Checks if target sampling allocation is more than 50% of max sampling size.
1690
1690
  * If so, returns max sampling size, otherwise passes through target size.
1691
1691
  */
@@ -52,7 +52,6 @@ template<
52
52
  class var_opt_union {
53
53
 
54
54
  public:
55
- static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
56
55
 
57
56
  explicit var_opt_union(uint32_t max_k, const A& allocator = A());
58
57
  var_opt_union(const var_opt_union& other);
@@ -91,7 +90,7 @@ public:
91
90
  /**
92
91
  * Computes size needed to serialize the current state of the union.
93
92
  * This version is for all other types and can be expensive since every item needs to be looked at.
94
- * @param instance of a SerDe
93
+ * @param sd instance of a SerDe
95
94
  * @return size in bytes needed to serialize this sketch
96
95
  */
97
96
  template<typename SerDe = serde<T>>
@@ -108,7 +107,7 @@ public:
108
107
  * It is a blank space of a given size.
109
108
  * This header is used in Datasketches PostgreSQL extension.
110
109
  * @param header_size_bytes space to reserve in front of the sketch
111
- * @param instance of a SerDe
110
+ * @param sd instance of a SerDe
112
111
  */
113
112
  template<typename SerDe = serde<T>>
114
113
  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
@@ -117,7 +116,7 @@ public:
117
116
  * NOTE: This method may be deprecated in a future version.
118
117
  * This method serializes the sketch into a given stream in a binary form
119
118
  * @param os output stream
120
- * @param instance of a SerDe
119
+ * @param sd instance of a SerDe
121
120
  */
122
121
  template<typename SerDe = serde<T>>
123
122
  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -126,8 +125,8 @@ public:
126
125
  * NOTE: This method may be deprecated in a future version.
127
126
  * This method deserializes a union from a given stream.
128
127
  * @param is input stream
129
- * @param instance of a SerDe
130
- * @param instance of an Allocator
128
+ * @param sd instance of a SerDe
129
+ * @param allocator instance of an Allocator
131
130
  * @return an instance of a union
132
131
  */
133
132
  template<typename SerDe = serde<T>>
@@ -138,8 +137,8 @@ public:
138
137
  * This method deserializes a union from a given array of bytes.
139
138
  * @param bytes pointer to the array of bytes
140
139
  * @param size the size of the array
141
- * @param instance of a SerDe
142
- * @param instance of an Allocator
140
+ * @param sd instance of a SerDe
141
+ * @param allocator instance of an Allocator
143
142
  * @return an instance of a union
144
143
  */
145
144
  template<typename SerDe = serde<T>>
@@ -152,9 +151,9 @@ public:
152
151
  string<A> to_string() const;
153
152
 
154
153
  private:
155
- typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
156
- typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
157
- typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
154
+ using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
155
+ using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
156
+ using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
158
157
 
159
158
  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
160
159
  static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
@@ -146,7 +146,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
146
146
  check_preamble_longs(preamble_longs, flags);
147
147
  check_family_and_serialization_version(family_id, serial_version);
148
148
 
149
- if (max_k == 0 || max_k > MAX_K) {
149
+ if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
150
150
  throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
151
151
  }
152
152
 
@@ -190,7 +190,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
190
190
  check_preamble_longs(preamble_longs, flags);
191
191
  check_family_and_serialization_version(family_id, serial_version);
192
192
 
193
- if (max_k == 0 || max_k > MAX_K) {
193
+ if (max_k == 0 || max_k > var_opt_constants::MAX_K) {
194
194
  throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
195
195
  }
196
196
 
@@ -448,7 +448,7 @@ var_opt_sketch<T, A> var_opt_union<T, A>::get_result() const {
448
448
 
449
449
  /**
450
450
  * When there are no marked items in H, the gadget is mathematically equivalent to a valid
451
- * varopt sketch. This method simply returns a copy (without perserving marks).
451
+ * varopt sketch. This method simply returns a copy (without preserving marks).
452
452
  *
453
453
  * @return A shallow copy of the gadget as a valid varopt sketch
454
454
  */
@@ -549,7 +549,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
549
549
 
550
550
  if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
551
551
  if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
552
- throw std::logic_error("uexpected mismatch in transferred weight");
552
+ throw std::logic_error("unexpected mismatch in transferred weight");
553
553
  }
554
554
 
555
555
  const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
@@ -15,30 +15,77 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- add_executable(sampling_test)
18
+ # separate executables for var_opt and ebpps sampling
19
19
 
20
- target_link_libraries(sampling_test sampling common_test_lib)
20
+ # VAR OPT SAMPLING
21
+ add_executable(var_opt_sampling_test)
21
22
 
22
- set_target_properties(sampling_test PROPERTIES
23
- CXX_STANDARD 11
23
+ target_link_libraries(var_opt_sampling_test sampling common_test_lib)
24
+
25
+ set_target_properties(var_opt_sampling_test PROPERTIES
24
26
  CXX_STANDARD_REQUIRED YES
25
27
  )
26
28
 
27
29
  file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" SAMPLING_TEST_BINARY_PATH)
28
30
  string(APPEND SAMPLING_TEST_BINARY_PATH "/")
29
- target_compile_definitions(sampling_test
31
+ target_compile_definitions(var_opt_sampling_test
30
32
  PRIVATE
31
33
  TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
32
34
  )
33
35
 
34
36
  add_test(
35
- NAME sampling_test
36
- COMMAND sampling_test
37
+ NAME var_opt_sampling_test
38
+ COMMAND var_opt_sampling_test
37
39
  )
38
40
 
39
- target_sources(sampling_test
41
+ target_sources(var_opt_sampling_test
40
42
  PRIVATE
41
43
  var_opt_sketch_test.cpp
42
44
  var_opt_union_test.cpp
43
45
  var_opt_allocation_test.cpp
44
46
  )
47
+
48
+
49
+ # EBPPS SAMPLING
50
+ add_executable(ebpps_sampling_test)
51
+
52
+ target_link_libraries(ebpps_sampling_test sampling common_test_lib)
53
+
54
+ set_target_properties(ebpps_sampling_test PROPERTIES
55
+ CXX_STANDARD_REQUIRED YES
56
+ )
57
+
58
+ target_compile_definitions(ebpps_sampling_test
59
+ PRIVATE
60
+ TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
61
+ )
62
+
63
+ add_test(
64
+ NAME ebpps_sampling_test
65
+ COMMAND ebpps_sampling_test
66
+ )
67
+
68
+ target_sources(ebpps_sampling_test
69
+ PRIVATE
70
+ ebpps_sample_test.cpp
71
+ ebpps_sketch_test.cpp
72
+ ebpps_allocation_test.cpp
73
+ )
74
+
75
+
76
+ # Compatibility
77
+ if (SERDE_COMPAT)
78
+ target_sources(var_opt_sampling_test
79
+ PRIVATE
80
+ var_opt_sketch_deserialize_from_java_test.cpp
81
+ var_opt_union_deserialize_from_java_test.cpp
82
+ )
83
+ endif()
84
+
85
+ if (GENERATE)
86
+ target_sources(var_opt_sampling_test
87
+ PRIVATE
88
+ var_opt_sketch_serialize_for_java.cpp
89
+ var_opt_union_serialize_for_java.cpp
90
+ )
91
+ endif()
@@ -0,0 +1,96 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <ebpps_sketch.hpp>
21
+ #include <test_type.hpp>
22
+ #include <test_allocator.hpp>
23
+
24
+ #include <catch2/catch.hpp>
25
+
26
+ #include <sstream>
27
+
28
+ namespace datasketches {
29
+
30
+ using ebpps_test_sketch = ebpps_sketch<test_type, test_allocator<test_type>>;
31
+ using alloc = test_allocator<test_type>;
32
+
33
+ TEST_CASE("ebpps allocation test", "[ebpps_sketch][test_type]") {
34
+ test_allocator_total_bytes = 0;
35
+ test_allocator_net_allocations = 0;
36
+ {
37
+ ebpps_test_sketch sk1(10, 0);
38
+ for (int i = 0; i < 100; ++i)
39
+ sk1.update(i);
40
+ auto bytes1 = sk1.serialize(0, test_type_serde());
41
+ auto sk2 = ebpps_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
42
+
43
+ std::stringstream ss;
44
+ sk1.serialize(ss, test_type_serde());
45
+ auto sk3 = ebpps_test_sketch::deserialize(ss, test_type_serde(), alloc(0));
46
+
47
+ sk1.merge(sk2); // same size into sk1
48
+ sk3.merge(sk1); // larger into sk3
49
+
50
+ auto bytes2 = sk1.serialize(0, test_type_serde());
51
+ auto sk4 = ebpps_test_sketch::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
52
+ }
53
+ REQUIRE(test_allocator_total_bytes == 0);
54
+ REQUIRE(test_allocator_net_allocations == 0);
55
+ }
56
+
57
+ TEST_CASE( "ebpps merge", "[ebpps_sketch][test_type]") {
58
+ test_allocator_total_bytes = 0;
59
+ test_allocator_net_allocations = 0;
60
+ {
61
+ uint32_t n = 20;
62
+ uint32_t k = 5;
63
+ ebpps_test_sketch sk1(k, 0);
64
+ ebpps_test_sketch sk2(k, 0);
65
+
66
+ // move updates
67
+ for (int i = 0; i < (int) n; ++i) {
68
+ sk1.update(i);
69
+ sk2.update(-i);
70
+ sk1.update(n + i); // sk1 heavier than sk2
71
+ }
72
+ REQUIRE(sk1.get_n() == 2 * n);
73
+ REQUIRE(sk2.get_n() == n);
74
+
75
+ // move merge -- lighter into heavier
76
+ sk1.merge(std::move(sk2));
77
+ REQUIRE(sk1.get_n() == 3 * n);
78
+
79
+ // move constructor
80
+ ebpps_test_sketch sk3(std::move(sk1));
81
+ REQUIRE(sk3.get_n() == 3 * n);
82
+
83
+ // move assignment
84
+ ebpps_test_sketch sk4(k, 0);
85
+ sk4 = std::move(sk2);
86
+ REQUIRE(sk4.get_n() == n);
87
+
88
+ // copy merge (lvalue argument) -- heavier into lighter
89
+ sk4.merge(sk3);
90
+ REQUIRE(sk4.get_n() == 4 * n);
91
+ }
92
+ REQUIRE(test_allocator_total_bytes == 0);
93
+ REQUIRE(test_allocator_net_allocations == 0);
94
+ }
95
+
96
+ }
@@ -0,0 +1,137 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <ebpps_sample.hpp>
21
+
22
+ #include <catch2/catch.hpp>
23
+
24
+ #include <vector>
25
+ #include <string>
26
+ #include <sstream>
27
+ #include <fstream>
28
+ #include <cmath>
29
+ #include <random>
30
+ #include <stdexcept>
31
+
32
+ namespace datasketches {
33
+
34
+ static constexpr double EPS = 1e-15;
35
+
36
+ TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
37
+ ebpps_sample<int> sample = ebpps_sample<int>(0);
38
+ REQUIRE(sample.get_c() == 0.0);
39
+ REQUIRE(sample.get_num_retained_items() == 0);
40
+ REQUIRE(sample.get_sample().size() == 0);
41
+ }
42
+
43
+ TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
44
+ double theta = 1.0;
45
+ ebpps_sample<int> sample = ebpps_sample<int>(-1, theta);
46
+ REQUIRE(sample.get_c() == theta);
47
+ REQUIRE(sample.get_num_retained_items() == 1);
48
+ REQUIRE(sample.get_sample().size() == 1);
49
+ REQUIRE(sample.has_partial_item() == false);
50
+
51
+ theta = 1e-300;
52
+ sample = ebpps_sample<int>(-1, theta);
53
+ REQUIRE(sample.get_c() == theta);
54
+ REQUIRE(sample.get_num_retained_items() == 1);
55
+ REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
56
+ REQUIRE(sample.has_partial_item());
57
+ }
58
+
59
+ TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
60
+ ebpps_sample<char> sample = ebpps_sample<char>('a', 1.0);
61
+
62
+ sample.downsample(2.0); // no-op
63
+ REQUIRE(sample.get_c() == 1.0);
64
+ REQUIRE(sample.get_num_retained_items() == 1);
65
+ REQUIRE(sample.has_partial_item() == false);
66
+
67
+ // downsample and result in an empty sample
68
+ random_utils::override_seed(12);
69
+ std::vector<char> items = {'a', 'b'};
70
+ optional<char> opt; // empty
71
+ sample = ebpps_sample<char>(std::move(items), std::move(opt), 1.8);
72
+ sample.downsample(0.5);
73
+ REQUIRE(sample.get_c() == 0.9);
74
+ REQUIRE(sample.get_num_retained_items() == 0);
75
+ REQUIRE(sample.has_partial_item() == false);
76
+
77
+ // downsample and result in a sample with a partial item
78
+ items = {'a', 'b'};
79
+ opt.reset();
80
+ sample = ebpps_sample<char>(std::move(items), std::move(opt), 1.5);
81
+ sample.downsample(0.5);
82
+ REQUIRE(sample.get_c() == 0.75);
83
+ REQUIRE(sample.get_num_retained_items() == 1);
84
+ REQUIRE(sample.has_partial_item() == true);
85
+ for (char c : sample) {
86
+ REQUIRE((c == 'a' || c == 'b'));
87
+ }
88
+
89
+ // downsample to an exact integer c (7.5 * 0.8 = 6.0)
90
+ items = {'a', 'b', 'c', 'd', 'e', 'f', 'g'};
91
+ opt.emplace('h');
92
+ auto ref_items = items; // copy to check contents
93
+ ref_items.emplace_back('h'); // include partial item
94
+ sample = ebpps_sample<char>(std::move(items), std::move(opt), 7.5);
95
+ sample.downsample(0.8);
96
+ REQUIRE(sample.get_c() == 6.0);
97
+ REQUIRE(sample.get_num_retained_items() == 6);
98
+ REQUIRE(sample.has_partial_item() == false);
99
+ for (char c : sample) {
100
+ REQUIRE(std::find(ref_items.begin(), ref_items.end(), c) != ref_items.end());
101
+ }
102
+
103
+ // downsample to c > 1 with partial item
104
+ items = ref_items; // includes previous optional item
105
+ opt.emplace('i');
106
+ sample = ebpps_sample<char>(std::move(items), std::move(opt), 8.5);
107
+ REQUIRE(sample.get_partial_item() == 'i');
108
+ sample.downsample(0.8);
109
+ REQUIRE(sample.get_c() == Approx(6.8).margin(EPS));
110
+ REQUIRE(sample.get_num_retained_items() == 7);
111
+ REQUIRE(sample.has_partial_item() == true);
112
+ ref_items.emplace_back('i');
113
+ for (char c : sample) {
114
+ REQUIRE(std::find(ref_items.begin(), ref_items.end(), c) != ref_items.end());
115
+ }
116
+
117
+ random_utils::override_seed(random_utils::rd());
118
+ }
119
+
120
+ TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
121
+ uint32_t k = 8;
122
+ ebpps_sample<int> sample = ebpps_sample<int>(k);
123
+
124
+ for (uint32_t i = 1; i <= k; ++i) {
125
+ ebpps_sample<int> s = ebpps_sample<int>(i, 1.0);
126
+ sample.merge(s);
127
+ REQUIRE(sample.get_c() == static_cast<double>(i));
128
+ REQUIRE(sample.get_num_retained_items() == i);
129
+ }
130
+
131
+ sample.reset();
132
+ REQUIRE(sample.get_c() == 0);
133
+ REQUIRE(sample.get_num_retained_items() == 0);
134
+ REQUIRE(sample.has_partial_item() == false);
135
+ }
136
+
137
+ } // namespace datasketches