datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -37,7 +37,7 @@ kll_sketch<T, C, A>::kll_sketch(uint16_t k, const C& comparator, const A& alloca
37
37
  comparator_(comparator),
38
38
  allocator_(allocator),
39
39
  k_(k),
40
- m_(DEFAULT_M),
40
+ m_(kll_constants::DEFAULT_M),
41
41
  min_k_(k),
42
42
  num_levels_(1),
43
43
  is_level_zero_sorted_(false),
@@ -45,12 +45,13 @@ n_(0),
45
45
  levels_(2, 0, allocator),
46
46
  items_(nullptr),
47
47
  items_size_(k_),
48
- min_item_(nullptr),
49
- max_item_(nullptr),
48
+ min_item_(),
49
+ max_item_(),
50
50
  sorted_view_(nullptr)
51
51
  {
52
- if (k < MIN_K || k > MAX_K) {
53
- throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= " + std::to_string(MAX_K) + ": " + std::to_string(k));
52
+ if (k < kll_constants::MIN_K || k > kll_constants::MAX_K) {
53
+ throw std::invalid_argument("K must be >= " + std::to_string(kll_constants::MIN_K) + " and <= "
54
+ + std::to_string(kll_constants::MAX_K) + ": " + std::to_string(k));
54
55
  }
55
56
  levels_[0] = levels_[1] = k;
56
57
  items_ = allocator_.allocate(items_size_);
@@ -69,14 +70,12 @@ n_(other.n_),
69
70
  levels_(other.levels_),
70
71
  items_(nullptr),
71
72
  items_size_(other.items_size_),
72
- min_item_(nullptr),
73
- max_item_(nullptr),
73
+ min_item_(other.min_item_),
74
+ max_item_(other.max_item_),
74
75
  sorted_view_(nullptr)
75
76
  {
76
77
  items_ = allocator_.allocate(items_size_);
77
78
  for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
78
- if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
79
- if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
80
79
  }
81
80
 
82
81
  template<typename T, typename C, typename A>
@@ -92,13 +91,11 @@ n_(other.n_),
92
91
  levels_(std::move(other.levels_)),
93
92
  items_(other.items_),
94
93
  items_size_(other.items_size_),
95
- min_item_(other.min_item_),
96
- max_item_(other.max_item_),
94
+ min_item_(std::move(other.min_item_)),
95
+ max_item_(std::move(other.max_item_)),
97
96
  sorted_view_(nullptr)
98
97
  {
99
98
  other.items_ = nullptr;
100
- other.min_item_ = nullptr;
101
- other.max_item_ = nullptr;
102
99
  }
103
100
 
104
101
  template<typename T, typename C, typename A>
@@ -148,14 +145,6 @@ kll_sketch<T, C, A>::~kll_sketch() {
148
145
  for (uint32_t i = begin; i < end; i++) items_[i].~T();
149
146
  allocator_.deallocate(items_, items_size_);
150
147
  }
151
- if (min_item_ != nullptr) {
152
- min_item_->~T();
153
- allocator_.deallocate(min_item_, 1);
154
- }
155
- if (max_item_ != nullptr) {
156
- max_item_->~T();
157
- allocator_.deallocate(max_item_, 1);
158
- }
159
148
  reset_sorted_view();
160
149
  }
161
150
 
@@ -173,8 +162,8 @@ n_(other.n_),
173
162
  levels_(other.levels_, allocator_),
174
163
  items_(nullptr),
175
164
  items_size_(other.items_size_),
176
- min_item_(nullptr),
177
- max_item_(nullptr),
165
+ min_item_(other.min_item_),
166
+ max_item_(other.max_item_),
178
167
  sorted_view_(nullptr)
179
168
  {
180
169
  static_assert(
@@ -183,8 +172,6 @@ sorted_view_(nullptr)
183
172
  );
184
173
  items_ = allocator_.allocate(items_size_);
185
174
  for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
186
- if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
187
- if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
188
175
  check_sorting();
189
176
  }
190
177
 
@@ -192,7 +179,7 @@ template<typename T, typename C, typename A>
192
179
  template<typename FwdT>
193
180
  void kll_sketch<T, C, A>::update(FwdT&& item) {
194
181
  if (!check_update_item(item)) { return; }
195
- update_min_max(item);
182
+ update_min_max(static_cast<const T&>(item)); // min and max are always copies
196
183
  const uint32_t index = internal_update();
197
184
  new (&items_[index]) T(std::forward<FwdT>(item));
198
185
  reset_sorted_view();
@@ -201,8 +188,8 @@ void kll_sketch<T, C, A>::update(FwdT&& item) {
201
188
  template<typename T, typename C, typename A>
202
189
  void kll_sketch<T, C, A>::update_min_max(const T& item) {
203
190
  if (is_empty()) {
204
- min_item_ = new (allocator_.allocate(1)) T(item);
205
- max_item_ = new (allocator_.allocate(1)) T(item);
191
+ min_item_.emplace(item);
192
+ max_item_.emplace(item);
206
193
  } else {
207
194
  if (comparator_(item, *min_item_)) *min_item_ = item;
208
195
  if (comparator_(*max_item_, item)) *max_item_ = item;
@@ -225,8 +212,8 @@ void kll_sketch<T, C, A>::merge(FwdSk&& other) {
225
212
  throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
226
213
  }
227
214
  if (is_empty()) {
228
- min_item_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_item_));
229
- max_item_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_item_));
215
+ min_item_.emplace(conditional_forward<FwdSk>(*other.min_item_));
216
+ max_item_.emplace(conditional_forward<FwdSk>(*other.max_item_));
230
217
  } else {
231
218
  if (comparator_(*other.min_item_, *min_item_)) *min_item_ = conditional_forward<FwdSk>(*other.min_item_);
232
219
  if (comparator_(*max_item_, *other.max_item_)) *max_item_ = conditional_forward<FwdSk>(*other.max_item_);
@@ -322,42 +309,6 @@ auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> qua
322
309
  return sorted_view_->get_quantile(rank, inclusive);
323
310
  }
324
311
 
325
- template<typename T, typename C, typename A>
326
- std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size, bool inclusive) const {
327
- if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
328
- std::vector<T, A> quantiles(allocator_);
329
- quantiles.reserve(size);
330
-
331
- // may have a side effect of sorting level zero if needed
332
- setup_sorted_view();
333
-
334
- for (uint32_t i = 0; i < size; i++) {
335
- const double rank = ranks[i];
336
- if ((rank < 0.0) || (rank > 1.0)) {
337
- throw std::invalid_argument("normalized rank cannot be less than 0 or greater than 1");
338
- }
339
- quantiles.push_back(sorted_view_->get_quantile(rank, inclusive));
340
- }
341
- return quantiles;
342
- }
343
-
344
- template<typename T, typename C, typename A>
345
- std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(uint32_t num, bool inclusive) const {
346
- if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
347
- if (num == 0) {
348
- throw std::invalid_argument("num must be > 0");
349
- }
350
- vector_double ranks(num, 0, allocator_);
351
- ranks[0] = 0.0;
352
- for (size_t i = 1; i < num; i++) {
353
- ranks[i] = static_cast<double>(i) / (num - 1);
354
- }
355
- if (num > 1) {
356
- ranks[num - 1] = 1.0;
357
- }
358
- return get_quantiles(ranks.data(), num, inclusive);
359
- }
360
-
361
312
  template<typename T, typename C, typename A>
362
313
  double kll_sketch<T, C, A>::get_normalized_rank_error(bool pmf) const {
363
314
  return get_normalized_rank_error(min_k_, pmf);
@@ -396,7 +347,7 @@ template<typename T, typename C, typename A>
396
347
  template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
397
348
  size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
398
349
  const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
399
- const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
350
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
400
351
  // the last integer in the levels_ array is not serialized because it can be derived
401
352
  return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
402
353
  }
@@ -406,7 +357,7 @@ template<typename T, typename C, typename A>
406
357
  template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
407
358
  size_t kll_sketch<T, C, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
408
359
  const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
409
- const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
360
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, kll_constants::DEFAULT_M, num_levels);
410
361
  // the last integer in the levels_ array is not serialized because it can be derived
411
362
  return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
412
363
  }
@@ -438,8 +389,8 @@ void kll_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& sd) const {
438
389
  write(os, num_levels_);
439
390
  write(os, unused);
440
391
  write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
441
- sd.serialize(os, min_item_, 1);
442
- sd.serialize(os, max_item_, 1);
392
+ sd.serialize(os, &*min_item_, 1);
393
+ sd.serialize(os, &*max_item_, 1);
443
394
  }
444
395
  sd.serialize(os, &items_[levels_[0]], get_num_retained());
445
396
  }
@@ -474,8 +425,8 @@ auto kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& sd)
474
425
  ptr += copy_to_mem(num_levels_, ptr);
475
426
  ptr += sizeof(uint8_t); // unused
476
427
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
477
- ptr += sd.serialize(ptr, end_ptr - ptr, min_item_, 1);
478
- ptr += sd.serialize(ptr, end_ptr - ptr, max_item_, 1);
428
+ ptr += sd.serialize(ptr, end_ptr - ptr, &*min_item_, 1);
429
+ ptr += sd.serialize(ptr, end_ptr - ptr, &*max_item_, 1);
479
430
  }
480
431
  const size_t bytes_remaining = end_ptr - ptr;
481
432
  ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
@@ -530,20 +481,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
530
481
  read(is, levels.data(), sizeof(levels[0]) * num_levels);
531
482
  }
532
483
  levels[num_levels] = capacity;
533
- A alloc(allocator);
534
- auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
535
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_item_buffer(alloc.allocate(1), item_buffer_deleter);
536
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
537
- std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
538
- std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
484
+ optional<T> tmp; // space to deserialize min and max
485
+ optional<T> min_item;
486
+ optional<T> max_item;
539
487
  if (!is_single_item) {
540
- sd.deserialize(is, min_item_buffer.get(), 1);
541
- // serde call did not throw, repackage with destrtuctor
542
- min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
543
- sd.deserialize(is, max_item_buffer.get(), 1);
544
- // serde call did not throw, repackage with destrtuctor
545
- max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
488
+ sd.deserialize(is, &*tmp, 1);
489
+ // serde call did not throw, repackage and cleanup
490
+ min_item.emplace(*tmp);
491
+ (*tmp).~T();
492
+ sd.deserialize(is, &*tmp, 1);
493
+ // serde call did not throw, repackage and cleanup
494
+ max_item.emplace(*tmp);
495
+ (*tmp).~T();
546
496
  }
497
+ A alloc(allocator);
547
498
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
548
499
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
549
500
  const auto num_items = levels[num_levels] - levels[0];
@@ -552,12 +503,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
552
503
  std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
553
504
  const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
554
505
  if (is_single_item) {
555
- new (min_item_buffer.get()) T(items.get()[levels[0]]);
556
- // copy did not throw, repackage with destrtuctor
557
- min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
558
- new (max_item_buffer.get()) T(items.get()[levels[0]]);
559
- // copy did not throw, repackage with destrtuctor
560
- max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
506
+ min_item.emplace(items.get()[levels[0]]);
507
+ max_item.emplace(items.get()[levels[0]]);
561
508
  }
562
509
  if (!is.good())
563
510
  throw std::runtime_error("error reading from std::istream");
@@ -618,20 +565,20 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
618
565
  ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
619
566
  }
620
567
  levels[num_levels] = capacity;
621
- A alloc(allocator);
622
- auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
623
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_item_buffer(alloc.allocate(1), item_buffer_deleter);
624
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
625
- std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
626
- std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
568
+ optional<T> tmp; // space to deserialize min and max
569
+ optional<T> min_item;
570
+ optional<T> max_item;
627
571
  if (!is_single_item) {
628
- ptr += sd.deserialize(ptr, end_ptr - ptr, min_item_buffer.get(), 1);
629
- // serde call did not throw, repackage with destrtuctor
630
- min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
631
- ptr += sd.deserialize(ptr, end_ptr - ptr, max_item_buffer.get(), 1);
632
- // serde call did not throw, repackage with destrtuctor
633
- max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
572
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
573
+ // serde call did not throw, repackage and cleanup
574
+ min_item.emplace(*tmp);
575
+ (*tmp).~T();
576
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
577
+ // serde call did not throw, repackage and cleanup
578
+ max_item.emplace(*tmp);
579
+ (*tmp).~T();
634
580
  }
581
+ A alloc(allocator);
635
582
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
636
583
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
637
584
  const auto num_items = levels[num_levels] - levels[0];
@@ -642,12 +589,8 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
642
589
  if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
643
590
  const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
644
591
  if (is_single_item) {
645
- new (min_item_buffer.get()) T(items.get()[levels[0]]);
646
- // copy did not throw, repackage with destrtuctor
647
- min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
648
- new (max_item_buffer.get()) T(items.get()[levels[0]]);
649
- // copy did not throw, repackage with destrtuctor
650
- max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
592
+ min_item.emplace(items.get()[levels[0]]);
593
+ max_item.emplace(items.get()[levels[0]]);
651
594
  }
652
595
  return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
653
596
  std::move(min_item), std::move(max_item), is_level_zero_sorted, comparator);
@@ -670,12 +613,12 @@ double kll_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
670
613
  // for deserialization
671
614
  template<typename T, typename C, typename A>
672
615
  kll_sketch<T, C, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
673
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
674
- std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted, const C& comparator):
616
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
617
+ optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator):
675
618
  comparator_(comparator),
676
619
  allocator_(levels.get_allocator()),
677
620
  k_(k),
678
- m_(DEFAULT_M),
621
+ m_(kll_constants::DEFAULT_M),
679
622
  min_k_(min_k),
680
623
  num_levels_(num_levels),
681
624
  is_level_zero_sorted_(is_level_zero_sorted),
@@ -683,8 +626,8 @@ n_(n),
683
626
  levels_(std::move(levels)),
684
627
  items_(items.release()),
685
628
  items_size_(items_size),
686
- min_item_(min_item.release()),
687
- max_item_(max_item.release()),
629
+ min_item_(std::move(min_item)),
630
+ max_item_(std::move(max_item)),
688
631
  sorted_view_(nullptr)
689
632
  {}
690
633
 
@@ -820,7 +763,7 @@ quantiles_sorted_view<T, C, A> kll_sketch<T, C, A>::get_sorted_view() const {
820
763
  for (uint8_t level = 0; level < num_levels_; ++level) {
821
764
  const auto from = items_ + levels_[level];
822
765
  const auto to = items_ + levels_[level + 1]; // exclusive
823
- view.add(from, to, 1 << level);
766
+ view.add(from, to, 1ULL << level);
824
767
  }
825
768
  view.convert_to_cummulative();
826
769
  return view;
@@ -917,8 +860,8 @@ uint32_t kll_sketch<T, C, A>::get_num_retained_above_level_zero() const {
917
860
 
918
861
  template<typename T, typename C, typename A>
919
862
  void kll_sketch<T, C, A>::check_m(uint8_t m) {
920
- if (m != DEFAULT_M) {
921
- throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
863
+ if (m != kll_constants::DEFAULT_M) {
864
+ throw std::invalid_argument("Possible corruption: M must be " + std::to_string(kll_constants::DEFAULT_M)
922
865
  + ": " + std::to_string(m));
923
866
  }
924
867
  }
@@ -1019,20 +962,6 @@ typename kll_sketch<T, C, A>::const_iterator kll_sketch<T, C, A>::end() const {
1019
962
  return kll_sketch<T, C, A>::const_iterator(nullptr, levels_.data(), num_levels_);
1020
963
  }
1021
964
 
1022
- template<typename T, typename C, typename A>
1023
- class kll_sketch<T, C, A>::item_deleter {
1024
- public:
1025
- item_deleter(const A& allocator): allocator_(allocator) {}
1026
- void operator() (T* ptr) {
1027
- if (ptr != nullptr) {
1028
- ptr->~T();
1029
- allocator_.deallocate(ptr, 1);
1030
- }
1031
- }
1032
- private:
1033
- A allocator_;
1034
- };
1035
-
1036
965
  template<typename T, typename C, typename A>
1037
966
  class kll_sketch<T, C, A>::items_deleter {
1038
967
  public:
@@ -20,7 +20,6 @@ add_executable(kll_test)
20
20
  target_link_libraries(kll_test kll common_test_lib)
21
21
 
22
22
  set_target_properties(kll_test PROPERTIES
23
- CXX_STANDARD 11
24
23
  CXX_STANDARD_REQUIRED YES
25
24
  )
26
25
 
@@ -43,3 +42,17 @@ target_sources(kll_test
43
42
  kll_sketch_validation.cpp
44
43
  kolmogorov_smirnov_test.cpp
45
44
  )
45
+
46
+ if (SERDE_COMPAT)
47
+ target_sources(kll_test
48
+ PRIVATE
49
+ kll_sketch_deserialize_from_java_test.cpp
50
+ )
51
+ endif()
52
+
53
+ if (GENERATE)
54
+ target_sources(kll_test
55
+ PRIVATE
56
+ kll_sketch_serialize_for_java.cpp
57
+ )
58
+ endif()
@@ -0,0 +1,103 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <kll_sketch.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ // assume the binary sketches for this test have been generated by datasketches-java code
27
+ // in the subdirectory called "java" in the root directory of this project
28
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
29
+
30
+ TEST_CASE("kll float", "[serde_compat]") {
31
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
32
+ for (const unsigned n: n_arr) {
33
+ std::ifstream is;
34
+ is.exceptions(std::ios::failbit | std::ios::badbit);
35
+ is.open(testBinaryInputPath + "kll_float_n" + std::to_string(n) + "_java.sk", std::ios::binary);
36
+ const auto sketch = kll_sketch<float>::deserialize(is);
37
+ REQUIRE(sketch.is_empty() == (n == 0));
38
+ REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
39
+ REQUIRE(sketch.get_n() == n);
40
+ if (n > 0) {
41
+ REQUIRE(sketch.get_min_item() == 1.0f);
42
+ REQUIRE(sketch.get_max_item() == static_cast<float>(n));
43
+ uint64_t weight = 0;
44
+ for (const auto pair: sketch) {
45
+ REQUIRE(pair.first >= sketch.get_min_item());
46
+ REQUIRE(pair.first <= sketch.get_max_item());
47
+ weight += pair.second;
48
+ }
49
+ REQUIRE(weight == sketch.get_n());
50
+ }
51
+ }
52
+ }
53
+
54
+ TEST_CASE("kll double", "[serde_compat]") {
55
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
56
+ for (const unsigned n: n_arr) {
57
+ std::ifstream is;
58
+ is.exceptions(std::ios::failbit | std::ios::badbit);
59
+ is.open(testBinaryInputPath + "kll_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
60
+ const auto sketch = kll_sketch<double>::deserialize(is);
61
+ REQUIRE(sketch.is_empty() == (n == 0));
62
+ REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
63
+ REQUIRE(sketch.get_n() == n);
64
+ if (n > 0) {
65
+ REQUIRE(sketch.get_min_item() == 1.0);
66
+ REQUIRE(sketch.get_max_item() == static_cast<double>(n));
67
+ uint64_t weight = 0;
68
+ for (const auto pair: sketch) {
69
+ REQUIRE(pair.first >= sketch.get_min_item());
70
+ REQUIRE(pair.first <= sketch.get_max_item());
71
+ weight += pair.second;
72
+ }
73
+ REQUIRE(weight == sketch.get_n());
74
+ }
75
+ }
76
+ }
77
+
78
+ // numbers are padded with leading spaces so that natural order works
79
+ TEST_CASE("kll string", "[serde_compat]") {
80
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
81
+ for (const unsigned n: n_arr) {
82
+ std::ifstream is;
83
+ is.exceptions(std::ios::failbit | std::ios::badbit);
84
+ is.open(testBinaryInputPath + "kll_string_n" + std::to_string(n) + "_java.sk", std::ios::binary);
85
+ const auto sketch = kll_sketch<std::string>::deserialize(is);
86
+ REQUIRE(sketch.is_empty() == (n == 0));
87
+ REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
88
+ REQUIRE(sketch.get_n() == n);
89
+ if (n > 0) {
90
+ REQUIRE(std::stoul(sketch.get_min_item()) == 1);
91
+ REQUIRE(std::stoul(sketch.get_max_item()) == n);
92
+ uint64_t weight = 0;
93
+ for (const auto pair: sketch) {
94
+ REQUIRE(pair.first >= sketch.get_min_item());
95
+ REQUIRE(pair.first <= sketch.get_max_item());
96
+ weight += pair.second;
97
+ }
98
+ REQUIRE(weight == sketch.get_n());
99
+ }
100
+ }
101
+ }
102
+
103
+ } /* namespace datasketches */
@@ -0,0 +1,62 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <kll_sketch.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ TEST_CASE("kll sketch float generate", "[serialize_for_java]") {
27
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
28
+ for (const unsigned n: n_arr) {
29
+ kll_sketch<float> sketch;
30
+ for (unsigned i = 1; i <= n; ++i) sketch.update(i);
31
+ std::ofstream os("kll_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
32
+ sketch.serialize(os);
33
+ }
34
+ }
35
+
36
+ TEST_CASE("kll sketch double generate", "[serialize_for_java]") {
37
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
38
+ for (const unsigned n: n_arr) {
39
+ kll_sketch<double> sketch;
40
+ for (unsigned i = 1; i <= n; ++i) sketch.update(i);
41
+ std::ofstream os("kll_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
42
+ sketch.serialize(os);
43
+ }
44
+ }
45
+
46
+ struct compare_as_number {
47
+ bool operator()(const std::string& a, const std::string& b) const {
48
+ return std::stoi(a) < std::stoi(b);
49
+ }
50
+ };
51
+
52
+ TEST_CASE("kll sketch string generate", "[serialize_for_java]") {
53
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
54
+ for (const unsigned n: n_arr) {
55
+ kll_sketch<std::string, compare_as_number> sketch;
56
+ for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
57
+ std::ofstream os("kll_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
58
+ sketch.serialize(os);
59
+ }
60
+ }
61
+
62
+ } /* namespace datasketches */
@@ -49,9 +49,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
49
49
  test_allocator_total_bytes = 0;
50
50
 
51
51
  SECTION("k limits") {
52
- kll_float_sketch sketch1(kll_float_sketch::MIN_K, std::less<float>(), 0); // this should work
53
- kll_float_sketch sketch2(kll_float_sketch::MAX_K, std::less<float>(), 0); // this should work
54
- REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1, std::less<float>(), 0), std::invalid_argument);
52
+ kll_float_sketch sketch1(kll_constants::MIN_K, std::less<float>(), 0); // this should work
53
+ kll_float_sketch sketch2(kll_constants::MAX_K, std::less<float>(), 0); // this should work
54
+ REQUIRE_THROWS_AS(new kll_float_sketch(kll_constants::MIN_K - 1, std::less<float>(), 0), std::invalid_argument);
55
55
  // MAX_K + 1 makes no sense because k is uint16_t
56
56
  //std::cout << "sizeof(kll_sketch<float>)=" << sizeof(kll_sketch<float>) << "\n";
57
57
  //std::cout << "sizeof(kll_sketch<double>)=" << sizeof(kll_sketch<double>) << "\n";
@@ -67,8 +67,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
67
67
  REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
68
68
  REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
69
69
  REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
70
- const double ranks[3] {0, 0.5, 1};
71
- REQUIRE_THROWS_AS(sketch.get_quantiles(ranks, 3), std::runtime_error);
72
70
  const float split_points[1] {0};
73
71
  REQUIRE_THROWS_AS(sketch.get_PMF(split_points, 1), std::runtime_error);
74
72
  REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::runtime_error);
@@ -99,12 +97,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
99
97
  REQUIRE(sketch.get_min_item() == 1.0);
100
98
  REQUIRE(sketch.get_max_item() == 1.0);
101
99
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
102
- const double ranks[3] {0, 0.5, 1};
103
- auto quantiles = sketch.get_quantiles(ranks, 3);
104
- REQUIRE(quantiles.size() == 3);
105
- REQUIRE(quantiles[0] == 1.0);
106
- REQUIRE(quantiles[1] == 1.0);
107
- REQUIRE(quantiles[2] == 1.0);
108
100
 
109
101
  int count = 0;
110
102
  for (auto pair: sketch) {
@@ -144,20 +136,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
144
136
  REQUIRE(sketch.get_max_item() == n);
145
137
  REQUIRE(sketch.get_quantile(1) == n);
146
138
 
147
- const double ranks[3] {0, 0.5, 1};
148
- auto quantiles = sketch.get_quantiles(ranks, 3);
149
- REQUIRE(quantiles.size() == 3);
150
- REQUIRE(quantiles[0] == 1);
151
- REQUIRE(quantiles[1] == n / 2);
152
- REQUIRE(quantiles[2] == n);
153
-
154
- // alternative method must produce the same result
155
- auto quantiles2 = sketch.get_quantiles(3);
156
- REQUIRE(quantiles2.size() == 3);
157
- REQUIRE(quantiles[0] == quantiles2[0]);
158
- REQUIRE(quantiles[1] == quantiles2[1]);
159
- REQUIRE(quantiles[2] == quantiles2[2]);
160
-
161
139
  for (uint32_t i = 1; i <= n; i++) {
162
140
  const double true_rank_inclusive = static_cast<double>(i) / n;
163
141
  REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank_inclusive);
@@ -264,19 +242,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
264
242
  }
265
243
  }
266
244
 
267
- SECTION("deserialize from java") {
268
- std::ifstream is;
269
- is.exceptions(std::ios::failbit | std::ios::badbit);
270
- is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
271
- auto sketch = kll_float_sketch::deserialize(is, serde<float>(), std::less<float>(), 0);
272
- REQUIRE_FALSE(sketch.is_empty());
273
- REQUIRE(sketch.is_estimation_mode());
274
- REQUIRE(sketch.get_n() == 1000000);
275
- REQUIRE(sketch.get_num_retained() == 614);
276
- REQUIRE(sketch.get_min_item() == 0.0);
277
- REQUIRE(sketch.get_max_item() == 999999.0);
278
- }
279
-
280
245
  SECTION("stream serialize deserialize empty") {
281
246
  kll_float_sketch sketch(200, std::less<float>(), 0);
282
247
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);