datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -0,0 +1,69 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <hll.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ // assume the binary sketches for this test have been generated by datasketches-java code
27
+ // in the subdirectory called "java" in the root directory of this project
28
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
29
+
30
+ TEST_CASE("hll4 sketch", "[serde_compat]") {
31
+ const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
32
+ for (const unsigned n: n_arr) {
33
+ std::ifstream is;
34
+ is.exceptions(std::ios::failbit | std::ios::badbit);
35
+ is.open(testBinaryInputPath + "hll4_n" + std::to_string(n) + "_java.sk", std::ios::binary);
36
+ const auto sketch = hll_sketch::deserialize(is);
37
+ REQUIRE(sketch.get_lg_config_k() == 12);
38
+ REQUIRE(sketch.is_empty() == (n == 0));
39
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
40
+ }
41
+ }
42
+
43
+ TEST_CASE("hll6 sketch", "[serde_compat]") {
44
+ const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
45
+ for (const unsigned n: n_arr) {
46
+ std::ifstream is;
47
+ is.exceptions(std::ios::failbit | std::ios::badbit);
48
+ is.open(testBinaryInputPath + "hll6_n" + std::to_string(n) + "_java.sk", std::ios::binary);
49
+ const auto sketch = hll_sketch::deserialize(is);
50
+ REQUIRE(sketch.get_lg_config_k() == 12);
51
+ REQUIRE(sketch.is_empty() == (n == 0));
52
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
53
+ }
54
+ }
55
+
56
+ TEST_CASE("hll8 sketch", "[serde_compat]") {
57
+ const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
58
+ for (const unsigned n: n_arr) {
59
+ std::ifstream is;
60
+ is.exceptions(std::ios::failbit | std::ios::badbit);
61
+ is.open(testBinaryInputPath + "hll8_n" + std::to_string(n) + "_java.sk", std::ios::binary);
62
+ const auto sketch = hll_sketch::deserialize(is);
63
+ REQUIRE(sketch.get_lg_config_k() == 12);
64
+ REQUIRE(sketch.is_empty() == (n == 0));
65
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
66
+ }
67
+ }
68
+
69
+ } /* namespace datasketches */
@@ -0,0 +1,52 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <hll.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ TEST_CASE("hll sketch generate", "[serialize_for_java]") {
27
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
28
+ for (const unsigned n: n_arr) {
29
+ hll_sketch hll4(12, HLL_4);
30
+ hll_sketch hll6(12, HLL_6);
31
+ hll_sketch hll8(12, HLL_8);
32
+ for (unsigned i = 0; i < n; ++i) {
33
+ hll4.update(i);
34
+ hll6.update(i);
35
+ hll8.update(i);
36
+ }
37
+ {
38
+ std::ofstream os("hll4_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
39
+ hll4.serialize_compact(os);
40
+ }
41
+ {
42
+ std::ofstream os("hll6_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
43
+ hll6.serialize_compact(os);
44
+ }
45
+ {
46
+ std::ofstream os("hll8_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
47
+ hll8.serialize_compact(os);
48
+ }
49
+ }
50
+ }
51
+
52
+ } /* namespace datasketches */
@@ -99,7 +99,7 @@ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
99
99
  #ifdef KLL_VALIDATION
100
100
  const uint32_t offset = deterministic_offset();
101
101
  #else
102
- const uint32_t offset = random_bit();
102
+ const uint32_t offset = random_utils::random_bit();
103
103
  #endif
104
104
  uint32_t j = start + offset;
105
105
  for (uint32_t i = start; i < (start + half_length); i++) {
@@ -115,7 +115,7 @@ void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
115
115
  #ifdef KLL_VALIDATION
116
116
  const uint32_t offset = deterministic_offset();
117
117
  #else
118
- const uint32_t offset = random_bit();
118
+ const uint32_t offset = random_utils::random_bit();
119
119
  #endif
120
120
  uint32_t j = (start + length) - 1 - offset;
121
121
  for (uint32_t i = (start + length) - 1; i >= (start + half_length); i--) {
@@ -26,10 +26,22 @@
26
26
  #include "common_defs.hpp"
27
27
  #include "serde.hpp"
28
28
  #include "quantiles_sorted_view.hpp"
29
+ #include "optional.hpp"
29
30
 
30
31
  namespace datasketches {
31
32
 
32
- /*
33
+ /// KLL sketch constants
34
+ namespace kll_constants {
35
+ /// default value of parameter K
36
+ const uint16_t DEFAULT_K = 200;
37
+ const uint8_t DEFAULT_M = 8;
38
+ /// min value of parameter K
39
+ const uint16_t MIN_K = DEFAULT_M;
40
+ /// max value of parameter K
41
+ const uint16_t MAX_K = (1 << 16) - 1;
42
+ }
43
+
44
+ /**
33
45
  * Implementation of a very compact quantiles sketch with lazy compaction scheme
34
46
  * and nearly optimal accuracy per retained item.
35
47
  * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
@@ -146,10 +158,6 @@ namespace datasketches {
146
158
  * author Lee Rhodes
147
159
  */
148
160
 
149
- namespace kll_constants {
150
- const uint16_t DEFAULT_K = 200;
151
- }
152
-
153
161
  template <
154
162
  typename T,
155
163
  typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
@@ -159,17 +167,51 @@ class kll_sketch {
159
167
  public:
160
168
  using value_type = T;
161
169
  using comparator = C;
170
+ using allocator_type = A;
162
171
  using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
172
+ using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
163
173
 
164
- static const uint8_t DEFAULT_M = 8;
165
- static const uint16_t MIN_K = DEFAULT_M;
166
- static const uint16_t MAX_K = (1 << 16) - 1;
174
+ /**
175
+ * Quantile return type.
176
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
177
+ */
178
+ using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
167
179
 
180
+ /**
181
+ * Constructor
182
+ * @param k affects the size of the sketch and its estimation error
183
+ * @param comparator strict weak ordering function (see C++ named requirements: Compare)
184
+ * @param allocator used by this sketch to allocate memory
185
+ */
168
186
  explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
187
+
188
+ /**
189
+ * Copy constructor
190
+ * @param other sketch to be copied
191
+ */
169
192
  kll_sketch(const kll_sketch& other);
193
+
194
+ /**
195
+ * Move constructor
196
+ * @param other sketch to be moved
197
+ */
170
198
  kll_sketch(kll_sketch&& other) noexcept;
199
+
200
+
171
201
  ~kll_sketch();
202
+
203
+ /**
204
+ * Copy assignment
205
+ * @param other sketch to be copied
206
+ * @return reference to this sketch
207
+ */
172
208
  kll_sketch& operator=(const kll_sketch& other);
209
+
210
+ /**
211
+ * Move assignment
212
+ * @param other sketch to be moved
213
+ * @return reference to this sketch
214
+ */
173
215
  kll_sketch& operator=(kll_sketch&& other);
174
216
 
175
217
  /*
@@ -262,44 +304,8 @@ class kll_sketch {
262
304
  *
263
305
  * @return approximate quantile associated with the given rank
264
306
  */
265
- using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
266
307
  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
267
308
 
268
- /**
269
- * This returns an array that could have been generated by using get_quantile() for each
270
- * rank separately.
271
- *
272
- * <p>If the sketch is empty this throws std::runtime_error.
273
- *
274
- * @param ranks given array of ranks in the hypothetical sorted stream.
275
- * These ranks must be in the interval [0.0, 1.0].
276
- * @param size the number of ranks in the array
277
- * @param inclusive if true, the given ranks are considered inclusive (include weights of items)
278
- *
279
- * @return array of approximate quantiles corresponding to the given ranks in the same order.
280
- *
281
- * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
282
- */
283
- std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
284
-
285
- /**
286
- * This is a multiple-query version of get_quantile() that allows the caller to
287
- * specify the number of evenly-spaced ranks.
288
- *
289
- * <p>If the sketch is empty this throws std::runtime_error.
290
- *
291
- * @param num an integer that specifies the number of evenly-spaced ranks.
292
- * This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
293
- * A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
294
- * 0.5 (median) and 1, etc.
295
- * @param inclusive if true, the ranks are considered inclusive (include weights of items)
296
- *
297
- * @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
298
- *
299
- * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
300
- */
301
- std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
302
-
303
309
  /**
304
310
  * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
305
311
  *
@@ -339,7 +345,6 @@ class kll_sketch {
339
345
  * @return an array of m+1 doubles each of which is an approximation
340
346
  * to the fraction of the input stream items (the mass) that fall into one of those intervals.
341
347
  */
342
- using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
343
348
  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
344
349
 
345
350
  /**
@@ -489,9 +494,26 @@ class kll_sketch {
489
494
  string<A> to_string(bool print_levels = false, bool print_items = false) const;
490
495
 
491
496
  class const_iterator;
497
+
498
+ /**
499
+ * Iterator pointing to the first item in the sketch.
500
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
501
+ * @return iterator pointing to the first item in the sketch
502
+ */
492
503
  const_iterator begin() const;
504
+
505
+ /**
506
+ * Iterator pointing to the past-the-end item in the sketch.
507
+ * The past-the-end item is the hypothetical item that would follow the last item.
508
+ * It does not point to any item, and must not be dereferenced or incremented.
509
+ * @return iterator pointing to the past-the-end item in the sketch
510
+ */
493
511
  const_iterator end() const;
494
512
 
513
+ /**
514
+ * Gets the sorted view of this sketch
515
+ * @return the sorted view of this sketch
516
+ */
495
517
  quantiles_sorted_view<T, C, A> get_sorted_view() const;
496
518
 
497
519
  private:
@@ -529,16 +551,15 @@ class kll_sketch {
529
551
  vector_u32 levels_;
530
552
  T* items_;
531
553
  uint32_t items_size_;
532
- T* min_item_;
533
- T* max_item_;
554
+ optional<T> min_item_;
555
+ optional<T> max_item_;
534
556
  mutable quantiles_sorted_view<T, C, A>* sorted_view_;
535
557
 
536
558
  // for deserialization
537
- class item_deleter;
538
559
  class items_deleter;
539
560
  kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
540
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
541
- std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted, const C& comparator);
561
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
562
+ optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator);
542
563
 
543
564
  // common update code
544
565
  inline void update_min_max(const T& item);