datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -0,0 +1,535 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _EBPPS_SAMPLE_IMPL_HPP_
21
+ #define _EBPPS_SAMPLE_IMPL_HPP_
22
+
23
+ #include "common_defs.hpp"
24
+ #include "conditional_forward.hpp"
25
+ #include "ebpps_sample.hpp"
26
+ #include "serde.hpp"
27
+
28
+ #include <cmath>
29
+ #include <string>
30
+ #include <sstream>
31
+
32
+ namespace datasketches {
33
+
34
+ template<typename T, typename A>
35
+ ebpps_sample<T,A>::ebpps_sample(uint32_t reserved_size, const A& allocator) :
36
+ allocator_(allocator),
37
+ c_(0.0),
38
+ partial_item_(),
39
+ data_(allocator)
40
+ {
41
+ data_.reserve(reserved_size);
42
+ }
43
+
44
+ template<typename T, typename A>
45
+ template<typename TT>
46
+ ebpps_sample<T,A>::ebpps_sample(TT&& item, double theta, const A& allocator) :
47
+ allocator_(allocator),
48
+ c_(theta),
49
+ partial_item_(),
50
+ data_(allocator)
51
+ {
52
+ if (theta == 1.0) {
53
+ data_.reserve(1);
54
+ data_.emplace_back(std::forward<TT>(item));
55
+ } else {
56
+ partial_item_.emplace(std::forward<TT>(item));
57
+ }
58
+ }
59
+
60
+ template<typename T, typename A>
61
+ ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator) :
62
+ allocator_(allocator),
63
+ c_(c),
64
+ partial_item_(partial_item),
65
+ data_(data, allocator)
66
+ {}
67
+
68
+ template<typename T, typename A>
69
+ auto ebpps_sample<T,A>::get_sample() const -> result_type {
70
+ double unused;
71
+ const double c_frac = std::modf(c_, &unused);
72
+ const bool include_partial = next_double() < c_frac;
73
+ const uint32_t result_size = static_cast<uint32_t>(data_.size()) + (include_partial ? 1 : 0);
74
+
75
+ result_type result;
76
+ result.reserve(result_size);
77
+ std::copy(data_.begin(), data_.end(), std::back_inserter(result));
78
+ if (include_partial)
79
+ result.emplace_back(static_cast<const T&>(*partial_item_));
80
+
81
+ return result;
82
+ }
83
+
84
+ template<typename T, typename A>
85
+ void ebpps_sample<T,A>::downsample(double theta) {
86
+ if (theta >= 1.0) return;
87
+
88
+ const double new_c = theta * c_;
89
+ double new_c_int;
90
+ const double new_c_frac = std::modf(new_c, &new_c_int);
91
+ double c_int;
92
+ const double c_frac = std::modf(c_, &c_int);
93
+
94
+ if (new_c_int == 0.0) {
95
+ // no full items retained
96
+ if (next_double() > (c_frac / c_)) {
97
+ swap_with_partial();
98
+ }
99
+ data_.clear();
100
+ } else if (new_c_int == c_int) {
101
+ // no items deleted
102
+ if (next_double() > (1 - theta * c_frac)/(1 - new_c_frac)) {
103
+ swap_with_partial();
104
+ }
105
+ } else {
106
+ if (next_double() < theta * c_frac) {
107
+ // subsample data in random order; last item is partial
108
+ // create sample size new_c_int then swap_with_partial()
109
+ subsample(static_cast<uint32_t>(new_c_int));
110
+ swap_with_partial();
111
+ } else {
112
+ // create sample size new_c_int + 1 then move_one_to_partial)
113
+ subsample(static_cast<uint32_t>(new_c_int) + 1);
114
+ move_one_to_partial();
115
+ }
116
+ }
117
+
118
+ if (new_c == new_c_int)
119
+ partial_item_.reset();
120
+
121
+ c_ = new_c;
122
+ }
123
+
124
+ template<typename T, typename A>
125
+ template<typename FwdSample>
126
+ void ebpps_sample<T,A>::merge(FwdSample&& other) {
127
+ double c_int;
128
+ const double c_frac = std::modf(c_, &c_int);
129
+
130
+ double unused;
131
+ const double other_c_frac = std::modf(other.c_, &unused);
132
+
133
+ // update c_ here but do NOT recompute fractional part yet
134
+ c_ += other.c_;
135
+
136
+ for (uint32_t i = 0; i < other.data_.size(); ++i)
137
+ data_.emplace_back(conditional_forward<FwdSample>(other.data_[i]));
138
+
139
+ // This modifies the original algorithm slightly due to numeric
140
+ // precision issues. Specifically, the test if c_frac + other_c_frac == 1.0
141
+ // happens before tests for < 1.0 or > 1.0 and can also be triggered
142
+ // if c_ == floor(c_) (the updated value of c_, not the input).
143
+ //
144
+ // We can still run into issues where c_frac + other_c_frac == epsilon
145
+ // and the first case would have ideally triggered. As a result, we must
146
+ // check if the partial item exists before adding to the data_ vector.
147
+
148
+ if (c_frac == 0.0 && other_c_frac == 0.0) {
149
+ partial_item_.reset();
150
+ } else if (c_frac + other_c_frac == 1.0 || c_ == std::floor(c_)) {
151
+ if (next_double() <= c_frac) {
152
+ if (partial_item_)
153
+ data_.emplace_back(std::move(*partial_item_));
154
+ } else {
155
+ if (other.partial_item_)
156
+ data_.emplace_back(conditional_forward<FwdSample>(*other.partial_item_));
157
+ }
158
+ partial_item_.reset();
159
+ } else if (c_frac + other_c_frac < 1.0) {
160
+ if (next_double() > c_frac / (c_frac + other_c_frac)) {
161
+ set_partial(conditional_forward<FwdSample>(*other.partial_item_));
162
+ }
163
+ } else { // c_frac + other_c_frac > 1
164
+ if (next_double() <= (1 - c_frac) / ((1 - c_frac) + (1 - other_c_frac))) {
165
+ data_.emplace_back(conditional_forward<FwdSample>(*other.partial_item_));
166
+ } else {
167
+ data_.emplace_back(std::move(*partial_item_));
168
+ partial_item_.reset();
169
+ set_partial(conditional_forward<FwdSample>(*other.partial_item_));
170
+ }
171
+ }
172
+ }
173
+
174
+ template<typename T, typename A>
175
+ string<A> ebpps_sample<T ,A>::to_string() const {
176
+ std::ostringstream oss;
177
+ oss << " sample:" << std::endl;
178
+ uint32_t idx = 0;
179
+ for (const T& item : data_)
180
+ oss << "\t" << idx++ << ":\t" << item << std::endl;
181
+ oss << " partial: " << (bool(partial_item_) ? (*partial_item_) : "NULL") << std::endl;
182
+
183
+ return oss.str();
184
+ }
185
+
186
+ template<typename T, typename A>
187
+ void ebpps_sample<T,A>::subsample(uint32_t num_samples) {
188
+ // we can perform a Fisher-Yates style shuffle, stopping after
189
+ // num_samples points since subsequent swaps would only be
190
+ // between items after num_samples. This is valid since a
191
+ // point from anywhere in the initial array would be eligible
192
+ // to end up in the final subsample.
193
+
194
+ if (num_samples == data_.size()) return;
195
+
196
+ auto erase_start = data_.begin();
197
+ uint32_t data_len = static_cast<uint32_t>(data_.size());
198
+ for (uint32_t i = 0; i < num_samples; ++i, ++erase_start) {
199
+ uint32_t j = i + random_idx(data_len - i);
200
+ std::swap(data_[i], data_[j]);
201
+ }
202
+
203
+ data_.erase(erase_start, data_.end());
204
+ }
205
+
206
+ template<typename T, typename A>
207
+ template<typename FwdItem>
208
+ void ebpps_sample<T,A>::set_partial(FwdItem&& item) {
209
+ if (partial_item_)
210
+ *partial_item_ = conditional_forward<FwdItem>(item);
211
+ else
212
+ partial_item_.emplace(conditional_forward<FwdItem>(item));
213
+ }
214
+
215
+ template<typename T, typename A>
216
+ void ebpps_sample<T,A>::move_one_to_partial() {
217
+ const size_t idx = random_idx(static_cast<uint32_t>(data_.size()));
218
+ // swap selected item to end so we can delete it easily
219
+ const size_t last_idx = data_.size() - 1;
220
+ if (idx != last_idx) {
221
+ std::swap(data_[idx], data_[last_idx]);
222
+ }
223
+
224
+ set_partial(std::move(data_[last_idx]));
225
+
226
+ data_.pop_back();
227
+ }
228
+
229
+ template<typename T, typename A>
230
+ void ebpps_sample<T,A>::swap_with_partial() {
231
+ if (partial_item_) {
232
+ const size_t idx = random_idx(static_cast<uint32_t>(data_.size()));
233
+ std::swap(data_[idx], *partial_item_);
234
+ } else {
235
+ move_one_to_partial();
236
+ }
237
+ }
238
+
239
+ template<typename T, typename A>
240
+ void ebpps_sample<T,A>::reset() {
241
+ c_ = 0.0;
242
+ partial_item_.reset();
243
+ data_.clear();
244
+ }
245
+
246
+ template<typename T, typename A>
247
+ double ebpps_sample<T,A>::get_c() const {
248
+ return c_;
249
+ }
250
+
251
+ template<typename T, typename A>
252
+ auto ebpps_sample<T,A>::get_full_items() const -> result_type {
253
+ return result_type(data_);
254
+ }
255
+
256
+ template<typename T, typename A>
257
+ bool ebpps_sample<T,A>::has_partial_item() const {
258
+ return bool(partial_item_);
259
+ }
260
+
261
+ template<typename T, typename A>
262
+ T ebpps_sample<T,A>::get_partial_item() const {
263
+ if (!partial_item_) throw std::runtime_error("Call to get_partial_item() when no partial item exists");
264
+ return *partial_item_;
265
+ }
266
+
267
+ template<typename T, typename A>
268
+ uint32_t ebpps_sample<T,A>::random_idx(uint32_t max) {
269
+ static std::uniform_int_distribution<uint32_t> dist;
270
+ return dist(random_utils::rand, std::uniform_int_distribution<uint32_t>::param_type(0, max - 1));
271
+ }
272
+
273
+ template<typename T, typename A>
274
+ double ebpps_sample<T,A>::next_double() {
275
+ return random_utils::next_double(random_utils::rand);
276
+ }
277
+
278
+ template<typename T, typename A>
279
+ uint32_t ebpps_sample<T,A>::get_num_retained_items() const {
280
+ return static_cast<uint32_t>(data_.size() + (partial_item_ ? 1 : 0));
281
+ }
282
+
283
+ // implementation for fixed-size arithmetic types (integral and floating point)
284
+ template<typename T, typename A>
285
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
286
+ size_t ebpps_sample<T, A>::get_serialized_size_bytes(const SerDe&) const {
287
+ if (c_ == 0.0)
288
+ return 0;
289
+ else
290
+ return sizeof(c_) + (data_.size() + (partial_item_ ? 1 : 0)) * sizeof(T);
291
+ }
292
+
293
+ // implementation for all other types
294
+ template<typename T, typename A>
295
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
296
+ size_t ebpps_sample<T, A>::get_serialized_size_bytes(const SerDe& sd) const {
297
+ if (c_ == 0.0) return 0;
298
+
299
+ size_t num_bytes = sizeof(c_);
300
+ for (auto it : data_)
301
+ num_bytes += sd.size_of_item(it);
302
+
303
+ if (partial_item_)
304
+ num_bytes += sd.size_of_item(*partial_item_);
305
+
306
+ return num_bytes;
307
+ }
308
+
309
+ template<typename T, typename A>
310
+ template<typename SerDe>
311
+ size_t ebpps_sample<T,A>::serialize(uint8_t* ptr, const uint8_t* end_ptr, const SerDe& sd) const {
312
+ uint8_t* st_ptr = ptr;
313
+
314
+ ensure_minimum_memory(end_ptr - ptr, sizeof(c_));
315
+ ptr += copy_to_mem(c_, ptr);
316
+
317
+ ptr += sd.serialize(ptr, end_ptr - ptr, data_.data(), static_cast<unsigned>(data_.size()));
318
+
319
+ if (partial_item_) {
320
+ ptr += sd.serialize(ptr, end_ptr - ptr, &*partial_item_, 1);
321
+ }
322
+
323
+ return ptr - st_ptr;
324
+ }
325
+
326
+ template<typename T, typename A>
327
+ template<typename SerDe>
328
+ void ebpps_sample<T,A>::serialize(std::ostream& os, const SerDe& sd) const {
329
+ write(os, c_);
330
+
331
+ sd.serialize(os, data_.data(), static_cast<unsigned>(data_.size()));
332
+
333
+ if (partial_item_)
334
+ sd.serialize(os, &*partial_item_, 1);
335
+
336
+ if (!os.good()) throw std::runtime_error("error writing to std::ostream");
337
+ }
338
+
339
+ template<typename T, typename A>
340
+ template<typename SerDe>
341
+ std::pair<ebpps_sample<T, A>, size_t> ebpps_sample<T, A>::deserialize(const uint8_t* ptr, size_t size, const SerDe& sd, const A& allocator) {
342
+ const uint8_t* st_ptr = ptr;
343
+ const uint8_t* end_ptr = ptr + size;
344
+
345
+ ensure_minimum_memory(size, sizeof(double));
346
+ double c;
347
+ ptr += copy_from_mem(ptr, c);
348
+ if (c < 0.0)
349
+ throw std::runtime_error("sketch image has C < 0.0 during deserializaiton");
350
+
351
+ double c_int;
352
+ const double c_frac = std::modf(c, &c_int);
353
+ const bool has_partial = c_frac != 0.0;
354
+
355
+ const uint32_t num_full_items = static_cast<uint32_t>(c_int);
356
+ A alloc(allocator);
357
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_full_items), items_deleter(allocator, false, num_full_items));
358
+ ptr += sd.deserialize(ptr, end_ptr - ptr, items.get(), num_full_items);
359
+ // serde did not throw, enable destructors
360
+ items.get_deleter().set_destroy(true);
361
+ std::vector<T, A> data(std::make_move_iterator(items.get()),
362
+ std::make_move_iterator(items.get() + num_full_items),
363
+ allocator);
364
+
365
+ optional<T> partial_item;
366
+ if (has_partial) {
367
+ optional<T> tmp; // space to deserialize
368
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
369
+ // serde did not throw so place item and clean up
370
+ partial_item.emplace(*tmp);
371
+ (*tmp).~T();
372
+ }
373
+
374
+ return std::pair<ebpps_sample<T,A>, size_t>(
375
+ ebpps_sample<T,A>(std::move(data), std::move(partial_item), c, allocator),
376
+ ptr - st_ptr);
377
+ }
378
+
379
+ template<typename T, typename A>
380
+ template<typename SerDe>
381
+ ebpps_sample<T, A> ebpps_sample<T, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
382
+ const double c = read<double>(is);
383
+ if (c < 0.0)
384
+ throw std::runtime_error("sketch image has C < 0.0 during deserializaiton");
385
+
386
+ double c_int;
387
+ const double c_frac = std::modf(c, &c_int);
388
+ const bool has_partial = c_frac != 0.0;
389
+
390
+ const uint32_t num_full_items = static_cast<uint32_t>(c_int);
391
+ A alloc(allocator);
392
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_full_items), items_deleter(allocator, false, num_full_items));
393
+ sd.deserialize(is, items.get(), num_full_items);
394
+ // serde did not throw, enable destructors
395
+ items.get_deleter().set_destroy(true);
396
+ std::vector<T, A> data(std::make_move_iterator(items.get()),
397
+ std::make_move_iterator(items.get() + num_full_items),
398
+ allocator);
399
+
400
+ optional<T> partial_item;
401
+ if (has_partial) {
402
+ optional<T> tmp; // space to deserialize
403
+ sd.deserialize(is, &*tmp, 1);
404
+ // serde did not throw so place item and clean up
405
+ partial_item.emplace(*tmp);
406
+ (*tmp).~T();
407
+ }
408
+
409
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
410
+
411
+ return ebpps_sample<T,A>(std::move(data), std::move(partial_item), c, allocator);
412
+ }
413
+
414
+
415
+ template<typename T, typename A>
416
+ typename ebpps_sample<T, A>::const_iterator ebpps_sample<T, A>::begin() const {
417
+ return const_iterator(this);
418
+ }
419
+
420
+ template<typename T, typename A>
421
+ typename ebpps_sample<T, A>::const_iterator ebpps_sample<T, A>::end() const {
422
+ return const_iterator(nullptr);
423
+ }
424
+
425
+
426
+ // -------- ebpps_sample::const_iterator implementation ---------
427
+
428
+ template<typename T, typename A>
429
+ ebpps_sample<T, A>::const_iterator::const_iterator(const ebpps_sample* sample) :
430
+ sample_(sample),
431
+ idx_(0),
432
+ use_partial_(false)
433
+ {
434
+ if (sample == nullptr)
435
+ return;
436
+
437
+ // determine in advance if we use the partial item
438
+ double c_int;
439
+ const double c_frac = std::modf(sample_->get_c(), &c_int);
440
+ use_partial_ = sample->next_double() < c_frac;
441
+
442
+ // sample with no items
443
+ if (sample_->data_.size() == 0 && use_partial_) {
444
+ idx_ = PARTIAL_IDX;
445
+ }
446
+
447
+ if (sample_->c_== 0.0 || (sample_->data_.size() == 0 && !sample_->has_partial_item())) { sample_ = nullptr; }
448
+ }
449
+
450
+ template<typename T, typename A>
451
+ ebpps_sample<T, A>::const_iterator::const_iterator(const const_iterator& other) :
452
+ sample_(other.sample_),
453
+ idx_(other.idx_),
454
+ use_partial_(other.use_partial_)
455
+ {}
456
+
457
+ template<typename T, typename A>
458
+ typename ebpps_sample<T, A>::const_iterator& ebpps_sample<T, A>::const_iterator::operator++() {
459
+ if (sample_ == nullptr)
460
+ return *this;
461
+ else if (idx_ == PARTIAL_IDX) {
462
+ idx_ = sample_->data_.size();
463
+ sample_ = nullptr;
464
+ return * this;
465
+ }
466
+
467
+ ++idx_;
468
+
469
+ if (idx_ == sample_->data_.size()) {
470
+ if (use_partial_)
471
+ idx_ = PARTIAL_IDX;
472
+ else
473
+ sample_ = nullptr;
474
+ }
475
+
476
+ return *this;
477
+ }
478
+
479
+ template<typename T, typename A>
480
+ typename ebpps_sample<T, A>::const_iterator& ebpps_sample<T, A>::const_iterator::operator++(int) {
481
+ const_iterator tmp(*this);
482
+ operator++();
483
+ return tmp;
484
+ }
485
+
486
+ template<typename T, typename A>
487
+ bool ebpps_sample<T, A>::const_iterator::operator==(const const_iterator& other) const {
488
+ if (sample_ != other.sample_) return false;
489
+ if (sample_ == nullptr) return true; // end (and we know other.sample_ is also null)
490
+ return idx_ == other.idx_;
491
+ }
492
+
493
+ template<typename T, typename A>
494
+ bool ebpps_sample<T, A>::const_iterator::operator!=(const const_iterator& other) const {
495
+ return !operator==(other);
496
+ }
497
+
498
+ template<typename T, typename A>
499
+ auto ebpps_sample<T, A>::const_iterator::operator*() const -> reference {
500
+ if (idx_ == PARTIAL_IDX)
501
+ return *(sample_->partial_item_);
502
+ else
503
+ return sample_->data_[idx_];
504
+ }
505
+
506
+ template<typename T, typename A>
507
+ auto ebpps_sample<T, A>::const_iterator::operator->() const -> pointer {
508
+ return **this;
509
+ }
510
+
511
+ template<typename T, typename A>
512
+ class ebpps_sample<T, A>::items_deleter {
513
+ public:
514
+ items_deleter(const A& allocator, bool destroy, size_t num): allocator_(allocator), destroy_(destroy), num_(num) {}
515
+ void operator() (T* ptr) {
516
+ if (ptr != nullptr) {
517
+ if (destroy_) {
518
+ for (size_t i = 0; i < num_; ++i) {
519
+ ptr[i].~T();
520
+ }
521
+ }
522
+ allocator_.deallocate(ptr, num_);
523
+ }
524
+ }
525
+ void set_destroy(bool destroy) { destroy_ = destroy; }
526
+ private:
527
+ A allocator_;
528
+ bool destroy_;
529
+ size_t num_;
530
+ };
531
+
532
+
533
+ } // namespace datasketches
534
+
535
+ #endif // _EBPPS_SAMPLE_IMPL_HPP_