datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -205,7 +205,6 @@ APPENDIX A: How to apply the Apache License to your work.
205
205
  -------------------------------------------------------------
206
206
 
207
207
 
208
-
209
208
  APPENDIX B: Additional licenses relevant to this product.
210
209
 
211
210
  This product includes a number of source files with code that has been
@@ -215,43 +214,6 @@ APPENDIX B: Additional licenses relevant to this product.
215
214
  conditions of the following licenses.
216
215
 
217
216
 
218
-
219
- =============================================================
220
- MIT License
221
- =============================================================
222
- Original source code:
223
- https://github.com/benjaminjack/python_cpp_example
224
- -------------------------------------------------------------
225
- Copyright (c) 2017 Benjamin R. Jack
226
-
227
- MIT License (https://opensource.org/licenses/MIT):
228
-
229
- Permission is hereby granted, free of charge, to any person
230
- obtaining a copy of this software and associated documentation
231
- files (the "Software"), to deal in the Software without restriction,
232
- including without limitation the rights to use, copy, modify, merge,
233
- publish, distribute, sublicense, and/or sell copies of the Software,
234
- and to permit persons to whom the Software is furnished to do so,
235
- subject to the following conditions:
236
-
237
- The above copyright notice and this permission notice shall be
238
- included in all copies or substantial portions of the Software.
239
-
240
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
241
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
242
- OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
243
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
244
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
245
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
246
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
247
- SOFTWARE.
248
- -------------------------------------------------------------
249
- Code locations:
250
- * https://github.com/apache/datasketches-cpp/blob/master/setup.py
251
- that is adapted from the above.
252
-
253
-
254
-
255
217
  =============================================================
256
218
  Boost License (https://www.boost.org/LICENSE_1_0.txt)
257
219
  =============================================================
@@ -287,44 +249,6 @@ APPENDIX B: Additional licenses relevant to this product.
287
249
  of CMake configuration if configured to build tests.
288
250
 
289
251
 
290
- =============================================================
291
- BSD License
292
- =============================================================
293
- Original source code:
294
- https://github.com/pybind/pybind11/blob/master/LICENSE
295
-
296
- Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
297
-
298
- Redistribution and use in source and binary forms, with or without
299
- modification, are permitted provided that the following conditions are met:
300
-
301
- 1. Redistributions of source code must retain the above copyright notice, this
302
- list of conditions and the following disclaimer.
303
-
304
- 2. Redistributions in binary form must reproduce the above copyright notice,
305
- this list of conditions and the following disclaimer in the documentation
306
- and/or other materials provided with the distribution.
307
-
308
- 3. Neither the name of the copyright holder nor the names of its contributors
309
- may be used to endorse or promote products derived from this software
310
- without specific prior written permission.
311
-
312
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
313
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
314
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
315
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
316
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
317
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
318
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
319
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
320
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
321
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
322
- -------------------------------------------------------------
323
- Code Locations:
324
- Found only in the convenience binaries distributed from PyPI, which rely
325
- on pybind11 code during compilation.
326
-
327
-
328
252
  =============================================================
329
253
  Public Domain
330
254
  =============================================================
@@ -1,5 +1,5 @@
1
1
  Apache DataSketches C++ and Python
2
- Copyright 2022 The Apache Software Foundation
2
+ Copyright 2023 The Apache Software Foundation
3
3
 
4
4
  Copyright 2015-2018 Yahoo Inc.
5
5
  Copyright 2019-2020 Verizon Media
@@ -14,9 +14,7 @@ If you are interested in making contributions to this site please see our [Commu
14
14
 
15
15
  This code requires C++11.
16
16
 
17
- This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
18
-
19
- This library is header-only. The build process provided is only for building unit tests and the python library.
17
+ This library is header-only. The build process provided is only for building unit tests.
20
18
 
21
19
  Building the unit tests requires cmake 3.12.0 or higher.
22
20
 
@@ -35,19 +35,20 @@ install(TARGETS common EXPORT ${PROJECT_NAME})
35
35
 
36
36
  install(FILES
37
37
  ${CMAKE_CURRENT_BINARY_DIR}/include/version.hpp
38
+ include/binomial_bounds.hpp
39
+ include/bounds_binomial_proportions.hpp
40
+ include/ceiling_power_of_2.hpp
38
41
  include/common_defs.hpp
42
+ include/conditional_back_inserter.hpp
43
+ include/conditional_forward.hpp
44
+ include/count_zeros.hpp
45
+ include/inv_pow2_table.hpp
46
+ include/kolmogorov_smirnov_impl.hpp
47
+ include/kolmogorov_smirnov.hpp
39
48
  include/memory_operations.hpp
40
49
  include/MurmurHash3.h
41
- include/serde.hpp
42
- include/count_zeros.hpp
43
- include/inv_pow2_table.hpp
44
- include/binomial_bounds.hpp
45
- include/conditional_back_inserter.hpp
46
- include/conditional_forward.hpp
47
- include/ceiling_power_of_2.hpp
48
- include/bounds_binomial_proportions.hpp
50
+ include/optional.hpp
51
+ include/quantiles_sorted_view_impl.hpp
49
52
  include/quantiles_sorted_view.hpp
50
- include/quantiles_sorted_view_impl.hpp
51
- include/kolmogorov_smirnov.hpp
52
- include/kolmogorov_smirnov_impl.hpp
53
+ include/serde.hpp
53
54
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -28,27 +28,30 @@
28
28
  #include <chrono>
29
29
  #include <thread>
30
30
 
31
+ /// DataSketches namespace
31
32
  namespace datasketches {
32
33
 
33
34
  static const uint64_t DEFAULT_SEED = 9001;
34
35
 
35
36
  enum resize_factor { X1 = 0, X2, X4, X8 };
36
37
 
37
- template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
38
- template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
39
-
40
- // thread-safe random bit
41
- static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
42
- random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
43
- + std::hash<std::thread::id>{}(std::this_thread::get_id())));
38
+ template<typename A> using string = std::basic_string<char, std::char_traits<char>, typename std::allocator_traits<A>::template rebind_alloc<char>>;
44
39
 
45
40
  // common random declarations
46
41
  namespace random_utils {
47
42
  static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
48
43
  static thread_local std::mt19937_64 rand(rd());
49
44
  static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
50
- }
51
45
 
46
+ // thread-safe random bit
47
+ static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
48
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
49
+ + std::hash<std::thread::id>{}(std::this_thread::get_id())));
50
+
51
+ inline void override_seed(uint64_t s) {
52
+ rand.seed(s);
53
+ }
54
+ }
52
55
 
53
56
  // utility function to hide unused compiler warning
54
57
  // usually has no additional cost
@@ -22,8 +22,6 @@
22
22
 
23
23
  #include <cstdint>
24
24
 
25
- #include <stdio.h>
26
-
27
25
  namespace datasketches {
28
26
 
29
27
  static const uint8_t byte_leading_zeros_table[256] = {
@@ -22,13 +22,16 @@
22
22
 
23
23
  namespace datasketches {
24
24
 
25
+ /**
26
+ * Kolmogorov-Smirnov test for KLL or Quantiles sketches
27
+ */
25
28
  class kolmogorov_smirnov {
26
29
  public:
27
30
  /**
28
31
  * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
29
32
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
30
- * @param sketch1 KLL sketch 1
31
- * @param sketch2 KLL sketch 2
33
+ * @param sketch1 sketch 1
34
+ * @param sketch2 sketch 2
32
35
  * @return the raw delta between two KLL quantile sketches
33
36
  */
34
37
  template<typename Sketch>
@@ -39,8 +42,8 @@ public:
39
42
  * Adjusts the computed threshold by the error epsilons of the two given sketches.
40
43
  * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
41
44
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
42
- * @param sketch1 KLL sketch 1
43
- * @param sketch2 KLL sketch 2
45
+ * @param sketch1 sketch 1
46
+ * @param sketch2 sketch 2
44
47
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
45
48
  * @return the adjusted threshold to be compared with the raw delta
46
49
  */
@@ -52,8 +55,8 @@ public:
52
55
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
53
56
  * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
54
57
  * this will return false.
55
- * @param sketch1 KLL sketch 1
56
- * @param sketch2 KLL sketch 2
58
+ * @param sketch1 sketch 1
59
+ * @param sketch2 sketch 2
57
60
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
58
61
  * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
59
62
  * reflect the same underlying distribution) using the provided p-value.
@@ -0,0 +1,148 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _OPTIONAL_HPP_
21
+ #define _OPTIONAL_HPP_
22
+
23
+ // This is a simplistic substitute for std::optional until we require C++17
24
+
25
+ #if (__cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L))
26
+ #include <optional>
27
+ using std::optional;
28
+ #else
29
+
30
+ #include <type_traits>
31
+
32
+ namespace datasketches {
33
+
34
/**
 * Minimal stand-in for std::optional: holds either nothing or exactly one value
 * of type T, stored in place inside the object.
 *
 * Access through operator* and operator-> is unchecked: the caller must make sure
 * a value is present (see operator bool) before dereferencing.
 *
 * @tparam T type of the contained value
 */
template<typename T>
class optional {
public:

  /// Creates an empty optional.
  optional() noexcept: initialized_(false) {}

  /**
   * Creates an optional holding a copy of the given value.
   * @param value value to copy into the optional
   */
  optional(const T& value) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    new (&value_) T(value);
    initialized_ = true;
  }

  /**
   * Creates an optional holding the given value, moved into place.
   * @param value value to move into the optional
   */
  optional(T&& value) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    new (&value_) T(std::move(value));
    initialized_ = true;
  }

  /**
   * Conversion from an optional of a compatible type.
   * The contained value, if any, is constructed from a const reference to the other value.
   * @param other optional to convert from
   */
  template<typename TT>
  optional(const optional<TT>& other) noexcept(std::is_nothrow_constructible<T, const TT&>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /**
   * Copy constructor.
   * @param other optional to copy; the result is empty if other is empty
   */
  optional(const optional& other) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /**
   * Move constructor.
   * The value is moved out of other, but other still reports that it holds
   * a (moved-from) value afterwards.
   * @param other optional to move from; the result is empty if other is empty
   */
  optional(optional&& other) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
  }

  /// Destroys the contained value, if any.
  ~optional() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
  }

  /// @return true if a value is present
  explicit operator bool() const noexcept {
    return initialized_;
  }

  /**
   * Copy assignment.
   * Assigns to the existing value when both sides hold one, otherwise
   * constructs a new value or destroys the current one as needed.
   * @param other optional to copy from
   * @return reference to this object
   */
  optional& operator=(const optional& other)
  noexcept(std::is_nothrow_copy_constructible<T>::value && std::is_nothrow_copy_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = other.value_;
      } else {
        reset();
      }
    } else if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
    return *this;
  }

  /**
   * Move assignment.
   * Move-assigns to the existing value when both sides hold one, otherwise
   * move-constructs a new value or destroys the current one as needed.
   * @param other optional to move from
   * @return reference to this object
   */
  optional& operator=(optional&& other)
  noexcept(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = std::move(other.value_);
      } else {
        reset();
      }
    } else if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
    return *this;
  }

  /**
   * Constructs a new value in place from the given arguments.
   * A value that is already present is destroyed first. If the constructor of T
   * throws, the optional is left empty.
   * @param args arguments forwarded to the constructor of T
   */
  template<typename... Args>
  void emplace(Args&&... args)
  noexcept(std::is_nothrow_destructible<T>::value && std::is_nothrow_constructible<T, Args...>::value) {
    // without this the previous value would be overwritten and its destructor never run
    reset();
    // forward to keep rvalues as rvalues (required for move-only arguments)
    new (&value_) T(std::forward<Args>(args)...);
    initialized_ = true;
  }

  /// @return reference to the contained value; no check that a value is present
  T& operator*() & noexcept { return value_; }
  /// @return const reference to the contained value; no check that a value is present
  const T& operator*() const & noexcept { return value_; }
  /// @return rvalue reference to the contained value; no check that a value is present
  T&& operator*() && noexcept { return std::move(value_); }
  /// @return const rvalue reference to the contained value; no check that a value is present
  const T&& operator*() const && noexcept { return std::move(value_); }

  /// @return pointer to the contained value; no check that a value is present
  T* operator->() noexcept { return &value_; }
  /// @return const pointer to the contained value; no check that a value is present
  const T* operator->() const noexcept { return &value_; }

  /// Destroys the contained value, if any, and leaves the optional empty.
  void reset() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
    initialized_ = false;
  }

private:
  // anonymous union: storage for T whose lifetime is managed manually
  // (placement new on construction, explicit destructor call on reset)
  union {
    T value_;
  };
  bool initialized_;

  // for converting constructor
  template<typename TT> friend class optional;
};
143
+
144
+ } // namespace
145
+
146
+ #endif // C++17
147
+
148
+ #endif // _OPTIONAL_HPP_
@@ -27,6 +27,9 @@
27
27
 
28
28
  namespace datasketches {
29
29
 
30
+ /**
31
+ * Sorted view for quantiles sketches (REQ, KLL and Quantiles)
32
+ */
30
33
  template<
31
34
  typename T,
32
35
  typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
@@ -34,30 +37,119 @@ template<
34
37
  >
35
38
  class quantiles_sorted_view {
36
39
  public:
40
+ /// Entry type
37
41
  using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
38
42
  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
39
43
  using Container = std::vector<Entry, AllocEntry>;
40
44
 
45
+ /// @private
41
46
  quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
42
47
 
48
+ /// @private
43
49
  template<typename Iterator>
44
50
  void add(Iterator begin, Iterator end, uint64_t weight);
45
51
 
52
+ /// @private
46
53
  void convert_to_cummulative();
47
54
 
48
55
  class const_iterator;
56
+
57
+ /**
58
+ * Iterator pointing to the first entry in the view.
59
+ * If the view is empty, the returned iterator must not be dereferenced or incremented.
60
+ * @return iterator pointing to the first entry
61
+ */
49
62
  const_iterator begin() const;
63
+
64
+ /**
65
+ * Iterator pointing to the past-the-end entry in the view.
66
+ * The past-the-end entry is the hypothetical entry that would follow the last entry.
67
+ * It does not point to any entry, and must not be dereferenced or incremented.
68
+ * @return iterator pointing to the past-the-end entry
69
+ */
50
70
  const_iterator end() const;
51
71
 
72
+ /// @return size of the view
52
73
  size_t size() const;
53
74
 
75
+ /**
76
+ * Returns an approximation to the normalized rank of the given item.
77
+ *
78
+ * <p>If the view is empty this throws std::runtime_error.
79
+ *
80
+ * @param item to be ranked
81
+ * @param inclusive if true the weight of the given item is included into the rank.
82
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
83
+ * according to the Comparator.
84
+ *
85
+ * @return an approximate normalized rank of the given item (0 to 1 inclusive)
86
+ */
54
87
  double get_rank(const T& item, bool inclusive = true) const;
55
88
 
89
+ /**
90
+ * Quantile return type.
91
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
92
+ */
56
93
  using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
94
+
95
+ /**
96
+ * Returns an item from the sketch that is the best approximation to an item
97
+ * from the original stream with the given normalized rank.
98
+ *
99
+ * <p>If the view is empty this throws std::runtime_error.
100
+ *
101
+ * @param rank of an item in the hypothetical sorted stream.
102
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
103
+ *
104
+ * @return approximate quantile associated with the given normalized rank
105
+ */
57
106
  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
58
107
 
59
108
  using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
109
+
110
+ /**
111
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
112
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
113
+ *
114
+ * <p>If the view is empty this throws std::runtime_error.
115
+ *
116
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
117
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
118
+ *
119
+ * @param size the number of split points in the array
120
+ *
121
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
122
+ * if the sketch contains items equal to a split point, then in CDF such items are
123
+ * included into the interval to the left of split point. Otherwise they are included into
124
+ * the interval to the right of split point.
125
+ *
126
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
127
+ * of the input stream given the split_points. The value at array position j of the returned
128
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
129
+ * array. This can be viewed as array of ranks of the given split points plus one more value
130
+ * that is always 1.
131
+ */
60
132
  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
133
+
134
+ /**
135
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
136
+ * given a set of split points (items).
137
+ *
138
+ * <p>If the view is empty this throws std::runtime_error.
139
+ *
140
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
141
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
142
+ *
143
+ * @param size the number of split points in the array
144
+ *
145
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
146
+ * if the sketch contains items equal to a split point, then in PMF such items are
147
+ * included into the interval to the left of split point. Otherwise they are included into the interval
148
+ * to the right of split point.
149
+ *
150
+ * @return an array of m+1 doubles each of which is an approximation
151
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
152
+ */
61
153
  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
62
154
 
63
155
  private:
@@ -122,8 +214,6 @@ public:
122
214
  using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
123
215
  using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
124
216
 
125
- const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
126
-
127
217
  template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
128
218
  const value_type operator*() const { return Base::operator*(); }
129
219
 
@@ -147,6 +237,9 @@ public:
147
237
 
148
238
  private:
149
239
  Base begin;
240
+
241
+ friend class quantiles_sorted_view<T, C, A>;
242
+ const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
150
243
  };
151
244
 
152
245
  } /* namespace datasketches */
@@ -75,7 +75,7 @@ double quantiles_sorted_view<T, C, A>::get_rank(const T& item, bool inclusive) c
75
75
  template<typename T, typename C, typename A>
76
76
  auto quantiles_sorted_view<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
77
77
  if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
78
- uint64_t weight = inclusive ? std::ceil(rank * total_weight_) : rank * total_weight_;
78
+ uint64_t weight = static_cast<uint64_t>(inclusive ? std::ceil(rank * total_weight_) : rank * total_weight_);
79
79
  auto it = inclusive ?
80
80
  std::lower_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second())
81
81
  : std::upper_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second());