datasketches 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (237) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -205,7 +205,6 @@ APPENDIX A: How to apply the Apache License to your work.
205
205
  -------------------------------------------------------------
206
206
 
207
207
 
208
-
209
208
  APPENDIX B: Additional licenses relevant to this product.
210
209
 
211
210
  This product includes a number of source files with code that has been
@@ -215,43 +214,6 @@ APPENDIX B: Additional licenses relevant to this product.
215
214
  conditions of the following licenses.
216
215
 
217
216
 
218
-
219
- =============================================================
220
- MIT License
221
- =============================================================
222
- Original source code:
223
- https://github.com/benjaminjack/python_cpp_example
224
- -------------------------------------------------------------
225
- Copyright (c) 2017 Benjamin R. Jack
226
-
227
- MIT License (https://opensource.org/licenses/MIT):
228
-
229
- Permission is hereby granted, free of charge, to any person
230
- obtaining a copy of this software and associated documentation
231
- files (the "Software"), to deal in the Software without restriction,
232
- including without limitation the rights to use, copy, modify, merge,
233
- publish, distribute, sublicense, and/or sell copies of the Software,
234
- and to permit persons to whom the Software is furnished to do so,
235
- subject to the following conditions:
236
-
237
- The above copyright notice and this permission notice shall be
238
- included in all copies or substantial portions of the Software.
239
-
240
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
241
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
242
- OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
243
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
244
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
245
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
246
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
247
- SOFTWARE.
248
- -------------------------------------------------------------
249
- Code locations:
250
- * https://github.com/apache/datasketches-cpp/blob/master/setup.py
251
- that is adapted from the above.
252
-
253
-
254
-
255
217
  =============================================================
256
218
  Boost License (https://www.boost.org/LICENSE_1_0.txt)
257
219
  =============================================================
@@ -287,44 +249,6 @@ APPENDIX B: Additional licenses relevant to this product.
287
249
  of CMake configuration if configured to build tests.
288
250
 
289
251
 
290
- =============================================================
291
- BSD License
292
- =============================================================
293
- Original source code:
294
- https://github.com/pybind/pybind11/blob/master/LICENSE
295
-
296
- Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
297
-
298
- Redistribution and use in source and binary forms, with or without
299
- modification, are permitted provided that the following conditions are met:
300
-
301
- 1. Redistributions of source code must retain the above copyright notice, this
302
- list of conditions and the following disclaimer.
303
-
304
- 2. Redistributions in binary form must reproduce the above copyright notice,
305
- this list of conditions and the following disclaimer in the documentation
306
- and/or other materials provided with the distribution.
307
-
308
- 3. Neither the name of the copyright holder nor the names of its contributors
309
- may be used to endorse or promote products derived from this software
310
- without specific prior written permission.
311
-
312
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
313
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
314
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
315
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
316
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
317
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
318
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
319
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
320
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
321
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
322
- -------------------------------------------------------------
323
- Code Locations:
324
- Found only in the convenience binaries distributed from PyPI, which rely
325
- on pybind11 code during compilation.
326
-
327
-
328
252
  =============================================================
329
253
  Public Domain
330
254
  =============================================================
@@ -1,5 +1,5 @@
1
1
  Apache DataSketches C++ and Python
2
- Copyright 2022 The Apache Software Foundation
2
+ Copyright 2023 The Apache Software Foundation
3
3
 
4
4
  Copyright 2015-2018 Yahoo Inc.
5
5
  Copyright 2019-2020 Verizon Media
@@ -14,9 +14,7 @@ If you are interested in making contributions to this site please see our [Commu
14
14
 
15
15
  This code requires C++11.
16
16
 
17
- This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
18
-
19
- This library is header-only. The build process provided is only for building unit tests and the python library.
17
+ This library is header-only. The build process provided is only for building unit tests.
20
18
 
21
19
  Building the unit tests requires cmake 3.12.0 or higher.
22
20
 
@@ -35,19 +35,20 @@ install(TARGETS common EXPORT ${PROJECT_NAME})
35
35
 
36
36
  install(FILES
37
37
  ${CMAKE_CURRENT_BINARY_DIR}/include/version.hpp
38
+ include/binomial_bounds.hpp
39
+ include/bounds_binomial_proportions.hpp
40
+ include/ceiling_power_of_2.hpp
38
41
  include/common_defs.hpp
42
+ include/conditional_back_inserter.hpp
43
+ include/conditional_forward.hpp
44
+ include/count_zeros.hpp
45
+ include/inv_pow2_table.hpp
46
+ include/kolmogorov_smirnov_impl.hpp
47
+ include/kolmogorov_smirnov.hpp
39
48
  include/memory_operations.hpp
40
49
  include/MurmurHash3.h
41
- include/serde.hpp
42
- include/count_zeros.hpp
43
- include/inv_pow2_table.hpp
44
- include/binomial_bounds.hpp
45
- include/conditional_back_inserter.hpp
46
- include/conditional_forward.hpp
47
- include/ceiling_power_of_2.hpp
48
- include/bounds_binomial_proportions.hpp
50
+ include/optional.hpp
51
+ include/quantiles_sorted_view_impl.hpp
49
52
  include/quantiles_sorted_view.hpp
50
- include/quantiles_sorted_view_impl.hpp
51
- include/kolmogorov_smirnov.hpp
52
- include/kolmogorov_smirnov_impl.hpp
53
+ include/serde.hpp
53
54
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -28,27 +28,30 @@
28
28
  #include <chrono>
29
29
  #include <thread>
30
30
 
31
+ /// DataSketches namespace
31
32
  namespace datasketches {
32
33
 
33
34
  static const uint64_t DEFAULT_SEED = 9001;
34
35
 
35
36
  enum resize_factor { X1 = 0, X2, X4, X8 };
36
37
 
37
- template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
38
- template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
39
-
40
- // thread-safe random bit
41
- static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
42
- random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
43
- + std::hash<std::thread::id>{}(std::this_thread::get_id())));
38
+ template<typename A> using string = std::basic_string<char, std::char_traits<char>, typename std::allocator_traits<A>::template rebind_alloc<char>>;
44
39
 
45
40
  // common random declarations
46
41
  namespace random_utils {
47
42
  static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
48
43
  static thread_local std::mt19937_64 rand(rd());
49
44
  static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
50
- }
51
45
 
46
+ // thread-safe random bit
47
+ static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
48
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
49
+ + std::hash<std::thread::id>{}(std::this_thread::get_id())));
50
+
51
+ inline void override_seed(uint64_t s) {
52
+ rand.seed(s);
53
+ }
54
+ }
52
55
 
53
56
  // utility function to hide unused compiler warning
54
57
  // usually has no additional cost
@@ -22,8 +22,6 @@
22
22
 
23
23
  #include <cstdint>
24
24
 
25
- #include <stdio.h>
26
-
27
25
  namespace datasketches {
28
26
 
29
27
  static const uint8_t byte_leading_zeros_table[256] = {
@@ -22,13 +22,16 @@
22
22
 
23
23
  namespace datasketches {
24
24
 
25
+ /**
26
+ * Kolmogorov-Smirnov test for KLL or Quantiles sketches
27
+ */
25
28
  class kolmogorov_smirnov {
26
29
  public:
27
30
  /**
28
31
  * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
29
32
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
30
- * @param sketch1 KLL sketch 1
31
- * @param sketch2 KLL sketch 2
33
+ * @param sketch1 sketch 1
34
+ * @param sketch2 sketch 2
32
35
  * @return the raw delta between the two quantile sketches
33
36
  */
34
37
  template<typename Sketch>
@@ -39,8 +42,8 @@ public:
39
42
  * Adjusts the computed threshold by the error epsilons of the two given sketches.
40
43
  * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
41
44
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
42
- * @param sketch1 KLL sketch 1
43
- * @param sketch2 KLL sketch 2
45
+ * @param sketch1 sketch 1
46
+ * @param sketch2 sketch 2
44
47
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
45
48
  * @return the adjusted threshold to be compared with the raw delta
46
49
  */
@@ -52,8 +55,8 @@ public:
52
55
  * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
53
56
  * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
54
57
  * this will return false.
55
- * @param sketch1 KLL sketch 1
56
- * @param sketch2 KLL sketch 2
58
+ * @param sketch1 sketch 1
59
+ * @param sketch2 sketch 2
57
60
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
58
61
  * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
59
62
  * reflect the same underlying distribution) using the provided p-value.
@@ -0,0 +1,148 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _OPTIONAL_HPP_
21
+ #define _OPTIONAL_HPP_
22
+
23
+ // This is a simplistic substitute for std::optional until we require C++17
24
+
25
+ #if (__cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L))
26
+ #include <optional>
27
+ using std::optional;
28
+ #else
29
+
30
+ #include <type_traits>
31
+
32
+ namespace datasketches {
33
+
34
/**
 * Simplistic substitute for std::optional, used until C++17 is required.
 * Holds either no value or exactly one value of type T constructed in place.
 * Access through operator* and operator-> is unchecked: they must only be
 * called while the optional holds a value (test with operator bool first).
 */
template<typename T>
class optional {
public:

  /// Creates an empty optional; no T is constructed.
  optional() noexcept: initialized_(false) {}

  /// Creates an optional holding a copy of the given value.
  optional(const T& value) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    new (&value_) T(value);
    initialized_ = true;
  }

  /// Creates an optional holding a value move-constructed from the given one.
  optional(T&& value) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    new (&value_) T(std::move(value));
    initialized_ = true;
  }

  /// Conversion from an optional of a compatible type TT (T must be constructible from const TT&).
  template<typename TT>
  optional(const optional<TT>& other) noexcept(std::is_nothrow_constructible<T, const TT&>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /// Copy constructor: copies the held value, if any.
  optional(const optional& other) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /// Move constructor: moves the held value, if any. The source stays engaged (holding a moved-from T).
  optional(optional&& other) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
  }

  /// Destroys the held value, if any.
  ~optional() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
  }

  /// @return true if a value is held
  explicit operator bool() const noexcept {
    return initialized_;
  }

  /// Copy assignment: assigns, constructs or destroys the held value so that this mirrors other.
  optional& operator=(const optional& other)
      noexcept(std::is_nothrow_copy_constructible<T>::value && std::is_nothrow_copy_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = other.value_;
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(other.value_);
        initialized_ = true;
      }
    }
    return *this;
  }

  /// Move assignment: same as copy assignment, but the value is moved out of other.
  optional& operator=(optional&& other)
      noexcept(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = std::move(other.value_);
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(std::move(other.value_));
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Constructs a new value in place from the given arguments.
   * Any value currently held is destroyed first, so nothing is leaked.
   * If the constructor of T throws, the optional is left empty.
   * @param args constructor arguments, perfectly forwarded to T
   */
  template<typename... Args>
  void emplace(Args&&... args)
      noexcept(std::is_nothrow_constructible<T, Args...>::value && std::is_nothrow_destructible<T>::value) {
    reset(); // destroy the old value before reusing its storage
    new (&value_) T(std::forward<Args>(args)...);
    initialized_ = true;
  }

  // unchecked access: the caller must make sure a value is held
  T& operator*() & noexcept { return value_; }
  const T& operator*() const & noexcept { return value_; }
  T&& operator*() && noexcept { return std::move(value_); }
  const T&& operator*() const && noexcept { return std::move(value_); }

  T* operator->() noexcept { return &value_; }
  const T* operator->() const noexcept { return &value_; }

  /// Destroys the held value, if any, leaving the optional empty.
  void reset() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
    initialized_ = false;
  }

private:
  // anonymous union provides raw, correctly aligned storage without constructing T
  union {
    T value_;
  };
  bool initialized_;

  // for converting constructor
  template<typename TT> friend class optional;
};
143
+
144
+ } // namespace
145
+
146
+ #endif // C++17
147
+
148
+ #endif // _OPTIONAL_HPP_
@@ -27,6 +27,9 @@
27
27
 
28
28
  namespace datasketches {
29
29
 
30
+ /**
31
+ * Sorted view for quantiles sketches (REQ, KLL and Quantiles)
32
+ */
30
33
  template<
31
34
  typename T,
32
35
  typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
@@ -34,30 +37,119 @@ template<
34
37
  >
35
38
  class quantiles_sorted_view {
36
39
  public:
40
+ /// Entry type
37
41
  using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
38
42
  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
39
43
  using Container = std::vector<Entry, AllocEntry>;
40
44
 
45
+ /// @private
41
46
  quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
42
47
 
48
+ /// @private
43
49
  template<typename Iterator>
44
50
  void add(Iterator begin, Iterator end, uint64_t weight);
45
51
 
52
+ /// @private
46
53
  void convert_to_cummulative();
47
54
 
48
55
  class const_iterator;
56
+
57
+ /**
58
+ * Iterator pointing to the first entry in the view.
59
+ * If the view is empty, the returned iterator must not be dereferenced or incremented.
60
+ * @return iterator pointing to the first entry
61
+ */
49
62
  const_iterator begin() const;
63
+
64
+ /**
65
+ * Iterator pointing to the past-the-end entry in the view.
66
+ * The past-the-end entry is the hypothetical entry that would follow the last entry.
67
+ * It does not point to any entry, and must not be dereferenced or incremented.
68
+ * @return iterator pointing to the past-the-end entry
69
+ */
50
70
  const_iterator end() const;
51
71
 
72
+ /// @return size of the view
52
73
  size_t size() const;
53
74
 
75
+ /**
76
+ * Returns an approximation to the normalized rank of the given item.
77
+ *
78
+ * <p>If the view is empty this throws std::runtime_error.
79
+ *
80
+ * @param item to be ranked
81
+ * @param inclusive if true the weight of the given item is included into the rank.
82
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
83
+ * according to the Comparator.
84
+ *
85
+ * @return an approximate normalized rank of the given item (0 to 1 inclusive)
86
+ */
54
87
  double get_rank(const T& item, bool inclusive = true) const;
55
88
 
89
+ /**
90
+ * Quantile return type.
91
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
92
+ */
56
93
  using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
94
+
95
+ /**
96
+ * Returns an item from the sketch that is the best approximation to an item
97
+ * from the original stream with the given normalized rank.
98
+ *
99
+ * <p>If the view is empty this throws std::runtime_error.
100
+ *
101
+ * @param rank of an item in the hypothetical sorted stream.
102
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
103
+ *
104
+ * @return approximate quantile associated with the given normalized rank
105
+ */
57
106
  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
58
107
 
59
108
  using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
109
+
110
+ /**
111
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
112
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
113
+ *
114
+ * <p>If the view is empty this throws std::runtime_error.
115
+ *
116
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
117
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
118
+ *
119
+ * @param size the number of split points in the array
120
+ *
121
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
122
+ * if the sketch contains items equal to a split point, then in CDF such items are
123
+ * included into the interval to the left of split point. Otherwise they are included into
124
+ * the interval to the right of split point.
125
+ *
126
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
127
+ * of the input stream given the split_points. The value at array position j of the returned
128
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
129
+ * array. This can be viewed as array of ranks of the given split points plus one more value
130
+ * that is always 1.
131
+ */
60
132
  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
133
+
134
+ /**
135
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
136
+ * given a set of split points (items).
137
+ *
138
+ * <p>If the view is empty this throws std::runtime_error.
139
+ *
140
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
141
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
142
+ *
143
+ * @param size the number of split points in the array
144
+ *
145
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
146
+ * if the sketch contains items equal to a split point, then in PMF such items are
147
+ * included into the interval to the left of split point. Otherwise they are included into the interval
148
+ * to the right of split point.
149
+ *
150
+ * @return an array of m+1 doubles each of which is an approximation
151
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
152
+ */
61
153
  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
62
154
 
63
155
  private:
@@ -122,8 +214,6 @@ public:
122
214
  using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
123
215
  using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
124
216
 
125
- const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
126
-
127
217
  template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
128
218
  const value_type operator*() const { return Base::operator*(); }
129
219
 
@@ -147,6 +237,9 @@ public:
147
237
 
148
238
  private:
149
239
  Base begin;
240
+
241
+ friend class quantiles_sorted_view<T, C, A>;
242
+ const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
150
243
  };
151
244
 
152
245
  } /* namespace datasketches */
@@ -75,7 +75,7 @@ double quantiles_sorted_view<T, C, A>::get_rank(const T& item, bool inclusive) c
75
75
  template<typename T, typename C, typename A>
76
76
  auto quantiles_sorted_view<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
77
77
  if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
78
- uint64_t weight = inclusive ? std::ceil(rank * total_weight_) : rank * total_weight_;
78
+ uint64_t weight = static_cast<uint64_t>(inclusive ? std::ceil(rank * total_weight_) : rank * total_weight_);
79
79
  auto it = inclusive ?
80
80
  std::lower_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second())
81
81
  : std::upper_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second());