datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_QUANTILE_CALCULATOR_HPP_
21
+ #define KLL_QUANTILE_CALCULATOR_HPP_
22
+
23
+ #include <memory>
24
+
25
+ namespace datasketches {
26
+
27
/*
 * Helper that turns the leveled item array of a sketch into a single sorted
 * list of (item, cumulative weight) entries and answers quantile queries on it.
 *
 * T is the item type, C is the comparator used to order items and A is the
 * allocator that is rebound for the internal containers.
 */
template <typename T, typename C, typename A>
class kll_quantile_calculator {
  public:
    // assumes that all levels are sorted including level 0
    // items:      item array indexed by the values stored in levels
    // levels:     num_levels + 1 boundaries; level i occupies [levels[i], levels[i + 1])
    // n:          total count used to translate a fraction into a position
    //             (presumably the stream length -- confirm against the caller)
    kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n);

    // returns the item found at the position that corresponds to the given fraction
    T get_quantile(double fraction) const;

  private:
    // ---- internal types ----
    using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
    using vector_u32 = std::vector<uint32_t, AllocU32>;
    using Entry = std::pair<T, uint64_t>; // item and its weight (later: preceding cumulative weight)
    using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
    using Container = std::vector<Entry, AllocEntry>;

    // orders entries by their item (the first element of the pair) using Comparator
    template<typename Comparator>
    struct compare_pair_by_first {
      template<typename Entry1, typename Entry2>
      bool operator()(Entry1&& a, Entry2&& b) const {
        return Comparator()(std::forward<Entry1>(a).first, std::forward<Entry2>(b).first);
      }
    };

    // ---- state (declaration order matches the constructor's initializer list) ----
    uint64_t n_;
    vector_u32 levels_;
    Container entries_;

    // ---- construction helpers ----
    void populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels);
    void convert_to_preceding_cummulative();
    static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
    static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
    static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);

    // ---- query helpers ----
    static uint64_t pos_of_phi(double phi, uint64_t n);
    T approximately_answer_positional_query(uint64_t pos) const;
    uint32_t chunk_containing_pos(uint64_t pos) const;
    uint32_t search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const;
};
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "kll_quantile_calculator_impl.hpp"
66
+
67
+ #endif // KLL_QUANTILE_CALCULATOR_HPP_
@@ -0,0 +1,169 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_QUANTILE_CALCULATOR_IMPL_HPP_
21
+ #define KLL_QUANTILE_CALCULATOR_IMPL_HPP_
22
+
23
+ #include <memory>
24
+ #include <cmath>
25
+ #include <algorithm>
26
+
27
+ #include "kll_helper.hpp"
28
+
29
+ namespace datasketches {
30
+
31
+ template <typename T, typename C, typename A>
32
+ kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n):
33
+ n_(n), levels_(num_levels + 1)
34
+ {
35
+ const uint32_t num_items = levels[num_levels] - levels[0];
36
+ entries_.reserve(num_items);
37
+ populate_from_sketch(items, levels, num_levels);
38
+ merge_sorted_blocks(entries_, levels_.data(), levels_.size() - 1, num_items);
39
+ if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
40
+ convert_to_preceding_cummulative();
41
+ }
42
+
43
+ template <typename T, typename C, typename A>
44
+ T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
45
+ return approximately_answer_positional_query(pos_of_phi(fraction, n_));
46
+ }
47
+
48
+ template <typename T, typename C, typename A>
49
+ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
50
+ size_t src_level = 0;
51
+ size_t dst_level = 0;
52
+ uint64_t weight = 1;
53
+ uint32_t offset = levels[0];
54
+ while (src_level < num_levels) {
55
+ const uint32_t from_index(levels[src_level] - offset);
56
+ const uint32_t to_index(levels[src_level + 1] - offset); // exclusive
57
+ if (from_index < to_index) { // skip empty levels
58
+ for (uint32_t i = from_index; i < to_index; ++i) {
59
+ entries_.push_back(Entry(items[i + offset], weight));
60
+ }
61
+ levels_[dst_level] = from_index;
62
+ levels_[dst_level + 1] = to_index;
63
+ dst_level++;
64
+ }
65
+ src_level++;
66
+ weight *= 2;
67
+ }
68
+ if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
69
+ }
70
+
71
+ template <typename T, typename C, typename A>
72
+ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
73
+ if (pos >= n_) throw std::logic_error("position out of range");
74
+ const uint32_t num_items = levels_[levels_.size() - 1];
75
+ if (pos > entries_[num_items - 1].second) return entries_[num_items - 1].first;
76
+ const uint32_t index = chunk_containing_pos(pos);
77
+ return entries_[index].first;
78
+ }
79
+
80
+ template <typename T, typename C, typename A>
81
+ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
82
+ uint64_t subtotal = 0;
83
+ for (auto& entry: entries_) {
84
+ const uint64_t new_subtotal = subtotal + entry.second;
85
+ entry.second = subtotal;
86
+ subtotal = new_subtotal;
87
+ }
88
+ }
89
+
90
+ template <typename T, typename C, typename A>
91
+ uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
92
+ const uint64_t pos = std::floor(phi * n);
93
+ return (pos == n) ? n - 1 : pos;
94
+ }
95
+
96
+ template <typename T, typename C, typename A>
97
+ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
98
+ if (entries_.size() < 1) throw std::logic_error("array too short");
99
+ if (pos < entries_[0].second) throw std::logic_error("position too small");
100
+ if (pos > entries_[entries_.size() - 1].second) throw std::logic_error("position too large");
101
+ return search_for_chunk_containing_pos(pos, 0, entries_.size());
102
+ }
103
+
104
+ template <typename T, typename C, typename A>
105
+ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const {
106
+ if (l + 1 == r) {
107
+ return l;
108
+ }
109
+ const uint32_t m(l + (r - l) / 2);
110
+ if (entries_[m].second <= pos) {
111
+ return search_for_chunk_containing_pos(pos, m, r);
112
+ }
113
+ return search_for_chunk_containing_pos(pos, l, m);
114
+ }
115
+
116
+ template <typename T, typename C, typename A>
117
+ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
118
+ if (num_levels == 1) return;
119
+ Container temporary;
120
+ temporary.reserve(num_items);
121
+ merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
122
+ }
123
+
124
+ template <typename T, typename C, typename A>
125
+ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
126
+ uint8_t starting_level, uint8_t num_levels) {
127
+ if (num_levels == 1) return;
128
+ const uint8_t num_levels_1 = num_levels / 2;
129
+ const uint8_t num_levels_2 = num_levels - num_levels_1;
130
+ const uint8_t starting_level_1 = starting_level;
131
+ const uint8_t starting_level_2 = starting_level + num_levels_1;
132
+ const auto chunk_begin = temp.begin() + temp.size();
133
+ merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
134
+ merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
135
+ const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
136
+ std::merge(
137
+ std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
138
+ std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
139
+ orig.begin() + levels[starting_level], compare_pair_by_first<C>()
140
+ );
141
+ temp.erase(chunk_begin, temp.end());
142
+ }
143
+
144
+ template <typename T, typename C, typename A>
145
+ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
146
+ uint8_t starting_level, uint8_t num_levels) {
147
+ if (num_levels == 1) {
148
+ std::move(orig.begin() + levels[starting_level], orig.begin() + levels[starting_level + 1], std::back_inserter(temp));
149
+ return;
150
+ }
151
+ const uint8_t num_levels_1 = num_levels / 2;
152
+ const uint8_t num_levels_2 = num_levels - num_levels_1;
153
+ const uint8_t starting_level_1 = starting_level;
154
+ const uint8_t starting_level_2 = starting_level + num_levels_1;
155
+ merge_sorted_blocks_direct(orig, temp, levels, starting_level_1, num_levels_1);
156
+ merge_sorted_blocks_direct(orig, temp, levels, starting_level_2, num_levels_2);
157
+ std::merge(
158
+ std::make_move_iterator(orig.begin() + levels[starting_level_1]),
159
+ std::make_move_iterator(orig.begin() + levels[starting_level_1 + num_levels_1]),
160
+ std::make_move_iterator(orig.begin() + levels[starting_level_2]),
161
+ std::make_move_iterator(orig.begin() + levels[starting_level_2 + num_levels_2]),
162
+ std::back_inserter(temp),
163
+ compare_pair_by_first<C>()
164
+ );
165
+ }
166
+
167
+ } /* namespace datasketches */
168
+
169
+ #endif // KLL_QUANTILE_CALCULATOR_IMPL_HPP_
@@ -0,0 +1,559 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_SKETCH_HPP_
21
+ #define KLL_SKETCH_HPP_
22
+
23
+ #include <functional>
24
+ #include <memory>
25
+ #include <vector>
26
+
27
+ #include "kll_quantile_calculator.hpp"
28
+ #include "common_defs.hpp"
29
+ #include "serde.hpp"
30
+
31
+ namespace datasketches {
32
+
33
+ /*
34
+ * Implementation of a very compact quantiles sketch with lazy compaction scheme
35
+ * and nearly optimal accuracy per retained item.
36
+ * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
37
+ *
38
+ * <p>This is a stochastic streaming sketch that enables near-real time analysis of the
39
+ * approximate distribution of values from a very large stream in a single pass, requiring only
40
+ * that the values are comparable.
41
+ * The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
42
+ * inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF()
43
+ * (Cumulative Distribution Function).
44
+ *
45
+ * <p>As of May 2020, this implementation produces serialized sketches which are binary-compatible
46
+ * with the equivalent Java implementation only when template parameter T = float
47
+ * (32-bit single precision values).
48
+ *
49
+ * <p>Given an input stream of <i>N</i> numeric values, the <i>absolute rank</i> of any specific
50
+ * value is defined as its index <i>(0 to N-1)</i> in the hypothetical sorted stream of all
51
+ * <i>N</i> input values.
52
+ *
53
+ * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific value is defined as its
54
+ * <i>absolute rank</i> divided by <i>N</i>.
55
+ * Thus, the <i>normalized rank</i> is a value between zero and one.
56
+ * In the documentation for this sketch <i>absolute rank</i> is never used so any
57
+ * reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
58
+ *
59
+ * <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
60
+ * and its estimation error.
61
+ *
62
+ * <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
63
+ * between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
64
+ * Epsilon is always with respect to the rank and cannot be applied to the
65
+ * corresponding values.
66
+ *
67
+ * <p>The relationship between the normalized rank and the corresponding values can be viewed
68
+ * as a two dimensional monotonic plot with the normalized rank on one axis and the
69
+ * corresponding values on the other axis. If the y-axis is specified as the value-axis and
70
+ * the x-axis as the normalized rank, then <i>y = get_quantile(x)</i> is a monotonically
71
+ * increasing function.
72
+ *
73
+ * <p>The functions <i>get_quantile(rank)</i> and get_quantiles(...) translate ranks into
74
+ * corresponding values. The functions <i>get_rank(value),
75
+ * get_CDF(...) (Cumulative Distribution Function), and get_PMF(...)
76
+ * (Probability Mass Function)</i> perform the opposite operation and translate values into ranks.
77
+ *
78
+ * <p>The <i>get_PMF(...)</i> function has about 13 to 47% worse rank error (depending
79
+ * on <i>k</i>) than the other queries because the mass of each "bin" of the PMF has
80
+ * "double-sided" error from the upper and lower edges of the bin as a result of a subtraction,
81
+ * as the errors from the two edges can sometimes add.
82
+ *
83
+ * <p>The default <i>k</i> of 200 yields a "single-sided" epsilon of about 1.33% and a
84
+ * "double-sided" (PMF) epsilon of about 1.65%.
85
+ *
86
+ * <p>A <i>get_quantile(rank)</i> query has the following guarantees:
87
+ * <ul>
88
+ * <li>Let <i>v = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
89
+ * <li>The value <i>v</i> will be a value from the input stream.</li>
90
+ * <li>Let <i>trueRank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
91
+ * stream of all <i>N</i> values.</li>
92
+ * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
93
+ * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%. Note that the
94
+ * error is on the rank, not the value.</li>
95
+ * </ul>
96
+ *
97
+ * <p>A <i>get_rank(value)</i> query has the following guarantees:
98
+ * <ul>
99
+ * <li>Let <i>r = get_rank(v)</i> where <i>v</i> is a value between the min and max values of
100
+ * the input stream.</li>
101
+ * <li>Let <i>trueRank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
102
+ * stream of all <i>N</i> values.</li>
103
+ * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
104
+ * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%.</li>
105
+ * </ul>
106
+ *
107
+ * <p>A <i>get_PMF()</i> query has the following guarantees:
108
+ * <ul>
109
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
110
+ * between the min and max values of the input stream.</li>
111
+ * <li>Let <i>mass<sub>i</sub> = estimated mass between v<sub>i</sub> and v<sub>i+1</sub></i>.</li>
112
+ * <li>Let <i>trueMass</i> be the true mass between the values of <i>v<sub>i</sub>,
113
+ * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
114
+ * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
115
+ * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
116
+ * <li>r(m+1) includes the mass of all points larger than vm.</li>
117
+ * </ul>
118
+ *
119
+ * <p>A <i>get_CDF(...)</i> query has the following guarantees:
120
+ * <ul>
121
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
122
+ * between the min and max values of the input stream.</li>
123
+ * <li>Let <i>mass<sub>i</sub> = r<sub>i+1</sub> - r<sub>i</sub></i>.</li>
124
+ * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>v<sub>i</sub>,
125
+ * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
126
+ * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
127
+ * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
128
+ * <li>1 - r(m+1) includes the mass of all points larger than vm.</li>
129
+ * </ul>
130
+ *
131
+ * <p>From the above, it might seem like we could make some estimates to bound the
132
+ * <em>value</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
133
+ * let us derive error bounds or confidences around values. Because errors are independent, we
134
+ * can approximately bracket a value as shown below, but there are no error estimates available.
135
+ * Additionally, the interval may be quite large for certain distributions.
136
+ * <ul>
137
+ * <li>Let <i>v = get_quantile(r)</i>, the estimated quantile value of rank <i>r</i>.</li>
138
+ * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
139
+ * <li>Let <i>v<sub>lo</sub></i> = estimated quantile value of rank <i>(r - eps)</i>.</li>
140
+ * <li>Let <i>v<sub>hi</sub></i> = estimated quantile value of rank <i>(r + eps)</i>.</li>
141
+ * <li>Then <i>v<sub>lo</sub> &le; v &le; v<sub>hi</sub></i>, with 99% confidence.</li>
142
+ * </ul>
143
+ *
144
+ * author Kevin Lang
145
+ * author Alexander Saydakov
146
+ * author Lee Rhodes
147
+ */
148
+
149
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
150
+ template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
151
+ template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
152
+ template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
153
+ template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
154
+ template<typename A> using vector_d = std::vector<double, AllocD<A>>;
155
+
156
+ template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
157
+ class kll_sketch {
158
+ public:
159
+ static const uint8_t DEFAULT_M = 8;
160
+ static const uint16_t DEFAULT_K = 200;
161
+ static const uint16_t MIN_K = DEFAULT_M;
162
+ static const uint16_t MAX_K = (1 << 16) - 1;
163
+
164
+ explicit kll_sketch(uint16_t k = DEFAULT_K);
165
+ kll_sketch(const kll_sketch& other);
166
+ kll_sketch(kll_sketch&& other) noexcept;
167
+ ~kll_sketch();
168
+ kll_sketch& operator=(const kll_sketch& other);
169
+ kll_sketch& operator=(kll_sketch&& other);
170
+
171
+ /**
172
+ * Updates this sketch with the given data item.
173
+ * This method takes an lvalue.
174
+ * @param value an item from a stream of items
175
+ */
176
+ void update(const T& value);
177
+
178
+ /**
179
+ * Updates this sketch with the given data item.
180
+ * This method takes an rvalue.
181
+ * @param value an item from a stream of items
182
+ */
183
+ void update(T&& value);
184
+
185
+ /**
186
+ * Merges another sketch into this one.
187
+ * This method takes an lvalue.
188
+ * @param other sketch to merge into this one
189
+ */
190
+ void merge(const kll_sketch& other);
191
+
192
+ /**
193
+ * Merges another sketch into this one.
194
+ * This method takes an rvalue.
195
+ * @param other sketch to merge into this one
196
+ */
197
+ void merge(kll_sketch&& other);
198
+
199
+ /**
200
+ * Returns true if this sketch is empty.
201
+ * @return empty flag
202
+ */
203
+ bool is_empty() const;
204
+
205
+ /**
206
+ * Returns the length of the input stream.
207
+ * @return stream length
208
+ */
209
+ uint64_t get_n() const;
210
+
211
+ /**
212
+ * Returns the number of retained items (samples) in the sketch.
213
+ * @return the number of retained items
214
+ */
215
+ uint32_t get_num_retained() const;
216
+
217
+ /**
218
+ * Returns true if this sketch is in estimation mode.
219
+ * @return estimation mode flag
220
+ */
221
+ bool is_estimation_mode() const;
222
+
223
+ /**
224
+ * Returns the min value of the stream.
225
+ * For floating point types: if the sketch is empty this returns NaN.
226
+ * For other types: if the sketch is empty this throws runtime_error.
227
+ * @return the min value of the stream
228
+ */
229
+ T get_min_value() const;
230
+
231
+ /**
232
+ * Returns the max value of the stream.
233
+ * For floating point types: if the sketch is empty this returns NaN.
234
+ * For other types: if the sketch is empty this throws runtime_error.
235
+ * @return the max value of the stream
236
+ */
237
+ T get_max_value() const;
238
+
239
+ /**
240
+ * Returns an approximation to the value of the data item
241
+ * that would be preceded by the given fraction of a hypothetical sorted
242
+ * version of the input stream so far.
243
+ * <p>
244
+ * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
245
+ * so it should not be called multiple times to get different quantiles from the same
246
+ * sketch. Instead use get_quantiles(), which pays the overhead only once.
247
+ * <p>
248
+ * For floating point types: if the sketch is empty this returns NaN.
249
+ * For other types: if the sketch is empty this throws runtime_error.
250
+ *
251
+ * @param fraction the specified fractional position in the hypothetical sorted stream.
252
+ * These are also called normalized ranks or fractional ranks.
253
+ * If fraction = 0.0, the true minimum value of the stream is returned.
254
+ * If fraction = 1.0, the true maximum value of the stream is returned.
255
+ *
256
+ * @return the approximation to the value at the given fraction
257
+ */
258
+ T get_quantile(double fraction) const;
259
+
260
+ /**
261
+ * This is a more efficient multiple-query version of get_quantile().
262
+ * <p>
263
+ * This returns an array that could have been generated by using get_quantile() for each
264
+ * fractional rank separately, but doing so would be very inefficient.
265
+ * This method incurs the internal set-up overhead once and obtains multiple quantile values in
266
+ * a single query. It is strongly recommended that this method be used instead of multiple calls
267
+ * to get_quantile().
268
+ *
269
+ * <p>If the sketch is empty this returns an empty vector.
270
+ *
271
+ * @param fractions given array of fractional positions in the hypothetical sorted stream.
272
+ * These are also called normalized ranks or fractional ranks.
273
+ * These fractions must be in the interval [0.0, 1.0], inclusive.
274
+ *
275
+ * @return array of approximations to the given fractions in the same order as given fractions
276
+ * in the input array.
277
+ */
278
+ std::vector<T, A> get_quantiles(const double* fractions, uint32_t size) const;
279
+
280
+ /**
281
+ * This is a multiple-query version of get_quantile() that allows the caller to
282
+ * specify the number of evenly-spaced fractional ranks.
283
+ *
284
+ * <p>If the sketch is empty this returns an empty vector.
285
+ *
286
+ * @param num an integer that specifies the number of evenly-spaced fractional ranks.
287
+ * This must be an integer greater than 0. A value of 1 will return the min value.
288
+ * A value of 2 will return the min and the max value. A value of 3 will return the min,
289
+ * the median and the max value, etc.
290
+ *
291
+ * @return array of approximations to the given number of evenly-spaced fractional ranks.
292
+ */
293
+ std::vector<T, A> get_quantiles(size_t num) const;
294
+
295
+ /**
296
+ * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
297
+ * inclusive.
298
+ *
299
+ * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
300
+ * get_normalized_rank_error(false) function.
301
+ *
302
+ * <p>If the sketch is empty this returns NaN.
303
+ *
304
+ * @param value to be ranked
305
+ * @return an approximate rank of the given value
306
+ */
307
+ double get_rank(const T& value) const;
308
+
309
+ /**
310
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
311
+ * given a set of split points (values).
312
+ *
313
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
314
+ * get_normalized_rank_error(true) function.
315
+ *
316
+ * <p>If the sketch is empty this returns an empty vector.
317
+ *
318
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
319
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
320
+ * The definition of an "interval" is inclusive of the left split point (or minimum value) and
321
+ * exclusive of the right split point, with the exception that the last interval will include
322
+ * the maximum value.
323
+ * It is not necessary to include either the min or max values in these split points.
324
+ *
325
+ * @return an array of m+1 doubles each of which is an approximation
326
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
327
+ * The definition of an "interval" is inclusive of the left split point and exclusive of the right
328
+ * split point, with the exception that the last interval will include the maximum value.
329
+ */
330
+ vector_d<A> get_PMF(const T* split_points, uint32_t size) const;
331
+
332
+ /**
333
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
334
+ * cumulative analog of the PMF, of the input stream given a set of split points (values).
335
+ *
336
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
337
+ * get_normalized_rank_error(false) function.
338
+ *
339
+ * <p>If the sketch is empty this returns an empty vector.
340
+ *
341
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
342
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
343
+ * The definition of an "interval" is inclusive of the left split point (or minimum value) and
344
+ * exclusive of the right split point, with the exception that the last interval will include
345
+ * the maximum value.
346
+ * It is not necessary to include either the min or max values in these split points.
347
+ *
348
+ * @return an array of m+1 double values, which are a consecutive approximation to the CDF
349
+ * of the input stream given the split_points. The value at array position j of the returned
350
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
351
+ * array.
352
+ */
353
+ vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
354
+
355
+ /**
356
+ * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
357
+ * @param pmf if true, returns the "double-sided" normalized rank error for the get_PMF() function.
358
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
359
+ * @return if pmf is true, returns the normalized rank error for the get_PMF() function.
360
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
361
+ */
362
+ double get_normalized_rank_error(bool pmf) const;
363
+
364
+ /**
365
+ * Computes size needed to serialize the current state of the sketch.
366
+ * This version is for fixed-size arithmetic types (integral and floating point).
367
+ * @return size in bytes needed to serialize this sketch
368
+ */
369
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
370
+ size_t get_serialized_size_bytes() const;
371
+
372
+ /**
373
+ * Computes size needed to serialize the current state of the sketch.
374
+ * This version is for all other types and can be expensive since every item needs to be looked at.
375
+ * @return size in bytes needed to serialize this sketch
376
+ */
377
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
378
+ size_t get_serialized_size_bytes() const;
379
+
380
+ /**
381
+ * This method serializes the sketch into a given stream in a binary form
382
+ * @param os output stream
383
+ */
384
+ void serialize(std::ostream& os) const;
385
+
386
+ // This is a convenience alias for users
387
+ // The type returned by the following serialize method
388
+ typedef vector_u8<A> vector_bytes;
389
+
390
+ /**
391
+ * This method serializes the sketch as a vector of bytes.
392
+ * An optional header can be reserved in front of the sketch.
393
+ * It is a blank space of a given size.
394
+ * This header is used in Datasketches PostgreSQL extension.
395
+ * @param header_size_bytes space to reserve in front of the sketch
396
+ */
397
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
398
+
399
+ /**
400
+ * This method deserializes a sketch from a given stream.
401
+ * @param is input stream
402
+ * @return an instance of a sketch
403
+ */
404
+ static kll_sketch deserialize(std::istream& is);
405
+
406
+ /**
407
+ * This method deserializes a sketch from a given array of bytes.
408
+ * @param bytes pointer to the array of bytes
409
+ * @param size the size of the array
410
+ * @return an instance of a sketch
411
+ */
412
+ static kll_sketch deserialize(const void* bytes, size_t size);
413
+
414
+ /*
415
+ * Gets the normalized rank error given k and pmf.
416
+ * k - the configuration parameter
417
+ * pmf - if true, returns the "double-sided" normalized rank error for the get_PMF() function.
418
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
419
+ * Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials
420
+ */
421
+ static double get_normalized_rank_error(uint16_t k, bool pmf);
422
+
423
+ /**
424
+ * Prints a summary of the sketch.
425
+ * @param print_levels if true include information about levels
426
+ * @param print_items if true include sketch data
427
+ */
428
+ string<A> to_string(bool print_levels = false, bool print_items = false) const;
429
+
430
+ class const_iterator;
431
+ const_iterator begin() const;
432
+ const_iterator end() const;
433
+
434
+ #ifdef KLL_VALIDATION // accessors exposing internal state; compiled only when KLL_VALIDATION is defined
435
+ uint8_t get_num_levels() { return num_levels_; } // current value of num_levels_
436
+ uint32_t* get_levels() { return levels_.data(); } // levels_ is a vector_u32<A>, which has no conversion to uint32_t*; expose its contiguous storage instead
437
+ T* get_items() { return items_; } // raw pointer to the items buffer; ownership stays with the sketch
438
+ #endif
439
+
440
+ private:
441
+ /* Serialized sketch layout:
442
+ * Adr:
443
+ * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
444
+ * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts |
445
+ * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
446
+ * 1 ||-----------------------------------N------------------------------------------|
447
+ * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
448
+ * 2 ||---------------data----------------|-unused-|numLevels|-------min K-----------|
449
+ */
450
+
451
+ static const size_t EMPTY_SIZE_BYTES = 8;
452
+ static const size_t DATA_START_SINGLE_ITEM = 8;
453
+ static const size_t DATA_START = 20;
454
+
455
+ static const uint8_t SERIAL_VERSION_1 = 1;
456
+ static const uint8_t SERIAL_VERSION_2 = 2;
457
+ static const uint8_t FAMILY = 15;
458
+
459
+ enum flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM };
460
+
461
+ static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
462
+ static const uint8_t PREAMBLE_INTS_FULL = 5;
463
+
464
+ uint16_t k_;
465
+ uint8_t m_; // minimum buffer "width"
466
+ uint16_t min_k_; // for error estimation after merging with different k
467
+ uint64_t n_;
468
+ uint8_t num_levels_;
469
+ vector_u32<A> levels_;
470
+ T* items_;
471
+ uint32_t items_size_;
472
+ T* min_value_;
473
+ T* max_value_;
474
+ bool is_level_zero_sorted_;
475
+
476
+ // for deserialization
477
+ class item_deleter;
478
+ class items_deleter;
479
+ kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
480
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
481
+ std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted);
482
+
483
+ // common update code
484
+ inline void update_min_max(const T& value);
485
+ inline uint32_t internal_update();
486
+
487
+ // The following code is only valid in the special case of exactly reaching capacity while updating.
488
+ // It cannot be used while merging, while reducing k, or anything else.
489
+ void compress_while_updating(void);
490
+
491
+ uint8_t find_level_to_compact() const;
492
+ void add_empty_top_level_to_completely_full_sketch();
493
+ void sort_level_zero();
494
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> get_quantile_calculator();
495
+ vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
496
+ void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
497
+ const T* split_points, uint32_t size, double* buckets) const;
498
+ void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
499
+ const T* split_points, uint32_t size, double* buckets) const;
500
+ template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
501
+ void populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
502
+ void populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
503
+ void assert_correct_total_weight() const;
504
+ uint32_t safe_level_size(uint8_t level) const;
505
+ uint32_t get_num_retained_above_level_zero() const;
506
+
507
+ static void check_m(uint8_t m);
508
+ static void check_preamble_ints(uint8_t preamble_ints, uint8_t flags_byte);
509
+ static void check_serial_version(uint8_t serial_version);
510
+ static void check_family_id(uint8_t family_id);
511
+
512
+ // implementations for floating point types
513
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload selected only when TT is a floating point type
514
+ static TT get_invalid_value() { // value used to signal "no result" for floating point item types
515
+ return std::numeric_limits<TT>::quiet_NaN(); // NOTE(review): needs <limits>, which this header does not include directly - confirm it arrives transitively
516
+ }
517
+
518
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload selected only when TT is a floating point type
519
+ static inline bool check_update_value(TT value) { // returns true when value is not NaN; presumably update() ignores values for which this is false - confirm in kll_sketch_impl.hpp
520
+ return !std::isnan(value); // NOTE(review): needs <cmath>, which this header does not include directly - confirm it arrives transitively
521
+ }
522
+
523
+ // implementations for all other types
524
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload selected for every non-floating-point TT
525
+ static TT get_invalid_value() { // never returns: non-floating-point types have no NaN-like sentinel, so this always throws
526
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values"); // NOTE(review): needs <stdexcept>, which this header does not include directly - confirm it arrives transitively
527
+ }
528
+
529
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload selected for every non-floating-point TT
530
+ static inline bool check_update_value(TT) { // the argument is intentionally unnamed and unused
531
+ return true; // every value of a non-floating-point type passes the check
532
+ }
533
+
534
+ };
535
+
536
+ template<typename T, typename C, typename S, typename A> // same template parameters as the enclosing kll_sketch
537
+ class kll_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> { // read-only input iterator over the sketch; NOTE(review): std::iterator is deprecated since C++17 and <iterator> is not included directly by this header
538
+ public:
539
+ friend class kll_sketch<T, C, S, A>; // the sketch needs access to the private constructor below
540
+ const_iterator& operator++(); // pre-increment
541
+ const_iterator& operator++(int); // post-increment; NOTE(review): returns a reference rather than the previous value by copy - confirm the definition does not return a reference to a local
542
+ bool operator==(const const_iterator& other) const;
543
+ bool operator!=(const const_iterator& other) const;
544
+ const std::pair<const T&, const uint64_t> operator*() const; // yields a (reference to item, 64-bit count) pair; presumably the count is the item's weight - confirm in kll_sketch_impl.hpp
545
+ private:
546
+ const T* items; // non-owning pointer to the items being iterated
547
+ const uint32_t* levels; // non-owning pointer to the level offsets array
548
+ const uint8_t num_levels; // const member, so iterators of this class are not copy-assignable
549
+ uint32_t index; // current position; presumably an index into items - confirm
550
+ uint8_t level; // presumably the level that index currently falls in - confirm
551
+ uint64_t weight; // presumably the weight of items at the current level - confirm
552
+ const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels); // private: instances are created by kll_sketch (see friend declaration)
553
+ };
554
+
555
+ } /* namespace datasketches */
556
+
557
+ #include "kll_sketch_impl.hpp"
558
+
559
+ #endif