datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,291 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
21
+ #define _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
22
+
23
+ #include <cmath>
24
+ #include <stdexcept>
25
+
26
+ namespace datasketches {
27
+
28
+ /**
29
+ * Confidence intervals for binomial proportions.
30
+ *
31
+ * <p>This class computes an approximation to the Clopper-Pearson confidence interval
32
+ * for a binomial proportion. Exact Clopper-Pearson intervals are strictly
33
+ * conservative, but these approximations are not.</p>
34
+ *
35
+ * <p>The main inputs are numbers <i>n</i> and <i>k</i>, which are not the same as other things
36
+ * that are called <i>n</i> and <i>k</i> in our sketching library. There is also a third
37
+ * parameter, numStdDev, that specifies the desired confidence level.</p>
38
+ * <ul>
39
+ * <li><i>n</i> is the number of independent randomized trials. It is given and therefore known.
40
+ * </li>
41
+ * <li><i>p</i> is the probability of a trial being a success. It is unknown.</li>
42
+ * <li><i>k</i> is the number of trials (out of <i>n</i>) that turn out to be successes. It is
43
+ * a random variable governed by a binomial distribution. After any given
44
+ * batch of <i>n</i> independent trials, the random variable <i>k</i> has a specific
45
+ * value which is observed and is therefore known.</li>
46
+ * <li><i>pHat</i> = <i>k</i> / <i>n</i> is an unbiased estimate of the unknown success
47
+ * probability <i>p</i>.</li>
48
+ * </ul>
49
+ *
50
+ * <p>Alternatively, consider a coin with unknown heads probability <i>p</i>, where
51
+ * <i>n</i> is the number of independent flips of that coin, and <i>k</i> is the number
52
+ * of times that the coin comes up heads during a given batch of <i>n</i> flips.
53
+ * This class computes a frequentist confidence interval [lowerBoundOnP, upperBoundOnP] for the
54
+ * unknown <i>p</i>.</p>
55
+ *
56
+ * <p>Conceptually, the desired confidence level is specified by a tail probability delta.</p>
57
+ *
58
+ * <p>Ideally, over a large ensemble of independent batches of trials,
59
+ * the fraction of batches in which the true <i>p</i> lies below lowerBoundOnP would be at most
60
+ * delta, and the fraction of batches in which the true <i>p</i> lies above upperBoundOnP
61
+ * would also be at most delta.</p>
62
+ *
63
+ * <p>Setting aside the philosophical difficulties attaching to that statement, it isn't quite
64
+ * true because we are approximating the Clopper-Pearson interval.</p>
65
+ *
66
+ * <p>Finally, we point out that in this class's interface, the confidence parameter delta is
67
+ * not specified directly, but rather through a "number of standard deviations" numStdDev.
68
+ * The library effectively converts that to a delta via delta = normalCDF (-1.0 * numStdDev).</p>
69
+ *
70
+ * <p>It is perhaps worth emphasizing that the library is NOT merely adding and subtracting
71
+ * numStdDev standard deviations to the estimate. It is doing something better, that to some
72
+ * extent accounts for the fact that the binomial distribution has a non-gaussian shape.</p>
73
+ *
74
+ * <p>In particular, it is using an approximation to the inverse of the incomplete beta function
75
+ * that appears as formula 26.5.22 on page 945 of the "Handbook of Mathematical Functions"
76
+ * by Abramowitz and Stegun.</p>
77
+ *
78
+ * @author Kevin Lang
79
+ * @author Jon Malkin
80
+ */
81
class bounds_binomial_proportions { // confidence intervals for binomial proportions

public:
  /**
   * Computes lower bound of approximate Clopper-Pearson confidence interval for a binomial
   * proportion.
   *
   * <p>Implementation Notes:<br>
   * The approximateLowerBoundOnP is defined with respect to the right tail of the binomial
   * distribution.</p>
   * <ul>
   * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,k,n</i></sub>bino(<i>j;n,p</i>)
   * = delta.</li>
   * <li>We now restate that in terms of the left tail.</li>
   * <li>We want to solve for the p for which sum<sub><i>j,0,(k-1)</i></sub>bino(<i>j;n,p</i>)
   * = 1 - delta.</li>
   * <li>Define <i>x</i> = 1-<i>p</i>.</li>
   * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k+1,k)</i></sub> = 1 - delta.</li>
   * <li>We specify 1-delta via numStdDevs through the right tail of the standard normal
   * distribution.</li>
   * <li>Smaller values of numStdDevs correspond to bigger values of 1-delta and hence to smaller
   * values of delta. In fact, usefully small values of delta correspond to negative values of
   * numStdDevs. This is why the caller's num_std_devs is negated before it is handed to the
   * approximation formula.</li>
   * <li>return <i>p</i> = 1-<i>x</i>.</li>
   * </ul>
   *
   * <p>The cases n == 0 and k == 0 return 0.0 directly; k == 1 and k == n use closed-form
   * expressions instead of the approximation.</p>
   *
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed n.
   * @param num_std_devs the number of standard deviations defining the confidence interval
   * @return the lower bound of the approximate Clopper-Pearson confidence interval for the
   * unknown success probability.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double approximate_lower_bound_on_p(long n, long k, double num_std_devs) {
    check_inputs(n, k);
    if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
    if (k == 0) { return 0.0; }
    if (k == 1) {
      return exact_lower_bound_on_p_k_eq_1(static_cast<double>(n), delta_of_num_stdevs(num_std_devs));
    }
    if (k == n) {
      return exact_lower_bound_on_p_k_eq_n(static_cast<double>(n), delta_of_num_stdevs(num_std_devs));
    }
    const double x = abramowitz_stegun_formula_26p5p22(
        static_cast<double>((n - k) + 1), static_cast<double>(k), -1.0 * num_std_devs);
    return 1.0 - x; // which is p
  }

  /**
   * Computes upper bound of approximate Clopper-Pearson confidence interval for a binomial
   * proportion.
   *
   * <p>Implementation Notes:<br>
   * The approximateUpperBoundOnP is defined with respect to the left tail of the binomial
   * distribution.</p>
   * <ul>
   * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,0,k</i></sub>bino(<i>j;n,p</i>)
   * = delta.</li>
   * <li>Define <i>x</i> = 1-<i>p</i>.</li>
   * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k,k+1)</i></sub> = delta.</li>
   * <li>We specify delta via numStdDevs through the right tail of the standard normal
   * distribution.</li>
   * <li>Bigger values of numStdDevs correspond to smaller values of delta.</li>
   * <li>return <i>p</i> = 1-<i>x</i>.</li>
   * </ul>
   *
   * <p>The cases n == 0 and k == n return 1.0 directly; k == n-1 and k == 0 use closed-form
   * expressions instead of the approximation.</p>
   *
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed <i>n</i>.
   * @param num_std_devs the number of standard deviations defining the confidence interval
   * @return the upper bound of the approximate Clopper-Pearson confidence interval for the
   * unknown success probability.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double approximate_upper_bound_on_p(long n, long k, double num_std_devs) {
    check_inputs(n, k);
    if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
    if (k == n) { return 1.0; }
    if (k == (n - 1)) {
      return exact_upper_bound_on_p_k_eq_n_minus_1(static_cast<double>(n), delta_of_num_stdevs(num_std_devs));
    }
    if (k == 0) {
      return exact_upper_bound_on_p_k_eq_zero(static_cast<double>(n), delta_of_num_stdevs(num_std_devs));
    }
    const double x = abramowitz_stegun_formula_26p5p22(
        static_cast<double>(n - k), static_cast<double>(k + 1), num_std_devs);
    return 1.0 - x; // which is p
  }

  /**
   * Computes an estimate of an unknown binomial proportion.
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed n.
   * @return the estimate k / n of the unknown binomial proportion, or 0.5 when n is zero.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double estimate_unknown_p(long n, long k) {
    check_inputs(n, k);
    if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
    return static_cast<double>(k) / static_cast<double>(n);
  }

  /**
   * Computes an approximation to the erf() function.
   * Negative inputs are handled through the odd symmetry erf(-x) = -erf(x).
   * @param x is the input to the erf function
   * @return returns erf(x), accurate to roughly 7 decimal digits.
   */
  static inline double erf(double x) {
    if (x < 0.0) { return -1.0 * erf_of_nonneg(-1.0 * x); }
    return erf_of_nonneg(x);
  }

  /**
   * Computes an approximation to normal_cdf(x), using the identity
   * normal_cdf(x) = 0.5 * (1 + erf(x / sqrt(2))) and this class's erf() approximation.
   * @param x is the input to the normal_cdf function
   * @return returns the approximation to normalCDF(x).
   */
  static inline double normal_cdf(double x) {
    return 0.5 * (1.0 + erf(x / std::sqrt(2.0)));
  }

private:
  // Validates the (n, k) pair shared by all public entry points.
  // Throws std::invalid_argument when n < 0, k < 0 or k > n.
  static inline void check_inputs(long n, long k) {
    if (n < 0) { throw std::invalid_argument("N must be non-negative"); }
    if (k < 0) { throw std::invalid_argument("K must be non-negative"); }
    if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
  }

  // Abramowitz and Stegun formula 7.1.28, p. 88; claims accuracy of about 7 decimal digits.
  // Only meant to be called with x >= 0; erf() folds negative inputs onto this range.
  static inline double erf_of_nonneg(double x) {
    // The constants that appear below, formatted for easy checking against the book.
    //    a1 = 0.07052 30784
    //    a3 = 0.00927 05272
    //    a5 = 0.00027 65672
    //    a2 = 0.04228 20123
    //    a4 = 0.00015 20143
    //    a6 = 0.00004 30638
    static const double a1 = 0.0705230784;
    static const double a3 = 0.0092705272;
    static const double a5 = 0.0002765672;
    static const double a2 = 0.0422820123;
    static const double a4 = 0.0001520143;
    static const double a6 = 0.0000430638;
    const double x2 = x * x; // x squared, x cubed, etc.
    const double x3 = x2 * x;
    const double x4 = x2 * x2;
    const double x5 = x2 * x3;
    const double x6 = x3 * x3;
    const double sum = ( 1.0
                 + (a1 * x)
                 + (a2 * x2)
                 + (a3 * x3)
                 + (a4 * x4)
                 + (a5 * x5)
                 + (a6 * x6) );
    const double sum2 = sum * sum; // raise the sum to the 16th power by repeated squaring
    const double sum4 = sum2 * sum2;
    const double sum8 = sum4 * sum4;
    const double sum16 = sum8 * sum8;
    return 1.0 - (1.0 / sum16);
  }

  // Converts a number of standard deviations (kappa) into the tail probability
  // delta = normal_cdf(-kappa).
  static inline double delta_of_num_stdevs(double kappa) {
    return normal_cdf(-1.0 * kappa);
  }

  // Formula 26.5.22 on page 945 of Abramowitz & Stegun, which is an approximation
  // of the inverse of the incomplete beta function I_x(a,b) = delta
  // viewed as a scalar function of x.
  // In other words, we specify delta, and it gives us x (with a and b held constant).
  // However, delta is specified in an indirect way through yp which
  // is the number of stdDevs that leaves delta probability in the right
  // tail of a standard gaussian distribution.

  // We point out that the variable names correspond to those in the book,
  // and it is worth keeping it that way so that it will always be easy to verify
  // that the formula was typed in correctly.

  static inline double abramowitz_stegun_formula_26p5p22(double a, double b,
      double yp) {
    const double b2m1 = (2.0 * b) - 1.0;
    const double a2m1 = (2.0 * a) - 1.0;
    const double lambda = ((yp * yp) - 3.0) / 6.0;
    const double htmp = (1.0 / a2m1) + (1.0 / b2m1);
    const double h = 2.0 / htmp;
    const double term1 = (yp * std::sqrt(h + lambda)) / h;
    const double term2 = (1.0 / b2m1) - (1.0 / a2m1);
    const double term3 = (lambda + (5.0 / 6.0)) - (2.0 / (3.0 * h));
    const double w = term1 - (term2 * term3);
    const double xp = a / (a + (b * std::exp(2.0 * w)));
    return xp;
  }

  // Formulas for some special cases.

  // Upper bound when no successes were observed (k == 0).
  static inline double exact_upper_bound_on_p_k_eq_zero(double n, double delta) {
    return 1.0 - std::pow(delta, 1.0 / n);
  }

  // Lower bound when every trial was a success (k == n).
  static inline double exact_lower_bound_on_p_k_eq_n(double n, double delta) {
    return std::pow(delta, 1.0 / n);
  }

  // Lower bound when exactly one success was observed (k == 1).
  static inline double exact_lower_bound_on_p_k_eq_1(double n, double delta) {
    return 1.0 - std::pow(1.0 - delta, 1.0 / n);
  }

  // Upper bound when all trials but one were successes (k == n - 1).
  static inline double exact_upper_bound_on_p_k_eq_n_minus_1(double n, double delta) {
    return std::pow(1.0 - delta, 1.0 / n);
  }

};
288
+
289
+ }
290
+
291
+ #endif // _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
@@ -0,0 +1,41 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CEILING_POWER_OF_2_HPP_
21
+ #define CEILING_POWER_OF_2_HPP_
22
+
23
+ #include <cstdint>
24
+
25
+ namespace datasketches {
26
+
27
// Rounds a 32-bit value up to the nearest power of 2 (values that already are
// a power of 2 are returned unchanged).
// Technique taken from https://graphics.stanford.edu/~seander/bithacks.html
// Edge cases of the unsigned arithmetic: n == 0 and n > 2^31 both yield 0.
static inline uint32_t ceiling_power_of_2(uint32_t n) {
  // Work on n - 1 so that exact powers of 2 map to themselves.
  uint32_t smeared = n - 1;
  // Copy the highest set bit into every lower position (shifts 1, 2, 4, 8, 16).
  for (unsigned shift = 1; shift < 32; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  // All bits below the top one are now set; adding 1 carries to the next power of 2.
  return smeared + 1;
}
38
+
39
+ } /* namespace datasketches */
40
+
41
+ #endif // CEILING_POWER_OF_2_HPP_
@@ -0,0 +1,51 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _COMMON_DEFS_HPP_
21
+ #define _COMMON_DEFS_HPP_
22
+
23
+ #include <cstdint>
24
+ #include <string>
25
+ #include <memory>
26
+
27
+ namespace datasketches {
28
+
29
// Default seed value (9001); presumably the hash seed used when a caller does
// not supply one -- confirm against the sketches that reference it.
static const uint64_t DEFAULT_SEED = 9001;

// Allocator type A rebound so that it allocates char.
template<typename A>
using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;

// std::basic_string of char whose storage comes from allocator A (rebound to char).
template<typename A>
using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
33
+
34
// Utility function to hide "unused variable/parameter" compiler warnings.
// Takes any number of arguments of any type by forwarding reference and does
// nothing with them, so it usually has no additional cost.
// T is a parameter pack: with a single `typename T`, the declarator `T&&...`
// is one forwarding reference followed by a C-style ellipsis, which rejects
// zero-argument calls and pushes extra arguments through varargs.
template<typename... T> void unused(T&&...) {}
37
+
38
+ // common helping functions
39
+ // TODO: find a better place for them
40
+
41
// Floor of the base-2 logarithm of n; yields 0 for both n == 0 and n == 1.
// Written as a single return statement so it also satisfies the C++11
// restrictions on constexpr functions.
constexpr uint8_t log2(uint32_t n) {
  return (n > 1) ? 1 + log2(n >> 1) : 0;
}

// Returns log2(n) + 1, or log2(n) + 2 when n exceeds
// load_factor * 2^(log2(n) + 1), i.e. when the smaller size would be filled
// beyond load_factor.
// The power of two is built in 64 bits: log2(n) + 1 reaches 31 and 32 for
// n >= 2^30, which a plain int shift (1 << k) cannot represent.
// NOTE(review): load_factor is presumably expected in (0, 1]; a negative value
// makes the double-to-unsigned conversion undefined -- confirm against callers.
constexpr uint8_t lg_size_from_count(uint32_t n, double load_factor) {
  return log2(n) + ((n > static_cast<uint64_t>((static_cast<uint64_t>(1) << (log2(n) + 1)) * load_factor)) ? 2 : 1);
}
48
+
49
+ } // namespace
50
+
51
+ #endif // _COMMON_DEFS_HPP_
@@ -0,0 +1,68 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CONDITIONAL_BACK_INSERTER_HPP_
21
+ #define CONDITIONAL_BACK_INSERTER_HPP_
22
+
23
+ #include <iterator>
24
+ #include <functional>
25
+
26
+ namespace datasketches {
27
+
28
// Output iterator that behaves like std::back_insert_iterator but appends a
// value only when the stored predicate returns true for it; values failing
// the predicate are dropped.
// Predicate is stored exactly as given: if it is an lvalue reference type
// (as deduced by conditional_back_inserter for an lvalue argument), the
// iterator holds a reference and the referenced predicate must outlive it.
template <typename Container, typename Predicate>
class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
  using base_iterator = std::back_insert_iterator<Container>;

public:
  // Binds the iterator to container c and stores the (forwarded) predicate.
  template<typename P>
  conditional_back_insert_iterator(Container& c, P&& pred):
    base_iterator(c),
    pred_(std::forward<P>(pred))
  {}

  // MSVC seems to insist on having copy constructor and assignment
  conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
    base_iterator(other),
    pred_(other.pred_)
  {}

  conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
    base_iterator::operator=(other);
    pred_ = other.pred_;
    return *this;
  }

  // Appends a copy of value if the predicate accepts it.
  conditional_back_insert_iterator& operator=(const typename Container::value_type& value) {
    if (pred_(value)) {
      base_iterator::operator=(value);
    }
    return *this;
  }

  // Moves value into the container if the predicate accepts it.
  conditional_back_insert_iterator& operator=(typename Container::value_type&& value) {
    if (pred_(value)) {
      base_iterator::operator=(std::move(value));
    }
    return *this;
  }

  // Dereference and increment are no-ops returning the iterator itself; only
  // assignment has an effect.
  conditional_back_insert_iterator& operator*() { return *this; }
  conditional_back_insert_iterator& operator++() { return *this; }
  conditional_back_insert_iterator& operator++(int) { return *this; }

private:
  Predicate pred_;
};
60
+
61
+ template<typename Container, typename Predicate>
62
+ conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
63
+ return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
64
+ }
65
+
66
+ } /* namespace datasketches */
67
+
68
+ #endif