datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,291 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
21
+ #define _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
22
+
23
+ #include <cmath>
24
+ #include <stdexcept>
25
+
26
+ namespace datasketches {
27
+
28
+ /**
29
+ * Confidence intervals for binomial proportions.
30
+ *
31
+ * <p>This class computes an approximation to the Clopper-Pearson confidence interval
32
+ * for a binomial proportion. Exact Clopper-Pearson intervals are strictly
33
+ * conservative, but these approximations are not.</p>
34
+ *
35
+ * <p>The main inputs are numbers <i>n</i> and <i>k</i>, which are not the same as other things
36
+ * that are called <i>n</i> and <i>k</i> in our sketching library. There is also a third
37
+ * parameter, numStdDev, that specifies the desired confidence level.</p>
38
+ * <ul>
39
+ * <li><i>n</i> is the number of independent randomized trials. It is given and therefore known.
40
+ * </li>
41
+ * <li><i>p</i> is the probability of a trial being a success. It is unknown.</li>
42
+ * <li><i>k</i> is the number of trials (out of <i>n</i>) that turn out to be successes. It is
43
+ * a random variable governed by a binomial distribution. After any given
44
+ * batch of <i>n</i> independent trials, the random variable <i>k</i> has a specific
45
+ * value which is observed and is therefore known.</li>
46
+ * <li><i>pHat</i> = <i>k</i> / <i>n</i> is an unbiased estimate of the unknown success
47
+ * probability <i>p</i>.</li>
48
+ * </ul>
49
+ *
50
+ * <p>Alternatively, consider a coin with unknown heads probability <i>p</i>. Where
51
+ * <i>n</i> is the number of independent flips of that coin, and <i>k</i> is the number
52
+ * of times that the coin comes up heads during a given batch of <i>n</i> flips.
53
+ * This class computes a frequentist confidence interval [lowerBoundOnP, upperBoundOnP] for the
54
+ * unknown <i>p</i>.</p>
55
+ *
56
+ * <p>Conceptually, the desired confidence level is specified by a tail probability delta.</p>
57
+ *
58
+ * <p>Ideally, over a large ensemble of independent batches of trials,
59
+ * the fraction of batches in which the true <i>p</i> lies below lowerBoundOnP would be at most
60
+ * delta, and the fraction of batches in which the true <i>p</i> lies above upperBoundOnP
61
+ * would also be at most delta.</p>
62
+ *
63
+ * <p>Setting aside the philosophical difficulties attaching to that statement, it isn't quite
64
+ * true because we are approximating the Clopper-Pearson interval.</p>
65
+ *
66
+ * <p>Finally, we point out that in this class's interface, the confidence parameter delta is
67
+ * not specified directly, but rather through a "number of standard deviations" numStdDev.
68
+ * The library effectively converts that to a delta via delta = normalCDF (-1.0 * numStdDev).</p>
69
+ *
70
+ * <p>It is perhaps worth emphasizing that the library is NOT merely adding and subtracting
71
+ * numStdDev standard deviations to the estimate. It is doing something better, that to some
72
+ * extent accounts for the fact that the binomial distribution has a non-gaussian shape.</p>
73
+ *
74
+ * <p>In particular, it is using an approximation to the inverse of the incomplete beta function
75
+ * that appears as formula 26.5.22 on page 945 of the "Handbook of Mathematical Functions"
76
+ * by Abramowitz and Stegun.</p>
77
+ *
78
+ * @author Kevin Lang
79
+ * @author Jon Malkin
80
+ */
81
class bounds_binomial_proportions { // confidence intervals for binomial proportions

public:
  /**
   * Computes lower bound of approximate Clopper-Pearson confidence interval for a binomial
   * proportion.
   *
   * <p>Implementation Notes:<br>
   * The approximateLowerBoundOnP is defined with respect to the right tail of the binomial
   * distribution.</p>
   * <ul>
   * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,k,n</i></sub>bino(<i>j;n,p</i>)
   * = delta.</li>
   * <li>We now restate that in terms of the left tail.</li>
   * <li>We want to solve for the p for which sum<sub><i>j,0,(k-1)</i></sub>bino(<i>j;n,p</i>)
   * = 1 - delta.</li>
   * <li>Define <i>x</i> = 1-<i>p</i>.</li>
   * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k+1,k)</i></sub> = 1 - delta.</li>
   * <li>We specify 1-delta via numStdDevs through the right tail of the standard normal
   * distribution.</li>
   * <li>Smaller values of numStdDevs correspond to bigger values of 1-delta and hence to smaller
   * values of delta. In fact, usefully small values of delta correspond to negative values of
   * numStdDevs.</li>
   * <li>return <i>p</i> = 1-<i>x</i>.</li>
   * </ul>
   *
   * <p>The cases n == 0 and k == 0 return 0.0 directly; k == 1 and k == n use closed-form
   * expressions; every other case uses the Abramowitz and Stegun approximation, which is
   * called with the negated num_std_devs.</p>
   *
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed n.
   * @param num_std_devs the number of standard deviations defining the confidence interval
   * @return the lower bound of the approximate Clopper-Pearson confidence interval for the
   * unknown success probability.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double approximate_lower_bound_on_p(long n, long k, double num_std_devs) {
    check_inputs(n, k);
    if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
    if (k == 0) { return 0.0; }
    if (k == 1) { return exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs)); }
    if (k == n) { return exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs)); }
    const double x = abramowitz_stegun_formula_26p5p22((n - k) + 1, k, -num_std_devs);
    return 1.0 - x; // which is p
  }

  /**
   * Computes upper bound of approximate Clopper-Pearson confidence interval for a binomial
   * proportion.
   *
   * <p>Implementation Notes:<br>
   * The approximateUpperBoundOnP is defined with respect to the left tail of the binomial
   * distribution.</p>
   * <ul>
   * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,0,k</i></sub>bino(<i>j;n,p</i>)
   * = delta.</li>
   * <li>Define <i>x</i> = 1-<i>p</i>.</li>
   * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k,k+1)</i></sub> = delta.</li>
   * <li>We specify delta via numStdDevs through the right tail of the standard normal
   * distribution.</li>
   * <li>Bigger values of numStdDevs correspond to smaller values of delta.</li>
   * <li>return <i>p</i> = 1-<i>x</i>.</li>
   * </ul>
   *
   * <p>The cases n == 0 and k == n return 1.0 directly; k == n-1 and k == 0 use closed-form
   * expressions; every other case uses the Abramowitz and Stegun approximation.</p>
   *
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed <i>n</i>.
   * @param num_std_devs the number of standard deviations defining the confidence interval
   * @return the upper bound of the approximate Clopper-Pearson confidence interval for the
   * unknown success probability.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double approximate_upper_bound_on_p(long n, long k, double num_std_devs) {
    check_inputs(n, k);
    if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
    if (k == n) { return 1.0; }
    if (k == (n - 1)) {
      return exact_upper_bound_on_p_k_eq_n_minus_one(n, delta_of_num_stdevs(num_std_devs));
    }
    if (k == 0) {
      return exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs));
    }
    const double x = abramowitz_stegun_formula_26p5p22(n - k, k + 1, num_std_devs);
    return 1.0 - x; // which is p
  }

  /**
   * Computes an estimate of an unknown binomial proportion.
   * @param n is the number of trials. Must be non-negative.
   * @param k is the number of successes. Must be non-negative, and cannot exceed n.
   * @return the estimate k / n of the unknown binomial proportion, or 0.5 when n is zero.
   * @throws std::invalid_argument if n or k is negative, or if k exceeds n.
   */
  static inline double estimate_unknown_p(long n, long k) {
    check_inputs(n, k);
    if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
    return static_cast<double>(k) / static_cast<double>(n);
  }

  /**
   * Computes an approximation to the erf() function.
   * Negative inputs are handled through the odd symmetry erf(-x) = -erf(x).
   * @param x is the input to the erf function
   * @return returns erf(x), accurate to roughly 7 decimal digits.
   */
  static inline double erf(double x) {
    if (x < 0.0) { return -erf_of_nonneg(-x); }
    return erf_of_nonneg(x);
  }

  /**
   * Computes an approximation to normal_cdf(x).
   * @param x is the input to the normal_cdf function
   * @return returns the approximation to normalCDF(x), computed as 0.5 * (1 + erf(x / sqrt(2))).
   */
  static inline double normal_cdf(double x) {
    return 0.5 * (1.0 + erf(x / std::sqrt(2.0)));
  }

private:
  // Validates the (n, k) pair shared by all public entry points:
  // both must be non-negative and k must not exceed n.
  static inline void check_inputs(long n, long k) {
    if (n < 0) { throw std::invalid_argument("N must be non-negative"); }
    if (k < 0) { throw std::invalid_argument("K must be non-negative"); }
    if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
  }

  //@formatter:off
  // Abramowitz and Stegun formula 7.1.28, p. 88; Claims accuracy of about 7 decimal digits.
  // Callers are expected to pass x >= 0; erf() takes care of negative inputs.
  static inline double erf_of_nonneg(double x) {
    // The constants that appear below, formatted for easy checking against the book.
    //    a1 = 0.07052 30784
    //    a3 = 0.00927 05272
    //    a5 = 0.00027 65672
    //    a2 = 0.04228 20123
    //    a4 = 0.00015 20143
    //    a6 = 0.00004 30638
    const double a1 = 0.0705230784;
    const double a3 = 0.0092705272;
    const double a5 = 0.0002765672;
    const double a2 = 0.0422820123;
    const double a4 = 0.0001520143;
    const double a6 = 0.0000430638;
    const double x2 = x * x; // x squared, x cubed, etc.
    const double x3 = x2 * x;
    const double x4 = x2 * x2;
    const double x5 = x2 * x3;
    const double x6 = x3 * x3;
    const double sum = ( 1.0
                 + (a1 * x)
                 + (a2 * x2)
                 + (a3 * x3)
                 + (a4 * x4)
                 + (a5 * x5)
                 + (a6 * x6) );
    const double sum2 = sum * sum; // raise the sum to the 16th power
    const double sum4 = sum2 * sum2;
    const double sum8 = sum4 * sum4;
    const double sum16 = sum8 * sum8;
    return 1.0 - (1.0 / sum16);
  }

  // Converts a number of standard deviations into the tail probability
  // delta = normal_cdf(-kappa).
  static inline double delta_of_num_stdevs(double kappa) {
    return normal_cdf(-kappa);
  }

  //@formatter:on
  // Formula 26.5.22 on page 945 of Abramowitz & Stegun, which is an approximation
  // of the inverse of the incomplete beta function I_x(a,b) = delta
  // viewed as a scalar function of x.
  // In other words, we specify delta, and it gives us x (with a and b held constant).
  // However, delta is specified in an indirect way through yp which
  // is the number of stdDevs that leaves delta probability in the right
  // tail of a standard gaussian distribution.

  // We point out that the variable names correspond to those in the book,
  // and it is worth keeping it that way so that it will always be easy to verify
  // that the formula was typed in correctly.

  static inline double abramowitz_stegun_formula_26p5p22(double a, double b,
      double yp) {
    const double b2m1 = (2.0 * b) - 1.0;
    const double a2m1 = (2.0 * a) - 1.0;
    const double lambda = ((yp * yp) - 3.0) / 6.0;
    const double htmp = (1.0 / a2m1) + (1.0 / b2m1);
    const double h = 2.0 / htmp;
    const double term1 = (yp * std::sqrt(h + lambda)) / h;
    const double term2 = (1.0 / b2m1) - (1.0 / a2m1);
    const double term3 = (lambda + (5.0 / 6.0)) - (2.0 / (3.0 * h));
    const double w = term1 - (term2 * term3);
    const double xp = a / (a + (b * std::exp(2.0 * w)));
    return xp;
  }

  // Formulas for some special cases.

  // Upper bound when no trial succeeded (k == 0): 1 - delta^(1/n).
  static inline double exact_upper_bound_on_p_k_eq_zero(double n, double delta) {
    return 1.0 - std::pow(delta, 1.0 / n);
  }

  // Lower bound when every trial succeeded (k == n): delta^(1/n).
  static inline double exact_lower_bound_on_p_k_eq_n(double n, double delta) {
    return std::pow(delta, 1.0 / n);
  }

  // Lower bound when exactly one trial succeeded (k == 1): 1 - (1 - delta)^(1/n).
  static inline double exact_lower_bound_on_p_k_eq_1(double n, double delta) {
    return 1.0 - std::pow(1.0 - delta, 1.0 / n);
  }

  // Upper bound when all but one trial succeeded (k == n - 1): (1 - delta)^(1/n).
  static inline double exact_upper_bound_on_p_k_eq_n_minus_one(double n, double delta) {
    return std::pow(1.0 - delta, 1.0 / n);
  }

};
288
+
289
+ }
290
+
291
+ #endif // _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
@@ -0,0 +1,41 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CEILING_POWER_OF_2_HPP_
21
+ #define CEILING_POWER_OF_2_HPP_
22
+
23
+ #include <cstdint>
24
+
25
+ namespace datasketches {
26
+
27
// Rounds a 32-bit value up to the nearest power of 2 (values that already are
// a power of 2 are returned unchanged).
// Technique taken from https://graphics.stanford.edu/~seander/bithacks.html
// Because the arithmetic wraps modulo 2^32, an input of 0 and any input
// greater than 2^31 produce 0.
static inline uint32_t ceiling_power_of_2(uint32_t n) {
  uint32_t bits = n - 1;
  // smear the highest set bit into every lower position: shifts of 1, 2, 4, 8, 16
  for (unsigned shift = 1; shift < 32; shift <<= 1) {
    bits |= bits >> shift;
  }
  return bits + 1;
}
38
+
39
+ } /* namespace datasketches */
40
+
41
+ #endif // CEILING_POWER_OF_2_HPP_
@@ -0,0 +1,51 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _COMMON_DEFS_HPP_
21
+ #define _COMMON_DEFS_HPP_
22
+
23
+ #include <cstdint>
24
+ #include <string>
25
+ #include <memory>
26
+
27
+ namespace datasketches {
28
+
29
// default seed value shared across the library
// (NOTE(review): presumably the hash seed - its users are outside this header)
static const uint64_t DEFAULT_SEED{9001};
30
+
31
// allocator type A rebound so that it allocates char
template<typename A>
using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;

// std::basic_string of char whose storage comes from allocator A rebound to char
template<typename A>
using string = std::basic_string<
    char,
    std::char_traits<char>,
    typename std::allocator_traits<A>::template rebind_alloc<char>
>;
33
+
34
// utility function to hide unused compiler warning
// accepts any number of arguments (including none) of any type and ignores them
// usually has no additional cost
// A real parameter pack is used so that every argument binds to a forwarding
// reference; nothing is passed through a C-style "..." ellipsis, which is only
// conditionally supported for class types with non-trivial copy or destruction.
template<typename... T> void unused(T&&...) {}
37
+
38
+ // common helping functions
39
+ // TODO: find a better place for them
40
+
41
// Floor of the base-2 logarithm of n; yields 0 for both n == 0 and n == 1.
// Kept as a single return statement so it is a valid C++11 constexpr function.
constexpr uint8_t log2(uint32_t n) {
  return (n > 1) ? 1 + log2(n >> 1) : 0;
}

// Returns log2(n) + 1, or log2(n) + 2 when n exceeds
// 2^(log2(n) + 1) * load_factor truncated to an integer.
//   n           - element count
//   load_factor - multiplier applied to the power of two above n
// The power of two and the scaled threshold are computed in 64 bits:
// for n >= 2^30 the value 2^(log2(n) + 1) does not fit in a signed int
// (and for n >= 2^31 the shift count would reach the width of int),
// and the scaled threshold may not fit in 32 bits either.
constexpr uint8_t lg_size_from_count(uint32_t n, double load_factor) {
  return log2(n) + ((n > static_cast<uint64_t>((static_cast<uint64_t>(1) << (log2(n) + 1)) * load_factor)) ? 2 : 1);
}
48
+
49
+ } // namespace
50
+
51
+ #endif // _COMMON_DEFS_HPP_
@@ -0,0 +1,68 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CONDITIONAL_BACK_INSERTER_HPP_
21
+ #define CONDITIONAL_BACK_INSERTER_HPP_
22
+
23
+ #include <iterator>
24
+ #include <functional>
25
+
26
+ namespace datasketches {
27
+
28
// Output iterator derived from std::back_insert_iterator that appends an
// assigned value to the container only when the stored predicate returns true
// for it; rejected values are silently dropped.
// Dereference and both increments are no-ops returning the iterator itself,
// so "*it++ = value" reduces to the filtered assignment.
template <typename Container, typename Predicate>
class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
  using inserter_base = std::back_insert_iterator<Container>;
  using item_type = typename Container::value_type;

public:
  // binds the iterator to container c and initializes the predicate from p
  template<typename P>
  conditional_back_insert_iterator(Container& c, P&& p):
    inserter_base(c),
    accept(std::forward<P>(p))
  {}

  // MSVC seems to insist on having copy constructor and assignment
  conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
    inserter_base(other),
    accept(other.accept)
  {}

  conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
    inserter_base::operator=(other);
    accept = other.accept;
    return *this;
  }

  // copies value into the container if the predicate accepts it
  conditional_back_insert_iterator& operator=(const item_type& value) {
    if (accept(value)) {
      inserter_base::operator=(value);
    }
    return *this;
  }

  // moves value into the container if the predicate accepts it
  conditional_back_insert_iterator& operator=(item_type&& value) {
    if (accept(value)) {
      inserter_base::operator=(std::move(value));
    }
    return *this;
  }

  // no-ops, present to satisfy the output iterator interface
  conditional_back_insert_iterator& operator*() { return *this; }
  conditional_back_insert_iterator& operator++() { return *this; }
  conditional_back_insert_iterator& operator++(int) { return *this; }

private:
  Predicate accept; // decides whether an assigned value is appended
};
60
+
61
+ template<typename Container, typename Predicate>
62
+ conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
63
+ return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
64
+ }
65
+
66
+ } /* namespace datasketches */
67
+
68
+ #endif