datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,19 @@
1
+ global-include CMakeLists.txt
2
+ global-include *.cpp
3
+ global-include *.c
4
+ global-include *.hpp
5
+ global-include *.h
6
+ global-include *.bin
7
+
8
+ global-exclude .git*
9
+
10
+ recursive-include python/pybind11 *
11
+
12
+ graft common
13
+ graft cpc
14
+ graft fi
15
+ graft hll
16
+ graft kll
17
+ graft theta
18
+ graft sampling
19
+ graft python
@@ -0,0 +1,11 @@
1
+ Apache DataSketches-cpp
2
+ Copyright 2020 The Apache Software Foundation
3
+
4
+ Copyright 2015-2018 Yahoo
5
+ Copyright 2019 Verizon Media
6
+
7
+ This product includes software developed at
8
+ The Apache Software Foundation (http://www.apache.org/).
9
+
10
+ Prior to moving to ASF, the software for this project was developed at
11
+ Yahoo (now Verizon Media) (https://developer.yahoo.com).
@@ -0,0 +1,42 @@
1
+ # DataSketches Core C++ Library Component
2
+ This is the core C++ component of the DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
3
+
4
+ This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
5
+
6
+ Note that we have a parallel core component for Java implementations of the same sketch algorithms,
7
+ [datasketches-java](https://github.com/apache/datasketches-java).
8
+
9
+ Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
10
+
11
+ If you are interested in making contributions to this project, please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
12
+
13
+ ---
14
+
15
+ This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0, and Apple LLVM version 10.0.1 (clang-1001.0.46.4).
16
+
17
+ This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
18
+
19
+ This library is header-only. The build process provided is only for building unit tests and the python library.
20
+
21
+ Building the unit tests requires cmake 3.12.0 or higher.
22
+
23
+ Installing the latest cmake on OSX: brew install cmake
24
+
25
+ Building and running unit tests using cmake for OSX and Linux:
26
+
27
+ ```
28
+ $ cd build
29
+ $ cmake ..
30
+ $ make
31
+ $ make test
32
+ ```
33
+
34
+ Building and running unit tests using cmake for Windows from the command line:
35
+
36
+ ```
37
+ $ cd build
38
+ $ cmake ..
39
+ $ cd ..
40
+ $ cmake --build build --config Release
41
+ $ cmake --build build --config Release --target RUN_TESTS
42
+ ```
@@ -0,0 +1,45 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(common INTERFACE) # INTERFACE library: this target compiles no sources of its own
19
+
20
+ if (BUILD_TESTS) # the test subdirectory is only added when BUILD_TESTS is set
21
+ add_subdirectory(test)
22
+ endif()
23
+
24
+ target_include_directories(common
25
+ INTERFACE
26
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include> # include path used by consumers of an installed copy
27
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> # include path used when building from this source tree
28
+ )
29
+
30
+ target_compile_features(common INTERFACE cxx_std_11) # anything linking to 'common' is compiled as C++11 or newer
31
+
32
+ target_sources(common
33
+ INTERFACE
34
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/common_defs.hpp
35
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/memory_operations.hpp
36
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/MurmurHash3.h
37
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/serde.hpp
38
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/count_zeros.hpp
39
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/inv_pow2_table.hpp
40
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/binomial_bounds.hpp
41
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
42
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
43
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
44
+ ) # NOTE(review): include/bounds_binomial_proportions.hpp ships in this package but is not listed above - confirm whether that is intentional
45
+
@@ -0,0 +1,173 @@
1
+ // Minimally modified from Austin Appleby's code:
2
+ // * Removed MurmurHash3_x86_32 and MurmurHash3_x86_128
3
+ // * Changed input seed in MurmurHash3_x64_128 to uint64_t
4
+ // * Define and use HashState reference to return result
5
+ // * Made entire hash function defined inline
6
+ //-----------------------------------------------------------------------------
7
+ // MurmurHash3 was written by Austin Appleby, and is placed in the public
8
+ // domain. The author hereby disclaims copyright to this source code.
9
+
10
+ // Note - The x86 and x64 versions do _not_ produce the same results, as the
11
+ // algorithms are optimized for their respective platforms. You can still
12
+ // compile and run any of them on any platform, but your performance with the
13
+ // non-native version will be less than optimal.
14
+
15
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

#include <string.h> // memcpy: alignment-safe block loads in getblock64

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;

#define FORCE_INLINE __forceinline

#include <stdlib.h>

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#include <stdint.h>

#define FORCE_INLINE inline __attribute__((always_inline))

// Rotates the 32-bit value x left by r bits.
// The count is reduced modulo 32 so that r == 0 (or any multiple of 32) returns
// x unchanged instead of evaluating the undefined expression x >> 32.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  const uint32_t shift = static_cast<uint32_t>(r) & 31u;
  return (x << shift) | (x >> ((32u - shift) & 31u));
}

// Rotates the 64-bit value x left by r bits.
// The count is reduced modulo 64 so that r == 0 (or any multiple of 64) returns
// x unchanged instead of evaluating the undefined expression x >> 64.
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  const uint32_t shift = static_cast<uint32_t>(r) & 63u;
  return (x << shift) | (x >> ((64u - shift) & 63u));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Return type - Using C++ reference for return type which should allow better
// compiler optimization than a void* pointer
// h1 and h2 are the two 64-bit halves of the 128-bit hash.
typedef struct {
  uint64_t h1;
  uint64_t h2;
} HashState;


//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here
//
// Returns the i-th 64-bit word starting at p, in native byte order.
// The word is copied out with memcpy rather than dereferenced: the key handed
// to the hash is an arbitrary byte buffer, so p is not guaranteed to be 8-byte
// aligned and a direct p[i] load would be undefined behavior on such input.

FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
{
  uint64_t block;
  memcpy(&block, p + i, sizeof(block));
  return block;
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------
// Computes the 128-bit MurmurHash3 (x64 variant) of a byte buffer.
//
//   key      - start of the input bytes; no alignment requirement
//   lenBytes - number of bytes to hash; expected to be non-negative
//   seed     - 64-bit seed, used as the initial value of both halves
//   out      - receives the result; out.h1 and out.h2 are fully overwritten
//
// Blocks are read in native byte order (see getblock64), so the result is
// only comparable across machines of the same endianness.

FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t seed, HashState& out) {
  static const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  static const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  const uint8_t* data = (const uint8_t*)key;

  out.h1 = seed;
  out.h2 = seed;

  // Number of full 128-bit blocks of 16 bytes.
  // Possible exclusion of a remainder of up to 15 bytes.
  const int nblocks = lenBytes >> 4; // bytes / 16

  // Process the 128-bit blocks (the body) into the hash
  const uint64_t* blocks = (const uint64_t*)(data);
  for (int i = 0; i < nblocks; ++i) { // 16 bytes per block
    uint64_t k1 = getblock64(blocks,i*2+0);
    uint64_t k2 = getblock64(blocks,i*2+1);

    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
    out.h1 = ROTL64(out.h1,27);
    out.h1 += out.h2;
    out.h1 = out.h1*5+0x52dce729;

    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
    out.h2 = ROTL64(out.h2,31);
    out.h2 += out.h1;
    out.h2 = out.h2*5+0x38495ab5;
  }

  // tail: the 0..15 bytes left after the last full block, folded in byte by byte
  const uint8_t * tail = (const uint8_t*)(data + (nblocks << 4));

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  switch(lenBytes & 15)
  {
    case 15: k2 ^= ((uint64_t)tail[14]) << 48; // falls through
    case 14: k2 ^= ((uint64_t)tail[13]) << 40; // falls through
    case 13: k2 ^= ((uint64_t)tail[12]) << 32; // falls through
    case 12: k2 ^= ((uint64_t)tail[11]) << 24; // falls through
    case 11: k2 ^= ((uint64_t)tail[10]) << 16; // falls through
    case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;  // falls through
    case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
             k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; out.h2 ^= k2;
             // falls through
    case  8: k1 ^= ((uint64_t)tail[ 7]) << 56; // falls through
    case  7: k1 ^= ((uint64_t)tail[ 6]) << 48; // falls through
    case  6: k1 ^= ((uint64_t)tail[ 5]) << 40; // falls through
    case  5: k1 ^= ((uint64_t)tail[ 4]) << 32; // falls through
    case  4: k1 ^= ((uint64_t)tail[ 3]) << 24; // falls through
    case  3: k1 ^= ((uint64_t)tail[ 2]) << 16; // falls through
    case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;  // falls through
    case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
             k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
  }

  //----------
  // finalization

  out.h1 ^= lenBytes;
  out.h2 ^= lenBytes;

  out.h1 += out.h2;
  out.h2 += out.h1;

  out.h1 = fmix64(out.h1);
  out.h2 = fmix64(out.h2);

  out.h1 += out.h2;
  out.h2 += out.h1;
}

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
@@ -0,0 +1,458 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef BINOMIAL_BOUNDS_HPP_
21
+ #define BINOMIAL_BOUNDS_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <cmath>
+ #include <stdexcept>
25
+
26
+ /*
27
+ * This class enables the estimation of error bounds given a sample set size, the sampling
28
+ * probability theta and the number of standard deviations (1, 2 or 3). This can
29
+ * be used to estimate error bounds for fixed threshold sampling as well as for the error bound
30
+ * calculations of sketches.
31
+ *
32
+ * author Alexander Saydakov
33
+ * author Lee Rhodes
34
+ * author Kevin Lang
35
+ */
36
+
37
+ namespace datasketches {
38
+
39
+ static constexpr double delta_of_num_std_devs[] = { // tail probability (delta) looked up by num_std_devs; values are approximately the one-sided standard normal tail beyond that many standard deviations
40
+ 0.5000000000000000000, // index 0: placeholder, not actually using this value (callers validate num_std_devs to be 1, 2 or 3)
41
+ 0.1586553191586026479, // index 1: one standard deviation
42
+ 0.0227502618904135701, // index 2: two standard deviations
43
+ 0.0013498126861731796 // index 3: three standard deviations
44
+ };
45
+
46
+ static constexpr double lb_equiv_table[] = {
47
+ 1.0, 2.0, 3.0, // fake values for k = 0
48
+ 0.78733703534118149, 3.14426768537558132, 13.56789685109913535, // k = 1
49
+ 0.94091379266077979, 2.64699271711145911, 6.29302733018320737, // k = 2
50
+ 0.96869128474958188, 2.46531676590527127, 4.97375283467403051, // k = 3
51
+ 0.97933572521046131, 2.37418810664669877, 4.44899975481712318, // k = 4
52
+ 0.98479165917274258, 2.31863116255024693, 4.16712379778553554, // k = 5
53
+ 0.98806033915698777, 2.28075536565225434, 3.99010556144099837, // k = 6
54
+ 0.99021896790580399, 2.25302005857281529, 3.86784477136922078, // k = 7
55
+ 0.99174267079089873, 2.23168103978522936, 3.77784896945266269, // k = 8
56
+ 0.99287147837287648, 2.21465899260871879, 3.70851932988722410, // k = 9
57
+ 0.99373900046805375, 2.20070155496262032, 3.65326029076638292, // k = 10
58
+ 0.99442519013851438, 2.18900651202670815, 3.60803817612955413, // k = 11
59
+ 0.99498066823221620, 2.17903457780744247, 3.57024330407946877, // k = 12
60
+ 0.99543899410224412, 2.17040883161922693, 3.53810982030634591, // k = 13
61
+ 0.99582322541263579, 2.16285726913676513, 3.51039837124298515, // k = 14
62
+ 0.99614973311747690, 2.15617827879603396, 3.48621230377099778, // k = 15
63
+ 0.99643042892560629, 2.15021897666090922, 3.46488605693562590, // k = 16
64
+ 0.99667418783778317, 2.14486114872480016, 3.44591466064832730, // k = 17
65
+ 0.99688774875812669, 2.14001181420209718, 3.42890765690452781, // k = 18
66
+ 0.99707632299691795, 2.13559675336844634, 3.41355809420343803, // k = 19
67
+ 0.99724399084971083, 2.13155592217421486, 3.39962113251016262, // k = 20
68
+ 0.99739400151915447, 2.12784018863251845, 3.38689892877548004, // k = 21
69
+ 0.99752896842633731, 2.12440890875851096, 3.37522975271599535, // k = 22
70
+ 0.99765101725122918, 2.12122815311133195, 3.36448003577621080, // k = 23
71
+ 0.99776189496810730, 2.11826934724291505, 3.35453840911279144, // k = 24
72
+ 0.99786304821586214, 2.11550823850916458, 3.34531123809287578, // k = 25
73
+ 0.99795568665180667, 2.11292409529477254, 3.33671916527694634, // k = 26
74
+ 0.99804083063483517, 2.11049908609763293, 3.32869446834217797, // k = 27
75
+ 0.99811933910984862, 2.10821776918189130, 3.32117898316676019, // k = 28
76
+ 0.99819195457286014, 2.10606671027090897, 3.31412243534683171, // k = 29
77
+ 0.99825930555178388, 2.10403415237001923, 3.30748113008135647, // k = 30
78
+ 0.99832193858154028, 2.10210975877822648, 3.30121691946897045, // k = 31
79
+ 0.99838032666573895, 2.10028440670842542, 3.29529629751144171, // k = 32
80
+ 0.99843488390555990, 2.09855000145353188, 3.28968974413223236, // k = 33
81
+ 0.99848596721417948, 2.09689934193824001, 3.28437111460505093, // k = 34
82
+ 0.99853390005924325, 2.09532599155502908, 3.27931717312372939, // k = 35
83
+ 0.99857895741078551, 2.09382418262592296, 3.27450718840060517, // k = 36
84
+ 0.99862138880970974, 2.09238872751677718, 3.26992261182860489, // k = 37
85
+ 0.99866141580770318, 2.09101494715108061, 3.26554677962434425, // k = 38
86
+ 0.99869923565267982, 2.08969860402822860, 3.26136468165239535, // k = 39
87
+ 0.99873502010169091, 2.08843585627218431, 3.25736275677081721, // k = 40
88
+ 0.99876893292508839, 2.08722321436752623, 3.25352872241415980, // k = 41
89
+ 0.99880111078502409, 2.08605749165553789, 3.24985141664350863, // k = 42
90
+ 0.99883168573342118, 2.08493577529222307, 3.24632068399498053, // k = 43
91
+ 0.99886077231613513, 2.08385540129560809, 3.24292724848112357, // k = 44
92
+ 0.99888847451828155, 2.08281392374021834, 3.23966263299664092, // k = 45
93
+ 0.99891488795844907, 2.08180908991394631, 3.23651906111521726, // k = 46
94
+ 0.99894010085196783, 2.08083882998420222, 3.23348939240611344, // k = 47
95
+ 0.99896419358239541, 2.07990122528650545, 3.23056705515594444, // k = 48
96
+ 0.99898723510594323, 2.07899450946285924, 3.22774598963252402, // k = 49
97
+ 0.99900929266780736, 2.07811704477046533, 3.22502059972006805, // k = 50
98
+ 0.99903043086155208, 2.07726730587160091, 3.22238570890294795, // k = 51
99
+ 0.99905070073845081, 2.07644388314946582, 3.21983651940365689, // k = 52
100
+ 0.99907015770423868, 2.07564546080757850, 3.21736857351049821, // k = 53
101
+ 0.99908884779227947, 2.07487081196367740, 3.21497773796417619, // k = 54
102
+ 0.99910681586905525, 2.07411879634256024, 3.21266015316183484, // k = 55
103
+ 0.99912410177549305, 2.07338834403498140, 3.21041222805715165, // k = 56
104
+ 0.99914074347179849, 2.07267845454973099, 3.20823061166797174, // k = 57
105
+ 0.99915677607464204, 2.07198819052374006, 3.20611216970604573, // k = 58
106
+ 0.99917223149395795, 2.07131667846186929, 3.20405396962596001, // k = 59
107
+ 0.99918714153457699, 2.07066309019154460, 3.20205326110445299, // k = 60
108
+ 0.99920153247185794, 2.07002665203046377, 3.20010746990493544, // k = 61
109
+ 0.99921543193525508, 2.06940663431663552, 3.19821417453343315, // k = 62
110
+ 0.99922886570365677, 2.06880235245998279, 3.19637109973109546, // k = 63
111
+ 0.99924185357357942, 2.06821315729285971, 3.19457610621114441, // k = 64
112
+ 0.99925441845175555, 2.06763843812092318, 3.19282717869864996, // k = 65
113
+ 0.99926658263325407, 2.06707761824370095, 3.19112241228646099, // k = 66
114
+ 0.99927836173816331, 2.06653015295219689, 3.18946001739936946, // k = 67
115
+ 0.99928977431994781, 2.06599552505539918, 3.18783829446098821, // k = 68
116
+ 0.99930083753795884, 2.06547324585920933, 3.18625564538041317, // k = 69
117
+ 0.99931156864562354, 2.06496285191821016, 3.18471055124089730, // k = 70
118
+ 0.99932197985521043, 2.06446390392778767, 3.18320157510865442, // k = 71
119
+ 0.99933208559809827, 2.06397598606787369, 3.18172735837393361, // k = 72
120
+ 0.99934190032416836, 2.06349869971447220, 3.18028661102792398, // k = 73
121
+ 0.99935143390791836, 2.06303166975550312, 3.17887810481605015, // k = 74
122
+ 0.99936070171270330, 2.06257453607466346, 3.17750067581857820, // k = 75
123
+ 0.99936971103502970, 2.06212696042919674, 3.17615321728274580, // k = 76
124
+ 0.99937847392385493, 2.06168861430600714, 3.17483467831510779, // k = 77
125
+ 0.99938700168914352, 2.06125918927764928, 3.17354405480557489, // k = 78
126
+ 0.99939530099953799, 2.06083838987589729, 3.17228039269048168, // k = 79
127
+ 0.99940338278830154, 2.06042593411496000, 3.17104278166036124, // k = 80
128
+ 0.99941125463777780, 2.06002155276328835, 3.16983035274597569, // k = 81
129
+ 0.99941892470027938, 2.05962498741951094, 3.16864227952240185, // k = 82
130
+ 0.99942640059737187, 2.05923599161263837, 3.16747776846497686, // k = 83
131
+ 0.99943368842187397, 2.05885433061945378, 3.16633606416374391, // k = 84
132
+ 0.99944079790603269, 2.05847977868873500, 3.16521644518826406, // k = 85
133
+ 0.99944773295734990, 2.05811212058944193, 3.16411821883858124, // k = 86
134
+ 0.99945450059186669, 2.05775114781260982, 3.16304072400711789, // k = 87
135
+ 0.99946110646314423, 2.05739666442039493, 3.16198332650733960, // k = 88
136
+ 0.99946755770463369, 2.05704847678819647, 3.16094541781455973, // k = 89
137
+ 0.99947385746861528, 2.05670640500335367, 3.15992641851471490, // k = 90
138
+ 0.99948001256305474, 2.05637027420314666, 3.15892576988736096, // k = 91
139
+ 0.99948602689656241, 2.05603991286400856, 3.15794293484717059, // k = 92
140
+ 0.99949190674294641, 2.05571516158917689, 3.15697740043813724, // k = 93
141
+ 0.99949765436329585, 2.05539586490317561, 3.15602867309343083, // k = 94
142
+ 0.99950327557880314, 2.05508187237845164, 3.15509627710042651, // k = 95
143
+ 0.99950877461972709, 2.05477304104951486, 3.15417975753007340, // k = 96
144
+ 0.99951415481862682, 2.05446923022574879, 3.15327867462917766, // k = 97
145
+ 0.99951942042375208, 2.05417030908833453, 3.15239260700215596, // k = 98
146
+ 0.99952457390890004, 2.05387614661762541, 3.15152114915238712, // k = 99
147
+ 0.99952962005008317, 2.05358662050909402, 3.15066390921020911, // k = 100
148
+ 0.99953456216121594, 2.05330161104427589, 3.14982051097524618, // k = 101
149
+ 0.99953940176368405, 2.05302100378725072, 3.14899059183684926, // k = 102
150
+ 0.99954414373920031, 2.05274468493067275, 3.14817379948561893, // k = 103
151
+ 0.99954879047621148, 2.05247255013657082, 3.14736979964868624, // k = 104
152
+ 0.99955334485656522, 2.05220449388099269, 3.14657826610371671, // k = 105
153
+ 0.99955780993869325, 2.05194041831310869, 3.14579888316276879, // k = 106
154
+ 0.99956218652590678, 2.05168022402710903, 3.14503134811607765, // k = 107
155
+ 0.99956647932785359, 2.05142381889103831, 3.14427536967733090, // k = 108
156
+ 0.99957069025060719, 2.05117111251445294, 3.14353066260227365, // k = 109
157
+ 0.99957482032178291, 2.05092201793428330, 3.14279695558593630, // k = 110
158
+ 0.99957887261450651, 2.05067645094720774, 3.14207398336887422, // k = 111
159
+ 0.99958284988383639, 2.05043432833224415, 3.14136149076028914, // k = 112
160
+ 0.99958675435604505, 2.05019557189746138, 3.14065923143530767, // k = 113
161
+ 0.99959058650074439, 2.04996010556124020, 3.13996696426707445, // k = 114
162
+ 0.99959434898201494, 2.04972785368377686, 3.13928445867830419, // k = 115
163
+ 0.99959804437042976, 2.04949874512311681, 3.13861149103462367, // k = 116
164
+ 0.99960167394553423, 2.04927271043337100, 3.13794784369528656, // k = 117
165
+ 0.99960523957651048, 2.04904968140490951, 3.13729330661277572, // k = 118
166
+ 0.99960874253329735, 2.04882959397491504, 3.13664767767019725, // k = 119
167
+ 0.99961218434327748, 2.04861238220240693, 3.13601075688413289 // k = 120
168
+ };
169
+
170
+ static constexpr double ub_equiv_table[] = {
171
+ 1.0, 2.0, 3.0, // fake values for k = 0
172
+ 0.99067760836669549, 1.75460517119302040, 2.48055626001627161, // k = 1
173
+ 0.99270518097577565, 1.78855957509907171, 2.53863835259832626, // k = 2
174
+ 0.99402032633599902, 1.81047286499563143, 2.57811676180597260, // k = 3
175
+ 0.99492607629539975, 1.82625928017762362, 2.60759550546498531, // k = 4
176
+ 0.99558653966013821, 1.83839160339161367, 2.63086812358551470, // k = 5
177
+ 0.99608981951632813, 1.84812399034444752, 2.64993712523727254, // k = 6
178
+ 0.99648648035983456, 1.85617372053235385, 2.66598485907860550, // k = 7
179
+ 0.99680750790483330, 1.86298655802610824, 2.67976541374471822, // k = 8
180
+ 0.99707292880049181, 1.86885682585270274, 2.69178781407745760, // k = 9
181
+ 0.99729614928489241, 1.87398826101983218, 2.70241106542158604, // k = 10
182
+ 0.99748667952445658, 1.87852708449801753, 2.71189717290596377, // k = 11
183
+ 0.99765127712748836, 1.88258159501103250, 2.72044290303773550, // k = 12
184
+ 0.99779498340305395, 1.88623391878036273, 2.72819957382063194, // k = 13
185
+ 0.99792160418357412, 1.88954778748873764, 2.73528576807902368, // k = 14
186
+ 0.99803398604944960, 1.89257337682371940, 2.74179612106766513, // k = 15
187
+ 0.99813449883217231, 1.89535099316557876, 2.74780718300419835, // k = 16
188
+ 0.99822494122659577, 1.89791339232732525, 2.75338173141955167, // k = 17
189
+ 0.99830679915913834, 1.90028752122407241, 2.75857186416826039, // k = 18
190
+ 0.99838117410831728, 1.90249575897183831, 2.76342117562634826, // k = 19
191
+ 0.99844913407071090, 1.90455689090418900, 2.76796659454200267, // k = 20
192
+ 0.99851147736424650, 1.90648682834171268, 2.77223944710058845, // k = 21
193
+ 0.99856879856019987, 1.90829917277082473, 2.77626682032629901, // k = 22
194
+ 0.99862183849734265, 1.91000561415842185, 2.78007199816156003, // k = 23
195
+ 0.99867096266018507, 1.91161621560812023, 2.78367524259661536, // k = 24
196
+ 0.99871656986212543, 1.91313978579765376, 2.78709435016625662, // k = 25
197
+ 0.99875907577771272, 1.91458400425526065, 2.79034488416175463, // k = 26
198
+ 0.99879885565047744, 1.91595563175945927, 2.79344064132371273, // k = 27
199
+ 0.99883610756373287, 1.91726064301425936, 2.79639384757751941, // k = 28
200
+ 0.99887095169674467, 1.91850441099725799, 2.79921543574803877, // k = 29
201
+ 0.99890379414739527, 1.91969155477030995, 2.80191513182441554, // k = 30
202
+ 0.99893466279047516, 1.92082633358913313, 2.80450167352080371, // k = 31
203
+ 0.99896392088177777, 1.92191254955568525, 2.80698295731653502, // k = 32
204
+ 0.99899147889385631, 1.92295362479495680, 2.80936614404217266, // k = 33
205
+ 0.99901764688726757, 1.92395267400968351, 2.81165765979318394, // k = 34
206
+ 0.99904238606342233, 1.92491244978191389, 2.81386337393604435, // k = 35
207
+ 0.99906590152386343, 1.92583552644848055, 2.81598868034527072, // k = 36
208
+ 0.99908829040739988, 1.92672418013918900, 2.81803841726804194, // k = 37
209
+ 0.99910959420023460, 1.92758051694144683, 2.82001709302821268, // k = 38
210
+ 0.99912996403594434, 1.92840654943159961, 2.82192875763732332, // k = 39
211
+ 0.99914930224576892, 1.92920397044028391, 2.82377730628954282, // k = 40
212
+ 0.99916781270195543, 1.92997447498220254, 2.82556612075063640, // k = 41
213
+ 0.99918553179077207, 1.93071949211818605, 2.82729843191989971, // k = 42
214
+ 0.99920250730914972, 1.93144048613876862, 2.82897728689417249, // k = 43
215
+ 0.99921873345181211, 1.93213870990595638, 2.83060537017752267, // k = 44
216
+ 0.99923435180002684, 1.93281536508689555, 2.83218527795750674, // k = 45
217
+ 0.99924930425362390, 1.93347145882316340, 2.83371938965598247, // k = 46
218
+ 0.99926370394567243, 1.93410820221384938, 2.83520990872793277, // k = 47
219
+ 0.99927750755296074, 1.93472643138986200, 2.83665891945119597, // k = 48
220
+ 0.99929082941537217, 1.93532697329771963, 2.83806833931606661, // k = 49
221
+ 0.99930366295501472, 1.93591074716263734, 2.83943997143404658, // k = 50
222
+ 0.99931598804721489, 1.93647857274021362, 2.84077557836653227, // k = 51
223
+ 0.99932789059798210, 1.93703110239354714, 2.84207662106302905, // k = 52
224
+ 0.99933946180485123, 1.93756904936378760, 2.84334468086129277, // k = 53
225
+ 0.99935053819703512, 1.93809302131219852, 2.84458116874117195, // k = 54
226
+ 0.99936126637970801, 1.93860365411038060, 2.84578731838604426, // k = 55
227
+ 0.99937166229284458, 1.93910149816429112, 2.84696443486512862, // k = 56
228
+ 0.99938169190727422, 1.93958709548454067, 2.84811369085281285, // k = 57
229
+ 0.99939136927613959, 1.94006085573701625, 2.84923617230361970, // k = 58
230
+ 0.99940074328745254, 1.94052339623206649, 2.85033291216254270, // k = 59
231
+ 0.99940993070470086, 1.94097508636855309, 2.85140492437699322, // k = 60
232
+ 0.99941868577388959, 1.94141633372043998, 2.85245314430358121, // k = 61
233
+ 0.99942734443487780, 1.94184757038001976, 2.85347839582286156, // k = 62
234
+ 0.99943556385736088, 1.94226915100517772, 2.85448160365493209, // k = 63
235
+ 0.99944374522542034, 1.94268143723749631, 2.85546346373061510, // k = 64
236
+ 0.99945159955424856, 1.94308482059116727, 2.85642486111805738, // k = 65
237
+ 0.99945915301904620, 1.94347956957849988, 2.85736639994965458, // k = 66
238
+ 0.99946660663832176, 1.94386600964031686, 2.85828887832701639, // k = 67
239
+ 0.99947383703224091, 1.94424436597356021, 2.85919278275500233, // k = 68
240
+ 0.99948075442870277, 1.94461502153473020, 2.86007887186090670, // k = 69
241
+ 0.99948766082269458, 1.94497821937304138, 2.86094774077355396, // k = 70
242
+ 0.99949422748713346, 1.94533411296001191, 2.86179981848076181, // k = 71
243
+ 0.99950070756119658, 1.94568300035135167, 2.86263579405672886, // k = 72
244
+ 0.99950704321753392, 1.94602523449961495, 2.86345610449197352, // k = 73
245
+ 0.99951320334216121, 1.94636083782822311, 2.86426125541271404, // k = 74
246
+ 0.99951920293474927, 1.94669011080745236, 2.86505169255406145, // k = 75
247
+ 0.99952501670378524, 1.94701327348536779, 2.86582788270862920, // k = 76
248
+ 0.99953071209267819, 1.94733044372333097, 2.86659027602854621, // k = 77
249
+ 0.99953632734991515, 1.94764180764266825, 2.86733927778843167, // k = 78
250
+ 0.99954171164873173, 1.94794766430732125, 2.86807526143834934, // k = 79
251
+ 0.99954699274462655, 1.94824807472994621, 2.86879864789403882, // k = 80
252
+ 0.99955216611081710, 1.94854317889829076, 2.86950970901679625, // k = 81
253
+ 0.99955730019613043, 1.94883320227168610, 2.87020887436986527, // k = 82
254
+ 0.99956213770650493, 1.94911826561721568, 2.87089648477021342, // k = 83
255
+ 0.99956704264963037, 1.94939848545763539, 2.87157281693902178, // k = 84
256
+ 0.99957166306481327, 1.94967401618316671, 2.87223821840905202, // k = 85
257
+ 0.99957632713136491, 1.94994497791333288, 2.87289293193450135, // k = 86
258
+ 0.99958087233392234, 1.95021155752212394, 2.87353731228213860, // k = 87
259
+ 0.99958532555996271, 1.95047376805584349, 2.87417154907075201, // k = 88
260
+ 0.99958956246481989, 1.95073180380688882, 2.87479599765507032, // k = 89
261
+ 0.99959389351869277, 1.95098572880579013, 2.87541081987382086, // k = 90
262
+ 0.99959807862052230, 1.95123574036898617, 2.87601637401948551, // k = 91
263
+ 0.99960214057801977, 1.95148186921983324, 2.87661283691068093, // k = 92
264
+ 0.99960607527256684, 1.95172415829728152, 2.87720042968334155, // k = 93
265
+ 0.99960996433179616, 1.95196280898670693, 2.87777936649376898, // k = 94
266
+ 0.99961379137860717, 1.95219787713926962, 2.87834989933620022, // k = 95
267
+ 0.99961756088146103, 1.95242944583677058, 2.87891216133900230, // k = 96
268
+ 0.99962125605327401, 1.95265762420910960, 2.87946647367488140, // k = 97
269
+ 0.99962486179100551, 1.95288245314810638, 2.88001290210658567, // k = 98
270
+ 0.99962843240297161, 1.95310404286672679, 2.88055166523392359, // k = 99
271
+ 0.99963187276145504, 1.95332251980147475, 2.88108300006589957, // k = 100
272
+ 0.99963525453173929, 1.95353785898848287, 2.88160703591438505, // k = 101
273
+ 0.99963855412988778, 1.95375019354571577, 2.88212393551896184, // k = 102
274
+ 0.99964190254169694, 1.95395953472205974, 2.88263389761985422, // k = 103
275
+ 0.99964506565942202, 1.95416607430155409, 2.88313700661564098, // k = 104
276
+ 0.99964834424233118, 1.95436972855640079, 2.88363350163803034, // k = 105
277
+ 0.99965136548857458, 1.95457068540693513, 2.88412349413960101, // k = 106
278
+ 0.99965436594726498, 1.95476896383092935, 2.88460710620208260, // k = 107
279
+ 0.99965736463468602, 1.95496457504532373, 2.88508450078833789, // k = 108
280
+ 0.99966034130443404, 1.95515761150707590, 2.88555580586194083, // k = 109
281
+ 0.99966326130828520, 1.95534810382198998, 2.88602118761679094, // k = 110
282
+ 0.99966601446035952, 1.95553622237747504, 2.88648066384146773, // k = 111
283
+ 0.99966887679593697, 1.95572186728168163, 2.88693444915907094, // k = 112
284
+ 0.99967161286551232, 1.95590523410490391, 2.88738271495714116, // k = 113
285
+ 0.99967435412270333, 1.95608626483223702, 2.88782540459769166, // k = 114
286
+ 0.99967701261934394, 1.95626497627117146, 2.88826277189363623, // k = 115
287
+ 0.99967963265157778, 1.95644153684824573, 2.88869486674335008, // k = 116
288
+ 0.99968216317182623, 1.95661589936000269, 2.88912184353694101, // k = 117
289
+ 0.99968479674396349, 1.95678821614791332, 2.88954376359643561, // k = 118
290
+ 0.99968729031337489, 1.95695842061650183, 2.88996069422501023, // k = 119
291
+ 0.99968963358631413, 1.95712651709766305, 2.89037285320668502 // k = 120
292
+ };
293
+
294
+ class binomial_bounds {
295
+
296
+ public:
297
+ static double get_lower_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
298
+ check_theta(theta);
299
+ check_num_std_devs(num_std_devs);
300
+ const double estimate = num_samples / theta;
301
+ const double lb = compute_approx_binomial_lower_bound(num_samples, theta, num_std_devs);
302
+ return std::min(estimate, std::max(static_cast<double>(num_samples), lb));
303
+ }
304
+
305
+ static double get_upper_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
306
+ check_theta(theta);
307
+ check_num_std_devs(num_std_devs);
308
+ const double estimate = num_samples / theta;
309
+ const double ub = compute_approx_binomial_upper_bound(num_samples, theta, num_std_devs);
310
+ return std::max(estimate, ub);
311
+ }
312
+
313
+ private:
314
+ // our "classic" bounds, but now with continuity correction
315
+ static double cont_classic_lb(unsigned long long num_samples, double theta, double num_std_devs) {
316
+ const double n_hat = (num_samples - 0.5) / theta;
317
+ const double b = num_std_devs * std::sqrt((1.0 - theta) / theta);
318
+ const double d = 0.5 * b * std::sqrt((b * b) + (4.0 * n_hat));
319
+ const double center = n_hat + (0.5 * (b * b));
320
+ return (center - d);
321
+ }
322
+
323
+ // our "classic" bounds, but now with continuity correction
324
+ static double cont_classic_ub(unsigned long long num_samples, double theta, double num_std_devs) {
325
+ const double n_hat = (num_samples + 0.5) / theta;
326
+ const double b = num_std_devs * std::sqrt((1.0 - theta) / theta);
327
+ const double d = 0.5 * b * std::sqrt((b * b) + (4.0 * n_hat));
328
+ const double center = n_hat + (0.5 * (b * b));
329
+ return (center + d);
330
+ }
331
+
332
+ // This is a special purpose calculator for NStar, using a computational
333
+ // strategy inspired by its Bayesian definition. It is only appropriate
334
+ // for a very limited set of inputs. However, the procedure compute_approx_binomial_lower_bound()
335
+ // below does in fact only call it for suitably limited inputs.
336
+ // Outside of this limited range, two different bad things will happen.
337
+ // First, because we are not using logarithms, the values of intermediate
338
+ // quantities will exceed the dynamic range of doubles. Second, even if that
339
+ // problem were fixed, the running time of this procedure is essentially linear
340
+ // in est = (numSamples / p), and that can be Very, Very Big.
341
+ static unsigned long long special_n_star(unsigned long long num_samples, double p, double delta) {
342
+ const double q = 1.0 - p;
343
+ // Use a different algorithm if the following is true; this one will be too slow, or worse.
344
+ if ((num_samples / p) >= 500.0) throw std::invalid_argument("out of range");
345
+ double cur_term = std::pow(p, num_samples); // curTerm = posteriorProbability (k, k, p)
346
+ if (cur_term <= 1e-100) throw std::logic_error("out of range"); // sanity check for non-use of logarithms
347
+ double tot = cur_term;
348
+ unsigned long long m = num_samples;
349
+ while (tot <= delta) { // this test can fail even the first time
350
+ cur_term = (cur_term * q * (m)) / ((m + 1) - num_samples);
351
+ tot += cur_term;
352
+ m += 1;
353
+ }
354
+ // we have reached a state where tot > delta, so back up one
355
+ return (m - 1);
356
+ }
357
+
358
+ // The following procedure has very limited applicability.
359
+ // The above remarks about special_n_star() also apply here.
360
+ static unsigned long long special_n_prime_b(unsigned long long num_samples, double p, double delta) {
361
+ const double q = 1.0 - p;
362
+ const double one_minus_delta = 1.0 - delta;
363
+ double cur_term = std::pow(p, num_samples); // curTerm = posteriorProbability (k, k, p)
364
+ if (cur_term <= 1e-100) throw std::logic_error("out of range"); // sanity check for non-use of logarithms
365
+ double tot = cur_term;
366
+ unsigned long long m = num_samples;
367
+ while (tot < one_minus_delta) {
368
+ cur_term = (cur_term * q * (m)) / ((m + 1) - num_samples);
369
+ tot += cur_term;
370
+ m += 1;
371
+ }
372
+ return m; // no need to back up
373
+ }
374
+
375
+ static unsigned long long special_n_prime_f(unsigned long long num_samples, double p, double delta) {
376
+ // Use a different algorithm if the following is true; this one will be too slow, or worse.
377
+ if ((num_samples / p) >= 500.0) throw std::invalid_argument("out of range"); //A super-small delta could also make it slow.
378
+ return special_n_prime_b(num_samples + 1, p, delta);
379
+ }
380
+
381
+ // The following computes an approximation to the lower bound of a Frequentist
382
+ // confidence interval based on the tails of the Binomial distribution.
383
+ static double compute_approx_binomial_lower_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
384
+ if (theta == 1) return num_samples;
385
+ if (num_samples == 0) return 0;
386
+ if (num_samples == 1) {
387
+ const double delta = delta_of_num_std_devs[num_std_devs];
388
+ const double raw_lb = std::log(1 - delta) / std::log(1 - theta);
389
+ return std::floor(raw_lb); // round down
390
+ }
391
+ if (num_samples > 120) {
392
+ // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
393
+ const double raw_lb = cont_classic_lb(num_samples, theta, num_std_devs);
394
+ return (raw_lb - 0.5); // fake round down
395
+ }
396
+ // at this point we know 2 <= num_samples <= 120
397
+ if (theta > (1 - 1e-5)) { // empirically-determined threshold
398
+ return num_samples;
399
+ }
400
+ if (theta < (num_samples / 360.0)) { // empirically-determined threshold
401
+ // here we use the Gaussian approximation, but with a modified num_std_devs
402
+ const unsigned index = 3 * num_samples + (num_std_devs - 1);
403
+ const double raw_lb = cont_classic_lb(num_samples, theta, lb_equiv_table[index]);
404
+ return raw_lb - 0.5; // fake round down
405
+ }
406
+ // This is the most difficult range to approximate; we will compute an "exact" LB.
407
+ // We know that est <= 360, so specialNStar() shouldn't be ridiculously slow.
408
+ const double delta = delta_of_num_std_devs[num_std_devs];
409
+ return special_n_star(num_samples, theta, delta); // no need to round
410
+ }
411
+
412
+ // The following computes an approximation to the upper bound of a Frequentist
413
+ // confidence interval based on the tails of the Binomial distribution.
414
+ static double compute_approx_binomial_upper_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
415
+ if (theta == 1) return num_samples;
416
+ if (num_samples == 0) {
417
+ const double delta = delta_of_num_std_devs[num_std_devs];
418
+ const double raw_ub = std::log(delta) / std::log(1 - theta);
419
+ return std::ceil(raw_ub); // round up
420
+ }
421
+ if (num_samples > 120) {
422
+ // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
423
+ const double raw_ub = cont_classic_ub(num_samples, theta, num_std_devs);
424
+ return (raw_ub + 0.5); // fake round up
425
+ }
426
+ // at this point we know 2 <= num_samples <= 120
427
+ if (theta > (1 - 1e-5)) { // empirically-determined threshold
428
+ return num_samples + 1;
429
+ }
430
+ if (theta < (num_samples / 360.0)) { // empirically-determined threshold
431
+ // here we use the Gaussian approximation, but with a modified num_std_devs
432
+ const unsigned index = 3 * num_samples + (num_std_devs - 1);
433
+ const double raw_ub = cont_classic_ub(num_samples, theta, ub_equiv_table[index]);
434
+ return raw_ub + 0.5; // fake round up
435
+ }
436
+ // This is the most difficult range to approximate; we will compute an "exact" UB.
437
+ // We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow.
438
+ const double delta = delta_of_num_std_devs[num_std_devs];
439
+ return special_n_prime_f(num_samples, theta, delta); // no need to round
440
+ }
441
+
442
+ static void check_theta(double theta) {
443
+ if (theta < 0 || theta > 1) {
444
+ throw std::invalid_argument("theta must be in [0, 1]");
445
+ }
446
+ }
447
+
448
+ static void check_num_std_devs(unsigned num_std_devs) {
449
+ if (num_std_devs < 1 || num_std_devs > 3) {
450
+ throw std::invalid_argument("num_std_devs must be 1, 2 or 3");
451
+ }
452
+ }
453
+
454
+ };
455
+
456
+ } /* namespace datasketches */
457
+
458
+ # endif