datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,167 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ // author Kevin Lang, Oath Research
21
+
22
+ #ifndef CPC_CONFIDENCE_HPP_
23
+ #define CPC_CONFIDENCE_HPP_
24
+
25
+ #include <cmath>
26
+
27
+ #include "cpc_sketch.hpp"
28
+
29
+ namespace datasketches {
30
+
31
// Fallback ICON error factor, equal to ln(2.0).
// The ICON confidence functions in this header use it when lg_k is above the
// tabulated range (lg_k > 14).
// The identifier is spelled "ERROT" (sic); the spelling is kept because the
// functions in this header refer to the constant by this exact name.
static const double ICON_ERROT_CONSTANT = 0.693147180559945286;

// ICON low-side error factors, empirically measured at N = 1000 * K.
// Layout: one row per lgK (4 through 14), one column per kappa (1, 2, 3),
// so the entry for a given pair lives at index 3 * (lgK - 4) + (kappa - 1).
// Entries are scaled by 10000; readers divide by 10000.0.
// The trailing comment on each row gives: lgK, number of trials.
static const int16_t ICON_LOW_SIDE_DATA [33] = {
  // kappa:
  //  1,    2,    3,
  6037, 5720, 5328,  //  4 1000000
  6411, 6262, 5682,  //  5 1000000
  6724, 6403, 6127,  //  6 1000000
  6665, 6411, 6208,  //  7 1000000
  6959, 6525, 6427,  //  8 1000000
  6892, 6665, 6619,  //  9 1000000
  6792, 6752, 6690,  // 10 1000000
  6899, 6818, 6708,  // 11 1000000
  6871, 6845, 6812,  // 12 1046369
  6909, 6861, 6828,  // 13 1043411
  6919, 6897, 6842   // 14 1000297
};

// ICON high-side error factors, empirically measured at N = 1000 * K.
// Same layout and scaling as ICON_LOW_SIDE_DATA:
// index 3 * (lgK - 4) + (kappa - 1), entries scaled by 10000.
// The trailing comment on each row gives: lgK, number of trials.
static const int16_t ICON_HIGH_SIDE_DATA [33] = {
  // kappa:
  //  1,    2,    3,
  8031, 8559, 9309,  //  4 1000000
  7084, 7959, 8660,  //  5 1000000
  7141, 7514, 7876,  //  6 1000000
  7458, 7430, 7572,  //  7 1000000
  6892, 7141, 7497,  //  8 1000000
  6889, 7132, 7290,  //  9 1000000
  7075, 7118, 7185,  // 10 1000000
  7040, 7047, 7085,  // 11 1000000
  6993, 7019, 7053,  // 12 1046369
  6953, 7001, 6983,  // 13 1043411
  6944, 6966, 7004   // 14 1000297
};
63
+
64
// Fallback HIP error factor, equal to sqrt(ln(2.0) / 2.0).
// The HIP confidence functions in this header use it when lg_k is above the
// tabulated range (lg_k > 14).
static const double HIP_ERROR_CONSTANT = 0.588705011257737332;

// HIP low-side error factors, empirically measured at N = 1000 * K.
// Layout: one row per lgK (4 through 14), one column per kappa (1, 2, 3),
// so the entry for a given pair lives at index 3 * (lgK - 4) + (kappa - 1).
// Entries are scaled by 10000; readers divide by 10000.0.
// The trailing comment on each row gives: lgK, number of trials.
static const int16_t HIP_LOW_SIDE_DATA [33] = {
  // kappa:
  //  1,    2,    3,
  5871, 5247, 4826,  //  4 1000000
  5877, 5403, 5070,  //  5 1000000
  5873, 5533, 5304,  //  6 1000000
  5878, 5632, 5464,  //  7 1000000
  5874, 5690, 5564,  //  8 1000000
  5880, 5745, 5619,  //  9 1000000
  5875, 5784, 5701,  // 10 1000000
  5866, 5789, 5742,  // 11 1000000
  5869, 5827, 5784,  // 12 1046369
  5876, 5860, 5827,  // 13 1043411
  5881, 5853, 5842   // 14 1000297
};

// HIP high-side error factors, empirically measured at N = 1000 * K.
// Same layout and scaling as HIP_LOW_SIDE_DATA:
// index 3 * (lgK - 4) + (kappa - 1), entries scaled by 10000.
// The trailing comment on each row gives: lgK, number of trials.
static const int16_t HIP_HIGH_SIDE_DATA [33] = {
  // kappa:
  //  1,    2,    3,
  5855, 6688, 7391,  //  4 1000000
  5886, 6444, 6923,  //  5 1000000
  5885, 6254, 6594,  //  6 1000000
  5889, 6134, 6326,  //  7 1000000
  5900, 6072, 6203,  //  8 1000000
  5875, 6005, 6089,  //  9 1000000
  5871, 5980, 6040,  // 10 1000000
  5889, 5941, 6015,  // 11 1000000
  5871, 5926, 5973,  // 12 1046369
  5866, 5901, 5915,  // 13 1043411
  5880, 5914, 5953   // 14 1000297
};
96
+
97
+ template<typename A>
98
+ double get_icon_confidence_lb(const cpc_sketch_alloc<A>& sketch, int kappa) {
99
+ if (sketch.get_num_coupons() == 0) return 0.0;
100
+ const int lg_k = sketch.get_lg_k();
101
+ const long k = 1 << lg_k;
102
+ if (lg_k < 4) throw std::logic_error("lgk < 4");
103
+ if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
104
+ double x = ICON_ERROT_CONSTANT;
105
+ if (lg_k <= 14) x = ((double) ICON_HIGH_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
106
+ const double rel = x / sqrt(k);
107
+ const double eps = kappa * rel;
108
+ const double est = sketch.get_icon_estimate();
109
+ double result = est / (1.0 + eps);
110
+ const double check = sketch.get_num_coupons();
111
+ if (result < check) result = check;
112
+ return result;
113
+ }
114
+
115
+ template<typename A>
116
+ double get_icon_confidence_ub(const cpc_sketch_alloc<A>& sketch, int kappa) {
117
+ if (sketch.get_num_coupons() == 0) return 0.0;
118
+ const int lg_k = sketch.get_lg_k();
119
+ const long k = 1 << lg_k;
120
+ if (lg_k < 4) throw std::logic_error("lgk < 4");
121
+ if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
122
+ double x = ICON_ERROT_CONSTANT;
123
+ if (lg_k <= 14) x = ((double) ICON_LOW_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
124
+ const double rel = x / sqrt(k);
125
+ const double eps = kappa * rel;
126
+ const double est = sketch.get_icon_estimate();
127
+ const double result = est / (1.0 - eps);
128
+ return ceil(result); // widening for coverage
129
+ }
130
+
131
+ template<typename A>
132
+ double get_hip_confidence_lb(const cpc_sketch_alloc<A>& sketch, int kappa) {
133
+ if (sketch.get_num_coupons() == 0) return 0.0;
134
+ const int lg_k = sketch.get_lg_k();
135
+ const long k = 1 << lg_k;
136
+ if (lg_k < 4) throw std::logic_error("lgk < 4");
137
+ if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
138
+ double x = HIP_ERROR_CONSTANT;
139
+ if (lg_k <= 14) x = ((double) HIP_HIGH_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
140
+ const double rel = x / (sqrt((double) k));
141
+ const double eps = ((double) kappa) * rel;
142
+ const double est = sketch.get_hip_estimate();
143
+ double result = est / (1.0 + eps);
144
+ const double check = (double) sketch.get_num_coupons();
145
+ if (result < check) result = check;
146
+ return result;
147
+ }
148
+
149
+ template<typename A>
150
+ double get_hip_confidence_ub(const cpc_sketch_alloc<A>& sketch, int kappa) {
151
+ if (sketch.get_num_coupons() == 0) return 0.0;
152
+ const int lg_k = sketch.get_lg_k();
153
+ const long k = 1 << lg_k;
154
+ if (lg_k < 4) throw std::logic_error("lgk < 4");
155
+ if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
156
+ double x = HIP_ERROR_CONSTANT;
157
+ if (lg_k <= 14) x = ((double) HIP_LOW_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
158
+ const double rel = x / sqrt(k);
159
+ const double eps = kappa * rel;
160
+ const double est = sketch.get_hip_estimate();
161
+ const double result = est / (1.0 - eps);
162
+ return ceil(result); // widening for coverage
163
+ }
164
+
165
+ } /* namespace datasketches */
166
+
167
+ #endif
@@ -0,0 +1,311 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CPC_SKETCH_HPP_
21
+ #define CPC_SKETCH_HPP_
22
+
23
+ #include <iostream>
24
+ #include <functional>
25
+ #include <string>
26
+ #include <vector>
27
+
28
+ #include "u32_table.hpp"
29
+ #include "cpc_common.hpp"
30
+ #include "cpc_compressor.hpp"
31
+ #include "cpc_confidence.hpp"
32
+ #include "common_defs.hpp"
33
+
34
+ namespace datasketches {
35
+
36
+ /*
37
+ * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
38
+ *
39
+ * This is a very compact (in serialized form) distinct counting sketch.
40
+ * The theory is described in the following paper:
41
+ * https://arxiv.org/abs/1708.06839
42
+ *
43
+ * author Kevin Lang
44
+ * author Alexander Saydakov
45
+ */
46
+
47
// Forward declarations of the allocator-parameterized sketch and union templates.
template<typename A> class cpc_sketch_alloc;
template<typename A> class cpc_union_alloc;

// Convenience alias: the sketch instantiated with the default allocator.
using cpc_sketch = cpc_sketch_alloc<std::allocator<void>>;

// Allocates and initializes the global decompression (decoding) tables.
// Call this before anything else to control when the initialization cost is paid,
// for instance to keep it outside of a transaction context.
// If it is never called explicitly, initialization happens on the first use
// (serialization or deserialization).
// Calling it more than once is safe as long as the calls do not race.
// This is not thread safe, and neither is the rest of the library.
template<typename A> void cpc_init();
61
+
62
+ template<typename A>
63
+ class cpc_sketch_alloc {
64
+ public:
65
+ /**
66
+ * Creates an instance of the sketch given the lg_k parameter and hash seed.
67
+ * @param lg_k base 2 logarithm of the number of bins in the sketch
68
+ * @param seed for hash function
69
+ */
70
+ explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
71
+
72
+ /**
73
+ * @return configured lg_k of this sketch
74
+ */
75
+ uint8_t get_lg_k() const;
76
+
77
+ /**
78
+ * @return true if this sketch represents an empty set
79
+ */
80
+ bool is_empty() const;
81
+
82
+ /**
83
+ * @return estimate of the distinct count of the input stream
84
+ */
85
+ double get_estimate() const;
86
+
87
+ /**
88
+ * Returns the approximate lower error bound given a parameter kappa (1, 2 or 3).
89
+ * This parameter is similar to the number of standard deviations of the normal distribution
90
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
91
+ * @param kappa parameter to specify confidence interval (1, 2 or 3)
92
+ * @return the lower bound
93
+ */
94
+ double get_lower_bound(unsigned kappa) const;
95
+
96
+ /**
97
+ * Returns the approximate upper error bound given a parameter kappa (1, 2 or 3).
98
+ * This parameter is similar to the number of standard deviations of the normal distribution
99
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
100
+ * @param kappa parameter to specify confidence interval (1, 2 or 3)
101
+ * @return the upper bound
102
+ */
103
+ double get_upper_bound(unsigned kappa) const;
104
+
105
+ /**
106
+ * Update this sketch with a given string.
107
+ * @param value string to update the sketch with
108
+ */
109
+ void update(const std::string& value);
110
+
111
+ /**
112
+ * Update this sketch with a given unsigned 64-bit integer.
113
+ * @param value uint64_t to update the sketch with
114
+ */
115
+ void update(uint64_t value);
116
+
117
+ /**
118
+ * Update this sketch with a given signed 64-bit integer.
119
+ * @param value int64_t to update the sketch with
120
+ */
121
+ void update(int64_t value);
122
+
123
+ /**
124
+ * Update this sketch with a given unsigned 32-bit integer.
125
+ * For compatibility with Java implementation.
126
+ * @param value uint32_t to update the sketch with
127
+ */
128
+ void update(uint32_t value);
129
+
130
+ /**
131
+ * Update this sketch with a given signed 32-bit integer.
132
+ * For compatibility with Java implementation.
133
+ * @param value int32_t to update the sketch with
134
+ */
135
+ void update(int32_t value);
136
+
137
+ /**
138
+ * Update this sketch with a given unsigned 16-bit integer.
139
+ * For compatibility with Java implementation.
140
+ * @param value uint16_t to update the sketch with
141
+ */
142
+ void update(uint16_t value);
143
+
144
+ /**
145
+ * Update this sketch with a given signed 16-bit integer.
146
+ * For compatibility with Java implementation.
147
+ * @param value int16_t to update the sketch with
148
+ */
149
+ void update(int16_t value);
150
+
151
+ /**
152
+ * Update this sketch with a given unsigned 8-bit integer.
153
+ * For compatibility with Java implementation.
154
+ * @param value uint8_t to update the sketch with
155
+ */
156
+ void update(uint8_t value);
157
+
158
+ /**
159
+ * Update this sketch with a given signed 8-bit integer.
160
+ * For compatibility with Java implementation.
161
+ * @param value int8_t to update the sketch with
162
+ */
163
+ void update(int8_t value);
164
+
165
+ /**
166
+ * Update this sketch with a given double-precision floating point value.
167
+ * For compatibility with Java implementation.
168
+ * @param value double to update the sketch with
169
+ */
170
+ void update(double value);
171
+
172
+ /**
173
+ * Update this sketch with a given single-precision floating point value.
174
+ * Provided for compatibility with the Java implementation.
175
+ * @param value the float value to update the sketch with
176
+ */
177
+ void update(float value);
178
+
179
+ /**
180
+ * Update this sketch with given data of any type.
181
+ * This is a "universal" update that covers all cases above,
182
+ * but may produce different hashes.
183
+ * Be very careful to hash input values consistently using the same approach
184
+ * both over time and on different platforms
185
+ * and while passing sketches between the C++ and Java environments.
186
+ * Otherwise two sketches that should represent overlapping sets will be disjoint.
187
+ * For instance, for signed 32-bit values call the update(int32_t) method above,
188
+ * which does a widening conversion to int64_t, if compatibility with Java is expected.
189
+ * @param value pointer to the data
190
+ * @param size length of the data in bytes
191
+ */
192
+ void update(const void* value, int size);
193
+
194
+ /**
195
+ * Returns a human-readable summary of this sketch as a string<A>.
196
+ */
197
+ string<A> to_string() const;
198
+
199
+ /**
200
+ * Serializes this sketch into the given stream in binary form.
201
+ * @param os output stream to write the sketch to
202
+ */
203
+ void serialize(std::ostream& os) const;
204
+
205
+ // This is a convenience alias for users:
206
+ // the type (vector_u8<A>) returned by the serialize method that follows
207
+ typedef vector_u8<A> vector_bytes;
208
+
209
+ /**
210
+ * Serializes this sketch as a vector of bytes (vector_bytes).
211
+ * An optional header can be reserved in front of the sketch.
212
+ * It is an uninitialized space of the given size; the caller is expected to fill it in.
213
+ * This header is used in the Datasketches PostgreSQL extension.
214
+ * @param header_size_bytes space, in bytes, to reserve in front of the sketch (defaults to 0)
215
+ */
216
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
217
+
218
+ /**
219
+ * Deserializes a sketch from the given stream.
220
+ * @param is input stream to read the sketch from
221
+ * @param seed the seed for the hash function that was used to create the sketch (defaults to DEFAULT_SEED)
222
+ * @return the deserialized sketch, as a cpc_sketch_alloc<A>
223
+ */
224
+ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
225
+
226
+ /**
227
+ * Deserializes a sketch from the given array of bytes.
228
+ * @param bytes pointer to the array of bytes
229
+ * @param size the size of the array in bytes
230
+ * @param seed the seed for the hash function that was used to create the sketch (defaults to DEFAULT_SEED)
231
+ * @return the deserialized sketch, as a cpc_sketch_alloc<A>
232
+ */
233
+ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
234
+
235
+ // for internal use (NOTE(review): presumably returns the num_coupons member — confirm in cpc_sketch_impl.hpp)
236
+ uint32_t get_num_coupons() const;
237
+
238
+ // for debugging
239
+ // this should catch some forms of corruption during serialization-deserialization (NOTE(review): true presumably means the check passed — confirm)
240
+ bool validate() const;
241
+
242
+ private:
243
+ static const uint8_t SERIAL_VERSION = 1; // NOTE(review): presumably the serialized-format version — confirm
244
+ static const uint8_t FAMILY = 16; // NOTE(review): presumably the sketch family id stored in the serialized form — confirm
245
+
246
+ enum flags { IS_BIG_ENDIAN, IS_COMPRESSED, HAS_HIP, HAS_TABLE, HAS_WINDOW }; // enumerators are 0..4; NOTE(review): presumably bit positions in a serialized flags byte — confirm
247
+
248
+ // Note: except for brief transitional moments, these sketches always obey
249
+ // the following strict mapping between the flavor of a sketch and the
250
+ // number of coupons that it has collected (C below; NOTE(review): K is presumably 2^lg_k — confirm)
251
+ enum flavor {
252
+ EMPTY, // 0 == C < 1
253
+ SPARSE, // 1 <= C < 3K/32
254
+ HYBRID, // 3K/32 <= C < K/2
255
+ PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8]
256
+ SLIDING // 27K/8 <= C
257
+ };
258
+
259
+ uint8_t lg_k; // NOTE(review): presumably log2 of the K used in the enum flavor thresholds — confirm
260
+ uint64_t seed; // NOTE(review): presumably the hash-function seed that deserialize() takes — confirm
261
+ bool was_merged; // is the sketch the result of merging?
262
+ uint32_t num_coupons; // the number of coupons collected so far
263
+
264
+ u32_table<A> surprising_value_table; // NOTE(review): presumably the table that the HAS_TABLE flag refers to — confirm
265
+ vector_u8<A> sliding_window; // NOTE(review): presumably the window that the HAS_WINDOW flag refers to — confirm
266
+ uint8_t window_offset; // derivable from num_coupons, but made explicit for speed
267
+ uint8_t first_interesting_column; // This is part of a speed optimization
268
+
269
+ double kxp; // NOTE(review): presumably the value recomputed by refresh_kxp() — confirm
270
+ double hip_est_accum; // NOTE(review): presumably the accumulator behind get_hip_estimate() — confirm
271
+
272
+ // for deserialization and cpc_union::get_result(); table and window are taken by rvalue reference (NOTE(review): presumably moved into the members — confirm)
273
+ cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table,
274
+ vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
275
+
276
+ inline void row_col_update(uint32_t row_col); // NOTE(review): row_col presumably packs a (row, column) pair into 32 bits — confirm
277
+ inline void update_sparse(uint32_t row_col); // NOTE(review): presumably the update path while no sliding window is in use — confirm
278
+ inline void update_windowed(uint32_t row_col); // NOTE(review): presumably the update path once a sliding window is in use — confirm
279
+ inline void update_hip(uint32_t row_col); // NOTE(review): presumably maintains kxp / hip_est_accum — confirm
280
+ void promote_sparse_to_windowed(); // NOTE(review): presumably the transition out of the SPARSE flavor — confirm
281
+ void move_window(); // NOTE(review): presumably advances window_offset — confirm
282
+ void refresh_kxp(const uint64_t* bit_matrix); // takes a pointer to 64-bit words of a bit matrix; NOTE(review): presumably recomputes kxp from it — confirm
283
+
284
+ friend double get_hip_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa); // the four confidence-bound function templates are friends, so they can access private members of this class
285
+ friend double get_hip_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
286
+ friend double get_icon_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
287
+ friend double get_icon_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
288
+ double get_hip_estimate() const; // private; NOTE(review): presumably used by the hip confidence-bound friends — confirm
289
+ double get_icon_estimate() const; // private; NOTE(review): presumably used by the icon confidence-bound friends — confirm
290
+
291
+ inline flavor determine_flavor() const; // NOTE(review): presumably determine_flavor(lg_k, num_coupons) for this sketch — confirm
292
+ static inline flavor determine_flavor(uint8_t lg_k, uint64_t c); // NOTE(review): c is presumably the coupon count C from the enum flavor thresholds — confirm
293
+
294
+ static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c); // NOTE(review): presumably the value that window_offset (documented as derivable from num_coupons) should hold — confirm
295
+
296
+ // this produces a full-size k-by-64 bit matrix, returned as a vector_u64<A> (NOTE(review): presumably one 64-bit word per row — confirm)
297
+ vector_u64<A> build_bit_matrix() const;
298
+
299
+ static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window); // NOTE(review): presumably the serialized preamble length, in ints, for the given combination of sections — confirm
300
+ inline void write_hip(std::ostream& os) const; // NOTE(review): presumably writes the HIP fields (kxp, hip_est_accum) to the stream — confirm
301
+ inline size_t copy_hip_to_mem(void* dst) const; // NOTE(review): presumably copies the same HIP fields to dst and returns the number of bytes copied — confirm
302
+
303
+ friend cpc_compressor<A>; // grants cpc_compressor<A> access to this class's private members
304
+ friend cpc_union_alloc<A>; // grants access to private members; the private constructor is documented as being for cpc_union::get_result()
305
+ };
306
+
307
+ } /* namespace datasketches */
308
+
309
+ #include "cpc_sketch_impl.hpp"
310
+
311
+ #endif