datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,140 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include "var_opt_sketch.hpp"
21
+ #include "var_opt_union.hpp"
22
+
23
+ #include <pybind11/pybind11.h>
24
+ #include <pybind11/functional.h>
25
+ #include <sstream>
26
+
27
+ namespace py = pybind11;
28
+
29
+ namespace datasketches {
30
+ namespace python {
31
+
32
+ template<typename T>
33
+ py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
34
+ py::list list;
35
+ for (auto& item : sk) {
36
+ py::tuple t = py::make_tuple(item.first, item.second);
37
+ list.append(t);
38
+ }
39
+ return list;
40
+ }
41
+
42
+ template<typename T>
43
+ py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
44
+ subset_summary summary = sk.estimate_subset_sum(func);
45
+ py::dict d;
46
+ d["estimate"] = summary.estimate;
47
+ d["lower_bound"] = summary.lower_bound;
48
+ d["upper_bound"] = summary.upper_bound;
49
+ d["total_sketch_weight"] = summary.total_sketch_weight;
50
+ return d;
51
+ }
52
+
53
+ template<typename T>
54
+ std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
55
+ if (print_items) {
56
+ std::ostringstream ss;
57
+ ss << sk.to_string();
58
+ ss << "### VarOpt Sketch Items" << std::endl;
59
+ int i = 0;
60
+ for (auto& item : sk) {
61
+ // item.second is always a double
62
+ // item.first is an arbitrary py::object, so get the value by
63
+ // using internal str() method then casting to C++ std::string
64
+ py::str item_pystr(item.first);
65
+ std::string item_str = py::cast<std::string>(item_pystr);
66
+ // item.second is guaranteed to be a double
67
+ ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
68
+ }
69
+ return ss.str();
70
+ } else {
71
+ return sk.to_string();
72
+ }
73
+ }
74
+
75
+ }
76
+ }
77
+
78
+ namespace dspy = datasketches::python;
79
+
80
+ template<typename T>
81
+ void bind_vo_sketch(py::module &m, const char* name) {
82
+ using namespace datasketches;
83
+
84
+ py::class_<var_opt_sketch<T>>(m, name)
85
+ .def(py::init<uint32_t>(), py::arg("k"))
86
+ .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
87
+ "Produces a string summary of the sketch")
88
+ .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
89
+ "Produces a string summary of the sketch")
90
+ .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
91
+ "Updates the sketch with the given value and weight")
92
+ .def_property_readonly("k", &var_opt_sketch<T>::get_k,
93
+ "Returns the sketch's maximum configured sample size")
94
+ .def_property_readonly("n", &var_opt_sketch<T>::get_n,
95
+ "Returns the total stream length")
96
+ .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
97
+ "Returns the number of samples currently in the sketch")
98
+ .def("get_samples", &dspy::vo_sketch_get_samples<T>,
99
+ "Returns the set of samples in the sketch")
100
+ .def("is_empty", &var_opt_sketch<T>::is_empty,
101
+ "Returns True if the sketch is empty, otherwise False")
102
+ .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
103
+ "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
104
+ "as upper and lower bounds on the estimate and the total weight processed by the sketch")
105
+ // As of writing, not yet clear how to serialize arbitrary python objects,
106
+ // especially in any sort of language-portable way
107
+ //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
108
+ //.def("serialize", &dspy::vo_sketch_serialize<T>)
109
+ //.def_static("deserialize", &dspy::vo_sketch_deserialize<T>)
110
+ ;
111
+ }
112
+
113
+ template<typename T>
114
+ void bind_vo_union(py::module &m, const char* name) {
115
+ using namespace datasketches;
116
+
117
+ py::class_<var_opt_union<T>>(m, name)
118
+ .def(py::init<uint32_t>(), py::arg("max_k"))
119
+ .def("__str__", &var_opt_union<T>::to_string,
120
+ "Produces a string summary of the sketch")
121
+ .def("to_string", &var_opt_union<T>::to_string,
122
+ "Produces a string summary of the sketch")
123
+ .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
124
+ "Updates the union with the given sketch")
125
+ .def("get_result", &var_opt_union<T>::get_result,
126
+ "Returns a sketch corresponding to the union result")
127
+ .def("reset", &var_opt_union<T>::reset,
128
+ "Resets the union to the empty state")
129
+ // As of writing, not yet clear how to serialize arbitrary python objects,
130
+ // especially in any sort of language-portable way
131
+ //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
132
+ //.def("serialize", &dspy::vo_union_serialize<T>)
133
+ //.def_static("deserialize", &dspy::vo_union_deserialize<T>)
134
+ ;
135
+ }
136
+
137
+ void init_vo(py::module &m) {
138
+ bind_vo_sketch<py::object>(m, "var_opt_sketch");
139
+ bind_vo_union<py::object>(m, "var_opt_union");
140
+ }
@@ -0,0 +1,64 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import cpc_sketch, cpc_union
20
+
21
+ class CpcTest(unittest.TestCase):
22
+ def test_cpc_example(self):
23
+ k = 12 # 2^k = 4096 rows in the table
24
+ n = 1 << 18 # ~256k unique values
25
+
26
+ # create a couple sketches and inject some values
27
+ # we'll have 1/4 of the values overlap
28
+ cpc = cpc_sketch(k)
29
+ cpc2 = cpc_sketch(k)
30
+ offset = int(3 * n / 4) # it's a float w/o cast
31
+ # because we hash on the bits, not an abstract numeric value,
32
+ # cpc.update(1) and cpc.update(1.0) give different results.
33
+ for i in range(0, n):
34
+ cpc.update(i)
35
+ cpc2.update(i + offset)
36
+
37
+ # although we provide get_composite_estimate() and get_estimate(),
38
+ # the latter will always give the best available estimate. we
39
+ # recommend using get_estimate().
40
+ # we can check that the upper and lower bounds bracket the
41
+ # estimate, without needing to know the exact value.
42
+ self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate())
43
+ self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate())
44
+
45
+ # unioning uses a separate class, but we need to get_result()
46
+ # to query the unioned sketches
47
+ union = cpc_union(k)
48
+ union.update(cpc)
49
+ union.update(cpc2)
50
+ result = union.get_result()
51
+
52
+ # since our process here (including post-union CPC) is
53
+ # deterministic, we have checked and know the exact
54
+ # answer is within one standard deviation of the estimate
55
+ self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
56
+ self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
57
+
58
+ # serialize for storage and reconstruct
59
+ sk_bytes = result.serialize()
60
+ new_cpc = cpc_sketch.deserialize(sk_bytes)
61
+ self.assertFalse(new_cpc.is_empty())
62
+
63
+ if __name__ == '__main__':
64
+ unittest.main()
@@ -0,0 +1,110 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import frequent_strings_sketch, frequent_items_error_type
20
+
21
+ class FiTest(unittest.TestCase):
22
+ def test_fi_example(self):
23
+ k = 3 # a small value so we can easily fill the sketch
24
+ fi = frequent_strings_sketch(k)
25
+
26
+ # we'll use a small number of distinct items so we
27
+ # can use exponentially increasing weights and have
28
+ # some frequent items, decreasing so we have some
29
+ # small items inserted after a purge
30
+ n = 8
31
+ for i in range(0, n):
32
+ fi.update(str(i), 2 ** (n - i))
33
+
34
+ # there are two ways to extract items :
35
+ # * NO_FALSE_POSITIVES includes all items with a lower bound
36
+ # above the a posteriori error
37
+ # * NO_FALSE_NEGATIVES includes all items with an upper bound
38
+ # above the a posteriori error
39
+ # a more complete discussion may be found at
40
+ # https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html
41
+ items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
42
+ items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
43
+ self.assertLessEqual(len(items_no_fp), len(items_no_fn))
44
+
45
+ # the items list returns a decreasing weight-sorted list, and
46
+ # for each item we have (item, estimate, lower_bound, upper_bound)
47
+ item = items_no_fp[1]
48
+ self.assertLessEqual(item[2], item[1]) # lower bound vs estimate
49
+ self.assertLessEqual(item[1], item[3]) # estimate vs upper bound
50
+
51
+ # we can also query directly for a specific item
52
+ id = items_no_fn[0][0]
53
+ est = fi.get_estimate(id)
54
+ lb = fi.get_lower_bound(id)
55
+ ub = fi.get_upper_bound(id)
56
+ self.assertLessEqual(lb, est)
57
+ self.assertLessEqual(est, ub)
58
+
59
+ # the values are zero if the item isn't in our list
60
+ self.assertEqual(fi.get_estimate("NaN"), 0)
61
+
62
+ # now create a second sketch with a lot of unique
63
+ # values but all with equal weight (of 1) such that
64
+ # the total weight is much larger than the first sketch
65
+ fi2 = frequent_strings_sketch(k)
66
+ wt = fi.get_total_weight()
67
+ for i in range(0, 4*wt):
68
+ fi2.update(str(i))
69
+
70
+ # merge the second sketch into the first
71
+ fi.merge(fi2)
72
+
73
+ # we can see that the weight is much larger
74
+ self.assertEqual(5 * wt, fi.get_total_weight())
75
+
76
+ # querying with NO_FALSE_POSITIVES means we don't find anything
77
+ # heavy enough to return
78
+ items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
79
+ self.assertEqual(len(items_no_fp), 0)
80
+
81
+ # we do, however, find a few potential heavy items
82
+ # if querying with NO_FALSE_NEGATIVES
83
+ items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
84
+ self.assertGreater(len(items_no_fn), 0)
85
+
86
+ # finally, serialize and reconstruct
87
+ fi_bytes = fi.serialize()
88
+ self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes())
89
+ new_fi = frequent_strings_sketch.deserialize(fi_bytes)
90
+
91
+ # and now interrogate the sketch
92
+ self.assertFalse(new_fi.is_empty())
93
+ self.assertGreater(new_fi.get_num_active_items(), 0)
94
+ self.assertEqual(5 * wt, new_fi.get_total_weight())
95
+
96
+
97
+ def test_fi_sketch(self):
98
+ # only testing a few things not used in the above example
99
+ k = 12
100
+ wt = 10000
101
+ fi = frequent_strings_sketch(k)
102
+
103
+ self.assertAlmostEqual(fi.get_sketch_epsilon(), 0.0008545, delta=1e-6)
104
+
105
+ sk_apriori_error = fi.get_sketch_epsilon() * wt
106
+ reference_apriori_error = frequent_strings_sketch.get_apriori_error(k, wt)
107
+ self.assertAlmostEqual(sk_apriori_error, reference_apriori_error, delta=1e-6)
108
+
109
+ if __name__ == '__main__':
110
+ unittest.main()
@@ -0,0 +1,131 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import hll_sketch, hll_union, tgt_hll_type
20
+
21
class HllTest(unittest.TestCase):
    """Tests for the hll_sketch and hll_union bindings."""

    def test_hll_example(self):
        """Walk through updating, unioning, serializing and resetting sketches."""
        k = 12  # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # create a couple sketches and inject some values
        # we'll have 1/4 of the values overlap
        hll = hll_sketch(k, tgt_hll_type.HLL_8)
        hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
        offset = 3 * n // 4  # floor division keeps the offset an int
        # because we hash on the bits, not an abstract numeric value,
        # hll.update(1) and hll.update(1.0) give different results.
        for i in range(n):
            hll.update(i)
            hll2.update(i + offset)

        # we can check that the upper and lower bounds bracket the
        # estimate, without needing to know the exact value.
        self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
        self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())

        # unioning uses a separate class, and we can either get a result
        # sketch or query the union object directly
        union = hll_union(k)
        union.update(hll)
        union.update(hll2)
        result = union.get_result()
        self.assertEqual(result.get_estimate(), union.get_estimate())

        # since our process here (including post-union HLL) is
        # deterministic, we have checked and know the exact
        # answer is within one standard deviation of the estimate
        exact_count = 7 * n // 4  # n values per sketch, n/4 of them shared
        self.assertLessEqual(union.get_lower_bound(1), exact_count)
        self.assertGreaterEqual(union.get_upper_bound(1), exact_count)

        # serialize for storage and reconstruct
        sk_bytes = result.serialize_compact()
        self.assertEqual(len(sk_bytes), result.get_compact_serialization_bytes())
        new_hll = hll_sketch.deserialize(sk_bytes)

        # the sketch can self-report its configuration and status
        self.assertEqual(new_hll.lg_config_k, k)
        self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
        self.assertFalse(new_hll.is_empty())

        # if we want to reduce some object overhead, we can also reset
        new_hll.reset()
        self.assertTrue(new_hll.is_empty())

    def test_hll_sketch(self):
        """Cover hll_sketch accessors and both serialization formats."""
        k = 8
        n = 117
        hll = self.generate_sketch(n, k, tgt_hll_type.HLL_6)
        hll.update('string data')
        hll.update(3.14159)  # double data

        self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
        self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())

        self.assertEqual(hll.lg_config_k, k)
        self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)

        bytes_compact = hll.serialize_compact()
        bytes_update = hll.serialize_updatable()
        self.assertEqual(len(bytes_compact), hll.get_compact_serialization_bytes())
        self.assertEqual(len(bytes_update), hll.get_updatable_serialization_bytes())

        self.assertFalse(hll.is_compact())
        self.assertFalse(hll.is_empty())

        # both serialized forms must round-trip back into an hll_sketch;
        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(hll_sketch.deserialize(bytes_compact), hll_sketch)
        self.assertIsInstance(hll_sketch.deserialize(bytes_update), hll_sketch)

        self.assertIsNotNone(hll_sketch.get_rel_err(True, False, 12, 1))
        self.assertIsNotNone(
            hll_sketch.get_max_updatable_serialization_bytes(20, tgt_hll_type.HLL_6)
        )

        hll.reset()
        self.assertTrue(hll.is_empty())

    def test_hll_union(self):
        """Cover hll_union updates from sketches and raw values, plus accessors."""
        k = 7
        n = 53
        union = hll_union(k)

        sk = self.generate_sketch(n, k, tgt_hll_type.HLL_4, 0)
        union.update(sk)
        sk = self.generate_sketch(3 * n, k, tgt_hll_type.HLL_4, n)
        union.update(sk)
        union.update('string data')
        union.update(1.4142136)

        self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
        self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())

        self.assertEqual(union.lg_config_k, k)
        self.assertFalse(union.is_compact())
        self.assertFalse(union.is_empty())

        sk = union.get_result()
        self.assertIsInstance(sk, hll_sketch)
        self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)

    def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
        """Build an hll_sketch updated with the integers st_idx .. st_idx + n - 1.

        Args:
            n: number of consecutive integers to feed into the sketch.
            k: first argument passed to the hll_sketch constructor.
            sk_type: tgt_hll_type passed to the hll_sketch constructor.
            st_idx: first integer of the range.

        Returns:
            The populated hll_sketch.
        """
        sk = hll_sketch(k, sk_type)
        for i in range(st_idx, st_idx + n):
            sk.update(i)
        return sk
128
+
129
+
130
+ if __name__ == '__main__':  # run the test suite only when this file is executed directly
131
+ unittest.main()