datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,119 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import (kll_ints_sketch, kll_floats_sketch,
20
+ vector_of_kll_ints_sketches,
21
+ vector_of_kll_floats_sketches)
22
+ import numpy as np
23
+
24
class KllTest(unittest.TestCase):
    """Usage-style tests for the KLL quantiles sketch Python bindings."""

    def test_kll_example(self):
        """Walk through a typical kll_floats_sketch workflow on normal data."""
        k = 160
        n = 2 ** 20

        # Build the sketch from ~1 million N(0,1) samples: n-1 of them are
        # passed as one array, the last one as a single scalar item.
        sketch = kll_floats_sketch(k)
        sketch.update(np.random.normal(size=n - 1))
        sketch.update(0.0)

        # The rank of 0 should be close to the median rank ...
        self.assertAlmostEqual(0.5, sketch.get_rank(0.0), delta=0.025)

        # ... and the median itself should be close to 0.
        self.assertAlmostEqual(0.0, sketch.get_quantile(0.5), delta=0.025)

        # Min and max are tracked separately from the retained samples,
        # so the full observed range is always available.
        lowest = sketch.get_min_value()
        highest = sketch.get_max_value()
        self.assertLessEqual(lowest, sketch.get_quantile(0.01))
        self.assertLessEqual(0.0, sketch.get_rank(lowest))
        self.assertGreaterEqual(highest, sketch.get_quantile(0.99))
        self.assertGreaterEqual(1.0, sketch.get_rank(highest))

        # Several quantiles can be requested at once; for these ranks the
        # answers should land near [-2, -1, 0, 1, 2]. Feeding them back
        # into get_cdf() yields roughly the original ranks plus a final
        # entry that accounts for the remaining probability mass.
        fractions = [0.0228, 0.1587, 0.5, 0.8413, 0.9772]
        split_points = sketch.get_quantiles(fractions)
        cdf = sketch.get_cdf(split_points)
        self.assertEqual(len(cdf), len(split_points) + 1)

        # The instance-level error bound must match the class-level one for k.
        rank_error = sketch.normalized_rank_error(False)
        self.assertEqual(
            rank_error, kll_floats_sketch.get_normalized_rank_error(k, False)
        )

        # A few basic state queries.
        self.assertFalse(sketch.is_empty())
        self.assertTrue(sketch.is_estimation_mode())
        self.assertEqual(sketch.get_n(), n)
        self.assertLess(sketch.get_num_retained(), n)

        # Merging the sketch into itself doubles the number of items seen.
        sketch.merge(sketch)
        self.assertEqual(sketch.get_n(), 2 * n)

        # Round-trip through serialization and compare both sketches.
        payload = sketch.serialize()
        restored = sketch.deserialize(payload)
        self.assertEqual(sketch.get_num_retained(), restored.get_num_retained())
        self.assertEqual(sketch.get_min_value(), restored.get_min_value())
        self.assertEqual(sketch.get_max_value(), restored.get_max_value())
        self.assertEqual(sketch.get_quantile(0.7), restored.get_quantile(0.7))
        self.assertEqual(sketch.get_rank(0.0), restored.get_rank(0.0))

    def test_kll_ints_sketch(self):
        """Check the integer sketch on a tiny input of the values 0..n-1."""
        k = 100
        n = 10
        sketch = kll_ints_sketch(k)
        for value in range(n):
            sketch.update(value)

        self.assertEqual(sketch.get_min_value(), 0)
        self.assertEqual(sketch.get_max_value(), n - 1)
        self.assertEqual(sketch.get_n(), n)
        self.assertFalse(sketch.is_empty())
        self.assertFalse(sketch.is_estimation_mode())  # n < k

        midpoint = round(n / 2)

        # One split point yields two PMF buckets.
        pmf = sketch.get_pmf([midpoint])
        self.assertIsNotNone(pmf)
        self.assertEqual(len(pmf), 2)

        # One split point yields two CDF entries.
        cdf = sketch.get_cdf([midpoint])
        self.assertIsNotNone(cdf)
        self.assertEqual(len(cdf), 2)

        self.assertEqual(sketch.get_quantile(0.5), midpoint)
        quartiles = sketch.get_quantiles([0.25, 0.5, 0.75])
        self.assertIsNotNone(quartiles)
        self.assertEqual(len(quartiles), 3)

        self.assertEqual(sketch.get_rank(midpoint), 0.5)

        # Self-merge doubles the item count.
        sketch.merge(sketch)
        self.assertEqual(sketch.get_n(), 2 * n)

        # Deserializing the bytes must produce a kll_ints_sketch again.
        payload = sketch.serialize()
        self.assertIsInstance(kll_ints_sketch.deserialize(payload), kll_ints_sketch)

    def test_kll_floats_sketch(self):
        """Smoke test: the float variant instantiates and starts empty."""
        # The int variant is covered above and the implementation is
        # templatized, so instantiation is all that needs checking here.
        k = 75
        sketch = kll_floats_sketch(k)
        self.assertTrue(sketch.is_empty())


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,121 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+
20
+ from datasketches import theta_sketch, update_theta_sketch
21
+ from datasketches import compact_theta_sketch, theta_union
22
+ from datasketches import theta_intersection, theta_a_not_b
23
+
24
class ThetaTest(unittest.TestCase):
    """Usage-style tests for the theta sketch Python bindings."""

    def test_theta_basic_example(self):
        """Build one sketch, check its bounds, and round-trip it through bytes."""
        k = 12       # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # Fill a sketch with n distinct values.
        sketch = self.generate_theta_sketch(n, k)

        # The bounds must bracket the estimate, whatever the true count is.
        estimate = sketch.get_estimate()
        self.assertLessEqual(sketch.get_lower_bound(1), estimate)
        self.assertGreaterEqual(sketch.get_upper_bound(1), estimate)

        # The input is generated deterministically, so the bounds can also
        # be compared against the exact number of distinct values.
        self.assertLessEqual(sketch.get_lower_bound(1), n)
        self.assertGreaterEqual(sketch.get_upper_bound(1), n)

        # Serialize for storage, then rebuild.
        payload = sketch.serialize()
        restored = update_theta_sketch.deserialize(payload)

        # The estimate must survive the round trip.
        self.assertFalse(sketch.is_empty())
        self.assertEqual(sketch.get_estimate(), restored.get_estimate())

    def test_theta_set_operations(self):
        """Exercise union, intersection and A-not-B on two overlapping sketches."""
        k = 12       # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # Shift the second sketch so that 1/4 of the values overlap.
        offset = (3 * n) // 4

        first = self.generate_theta_sketch(n, k)
        second = self.generate_theta_sketch(n, k, offset)

        # UNION
        union = theta_union(k)
        union.update(first)
        union.update(second)

        # The union hands back a compact_theta_sketch. Compact sketches can
        # take part in further set operations but cannot accept item updates.
        union_result = union.get_result()
        self.assertIsInstance(union_result, compact_theta_sketch)

        # The process is deterministic and the exact answer is known to
        # lie within one standard deviation of the estimate.
        expected_union = 7 * n / 4
        self.assertLessEqual(union_result.get_lower_bound(1), expected_union)
        self.assertGreaterEqual(union_result.get_upper_bound(1), expected_union)

        # INTERSECTION
        intersection = theta_intersection()  # no lg_k
        intersection.update(first)
        intersection.update(second)

        # has_result() reports that the intersection has been used,
        # even though the result itself could be the empty set.
        self.assertTrue(intersection.has_result())

        # As with unions, the result is a compact sketch.
        intersection_result = intersection.get_result()
        self.assertIsInstance(intersection_result, compact_theta_sketch)

        # The two inputs overlap by 1/4.
        expected_overlap = n / 4
        self.assertLessEqual(intersection_result.get_lower_bound(1), expected_overlap)
        self.assertGreaterEqual(intersection_result.get_upper_bound(1), expected_overlap)

        # A NOT B
        a_not_b = theta_a_not_b()  # no lg_k
        difference_result = a_not_b.compute(first, second)

        # Again a compact sketch.
        self.assertIsInstance(difference_result, compact_theta_sketch)

        # With a 1/4 overlap, 3/4 of the first sketch remains.
        expected_remainder = 3 * n / 4
        self.assertLessEqual(difference_result.get_lower_bound(1), expected_remainder)
        self.assertGreaterEqual(difference_result.get_upper_bound(1), expected_remainder)

    def generate_theta_sketch(self, n, k, offset=0):
        """Return an update_theta_sketch(k) fed the n values offset..offset+n-1."""
        sketch = update_theta_sketch(k)
        for value in range(offset, offset + n):
            sketch.update(value)
        return sketch


if __name__ == '__main__':
    unittest.main()
120
+
121
+
@@ -0,0 +1,148 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import (vector_of_kll_ints_sketches,
20
+ vector_of_kll_floats_sketches)
21
+ import numpy as np
22
+
23
class VectorOfKllSketchesTest(unittest.TestCase):
    """Usage-style tests for the vector-of-KLL-sketches Python bindings."""

    def test_vector_of_kll_floats_sketches_example(self):
        """Walk through a typical workflow using 1D updates of d values each."""
        k = 200
        d = 3
        n = 2 ** 20

        # create a sketch and inject ~1 million N(0,1) points
        kll = vector_of_kll_floats_sketches(k, d)
        # Track the min/max for each sketch to test later
        smin = np.full(d, np.inf)
        smax = np.full(d, -np.inf)

        for _ in range(n):
            dat = np.random.randn(d)
            smin = np.minimum(smin, dat)
            smax = np.maximum(smax, dat)
            kll.update(dat)

        # 0 should be near the median
        np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
        # the median should be near 0
        np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
        # we also track the min/max independently from the rest of the data
        # which lets us know the full observed data range
        np.testing.assert_allclose(kll.get_min_values(), smin)
        np.testing.assert_allclose(kll.get_max_values(), smax)
        np.testing.assert_array_less(kll.get_min_values(), kll.get_quantiles(0.01)[:, 0])
        np.testing.assert_array_less(kll.get_quantiles(0.99)[:, 0], kll.get_max_values())

        # we can also extract a list of values at a time,
        # here the values should give us something close to [-2, -1, 0, 1, 2].
        # then get the CDF, which will return something close to
        # the original values used in get_quantiles()
        pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
        # use the mean pts for the CDF, include 1.0 at end to account for all probability mass
        meanpts = np.mean(pts, axis=0)
        cdf = kll.get_cdf(meanpts)
        self.assertEqual(cdf.shape[0], pts.shape[0])
        self.assertEqual(cdf.shape[1], pts.shape[1] + 1)

        # and a few basic queries about the sketch
        # np.any (not np.all): every sketch received data, so none may be empty
        self.assertFalse(np.any(kll.is_empty()))
        self.assertTrue(np.all(kll.is_estimation_mode()))
        self.assertTrue(np.all(kll.get_n() == n))
        self.assertTrue(np.all(kll.get_num_retained() < n))

        # we can combine sketches across all dimensions and get the result
        result = kll.collapse()
        self.assertEqual(result.get_n(), d * n)

        # merging a copy of itself will double the number of items the sketch has seen
        kll_copy = vector_of_kll_floats_sketches(kll)
        kll.merge(kll_copy)
        np.testing.assert_equal(kll.get_n(), 2 * n)

        # we can then serialize and reconstruct the sketch
        kll_bytes = kll.serialize()  # serializes each sketch as a list
        new_kll = vector_of_kll_floats_sketches(k, d)
        for index, sketch_bytes in enumerate(kll_bytes):
            new_kll.deserialize(sketch_bytes, index)

        # everything should be exactly equal
        np.testing.assert_equal(kll.get_num_retained(), new_kll.get_num_retained())
        # A stray ';' after assert_equal previously turned this check into a no-op.
        np.testing.assert_equal(kll.get_min_values(), new_kll.get_min_values())
        np.testing.assert_equal(kll.get_max_values(), new_kll.get_max_values())
        np.testing.assert_equal(kll.get_quantiles(0.7), new_kll.get_quantiles(0.7))
        np.testing.assert_equal(kll.get_ranks(0.0), new_kll.get_ranks(0.0))

    def test_kll_ints_sketches(self):
        """Smoke test: the int variant instantiates and starts empty."""
        # already tested floats and it's templatized, so just make sure it instantiates properly
        k = 100
        d = 5
        kll = vector_of_kll_ints_sketches(k, d)
        self.assertTrue(np.all(kll.is_empty()))

    def test_kll_2Dupdates(self):
        """Check update() with 2D input: nbatch rows of d values per call."""
        # 1D case tested in the first example
        # 2D case will follow same idea, but focusing on update()
        k = 200
        d = 3
        # we'll do ~250k updates of 4 values each (total ~1mil updates, as above)
        n = 2 ** 18
        nbatch = 4

        # create a sketch and inject ~1 million N(0,1) points
        kll = vector_of_kll_floats_sketches(k, d)
        # Track the min/max for each sketch to test later
        smin = np.full(d, np.inf)
        smax = np.full(d, -np.inf)

        for _ in range(n):
            dat = np.random.randn(nbatch, d)
            # Column-wise extremes of the batch folded into the running ones;
            # replaces np.row_stack, which is deprecated in NumPy 2.0.
            smin = np.minimum(smin, dat.min(axis=0))
            smax = np.maximum(smax, dat.max(axis=0))
            kll.update(dat)

        # 0 should be near the median
        np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
        # the median should be near 0
        np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
        # we also track the min/max independently from the rest of the data
        # which lets us know the full observed data range
        np.testing.assert_allclose(kll.get_min_values(), smin)
        np.testing.assert_allclose(kll.get_max_values(), smax)

    def test_kll_3Dupdates(self):
        """A 3D update is expected to be rejected and must leave the sketches empty."""
        # now test 3D update, which should fail
        k = 200
        d = 3

        # create a sketch
        kll = vector_of_kll_floats_sketches(k, d)

        # we'll try 1 3D update
        dat = np.random.randn(10, 7, d)
        try:
            kll.update(dat)
        except Exception:
            # this is what we expect; Exception (rather than a bare except)
            # keeps KeyboardInterrupt/SystemExit from being swallowed
            pass
        # the sketches should still be empty
        self.assertTrue(np.all(kll.is_empty()))


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,101 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+ from datasketches import var_opt_sketch, var_opt_union
20
+
21
+ class VoTest(unittest.TestCase):
22
+ def test_vo_example(self):
23
+ k = 50 # a small value so we can easily fill the sketch
24
+ vo = var_opt_sketch(k)
25
+
26
+ # varopt sampling reduces to standard reservoir sampling
27
+ # if the items are all equally weighted, although the
28
+ # algorithm will be significantly slower than an optimized
29
+ # reservoir sampler
30
+ n = 5 * k
31
+ for i in range(0, n):
32
+ vo.update(i)
33
+
34
+ # we can also add a heavy item, using a negative weight for
35
+ # easy filtering later. keep in mind that "heavy" is a
36
+ # relative concept, so using a fixed multiple of n may not
37
+ # be considered a heavy item for larger values of n
38
+ vo.update(-1, 1000 * n)
39
+ self.assertEqual(k, vo.k)
40
+ self.assertEqual(k, vo.num_samples)
41
+ self.assertEqual(n + 1, vo.n)
42
+ self.assertFalse(vo.is_empty())
43
+
44
+ # we can easily get the list of items in the sample
45
+ items = vo.get_samples()
46
+ self.assertEqual(len(items), k)
47
+
48
+ # we can also apply a predicate to the sketch to get an estimate
49
+ # (with optimally minimal variance) of the subset sum of items
50
+ # matching that predicate among the entire population
51
+
52
+ # we'll use a lambda here, but any function operating on a single
53
+ # item which returns a boolean value should work
54
+ summary = vo.estimate_subset_sum(lambda x: x < 0)
55
+ self.assertEqual(summary['estimate'], 1000 * n)
56
+ self.assertEqual(summary['total_sketch_weight'], 1001 * n)
57
+
58
+ # a regular function is similarly handled
59
+ def geq_zero(x):
60
+ return x >= 0
61
+ summary = vo.estimate_subset_sum(geq_zero)
62
+ self.assertEqual(summary['estimate'], n)
63
+ self.assertEqual(summary['total_sketch_weight'], 1001 * n)
64
+
65
+ # next we'll create a second, smaller sketch with
66
+ # only heavier items relative to the previous sketch,
67
+ # but with the sketch in sampling mode
68
+ k2 = 5
69
+ vo2 = var_opt_sketch(k2)
70
+ # for weight, use the estimate of all items >=0 from before
71
+ wt = summary['estimate']
72
+ for i in range(0, k2 + 1):
73
+ vo2.update((2 * n) + i, wt)
74
+
75
+ # now union the sketches, demonstrating how the
76
+ # union's k may not be equal to that of either
77
+ # input value
78
+ union = var_opt_union(k)
79
+ union.update(vo)
80
+ union.update(vo2)
81
+
82
+ result = union.get_result()
83
+ self.assertEqual(n + k2 + 2, result.n)
84
+ self.assertFalse(result.is_empty())
85
+ self.assertGreater(result.k, k2)
86
+ self.assertLess(result.k, k)
87
+
88
+ # we can compare what information is available from both
89
+ # the union and a sketch.
90
+ print(union)
91
+
92
+ # if we want to print the list of items, there must be a
93
+ # __str__() method for each item (which need not be the same
94
+ # type; they're all generic python objects when used from
95
+ # python), otherwise you may trigger an exception.
96
+ # to_string() is provided as a convenience to avoid direct
97
+ # calls to __str__() with parameters.
98
+ print(result.to_string(True))
99
+
100
+ if __name__ == '__main__':
101
+ unittest.main()