datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,160 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- import unittest
19
- from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
20
- from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe
21
- import numpy as np
22
-
23
- class QuantilesTest(unittest.TestCase):
24
- def test_quantiles_floats_example(self):
25
- k = 128
26
- n = 2 ** 20
27
-
28
- # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
29
- quantiles = quantiles_floats_sketch(k)
30
- quantiles.update(np.random.normal(size=n-1))
31
- quantiles.update(0.0)
32
-
33
- # 0 should be near the median
34
- self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035)
35
-
36
- # the median should be near 0
37
- self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035)
38
-
39
- # we also track the min/max independently from the rest of the data
40
- # which lets us know the full observed data range
41
- self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01))
42
- self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value()))
43
- self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99))
44
- self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value()))
45
-
46
- # we can also extract a list of values at a time,
47
- # here the values should give us something close to [-2, -1, 0, 1, 2].
48
- # then get the CDF, which will return something close to
49
- # the original values used in get_quantiles()
50
- # finally, can check the normalized rank error bound
51
- pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
52
- cdf = quantiles.get_cdf(pts) # include 1.0 at end to account for all probability mass
53
- self.assertEqual(len(cdf), len(pts)+1)
54
- err = quantiles.normalized_rank_error(False)
55
- self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False))
56
-
57
- # and a few basic queries about the sketch
58
- self.assertFalse(quantiles.is_empty())
59
- self.assertTrue(quantiles.is_estimation_mode())
60
- self.assertEqual(quantiles.get_n(), n)
61
- self.assertEqual(quantiles.get_k(), k)
62
- self.assertLess(quantiles.get_num_retained(), n)
63
-
64
- # merging itself will double the number of items the sketch has seen
65
- quantiles_copy = quantiles_floats_sketch(quantiles)
66
- quantiles.merge(quantiles_copy)
67
- self.assertEqual(quantiles.get_n(), 2*n)
68
-
69
- # we can then serialize and reconstruct the sketch
70
- quantiles_bytes = quantiles.serialize()
71
- new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes)
72
- self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
73
- self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
74
- self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
75
- self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
76
- self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))
77
-
78
- # If we create a new sketch with a very different distribution, a Kolmogorov-Smirnov Test
79
- # of the two should return True: we can reject the null hypothesis that the sketches
80
- # come from the same distributions.
81
- unif_quantiles = quantiles_floats_sketch(k)
82
- unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
83
- self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
84
-
85
- total_weight = 0
86
- for tuple in quantiles:
87
- item = tuple[0]
88
- weight = tuple[1]
89
- total_weight = total_weight + weight
90
- self.assertEqual(total_weight, quantiles.get_n())
91
-
92
- def test_quantiles_ints_sketch(self):
93
- k = 128
94
- n = 10
95
- quantiles = quantiles_ints_sketch(k)
96
- for i in range(0, n):
97
- quantiles.update(i)
98
-
99
- self.assertEqual(quantiles.get_min_value(), 0)
100
- self.assertEqual(quantiles.get_max_value(), n-1)
101
- self.assertEqual(quantiles.get_n(), n)
102
- self.assertFalse(quantiles.is_empty())
103
- self.assertFalse(quantiles.is_estimation_mode()) # n < k
104
- self.assertEqual(quantiles.get_k(), k)
105
-
106
- pmf = quantiles.get_pmf([round(n/2)])
107
- self.assertIsNotNone(pmf)
108
- self.assertEqual(len(pmf), 2)
109
-
110
- cdf = quantiles.get_cdf([round(n/2)])
111
- self.assertIsNotNone(cdf)
112
- self.assertEqual(len(cdf), 2)
113
-
114
- self.assertEqual(quantiles.get_quantile(0.5), round(n/2))
115
- quants = quantiles.get_quantiles([0.25, 0.5, 0.75])
116
- self.assertIsNotNone(quants)
117
- self.assertEqual(len(quants), 3)
118
-
119
- self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
120
-
121
- # merge self
122
- quantiles_copy = quantiles_ints_sketch(quantiles)
123
- quantiles.merge(quantiles_copy)
124
- self.assertEqual(quantiles.get_n(), 2 * n)
125
-
126
- sk_bytes = quantiles.serialize()
127
- self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))
128
-
129
- def test_quantiles_doubles_sketch(self):
130
- # already tested floats and ints and it's templatized, so just make sure it instantiates properly
131
- k = 128
132
- quantiles = quantiles_doubles_sketch(k)
133
- self.assertTrue(quantiles.is_empty())
134
-
135
- def test_quantiles_items_sketch(self):
136
- # most functionality has been tested, but we need to ensure objects and sorting work
137
- # as well as serialization
138
- k = 128
139
- n = 2 ** 16
140
-
141
- # create a sketch and inject enough points to force compaction
142
- quantiles = quantiles_items_sketch(k)
143
- for i in range(0, n):
144
- quantiles.update(str(i))
145
-
146
- quantiles_copy = quantiles_items_sketch(quantiles)
147
- quantiles.merge(quantiles_copy)
148
- self.assertEqual(quantiles.get_n(), 2 * n)
149
-
150
- quantiles_bytes = quantiles.serialize(PyStringsSerDe())
151
- new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe())
152
- self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
153
- self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
154
- self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
155
- self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
156
- self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4)))
157
-
158
-
159
- if __name__ == '__main__':
160
- unittest.main()
@@ -1,159 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- import unittest
19
- from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe
20
- import numpy as np
21
-
22
- class reqTest(unittest.TestCase):
23
- def test_req_example(self):
24
- k = 12
25
- n = 2 ** 20
26
-
27
- # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
28
- req = req_floats_sketch(k, True) # high rank accuracy
29
- req.update(np.random.normal(size=n-1))
30
- req.update(0.0)
31
-
32
- # 0 should be near the median
33
- self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.045)
34
-
35
- # the median should be near 0
36
- self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.045)
37
-
38
- # we also track the min/max independently from the rest of the data
39
- # which lets us know the full observed data range
40
- self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
41
- self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
42
- self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
43
- self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))
44
-
45
- # we can also extract a list of values at a time,
46
- # here the values should give us something close to [-2, -1, 0, 1, 2].
47
- # then get the CDF, which will return something close to
48
- # the original values used in get_quantiles()
49
- # finally, can check the normalized rank error bound
50
- pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
51
- cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
52
- self.assertEqual(len(cdf), len(pts)+1)
53
-
54
- # For relative error quantiles, the error depends on the actual rank
55
- # so we need to use that to detemrine the bounds
56
- est = req.get_rank(0.999, True)
57
- lb = req.get_rank_lower_bound(est, 1)
58
- ub = req.get_rank_upper_bound(est, 1)
59
- self.assertLessEqual(lb, est)
60
- self.assertLessEqual(est, ub)
61
-
62
- # and a few basic queries about the sketch
63
- self.assertFalse(req.is_empty())
64
- self.assertTrue(req.is_estimation_mode())
65
- self.assertEqual(req.get_n(), n)
66
- self.assertLess(req.get_num_retained(), n)
67
- self.assertEqual(req.get_k(), k)
68
-
69
- # merging itself will double the number of items the sketch has seen
70
- req_copy = req_floats_sketch(req)
71
- req.merge(req_copy)
72
- self.assertEqual(req.get_n(), 2*n)
73
-
74
- # we can then serialize and reconstruct the sketch
75
- req_bytes = req.serialize()
76
- new_req = req_floats_sketch.deserialize(req_bytes)
77
- self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
78
- self.assertEqual(req.get_min_value(), new_req.get_min_value())
79
- self.assertEqual(req.get_max_value(), new_req.get_max_value())
80
- self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
81
- self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
82
-
83
- total_weight = 0
84
- for tuple in req:
85
- item = tuple[0]
86
- weight = tuple[1]
87
- total_weight = total_weight + weight
88
- self.assertEqual(total_weight, req.get_n())
89
-
90
- def test_req_ints_sketch(self):
91
- k = 100
92
- n = 10
93
- req = req_ints_sketch(k)
94
- for i in range(0, n):
95
- req.update(i)
96
-
97
- self.assertEqual(req.get_min_value(), 0)
98
- self.assertEqual(req.get_max_value(), n-1)
99
- self.assertEqual(req.get_n(), n)
100
- self.assertFalse(req.is_empty())
101
- self.assertFalse(req.is_estimation_mode()) # n < k
102
- self.assertEqual(req.get_k(), k)
103
-
104
- pmf = req.get_pmf([round(n/2)])
105
- self.assertIsNotNone(pmf)
106
- self.assertEqual(len(pmf), 2)
107
-
108
- cdf = req.get_cdf([round(n/2)])
109
- self.assertIsNotNone(cdf)
110
- self.assertEqual(len(cdf), 2)
111
-
112
- self.assertEqual(req.get_quantile(0.5), round(n/2))
113
- quants = req.get_quantiles([0.25, 0.5, 0.75])
114
- self.assertIsNotNone(quants)
115
- self.assertEqual(len(quants), 3)
116
-
117
- self.assertEqual(req.get_rank(round(n/2)), 0.5)
118
-
119
- # merge self
120
- req_copy = req_ints_sketch(req)
121
- req.merge(req_copy)
122
- self.assertEqual(req.get_n(), 2 * n)
123
-
124
- sk_bytes = req.serialize()
125
- self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
126
-
127
- def test_req_floats_sketch(self):
128
- # already tested floats with LRA so just check that HRA works
129
- k = 75
130
- req = req_floats_sketch(k, False) # low rank accuracy
131
- self.assertTrue(req.is_empty())
132
- self.assertFalse(req.is_hra())
133
-
134
- def test_req_items_sketch(self):
135
- # most functionality has been tested, but we need to ensure objects and sorting work
136
- # as well as serialization
137
- k = 100
138
- n = 2 ** 16
139
-
140
- # create a sketch and inject enough points to force compaction
141
- req = req_items_sketch(k)
142
- for i in range(0, n):
143
- req.update(str(i))
144
-
145
- req_copy = req_items_sketch(req)
146
- req.merge(req_copy)
147
- self.assertEqual(req.get_n(), 2 * n)
148
-
149
- req_bytes = req.serialize(PyStringsSerDe())
150
- new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe())
151
- self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
152
- self.assertEqual(req.get_min_value(), new_req.get_min_value())
153
- self.assertEqual(req.get_max_value(), new_req.get_max_value())
154
- self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
155
- self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4)))
156
-
157
-
158
- if __name__ == '__main__':
159
- unittest.main()
@@ -1,148 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- import unittest
19
-
20
- from datasketches import theta_sketch, update_theta_sketch
21
- from datasketches import compact_theta_sketch, theta_union
22
- from datasketches import theta_intersection, theta_a_not_b
23
- from datasketches import theta_jaccard_similarity
24
-
25
- class ThetaTest(unittest.TestCase):
26
- def test_theta_basic_example(self):
27
- lgk = 12 # 2^k = 4096 rows in the table
28
- n = 1 << 18 # ~256k unique values
29
-
30
- # create a sketch and inject some values
31
- sk = self.generate_theta_sketch(n, lgk)
32
-
33
- # we can check that the upper and lower bounds bracket the
34
- # estimate, without needing to know the exact value.
35
- self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
36
- self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
37
-
38
- # because this sketch is deterministically generated, we can
39
- # also compare against the exact value
40
- self.assertLessEqual(sk.get_lower_bound(1), n)
41
- self.assertGreaterEqual(sk.get_upper_bound(1), n)
42
-
43
- # compact and serialize for storage, then reconstruct
44
- sk_bytes = sk.compact().serialize()
45
- new_sk = compact_theta_sketch.deserialize(sk_bytes)
46
-
47
- # estimate remains unchanged
48
- self.assertFalse(sk.is_empty())
49
- self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
50
-
51
- count = 0
52
- for hash in new_sk:
53
- self.assertLess(hash, new_sk.get_theta64())
54
- count = count + 1
55
- self.assertEqual(count, new_sk.get_num_retained())
56
-
57
- def test_theta_set_operations(self):
58
- lgk = 12 # 2^k = 4096 rows in the table
59
- n = 1 << 18 # ~256k unique values
60
-
61
- # we'll have 1/4 of the values overlap
62
- offset = int(3 * n / 4) # it's a float w/o cast
63
-
64
- # create a couple sketches and inject some values
65
- sk1 = self.generate_theta_sketch(n, lgk)
66
- sk2 = self.generate_theta_sketch(n, lgk, offset)
67
-
68
- # UNIONS
69
- # create a union object
70
- union = theta_union(lgk)
71
- union.update(sk1)
72
- union.update(sk2)
73
-
74
- # getting result from union returns a compact_theta_sketch
75
- # compact theta sketches can be used in additional unions
76
- # or set operations but cannot accept further item updates
77
- result = union.get_result()
78
- self.assertTrue(isinstance(result, compact_theta_sketch))
79
-
80
- # since our process here is deterministic, we have
81
- # checked and know the exact answer is within one
82
- # standard deviation of the estimate
83
- self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
84
- self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
85
-
86
- # INTERSECTIONS
87
- # create an intersection object
88
- intersect = theta_intersection() # no lg_k
89
- intersect.update(sk1)
90
- intersect.update(sk2)
91
-
92
- # has_result() indicates the intersection has been used,
93
- # although the result may be the empty set
94
- self.assertTrue(intersect.has_result())
95
-
96
- # as with unions, the result is a compact sketch
97
- result = intersect.get_result()
98
- self.assertTrue(isinstance(result, compact_theta_sketch))
99
-
100
- # we know the sets overlap by 1/4
101
- self.assertLessEqual(result.get_lower_bound(1), n / 4)
102
- self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
103
-
104
- # A NOT B
105
- # create an a_not_b object
106
- anb = theta_a_not_b() # no lg_k
107
- result = anb.compute(sk1, sk2)
108
-
109
- # as with unions, the result is a compact sketch
110
- self.assertTrue(isinstance(result, compact_theta_sketch))
111
-
112
- # we know the sets overlap by 1/4, so the remainder is 3/4
113
- self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
114
- self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
115
-
116
-
117
- # JACCARD SIMILARITY
118
- # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
119
- jac = theta_jaccard_similarity.jaccard(sk1, sk2)
120
-
121
- # we can check that results are in the expected order
122
- self.assertLess(jac[0], jac[1])
123
- self.assertLess(jac[1], jac[2])
124
-
125
- # checks for sketch equivalency
126
- self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1))
127
- self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2))
128
-
129
- # we can apply a check for similarity or dissimilarity at a
130
- # given threshhold, at 97.7% confidence.
131
-
132
- # check that the Jaccard Index is at most (upper bound) 0.2.
133
- # exact result would be 1/7
134
- self.assertTrue(theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
135
-
136
- # check that the Jaccard Index is at least (lower bound) 0.7
137
- # exact result would be 3/4, using result from A NOT B test
138
- self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
139
-
140
-
141
- def generate_theta_sketch(self, n, lgk, offset=0):
142
- sk = update_theta_sketch(lgk)
143
- for i in range(0, n):
144
- sk.update(i + offset)
145
- return sk
146
-
147
- if __name__ == '__main__':
148
- unittest.main()
@@ -1,206 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- import unittest
19
-
20
- from datasketches import update_tuple_sketch
21
- from datasketches import compact_tuple_sketch, tuple_union
22
- from datasketches import tuple_intersection, tuple_a_not_b
23
- from datasketches import tuple_jaccard_similarity
24
- from datasketches import tuple_jaccard_similarity, PyIntsSerDe
25
- from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy
26
- from datasketches import update_theta_sketch
27
-
28
- class TupleTest(unittest.TestCase):
29
- def test_tuple_basic_example(self):
30
- lgk = 12 # 2^k = 4096 rows in the table
31
- n = 1 << 18 # ~256k unique values
32
-
33
- # create a sketch and inject some values -- summary is 2 so we can sum them
34
- # and know the reuslt
35
- sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
36
-
37
- # we can check that the upper and lower bounds bracket the
38
- # estimate, without needing to know the exact value.
39
- self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
40
- self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
41
-
42
- # because this sketch is deterministically generated, we can
43
- # also compare against the exact value
44
- self.assertLessEqual(sk.get_lower_bound(1), n)
45
- self.assertGreaterEqual(sk.get_upper_bound(1), n)
46
-
47
- # compact and serialize for storage, then reconstruct
48
- sk_bytes = sk.compact().serialize(PyIntsSerDe())
49
- new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe())
50
-
51
- # estimate remains unchanged
52
- self.assertFalse(sk.is_empty())
53
- self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
54
-
55
- # we can also iterate over the sketch entries
56
- # the iterator provides a (hashkey, summary) pair where the
57
- # first value is the raw hash value and the second the summary
58
- count = 0
59
- cumSum = 0
60
- for pair in new_sk:
61
- self.assertLess(pair[0], new_sk.get_theta64())
62
- count += 1
63
- cumSum += pair[1]
64
- self.assertEqual(count, new_sk.get_num_retained())
65
- self.assertEqual(cumSum, 2 * new_sk.get_num_retained())
66
-
67
- # we can even create a tuple sketch from an existing theta sketch
68
- # as long as we provide a summary to use
69
- theta_sk = update_theta_sketch(lgk)
70
- for i in range(n, 2*n):
71
- theta_sk.update(i)
72
- cts = compact_tuple_sketch(theta_sk, 5)
73
- cumSum = 0
74
- for pair in cts:
75
- cumSum += pair[1]
76
- self.assertEqual(cumSum, 5 * cts.get_num_retained())
77
-
78
-
79
- def test_tuple_set_operations(self):
80
- lgk = 12 # 2^k = 4096 rows in the table
81
- n = 1 << 18 # ~256k unique values
82
-
83
- # we'll have 1/4 of the values overlap
84
- offset = int(3 * n / 4) # it's a float w/o cast
85
-
86
- # create a couple sketches and inject some values, with different summaries
87
- sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
88
- sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
89
-
90
- # UNIONS
91
- # create a union object
92
- union = tuple_union(MaxIntPolicy(), lgk)
93
- union.update(sk1)
94
- union.update(sk2)
95
-
96
- # getting result from union returns a compact_theta_sketch
97
- # compact theta sketches can be used in additional unions
98
- # or set operations but cannot accept further item updates
99
- result = union.get_result()
100
- self.assertTrue(isinstance(result, compact_tuple_sketch))
101
-
102
- # since our process here is deterministic, we have
103
- # checked and know the exact answer is within one
104
- # standard deviation of the estimate
105
- self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
106
- self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
107
-
108
- # we unioned two equal-sized sketches with overlap and used
109
- # the max value as the resulting summary, meaning we should
110
- # have more summaries with value 7 than value 5 in the result
111
- count5 = 0
112
- count7 = 0
113
- for pair in result:
114
- if pair[1] == 5:
115
- count5 += 1
116
- elif pair[1] == 7:
117
- count7 += 1
118
- else:
119
- self.fail()
120
- self.assertLess(count5, count7)
121
-
122
- # INTERSECTIONS
123
- # create an intersection object
124
- intersect = tuple_intersection(MinIntPolicy()) # no lg_k
125
- intersect.update(sk1)
126
- intersect.update(sk2)
127
-
128
- # has_result() indicates the intersection has been used,
129
- # although the result may be the empty set
130
- self.assertTrue(intersect.has_result())
131
-
132
- # as with unions, the result is a compact sketch
133
- result = intersect.get_result()
134
- self.assertTrue(isinstance(result, compact_tuple_sketch))
135
-
136
- # we know the sets overlap by 1/4
137
- self.assertLessEqual(result.get_lower_bound(1), n / 4)
138
- self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
139
-
140
- # in this example, we intersected the sketches and took the
141
- # min value as the resulting summary, so all summaries
142
- # must be exactly equal to that value
143
- count5 = 0
144
- for pair in result:
145
- if pair[1] == 5:
146
- count5 += 1
147
- else:
148
- self.fail()
149
- self.assertEqual(count5, result.get_num_retained())
150
-
151
- # A NOT B
152
- # create an a_not_b object
153
- anb = tuple_a_not_b() # no lg_k or policy
154
- result = anb.compute(sk1, sk2)
155
-
156
- # as with unions, the result is a compact sketch
157
- self.assertTrue(isinstance(result, compact_tuple_sketch))
158
-
159
- # we know the sets overlap by 1/4, so the remainder is 3/4
160
- self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
161
- self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
162
-
163
- # here, we have only values with a summary of 5 as any keys that
164
- # existed in both sketches were removed
165
- count5 = 0
166
- for pair in result:
167
- if pair[1] == 5:
168
- count5 += 1
169
- else:
170
- self.fail()
171
- self.assertEqual(count5, result.get_num_retained())
172
-
173
- # JACCARD SIMILARITY
174
- # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
175
- # and does not examine summaries, even for (dis)similarity tests.
176
- jac = tuple_jaccard_similarity.jaccard(sk1, sk2)
177
-
178
- # we can check that results are in the expected order
179
- self.assertLess(jac[0], jac[1])
180
- self.assertLess(jac[1], jac[2])
181
-
182
- # checks for sketch equivalence
183
- self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
184
- self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
185
-
186
- # we can apply a check for similarity or dissimilarity at a
187
- # given threshold, at 97.7% confidence.
188
-
189
- # check that the Jaccard Index is at most (upper bound) 0.2.
190
- # exact result would be 1/7
191
- self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
192
-
193
- # check that the Jaccard Index is at least (lower bound) 0.7
194
- # exact result would be 3/4, using result from A NOT B test
195
- self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
196
-
197
-
198
- # Generates a basic tuple sketch with a fixed value for each update
199
- def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
200
- sk = update_tuple_sketch(policy, lgk)
201
- for i in range(0, n):
202
- sk.update(i + offset, value)
203
- return sk
204
-
205
- if __name__ == '__main__':
206
- unittest.main()