datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,354 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "## Frequent Items Sketch Examples"
8
- ]
9
- },
10
- {
11
- "cell_type": "markdown",
12
- "metadata": {},
13
- "source": [
14
- "### Basic Sketch Usage"
15
- ]
16
- },
17
- {
18
- "cell_type": "markdown",
19
- "metadata": {},
20
- "source": [
21
- "More so than other sketches in the library, the Frequent Items sketch can take some practice to use since it identifies exceptionally heavy hitters rather than returning a \"top N\" list. We assume readers have already familiarized themselves with the [sketch documentation](https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html) and are aware of the key concepts around use of this sketch."
22
- ]
23
- },
24
- {
25
- "cell_type": "code",
26
- "execution_count": 2,
27
- "metadata": {},
28
- "outputs": [],
29
- "source": [
30
- "from datasketches import frequent_strings_sketch, frequent_items_error_type"
31
- ]
32
- },
33
- {
34
- "cell_type": "markdown",
35
- "metadata": {},
36
- "source": [
37
- "We'll use a very small sketch in this case so that we can easily fill it, otherwise the difference between error types is more difficult to demonstrate."
38
- ]
39
- },
40
- {
41
- "cell_type": "code",
42
- "execution_count": 3,
43
- "metadata": {},
44
- "outputs": [],
45
- "source": [
46
- "k = 3\n",
47
- "fi = frequent_strings_sketch(k)"
48
- ]
49
- },
50
- {
51
- "cell_type": "markdown",
52
- "metadata": {},
53
- "source": [
54
- "A brief digression into implementation details to help explain what we're doing here. The Frequent Items sketch maintains a list of items, but purges the least frequent items when the list fills. For this example, we'll keep inserting items until after a purge takes place.\n",
55
- "\n",
56
- "We'll insert items with exponentially decreasing weights, which in this case gives us a more interesting set of results when we later query things."
57
- ]
58
- },
59
- {
60
- "cell_type": "code",
61
- "execution_count": 4,
62
- "metadata": {},
63
- "outputs": [
64
- {
65
- "name": "stdout",
66
- "output_type": "stream",
67
- "text": [
68
- "Update 1: 1 items\n",
69
- "Update 2: 2 items\n",
70
- "Update 3: 3 items\n",
71
- "Update 4: 4 items\n",
72
- "Update 5: 5 items\n",
73
- "Update 6: 6 items\n",
74
- "Update 7: 3 items\n",
75
- "Update 8: 4 items\n"
76
- ]
77
- }
78
- ],
79
- "source": [
80
- "n = 8\n",
81
- "for i in range(0,n):\n",
82
- " fi.update(str(i), 2 ** (n-i))\n",
83
- " i += 1\n",
84
- " print('Update ' + str(i) + ': ' + str(fi.get_num_active_items()) + ' items')"
85
- ]
86
- },
87
- {
88
- "cell_type": "markdown",
89
- "metadata": {},
90
- "source": [
91
- "We can see where the purge happened, and in this case we inserted a low-weight item after the purge. We can now compare querying items to exclude either false positives or false negatives.\n",
92
- " - `NO_FALSE_POSITIVES` returns all items with a _lower_ bound above the a posteriori error\n",
93
- " - `NO_FALSE_NEGATIVES` returns all items with an _upper_ bound above the a posteriori error\n",
94
- "\n",
95
- "The latter option will always include any results from the first set and may include others. Items are returned as (id, estimate, lower_bound, upper_bound) and are sorted by decreasing weight."
96
- ]
97
- },
98
- {
99
- "cell_type": "code",
100
- "execution_count": 5,
101
- "metadata": {},
102
- "outputs": [
103
- {
104
- "data": {
105
- "text/plain": [
106
- "[('0', 256, 224, 256), ('1', 128, 96, 128)]"
107
- ]
108
- },
109
- "execution_count": 5,
110
- "metadata": {},
111
- "output_type": "execute_result"
112
- }
113
- ],
114
- "source": [
115
- "fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)"
116
- ]
117
- },
118
- {
119
- "cell_type": "code",
120
- "execution_count": 6,
121
- "metadata": {},
122
- "outputs": [
123
- {
124
- "data": {
125
- "text/plain": [
126
- "[('0', 256, 224, 256),\n",
127
- " ('1', 128, 96, 128),\n",
128
- " ('2', 64, 32, 64),\n",
129
- " ('7', 34, 2, 34)]"
130
- ]
131
- },
132
- "execution_count": 6,
133
- "metadata": {},
134
- "output_type": "execute_result"
135
- }
136
- ],
137
- "source": [
138
- "fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)"
139
- ]
140
- },
141
- {
142
- "cell_type": "markdown",
143
- "metadata": {},
144
- "source": [
145
- "The sketch also allows us to query for individual items directly."
146
- ]
147
- },
148
- {
149
- "cell_type": "code",
150
- "execution_count": 7,
151
- "metadata": {},
152
- "outputs": [
153
- {
154
- "name": "stdout",
155
- "output_type": "stream",
156
- "text": [
157
- "256\n",
158
- "64\n",
159
- "2\n"
160
- ]
161
- }
162
- ],
163
- "source": [
164
- "print(fi.get_estimate(\"0\"))\n",
165
- "print(fi.get_upper_bound(\"2\"))\n",
166
- "print(fi.get_lower_bound(\"7\"))"
167
- ]
168
- },
169
- {
170
- "cell_type": "markdown",
171
- "metadata": {},
172
- "source": [
173
- "We can also query for items not in the list, whether the item has never been seen or if it has been evicted from the active set."
174
- ]
175
- },
176
- {
177
- "cell_type": "code",
178
- "execution_count": 8,
179
- "metadata": {},
180
- "outputs": [
181
- {
182
- "data": {
183
- "text/plain": [
184
- "0"
185
- ]
186
- },
187
- "execution_count": 8,
188
- "metadata": {},
189
- "output_type": "execute_result"
190
- }
191
- ],
192
- "source": [
193
- "fi.get_estimate(\"5\")"
194
- ]
195
- },
196
- {
197
- "cell_type": "markdown",
198
- "metadata": {},
199
- "source": [
200
- "The sketch may also be serialized for archiving, and reconstructed."
201
- ]
202
- },
203
- {
204
- "cell_type": "code",
205
- "execution_count": 9,
206
- "metadata": {},
207
- "outputs": [
208
- {
209
- "data": {
210
- "text/plain": [
211
- "84"
212
- ]
213
- },
214
- "execution_count": 9,
215
- "metadata": {},
216
- "output_type": "execute_result"
217
- }
218
- ],
219
- "source": [
220
- "sk_bytes = fi.serialize()\n",
221
- "len(sk_bytes)"
222
- ]
223
- },
224
- {
225
- "cell_type": "code",
226
- "execution_count": 11,
227
- "metadata": {},
228
- "outputs": [
229
- {
230
- "name": "stdout",
231
- "output_type": "stream",
232
- "text": [
233
- "### Frequent items sketch summary:\n",
234
- " lg cur map size : 3\n",
235
- " lg max map size : 3\n",
236
- " num active items : 4\n",
237
- " total weight : 510\n",
238
- " max error : 32\n",
239
- "### End sketch summary\n",
240
- "\n"
241
- ]
242
- }
243
- ],
244
- "source": [
245
- "fi2 = frequent_strings_sketch.deserialize(sk_bytes)\n",
246
- "print(fi2)"
247
- ]
248
- },
249
- {
250
- "cell_type": "markdown",
251
- "metadata": {},
252
- "source": [
253
- "### Merging Example"
254
- ]
255
- },
256
- {
257
- "cell_type": "markdown",
258
- "metadata": {},
259
- "source": [
260
- "Frequent Items sketches support `merge()` to combine sketches. Keep in mind that the combined sketches may not have any meaningfully frequent items, even if there were frequent items in one of the input sketches.\n",
261
- "\n",
262
- "We'll start by creating a sketch with lots of equally-weighted very light items, but with a combined weight several times greater than that of the first sketch, and then merge that into the first sketch."
263
- ]
264
- },
265
- {
266
- "cell_type": "code",
267
- "execution_count": 12,
268
- "metadata": {},
269
- "outputs": [],
270
- "source": [
271
- "fi2 = frequent_strings_sketch(k)\n",
272
- "wt = fi.get_total_weight()\n",
273
- "for i in range(0,4*wt):\n",
274
- " fi2.update(str(i))\n",
275
- "fi.merge(fi2)"
276
- ]
277
- },
278
- {
279
- "cell_type": "markdown",
280
- "metadata": {},
281
- "source": [
282
- "Even though all these new items have weight 1, there are so many of them that we have nothing if we ask for no false positives."
283
- ]
284
- },
285
- {
286
- "cell_type": "code",
287
- "execution_count": 13,
288
- "metadata": {},
289
- "outputs": [
290
- {
291
- "data": {
292
- "text/plain": [
293
- "0"
294
- ]
295
- },
296
- "execution_count": 13,
297
- "metadata": {},
298
- "output_type": "execute_result"
299
- }
300
- ],
301
- "source": [
302
- "len(fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES))"
303
- ]
304
- },
305
- {
306
- "cell_type": "markdown",
307
- "metadata": {},
308
- "source": [
309
- "We do, however, see a few potentially heavy items if we request no false negatives."
310
- ]
311
- },
312
- {
313
- "cell_type": "code",
314
- "execution_count": 14,
315
- "metadata": {},
316
- "outputs": [
317
- {
318
- "data": {
319
- "text/plain": [
320
- "3"
321
- ]
322
- },
323
- "execution_count": 14,
324
- "metadata": {},
325
- "output_type": "execute_result"
326
- }
327
- ],
328
- "source": [
329
- "len(fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES))"
330
- ]
331
- }
332
- ],
333
- "metadata": {
334
- "kernelspec": {
335
- "display_name": "Python 3",
336
- "language": "python",
337
- "name": "python3"
338
- },
339
- "language_info": {
340
- "codemirror_mode": {
341
- "name": "ipython",
342
- "version": 3
343
- },
344
- "file_extension": ".py",
345
- "mimetype": "text/x-python",
346
- "name": "python",
347
- "nbconvert_exporter": "python",
348
- "pygments_lexer": "ipython3",
349
- "version": "3.7.0"
350
- }
351
- },
352
- "nbformat": 4,
353
- "nbformat_minor": 2
354
- }
@@ -1,346 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "## HLL Sketch Examples"
8
- ]
9
- },
10
- {
11
- "cell_type": "markdown",
12
- "metadata": {},
13
- "source": [
14
- "### Basic Sketch Usage"
15
- ]
16
- },
17
- {
18
- "cell_type": "code",
19
- "execution_count": 1,
20
- "metadata": {},
21
- "outputs": [],
22
- "source": [
23
- "from datasketches import hll_sketch, hll_union, tgt_hll_type"
24
- ]
25
- },
26
- {
27
- "cell_type": "markdown",
28
- "metadata": {},
29
- "source": [
30
- "We'll create a sketch with log2(k) = 12"
31
- ]
32
- },
33
- {
34
- "cell_type": "code",
35
- "execution_count": 2,
36
- "metadata": {},
37
- "outputs": [],
38
- "source": [
39
- "sk = hll_sketch(12)"
40
- ]
41
- },
42
- {
43
- "cell_type": "markdown",
44
- "metadata": {},
45
- "source": [
46
- "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
47
- ]
48
- },
49
- {
50
- "cell_type": "code",
51
- "execution_count": 3,
52
- "metadata": {},
53
- "outputs": [
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "### HLL SKETCH SUMMARY: \n",
59
- " Log Config K : 12\n",
60
- " Hll Target : HLL_4\n",
61
- " Current Mode : HLL\n",
62
- " LB : 2.06958e+06\n",
63
- " Estimate : 2.09635e+06\n",
64
- " UB : 2.12379e+06\n",
65
- " OutOfOrder flag: 0\n",
66
- " CurMin : 7\n",
67
- " NumAtCurMin : 72\n",
68
- " HipAccum : 2.09635e+06\n",
69
- " KxQ0 : 5.80703\n",
70
- " KxQ1 : 0\n",
71
- "\n"
72
- ]
73
- }
74
- ],
75
- "source": [
76
- "n = 1 << 21\n",
77
- "for i in range(0, n):\n",
78
- " sk.update(i)\n",
79
- "print(sk)"
80
- ]
81
- },
82
- {
83
- "cell_type": "markdown",
84
- "metadata": {},
85
- "source": [
86
- "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
87
- ]
88
- },
89
- {
90
- "cell_type": "code",
91
- "execution_count": 4,
92
- "metadata": {},
93
- "outputs": [
94
- {
95
- "name": "stdout",
96
- "output_type": "stream",
97
- "text": [
98
- "Upper bound (1 std. dev) as % of true value: 101.2703\n"
99
- ]
100
- }
101
- ],
102
- "source": [
103
- "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
104
- ]
105
- },
106
- {
107
- "cell_type": "code",
108
- "execution_count": 5,
109
- "metadata": {},
110
- "outputs": [
111
- {
112
- "name": "stdout",
113
- "output_type": "stream",
114
- "text": [
115
- "Estimate as % of true value: 99.9618\n"
116
- ]
117
- }
118
- ],
119
- "source": [
120
- "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
121
- ]
122
- },
123
- {
124
- "cell_type": "code",
125
- "execution_count": 6,
126
- "metadata": {},
127
- "outputs": [
128
- {
129
- "name": "stdout",
130
- "output_type": "stream",
131
- "text": [
132
- "Lower bound (1 std. dev) as % of true value: 98.6852\n"
133
- ]
134
- }
135
- ],
136
- "source": [
137
- "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
138
- ]
139
- },
140
- {
141
- "cell_type": "markdown",
142
- "metadata": {},
143
- "source": [
144
- "Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
145
- ]
146
- },
147
- {
148
- "cell_type": "code",
149
- "execution_count": 7,
150
- "metadata": {},
151
- "outputs": [
152
- {
153
- "data": {
154
- "text/plain": [
155
- "2096"
156
- ]
157
- },
158
- "execution_count": 7,
159
- "metadata": {},
160
- "output_type": "execute_result"
161
- }
162
- ],
163
- "source": [
164
- "sk_bytes = sk.serialize_compact()\n",
165
- "len(sk_bytes)"
166
- ]
167
- },
168
- {
169
- "cell_type": "code",
170
- "execution_count": 8,
171
- "metadata": {},
172
- "outputs": [
173
- {
174
- "name": "stdout",
175
- "output_type": "stream",
176
- "text": [
177
- "### HLL SKETCH SUMMARY: \n",
178
- " Log Config K : 12\n",
179
- " Hll Target : HLL_4\n",
180
- " Current Mode : HLL\n",
181
- " LB : 2.06958e+06\n",
182
- " Estimate : 2.09635e+06\n",
183
- " UB : 2.12379e+06\n",
184
- " OutOfOrder flag: 0\n",
185
- " CurMin : 7\n",
186
- " NumAtCurMin : 72\n",
187
- " HipAccum : 2.09635e+06\n",
188
- " KxQ0 : 5.80703\n",
189
- " KxQ1 : 0\n",
190
- "\n"
191
- ]
192
- }
193
- ],
194
- "source": [
195
- "sk2 = hll_sketch.deserialize(sk_bytes)\n",
196
- "print(sk2)"
197
- ]
198
- },
199
- {
200
- "cell_type": "markdown",
201
- "metadata": {},
202
- "source": [
203
- "### Sketch Union Usage"
204
- ]
205
- },
206
- {
207
- "cell_type": "markdown",
208
- "metadata": {},
209
- "source": [
210
- "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
211
- ]
212
- },
213
- {
214
- "cell_type": "code",
215
- "execution_count": 9,
216
- "metadata": {},
217
- "outputs": [],
218
- "source": [
219
- "k = 12\n",
220
- "n = 1 << 20\n",
221
- "offset = int(3 * n / 4)"
222
- ]
223
- },
224
- {
225
- "cell_type": "code",
226
- "execution_count": 10,
227
- "metadata": {},
228
- "outputs": [],
229
- "source": [
230
- "sk1 = hll_sketch(k)\n",
231
- "sk2 = hll_sketch(k + 1)\n",
232
- "for i in range(0, n):\n",
233
- " sk1.update(i)\n",
234
- " sk2.update(i + offset)"
235
- ]
236
- },
237
- {
238
- "cell_type": "markdown",
239
- "metadata": {},
240
- "source": [
241
- "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
242
- ]
243
- },
244
- {
245
- "cell_type": "code",
246
- "execution_count": 11,
247
- "metadata": {},
248
- "outputs": [],
249
- "source": [
250
- "union = hll_union(k+1)\n",
251
- "union.update(sk1)\n",
252
- "union.update(sk2)"
253
- ]
254
- },
255
- {
256
- "cell_type": "markdown",
257
- "metadata": {},
258
- "source": [
259
- "Note how log config k has automatically adopted the value of the smaller input sketch."
260
- ]
261
- },
262
- {
263
- "cell_type": "code",
264
- "execution_count": 12,
265
- "metadata": {},
266
- "outputs": [
267
- {
268
- "name": "stdout",
269
- "output_type": "stream",
270
- "text": [
271
- "### HLL SKETCH SUMMARY: \n",
272
- " Log Config K : 12\n",
273
- " Hll Target : HLL_4\n",
274
- " Current Mode : HLL\n",
275
- " LB : 1.80197e+06\n",
276
- " Estimate : 1.83108e+06\n",
277
- " UB : 1.86121e+06\n",
278
- " OutOfOrder flag: 1\n",
279
- " CurMin : 6\n",
280
- " NumAtCurMin : 2\n",
281
- " HipAccum : 1.76932e+06\n",
282
- " KxQ0 : 6.60752\n",
283
- " KxQ1 : 0\n",
284
- "\n"
285
- ]
286
- }
287
- ],
288
- "source": [
289
- "result = union.get_result()\n",
290
- "print(result)"
291
- ]
292
- },
293
- {
294
- "cell_type": "markdown",
295
- "metadata": {},
296
- "source": [
297
- "We can again compare against the exact result, in this case 1.75*n"
298
- ]
299
- },
300
- {
301
- "cell_type": "code",
302
- "execution_count": 13,
303
- "metadata": {},
304
- "outputs": [
305
- {
306
- "name": "stdout",
307
- "output_type": "stream",
308
- "text": [
309
- "Estimate as % of true value: 99.7859\n"
310
- ]
311
- }
312
- ],
313
- "source": [
314
- "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
315
- ]
316
- },
317
- {
318
- "cell_type": "code",
319
- "execution_count": null,
320
- "metadata": {},
321
- "outputs": [],
322
- "source": []
323
- }
324
- ],
325
- "metadata": {
326
- "kernelspec": {
327
- "display_name": "Python 3",
328
- "language": "python",
329
- "name": "python3"
330
- },
331
- "language_info": {
332
- "codemirror_mode": {
333
- "name": "ipython",
334
- "version": 3
335
- },
336
- "file_extension": ".py",
337
- "mimetype": "text/x-python",
338
- "name": "python",
339
- "nbconvert_exporter": "python",
340
- "pygments_lexer": "ipython3",
341
- "version": "3.7.0"
342
- }
343
- },
344
- "nbformat": 4,
345
- "nbformat_minor": 2
346
- }