datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -21,9 +21,26 @@
21
21
  #define THETA_SKETCH_HPP_
22
22
 
23
23
  #include "theta_update_sketch_base.hpp"
24
+ #include "compact_theta_sketch_parser.hpp"
24
25
 
25
26
  namespace datasketches {
26
27
 
28
+ // forward declarations
29
+ template<typename A> class theta_sketch_alloc;
30
+ template<typename A> class update_theta_sketch_alloc;
31
+ template<typename A> class compact_theta_sketch_alloc;
32
+ template<typename A> class wrapped_compact_theta_sketch_alloc;
33
+
34
+ /// Theta sketch alias with default allocator
35
+ using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
36
+ /// Update Theta sketch alias with default allocator
37
+ using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
38
+ /// Compact Theta sketch alias with default allocator
39
+ using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
40
+ /// Wrapped Compact Theta sketch alias with default allocator
41
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
42
+
43
+ /// Abstract base class for Theta sketch
27
44
  template<typename Allocator = std::allocator<uint64_t>>
28
45
  class base_theta_sketch_alloc {
29
46
  public:
@@ -105,6 +122,7 @@ protected:
105
122
  virtual void print_items(std::ostringstream& os) const = 0;
106
123
  };
107
124
 
125
+ /// Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
108
126
  template<typename Allocator = std::allocator<uint64_t>>
109
127
  class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
110
128
  public:
@@ -148,6 +166,11 @@ protected:
148
166
  // forward declaration
149
167
  template<typename A> class compact_theta_sketch_alloc;
150
168
 
169
+ /**
170
+ * Update Theta sketch.
171
+ * The purpose of this class is to build a Theta sketch from input data via the update() methods.
172
+ * No constructor is public except copy and move. Use builder to create an instance.
173
+ */
151
174
  template<typename Allocator = std::allocator<uint64_t>>
152
175
  class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
153
176
  public:
@@ -162,11 +185,33 @@ public:
162
185
  // No constructor here. Use builder instead.
163
186
  class builder;
164
187
 
165
- update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
166
- update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
188
+ /**
189
+ * Copy constructor
190
+ * @param other sketch to be copied
191
+ */
192
+ update_theta_sketch_alloc(const update_theta_sketch_alloc& other) = default;
193
+
194
+ /**
195
+ * Move constructor
196
+ * @param other sketch to be moved
197
+ */
198
+ update_theta_sketch_alloc(update_theta_sketch_alloc&& other) noexcept = default;
199
+
167
200
  virtual ~update_theta_sketch_alloc() = default;
168
- update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
169
- update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
201
+
202
+ /**
203
+ * Copy assignment
204
+ * @param other sketch to be copied
205
+ * @return reference to this sketch
206
+ */
207
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc& other) = default;
208
+
209
+ /**
210
+ * Move assignment
211
+ * @param other sketch to be moved
212
+ * @return reference to this sketch
213
+ */
214
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&& other) = default;
170
215
 
171
216
  virtual Allocator get_allocator() const;
172
217
  virtual bool is_empty() const;
@@ -286,7 +331,7 @@ public:
286
331
 
287
332
  /**
288
333
  * Converts this sketch to a compact sketch (ordered or unordered).
289
- * @param ordered optional flag to specify if ordered sketch should be produced
334
+ * @param ordered optional flag to specify if an ordered sketch should be produced
290
335
  * @return compact sketch
291
336
  */
292
337
  compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
@@ -306,8 +351,10 @@ private:
306
351
  virtual void print_specifics(std::ostringstream& os) const;
307
352
  };
308
353
 
309
- // compact sketch
310
-
354
+ /**
355
+ * Compact Theta sketch.
356
+ * This is an immutable form of the Theta sketch, the form that can be serialized and deserialized.
357
+ */
311
358
  template<typename Allocator = std::allocator<uint64_t>>
312
359
  class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
313
360
  public:
@@ -317,7 +364,8 @@ public:
317
364
  using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
318
365
  using vector_bytes = std::vector<uint8_t, AllocBytes>;
319
366
 
320
- static const uint8_t SERIAL_VERSION = 3;
367
+ static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
368
+ static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
321
369
  static const uint8_t SKETCH_TYPE = 3;
322
370
 
323
371
  // Instances of this type can be obtained:
@@ -325,13 +373,42 @@ public:
325
373
  // - as a result of a set operation
326
374
  // - by deserializing a previously serialized compact sketch
327
375
 
376
+ /**
377
+ * Converting constructor.
378
+ * Constructs a compact sketch from any other type of Theta sketch
379
+ * @param other sketch to be constructed from
380
+ * @param ordered if true make the resulting sketch ordered
381
+ */
328
382
  template<typename Other>
329
383
  compact_theta_sketch_alloc(const Other& other, bool ordered);
330
- compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
331
- compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
384
+
385
+ /**
386
+ * Copy constructor
387
+ * @param other sketch to be copied
388
+ */
389
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc& other) = default;
390
+
391
+ /**
392
+ * Move constructor
393
+ * @param other sketch to be moved
394
+ */
395
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&& other) noexcept = default;
396
+
332
397
  virtual ~compact_theta_sketch_alloc() = default;
333
- compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
334
- compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
398
+
399
+ /**
400
+ * Copy assignment
401
+ * @param other sketch to be copied
402
+ * @return reference to this sketch
403
+ */
404
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc& other) = default;
405
+
406
+ /**
407
+ * Move assignment
408
+ * @param other sketch to be moved
409
+ * @return reference to this sketch
410
+ */
411
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&& other) = default;
335
412
 
336
413
  virtual Allocator get_allocator() const;
337
414
  virtual bool is_empty() const;
@@ -355,6 +432,25 @@ public:
355
432
  */
356
433
  vector_bytes serialize(unsigned header_size_bytes = 0) const;
357
434
 
435
+ /**
436
+ * This method serializes the sketch into a given stream in a compressed binary form.
437
+ * Compression is applied to ordered sketches, except empty and single-item ones.
438
+ * For unordered, empty and single-item sketches this method is equivalent to serialize().
439
+ * @param os output stream
440
+ */
441
+ void serialize_compressed(std::ostream& os) const;
442
+
443
+ /**
444
+ * This method serializes the sketch as a vector of bytes.
445
+ * An optional header can be reserved in front of the sketch.
446
+ * It is an uninitialized space of a given size.
447
+ * This header is used in Datasketches PostgreSQL extension.
448
+ * Compression is applied to ordered sketches, except empty and single-item ones.
449
+ * For unordered, empty and single-item sketches this method is equivalent to serialize().
450
+ * @param header_size_bytes space to reserve in front of the sketch
451
+ */
452
+ vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
453
+
358
454
  virtual iterator begin();
359
455
  virtual iterator end();
360
456
  virtual const_iterator begin() const;
@@ -364,6 +460,7 @@ public:
364
460
  * This method deserializes a sketch from a given stream.
365
461
  * @param is input stream
366
462
  * @param seed the seed for the hash function that was used to create the sketch
463
+ * @param allocator instance of an Allocator
367
464
  * @return an instance of the sketch
368
465
  */
369
466
  static compact_theta_sketch_alloc deserialize(std::istream& is,
@@ -374,14 +471,12 @@ public:
374
471
  * @param bytes pointer to the array of bytes
375
472
  * @param size the size of the array
376
473
  * @param seed the seed for the hash function that was used to create the sketch
474
+ * @param allocator instance of an Allocator
377
475
  * @return an instance of the sketch
378
476
  */
379
477
  static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
380
478
  uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
381
479
 
382
- // for internal use
383
- compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
384
-
385
480
  private:
386
481
  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
387
482
 
@@ -391,23 +486,46 @@ private:
391
486
  uint64_t theta_;
392
487
  std::vector<uint64_t, Allocator> entries_;
393
488
 
489
+ bool is_suitable_for_compression() const;
490
+ uint8_t compute_min_leading_zeros() const;
491
+ void serialize_version_4(std::ostream& os) const;
492
+ vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
493
+
494
+ static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
495
+ static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
496
+ static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
497
+ static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
498
+
394
499
  virtual void print_specifics(std::ostringstream& os) const;
500
+
501
+ template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
502
+ template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
503
+ template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
504
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
395
505
  };
396
506
 
507
+ /// Update Theta sketch builder
397
508
  template<typename Allocator>
398
509
  class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
399
510
  public:
511
+ /**
512
+ * Constructor
513
+ * @param allocator instance of an Allocator
514
+ */
400
515
  builder(const Allocator& allocator = Allocator());
516
+ /// @return instance of Update Theta sketch
401
517
  update_theta_sketch_alloc build() const;
402
518
  };
403
519
 
404
- // This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
405
- // It does not take the ownership of the buffer.
406
-
520
+ /**
521
+ * Wrapped Compact Theta sketch.
522
+ * This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
523
+ * It does not take the ownership of the buffer.
524
+ */
407
525
  template<typename Allocator = std::allocator<uint64_t>>
408
- class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
526
+ class wrapped_compact_theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
409
527
  public:
410
- using const_iterator = const uint64_t*;
528
+ class const_iterator;
411
529
 
412
530
  Allocator get_allocator() const;
413
531
  bool is_empty() const;
@@ -416,7 +534,17 @@ public:
416
534
  uint32_t get_num_retained() const;
417
535
  uint16_t get_seed_hash() const;
418
536
 
537
+ /**
538
+ * Const iterator over hash values in this sketch.
539
+ * @return begin iterator
540
+ */
419
541
  const_iterator begin() const;
542
+
543
+ /**
544
+ * Const iterator pointing past the valid range.
545
+ * Not to be incremented or dereferenced.
546
+ * @return end iterator
547
+ */
420
548
  const_iterator end() const;
421
549
 
422
550
  /**
@@ -424,6 +552,7 @@ public:
424
552
  * @param bytes pointer to the array of bytes
425
553
  * @param size the size of the array
426
554
  * @param seed the seed for the hash function that was used to create the sketch
555
+ * @param dump_on_error if true prints hex dump of the input
427
556
  * @return an instance of the sketch
428
557
  */
429
558
  static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
@@ -433,22 +562,40 @@ protected:
433
562
  virtual void print_items(std::ostringstream& os) const;
434
563
 
435
564
  private:
436
- bool is_empty_;
437
- bool is_ordered_;
438
- uint16_t seed_hash_;
439
- uint32_t num_entries_;
440
- uint64_t theta_;
441
- const uint64_t* entries_;
565
+ using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
566
+ data_type data_;
442
567
 
443
- wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
444
- uint64_t theta, const uint64_t* entries);
568
+ wrapped_compact_theta_sketch_alloc(const data_type& data);
445
569
  };
446
570
 
447
- // aliases with default allocator for convenience
448
- using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
449
- using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
450
- using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
451
- using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
571
+ template<typename Allocator>
572
+ class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
573
+ public:
574
+ using iterator_category = std::input_iterator_tag;
575
+ using value_type = const uint64_t;
576
+ using difference_type = void;
577
+ using pointer = value_type*;
578
+ using reference = uint64_t;
579
+
580
+ const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
581
+ const_iterator& operator++();
582
+ const_iterator operator++(int);
583
+ bool operator==(const const_iterator& other) const;
584
+ bool operator!=(const const_iterator& other) const;
585
+ reference operator*() const;
586
+ pointer operator->() const;
587
+
588
+ private:
589
+ const void* ptr_;
590
+ uint8_t entry_bits_;
591
+ uint32_t num_entries_;
592
+ uint32_t index_;
593
+ uint64_t previous_;
594
+ bool is_block_mode_;
595
+ uint8_t buf_i_;
596
+ uint8_t offset_;
597
+ uint64_t buffer_[8];
598
+ };
452
599
 
453
600
  } /* namespace datasketches */
454
601