datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -27,7 +27,8 @@
27
27
  #include "serde.hpp"
28
28
  #include "binomial_bounds.hpp"
29
29
  #include "theta_helpers.hpp"
30
- #include "compact_theta_sketch_parser.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "bit_packing.hpp"
31
32
 
32
33
  namespace datasketches {
33
34
 
@@ -38,7 +39,8 @@ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
38
39
 
39
40
  template<typename A>
40
41
  double base_theta_sketch_alloc<A>::get_theta() const {
41
- return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
42
+ return static_cast<double>(get_theta64()) /
43
+ static_cast<double>(theta_constants::MAX_THETA);
42
44
  }
43
45
 
44
46
  template<typename A>
@@ -343,12 +345,9 @@ template<typename A>
343
345
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
344
346
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
345
347
  write(os, preamble_longs);
346
- const uint8_t serial_version = SERIAL_VERSION;
347
- write(os, serial_version);
348
- const uint8_t type = SKETCH_TYPE;
349
- write(os, type);
350
- const uint16_t unused16 = 0;
351
- write(os, unused16);
348
+ write(os, UNCOMPRESSED_SERIAL_VERSION);
349
+ write(os, SKETCH_TYPE);
350
+ write<uint16_t>(os, 0); // unused
352
351
  const uint8_t flags_byte(
353
352
  (1 << flags::IS_COMPACT) |
354
353
  (1 << flags::IS_READ_ONLY) |
@@ -356,13 +355,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
356
355
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
357
356
  );
358
357
  write(os, flags_byte);
359
- const uint16_t seed_hash = get_seed_hash();
360
- write(os, seed_hash);
358
+ write(os, get_seed_hash());
361
359
  if (preamble_longs > 1) {
362
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
363
- write(os, num_entries);
364
- const uint32_t unused32 = 0;
365
- write(os, unused32);
360
+ write(os, static_cast<uint32_t>(entries_.size()));
361
+ write<uint32_t>(os, 0); // unused
366
362
  }
367
363
  if (this->is_estimation_mode()) write(os, this->theta_);
368
364
  if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
@@ -376,11 +372,9 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
376
372
  vector_bytes bytes(size, 0, entries_.get_allocator());
377
373
  uint8_t* ptr = bytes.data() + header_size_bytes;
378
374
 
379
- ptr += copy_to_mem(preamble_longs, ptr);
380
- const uint8_t serial_version = SERIAL_VERSION;
381
- ptr += copy_to_mem(serial_version, ptr);
382
- const uint8_t type = SKETCH_TYPE;
383
- ptr += copy_to_mem(type, ptr);
375
+ *ptr++ = preamble_longs;
376
+ *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
377
+ *ptr++ = SKETCH_TYPE;
384
378
  ptr += sizeof(uint16_t); // unused
385
379
  const uint8_t flags_byte(
386
380
  (1 << flags::IS_COMPACT) |
@@ -388,12 +382,10 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
388
382
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
389
383
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
390
384
  );
391
- ptr += copy_to_mem(flags_byte, ptr);
392
- const uint16_t seed_hash = get_seed_hash();
393
- ptr += copy_to_mem(seed_hash, ptr);
385
+ *ptr++ = flags_byte;
386
+ ptr += copy_to_mem(get_seed_hash(), ptr);
394
387
  if (preamble_longs > 1) {
395
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
396
- ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += copy_to_mem(static_cast<uint32_t>(entries_.size()), ptr);
397
389
  ptr += sizeof(uint32_t); // unused
398
390
  }
399
391
  if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
@@ -401,131 +393,342 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
401
393
  return bytes;
402
394
  }
403
395
 
396
+ template<typename A>
397
+ bool compact_theta_sketch_alloc<A>::is_suitable_for_compression() const {
398
+ if (!this->is_ordered() || entries_.size() == 0 ||
399
+ (entries_.size() == 1 && !this->is_estimation_mode())) return false;
400
+ return true;
401
+ }
402
+
403
+ template<typename A>
404
+ void compact_theta_sketch_alloc<A>::serialize_compressed(std::ostream& os) const {
405
+ if (is_suitable_for_compression()) return serialize_version_4(os);
406
+ return serialize(os);
407
+ }
408
+
409
+ template<typename A>
410
+ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
411
+ if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
412
+ return serialize(header_size_bytes);
413
+ }
414
+
415
+ template<typename A>
416
+ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
417
+ // compression is based on leading zeros in deltas between ordered hash values
418
+ // assumes ordered sketch
419
+ uint64_t previous = 0;
420
+ uint64_t ored = 0;
421
+ for (const uint64_t entry: entries_) {
422
+ const uint64_t delta = entry - previous;
423
+ ored |= delta;
424
+ previous = entry;
425
+ }
426
+ return count_leading_zeros_in_u64(ored);
427
+ }
428
+
429
+ template<typename A>
430
+ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
431
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
432
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
433
+
434
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
435
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
436
+
437
+ write(os, preamble_longs);
438
+ write(os, COMPRESSED_SERIAL_VERSION);
439
+ write(os, SKETCH_TYPE);
440
+ write(os, entry_bits);
441
+ write(os, num_entries_bytes);
442
+ const uint8_t flags_byte(
443
+ (1 << flags::IS_COMPACT) |
444
+ (1 << flags::IS_READ_ONLY) |
445
+ (1 << flags::IS_ORDERED)
446
+ );
447
+ write(os, flags_byte);
448
+ write(os, get_seed_hash());
449
+ if (this->is_estimation_mode()) write(os, this->theta_);
450
+ uint32_t num_entries = static_cast<uint32_t>(entries_.size());
451
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
452
+ write<uint8_t>(os, num_entries & 0xff);
453
+ num_entries >>= 8;
454
+ }
455
+
456
+ uint64_t previous = 0;
457
+ uint64_t deltas[8];
458
+ vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
459
+
460
+ // pack blocks of 8 deltas
461
+ unsigned i;
462
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
463
+ for (unsigned j = 0; j < 8; ++j) {
464
+ deltas[j] = entries_[i + j] - previous;
465
+ previous = entries_[i + j];
466
+ }
467
+ pack_bits_block8(deltas, buffer.data(), entry_bits);
468
+ write(os, buffer.data(), buffer.size());
469
+ }
470
+
471
+ // pack extra deltas if fewer than 8 of them left
472
+ if (i < entries_.size()) {
473
+ uint8_t offset = 0;
474
+ uint8_t* ptr = buffer.data();
475
+ for (; i < entries_.size(); ++i) {
476
+ const uint64_t delta = entries_[i] - previous;
477
+ previous = entries_[i];
478
+ offset = pack_bits(delta, entry_bits, ptr, offset);
479
+ }
480
+ write(os, buffer.data(), ptr - buffer.data());
481
+ }
482
+ }
483
+
484
+ template<typename A>
485
+ auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
486
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
487
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
488
+ const size_t compressed_bits = entry_bits * entries_.size();
489
+
490
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
491
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
492
+
493
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
494
+ + whole_bytes_to_hold_bits(compressed_bits);
495
+ vector_bytes bytes(size, 0, entries_.get_allocator());
496
+ uint8_t* ptr = bytes.data() + header_size_bytes;
497
+
498
+ *ptr++ = preamble_longs;
499
+ *ptr++ = COMPRESSED_SERIAL_VERSION;
500
+ *ptr++ = SKETCH_TYPE;
501
+ *ptr++ = entry_bits;
502
+ *ptr++ = num_entries_bytes;
503
+ const uint8_t flags_byte(
504
+ (1 << flags::IS_COMPACT) |
505
+ (1 << flags::IS_READ_ONLY) |
506
+ (1 << flags::IS_ORDERED)
507
+ );
508
+ *ptr++ = flags_byte;
509
+ ptr += copy_to_mem(get_seed_hash(), ptr);
510
+ if (this->is_estimation_mode()) {
511
+ ptr += copy_to_mem(theta_, ptr);
512
+ }
513
+ uint32_t num_entries = static_cast<uint32_t>(entries_.size());
514
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
515
+ *ptr++ = num_entries & 0xff;
516
+ num_entries >>= 8;
517
+ }
518
+
519
+ uint64_t previous = 0;
520
+ uint64_t deltas[8];
521
+
522
+ // pack blocks of 8 deltas
523
+ unsigned i;
524
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
525
+ for (unsigned j = 0; j < 8; ++j) {
526
+ deltas[j] = entries_[i + j] - previous;
527
+ previous = entries_[i + j];
528
+ }
529
+ pack_bits_block8(deltas, ptr, entry_bits);
530
+ ptr += entry_bits;
531
+ }
532
+
533
+ // pack extra deltas if fewer than 8 of them left
534
+ uint8_t offset = 0;
535
+ for (; i < entries_.size(); ++i) {
536
+ const uint64_t delta = entries_[i] - previous;
537
+ previous = entries_[i];
538
+ offset = pack_bits(delta, entry_bits, ptr, offset);
539
+ }
540
+ return bytes;
541
+ }
542
+
404
543
  template<typename A>
405
544
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
406
545
  const auto preamble_longs = read<uint8_t>(is);
407
546
  const auto serial_version = read<uint8_t>(is);
408
547
  const auto type = read<uint8_t>(is);
548
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
409
549
  switch (serial_version) {
410
- case SERIAL_VERSION: {
411
- read<uint16_t>(is); // unused
412
- const auto flags_byte = read<uint8_t>(is);
413
- const auto seed_hash = read<uint16_t>(is);
414
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
415
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
416
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
417
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
418
-
419
- uint64_t theta = theta_constants::MAX_THETA;
420
- uint32_t num_entries = 0;
421
- if (!is_empty) {
422
- if (preamble_longs == 1) {
423
- num_entries = 1;
424
- } else {
425
- num_entries = read<uint32_t>(is);
426
- read<uint32_t>(is); // unused
427
- if (preamble_longs > 2) {
428
- theta = read<uint64_t>(is);
429
- }
430
- }
431
- }
432
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
433
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
550
+ case 4:
551
+ return deserialize_v4(preamble_longs, is, seed, allocator);
552
+ case 3:
553
+ return deserialize_v3(preamble_longs, is, seed, allocator);
554
+ case 1:
555
+ return deserialize_v1(preamble_longs, is, seed, allocator);
556
+ case 2:
557
+ return deserialize_v2(preamble_longs, is, seed, allocator);
558
+ default:
559
+ throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
560
+ }
561
+ }
434
562
 
435
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
563
+ template<typename A>
564
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
565
+ uint8_t, std::istream& is, uint64_t seed, const A& allocator)
566
+ {
567
+ const auto seed_hash = compute_seed_hash(seed);
568
+ read<uint8_t>(is); // unused
569
+ read<uint32_t>(is); // unused
570
+ const auto num_entries = read<uint32_t>(is);
571
+ read<uint32_t>(is); //unused
572
+ const auto theta = read<uint64_t>(is);
573
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
574
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
575
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
576
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
577
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
578
+ }
579
+
580
+ template<typename A>
581
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
582
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
583
+ {
584
+ read<uint8_t>(is); // unused
585
+ read<uint16_t>(is); // unused
586
+ const uint16_t seed_hash = read<uint16_t>(is);
587
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
588
+ if (preamble_longs == 1) {
589
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
590
+ std::vector<uint64_t, A> entries(0, 0, allocator);
591
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
592
+ } else if (preamble_longs == 2) {
593
+ const uint32_t num_entries = read<uint32_t>(is);
594
+ read<uint32_t>(is); // unused
595
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
596
+ if (num_entries == 0) {
597
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
598
+ }
599
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
600
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
601
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
602
+ } else if (preamble_longs == 3) {
603
+ const uint32_t num_entries = read<uint32_t>(is);
604
+ read<uint32_t>(is); // unused
605
+ const auto theta = read<uint64_t>(is);
606
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
607
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
608
+ if (is_empty) {
436
609
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
437
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
610
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
611
+ } else {
612
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
613
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
614
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
615
+ }
616
+ } else {
617
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
438
618
  }
439
- case 1: {
440
- const auto seed_hash = compute_seed_hash(seed);
441
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
442
- read<uint8_t>(is); // unused
619
+ }
620
+
621
+ template<typename A>
622
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
623
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
624
+ {
625
+ read<uint16_t>(is); // unused
626
+ const auto flags_byte = read<uint8_t>(is);
627
+ const auto seed_hash = read<uint16_t>(is);
628
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
629
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
630
+ uint64_t theta = theta_constants::MAX_THETA;
631
+ uint32_t num_entries = 0;
632
+ if (!is_empty) {
633
+ if (preamble_longs == 1) {
634
+ num_entries = 1;
635
+ } else {
636
+ num_entries = read<uint32_t>(is);
443
637
  read<uint32_t>(is); // unused
444
- const auto num_entries = read<uint32_t>(is);
445
- read<uint32_t>(is); //unused
446
- const auto theta = read<uint64_t>(is);
447
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
448
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
449
- if (!is_empty)
450
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
451
- if (!is.good())
452
- throw std::runtime_error("error reading from std::istream");
453
- return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
638
+ if (preamble_longs > 2) theta = read<uint64_t>(is);
639
+ }
454
640
  }
455
- case 2: {
456
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
- read<uint8_t>(is); // unused
458
- read<uint16_t>(is); // unused
459
- const uint16_t seed_hash = read<uint16_t>(is);
460
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
461
- if (preamble_longs == 1) {
462
- if (!is.good())
463
- throw std::runtime_error("error reading from std::istream");
464
- std::vector<uint64_t, A> entries(0, 0, allocator);
465
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
466
- } else if (preamble_longs == 2) {
467
- const uint32_t num_entries = read<uint32_t>(is);
468
- read<uint32_t>(is); // unused
469
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
470
- if (num_entries == 0) {
471
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
472
- }
473
- read(is, entries.data(), entries.size() * sizeof(uint64_t));
474
- if (!is.good())
475
- throw std::runtime_error("error reading from std::istream");
476
- return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
477
- } else if (preamble_longs == 3) {
478
- const uint32_t num_entries = read<uint32_t>(is);
479
- read<uint32_t>(is); // unused
480
- const auto theta = read<uint64_t>(is);
481
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
482
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
483
- if (is_empty) {
484
- if (!is.good())
485
- throw std::runtime_error("error reading from std::istream");
486
- return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
487
- } else {
488
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
489
- if (!is.good())
490
- throw std::runtime_error("error reading from std::istream");
491
- return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
492
- }
493
- } else {
494
- throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
495
- }
641
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
642
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
643
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
644
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
645
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
646
+ }
647
+
648
+ template<typename A>
649
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
650
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
651
+ {
652
+ const auto entry_bits = read<uint8_t>(is);
653
+ const auto num_entries_bytes = read<uint8_t>(is);
654
+ const auto flags_byte = read<uint8_t>(is);
655
+ const auto seed_hash = read<uint16_t>(is);
656
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
657
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
658
+ uint64_t theta = theta_constants::MAX_THETA;
659
+ if (preamble_longs > 1) theta = read<uint64_t>(is);
660
+ uint32_t num_entries = 0;
661
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
662
+ num_entries |= read<uint8_t>(is) << (i << 3);
663
+ }
664
+ vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
665
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
666
+
667
+ // unpack blocks of 8 deltas
668
+ unsigned i;
669
+ for (i = 0; i + 7 < num_entries; i += 8) {
670
+ read(is, buffer.data(), buffer.size());
671
+ unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
496
672
  }
497
- default:
498
- // this should always fail since the valid cases are handled above
499
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
500
- // this throw is never reached, because check_serial_version will throw an informative exception.
501
- // This is only here to avoid a compiler warning about a path without a return value.
502
- throw std::invalid_argument("unexpected sketch serialization version");
673
+ // unpack extra deltas if fewer than 8 of them left
674
+ if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
675
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
676
+ const uint8_t* ptr = buffer.data();
677
+ uint8_t offset = 0;
678
+ for (; i < num_entries; ++i) {
679
+ offset = unpack_bits(entries[i], entry_bits, ptr, offset);
503
680
  }
681
+ // undo deltas
682
+ uint64_t previous = 0;
683
+ for (i = 0; i < num_entries; ++i) {
684
+ entries[i] += previous;
685
+ previous = entries[i];
686
+ }
687
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
688
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
504
689
  }
505
690
 
506
691
  template<typename A>
507
692
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
508
693
  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
509
- return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
694
+ if (data.entry_bits == 64) { // versions 1 to 3
695
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
696
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
697
+ std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
698
+ } else { // version 4
699
+ std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
700
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
701
+ // unpack blocks of 8 deltas
702
+ unsigned i;
703
+ for (i = 0; i + 7 < data.num_entries; i += 8) {
704
+ unpack_bits_block8(&entries[i], ptr, data.entry_bits);
705
+ ptr += data.entry_bits;
706
+ }
707
+ // unpack extra deltas if fewer than 8 of them left
708
+ uint8_t offset = 0;
709
+ for (; i < data.num_entries; ++i) {
710
+ offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
711
+ }
712
+ // undo deltas
713
+ uint64_t previous = 0;
714
+ for (i = 0; i < data.num_entries; ++i) {
715
+ entries[i] += previous;
716
+ previous = entries[i];
717
+ }
718
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
719
+ }
510
720
  }
511
721
 
512
722
  // wrapped compact sketch
513
723
 
514
724
  template<typename A>
515
- wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
516
- uint64_t theta, const uint64_t* entries):
517
- is_empty_(is_empty),
518
- is_ordered_(is_ordered),
519
- seed_hash_(seed_hash),
520
- num_entries_(num_entries),
521
- theta_(theta),
522
- entries_(entries)
725
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
726
+ data_(data)
523
727
  {}
524
728
 
525
729
  template<typename A>
526
730
  const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
527
- auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
528
- return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
731
+ return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
529
732
  }
530
733
 
531
734
  template<typename A>
@@ -535,37 +738,37 @@ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
535
738
 
536
739
  template<typename A>
537
740
  bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
538
- return is_empty_;
741
+ return data_.is_empty;
539
742
  }
540
743
 
541
744
  template<typename A>
542
745
  bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
543
- return is_ordered_;
746
+ return data_.is_ordered;
544
747
  }
545
748
 
546
749
  template<typename A>
547
750
  uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
548
- return theta_;
751
+ return data_.theta;
549
752
  }
550
753
 
551
754
  template<typename A>
552
755
  uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
553
- return static_cast<uint32_t>(num_entries_);
756
+ return data_.num_entries;
554
757
  }
555
758
 
556
759
  template<typename A>
557
760
  uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
558
- return seed_hash_;
761
+ return data_.seed_hash;
559
762
  }
560
763
 
561
764
  template<typename A>
562
765
  auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
563
- return entries_;
766
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
564
767
  }
565
768
 
566
769
  template<typename A>
567
770
  auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
568
- return entries_ + num_entries_;
771
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
569
772
  }
570
773
 
571
774
  template<typename A>
@@ -574,12 +777,109 @@ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&)
574
777
  template<typename A>
575
778
  void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
576
779
  os << "### Retained entries" << std::endl;
577
- for (const auto& hash: *this) {
780
+ for (const auto hash: *this) {
578
781
  os << hash << std::endl;
579
782
  }
580
783
  os << "### End retained entries" << std::endl;
581
784
  }
582
785
 
786
+ // assumes index == 0 or index == num_entries
787
+ template<typename Allocator>
788
+ wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
789
+ const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
790
+ ptr_(ptr),
791
+ entry_bits_(entry_bits),
792
+ num_entries_(num_entries),
793
+ index_(index),
794
+ previous_(0),
795
+ is_block_mode_(num_entries_ >= 8),
796
+ buf_i_(0),
797
+ offset_(0)
798
+ {
799
+ if (entry_bits == 64) { // no compression
800
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
801
+ } else if (index < num_entries) {
802
+ if (is_block_mode_) {
803
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
804
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
805
+ for (int i = 0; i < 8; ++i) {
806
+ buffer_[i] += previous_;
807
+ previous_ = buffer_[i];
808
+ }
809
+ } else {
810
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
811
+ buffer_[0] += previous_;
812
+ previous_ = buffer_[0];
813
+ }
814
+ }
815
+ }
816
+
817
+ template<typename Allocator>
818
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
819
+ if (entry_bits_ == 64) { // no compression
820
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
821
+ return *this;
822
+ }
823
+ ++index_;
824
+ if (index_ < num_entries_) {
825
+ if (is_block_mode_) {
826
+ ++buf_i_;
827
+ if (buf_i_ == 8) {
828
+ buf_i_ = 0;
829
+ if (index_ + 8 < num_entries_) {
830
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
831
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
832
+ for (int i = 0; i < 8; ++i) {
833
+ buffer_[i] += previous_;
834
+ previous_ = buffer_[i];
835
+ }
836
+ } else {
837
+ is_block_mode_ = false;
838
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
839
+ buffer_[0] += previous_;
840
+ previous_ = buffer_[0];
841
+ }
842
+ }
843
+ } else {
844
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
845
+ buffer_[0] += previous_;
846
+ previous_ = buffer_[0];
847
+ }
848
+ }
849
+ return *this;
850
+ }
851
+
852
+ template<typename Allocator>
853
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
854
+ const_iterator tmp(*this);
855
+ operator++();
856
+ return tmp;
857
+ }
858
+
859
+ template<typename Allocator>
860
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
861
+ if (entry_bits_ == 64) return ptr_ != other.ptr_;
862
+ return index_ != other.index_;
863
+ }
864
+
865
+ template<typename Allocator>
866
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
867
+ if (entry_bits_ == 64) return ptr_ == other.ptr_;
868
+ return index_ == other.index_;
869
+ }
870
+
871
+ template<typename Allocator>
872
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
873
+ if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
874
+ return buffer_[buf_i_];
875
+ }
876
+
877
+ template<typename Allocator>
878
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
879
+ if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
880
+ return buffer_ + buf_i_;
881
+ }
882
+
583
883
  } /* namespace datasketches */
584
884
 
585
885
  #endif