datasketches 0.3.1 → 0.3.2

This diff shows the changes between two publicly released versions of the package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -20,7 +20,7 @@
20
20
  #ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
21
21
  #define COMPACT_THETA_SKETCH_PARSER_HPP_
22
22
 
23
- #include <stdint.h>
23
+ #include <cstdint>
24
24
 
25
25
  namespace datasketches {
26
26
 
@@ -33,7 +33,8 @@ public:
33
33
  uint16_t seed_hash;
34
34
  uint32_t num_entries;
35
35
  uint64_t theta;
36
- const uint64_t* entries;
36
+ const void* entries_start_ptr;
37
+ uint8_t entry_bits;
37
38
  };
38
39
 
39
40
  static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
@@ -45,18 +46,23 @@ private:
45
46
  static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
46
47
  static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
47
48
  static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
48
- static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
49
- static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
50
- static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
51
- static const size_t COMPACT_SKETCH_THETA_U64 = 2;
52
- static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
49
+ static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1; // ver 3
50
+ static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2; // ver 1-3
51
+ static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2; // ver 1-3
52
+ static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3; // ver 1-3
53
+ static const size_t COMPACT_SKETCH_THETA_U64 = 2; // ver 1-3
54
+ static const size_t COMPACT_SKETCH_V4_ENTRY_BITS_BYTE = 3;
55
+ static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE = 4;
56
+ static const size_t COMPACT_SKETCH_V4_THETA_U64 = 1;
57
+ static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE = 8;
58
+ static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE = 16;
53
59
 
54
60
  static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
55
61
  static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
56
62
 
57
- static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
58
63
  static const uint8_t COMPACT_SKETCH_TYPE = 3;
59
64
 
65
+ static void check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error);
60
66
  static std::string hex_dump(const uint8_t* ptr, size_t size);
61
67
  };
62
68
 
@@ -26,106 +26,120 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
+ template<typename T>
30
+ T whole_bytes_to_hold_bits(T bits) {
31
+ static_assert(std::is_integral<T>::value, "integral type expected");
32
+ return (bits >> 3) + ((bits & 7) > 0);
33
+ }
34
+
29
35
  template<bool dummy>
30
36
  auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
31
- if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
32
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
33
-
37
+ check_memory_size(ptr, size, 8, dump_on_error);
38
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
34
39
  uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
35
-
36
40
  switch(serial_version) {
37
- case COMPACT_SKETCH_SERIAL_VERSION: {
38
- checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
41
+ case 4: {
42
+ // version 4 sketches are ordered and always have entries (single item in exact mode is v3)
43
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
44
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
45
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 1;
46
+ uint64_t theta = theta_constants::MAX_THETA;
47
+ if (has_theta) {
48
+ check_memory_size(ptr, size, 16, dump_on_error);
49
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
50
+ }
51
+ const uint8_t num_entries_bytes = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE];
52
+ size_t data_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE;
53
+ check_memory_size(ptr, size, data_offset_bytes + num_entries_bytes, dump_on_error);
54
+ uint32_t num_entries = 0;
55
+ const uint8_t* num_entries_ptr = reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes;
56
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
57
+ num_entries |= (*num_entries_ptr++) << (i << 3);
58
+ }
59
+ data_offset_bytes += num_entries_bytes;
60
+ const uint8_t entry_bits = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_ENTRY_BITS_BYTE];
61
+ const size_t expected_bits = entry_bits * num_entries;
62
+ const size_t expected_size_bytes = data_offset_bytes + whole_bytes_to_hold_bits(expected_bits);
63
+ check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
64
+ return {false, true, seed_hash, num_entries, theta,
65
+ reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes, entry_bits};
66
+ }
67
+ case 3: {
39
68
  uint64_t theta = theta_constants::MAX_THETA;
40
69
  const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
41
70
  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
42
- return {true, true, seed_hash, 0, theta, nullptr};
71
+ return {true, true, seed_hash, 0, theta, nullptr, 64};
43
72
  }
44
73
  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
45
74
  const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
46
75
  if (has_theta) {
47
- if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
76
+ check_memory_size(ptr, size, (COMPACT_SKETCH_THETA_U64 + 1) * sizeof(uint64_t), dump_on_error);
48
77
  theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
49
78
  }
50
79
  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
51
- if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
52
- return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
80
+ check_memory_size(ptr, size, 16, dump_on_error);
81
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64, 64};
53
82
  }
54
83
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
55
84
  const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
56
85
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
57
86
  const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
58
- if (size < expected_size_bytes) {
59
- throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
60
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
61
- }
87
+ check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
62
88
  const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
63
- return {false, is_ordered, seed_hash, num_entries, theta, entries};
89
+ return {false, is_ordered, seed_hash, num_entries, theta, entries, 64};
64
90
  }
65
91
  case 1: {
66
92
  uint16_t seed_hash = compute_seed_hash(seed);
67
- checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
68
93
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
69
94
  uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
70
95
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
71
- if (is_empty) {
72
- return {true, true, seed_hash, 0, theta, nullptr};
73
- }
96
+ if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
74
97
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
75
98
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
76
- if (size < expected_size_bytes) {
77
- throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
78
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
79
- }
80
- return {false, true, seed_hash, num_entries, theta, entries};
99
+ check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
100
+ return {false, true, seed_hash, num_entries, theta, entries, 64};
81
101
  }
82
102
  case 2: {
83
- uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
84
- checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
103
+ uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
85
104
  const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
86
105
  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
87
106
  if (preamble_size == 1) {
88
- return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
107
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
89
108
  } else if (preamble_size == 2) {
90
109
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
91
110
  if (num_entries == 0) {
92
- return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
111
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
93
112
  } else {
94
113
  const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
95
- if (size < expected_size_bytes) {
96
- throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
97
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
98
- }
114
+ check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
99
115
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
100
- return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
116
+ return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries, 64};
101
117
  }
102
118
  } else if (preamble_size == 3) {
103
119
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
104
120
  uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
105
121
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
106
- if (is_empty) {
107
- return {true, true, seed_hash, 0, theta, nullptr};
108
- }
122
+ if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
109
123
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
110
124
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
111
- if (size < expected_size_bytes) {
112
- throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
113
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
114
- }
115
- return {false, true, seed_hash, num_entries, theta, entries};
125
+ check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
126
+ return {false, true, seed_hash, num_entries, theta, entries, 64};
116
127
  } else {
117
128
  throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
118
129
  }
119
130
  }
120
131
  default:
121
- // this should always fail since the valid cases are handled above
122
- checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
123
- // this throw is never reached, because check_serial_version will throw an informative exception.
124
- // This is only here to avoid a compiler warning about a path without a return value.
125
- throw std::invalid_argument("unexpected sketch serialization version");
132
+ throw std::invalid_argument("unsupported serial version " + std::to_string(serial_version));
126
133
  }
127
134
  }
128
135
 
136
+ template<bool dummy>
137
+ void compact_theta_sketch_parser<dummy>::check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error) {
138
+ if (actual_bytes < expected_bytes) throw std::out_of_range("at least " + std::to_string(expected_bytes)
139
+ + " bytes expected, actual " + std::to_string(actual_bytes)
140
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), actual_bytes)) : ""));
141
+ }
142
+
129
143
  template<bool dummy>
130
144
  std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
131
145
  std::stringstream s;
@@ -20,8 +20,10 @@
20
20
  #ifndef THETA_HELPERS_HPP_
21
21
  #define THETA_HELPERS_HPP_
22
22
 
23
- #include <string>
24
23
  #include <stdexcept>
24
+ #include <string>
25
+
26
+ #include "theta_constants.hpp"
25
27
 
26
28
  namespace datasketches {
27
29
 
@@ -55,7 +57,7 @@ public:
55
57
  // consistent way of initializing theta from p
56
58
  // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
57
59
  static uint64_t starting_theta_from_p(float p) {
58
- if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
60
+ if (p < 1) return static_cast<float>(theta_constants::MAX_THETA) * p;
59
61
  return theta_constants::MAX_THETA;
60
62
  }
61
63
 
@@ -21,6 +21,7 @@
21
21
  #define THETA_SKETCH_HPP_
22
22
 
23
23
  #include "theta_update_sketch_base.hpp"
24
+ #include "compact_theta_sketch_parser.hpp"
24
25
 
25
26
  namespace datasketches {
26
27
 
@@ -317,7 +318,8 @@ public:
317
318
  using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
318
319
  using vector_bytes = std::vector<uint8_t, AllocBytes>;
319
320
 
320
- static const uint8_t SERIAL_VERSION = 3;
321
+ static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
322
+ static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
321
323
  static const uint8_t SKETCH_TYPE = 3;
322
324
 
323
325
  // Instances of this type can be obtained:
@@ -355,6 +357,25 @@ public:
355
357
  */
356
358
  vector_bytes serialize(unsigned header_size_bytes = 0) const;
357
359
 
360
+ /**
361
+ * This method serializes the sketch into a given stream in a compressed binary form.
362
+ * Compression is applied to ordered sketches except empty and single item.
363
+ * For unordered, empty and single item sketches this method is equivalent to serialize()
364
+ * @param os output stream
365
+ */
366
+ void serialize_compressed(std::ostream& os) const;
367
+
368
+ /**
369
+ * This method serializes the sketch as a vector of bytes.
370
+ * An optional header can be reserved in front of the sketch.
371
+ * It is an uninitialized space of a given size.
372
+ * This header is used in Datasketches PostgreSQL extension.
373
+ * Compression is applied to ordered sketches except empty and single item.
374
+ * For unordered, empty and single item sketches this method is equivalent to serialize()
375
+ * @param header_size_bytes space to reserve in front of the sketch
376
+ */
377
+ vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
378
+
358
379
  virtual iterator begin();
359
380
  virtual iterator end();
360
381
  virtual const_iterator begin() const;
@@ -391,6 +412,16 @@ private:
391
412
  uint64_t theta_;
392
413
  std::vector<uint64_t, Allocator> entries_;
393
414
 
415
+ bool is_suitable_for_compression() const;
416
+ uint8_t compute_min_leading_zeros() const;
417
+ void serialize_version_4(std::ostream& os) const;
418
+ vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
419
+
420
+ static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
421
+ static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
422
+ static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
423
+ static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
424
+
394
425
  virtual void print_specifics(std::ostringstream& os) const;
395
426
  };
396
427
 
@@ -407,7 +438,7 @@ public:
407
438
  template<typename Allocator = std::allocator<uint64_t>>
408
439
  class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
409
440
  public:
410
- using const_iterator = const uint64_t*;
441
+ class const_iterator;
411
442
 
412
443
  Allocator get_allocator() const;
413
444
  bool is_empty() const;
@@ -433,15 +464,32 @@ protected:
433
464
  virtual void print_items(std::ostringstream& os) const;
434
465
 
435
466
  private:
436
- bool is_empty_;
437
- bool is_ordered_;
438
- uint16_t seed_hash_;
439
- uint32_t num_entries_;
440
- uint64_t theta_;
441
- const uint64_t* entries_;
467
+ using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
468
+ data_type data_;
442
469
 
443
- wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
444
- uint64_t theta, const uint64_t* entries);
470
+ wrapped_compact_theta_sketch_alloc(const data_type& data);
471
+ };
472
+
473
+ template<typename Allocator>
474
+ class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
475
+ public:
476
+ const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
477
+ const_iterator& operator++();
478
+ const_iterator operator++(int);
479
+ bool operator==(const const_iterator& other) const;
480
+ bool operator!=(const const_iterator& other) const;
481
+ const uint64_t& operator*() const;
482
+ const uint64_t* operator->() const;
483
+ private:
484
+ const void* ptr_;
485
+ uint8_t entry_bits_;
486
+ uint32_t num_entries_;
487
+ uint32_t index_;
488
+ uint64_t previous_;
489
+ bool is_block_mode_;
490
+ uint8_t buf_i_;
491
+ uint8_t offset_;
492
+ uint64_t buffer_[8];
445
493
  };
446
494
 
447
495
  // aliases with default allocator for convenience