datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,543 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef DENSITY_SKETCH_IMPL_HPP_
21
+ #define DENSITY_SKETCH_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <sstream>
25
+
26
+ #include "memory_operations.hpp"
27
+ #include "conditional_forward.hpp"
28
+
29
+ namespace datasketches {
30
+
31
+ template<typename T, typename K, typename A> // constructs an empty sketch for dim-dimensional points
32
+ density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const K& kernel, const A& allocator):
33
+ kernel_(kernel),
34
+ k_(k),
35
+ dim_(dim),
36
+ num_retained_(0),
37
+ n_(0),
38
+ levels_(1, Level(allocator), allocator) // starts with exactly one (empty) level
39
+ {
40
+ check_k(k); // throws std::invalid_argument when k < 2
41
+ }
42
+
43
+ template<typename T, typename K, typename A> // builds a sketch from already-populated state; both deserialize() overloads call this
44
+ density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n,
45
+ Levels&& levels, const K& kernel):
46
+ kernel_(kernel),
47
+ k_(k),
48
+ dim_(dim),
49
+ num_retained_(num_retained), // stored as given; not cross-checked against the contents of levels here
50
+ n_(n),
51
+ levels_(std::move(levels)) // takes over the caller's levels
52
+ {
53
+ check_k(k); // throws std::invalid_argument when k < 2
54
+ }
55
+
56
+ template<typename T, typename K, typename A>
57
+ uint16_t density_sketch<T, K, A>::get_k() const { // returns the k parameter stored at construction
58
+ return k_;
59
+ }
60
+
61
+ template<typename T, typename K, typename A>
62
+ uint32_t density_sketch<T, K, A>::get_dim() const { // returns the point dimension stored at construction
63
+ return dim_;
64
+ }
65
+
66
+ template<typename T, typename K, typename A>
67
+ bool density_sketch<T, K, A>::is_empty() const { // true when no points are retained
68
+ return num_retained_ == 0;
69
+ }
70
+
71
+ template<typename T, typename K, typename A>
72
+ uint64_t density_sketch<T, K, A>::get_n() const { // returns n_, which update() increments per point and merge() adds to
73
+ return n_;
74
+ }
75
+
76
+ template<typename T, typename K, typename A>
77
+ uint32_t density_sketch<T, K, A>::get_num_retained() const { // returns the count of points currently held across all levels
78
+ return num_retained_;
79
+ }
80
+
81
+ template<typename T, typename K, typename A>
82
+ bool density_sketch<T, K, A>::is_estimation_mode() const { // true once the sketch holds more than one level
83
+ return levels_.size() > 1;
84
+ }
85
+
86
+ template<typename T, typename K, typename A> // adds one point to the sketch
87
+ template<typename FwdVector>
88
+ void density_sketch<T, K, A>::update(FwdVector&& point) {
89
+ if (point.size() != dim_) throw std::invalid_argument("dimension mismatch"); // reject points of the wrong dimension
90
+ while (num_retained_ >= k_ * levels_.size()) compact(); // make room before inserting: capacity is k_ per level
91
+ levels_[0].push_back(std::forward<FwdVector>(point)); // new points always enter at level 0
92
+ ++num_retained_;
93
+ ++n_;
94
+ }
95
+
96
+ template<typename T, typename K, typename A> // merges the points of another sketch into this one, level by level
97
+ template<typename FwdSketch>
98
+ void density_sketch<T, K, A>::merge(FwdSketch&& other) {
99
+ if (other.is_empty()) return; // nothing to add; note this returns before the dimension check below
100
+ if (other.dim_ != dim_) throw std::invalid_argument("dimension mismatch");
101
+ while (levels_.size() < other.levels_.size()) levels_.push_back(Level(levels_.get_allocator())); // grow to at least other's height
102
+ for (unsigned height = 0; height < other.levels_.size(); ++height) {
103
+ std::copy( // appends other's level to the level of the same height here
104
+ forward_begin(conditional_forward<FwdSketch>(other.levels_[height])), // NOTE(review): presumably yields moving iterators for an rvalue sketch - confirm in conditional_forward.hpp
105
+ forward_end(conditional_forward<FwdSketch>(other.levels_[height])),
106
+ back_inserter(levels_[height])
107
+ );
108
+ }
109
+ num_retained_ += other.num_retained_;
110
+ n_ += other.n_;
111
+ while (num_retained_ >= k_ * levels_.size()) compact(); // restore the capacity bound of k_ points per level
112
+ }
113
+
114
+ template<typename T, typename K, typename A> // returns the weighted kernel sum over all retained points, divided by n_
115
+ T density_sketch<T, K, A>::get_estimate(const std::vector<T>& point) const { // NOTE(review): point.size() is not compared against dim_ here
116
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
117
+ T density = 0;
118
+ for (unsigned height = 0; height < levels_.size(); ++height) {
119
+ for (const auto& p: levels_[height]) {
120
+ density += (1 << height) * kernel_(p, point) / n_; // a point at this height carries weight 2^height; NOTE(review): the shift is done in int
121
+ }
122
+ }
123
+ return density;
124
+ }
125
+
126
+ template<typename T, typename K, typename A>
127
+ A density_sketch<T, K, A>::get_allocator() const { // returns the allocator held by the levels container
128
+ return levels_.get_allocator();
129
+ }
130
+
131
+ template<typename T, typename K, typename A> // compacts the lowest level that holds at least k_ points, if any
132
+ void density_sketch<T, K, A>::compact() {
133
+ for (unsigned height = 0; height < levels_.size(); ++height) {
134
+ if (levels_[height].size() >= k_) {
135
+ if (height + 1 >= levels_.size()) levels_.push_back(Level(levels_.get_allocator())); // ensure the level above exists before promoting into it
136
+ compact_level(height);
137
+ break; // at most one level is compacted per call
138
+ }
139
+ }
140
+ }
141
+
142
+ template<typename T, typename K, typename A> // moves a subset of the level's points up to height + 1 and discards the rest
143
+ void density_sketch<T, K, A>::compact_level(unsigned height) {
144
+ auto& level = levels_[height];
145
+ std::vector<bool> bits(level.size()); // bits[i] == true means point i is kept (promoted)
146
+ bits[0] = random_bit(); // requires a non-empty level; compact() only calls this when size >= k_
147
+ std::random_shuffle(level.begin(), level.end()); // NOTE(review): std::random_shuffle was deprecated in C++14 and removed in C++17; std::shuffle is the replacement
148
+ for (unsigned i = 1; i < level.size(); ++i) {
149
+ T delta = 0; // signed kernel sum of point i against all earlier points (+ for kept, - for dropped)
150
+ for (unsigned j = 0; j < i; ++j) {
151
+ delta += (bits[j] ? 1 : -1) * kernel_(level[i], level[j]);
152
+ }
153
+ bits[i] = delta < 0; // keep point i only when the signed sum is negative
154
+ }
155
+ for (unsigned i = 0; i < level.size(); ++i) {
156
+ if (bits[i]) {
157
+ levels_[height + 1].push_back(std::move(level[i])); // caller must have created level height + 1 already
158
+ } else {
159
+ --num_retained_; // dropped points no longer count as retained
160
+ }
161
+ }
162
+ level.clear(); // every point was either promoted or dropped
163
+ }
164
+
165
+ /* Serialized sketch layout:
166
+ * Int || Start Byte Addr:
167
+ * Addr:
168
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
169
+ * 0 || Preamble_Ints | SerVer | FamID | Flags |------- K -------|---- unused -----|
170
+ *
171
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
172
+ * 2 ||------------- Num Dimensions --------------|------ Num Retained Items ---------|
173
+ *
174
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
175
+ * 4 ||---------------------------Items Seen Count (N)--------------------------------|
176
+ *
177
+ * Ints 3 through 5 (Num Retained Items and N) are omitted when the sketch is empty, so the
178
+ * serialized form then ends after Num Dimensions at offset 8. Otherwise, level data starts at
179
+ * Int 6 (offset 24); each level consists of its size (as a uint32 value) followed by that
180
+ * number of points, with Num Dimensions values per point.
181
+ */
182
+
183
+ template<typename T, typename K, typename A> // writes the sketch to a stream: header fields first, then each level
184
+ void density_sketch<T, K, A>::serialize(std::ostream& os) const {
185
+ const uint8_t preamble_ints = is_empty() ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_LONG;
186
+ write(os, preamble_ints);
187
+ const uint8_t ser_ver = SERIAL_VERSION;
188
+ write(os, ser_ver);
189
+ const uint8_t family = FAMILY_ID;
190
+ write(os, family);
191
+
192
+ // only empty is a valid flag
193
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
194
+ write(os, flags_byte);
195
+ write(os, k_);
196
+ const uint16_t unused = 0; // two padding bytes after k
197
+ write(os, unused);
198
+ write(os, dim_); // dim is written even for an empty sketch
199
+
200
+ if (is_empty())
201
+ return; // an empty sketch ends after dim
202
+
203
+ write(os, num_retained_);
204
+ write(os, n_);
205
+
206
+ // levels array -- uint32_t since a single level may be larger than k
207
+ size_t pt_size = sizeof(T) * dim_; // bytes per point
208
+ for (const Level& lvl : levels_) {
209
+ const uint32_t level_size = static_cast<uint32_t>(lvl.size());
210
+ write(os, level_size); // each level is prefixed by its point count, including empty levels
211
+ for (const Vector& pt : lvl) {
212
+ write(os, pt.data(), pt_size);
213
+ }
214
+ }
215
+ }
216
+
217
+ template<typename T, typename K, typename A> // serializes into a byte vector, leaving header_size_bytes zero-filled bytes in front
218
+ auto density_sketch<T, K, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
219
+ const uint8_t preamble_ints = (is_empty() ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_LONG);
220
+
221
+ // pre-compute size
222
+ size_t size = header_size_bytes + preamble_ints * sizeof(uint32_t);
223
+ if (!is_empty())
224
+ for (const Level& lvl : levels_)
225
+ size += sizeof(uint32_t) + (lvl.size() * dim_ * sizeof(T));
226
+
227
+ vector_bytes bytes(size, 0, levels_.get_allocator());
228
+ uint8_t* ptr = bytes.data() + header_size_bytes; // sketch data starts after the caller's header
229
+ const uint8_t* end_ptr = bytes.data() + size; // size already includes header_size_bytes, so measure from the buffer start
230
+
231
+ ptr += copy_to_mem(preamble_ints, ptr);
232
+ const uint8_t ser_ver = SERIAL_VERSION;
233
+ ptr += copy_to_mem(ser_ver, ptr);
234
+ const uint8_t family = FAMILY_ID;
235
+ ptr += copy_to_mem(family, ptr);
236
+
237
+ // empty is the only valid flag
238
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
239
+ ptr += copy_to_mem(flags_byte, ptr);
240
+ ptr += copy_to_mem(k_, ptr);
241
+ ptr += sizeof(uint16_t); // 2 unused bytes
242
+ ptr += copy_to_mem(dim_, ptr);
243
+
244
+ if (is_empty())
245
+ return bytes; // an empty sketch ends after dim
246
+
247
+ ptr += copy_to_mem(num_retained_, ptr);
248
+ ptr += copy_to_mem(n_, ptr);
249
+
250
+ // levels array -- uint32_t since a single level may be larger than k
251
+ size_t pt_size = sizeof(T) * dim_; // bytes per point
252
+ for (const Level& lvl : levels_) {
253
+ ptr += copy_to_mem(static_cast<uint32_t>(lvl.size()), ptr);
254
+ for (const Vector& pt : lvl) {
255
+ ptr += copy_to_mem(pt.data(), ptr, pt_size);
256
+ }
257
+ }
258
+
259
+ if (ptr != end_ptr) // sanity check: the bytes written must match the pre-computed size
260
+ throw std::runtime_error("Actual output size does not equal expected output size");
261
+
262
+ return bytes;
263
+ }
264
+
265
+ template<typename T, typename K, typename A> // reads a sketch from a stream in the layout written by serialize(std::ostream&)
266
+ density_sketch<T, K, A> density_sketch<T, K, A>::deserialize(std::istream& is, const K& kernel, const A& allocator) {
267
+ const auto preamble_ints = read<uint8_t>(is);
268
+ const auto serial_version = read<uint8_t>(is);
269
+ const auto family_id = read<uint8_t>(is);
270
+ const auto flags_byte = read<uint8_t>(is);
271
+ const auto k = read<uint16_t>(is);
272
+ read<uint16_t>(is); // unused
273
+ const auto dim = read<uint32_t>(is);
274
+
275
+ check_k(k); // do we have constraints?
276
+ check_serial_version(serial_version); // a little redundant with the header check
277
+ check_family_id(family_id);
278
+ check_header_validity(preamble_ints, flags_byte, serial_version);
279
+
280
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
281
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
282
+ if (is_empty) {
283
+ return density_sketch(k, dim, kernel, allocator);
284
+ }
285
+
286
+ const auto num_retained = read<uint32_t>(is);
287
+ const auto n = read<uint64_t>(is);
288
+
289
+ // levels arrays
290
+ size_t pt_size = sizeof(T) * dim; // bytes per point
291
+ Levels levels(allocator);
292
+ int64_t num_to_read = num_retained; // num_retained is uint32_t so this allows error checking
293
+ while (num_to_read > 0) {
294
+ const auto level_size = read<uint32_t>(is);
295
+ if (!is.good()) throw std::runtime_error("error reading from std::istream"); // never act on a level size from a failed or truncated stream
296
+ Level lvl(allocator);
297
+ lvl.reserve(level_size);
298
+ for (uint32_t i = 0; i < level_size; ++i) {
299
+ Vector pt(dim, 0, allocator);
300
+ read(is, pt.data(), pt_size);
301
+ lvl.push_back(std::move(pt)); // move instead of copying the point
302
+ }
303
+ num_to_read -= lvl.size(); // take the size before lvl is moved from
304
+ levels.push_back(std::move(lvl));
305
+ }
306
+ if (num_to_read != 0) // negative when the level sizes add up to more than num_retained
307
+ throw std::runtime_error("Error deserializing sketch: Incorrect number of items read");
308
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
309
+
310
+ return density_sketch(k, dim, num_retained, n, std::move(levels), kernel);
311
+ }
312
+
313
+ template<typename T, typename K, typename A> // reads a sketch from a memory buffer of the given size, bounds-checking each section
314
+ density_sketch<T, K, A> density_sketch<T, K, A>::deserialize(const void* bytes, size_t size, const K& kernel, const A& allocator) {
315
+ ensure_minimum_memory(size, PREAMBLE_INTS_SHORT * sizeof(uint32_t));
316
+ const char* ptr = static_cast<const char*>(bytes);
317
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
318
+ uint8_t preamble_ints;
319
+ ptr += copy_from_mem(ptr, preamble_ints);
320
+ uint8_t serial_version;
321
+ ptr += copy_from_mem(ptr, serial_version);
322
+ uint8_t family_id;
323
+ ptr += copy_from_mem(ptr, family_id);
324
+ uint8_t flags_byte;
325
+ ptr += copy_from_mem(ptr, flags_byte);
326
+ uint16_t k;
327
+ ptr += copy_from_mem(ptr, k);
328
+ uint16_t unused; // read only to step over the two padding bytes
329
+ ptr += copy_from_mem(ptr, unused);
330
+ uint32_t dim;
331
+ ptr += copy_from_mem(ptr, dim);
332
+
333
+ check_k(k);
334
+ check_serial_version(serial_version); // a little redundant with the header check
335
+ check_family_id(family_id);
336
+ check_header_validity(preamble_ints, flags_byte, serial_version);
337
+
338
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
339
+ if (is_empty) {
340
+ return density_sketch(k, dim, kernel, allocator);
341
+ }
342
+
343
+ ensure_minimum_memory(size, PREAMBLE_INTS_LONG * sizeof(uint32_t));
344
+ uint32_t num_retained;
345
+ ptr += copy_from_mem(ptr, num_retained);
346
+ uint64_t n;
347
+ ptr += copy_from_mem(ptr, n);
348
+
349
+ // Predicting the number of levels seems hard so determining the exact remaining
350
+ // size is also hard. But we need at least num_retained * dim * sizeof(T)
351
+ // bytes for the points so we can check that.
352
+ size_t pt_size = sizeof(T) * dim;
353
+ ensure_minimum_memory(end_ptr - ptr, num_retained * pt_size);
354
+
355
+ // levels arrays
356
+ Levels levels(allocator);
357
+ int64_t num_to_read = num_retained; // num_retained is uint32_t so this allows error checking
358
+ while (num_to_read > 0) {
359
+ ensure_minimum_memory(end_ptr - ptr, sizeof(uint32_t)); // the level-size word itself must lie inside the buffer (zero-size levels would otherwise walk past the end)
360
+ uint32_t level_size;
361
+ ptr += copy_from_mem(ptr, level_size);
362
+ ensure_minimum_memory(end_ptr - ptr, level_size * pt_size);
363
+ Level lvl(allocator);
364
+ lvl.reserve(level_size);
365
+ for (uint32_t i = 0; i < level_size; ++i) {
366
+ Vector pt(dim, 0, allocator);
367
+ ptr += copy_from_mem(ptr, pt.data(), pt_size);
368
+ lvl.push_back(std::move(pt)); // move instead of copying the point
369
+ }
370
+ num_to_read -= lvl.size(); // take the size before lvl is moved from
371
+ levels.push_back(std::move(lvl));
372
+ }
373
+ if (num_to_read != 0) // negative when the level sizes add up to more than num_retained
374
+ throw std::runtime_error("Error deserializing sketch: Incorrect number of items read");
375
+ if (ptr > end_ptr) throw std::runtime_error("Error deserializing sketch: Read beyond provided memory");
376
+
377
+ return density_sketch(k, dim, num_retained, n, std::move(levels), kernel);
378
+ }
379
+
380
+ template<typename T, typename K, typename A> // validates k; throws std::invalid_argument when k < 2
381
+ void density_sketch<T, K, A>::check_k(uint16_t k) {
382
+ if (k < 2)
383
+ throw std::invalid_argument("k must be > 1. Found: " + std::to_string(k));
384
+ }
385
+
386
+ template<typename T, typename K, typename A> // throws std::invalid_argument unless serial_version equals SERIAL_VERSION
387
+ void density_sketch<T, K, A>::check_serial_version(uint8_t serial_version) {
388
+ if (serial_version == SERIAL_VERSION)
389
+ return;
390
+ else
391
+ throw std::invalid_argument("Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
392
+ }
393
+
394
+ template<typename T, typename K, typename A> // throws std::invalid_argument unless family_id equals FAMILY_ID
395
+ void density_sketch<T, K, A>::check_family_id(uint8_t family_id) {
396
+ if (family_id == FAMILY_ID)
397
+ return;
398
+ else
399
+ throw std::invalid_argument("Possible corruption. Family id does not indicate density sketch: " + std::to_string(family_id));
400
+ }
401
+
402
+ template<typename T, typename K, typename A> // checks that preamble_ints agrees with the empty flag; throws std::invalid_argument otherwise
403
+ void density_sketch<T, K, A>::check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version) {
404
+ const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
405
+
406
+ if ((empty && preamble_ints == PREAMBLE_INTS_SHORT) // an empty sketch must use the short preamble
407
+ || (!empty && preamble_ints == PREAMBLE_INTS_LONG)) // a non-empty sketch must use the long one
408
+ return;
409
+ else {
410
+ std::ostringstream os;
411
+ os << "Possible sketch corruption. Inconsistent state: "
412
+ << "preamble_ints = " << static_cast<int>(preamble_ints) // cast: a uint8_t would be streamed as a raw character
413
+ << ", empty = " << (empty ? "true" : "false")
414
+ << ", serialization_version = " << static_cast<int>(serial_version); // cast for the same reason
415
+ throw std::invalid_argument(os.str());
416
+ }
417
+ }
418
+
419
+ template<typename T, typename K, typename A> // builds a text summary; per-level sizes and point values are optional
420
+ string<A> density_sketch<T, K, A>::to_string(bool print_levels, bool print_items) const {
421
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
422
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
423
+ std::ostringstream os;
424
+ os << "### Density sketch summary:" << std::endl;
425
+ os << " K : " << k_ << std::endl;
426
+ os << " Dim : " << dim_ << std::endl;
427
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
428
+ os << " N : " << n_ << std::endl;
429
+ os << " Retained items : " << num_retained_ << std::endl;
430
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
431
+ os << " Levels : " << levels_.size() << std::endl;
432
+ os << "### End sketch summary" << std::endl;
433
+
434
+ if (print_levels) { // one line per level: its height and how many points it holds
435
+ os << "### Density sketch levels:" << std::endl;
436
+ os << " height: size" << std::endl;
437
+ for (unsigned height = 0; height < levels_.size(); ++height) {
438
+ os << " " << height << ": "
439
+ << levels_[height].size() << std::endl;
440
+ }
441
+ os << "### End sketch levels" << std::endl;
442
+ }
443
+
444
+ if (print_items) { // every point of every level as a bracketed, comma-separated list
445
+ os << "### Density sketch data:" << std::endl;
446
+ for (unsigned height = 0; height < levels_.size(); ++height) {
447
+ os << " level " << height << ": " << std::endl;
448
+ for (const auto& point: levels_[height]) {
449
+ os << " [";
450
+ bool first = true; // suppresses the separator before the first value
451
+ for (auto value: point) {
452
+ if (first) {
453
+ first = false;
454
+ } else {
455
+ os << ", ";
456
+ }
457
+ os << value;
458
+ }
459
+ os << "]" << std::endl;
460
+ }
461
+ }
462
+ os << "### End sketch data" << std::endl;
463
+ }
464
+ return string<A>(os.str().c_str(), levels_.get_allocator()); // copy the text into the allocator-aware string type
465
+ }
466
+
467
+ template<typename T, typename K, typename A> // iterator over the retained points, starting at the first non-empty level
468
+ auto density_sketch<T, K, A>::begin() const -> const_iterator {
469
+ return const_iterator(levels_.begin(), levels_.end());
470
+ }
471
+
472
+ template<typename T, typename K, typename A> // past-the-end iterator: its level position equals the end of the levels
473
+ auto density_sketch<T, K, A>::end() const -> const_iterator {
474
+ return const_iterator(levels_.end(), levels_.end());
475
+ }
476
+
477
+ // iterator
478
+
479
+ template<typename T, typename K, typename A> // positions the iterator on the first point of the first non-empty level in [begin, end)
480
+ density_sketch<T, K, A>::const_iterator::const_iterator(LevelsIterator begin, LevelsIterator end):
481
+ levels_it_(begin),
482
+ levels_end_(end),
483
+ level_it_(), // value-initialized; stays that way when no non-empty level exists
484
+ height_(0)
485
+ {
486
+ // skip empty levels
487
+ while (levels_it_ != levels_end_) {
488
+ level_it_ = levels_it_->begin();
489
+ if (level_it_ != levels_it_->end()) break; // found a level with at least one point
490
+ ++levels_it_;
491
+ ++height_; // height_ tracks the index of the level levels_it_ refers to
492
+ }
493
+ }
494
+
495
+ template<typename T, typename K, typename A> // pre-increment: advances to the next point, moving on to later non-empty levels as needed
496
+ auto density_sketch<T, K, A>::const_iterator::operator++() -> const_iterator& {
497
+ ++level_it_;
498
+ if (level_it_ == levels_it_->end()) { // dereferences levels_it_, so the iterator must not already be at the end
499
+ ++levels_it_;
500
+ ++height_;
501
+ // skip empty levels
502
+ while (levels_it_ != levels_end_) {
503
+ level_it_ = levels_it_->begin();
504
+ if (level_it_ != levels_it_->end()) break; // found the next level with at least one point
505
+ ++levels_it_;
506
+ ++height_;
507
+ }
508
+ }
509
+ return *this;
510
+ }
511
+
512
+ template<typename T, typename K, typename A> // post-increment: advances the iterator and yields the copy made before advancing
513
+ auto density_sketch<T, K, A>::const_iterator::operator++(int) -> const_iterator& { // NOTE(review): the return type is a reference, but tmp below is a local
514
+ const_iterator tmp(*this); // snapshot of the position before advancing
515
+ operator++();
516
+ return tmp; // NOTE(review): dangling reference once this returns; returning by value would also need the class declaration (not visible here) changed
517
+ }
518
+
519
+ template<typename T, typename K, typename A> // equal when on the same level and either both at the end or on the same point
520
+ bool density_sketch<T, K, A>::const_iterator::operator==(const const_iterator& other) const {
521
+ if (levels_it_ != other.levels_it_) return false;
522
+ if (levels_it_ == levels_end_) return true; // at the end, level_it_ is not compared
523
+ return level_it_ == other.level_it_;
524
+ }
525
+
526
+ template<typename T, typename K, typename A> // negation of operator==
527
+ bool density_sketch<T, K, A>::const_iterator::operator!=(const const_iterator& other) const {
528
+ return !operator==(other);
529
+ }
530
+
531
+ template<typename T, typename K, typename A> // returns the current point together with the value 2^height_
532
+ auto density_sketch<T, K, A>::const_iterator::operator*() const -> const value_type {
533
+ return value_type(*level_it_, 1ULL << height_); // NOTE(review): the second member is presumably the point's weight - confirm against value_type's declaration
534
+ }
535
+
536
+ template<typename T, typename K, typename A> // member access: wraps the value produced by operator* in a holder object
537
+ auto density_sketch<T, K, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
538
+ return **this; // operator* returns by value, so the result is converted to the holder return type
539
+ }
540
+
541
+ } /* namespace datasketches */
542
+
543
+ #endif
@@ -0,0 +1,35 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_executable(density_test)
19
+
20
+ target_link_libraries(density_test density common_test_lib)
21
+
22
+ set_target_properties(density_test PROPERTIES
23
+ CXX_STANDARD 11
24
+ CXX_STANDARD_REQUIRED YES
25
+ )
26
+
27
+ add_test(
28
+ NAME density_test
29
+ COMMAND density_test
30
+ )
31
+
32
+ target_sources(density_test
33
+ PRIVATE
34
+ density_sketch_test.cpp
35
+ )