datasketches 0.2.7 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -47,7 +47,6 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
|
|
47
47
|
*/
|
48
48
|
template<
|
49
49
|
typename T,
|
50
|
-
typename S = serde<T>, // deprecated, to be removed in the next major version
|
51
50
|
typename A = std::allocator<T>
|
52
51
|
>
|
53
52
|
class var_opt_union {
|
@@ -69,20 +68,20 @@ public:
|
|
69
68
|
* This method takes an lvalue.
|
70
69
|
* @param sk a sketch to add to the union
|
71
70
|
*/
|
72
|
-
void update(const var_opt_sketch<T,
|
71
|
+
void update(const var_opt_sketch<T, A>& sk);
|
73
72
|
|
74
73
|
/**
|
75
74
|
* Updates this union with the given sketch
|
76
75
|
* This method takes an rvalue.
|
77
76
|
* @param sk a sketch to add to the union
|
78
77
|
*/
|
79
|
-
void update(var_opt_sketch<T,
|
78
|
+
void update(var_opt_sketch<T, A>&& sk);
|
80
79
|
|
81
80
|
/**
|
82
81
|
* Gets the varopt sketch resulting from the union of any input sketches.
|
83
82
|
* @return a varopt sketch
|
84
83
|
*/
|
85
|
-
var_opt_sketch<T,
|
84
|
+
var_opt_sketch<T, A> get_result() const;
|
86
85
|
|
87
86
|
/**
|
88
87
|
* Resets the union to its default, empty state.
|
@@ -95,7 +94,7 @@ public:
|
|
95
94
|
* @param instance of a SerDe
|
96
95
|
* @return size in bytes needed to serialize this sketch
|
97
96
|
*/
|
98
|
-
template<typename SerDe =
|
97
|
+
template<typename SerDe = serde<T>>
|
99
98
|
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
100
99
|
|
101
100
|
// This is a convenience alias for users
|
@@ -111,7 +110,7 @@ public:
|
|
111
110
|
* @param header_size_bytes space to reserve in front of the sketch
|
112
111
|
* @param instance of a SerDe
|
113
112
|
*/
|
114
|
-
template<typename SerDe =
|
113
|
+
template<typename SerDe = serde<T>>
|
115
114
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
116
115
|
|
117
116
|
/**
|
@@ -120,18 +119,9 @@ public:
|
|
120
119
|
* @param os output stream
|
121
120
|
* @param instance of a SerDe
|
122
121
|
*/
|
123
|
-
template<typename SerDe =
|
122
|
+
template<typename SerDe = serde<T>>
|
124
123
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
125
124
|
|
126
|
-
/**
|
127
|
-
* NOTE: This method may be deprecated in a future version.
|
128
|
-
* This method deserializes a union from a given stream.
|
129
|
-
* @param is input stream
|
130
|
-
* @param instance of an Allocator
|
131
|
-
* @return an instance of a union
|
132
|
-
*/
|
133
|
-
static var_opt_union deserialize(std::istream& is, const A& allocator = A());
|
134
|
-
|
135
125
|
/**
|
136
126
|
* NOTE: This method may be deprecated in a future version.
|
137
127
|
* This method deserializes a union from a given stream.
|
@@ -140,19 +130,9 @@ public:
|
|
140
130
|
* @param instance of an Allocator
|
141
131
|
* @return an instance of a union
|
142
132
|
*/
|
143
|
-
template<typename SerDe =
|
133
|
+
template<typename SerDe = serde<T>>
|
144
134
|
static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
|
145
135
|
|
146
|
-
/**
|
147
|
-
* NOTE: This method may be deprecated in a future version.
|
148
|
-
* This method deserializes a union from a given array of bytes.
|
149
|
-
* @param bytes pointer to the array of bytes
|
150
|
-
* @param size the size of the array
|
151
|
-
* @param instance of an Allocator
|
152
|
-
* @return an instance of a union
|
153
|
-
*/
|
154
|
-
static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
|
155
|
-
|
156
136
|
/**
|
157
137
|
* NOTE: This method may be deprecated in a future version.
|
158
138
|
* This method deserializes a union from a given array of bytes.
|
@@ -162,7 +142,7 @@ public:
|
|
162
142
|
* @param instance of an Allocator
|
163
143
|
* @return an instance of a union
|
164
144
|
*/
|
165
|
-
template<typename SerDe =
|
145
|
+
template<typename SerDe = serde<T>>
|
166
146
|
static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
|
167
147
|
|
168
148
|
/**
|
@@ -171,9 +151,8 @@ public:
|
|
171
151
|
*/
|
172
152
|
string<A> to_string() const;
|
173
153
|
|
174
|
-
|
175
154
|
private:
|
176
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,
|
155
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
|
177
156
|
|
178
157
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
179
158
|
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
@@ -191,10 +170,10 @@ private:
|
|
191
170
|
|
192
171
|
uint32_t max_k_;
|
193
172
|
|
194
|
-
var_opt_sketch<T,
|
173
|
+
var_opt_sketch<T, A> gadget_;
|
195
174
|
|
196
175
|
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
197
|
-
uint32_t max_k, var_opt_sketch<T,
|
176
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget);
|
198
177
|
|
199
178
|
/*
|
200
179
|
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
@@ -250,18 +229,18 @@ private:
|
|
250
229
|
more importantly, this design choice allows us to exactly re-construct the input sketch
|
251
230
|
when there is only one of them.
|
252
231
|
*/
|
253
|
-
inline void merge_items(const var_opt_sketch<T,
|
254
|
-
inline void merge_items(var_opt_sketch<T,
|
255
|
-
inline void resolve_tau(const var_opt_sketch<T,
|
232
|
+
inline void merge_items(const var_opt_sketch<T, A>& sk);
|
233
|
+
inline void merge_items(var_opt_sketch<T, A>&& sk);
|
234
|
+
inline void resolve_tau(const var_opt_sketch<T, A>& sketch);
|
256
235
|
|
257
236
|
double get_outer_tau() const;
|
258
237
|
|
259
|
-
var_opt_sketch<T,
|
238
|
+
var_opt_sketch<T, A> simple_gadget_coercer() const;
|
260
239
|
|
261
240
|
bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
|
262
|
-
bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,
|
263
|
-
void mark_moving_gadget_coercer(var_opt_sketch<T,
|
264
|
-
void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,
|
241
|
+
bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T, A>& sk) const;
|
242
|
+
void mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) const;
|
243
|
+
void migrate_marked_items_by_decreasing_k(var_opt_sketch<T, A>& sk) const;
|
265
244
|
|
266
245
|
static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
|
267
246
|
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
|
@@ -28,17 +28,17 @@
|
|
28
28
|
|
29
29
|
namespace datasketches {
|
30
30
|
|
31
|
-
template<typename T, typename
|
32
|
-
var_opt_union<T,
|
31
|
+
template<typename T, typename A>
|
32
|
+
var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
33
33
|
n_(0),
|
34
34
|
outer_tau_numer_(0.0),
|
35
35
|
outer_tau_denom_(0),
|
36
36
|
max_k_(max_k),
|
37
|
-
gadget_(max_k, var_opt_sketch<T,
|
37
|
+
gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
38
38
|
{}
|
39
39
|
|
40
|
-
template<typename T, typename
|
41
|
-
var_opt_union<T,
|
40
|
+
template<typename T, typename A>
|
41
|
+
var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
|
42
42
|
n_(other.n_),
|
43
43
|
outer_tau_numer_(other.outer_tau_numer_),
|
44
44
|
outer_tau_denom_(other.outer_tau_denom_),
|
@@ -46,8 +46,8 @@ var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
|
|
46
46
|
gadget_(other.gadget_)
|
47
47
|
{}
|
48
48
|
|
49
|
-
template<typename T, typename
|
50
|
-
var_opt_union<T,
|
49
|
+
template<typename T, typename A>
|
50
|
+
var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
|
51
51
|
n_(other.n_),
|
52
52
|
outer_tau_numer_(other.outer_tau_numer_),
|
53
53
|
outer_tau_denom_(other.outer_tau_denom_),
|
@@ -55,9 +55,9 @@ var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
|
|
55
55
|
gadget_(std::move(other.gadget_))
|
56
56
|
{}
|
57
57
|
|
58
|
-
template<typename T, typename
|
59
|
-
var_opt_union<T,
|
60
|
-
uint32_t max_k, var_opt_sketch<T,
|
58
|
+
template<typename T, typename A>
|
59
|
+
var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
60
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
|
61
61
|
n_(n),
|
62
62
|
outer_tau_numer_(outer_tau_numer),
|
63
63
|
outer_tau_denom_(outer_tau_denom),
|
@@ -65,12 +65,12 @@ var_opt_union<T,S,A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t
|
|
65
65
|
gadget_(gadget)
|
66
66
|
{}
|
67
67
|
|
68
|
-
template<typename T, typename
|
69
|
-
var_opt_union<T,
|
68
|
+
template<typename T, typename A>
|
69
|
+
var_opt_union<T, A>::~var_opt_union() {}
|
70
70
|
|
71
|
-
template<typename T, typename
|
72
|
-
var_opt_union<T,
|
73
|
-
var_opt_union
|
71
|
+
template<typename T, typename A>
|
72
|
+
var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other) {
|
73
|
+
var_opt_union union_copy(other);
|
74
74
|
std::swap(n_, union_copy.n_);
|
75
75
|
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
76
76
|
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
@@ -79,8 +79,8 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other
|
|
79
79
|
return *this;
|
80
80
|
}
|
81
81
|
|
82
|
-
template<typename T, typename
|
83
|
-
var_opt_union<T,
|
82
|
+
template<typename T, typename A>
|
83
|
+
var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
|
84
84
|
std::swap(n_, other.n_);
|
85
85
|
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
86
86
|
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
@@ -128,14 +128,9 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
128
128
|
* </pre>
|
129
129
|
*/
|
130
130
|
|
131
|
-
template<typename T, typename
|
132
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
133
|
-
return deserialize(is, S(), allocator);
|
134
|
-
}
|
135
|
-
|
136
|
-
template<typename T, typename S, typename A>
|
131
|
+
template<typename T, typename A>
|
137
132
|
template<typename SerDe>
|
138
|
-
var_opt_union<T,
|
133
|
+
var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
|
139
134
|
const auto preamble_longs = read<uint8_t>(is);
|
140
135
|
const auto serial_version = read<uint8_t>(is);
|
141
136
|
const auto family_id = read<uint8_t>(is);
|
@@ -155,29 +150,24 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const S
|
|
155
150
|
if (!is.good())
|
156
151
|
throw std::runtime_error("error reading from std::istream");
|
157
152
|
else
|
158
|
-
return var_opt_union
|
153
|
+
return var_opt_union(max_k);
|
159
154
|
}
|
160
155
|
|
161
156
|
const auto items_seen = read<uint64_t>(is);
|
162
157
|
const auto outer_tau_numer = read<double>(is);
|
163
158
|
const auto outer_tau_denom = read<uint64_t>(is);
|
164
159
|
|
165
|
-
var_opt_sketch<T,
|
160
|
+
var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(is, sd, allocator);
|
166
161
|
|
167
162
|
if (!is.good())
|
168
163
|
throw std::runtime_error("error reading from std::istream");
|
169
164
|
|
170
|
-
return var_opt_union
|
171
|
-
}
|
172
|
-
|
173
|
-
template<typename T, typename S, typename A>
|
174
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
175
|
-
return deserialize(bytes, size, S(), allocator);
|
165
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
176
166
|
}
|
177
167
|
|
178
|
-
template<typename T, typename
|
168
|
+
template<typename T, typename A>
|
179
169
|
template<typename SerDe>
|
180
|
-
var_opt_union<T,
|
170
|
+
var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
|
181
171
|
ensure_minimum_memory(size, 8);
|
182
172
|
const char* ptr = static_cast<const char*>(bytes);
|
183
173
|
uint8_t preamble_longs;
|
@@ -201,7 +191,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
201
191
|
bool is_empty = flags & EMPTY_FLAG_MASK;
|
202
192
|
|
203
193
|
if (is_empty) {
|
204
|
-
return var_opt_union
|
194
|
+
return var_opt_union(max_k);
|
205
195
|
}
|
206
196
|
|
207
197
|
uint64_t items_seen;
|
@@ -212,14 +202,14 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
212
202
|
ptr += copy_from_mem(ptr, outer_tau_denom);
|
213
203
|
|
214
204
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
215
|
-
var_opt_sketch<T,
|
205
|
+
var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
|
216
206
|
|
217
|
-
return var_opt_union
|
207
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
218
208
|
}
|
219
209
|
|
220
|
-
template<typename T, typename
|
210
|
+
template<typename T, typename A>
|
221
211
|
template<typename SerDe>
|
222
|
-
size_t var_opt_union<T,
|
212
|
+
size_t var_opt_union<T, A>::get_serialized_size_bytes(const SerDe& sd) const {
|
223
213
|
if (n_ == 0) {
|
224
214
|
return PREAMBLE_LONGS_EMPTY << 3;
|
225
215
|
} else {
|
@@ -227,9 +217,9 @@ size_t var_opt_union<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
|
|
227
217
|
}
|
228
218
|
}
|
229
219
|
|
230
|
-
template<typename T, typename
|
220
|
+
template<typename T, typename A>
|
231
221
|
template<typename SerDe>
|
232
|
-
void var_opt_union<T,
|
222
|
+
void var_opt_union<T, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
233
223
|
bool empty = (n_ == 0);
|
234
224
|
|
235
225
|
const uint8_t serialization_version(SER_VER);
|
@@ -259,9 +249,9 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
259
249
|
}
|
260
250
|
}
|
261
251
|
|
262
|
-
template<typename T, typename
|
252
|
+
template<typename T, typename A>
|
263
253
|
template<typename SerDe>
|
264
|
-
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,
|
254
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_union<T, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
|
265
255
|
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
266
256
|
std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
|
267
257
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
@@ -301,16 +291,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
|
|
301
291
|
return bytes;
|
302
292
|
}
|
303
293
|
|
304
|
-
template<typename T, typename
|
305
|
-
void var_opt_union<T,
|
294
|
+
template<typename T, typename A>
|
295
|
+
void var_opt_union<T, A>::reset() {
|
306
296
|
n_ = 0;
|
307
297
|
outer_tau_numer_ = 0.0;
|
308
298
|
outer_tau_denom_ = 0;
|
309
299
|
gadget_.reset();
|
310
300
|
}
|
311
301
|
|
312
|
-
template<typename T, typename
|
313
|
-
string<A> var_opt_union<T,
|
302
|
+
template<typename T, typename A>
|
303
|
+
string<A> var_opt_union<T, A>::to_string() const {
|
314
304
|
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
315
305
|
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
316
306
|
std::ostringstream os;
|
@@ -323,20 +313,20 @@ string<A> var_opt_union<T,S,A>::to_string() const {
|
|
323
313
|
return string<A>(os.str().c_str(), gadget_.allocator_);
|
324
314
|
}
|
325
315
|
|
326
|
-
template<typename T, typename
|
327
|
-
void var_opt_union<T,
|
316
|
+
template<typename T, typename A>
|
317
|
+
void var_opt_union<T, A>::update(const var_opt_sketch<T, A>& sk) {
|
328
318
|
merge_items(sk);
|
329
319
|
resolve_tau(sk);
|
330
320
|
}
|
331
321
|
|
332
|
-
template<typename T, typename
|
333
|
-
void var_opt_union<T,
|
322
|
+
template<typename T, typename A>
|
323
|
+
void var_opt_union<T, A>::update(var_opt_sketch<T, A>&& sk) {
|
334
324
|
merge_items(std::move(sk));
|
335
325
|
resolve_tau(sk); // don't need items, so ok even if they've been moved out
|
336
326
|
}
|
337
327
|
|
338
|
-
template<typename T, typename
|
339
|
-
double var_opt_union<T,
|
328
|
+
template<typename T, typename A>
|
329
|
+
double var_opt_union<T, A>::get_outer_tau() const {
|
340
330
|
if (outer_tau_denom_ == 0) {
|
341
331
|
return 0.0;
|
342
332
|
} else {
|
@@ -344,8 +334,8 @@ double var_opt_union<T,S,A>::get_outer_tau() const {
|
|
344
334
|
}
|
345
335
|
}
|
346
336
|
|
347
|
-
template<typename T, typename
|
348
|
-
void var_opt_union<T,
|
337
|
+
template<typename T, typename A>
|
338
|
+
void var_opt_union<T, A>::merge_items(const var_opt_sketch<T, A>& sketch) {
|
349
339
|
if (sketch.n_ == 0) {
|
350
340
|
return;
|
351
341
|
}
|
@@ -353,8 +343,8 @@ void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
|
|
353
343
|
n_ += sketch.n_;
|
354
344
|
|
355
345
|
// H region const_iterator
|
356
|
-
typename var_opt_sketch<T,
|
357
|
-
typename var_opt_sketch<T,
|
346
|
+
typename var_opt_sketch<T, A>::const_iterator h_itr(sketch, false, false);
|
347
|
+
typename var_opt_sketch<T, A>::const_iterator h_end(sketch, true, false);
|
358
348
|
while (h_itr != h_end) {
|
359
349
|
std::pair<const T&, const double> sample = *h_itr;
|
360
350
|
gadget_.update(sample.first, sample.second, false);
|
@@ -362,8 +352,8 @@ void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
|
|
362
352
|
}
|
363
353
|
|
364
354
|
// Weight-correcting R region iterator (const_iterator doesn't do the correction)
|
365
|
-
typename var_opt_sketch<T,
|
366
|
-
typename var_opt_sketch<T,
|
355
|
+
typename var_opt_sketch<T, A>::iterator r_itr(sketch, false, true);
|
356
|
+
typename var_opt_sketch<T, A>::iterator r_end(sketch, true, true);
|
367
357
|
while (r_itr != r_end) {
|
368
358
|
std::pair<const T&, const double> sample = *r_itr;
|
369
359
|
gadget_.update(sample.first, sample.second, true);
|
@@ -371,8 +361,8 @@ void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
|
|
371
361
|
}
|
372
362
|
}
|
373
363
|
|
374
|
-
template<typename T, typename
|
375
|
-
void var_opt_union<T,
|
364
|
+
template<typename T, typename A>
|
365
|
+
void var_opt_union<T, A>::merge_items(var_opt_sketch<T, A>&& sketch) {
|
376
366
|
if (sketch.n_ == 0) {
|
377
367
|
return;
|
378
368
|
}
|
@@ -380,8 +370,8 @@ void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
|
|
380
370
|
n_ += sketch.n_;
|
381
371
|
|
382
372
|
// H region iterator
|
383
|
-
typename var_opt_sketch<T,
|
384
|
-
typename var_opt_sketch<T,
|
373
|
+
typename var_opt_sketch<T, A>::iterator h_itr(sketch, false, false);
|
374
|
+
typename var_opt_sketch<T, A>::iterator h_end(sketch, true, false);
|
385
375
|
while (h_itr != h_end) {
|
386
376
|
std::pair<T&, double> sample = *h_itr;
|
387
377
|
gadget_.update(std::move(sample.first), sample.second, false);
|
@@ -389,8 +379,8 @@ void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
|
|
389
379
|
}
|
390
380
|
|
391
381
|
// Weight-correcting R region iterator
|
392
|
-
typename var_opt_sketch<T,
|
393
|
-
typename var_opt_sketch<T,
|
382
|
+
typename var_opt_sketch<T, A>::iterator r_itr(sketch, false, true);
|
383
|
+
typename var_opt_sketch<T, A>::iterator r_end(sketch, true, true);
|
394
384
|
while (r_itr != r_end) {
|
395
385
|
std::pair<T&, double> sample = *r_itr;
|
396
386
|
gadget_.update(std::move(sample.first), sample.second, true);
|
@@ -398,8 +388,8 @@ void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
|
|
398
388
|
}
|
399
389
|
}
|
400
390
|
|
401
|
-
template<typename T, typename
|
402
|
-
void var_opt_union<T,
|
391
|
+
template<typename T, typename A>
|
392
|
+
void var_opt_union<T, A>::resolve_tau(const var_opt_sketch<T, A>& sketch) {
|
403
393
|
if (sketch.r_ > 0) {
|
404
394
|
const double sketch_tau = sketch.get_tau();
|
405
395
|
const double outer_tau = get_outer_tau();
|
@@ -425,8 +415,8 @@ void var_opt_union<T,S,A>::resolve_tau(const var_opt_sketch<T,S,A>& sketch) {
|
|
425
415
|
}
|
426
416
|
}
|
427
417
|
|
428
|
-
template<typename T, typename
|
429
|
-
var_opt_sketch<T,
|
418
|
+
template<typename T, typename A>
|
419
|
+
var_opt_sketch<T, A> var_opt_union<T, A>::get_result() const {
|
430
420
|
// If no marked items in H, gadget is already valid mathematically. We can return what is
|
431
421
|
// basically just a copy of the gadget.
|
432
422
|
if (gadget_.num_marks_in_h_ == 0) {
|
@@ -435,7 +425,7 @@ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
|
|
435
425
|
// Copy of gadget. This may produce needless copying in the
|
436
426
|
// pseudo-exact case below, but should simplify the code without
|
437
427
|
// needing to make the gadget a pointer
|
438
|
-
var_opt_sketch<T,
|
428
|
+
var_opt_sketch<T, A> gcopy(gadget_, false, n_);
|
439
429
|
|
440
430
|
// At this point, we know that marked items are present in H. So:
|
441
431
|
// 1. Result will necessarily be in estimation mode
|
@@ -456,15 +446,15 @@ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
|
|
456
446
|
*
|
457
447
|
* @return A shallow copy of the gadget as valid varopt sketch
|
458
448
|
*/
|
459
|
-
template<typename T, typename
|
460
|
-
var_opt_sketch<T,
|
449
|
+
template<typename T, typename A>
|
450
|
+
var_opt_sketch<T, A> var_opt_union<T, A>::simple_gadget_coercer() const {
|
461
451
|
if (gadget_.num_marks_in_h_ != 0) throw std::logic_error("simple gadget coercer only applies if no marks");
|
462
|
-
return var_opt_sketch<T,
|
452
|
+
return var_opt_sketch<T, A>(gadget_, true, n_);
|
463
453
|
}
|
464
454
|
|
465
455
|
// this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
|
466
|
-
template<typename T, typename
|
467
|
-
bool var_opt_union<T,
|
456
|
+
template<typename T, typename A>
|
457
|
+
bool var_opt_union<T, A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
|
468
458
|
for (uint32_t i = 0; i < gadget_.h_; ++i) {
|
469
459
|
if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
|
470
460
|
return true;
|
@@ -473,8 +463,8 @@ bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(doub
|
|
473
463
|
return false;
|
474
464
|
}
|
475
465
|
|
476
|
-
template<typename T, typename
|
477
|
-
bool var_opt_union<T,
|
466
|
+
template<typename T, typename A>
|
467
|
+
bool var_opt_union<T, A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T, A>& sk) const {
|
478
468
|
// gadget is seemingly exact
|
479
469
|
const bool condition1 = gadget_.r_ == 0;
|
480
470
|
|
@@ -510,8 +500,8 @@ bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_ske
|
|
510
500
|
*
|
511
501
|
* @param sk Copy of the gadget, modified with marked items moved to the reservoir
|
512
502
|
*/
|
513
|
-
template<typename T, typename
|
514
|
-
void var_opt_union<T,
|
503
|
+
template<typename T, typename A>
|
504
|
+
void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) const {
|
515
505
|
const uint32_t result_k = gadget_.h_ + gadget_.r_;
|
516
506
|
|
517
507
|
uint32_t result_h = 0;
|
@@ -583,8 +573,8 @@ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk)
|
|
583
573
|
}
|
584
574
|
|
585
575
|
// this is basically a continuation of get_result(), but modifying the input gadget copy
|
586
|
-
template<typename T, typename
|
587
|
-
void var_opt_union<T,
|
576
|
+
template<typename T, typename A>
|
577
|
+
void var_opt_union<T, A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T, A>& gcopy) const {
|
588
578
|
const uint32_t r_count = gcopy.r_;
|
589
579
|
const uint32_t h_count = gcopy.h_;
|
590
580
|
const uint32_t k = gcopy.k_;
|
@@ -616,8 +606,8 @@ void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T
|
|
616
606
|
gcopy.strip_marks();
|
617
607
|
}
|
618
608
|
|
619
|
-
template<typename T, typename
|
620
|
-
void var_opt_union<T,
|
609
|
+
template<typename T, typename A>
|
610
|
+
void var_opt_union<T, A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
|
621
611
|
bool is_empty(flags & EMPTY_FLAG_MASK);
|
622
612
|
|
623
613
|
if (is_empty) {
|
@@ -635,8 +625,8 @@ void var_opt_union<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t
|
|
635
625
|
}
|
636
626
|
}
|
637
627
|
|
638
|
-
template<typename T, typename
|
639
|
-
void var_opt_union<T,
|
628
|
+
template<typename T, typename A>
|
629
|
+
void var_opt_union<T, A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
|
640
630
|
if (family_id == FAMILY_ID) {
|
641
631
|
if (ser_ver != SER_VER) {
|
642
632
|
throw std::invalid_argument("Possible corruption: VarOpt Union serialization version must be "
|
@@ -28,8 +28,8 @@
|
|
28
28
|
|
29
29
|
namespace datasketches {
|
30
30
|
|
31
|
-
using var_opt_test_sketch = var_opt_sketch<test_type,
|
32
|
-
using var_opt_test_union = var_opt_union<test_type,
|
31
|
+
using var_opt_test_sketch = var_opt_sketch<test_type, test_allocator<test_type>>;
|
32
|
+
using var_opt_test_union = var_opt_union<test_type, test_allocator<test_type>>;
|
33
33
|
using alloc = test_allocator<test_type>;
|
34
34
|
|
35
35
|
TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
@@ -38,19 +38,19 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
|
38
38
|
{
|
39
39
|
var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
40
40
|
for (int i = 0; i < 100; ++i) sk1.update(i);
|
41
|
-
auto bytes1 = sk1.serialize();
|
41
|
+
auto bytes1 = sk1.serialize(0, test_type_serde());
|
42
42
|
auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
|
43
43
|
|
44
44
|
std::stringstream ss;
|
45
|
-
sk1.serialize(ss);
|
46
|
-
auto sk3 = var_opt_test_sketch::deserialize(ss, alloc(0));
|
45
|
+
sk1.serialize(ss, test_type_serde());
|
46
|
+
auto sk3 = var_opt_test_sketch::deserialize(ss, test_type_serde(), alloc(0));
|
47
47
|
|
48
48
|
var_opt_test_union u1(10, 0);
|
49
49
|
u1.update(sk1);
|
50
50
|
u1.update(sk2);
|
51
51
|
u1.update(sk3);
|
52
52
|
|
53
|
-
auto bytes2 = u1.serialize();
|
53
|
+
auto bytes2 = u1.serialize(0, test_type_serde());
|
54
54
|
auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
|
55
55
|
}
|
56
56
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -47,8 +47,8 @@ static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
47
47
|
return sk;
|
48
48
|
}
|
49
49
|
|
50
|
-
template<typename T, typename S, typename A>
|
51
|
-
static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
|
50
|
+
template<typename T, typename A>
|
51
|
+
static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2) {
|
52
52
|
REQUIRE(sk1.get_k() == sk2.get_k());
|
53
53
|
REQUIRE(sk1.get_n() == sk2.get_n());
|
54
54
|
REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
|
@@ -49,8 +49,8 @@ static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
49
49
|
|
50
50
|
// if exact_compare = false, checks for equivalence -- specific R region values may differ but
|
51
51
|
// R region weights must match
|
52
|
-
template<typename T, typename S, typename A>
|
53
|
-
static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2, bool exact_compare = true) {
|
52
|
+
template<typename T, typename A>
|
53
|
+
static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2, bool exact_compare = true) {
|
54
54
|
REQUIRE(sk1.get_k() == sk2.get_k());
|
55
55
|
REQUIRE(sk1.get_n() == sk2.get_n());
|
56
56
|
REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
|
@@ -78,8 +78,8 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
|
|
78
78
|
// ensure that the resulting binary images are compatible.
|
79
79
|
// if exact_compare = false, checks for equivalence -- specific R region values may differ but
|
80
80
|
// R region weights must match
|
81
|
-
template<typename T, typename S, typename A>
|
82
|
-
static void compare_serialization_deserialization(var_opt_union<T,S,A>& vo_union, bool exact_compare = true) {
|
81
|
+
template<typename T, typename A>
|
82
|
+
static void compare_serialization_deserialization(var_opt_union<T,A>& vo_union, bool exact_compare = true) {
|
83
83
|
std::vector<uint8_t> bytes = vo_union.serialize();
|
84
84
|
|
85
85
|
var_opt_union<T> u_from_bytes = var_opt_union<T>::deserialize(bytes.data(), bytes.size());
|
@@ -22,6 +22,8 @@ import os
|
|
22
22
|
import sys
|
23
23
|
import platform
|
24
24
|
import subprocess
|
25
|
+
import re
|
26
|
+
from datetime import datetime, timezone
|
25
27
|
|
26
28
|
from setuptools import setup, find_packages, Extension
|
27
29
|
from setuptools.command.build_ext import build_ext
|
@@ -78,9 +80,19 @@ class CMakeBuild(build_ext):
|
|
78
80
|
cwd=self.build_temp, env=env)
|
79
81
|
print() # add an empty line to pretty print
|
80
82
|
|
83
|
+
# Read and parse the version format
|
84
|
+
# @DT@ -> datestamp
|
85
|
+
# @HHMM@ -> .devHHMM to indicate development version
|
86
|
+
# Releases should have a fixed version with no @ variables
|
87
|
+
with open('version.cfg.in', 'r') as file:
|
88
|
+
ds_version = file.read().rstrip()
|
89
|
+
dt = datetime.now(timezone.utc)
|
90
|
+
ds_version = re.sub('@DT@', dt.strftime('%Y%m%d'), ds_version)
|
91
|
+
ds_version = re.sub('@HHMM@', 'dev' + dt.strftime('%H%M'), ds_version)
|
92
|
+
|
81
93
|
setup(
|
82
94
|
name='datasketches',
|
83
|
-
version=
|
95
|
+
version=ds_version,
|
84
96
|
author='Apache Software Foundation',
|
85
97
|
author_email='dev@datasketches.apache.org',
|
86
98
|
description='The Apache DataSketches Library for Python',
|
@@ -88,7 +100,7 @@ setup(
|
|
88
100
|
url='http://datasketches.apache.org',
|
89
101
|
long_description=open('python/README.md').read(),
|
90
102
|
long_description_content_type='text/markdown',
|
91
|
-
packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the.so
|
103
|
+
packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the .so
|
92
104
|
package_dir={'':'python'},
|
93
105
|
# may need to add all source paths for sdist packages w/o MANIFEST.in
|
94
106
|
ext_modules=[CMakeExtension('datasketches')],
|