datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -30,40 +30,42 @@
|
|
|
30
30
|
namespace datasketches {
|
|
31
31
|
|
|
32
32
|
// clang++ seems to require this declaration for CMAKE_BUILD_TYPE='Debug"
|
|
33
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
34
|
-
const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
|
|
33
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
34
|
+
const uint8_t frequent_items_sketch<T, W, H, E, A>::LG_MIN_MAP_SIZE;
|
|
35
35
|
|
|
36
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
37
|
-
frequent_items_sketch<T, W, H, E,
|
|
36
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
37
|
+
frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size,
|
|
38
|
+
const E& equal, const A& allocator):
|
|
38
39
|
total_weight(0),
|
|
39
40
|
offset(0),
|
|
40
41
|
map(
|
|
41
42
|
std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
|
|
42
43
|
std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
|
|
44
|
+
equal,
|
|
43
45
|
allocator
|
|
44
46
|
)
|
|
45
47
|
{
|
|
46
48
|
if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
|
|
47
49
|
}
|
|
48
50
|
|
|
49
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
50
|
-
void frequent_items_sketch<T, W, H, E,
|
|
51
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
52
|
+
void frequent_items_sketch<T, W, H, E, A>::update(const T& item, W weight) {
|
|
51
53
|
check_weight(weight);
|
|
52
54
|
if (weight == 0) return;
|
|
53
55
|
total_weight += weight;
|
|
54
56
|
offset += map.adjust_or_insert(item, weight);
|
|
55
57
|
}
|
|
56
58
|
|
|
57
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
58
|
-
void frequent_items_sketch<T, W, H, E,
|
|
59
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
60
|
+
void frequent_items_sketch<T, W, H, E, A>::update(T&& item, W weight) {
|
|
59
61
|
check_weight(weight);
|
|
60
62
|
if (weight == 0) return;
|
|
61
63
|
total_weight += weight;
|
|
62
64
|
offset += map.adjust_or_insert(std::move(item), weight);
|
|
63
65
|
}
|
|
64
66
|
|
|
65
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
66
|
-
void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
|
|
67
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
68
|
+
void frequent_items_sketch<T, W, H, E, A>::merge(const frequent_items_sketch& other) {
|
|
67
69
|
if (other.is_empty()) return;
|
|
68
70
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
69
71
|
for (auto it: other.map) {
|
|
@@ -73,8 +75,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch&
|
|
|
73
75
|
total_weight = merged_total_weight;
|
|
74
76
|
}
|
|
75
77
|
|
|
76
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
77
|
-
void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
|
|
78
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
79
|
+
void frequent_items_sketch<T, W, H, E, A>::merge(frequent_items_sketch&& other) {
|
|
78
80
|
if (other.is_empty()) return;
|
|
79
81
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
80
82
|
for (auto it: other.map) {
|
|
@@ -84,69 +86,67 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& othe
|
|
|
84
86
|
total_weight = merged_total_weight;
|
|
85
87
|
}
|
|
86
88
|
|
|
87
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
88
|
-
bool frequent_items_sketch<T, W, H, E,
|
|
89
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
90
|
+
bool frequent_items_sketch<T, W, H, E, A>::is_empty() const {
|
|
89
91
|
return map.get_num_active() == 0;
|
|
90
92
|
}
|
|
91
93
|
|
|
92
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
93
|
-
uint32_t frequent_items_sketch<T, W, H, E,
|
|
94
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
95
|
+
uint32_t frequent_items_sketch<T, W, H, E, A>::get_num_active_items() const {
|
|
94
96
|
return map.get_num_active();
|
|
95
97
|
}
|
|
96
98
|
|
|
97
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
98
|
-
W frequent_items_sketch<T, W, H, E,
|
|
99
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
100
|
+
W frequent_items_sketch<T, W, H, E, A>::get_total_weight() const {
|
|
99
101
|
return total_weight;
|
|
100
102
|
}
|
|
101
103
|
|
|
102
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
103
|
-
W frequent_items_sketch<T, W, H, E,
|
|
104
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
105
|
+
W frequent_items_sketch<T, W, H, E, A>::get_estimate(const T& item) const {
|
|
104
106
|
// if item is tracked estimate = weight + offset, otherwise 0
|
|
105
107
|
const W weight = map.get(item);
|
|
106
108
|
if (weight > 0) return weight + offset;
|
|
107
109
|
return 0;
|
|
108
110
|
}
|
|
109
111
|
|
|
110
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
111
|
-
W frequent_items_sketch<T, W, H, E,
|
|
112
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
113
|
+
W frequent_items_sketch<T, W, H, E, A>::get_lower_bound(const T& item) const {
|
|
112
114
|
return map.get(item);
|
|
113
115
|
}
|
|
114
116
|
|
|
115
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
116
|
-
W frequent_items_sketch<T, W, H, E,
|
|
117
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
118
|
+
W frequent_items_sketch<T, W, H, E, A>::get_upper_bound(const T& item) const {
|
|
117
119
|
return map.get(item) + offset;
|
|
118
120
|
}
|
|
119
121
|
|
|
120
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
121
|
-
W frequent_items_sketch<T, W, H, E,
|
|
122
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
123
|
+
W frequent_items_sketch<T, W, H, E, A>::get_maximum_error() const {
|
|
122
124
|
return offset;
|
|
123
125
|
}
|
|
124
126
|
|
|
125
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
126
|
-
double frequent_items_sketch<T, W, H, E,
|
|
127
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
128
|
+
double frequent_items_sketch<T, W, H, E, A>::get_epsilon() const {
|
|
127
129
|
return EPSILON_FACTOR / (1 << map.get_lg_max_size());
|
|
128
130
|
}
|
|
129
131
|
|
|
130
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
131
|
-
double frequent_items_sketch<T, W, H, E,
|
|
132
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
133
|
+
double frequent_items_sketch<T, W, H, E, A>::get_epsilon(uint8_t lg_max_map_size) {
|
|
132
134
|
return EPSILON_FACTOR / (1 << lg_max_map_size);
|
|
133
135
|
}
|
|
134
136
|
|
|
135
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
136
|
-
double frequent_items_sketch<T, W, H, E,
|
|
137
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
138
|
+
double frequent_items_sketch<T, W, H, E, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
|
|
137
139
|
return get_epsilon(lg_max_map_size) * estimated_total_weight;
|
|
138
140
|
}
|
|
139
141
|
|
|
140
142
|
|
|
141
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
142
|
-
|
|
143
|
-
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type) const {
|
|
143
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
144
|
+
auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type) const -> vector_row {
|
|
144
145
|
return get_frequent_items(err_type, get_maximum_error());
|
|
145
146
|
}
|
|
146
147
|
|
|
147
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
148
|
-
|
|
149
|
-
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
|
|
148
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
149
|
+
auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const -> vector_row {
|
|
150
150
|
vector_row items(map.get_allocator());
|
|
151
151
|
for (auto it: map) {
|
|
152
152
|
const W lb = it.second;
|
|
@@ -160,9 +160,9 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
|
|
|
160
160
|
return items;
|
|
161
161
|
}
|
|
162
162
|
|
|
163
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
163
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
164
164
|
template<typename SerDe>
|
|
165
|
-
void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
165
|
+
void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
166
166
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
|
167
167
|
write(os, preamble_longs);
|
|
168
168
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
@@ -206,18 +206,18 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const
|
|
|
206
206
|
}
|
|
207
207
|
}
|
|
208
208
|
|
|
209
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
209
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
210
210
|
template<typename SerDe>
|
|
211
|
-
size_t frequent_items_sketch<T, W, H, E,
|
|
211
|
+
size_t frequent_items_sketch<T, W, H, E, A>::get_serialized_size_bytes(const SerDe& sd) const {
|
|
212
212
|
if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
|
|
213
213
|
size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
|
|
214
214
|
for (auto it: map) size += sd.size_of_item(it.first);
|
|
215
215
|
return size;
|
|
216
216
|
}
|
|
217
217
|
|
|
218
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
218
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
219
219
|
template<typename SerDe>
|
|
220
|
-
auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
|
|
220
|
+
auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
|
|
221
221
|
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
|
222
222
|
vector_bytes bytes(size, 0, map.get_allocator());
|
|
223
223
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
@@ -266,8 +266,8 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
|
266
266
|
return bytes;
|
|
267
267
|
}
|
|
268
268
|
|
|
269
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
270
|
-
class frequent_items_sketch<T, W, H, E,
|
|
269
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
270
|
+
class frequent_items_sketch<T, W, H, E, A>::items_deleter {
|
|
271
271
|
public:
|
|
272
272
|
items_deleter(uint32_t num, bool destroy, const A& allocator):
|
|
273
273
|
allocator_(allocator), num_(num), destroy_(destroy) {}
|
|
@@ -286,14 +286,10 @@ private:
|
|
|
286
286
|
bool destroy_;
|
|
287
287
|
};
|
|
288
288
|
|
|
289
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
290
|
-
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
291
|
-
return deserialize(is, S(), allocator);
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
289
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
295
290
|
template<typename SerDe>
|
|
296
|
-
frequent_items_sketch<T, W, H, E,
|
|
291
|
+
frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is,
|
|
292
|
+
const SerDe& sd, const E& equal, const A& allocator) {
|
|
297
293
|
const auto preamble_longs = read<uint8_t>(is);
|
|
298
294
|
const auto serial_version = read<uint8_t>(is);
|
|
299
295
|
const auto family_id = read<uint8_t>(is);
|
|
@@ -309,7 +305,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
309
305
|
check_family_id(family_id);
|
|
310
306
|
check_size(lg_cur_size, lg_max_size);
|
|
311
307
|
|
|
312
|
-
frequent_items_sketch
|
|
308
|
+
frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
|
|
313
309
|
if (!is_empty) {
|
|
314
310
|
const auto num_items = read<uint32_t>(is);
|
|
315
311
|
read<uint32_t>(is); // unused
|
|
@@ -335,14 +331,10 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
335
331
|
return sketch;
|
|
336
332
|
}
|
|
337
333
|
|
|
338
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
339
|
-
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
340
|
-
return deserialize(bytes, size, S(), allocator);
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
334
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
344
335
|
template<typename SerDe>
|
|
345
|
-
frequent_items_sketch<T, W, H, E,
|
|
336
|
+
frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size,
|
|
337
|
+
const SerDe& sd, const E& equal, const A& allocator) {
|
|
346
338
|
ensure_minimum_memory(size, 8);
|
|
347
339
|
const char* ptr = static_cast<const char*>(bytes);
|
|
348
340
|
const char* base = static_cast<const char*>(bytes);
|
|
@@ -368,7 +360,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
368
360
|
check_size(lg_cur_size, lg_max_size);
|
|
369
361
|
ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
|
|
370
362
|
|
|
371
|
-
frequent_items_sketch
|
|
363
|
+
frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
|
|
372
364
|
if (!is_empty) {
|
|
373
365
|
uint32_t num_items;
|
|
374
366
|
ptr += copy_from_mem(ptr, num_items);
|
|
@@ -398,8 +390,8 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
398
390
|
return sketch;
|
|
399
391
|
}
|
|
400
392
|
|
|
401
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
402
|
-
void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
|
|
393
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
394
|
+
void frequent_items_sketch<T, W, H, E, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
|
|
403
395
|
if (is_empty) {
|
|
404
396
|
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
|
|
405
397
|
throw std::invalid_argument("Possible corruption: preamble longs of an empty sketch must be " + std::to_string(PREAMBLE_LONGS_EMPTY) + ": " + std::to_string(preamble_longs));
|
|
@@ -411,22 +403,22 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t pream
|
|
|
411
403
|
}
|
|
412
404
|
}
|
|
413
405
|
|
|
414
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
415
|
-
void frequent_items_sketch<T, W, H, E,
|
|
406
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
407
|
+
void frequent_items_sketch<T, W, H, E, A>::check_serial_version(uint8_t serial_version) {
|
|
416
408
|
if (serial_version != SERIAL_VERSION) {
|
|
417
409
|
throw std::invalid_argument("Possible corruption: serial version must be " + std::to_string(SERIAL_VERSION) + ": " + std::to_string(serial_version));
|
|
418
410
|
}
|
|
419
411
|
}
|
|
420
412
|
|
|
421
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
422
|
-
void frequent_items_sketch<T, W, H, E,
|
|
413
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
414
|
+
void frequent_items_sketch<T, W, H, E, A>::check_family_id(uint8_t family_id) {
|
|
423
415
|
if (family_id != FAMILY_ID) {
|
|
424
416
|
throw std::invalid_argument("Possible corruption: family ID must be " + std::to_string(FAMILY_ID) + ": " + std::to_string(family_id));
|
|
425
417
|
}
|
|
426
418
|
}
|
|
427
419
|
|
|
428
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
429
|
-
void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
|
|
420
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
421
|
+
void frequent_items_sketch<T, W, H, E, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
|
|
430
422
|
if (lg_cur_size > lg_max_size) {
|
|
431
423
|
throw std::invalid_argument("Possible corruption: expected lg_cur_size <= lg_max_size: " + std::to_string(lg_cur_size) + " <= " + std::to_string(lg_max_size));
|
|
432
424
|
}
|
|
@@ -435,8 +427,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, ui
|
|
|
435
427
|
}
|
|
436
428
|
}
|
|
437
429
|
|
|
438
|
-
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
439
|
-
string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) const {
|
|
430
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
431
|
+
string<A> frequent_items_sketch<T, W, H, E, A>::to_string(bool print_items) const {
|
|
440
432
|
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
441
433
|
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
442
434
|
std::ostringstream os;
|
|
@@ -466,23 +458,23 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
|
|
|
466
458
|
}
|
|
467
459
|
|
|
468
460
|
// version for integral signed type
|
|
469
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
461
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
470
462
|
template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type>
|
|
471
|
-
void frequent_items_sketch<T, W, H, E,
|
|
463
|
+
void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
|
|
472
464
|
if (weight < 0) {
|
|
473
465
|
throw std::invalid_argument("weight must be non-negative");
|
|
474
466
|
}
|
|
475
467
|
}
|
|
476
468
|
|
|
477
469
|
// version for integral unsigned type - no-op
|
|
478
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
470
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
479
471
|
template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type>
|
|
480
|
-
void frequent_items_sketch<T, W, H, E,
|
|
472
|
+
void frequent_items_sketch<T, W, H, E, A>::check_weight(WW) {}
|
|
481
473
|
|
|
482
474
|
// version for floating point type
|
|
483
|
-
template<typename T, typename W, typename H, typename E, typename
|
|
475
|
+
template<typename T, typename W, typename H, typename E, typename A>
|
|
484
476
|
template<typename WW, typename std::enable_if<std::is_floating_point<WW>::value, int>::type>
|
|
485
|
-
void frequent_items_sketch<T, W, H, E,
|
|
477
|
+
void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
|
|
486
478
|
if (weight < 0) {
|
|
487
479
|
throw std::invalid_argument("weight must be non-negative");
|
|
488
480
|
}
|
|
@@ -29,21 +29,27 @@ namespace datasketches {
|
|
|
29
29
|
* This is a specialized linear-probing hash map with a reverse purge operation
|
|
30
30
|
* that removes all entries in the map with values that are less than zero.
|
|
31
31
|
* Based on Java implementation here:
|
|
32
|
-
* https://github.com/
|
|
32
|
+
* https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ReversePurgeItemHashMap.java
|
|
33
33
|
* author Alexander Saydakov
|
|
34
34
|
*/
|
|
35
35
|
|
|
36
|
-
template<
|
|
36
|
+
template<
|
|
37
|
+
typename K,
|
|
38
|
+
typename V = uint64_t,
|
|
39
|
+
typename H = std::hash<K>,
|
|
40
|
+
typename E = std::equal_to<K>,
|
|
41
|
+
typename A = std::allocator<K>
|
|
42
|
+
>
|
|
37
43
|
class reverse_purge_hash_map {
|
|
38
44
|
public:
|
|
39
45
|
using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
|
|
40
46
|
using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
|
|
41
47
|
|
|
42
|
-
reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
|
|
48
|
+
reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const E& equal, const A& allocator);
|
|
43
49
|
reverse_purge_hash_map(const reverse_purge_hash_map& other);
|
|
44
50
|
reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
|
|
45
51
|
~reverse_purge_hash_map();
|
|
46
|
-
reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
|
|
52
|
+
reverse_purge_hash_map& operator=(const reverse_purge_hash_map& other);
|
|
47
53
|
reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
|
|
48
54
|
|
|
49
55
|
template<typename FwdK>
|
|
@@ -65,6 +71,7 @@ private:
|
|
|
65
71
|
static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
|
|
66
72
|
static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
|
|
67
73
|
|
|
74
|
+
E equal_;
|
|
68
75
|
A allocator_;
|
|
69
76
|
uint8_t lg_cur_size_;
|
|
70
77
|
uint8_t lg_max_size_;
|
|
@@ -34,7 +34,9 @@ template<typename K, typename V, typename H, typename E, typename A>
|
|
|
34
34
|
constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
|
|
35
35
|
|
|
36
36
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
37
|
-
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
|
|
37
|
+
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
|
|
38
|
+
const E& equal, const A& allocator):
|
|
39
|
+
equal_(equal),
|
|
38
40
|
allocator_(allocator),
|
|
39
41
|
lg_cur_size_(lg_cur_size),
|
|
40
42
|
lg_max_size_(lg_max_size),
|
|
@@ -52,6 +54,7 @@ states_(nullptr)
|
|
|
52
54
|
|
|
53
55
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
54
56
|
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
|
|
57
|
+
equal_(other.equal_),
|
|
55
58
|
allocator_(other.allocator_),
|
|
56
59
|
lg_cur_size_(other.lg_cur_size_),
|
|
57
60
|
lg_max_size_(other.lg_max_size_),
|
|
@@ -80,6 +83,7 @@ states_(nullptr)
|
|
|
80
83
|
|
|
81
84
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
82
85
|
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
|
|
86
|
+
equal_(std::move(other.equal_)),
|
|
83
87
|
allocator_(std::move(other.allocator_)),
|
|
84
88
|
lg_cur_size_(other.lg_cur_size_),
|
|
85
89
|
lg_max_size_(other.lg_max_size_),
|
|
@@ -119,19 +123,22 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
|
|
|
119
123
|
}
|
|
120
124
|
|
|
121
125
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
122
|
-
reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A
|
|
123
|
-
|
|
124
|
-
std::swap(
|
|
125
|
-
std::swap(
|
|
126
|
-
std::swap(
|
|
127
|
-
std::swap(
|
|
128
|
-
std::swap(
|
|
129
|
-
std::swap(
|
|
126
|
+
reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(const reverse_purge_hash_map<K, V, H, E, A>& other) {
|
|
127
|
+
reverse_purge_hash_map copy(other);
|
|
128
|
+
std::swap(equal_, copy.equal_);
|
|
129
|
+
std::swap(allocator_, copy.allocator_);
|
|
130
|
+
std::swap(lg_cur_size_, copy.lg_cur_size_);
|
|
131
|
+
std::swap(lg_max_size_, copy.lg_max_size_);
|
|
132
|
+
std::swap(num_active_, copy.num_active_);
|
|
133
|
+
std::swap(keys_, copy.keys_);
|
|
134
|
+
std::swap(values_, copy.values_);
|
|
135
|
+
std::swap(states_, copy.states_);
|
|
130
136
|
return *this;
|
|
131
137
|
}
|
|
132
138
|
|
|
133
139
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
134
140
|
reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
|
|
141
|
+
std::swap(equal_, other.equal_);
|
|
135
142
|
std::swap(allocator_, other.allocator_);
|
|
136
143
|
std::swap(lg_cur_size_, other.lg_cur_size_);
|
|
137
144
|
std::swap(lg_max_size_, other.lg_max_size_);
|
|
@@ -27,62 +27,75 @@
|
|
|
27
27
|
|
|
28
28
|
namespace datasketches {
|
|
29
29
|
|
|
30
|
-
using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal,
|
|
30
|
+
using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal, test_allocator<test_type>>;
|
|
31
31
|
using alloc = test_allocator<test_type>;
|
|
32
32
|
|
|
33
33
|
TEST_CASE("frequent items: custom type", "[frequent_items_sketch]") {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
34
|
+
test_allocator_total_bytes = 0;
|
|
35
|
+
{
|
|
36
|
+
frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
|
|
37
|
+
sketch.update(1, 10); // should survive the purge
|
|
38
|
+
sketch.update(2);
|
|
39
|
+
sketch.update(3);
|
|
40
|
+
sketch.update(4);
|
|
41
|
+
sketch.update(5);
|
|
42
|
+
sketch.update(6);
|
|
43
|
+
sketch.update(7);
|
|
44
|
+
test_type a8(8);
|
|
45
|
+
sketch.update(a8);
|
|
46
|
+
REQUIRE_FALSE(sketch.is_empty());
|
|
47
|
+
REQUIRE(sketch.get_total_weight() == 17);
|
|
48
|
+
REQUIRE(sketch.get_estimate(1) == 10);
|
|
47
49
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
|
|
51
|
+
REQUIRE(items.size() == 1); // only 1 item should be above threshold
|
|
52
|
+
REQUIRE(items[0].get_item().get_value() == 1);
|
|
53
|
+
REQUIRE(items[0].get_estimate() == 10);
|
|
52
54
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
55
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
56
|
+
sketch.serialize(s, test_type_serde());
|
|
57
|
+
auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), test_type_equal(), 0);
|
|
58
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
|
59
|
+
REQUIRE(sketch2.get_total_weight() == 17);
|
|
60
|
+
REQUIRE(sketch2.get_estimate(1) == 10);
|
|
61
|
+
REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
|
|
62
|
+
REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
|
|
61
63
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
64
|
+
auto bytes = sketch.serialize(0, test_type_serde());
|
|
65
|
+
auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(),
|
|
66
|
+
test_type_equal(), 0);
|
|
67
|
+
REQUIRE_FALSE(sketch3.is_empty());
|
|
68
|
+
REQUIRE(sketch3.get_total_weight() == 17);
|
|
69
|
+
REQUIRE(sketch3.get_estimate(1) == 10);
|
|
70
|
+
REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
|
|
71
|
+
REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
|
|
72
|
+
}
|
|
73
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
69
74
|
}
|
|
70
75
|
|
|
71
76
|
// this is to see the debug print from test_type if enabled there to make sure items are moved
|
|
72
77
|
TEST_CASE("frequent items: moving merge", "[frequent_items_sketch]") {
|
|
73
|
-
|
|
74
|
-
|
|
78
|
+
test_allocator_total_bytes = 0;
|
|
79
|
+
{
|
|
80
|
+
frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
|
|
81
|
+
sketch1.update(1);
|
|
75
82
|
|
|
76
|
-
|
|
77
|
-
|
|
83
|
+
frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
|
|
84
|
+
sketch2.update(2);
|
|
78
85
|
|
|
79
|
-
|
|
80
|
-
|
|
86
|
+
sketch2.merge(std::move(sketch1));
|
|
87
|
+
REQUIRE(sketch2.get_total_weight() == 2);
|
|
88
|
+
}
|
|
89
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
81
90
|
}
|
|
82
91
|
|
|
83
92
|
TEST_CASE("frequent items: negative weight", "[frequent_items_sketch]") {
|
|
84
|
-
|
|
85
|
-
|
|
93
|
+
test_allocator_total_bytes = 0;
|
|
94
|
+
{
|
|
95
|
+
frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
|
|
96
|
+
REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
|
|
97
|
+
}
|
|
98
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
86
99
|
}
|
|
87
100
|
|
|
88
101
|
} /* namespace datasketches */
|
|
@@ -24,20 +24,20 @@
|
|
|
24
24
|
namespace datasketches {
|
|
25
25
|
|
|
26
26
|
TEST_CASE("reverse purge hash map: empty", "[frequent_items_sketch]") {
|
|
27
|
-
reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
|
|
27
|
+
reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
|
|
28
28
|
REQUIRE(map.get_num_active() == 0);
|
|
29
29
|
REQUIRE(map.get_lg_cur_size() == 3); // static_cast<uint8_t>(3)
|
|
30
30
|
}
|
|
31
31
|
|
|
32
32
|
TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
|
|
33
|
-
reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
|
|
33
|
+
reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
|
|
34
34
|
map.adjust_or_insert(1, 1);
|
|
35
35
|
REQUIRE(map.get_num_active() == 1);
|
|
36
36
|
REQUIRE(map.get(1) == 1);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
|
|
40
|
-
reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
|
|
40
|
+
reverse_purge_hash_map<int> map(3, 4, std::equal_to<int>(), std::allocator<int>());
|
|
41
41
|
for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
|
|
42
42
|
uint64_t sum = 0;
|
|
43
43
|
for (auto it: map) sum += it.second;
|
|
@@ -267,10 +267,10 @@ void Hll4Array<A>::shiftToBiggerCurMin() {
|
|
|
267
267
|
for (const auto coupon: *auxHashMap_) {
|
|
268
268
|
slotNum = HllUtil<A>::getLow26(coupon) & configKmask;
|
|
269
269
|
oldActualVal = HllUtil<A>::getValue(coupon);
|
|
270
|
-
|
|
271
|
-
if (newShiftedVal < 0) {
|
|
270
|
+
if (oldActualVal < newCurMin) {
|
|
272
271
|
throw std::logic_error("oldActualVal < newCurMin when incrementing curMin");
|
|
273
272
|
}
|
|
273
|
+
newShiftedVal = oldActualVal - newCurMin;
|
|
274
274
|
|
|
275
275
|
if (getSlot(slotNum) != hll_constants::AUX_TOKEN) {
|
|
276
276
|
throw std::logic_error("getSlot(slotNum) != AUX_TOKEN for item in auxiliary hash map");
|