datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -1,481 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
-
* or more contributor license agreements. See the NOTICE file
|
|
4
|
-
* distributed with this work for additional information
|
|
5
|
-
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
-
* to you under the Apache License, Version 2.0 (the
|
|
7
|
-
* "License"); you may not use this file except in compliance
|
|
8
|
-
* with the License. You may obtain a copy of the License at
|
|
9
|
-
*
|
|
10
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
-
*
|
|
12
|
-
* Unless required by applicable law or agreed to in writing,
|
|
13
|
-
* software distributed under the License is distributed on an
|
|
14
|
-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
-
* KIND, either express or implied. See the License for the
|
|
16
|
-
* specific language governing permissions and limitations
|
|
17
|
-
* under the License.
|
|
18
|
-
*/
|
|
19
|
-
|
|
20
|
-
#include <sstream>
|
|
21
|
-
|
|
22
|
-
#include "serde.hpp"
|
|
23
|
-
#include "binomial_bounds.hpp"
|
|
24
|
-
#include "theta_helpers.hpp"
|
|
25
|
-
|
|
26
|
-
namespace datasketches {
|
|
27
|
-
|
|
28
|
-
template<typename A>
|
|
29
|
-
bool theta_sketch_experimental<A>::is_estimation_mode() const {
|
|
30
|
-
return get_theta64() < theta_constants::MAX_THETA && !is_empty();
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
template<typename A>
|
|
34
|
-
double theta_sketch_experimental<A>::get_theta() const {
|
|
35
|
-
return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
template<typename A>
|
|
39
|
-
double theta_sketch_experimental<A>::get_estimate() const {
|
|
40
|
-
return get_num_retained() / get_theta();
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
template<typename A>
|
|
44
|
-
double theta_sketch_experimental<A>::get_lower_bound(uint8_t num_std_devs) const {
|
|
45
|
-
if (!is_estimation_mode()) return get_num_retained();
|
|
46
|
-
return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
template<typename A>
|
|
50
|
-
double theta_sketch_experimental<A>::get_upper_bound(uint8_t num_std_devs) const {
|
|
51
|
-
if (!is_estimation_mode()) return get_num_retained();
|
|
52
|
-
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
template<typename A>
|
|
56
|
-
string<A> theta_sketch_experimental<A>::to_string(bool detail) const {
|
|
57
|
-
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
58
|
-
os << "### Theta sketch summary:" << std::endl;
|
|
59
|
-
os << " num retained entries : " << get_num_retained() << std::endl;
|
|
60
|
-
os << " seed hash : " << get_seed_hash() << std::endl;
|
|
61
|
-
os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
|
|
62
|
-
os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
|
|
63
|
-
os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
|
64
|
-
os << " theta (fraction) : " << get_theta() << std::endl;
|
|
65
|
-
os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
|
|
66
|
-
os << " estimate : " << this->get_estimate() << std::endl;
|
|
67
|
-
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
68
|
-
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
69
|
-
print_specifics(os);
|
|
70
|
-
os << "### End sketch summary" << std::endl;
|
|
71
|
-
if (detail) {
|
|
72
|
-
os << "### Retained entries" << std::endl;
|
|
73
|
-
for (const auto& hash: *this) {
|
|
74
|
-
os << hash << std::endl;
|
|
75
|
-
}
|
|
76
|
-
os << "### End retained entries" << std::endl;
|
|
77
|
-
}
|
|
78
|
-
return os.str();
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// update sketch
|
|
82
|
-
|
|
83
|
-
template<typename A>
|
|
84
|
-
update_theta_sketch_experimental<A>::update_theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
85
|
-
uint64_t theta, uint64_t seed, const A& allocator):
|
|
86
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
|
|
87
|
-
{}
|
|
88
|
-
|
|
89
|
-
template<typename A>
|
|
90
|
-
A update_theta_sketch_experimental<A>::get_allocator() const {
|
|
91
|
-
return table_.allocator_;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
template<typename A>
|
|
95
|
-
bool update_theta_sketch_experimental<A>::is_empty() const {
|
|
96
|
-
return table_.is_empty_;
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
template<typename A>
|
|
100
|
-
bool update_theta_sketch_experimental<A>::is_ordered() const {
|
|
101
|
-
return false;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
template<typename A>
|
|
105
|
-
uint64_t update_theta_sketch_experimental<A>::get_theta64() const {
|
|
106
|
-
return table_.theta_;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
template<typename A>
|
|
110
|
-
uint32_t update_theta_sketch_experimental<A>::get_num_retained() const {
|
|
111
|
-
return table_.num_entries_;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
template<typename A>
|
|
115
|
-
uint16_t update_theta_sketch_experimental<A>::get_seed_hash() const {
|
|
116
|
-
return compute_seed_hash(table_.seed_);
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
template<typename A>
|
|
120
|
-
uint8_t update_theta_sketch_experimental<A>::get_lg_k() const {
|
|
121
|
-
return table_.lg_nom_size_;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
template<typename A>
|
|
125
|
-
auto update_theta_sketch_experimental<A>::get_rf() const -> resize_factor {
|
|
126
|
-
return table_.rf_;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
template<typename A>
|
|
130
|
-
void update_theta_sketch_experimental<A>::update(uint64_t value) {
|
|
131
|
-
update(&value, sizeof(value));
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
template<typename A>
|
|
135
|
-
void update_theta_sketch_experimental<A>::update(int64_t value) {
|
|
136
|
-
update(&value, sizeof(value));
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
template<typename A>
|
|
140
|
-
void update_theta_sketch_experimental<A>::update(uint32_t value) {
|
|
141
|
-
update(static_cast<int32_t>(value));
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
template<typename A>
|
|
145
|
-
void update_theta_sketch_experimental<A>::update(int32_t value) {
|
|
146
|
-
update(static_cast<int64_t>(value));
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
template<typename A>
|
|
150
|
-
void update_theta_sketch_experimental<A>::update(uint16_t value) {
|
|
151
|
-
update(static_cast<int16_t>(value));
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
template<typename A>
|
|
155
|
-
void update_theta_sketch_experimental<A>::update(int16_t value) {
|
|
156
|
-
update(static_cast<int64_t>(value));
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
template<typename A>
|
|
160
|
-
void update_theta_sketch_experimental<A>::update(uint8_t value) {
|
|
161
|
-
update(static_cast<int8_t>(value));
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
template<typename A>
|
|
165
|
-
void update_theta_sketch_experimental<A>::update(int8_t value) {
|
|
166
|
-
update(static_cast<int64_t>(value));
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
template<typename A>
|
|
170
|
-
void update_theta_sketch_experimental<A>::update(double value) {
|
|
171
|
-
update(canonical_double(value));
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
template<typename A>
|
|
175
|
-
void update_theta_sketch_experimental<A>::update(float value) {
|
|
176
|
-
update(static_cast<double>(value));
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
template<typename A>
|
|
180
|
-
void update_theta_sketch_experimental<A>::update(const std::string& value) {
|
|
181
|
-
if (value.empty()) return;
|
|
182
|
-
update(value.c_str(), value.length());
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
template<typename A>
|
|
186
|
-
void update_theta_sketch_experimental<A>::update(const void* data, size_t length) {
|
|
187
|
-
const uint64_t hash = table_.hash_and_screen(data, length);
|
|
188
|
-
if (hash == 0) return;
|
|
189
|
-
auto result = table_.find(hash);
|
|
190
|
-
if (!result.second) {
|
|
191
|
-
table_.insert(result.first, hash);
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
template<typename A>
|
|
196
|
-
void update_theta_sketch_experimental<A>::trim() {
|
|
197
|
-
table_.trim();
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
template<typename A>
|
|
201
|
-
auto update_theta_sketch_experimental<A>::begin() -> iterator {
|
|
202
|
-
return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
template<typename A>
|
|
206
|
-
auto update_theta_sketch_experimental<A>::end() -> iterator {
|
|
207
|
-
return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
template<typename A>
|
|
211
|
-
auto update_theta_sketch_experimental<A>::begin() const -> const_iterator {
|
|
212
|
-
return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
template<typename A>
|
|
216
|
-
auto update_theta_sketch_experimental<A>::end() const -> const_iterator {
|
|
217
|
-
return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
|
|
218
|
-
}
|
|
219
|
-
template<typename A>
|
|
220
|
-
compact_theta_sketch_experimental<A> update_theta_sketch_experimental<A>::compact(bool ordered) const {
|
|
221
|
-
return compact_theta_sketch_experimental<A>(*this, ordered);
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
template<typename A>
|
|
225
|
-
void update_theta_sketch_experimental<A>::print_specifics(std::ostringstream& os) const {
|
|
226
|
-
os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
|
|
227
|
-
os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
|
|
228
|
-
os << " resize factor : " << (1 << table_.rf_) << std::endl;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
// builder
|
|
232
|
-
|
|
233
|
-
template<typename A>
|
|
234
|
-
update_theta_sketch_experimental<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
|
235
|
-
|
|
236
|
-
template<typename A>
|
|
237
|
-
update_theta_sketch_experimental<A> update_theta_sketch_experimental<A>::builder::build() const {
|
|
238
|
-
return update_theta_sketch_experimental(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
// experimental compact theta sketch
|
|
242
|
-
|
|
243
|
-
template<typename A>
|
|
244
|
-
compact_theta_sketch_experimental<A>::compact_theta_sketch_experimental(const Base& other, bool ordered):
|
|
245
|
-
is_empty_(other.is_empty()),
|
|
246
|
-
is_ordered_(other.is_ordered() || ordered),
|
|
247
|
-
seed_hash_(other.get_seed_hash()),
|
|
248
|
-
theta_(other.get_theta64()),
|
|
249
|
-
entries_(other.get_allocator())
|
|
250
|
-
{
|
|
251
|
-
entries_.reserve(other.get_num_retained());
|
|
252
|
-
std::copy(other.begin(), other.end(), std::back_inserter(entries_));
|
|
253
|
-
if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
template<typename A>
|
|
257
|
-
compact_theta_sketch_experimental<A>::compact_theta_sketch_experimental(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
|
|
258
|
-
std::vector<uint64_t, A>&& entries):
|
|
259
|
-
is_empty_(is_empty),
|
|
260
|
-
is_ordered_(is_ordered),
|
|
261
|
-
seed_hash_(seed_hash),
|
|
262
|
-
theta_(theta),
|
|
263
|
-
entries_(std::move(entries))
|
|
264
|
-
{}
|
|
265
|
-
|
|
266
|
-
template<typename A>
|
|
267
|
-
A compact_theta_sketch_experimental<A>::get_allocator() const {
|
|
268
|
-
return entries_.get_allocator();
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
template<typename A>
|
|
272
|
-
bool compact_theta_sketch_experimental<A>::is_empty() const {
|
|
273
|
-
return is_empty_;
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
template<typename A>
|
|
277
|
-
bool compact_theta_sketch_experimental<A>::is_ordered() const {
|
|
278
|
-
return is_ordered_;
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
template<typename A>
|
|
282
|
-
uint64_t compact_theta_sketch_experimental<A>::get_theta64() const {
|
|
283
|
-
return theta_;
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
template<typename A>
|
|
287
|
-
uint32_t compact_theta_sketch_experimental<A>::get_num_retained() const {
|
|
288
|
-
return entries_.size();
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
template<typename A>
|
|
292
|
-
uint16_t compact_theta_sketch_experimental<A>::get_seed_hash() const {
|
|
293
|
-
return seed_hash_;
|
|
294
|
-
}
|
|
295
|
-
|
|
296
|
-
template<typename A>
|
|
297
|
-
auto compact_theta_sketch_experimental<A>::begin() -> iterator {
|
|
298
|
-
return iterator(entries_.data(), entries_.size(), 0);
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
template<typename A>
|
|
302
|
-
auto compact_theta_sketch_experimental<A>::end() -> iterator {
|
|
303
|
-
return iterator(nullptr, 0, entries_.size());
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
template<typename A>
|
|
307
|
-
auto compact_theta_sketch_experimental<A>::begin() const -> const_iterator {
|
|
308
|
-
return const_iterator(entries_.data(), entries_.size(), 0);
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
template<typename A>
|
|
312
|
-
auto compact_theta_sketch_experimental<A>::end() const -> const_iterator {
|
|
313
|
-
return const_iterator(nullptr, 0, entries_.size());
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
template<typename A>
|
|
317
|
-
void compact_theta_sketch_experimental<A>::print_specifics(std::ostringstream&) const {}
|
|
318
|
-
|
|
319
|
-
template<typename A>
|
|
320
|
-
void compact_theta_sketch_experimental<A>::serialize(std::ostream& os) const {
|
|
321
|
-
const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
|
|
322
|
-
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
323
|
-
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
|
|
324
|
-
const uint8_t serial_version = SERIAL_VERSION;
|
|
325
|
-
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
|
326
|
-
const uint8_t type = SKETCH_TYPE;
|
|
327
|
-
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
|
|
328
|
-
const uint16_t unused16 = 0;
|
|
329
|
-
os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
|
|
330
|
-
const uint8_t flags_byte(
|
|
331
|
-
(1 << flags::IS_COMPACT) |
|
|
332
|
-
(1 << flags::IS_READ_ONLY) |
|
|
333
|
-
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
334
|
-
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
335
|
-
);
|
|
336
|
-
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
|
337
|
-
const uint16_t seed_hash = get_seed_hash();
|
|
338
|
-
os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
|
|
339
|
-
if (!this->is_empty()) {
|
|
340
|
-
if (!is_single_item) {
|
|
341
|
-
const uint32_t num_entries = entries_.size();
|
|
342
|
-
os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
|
|
343
|
-
const uint32_t unused32 = 0;
|
|
344
|
-
os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
|
|
345
|
-
if (this->is_estimation_mode()) {
|
|
346
|
-
os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
|
|
347
|
-
}
|
|
348
|
-
}
|
|
349
|
-
os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
|
|
350
|
-
}
|
|
351
|
-
}
|
|
352
|
-
|
|
353
|
-
template<typename A>
|
|
354
|
-
auto compact_theta_sketch_experimental<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
355
|
-
const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
|
|
356
|
-
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
357
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
|
358
|
-
+ sizeof(uint64_t) * entries_.size();
|
|
359
|
-
vector_bytes bytes(size, 0, entries_.get_allocator());
|
|
360
|
-
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
361
|
-
|
|
362
|
-
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
|
|
363
|
-
const uint8_t serial_version = SERIAL_VERSION;
|
|
364
|
-
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
365
|
-
const uint8_t type = SKETCH_TYPE;
|
|
366
|
-
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
367
|
-
const uint16_t unused16 = 0;
|
|
368
|
-
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
|
|
369
|
-
const uint8_t flags_byte(
|
|
370
|
-
(1 << flags::IS_COMPACT) |
|
|
371
|
-
(1 << flags::IS_READ_ONLY) |
|
|
372
|
-
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
373
|
-
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
374
|
-
);
|
|
375
|
-
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
376
|
-
const uint16_t seed_hash = get_seed_hash();
|
|
377
|
-
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
378
|
-
if (!this->is_empty()) {
|
|
379
|
-
if (!is_single_item) {
|
|
380
|
-
const uint32_t num_entries = entries_.size();
|
|
381
|
-
ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
|
|
382
|
-
const uint32_t unused32 = 0;
|
|
383
|
-
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
384
|
-
if (this->is_estimation_mode()) {
|
|
385
|
-
ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
|
|
386
|
-
}
|
|
387
|
-
}
|
|
388
|
-
ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
|
|
389
|
-
}
|
|
390
|
-
return bytes;
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
template<typename A>
|
|
394
|
-
compact_theta_sketch_experimental<A> compact_theta_sketch_experimental<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
395
|
-
uint8_t preamble_longs;
|
|
396
|
-
is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
|
|
397
|
-
uint8_t serial_version;
|
|
398
|
-
is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
|
|
399
|
-
uint8_t type;
|
|
400
|
-
is.read(reinterpret_cast<char*>(&type), sizeof(type));
|
|
401
|
-
uint16_t unused16;
|
|
402
|
-
is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
|
|
403
|
-
uint8_t flags_byte;
|
|
404
|
-
is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
|
|
405
|
-
uint16_t seed_hash;
|
|
406
|
-
is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
|
|
407
|
-
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
408
|
-
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
409
|
-
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
410
|
-
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
411
|
-
|
|
412
|
-
uint64_t theta = theta_constants::MAX_THETA;
|
|
413
|
-
uint32_t num_entries = 0;
|
|
414
|
-
if (!is_empty) {
|
|
415
|
-
if (preamble_longs == 1) {
|
|
416
|
-
num_entries = 1;
|
|
417
|
-
} else {
|
|
418
|
-
is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
|
|
419
|
-
uint32_t unused32;
|
|
420
|
-
is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
|
|
421
|
-
if (preamble_longs > 2) {
|
|
422
|
-
is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
}
|
|
426
|
-
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
427
|
-
if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
|
|
428
|
-
|
|
429
|
-
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
430
|
-
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
431
|
-
return compact_theta_sketch_experimental(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
template<typename A>
|
|
435
|
-
compact_theta_sketch_experimental<A> compact_theta_sketch_experimental<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
|
436
|
-
ensure_minimum_memory(size, 8);
|
|
437
|
-
const char* ptr = static_cast<const char*>(bytes);
|
|
438
|
-
const char* base = ptr;
|
|
439
|
-
uint8_t preamble_longs;
|
|
440
|
-
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
441
|
-
uint8_t serial_version;
|
|
442
|
-
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
443
|
-
uint8_t type;
|
|
444
|
-
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
445
|
-
uint16_t unused16;
|
|
446
|
-
ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
|
|
447
|
-
uint8_t flags_byte;
|
|
448
|
-
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
449
|
-
uint16_t seed_hash;
|
|
450
|
-
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
451
|
-
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
452
|
-
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
453
|
-
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
454
|
-
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
455
|
-
|
|
456
|
-
uint64_t theta = theta_constants::MAX_THETA;
|
|
457
|
-
uint32_t num_entries = 0;
|
|
458
|
-
if (!is_empty) {
|
|
459
|
-
if (preamble_longs == 1) {
|
|
460
|
-
num_entries = 1;
|
|
461
|
-
} else {
|
|
462
|
-
ensure_minimum_memory(size, 8); // read the first prelong before this method
|
|
463
|
-
ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
|
|
464
|
-
uint32_t unused32;
|
|
465
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
466
|
-
if (preamble_longs > 2) {
|
|
467
|
-
ensure_minimum_memory(size, (preamble_longs - 1) << 3);
|
|
468
|
-
ptr += copy_from_mem(ptr, &theta, sizeof(theta));
|
|
469
|
-
}
|
|
470
|
-
}
|
|
471
|
-
}
|
|
472
|
-
const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
|
|
473
|
-
check_memory_size(ptr - base + entries_size_bytes, size);
|
|
474
|
-
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
475
|
-
if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
|
|
476
|
-
|
|
477
|
-
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
478
|
-
return compact_theta_sketch_experimental(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
479
|
-
}
|
|
480
|
-
|
|
481
|
-
} /* namespace datasketches */
|