datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -28,10 +28,10 @@
|
|
|
28
28
|
namespace datasketches {
|
|
29
29
|
|
|
30
30
|
template<typename A>
|
|
31
|
-
static
|
|
31
|
+
static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon);
|
|
32
32
|
|
|
33
33
|
template<typename A>
|
|
34
|
-
CouponHashSet<A>::CouponHashSet(
|
|
34
|
+
CouponHashSet<A>::CouponHashSet(uint8_t lgConfigK, target_hll_type tgtHllType, const A& allocator)
|
|
35
35
|
: CouponList<A>(lgConfigK, tgtHllType, hll_mode::SET, allocator)
|
|
36
36
|
{
|
|
37
37
|
if (lgConfigK <= 7) {
|
|
@@ -56,45 +56,45 @@ std::function<void(HllSketchImpl<A>*)> CouponHashSet<A>::get_deleter() const {
|
|
|
56
56
|
|
|
57
57
|
template<typename A>
|
|
58
58
|
CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const A& allocator) {
|
|
59
|
-
if (len <
|
|
59
|
+
if (len < hll_constants::HASH_SET_INT_ARR_START) { // hard-coded
|
|
60
60
|
throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
const uint8_t* data = static_cast<const uint8_t*>(bytes);
|
|
64
|
-
if (data[
|
|
64
|
+
if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
|
|
65
65
|
throw std::invalid_argument("Incorrect number of preInts in input stream");
|
|
66
66
|
}
|
|
67
|
-
if (data[
|
|
67
|
+
if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
|
|
68
68
|
throw std::invalid_argument("Wrong ser ver in input stream");
|
|
69
69
|
}
|
|
70
|
-
if (data[
|
|
70
|
+
if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
|
|
71
71
|
throw std::invalid_argument("Input stream is not an HLL sketch");
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
-
const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[
|
|
74
|
+
const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
|
|
75
75
|
if (mode != SET) {
|
|
76
76
|
throw std::invalid_argument("Calling set constructor with non-set mode data");
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
-
const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[
|
|
79
|
+
const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
|
|
80
80
|
|
|
81
|
-
const
|
|
81
|
+
const uint8_t lgK = data[hll_constants::LG_K_BYTE];
|
|
82
82
|
if (lgK <= 7) {
|
|
83
83
|
throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
|
|
84
84
|
+ std::to_string(lgK));
|
|
85
85
|
}
|
|
86
|
-
|
|
87
|
-
const bool compactFlag = ((data[
|
|
86
|
+
uint8_t lgArrInts = data[hll_constants::LG_ARR_BYTE];
|
|
87
|
+
const bool compactFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
|
|
88
88
|
|
|
89
|
-
|
|
90
|
-
std::memcpy(&couponCount, data +
|
|
91
|
-
if (lgArrInts <
|
|
92
|
-
lgArrInts = HllUtil
|
|
89
|
+
uint32_t couponCount;
|
|
90
|
+
std::memcpy(&couponCount, data + hll_constants::HASH_SET_COUNT_INT, sizeof(couponCount));
|
|
91
|
+
if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
|
|
92
|
+
lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
|
|
93
93
|
}
|
|
94
94
|
// Don't set couponCount in sketch here;
|
|
95
95
|
// we'll set later if updatable, and increment with updates if compact
|
|
96
|
-
const
|
|
97
|
-
const size_t expectedLength =
|
|
96
|
+
const uint32_t couponsInArray = (compactFlag ? couponCount : (1 << lgArrInts));
|
|
97
|
+
const size_t expectedLength = hll_constants::HASH_SET_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
|
|
98
98
|
if (len < expectedLength) {
|
|
99
99
|
throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
|
|
100
100
|
+ ", found: " + std::to_string(len));
|
|
@@ -104,19 +104,19 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const
|
|
|
104
104
|
CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
|
|
105
105
|
|
|
106
106
|
if (compactFlag) {
|
|
107
|
-
const uint8_t* curPos = data +
|
|
108
|
-
|
|
109
|
-
for (
|
|
107
|
+
const uint8_t* curPos = data + hll_constants::HASH_SET_INT_ARR_START;
|
|
108
|
+
uint32_t coupon;
|
|
109
|
+
for (uint32_t i = 0; i < couponCount; ++i, curPos += sizeof(coupon)) {
|
|
110
110
|
std::memcpy(&coupon, curPos, sizeof(coupon));
|
|
111
111
|
sketch->couponUpdate(coupon);
|
|
112
112
|
}
|
|
113
113
|
} else {
|
|
114
|
-
sketch->
|
|
115
|
-
sketch->
|
|
114
|
+
sketch->coupons_.resize(1ULL << lgArrInts);
|
|
115
|
+
sketch->couponCount_ = couponCount;
|
|
116
116
|
// only need to read valid coupons, unlike in stream case
|
|
117
|
-
std::memcpy(sketch->
|
|
118
|
-
data +
|
|
119
|
-
couponCount * sizeof(
|
|
117
|
+
std::memcpy(sketch->coupons_.data(),
|
|
118
|
+
data + hll_constants::HASH_SET_INT_ARR_START,
|
|
119
|
+
couponCount * sizeof(uint32_t));
|
|
120
120
|
}
|
|
121
121
|
|
|
122
122
|
return sketch;
|
|
@@ -125,37 +125,36 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const
|
|
|
125
125
|
template<typename A>
|
|
126
126
|
CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator) {
|
|
127
127
|
uint8_t listHeader[8];
|
|
128
|
-
|
|
128
|
+
read(is, listHeader, 8 * sizeof(uint8_t));
|
|
129
129
|
|
|
130
|
-
if (listHeader[
|
|
130
|
+
if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
|
|
131
131
|
throw std::invalid_argument("Incorrect number of preInts in input stream");
|
|
132
132
|
}
|
|
133
|
-
if (listHeader[
|
|
133
|
+
if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
|
|
134
134
|
throw std::invalid_argument("Wrong ser ver in input stream");
|
|
135
135
|
}
|
|
136
|
-
if (listHeader[
|
|
136
|
+
if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
|
|
137
137
|
throw std::invalid_argument("Input stream is not an HLL sketch");
|
|
138
138
|
}
|
|
139
139
|
|
|
140
|
-
hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[
|
|
140
|
+
hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
|
|
141
141
|
if (mode != SET) {
|
|
142
142
|
throw std::invalid_argument("Calling set constructor with non-set mode data");
|
|
143
143
|
}
|
|
144
144
|
|
|
145
|
-
target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[
|
|
145
|
+
const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
|
|
146
146
|
|
|
147
|
-
const
|
|
147
|
+
const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
|
|
148
148
|
if (lgK <= 7) {
|
|
149
149
|
throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
|
|
150
150
|
+ std::to_string(lgK));
|
|
151
151
|
}
|
|
152
|
-
|
|
153
|
-
const bool compactFlag = ((listHeader[
|
|
152
|
+
uint8_t lgArrInts = listHeader[hll_constants::LG_ARR_BYTE];
|
|
153
|
+
const bool compactFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
|
|
154
154
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
lgArrInts = HllUtil<A>::computeLgArrInts(SET, couponCount, lgK);
|
|
155
|
+
const auto couponCount = read<uint32_t>(is);
|
|
156
|
+
if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
|
|
157
|
+
lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
|
|
159
158
|
}
|
|
160
159
|
|
|
161
160
|
ChsAlloc chsa(allocator);
|
|
@@ -166,16 +165,15 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator)
|
|
|
166
165
|
// Don't set couponCount here;
|
|
167
166
|
// we'll set later if updatable, and increment with updates if compact
|
|
168
167
|
if (compactFlag) {
|
|
169
|
-
for (
|
|
170
|
-
|
|
171
|
-
is.read((char*)&coupon, sizeof(coupon));
|
|
168
|
+
for (uint32_t i = 0; i < couponCount; ++i) {
|
|
169
|
+
const auto coupon = read<uint32_t>(is);
|
|
172
170
|
sketch->couponUpdate(coupon);
|
|
173
171
|
}
|
|
174
172
|
} else {
|
|
175
|
-
sketch->
|
|
176
|
-
sketch->
|
|
173
|
+
sketch->coupons_.resize(1ULL << lgArrInts);
|
|
174
|
+
sketch->couponCount_ = couponCount;
|
|
177
175
|
// for stream processing, read entire list so read pointer ends up set correctly
|
|
178
|
-
|
|
176
|
+
read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t));
|
|
179
177
|
}
|
|
180
178
|
|
|
181
179
|
if (!is.good())
|
|
@@ -186,25 +184,25 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator)
|
|
|
186
184
|
|
|
187
185
|
template<typename A>
|
|
188
186
|
CouponHashSet<A>* CouponHashSet<A>::copy() const {
|
|
189
|
-
ChsAlloc chsa(this->
|
|
187
|
+
ChsAlloc chsa(this->coupons_.get_allocator());
|
|
190
188
|
return new (chsa.allocate(1)) CouponHashSet<A>(*this);
|
|
191
189
|
}
|
|
192
190
|
|
|
193
191
|
template<typename A>
|
|
194
|
-
CouponHashSet<A>* CouponHashSet<A>::copyAs(
|
|
195
|
-
ChsAlloc chsa(this->
|
|
192
|
+
CouponHashSet<A>* CouponHashSet<A>::copyAs(target_hll_type tgtHllType) const {
|
|
193
|
+
ChsAlloc chsa(this->coupons_.get_allocator());
|
|
196
194
|
return new (chsa.allocate(1)) CouponHashSet<A>(*this, tgtHllType);
|
|
197
195
|
}
|
|
198
196
|
|
|
199
197
|
template<typename A>
|
|
200
|
-
HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(
|
|
201
|
-
const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(this->
|
|
202
|
-
const
|
|
198
|
+
HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(uint32_t coupon) {
|
|
199
|
+
const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
|
|
200
|
+
const int32_t index = find<A>(this->coupons_.data(), lgCouponArrInts, coupon);
|
|
203
201
|
if (index >= 0) {
|
|
204
202
|
return this; // found duplicate, ignore
|
|
205
203
|
}
|
|
206
|
-
this->
|
|
207
|
-
++this->
|
|
204
|
+
this->coupons_[~index] = coupon; // found empty
|
|
205
|
+
++this->couponCount_;
|
|
208
206
|
if (checkGrowOrPromote()) {
|
|
209
207
|
return this->promoteHeapListOrSetToHll(*this);
|
|
210
208
|
}
|
|
@@ -212,20 +210,20 @@ HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(int coupon) {
|
|
|
212
210
|
}
|
|
213
211
|
|
|
214
212
|
template<typename A>
|
|
215
|
-
|
|
216
|
-
return
|
|
213
|
+
uint32_t CouponHashSet<A>::getMemDataStart() const {
|
|
214
|
+
return hll_constants::HASH_SET_INT_ARR_START;
|
|
217
215
|
}
|
|
218
216
|
|
|
219
217
|
template<typename A>
|
|
220
|
-
|
|
221
|
-
return
|
|
218
|
+
uint8_t CouponHashSet<A>::getPreInts() const {
|
|
219
|
+
return hll_constants::HASH_SET_PREINTS;
|
|
222
220
|
}
|
|
223
221
|
|
|
224
222
|
template<typename A>
|
|
225
223
|
bool CouponHashSet<A>::checkGrowOrPromote() {
|
|
226
|
-
if (static_cast<size_t>(
|
|
227
|
-
const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(this->
|
|
228
|
-
if (lgCouponArrInts == (this->
|
|
224
|
+
if (static_cast<size_t>(hll_constants::RESIZE_DENOM * this->couponCount_) > (hll_constants::RESIZE_NUMER * this->coupons_.size())) {
|
|
225
|
+
const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
|
|
226
|
+
if (lgCouponArrInts == (this->lgConfigK_ - 3)) { // at max size
|
|
229
227
|
return true; // promote to HLL
|
|
230
228
|
}
|
|
231
229
|
growHashSet(lgCouponArrInts + 1);
|
|
@@ -234,15 +232,15 @@ bool CouponHashSet<A>::checkGrowOrPromote() {
|
|
|
234
232
|
}
|
|
235
233
|
|
|
236
234
|
template<typename A>
|
|
237
|
-
void CouponHashSet<A>::growHashSet(
|
|
238
|
-
const
|
|
239
|
-
vector_int coupons_new(tgtLen, 0, this->
|
|
240
|
-
|
|
241
|
-
const
|
|
242
|
-
for (
|
|
243
|
-
const
|
|
244
|
-
if (fetched !=
|
|
245
|
-
const
|
|
235
|
+
void CouponHashSet<A>::growHashSet(uint8_t tgtLgCoupArrSize) {
|
|
236
|
+
const uint32_t tgtLen = 1 << tgtLgCoupArrSize;
|
|
237
|
+
vector_int coupons_new(tgtLen, 0, this->coupons_.get_allocator());
|
|
238
|
+
|
|
239
|
+
const uint32_t srcLen = static_cast<uint32_t>(this->coupons_.size());
|
|
240
|
+
for (uint32_t i = 0; i < srcLen; ++i) { // scan existing array for non-zero values
|
|
241
|
+
const uint32_t fetched = this->coupons_[i];
|
|
242
|
+
if (fetched != hll_constants::EMPTY) {
|
|
243
|
+
const int32_t idx = find<A>(coupons_new.data(), tgtLgCoupArrSize, fetched); // search TGT array
|
|
246
244
|
if (idx < 0) { // found EMPTY
|
|
247
245
|
coupons_new[~idx] = fetched; // insert
|
|
248
246
|
continue;
|
|
@@ -250,23 +248,23 @@ void CouponHashSet<A>::growHashSet(int tgtLgCoupArrSize) {
|
|
|
250
248
|
throw std::runtime_error("Error: Found duplicate coupon");
|
|
251
249
|
}
|
|
252
250
|
}
|
|
253
|
-
this->
|
|
251
|
+
this->coupons_ = std::move(coupons_new);
|
|
254
252
|
}
|
|
255
253
|
|
|
256
254
|
template<typename A>
|
|
257
|
-
static
|
|
258
|
-
const
|
|
259
|
-
|
|
260
|
-
const
|
|
255
|
+
static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon) {
|
|
256
|
+
const uint32_t arrMask = (1 << lgArrInts) - 1;
|
|
257
|
+
uint32_t probe = coupon & arrMask;
|
|
258
|
+
const uint32_t loopIndex = probe;
|
|
261
259
|
do {
|
|
262
|
-
const
|
|
263
|
-
if (couponAtIdx ==
|
|
260
|
+
const uint32_t couponAtIdx = array[probe];
|
|
261
|
+
if (couponAtIdx == hll_constants::EMPTY) {
|
|
264
262
|
return ~probe; //empty
|
|
265
263
|
}
|
|
266
264
|
else if (coupon == couponAtIdx) {
|
|
267
265
|
return probe; //duplicate
|
|
268
266
|
}
|
|
269
|
-
const
|
|
267
|
+
const uint32_t stride = ((coupon & hll_constants::KEY_MASK_26) >> lgArrInts) | 1;
|
|
270
268
|
probe = (probe + stride) & arrMask;
|
|
271
269
|
} while (probe != loopIndex);
|
|
272
270
|
throw std::invalid_argument("Key not found and no empty slots!");
|
|
@@ -29,29 +29,29 @@ class CouponHashSet : public CouponList<A> {
|
|
|
29
29
|
public:
|
|
30
30
|
static CouponHashSet* newSet(const void* bytes, size_t len, const A& allocator);
|
|
31
31
|
static CouponHashSet* newSet(std::istream& is, const A& allocator);
|
|
32
|
-
CouponHashSet(
|
|
32
|
+
CouponHashSet(uint8_t lgConfigK, target_hll_type tgtHllType, const A& allocator);
|
|
33
33
|
CouponHashSet(const CouponHashSet& that, target_hll_type tgtHllType);
|
|
34
34
|
|
|
35
35
|
virtual ~CouponHashSet() = default;
|
|
36
36
|
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
|
|
37
37
|
|
|
38
38
|
protected:
|
|
39
|
-
using vector_int = std::vector<
|
|
39
|
+
using vector_int = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
|
40
40
|
|
|
41
41
|
virtual CouponHashSet* copy() const;
|
|
42
42
|
virtual CouponHashSet* copyAs(target_hll_type tgtHllType) const;
|
|
43
43
|
|
|
44
|
-
virtual HllSketchImpl<A>* couponUpdate(
|
|
44
|
+
virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon);
|
|
45
45
|
|
|
46
|
-
virtual
|
|
47
|
-
virtual
|
|
46
|
+
virtual uint32_t getMemDataStart() const;
|
|
47
|
+
virtual uint8_t getPreInts() const;
|
|
48
48
|
|
|
49
49
|
friend class HllSketchImplFactory<A>;
|
|
50
50
|
|
|
51
51
|
private:
|
|
52
52
|
using ChsAlloc = typename std::allocator_traits<A>::template rebind_alloc<CouponHashSet<A>>;
|
|
53
53
|
bool checkGrowOrPromote();
|
|
54
|
-
void growHashSet(
|
|
54
|
+
void growHashSet(uint8_t tgtLgCoupArrSize);
|
|
55
55
|
};
|
|
56
56
|
|
|
57
57
|
}
|