datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
static hll_sketch buildSketch(const int n, const
|
|
29
|
+
static hll_sketch buildSketch(const int n, const uint8_t lgK, const target_hll_type tgtHllType) {
|
|
30
30
|
hll_sketch sketch(lgK, tgtHllType);
|
|
31
31
|
for (int i = 0; i < n; ++i) {
|
|
32
32
|
sketch.update(i);
|
|
@@ -34,7 +34,7 @@ static hll_sketch buildSketch(const int n, const int lgK, const target_hll_type
|
|
|
34
34
|
return sketch;
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
-
static void crossCountingCheck(const
|
|
37
|
+
static void crossCountingCheck(const uint8_t lgK, const int n) {
|
|
38
38
|
hll_sketch sk4 = buildSketch(n, lgK, HLL_4);
|
|
39
39
|
const double est = sk4.get_estimate();
|
|
40
40
|
const double lb = sk4.get_lower_bound(1);
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
-
static void testComposite(
|
|
28
|
+
static void testComposite(uint8_t lgK, const target_hll_type tgtHllType, const int n) {
|
|
29
29
|
hll_union u(lgK);
|
|
30
30
|
hll_sketch sk(lgK, tgtHllType);
|
|
31
31
|
for (int i = 0; i < n; ++i) {
|
|
@@ -45,7 +45,7 @@ TEST_CASE("hll array: check composite estimate", "[hll_array]") {
|
|
|
45
45
|
testComposite(13, target_hll_type::HLL_8, 10000);
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
static void serializeDeserialize(
|
|
48
|
+
static void serializeDeserialize(uint8_t lgK, target_hll_type tgtHllType, const int n) {
|
|
49
49
|
hll_sketch sk1(lgK, tgtHllType);
|
|
50
50
|
|
|
51
51
|
for (int i = 0; i < n; ++i) {
|
|
@@ -72,7 +72,7 @@ static void serializeDeserialize(const int lgK, target_hll_type tgtHllType, cons
|
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
TEST_CASE("hll array: check serialize deserialize", "[hll_array]") {
|
|
75
|
-
|
|
75
|
+
uint8_t lgK = 4;
|
|
76
76
|
int n = 8;
|
|
77
77
|
serializeDeserialize(lgK, HLL_4, n);
|
|
78
78
|
serializeDeserialize(lgK, HLL_6, n);
|
|
@@ -100,7 +100,7 @@ TEST_CASE("hll array: check is compact", "[hll_array]") {
|
|
|
100
100
|
}
|
|
101
101
|
|
|
102
102
|
TEST_CASE("hll array: check corrupt bytearray", "[hll_array]") {
|
|
103
|
-
|
|
103
|
+
uint8_t lgK = 8;
|
|
104
104
|
hll_sketch sk1(lgK, HLL_8);
|
|
105
105
|
for (int i = 0; i < 50; ++i) {
|
|
106
106
|
sk1.update(i);
|
|
@@ -109,36 +109,36 @@ TEST_CASE("hll array: check corrupt bytearray", "[hll_array]") {
|
|
|
109
109
|
uint8_t* bytes = sketchBytes.data();
|
|
110
110
|
const size_t size = sketchBytes.size();
|
|
111
111
|
|
|
112
|
-
bytes[
|
|
112
|
+
bytes[hll_constants::PREAMBLE_INTS_BYTE] = 0;
|
|
113
113
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, size), std::invalid_argument);
|
|
114
114
|
REQUIRE_THROWS_AS(HllArray<std::allocator<uint8_t>>::newHll(bytes, size, std::allocator<uint8_t>()), std::invalid_argument);
|
|
115
|
-
bytes[
|
|
115
|
+
bytes[hll_constants::PREAMBLE_INTS_BYTE] = hll_constants::HLL_PREINTS;
|
|
116
116
|
|
|
117
|
-
bytes[
|
|
117
|
+
bytes[hll_constants::SER_VER_BYTE] = 0;
|
|
118
118
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, size), std::invalid_argument);
|
|
119
|
-
bytes[
|
|
119
|
+
bytes[hll_constants::SER_VER_BYTE] = hll_constants::SER_VER;
|
|
120
120
|
|
|
121
|
-
bytes[
|
|
121
|
+
bytes[hll_constants::FAMILY_BYTE] = 0;
|
|
122
122
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, size), std::invalid_argument);
|
|
123
|
-
bytes[
|
|
123
|
+
bytes[hll_constants::FAMILY_BYTE] = hll_constants::FAMILY_ID;
|
|
124
124
|
|
|
125
|
-
uint8_t tmp = bytes[
|
|
126
|
-
bytes[
|
|
125
|
+
uint8_t tmp = bytes[hll_constants::MODE_BYTE];
|
|
126
|
+
bytes[hll_constants::MODE_BYTE] = 0x10; // HLL_6, LIST
|
|
127
127
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, size), std::invalid_argument);
|
|
128
|
-
bytes[
|
|
128
|
+
bytes[hll_constants::MODE_BYTE] = tmp;
|
|
129
129
|
|
|
130
|
-
tmp = bytes[
|
|
131
|
-
bytes[
|
|
130
|
+
tmp = bytes[hll_constants::LG_ARR_BYTE];
|
|
131
|
+
bytes[hll_constants::LG_ARR_BYTE] = 0;
|
|
132
132
|
hll_sketch::deserialize(bytes, size);
|
|
133
133
|
// should work fine despite the corruption
|
|
134
|
-
bytes[
|
|
134
|
+
bytes[hll_constants::LG_ARR_BYTE] = tmp;
|
|
135
135
|
|
|
136
136
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, size - 1), std::out_of_range);
|
|
137
137
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(bytes, 3), std::out_of_range);
|
|
138
138
|
}
|
|
139
139
|
|
|
140
140
|
TEST_CASE("hll array: check corrupt stream", "[hll_array]") {
|
|
141
|
-
|
|
141
|
+
uint8_t lgK = 6;
|
|
142
142
|
hll_sketch sk1(lgK);
|
|
143
143
|
for (int i = 0; i < 50; ++i) {
|
|
144
144
|
sk1.update(i);
|
|
@@ -146,46 +146,46 @@ TEST_CASE("hll array: check corrupt stream", "[hll_array]") {
|
|
|
146
146
|
std::stringstream ss;
|
|
147
147
|
sk1.serialize_compact(ss);
|
|
148
148
|
|
|
149
|
-
ss.seekp(
|
|
149
|
+
ss.seekp(hll_constants::PREAMBLE_INTS_BYTE);
|
|
150
150
|
ss.put(0);
|
|
151
151
|
ss.seekg(0);
|
|
152
152
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
|
|
153
153
|
REQUIRE_THROWS_AS(HllArray<std::allocator<uint8_t>>::newHll(ss, std::allocator<uint8_t>()), std::invalid_argument);
|
|
154
|
-
ss.seekp(
|
|
155
|
-
ss.put(
|
|
154
|
+
ss.seekp(hll_constants::PREAMBLE_INTS_BYTE);
|
|
155
|
+
ss.put(hll_constants::HLL_PREINTS);
|
|
156
156
|
|
|
157
|
-
ss.seekp(
|
|
157
|
+
ss.seekp(hll_constants::SER_VER_BYTE);
|
|
158
158
|
ss.put(0);
|
|
159
159
|
ss.seekg(0);
|
|
160
160
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
|
|
161
|
-
ss.seekp(
|
|
162
|
-
ss.put(
|
|
161
|
+
ss.seekp(hll_constants::SER_VER_BYTE);
|
|
162
|
+
ss.put(hll_constants::SER_VER);
|
|
163
163
|
|
|
164
|
-
ss.seekp(
|
|
164
|
+
ss.seekp(hll_constants::FAMILY_BYTE);
|
|
165
165
|
ss.put(0);
|
|
166
166
|
ss.seekg(0);
|
|
167
167
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
|
|
168
|
-
ss.seekp(
|
|
169
|
-
ss.put(
|
|
168
|
+
ss.seekp(hll_constants::FAMILY_BYTE);
|
|
169
|
+
ss.put(hll_constants::FAMILY_ID);
|
|
170
170
|
|
|
171
|
-
ss.seekg(
|
|
172
|
-
|
|
173
|
-
ss.seekp(
|
|
171
|
+
ss.seekg(hll_constants::MODE_BYTE);
|
|
172
|
+
auto tmp = ss.get();
|
|
173
|
+
ss.seekp(hll_constants::MODE_BYTE);
|
|
174
174
|
ss.put(0x11); // HLL_6, SET
|
|
175
175
|
ss.seekg(0);
|
|
176
176
|
REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
|
|
177
|
-
ss.seekp(
|
|
178
|
-
ss.put(tmp);
|
|
177
|
+
ss.seekp(hll_constants::MODE_BYTE);
|
|
178
|
+
ss.put((char)tmp);
|
|
179
179
|
|
|
180
|
-
ss.seekg(
|
|
180
|
+
ss.seekg(hll_constants::LG_ARR_BYTE);
|
|
181
181
|
tmp = ss.get();
|
|
182
|
-
ss.seekp(
|
|
182
|
+
ss.seekp(hll_constants::LG_ARR_BYTE);
|
|
183
183
|
ss.put(0);
|
|
184
184
|
ss.seekg(0);
|
|
185
185
|
hll_sketch::deserialize(ss);
|
|
186
186
|
// should work fine despite the corruption
|
|
187
|
-
ss.seekp(
|
|
188
|
-
ss.put(tmp);
|
|
187
|
+
ss.seekp(hll_constants::LG_ARR_BYTE);
|
|
188
|
+
ss.put((char)tmp);
|
|
189
189
|
}
|
|
190
190
|
|
|
191
191
|
} /* namespace datasketches */
|
|
@@ -27,7 +27,7 @@ namespace datasketches {
|
|
|
27
27
|
using hll_sketch_test_alloc = hll_sketch_alloc<test_allocator<uint8_t>>;
|
|
28
28
|
using alloc = test_allocator<uint8_t>;
|
|
29
29
|
|
|
30
|
-
static void runCheckCopy(
|
|
30
|
+
static void runCheckCopy(uint8_t lgConfigK, target_hll_type tgtHllType) {
|
|
31
31
|
hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
|
|
32
32
|
|
|
33
33
|
for (int i = 0; i < 7; ++i) {
|
|
@@ -66,7 +66,7 @@ TEST_CASE("hll sketch: check copies", "[hll_sketch]") {
|
|
|
66
66
|
}
|
|
67
67
|
|
|
68
68
|
static void copyAs(target_hll_type srcType, target_hll_type dstType) {
|
|
69
|
-
|
|
69
|
+
uint8_t lgK = 8;
|
|
70
70
|
int n1 = 7;
|
|
71
71
|
int n2 = 24;
|
|
72
72
|
int n3 = 1000;
|
|
@@ -109,7 +109,7 @@ TEST_CASE("hll sketch: check copy as", "[hll_sketch]") {
|
|
|
109
109
|
TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
|
|
110
110
|
test_allocator_total_bytes = 0;
|
|
111
111
|
{
|
|
112
|
-
|
|
112
|
+
uint8_t lgConfigK = 8;
|
|
113
113
|
target_hll_type srcType = target_hll_type::HLL_8;
|
|
114
114
|
hll_sketch_test_alloc sk(lgConfigK, srcType, false, 0);
|
|
115
115
|
|
|
@@ -124,7 +124,7 @@ TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
|
|
|
124
124
|
sk.update(24); // HLL
|
|
125
125
|
REQUIRE(sk.get_updatable_serialization_bytes() == 40 + 256);
|
|
126
126
|
|
|
127
|
-
const
|
|
127
|
+
const auto hllBytes = hll_constants::HLL_BYTE_ARR_START + (1 << lgConfigK);
|
|
128
128
|
REQUIRE(sk.get_compact_serialization_bytes() == hllBytes);
|
|
129
129
|
REQUIRE(hll_sketch::get_max_updatable_serialization_bytes(lgConfigK, HLL_8) == hllBytes);
|
|
130
130
|
}
|
|
@@ -135,22 +135,22 @@ TEST_CASE("hll sketch: check num std dev", "[hll_sketch]") {
|
|
|
135
135
|
REQUIRE_THROWS_AS(HllUtil<>::checkNumStdDev(0), std::invalid_argument);
|
|
136
136
|
}
|
|
137
137
|
|
|
138
|
-
void checkSerializationSizes(
|
|
138
|
+
void checkSerializationSizes(uint8_t lgConfigK, target_hll_type tgtHllType) {
|
|
139
139
|
hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
|
|
140
140
|
int i;
|
|
141
141
|
|
|
142
142
|
// LIST
|
|
143
143
|
for (i = 0; i < 7; ++i) { sk.update(i); }
|
|
144
|
-
|
|
144
|
+
auto expected = hll_constants::LIST_INT_ARR_START + (i << 2);
|
|
145
145
|
REQUIRE(sk.get_compact_serialization_bytes() == expected);
|
|
146
|
-
expected =
|
|
146
|
+
expected = hll_constants::LIST_INT_ARR_START + (4 << hll_constants::LG_INIT_LIST_SIZE);
|
|
147
147
|
REQUIRE(sk.get_updatable_serialization_bytes() == expected);
|
|
148
148
|
|
|
149
149
|
// SET
|
|
150
150
|
for (i = 7; i < 24; ++i) { sk.update(i); }
|
|
151
|
-
expected =
|
|
151
|
+
expected = hll_constants::HASH_SET_INT_ARR_START + (i << 2);
|
|
152
152
|
REQUIRE(sk.get_compact_serialization_bytes() == expected);
|
|
153
|
-
expected =
|
|
153
|
+
expected = hll_constants::HASH_SET_INT_ARR_START + (4 << hll_constants::LG_INIT_SET_SIZE);
|
|
154
154
|
REQUIRE(sk.get_updatable_serialization_bytes() == expected);
|
|
155
155
|
}
|
|
156
156
|
|
|
@@ -178,7 +178,7 @@ TEST_CASE("hll sketch: exercise to string", "[hll_sketch]") {
|
|
|
178
178
|
|
|
179
179
|
// Creates and serializes then deserializes sketch.
|
|
180
180
|
// Returns true if deserialized sketch is compact.
|
|
181
|
-
static bool checkCompact(
|
|
181
|
+
static bool checkCompact(uint8_t lgK, const int n, const target_hll_type type, bool compact) {
|
|
182
182
|
hll_sketch_test_alloc sk(lgK, type, false, 0);
|
|
183
183
|
for (int i = 0; i < n; ++i) { sk.update(i); }
|
|
184
184
|
|
|
@@ -201,7 +201,7 @@ static bool checkCompact(const int lgK, const int n, const target_hll_type type,
|
|
|
201
201
|
TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
|
|
202
202
|
test_allocator_total_bytes = 0;
|
|
203
203
|
{
|
|
204
|
-
|
|
204
|
+
uint8_t lgK = 8;
|
|
205
205
|
// unless/until we create non-updatable "direct" versions,
|
|
206
206
|
// deserialized image should never be compact
|
|
207
207
|
// LIST: follows serialization request
|
|
@@ -230,10 +230,10 @@ TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
|
|
|
230
230
|
TEST_CASE("hll sketch: check k limits", "[hll_sketch]") {
|
|
231
231
|
test_allocator_total_bytes = 0;
|
|
232
232
|
{
|
|
233
|
-
hll_sketch_test_alloc sketch1(
|
|
234
|
-
hll_sketch_test_alloc sketch2(
|
|
235
|
-
REQUIRE_THROWS_AS(hll_sketch_test_alloc(
|
|
236
|
-
REQUIRE_THROWS_AS(hll_sketch_test_alloc(
|
|
233
|
+
hll_sketch_test_alloc sketch1(hll_constants::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
|
|
234
|
+
hll_sketch_test_alloc sketch2(hll_constants::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
|
|
235
|
+
REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
|
|
236
|
+
REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
|
|
237
237
|
}
|
|
238
238
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
239
239
|
}
|
|
@@ -24,23 +24,19 @@
|
|
|
24
24
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
|
-
static int min(int a, int b) {
|
|
28
|
-
return (a < b) ? a : b;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
27
|
static void println(std::string& str) {
|
|
32
28
|
//std::cout << str << "\n";
|
|
33
29
|
}
|
|
34
30
|
|
|
35
31
|
static void basicUnion(uint64_t n1, uint64_t n2,
|
|
36
|
-
|
|
32
|
+
uint8_t lgk1, uint8_t lgk2, uint8_t lgMaxK,
|
|
37
33
|
target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
|
|
38
34
|
uint64_t v = 0;
|
|
39
35
|
//int tot = n1 + n2;
|
|
40
36
|
|
|
41
37
|
hll_sketch h1(lgk1, type1);
|
|
42
38
|
hll_sketch h2(lgk2, type2);
|
|
43
|
-
|
|
39
|
+
uint8_t lgControlK = std::min(std::min(lgk1, lgk2), lgMaxK);
|
|
44
40
|
hll_sketch control(lgControlK, resultType);
|
|
45
41
|
|
|
46
42
|
for (uint64_t i = 0; i < n1; ++i) {
|
|
@@ -89,9 +85,9 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
|
|
|
89
85
|
target_hll_type type2 = HLL_8;
|
|
90
86
|
target_hll_type resultType = HLL_8;
|
|
91
87
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
88
|
+
uint8_t lgK1 = 7;
|
|
89
|
+
uint8_t lgK2 = 7;
|
|
90
|
+
uint8_t lgMaxK = 7;
|
|
95
91
|
uint64_t n1 = 7;
|
|
96
92
|
uint64_t n2 = 7;
|
|
97
93
|
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
|
|
@@ -108,7 +104,7 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
|
|
|
108
104
|
n2 = 14;
|
|
109
105
|
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
|
|
110
106
|
|
|
111
|
-
|
|
107
|
+
uint8_t i = 0;
|
|
112
108
|
for (i = 7; i <= 13; ++i) {
|
|
113
109
|
lgK1 = i;
|
|
114
110
|
lgK2 = i;
|
|
@@ -184,9 +180,9 @@ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
|
|
|
184
180
|
}
|
|
185
181
|
|
|
186
182
|
TEST_CASE("hll union: check config k limits", "[hll_union]") {
|
|
187
|
-
REQUIRE_THROWS_AS(hll_union(
|
|
183
|
+
REQUIRE_THROWS_AS(hll_union(hll_constants::MIN_LOG_K - 1), std::invalid_argument);
|
|
188
184
|
|
|
189
|
-
REQUIRE_THROWS_AS(hll_union(
|
|
185
|
+
REQUIRE_THROWS_AS(hll_union(hll_constants::MAX_LOG_K + 1), std::invalid_argument);
|
|
190
186
|
}
|
|
191
187
|
|
|
192
188
|
static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
|
|
@@ -195,7 +191,7 @@ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est
|
|
|
195
191
|
}
|
|
196
192
|
|
|
197
193
|
TEST_CASE("hll union: check ub lb", "[hll_union]") {
|
|
198
|
-
|
|
194
|
+
uint8_t lgK = 4;
|
|
199
195
|
int n = 1 << 20;
|
|
200
196
|
bool oooFlag = false;
|
|
201
197
|
|
|
@@ -223,7 +219,7 @@ TEST_CASE("hll union: check ub lb", "[hll_union]") {
|
|
|
223
219
|
}
|
|
224
220
|
|
|
225
221
|
TEST_CASE("hll union: check conversions", "[hll_union]") {
|
|
226
|
-
|
|
222
|
+
uint8_t lgK = 4;
|
|
227
223
|
hll_sketch sk1(lgK, HLL_8);
|
|
228
224
|
hll_sketch sk2(lgK, HLL_8);
|
|
229
225
|
int n = 1 << 20;
|
|
@@ -57,7 +57,7 @@ static int get_n(int lg_k, hll_mode mode) {
|
|
|
57
57
|
|
|
58
58
|
static long v = 0;
|
|
59
59
|
|
|
60
|
-
static hll_sketch build_sketch(
|
|
60
|
+
static hll_sketch build_sketch(uint8_t lg_k, target_hll_type hll_type, hll_mode mode) {
|
|
61
61
|
hll_sketch sk(lg_k, hll_type);
|
|
62
62
|
int n = get_n(lg_k, mode);
|
|
63
63
|
for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
|
|
@@ -67,7 +67,7 @@ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode
|
|
|
67
67
|
|
|
68
68
|
// merges a sketch to an empty union and gets result of the same type, checks binary equivalence
|
|
69
69
|
static void union_one_update(bool compact) {
|
|
70
|
-
for (
|
|
70
|
+
for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
|
|
71
71
|
for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
|
|
72
72
|
if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
|
|
73
73
|
for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
|
|
@@ -102,7 +102,7 @@ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic
|
|
|
102
102
|
|
|
103
103
|
// converts a sketch to a different type and converts back to the original type to check binary equivalence
|
|
104
104
|
static void convert_back_and_forth(bool compact) {
|
|
105
|
-
for (
|
|
105
|
+
for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
|
|
106
106
|
for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
|
|
107
107
|
if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
|
|
108
108
|
for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
|
|
@@ -44,11 +44,11 @@ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
|
|
|
44
44
|
auto ser2 = sk.serialize_updatable();
|
|
45
45
|
|
|
46
46
|
REQUIRE(ser1.size() == ser2.size());
|
|
47
|
-
|
|
47
|
+
size_t len = ser1.size();
|
|
48
48
|
uint8_t* b1 = ser1.data();
|
|
49
49
|
uint8_t* b2 = ser2.data();
|
|
50
50
|
|
|
51
|
-
for (
|
|
51
|
+
for (size_t i = 0; i < len; ++i) {
|
|
52
52
|
REQUIRE(b2[i] == b1[i]);
|
|
53
53
|
}
|
|
54
54
|
}
|
|
@@ -129,7 +129,7 @@ static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
|
|
|
129
129
|
REQUIRE(sk1.get_target_type() == sk2.get_target_type());
|
|
130
130
|
}
|
|
131
131
|
|
|
132
|
-
static void toFrom(const
|
|
132
|
+
static void toFrom(const uint8_t lgConfigK, const target_hll_type tgtHllType, const int n) {
|
|
133
133
|
hll_sketch src(lgConfigK, tgtHllType);
|
|
134
134
|
for (int i = 0; i < n; ++i) {
|
|
135
135
|
src.update(i);
|
|
@@ -157,7 +157,7 @@ static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const
|
|
|
157
157
|
TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
|
|
158
158
|
for (int i = 0; i < 10; ++i) {
|
|
159
159
|
int n = nArr[i];
|
|
160
|
-
for (
|
|
160
|
+
for (uint8_t lgK = 4; lgK <= 13; ++lgK) {
|
|
161
161
|
toFrom(lgK, HLL_4, n);
|
|
162
162
|
toFrom(lgK, HLL_6, n);
|
|
163
163
|
toFrom(lgK, HLL_8, n);
|
|
@@ -26,7 +26,8 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
static std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
|
29
|
+
static std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
|
30
|
+
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
|
|
30
31
|
|
|
31
32
|
#ifdef KLL_VALIDATION
|
|
32
33
|
extern uint32_t kll_next_offset;
|
|
@@ -46,9 +47,9 @@ class kll_helper {
|
|
|
46
47
|
static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);
|
|
47
48
|
static inline uint8_t ub_on_num_levels(uint64_t n);
|
|
48
49
|
static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);
|
|
49
|
-
static inline
|
|
50
|
-
static inline
|
|
51
|
-
static inline
|
|
50
|
+
static inline uint16_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
|
|
51
|
+
static inline uint16_t int_cap_aux(uint16_t k, uint8_t depth);
|
|
52
|
+
static inline uint16_t int_cap_aux_aux(uint16_t k, uint8_t depth);
|
|
52
53
|
static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);
|
|
53
54
|
|
|
54
55
|
/*
|
|
@@ -55,28 +55,28 @@ uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_l
|
|
|
55
55
|
return total;
|
|
56
56
|
}
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
|
|
59
59
|
if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
|
|
60
60
|
const uint8_t depth = numLevels - height - 1;
|
|
61
|
-
return std::max(
|
|
61
|
+
return std::max<uint16_t>(min_wid, int_cap_aux(k, depth));
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
|
|
65
65
|
if (depth > 60) throw std::invalid_argument("depth > 60");
|
|
66
66
|
if (depth <= 30) return int_cap_aux_aux(k, depth);
|
|
67
67
|
const uint8_t half = depth / 2;
|
|
68
68
|
const uint8_t rest = depth - half;
|
|
69
|
-
const
|
|
69
|
+
const uint16_t tmp = int_cap_aux_aux(k, half);
|
|
70
70
|
return int_cap_aux_aux(tmp, rest);
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
|
|
74
74
|
if (depth > 30) throw std::invalid_argument("depth > 30");
|
|
75
75
|
const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
|
|
76
76
|
const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
|
|
77
77
|
const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
|
|
78
78
|
if (result > k) throw std::logic_error("result > k");
|
|
79
|
-
return result;
|
|
79
|
+
return static_cast<uint16_t>(result);
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
|