datasketches 0.2.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -70,12 +70,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
70
70
|
REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
|
71
71
|
REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
|
72
72
|
|
73
|
-
|
74
|
-
for (auto& it: sketch) {
|
73
|
+
for (auto it: sketch) {
|
75
74
|
(void) it; // to suppress "unused" warning
|
76
|
-
|
75
|
+
FAIL("should be no iterations over an empty sketch");
|
77
76
|
}
|
78
|
-
REQUIRE(count == 0);
|
79
77
|
}
|
80
78
|
|
81
79
|
SECTION("get bad quantile") {
|
@@ -86,13 +84,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
86
84
|
|
87
85
|
SECTION("one item") {
|
88
86
|
kll_float_sketch sketch(200, 0);
|
89
|
-
sketch.update(1);
|
87
|
+
sketch.update(1.0f);
|
90
88
|
REQUIRE_FALSE(sketch.is_empty());
|
91
89
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
92
90
|
REQUIRE(sketch.get_n() == 1);
|
93
91
|
REQUIRE(sketch.get_num_retained() == 1);
|
94
|
-
REQUIRE(sketch.get_rank(1) == 0.0);
|
95
|
-
REQUIRE(sketch.get_rank(2) == 1.0);
|
92
|
+
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
93
|
+
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
96
94
|
REQUIRE(sketch.get_min_value() == 1.0);
|
97
95
|
REQUIRE(sketch.get_max_value() == 1.0);
|
98
96
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
104
102
|
REQUIRE(quantiles[2] == 1.0);
|
105
103
|
|
106
104
|
int count = 0;
|
107
|
-
for (auto& it: sketch) {
|
105
|
+
for (auto it: sketch) {
|
108
106
|
REQUIRE(it.second == 1);
|
109
107
|
++count;
|
110
108
|
}
|
@@ -116,16 +114,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
116
114
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
117
115
|
REQUIRE(sketch.is_empty());
|
118
116
|
|
119
|
-
sketch.update(0
|
117
|
+
sketch.update(0);
|
120
118
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
121
119
|
REQUIRE(sketch.get_n() == 1);
|
122
120
|
}
|
123
121
|
|
124
122
|
SECTION("many items, exact mode") {
|
125
123
|
kll_float_sketch sketch(200, 0);
|
126
|
-
const uint32_t n
|
124
|
+
const uint32_t n = 200;
|
127
125
|
for (uint32_t i = 0; i < n; i++) {
|
128
|
-
sketch.update(i);
|
126
|
+
sketch.update(static_cast<float>(i));
|
129
127
|
REQUIRE(sketch.get_n() == i + 1);
|
130
128
|
}
|
131
129
|
REQUIRE_FALSE(sketch.is_empty());
|
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
145
143
|
|
146
144
|
for (uint32_t i = 0; i < n; i++) {
|
147
145
|
const double trueRank = (double) i / n;
|
148
|
-
REQUIRE(sketch.get_rank(i) == trueRank);
|
146
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
|
149
147
|
}
|
150
148
|
|
151
149
|
// the alternative method must produce the same result
|
@@ -158,16 +156,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
158
156
|
|
159
157
|
SECTION("10 items") {
|
160
158
|
kll_float_sketch sketch(200, 0);
|
161
|
-
sketch.update(1);
|
162
|
-
sketch.update(2);
|
163
|
-
sketch.update(3);
|
164
|
-
sketch.update(4);
|
165
|
-
sketch.update(5);
|
166
|
-
sketch.update(6);
|
167
|
-
sketch.update(7);
|
168
|
-
sketch.update(8);
|
169
|
-
sketch.update(9);
|
170
|
-
sketch.update(10);
|
159
|
+
sketch.update(1.0f);
|
160
|
+
sketch.update(2.0f);
|
161
|
+
sketch.update(3.0f);
|
162
|
+
sketch.update(4.0f);
|
163
|
+
sketch.update(5.0f);
|
164
|
+
sketch.update(6.0f);
|
165
|
+
sketch.update(7.0f);
|
166
|
+
sketch.update(8.0f);
|
167
|
+
sketch.update(9.0f);
|
168
|
+
sketch.update(10.0f);
|
171
169
|
REQUIRE(sketch.get_quantile(0) == 1.0);
|
172
170
|
REQUIRE(sketch.get_quantile(0.5) == 6.0);
|
173
171
|
REQUIRE(sketch.get_quantile(0.99) == 10.0);
|
@@ -176,7 +174,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
176
174
|
|
177
175
|
SECTION("100 items") {
|
178
176
|
kll_float_sketch sketch(200, 0);
|
179
|
-
for (int i = 0; i < 100; ++i) sketch.update(i);
|
177
|
+
for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
|
180
178
|
REQUIRE(sketch.get_quantile(0) == 0);
|
181
179
|
REQUIRE(sketch.get_quantile(0.01) == 1);
|
182
180
|
REQUIRE(sketch.get_quantile(0.5) == 50);
|
@@ -186,9 +184,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
186
184
|
|
187
185
|
SECTION("many items, estimation mode") {
|
188
186
|
kll_float_sketch sketch(200, 0);
|
189
|
-
const int n
|
187
|
+
const int n = 1000000;
|
190
188
|
for (int i = 0; i < n; i++) {
|
191
|
-
sketch.update(i);
|
189
|
+
sketch.update(static_cast<float>(i));
|
192
190
|
REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
|
193
191
|
}
|
194
192
|
REQUIRE_FALSE(sketch.is_empty());
|
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
201
199
|
// test rank
|
202
200
|
for (int i = 0; i < n; i++) {
|
203
201
|
const double trueRank = (double) i / n;
|
204
|
-
REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
|
202
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
|
205
203
|
}
|
206
204
|
|
207
205
|
// test quantiles at every 0.1 percentage point
|
@@ -224,6 +222,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
224
222
|
}
|
225
223
|
|
226
224
|
//std::cout << sketch.to_string();
|
225
|
+
|
226
|
+
uint32_t count = 0;
|
227
|
+
uint64_t total_weight = 0;
|
228
|
+
for (auto it: sketch) {
|
229
|
+
++count;
|
230
|
+
total_weight += it.second;
|
231
|
+
}
|
232
|
+
REQUIRE(count == sketch.get_num_retained());
|
233
|
+
REQUIRE(total_weight == sketch.get_n());
|
227
234
|
}
|
228
235
|
|
229
236
|
SECTION("consistency between get_rank adn get_PMF/CDF") {
|
@@ -231,8 +238,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
231
238
|
const int n = 1000;
|
232
239
|
float values[n];
|
233
240
|
for (int i = 0; i < n; i++) {
|
234
|
-
sketch.update(i);
|
235
|
-
values[i] = i;
|
241
|
+
sketch.update(static_cast<float>(i));
|
242
|
+
values[i] = static_cast<float>(i);
|
236
243
|
}
|
237
244
|
|
238
245
|
const auto ranks(sketch.get_CDF(values, n));
|
@@ -272,6 +279,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
272
279
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
273
280
|
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
274
281
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
282
|
+
REQUIRE(s.tellg() == s.tellp());
|
275
283
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
276
284
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
277
285
|
REQUIRE(sketch2.get_n() == sketch.get_n());
|
@@ -297,9 +305,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
297
305
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
298
306
|
}
|
299
307
|
|
300
|
-
SECTION("serialize deserialize one item") {
|
308
|
+
SECTION("stream serialize deserialize one item") {
|
301
309
|
kll_float_sketch sketch(200, 0);
|
302
|
-
sketch.update(1);
|
310
|
+
sketch.update(1.0f);
|
303
311
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
304
312
|
sketch.serialize(s);
|
305
313
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
@@ -317,6 +325,24 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
317
325
|
REQUIRE(sketch2.get_rank(2) == 1.0);
|
318
326
|
}
|
319
327
|
|
328
|
+
SECTION("bytes serialize deserialize one item") {
|
329
|
+
kll_float_sketch sketch(200, 0);
|
330
|
+
sketch.update(1.0f);
|
331
|
+
auto bytes = sketch.serialize();
|
332
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
333
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
334
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
335
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
336
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
337
|
+
REQUIRE(sketch2.get_n() == 1);
|
338
|
+
REQUIRE(sketch2.get_num_retained() == 1);
|
339
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
340
|
+
REQUIRE(sketch2.get_max_value() == 1.0);
|
341
|
+
REQUIRE(sketch2.get_quantile(0.5) == 1.0);
|
342
|
+
REQUIRE(sketch2.get_rank(1) == 0.0);
|
343
|
+
REQUIRE(sketch2.get_rank(2) == 1.0);
|
344
|
+
}
|
345
|
+
|
320
346
|
SECTION("deserialize one item v1") {
|
321
347
|
std::ifstream is;
|
322
348
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
@@ -330,10 +356,46 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
330
356
|
REQUIRE(sketch.get_max_value() == 1.0);
|
331
357
|
}
|
332
358
|
|
359
|
+
SECTION("stream serialize deserialize three items") {
|
360
|
+
kll_float_sketch sketch(200, 0);
|
361
|
+
sketch.update(1.0f);
|
362
|
+
sketch.update(2.0f);
|
363
|
+
sketch.update(3.0f);
|
364
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
365
|
+
sketch.serialize(s);
|
366
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
367
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
368
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
369
|
+
REQUIRE(s.tellg() == s.tellp());
|
370
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
371
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
372
|
+
REQUIRE(sketch2.get_n() == 3);
|
373
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
374
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
375
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
376
|
+
}
|
377
|
+
|
378
|
+
SECTION("bytes serialize deserialize three items") {
|
379
|
+
kll_float_sketch sketch(200, 0);
|
380
|
+
sketch.update(1.0f);
|
381
|
+
sketch.update(2.0f);
|
382
|
+
sketch.update(3.0f);
|
383
|
+
auto bytes = sketch.serialize();
|
384
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
385
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
386
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
387
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
388
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
389
|
+
REQUIRE(sketch2.get_n() == 3);
|
390
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
391
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
392
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
393
|
+
}
|
394
|
+
|
333
395
|
SECTION("stream serialize deserialize many floats") {
|
334
396
|
kll_float_sketch sketch(200, 0);
|
335
|
-
const int n
|
336
|
-
for (int i = 0; i < n; i++) sketch.update(i);
|
397
|
+
const int n = 1000;
|
398
|
+
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
337
399
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
338
400
|
sketch.serialize(s);
|
339
401
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
@@ -350,13 +412,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
350
412
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
351
413
|
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
|
352
414
|
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
|
353
|
-
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
|
415
|
+
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
|
354
416
|
}
|
355
417
|
|
356
418
|
SECTION("bytes serialize deserialize many floats") {
|
357
419
|
kll_float_sketch sketch(200, 0);
|
358
|
-
const int n
|
359
|
-
for (int i = 0; i < n; i++) sketch.update(i);
|
420
|
+
const int n = 1000;
|
421
|
+
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
360
422
|
auto bytes = sketch.serialize();
|
361
423
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
362
424
|
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
@@ -371,7 +433,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
371
433
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
372
434
|
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
|
373
435
|
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
|
374
|
-
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
|
436
|
+
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
|
375
437
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
|
376
438
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
|
377
439
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
@@ -379,7 +441,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
379
441
|
|
380
442
|
SECTION("bytes serialize deserialize many ints") {
|
381
443
|
kll_sketch<int> sketch;
|
382
|
-
const int n
|
444
|
+
const int n = 1000;
|
383
445
|
for (int i = 0; i < n; i++) sketch.update(i);
|
384
446
|
auto bytes = sketch.serialize();
|
385
447
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
@@ -439,8 +501,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
439
501
|
kll_float_sketch sketch2(200, 0);
|
440
502
|
const int n = 10000;
|
441
503
|
for (int i = 0; i < n; i++) {
|
442
|
-
sketch1.update(i);
|
443
|
-
sketch2.update((2 * n) - i - 1);
|
504
|
+
sketch1.update(static_cast<float>(i));
|
505
|
+
sketch2.update(static_cast<float>((2 * n) - i - 1));
|
444
506
|
}
|
445
507
|
|
446
508
|
REQUIRE(sketch1.get_min_value() == 0.0f);
|
@@ -462,8 +524,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
462
524
|
kll_float_sketch sketch2(128, 0);
|
463
525
|
const int n = 10000;
|
464
526
|
for (int i = 0; i < n; i++) {
|
465
|
-
sketch1.update(i);
|
466
|
-
sketch2.update((2 * n) - i - 1);
|
527
|
+
sketch1.update(static_cast<float>(i));
|
528
|
+
sketch2.update(static_cast<float>((2 * n) - i - 1));
|
467
529
|
}
|
468
530
|
|
469
531
|
REQUIRE(sketch1.get_min_value() == 0.0f);
|
@@ -495,7 +557,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
495
557
|
kll_float_sketch sketch2(128, 0);
|
496
558
|
const int n = 10000;
|
497
559
|
for (int i = 0; i < n; i++) {
|
498
|
-
sketch1.update(i);
|
560
|
+
sketch1.update(static_cast<float>(i));
|
499
561
|
}
|
500
562
|
|
501
563
|
// rank error should not be affected by a merge with an empty sketch with lower k
|
@@ -518,8 +580,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
518
580
|
SECTION("merge min value from other") {
|
519
581
|
kll_float_sketch sketch1(200, 0);
|
520
582
|
kll_float_sketch sketch2(200, 0);
|
521
|
-
sketch1.update(1);
|
522
|
-
sketch2.update(2);
|
583
|
+
sketch1.update(1.0f);
|
584
|
+
sketch2.update(2.0f);
|
523
585
|
sketch2.merge(sketch1);
|
524
586
|
REQUIRE(sketch2.get_min_value() == 1.0f);
|
525
587
|
REQUIRE(sketch2.get_max_value() == 2.0f);
|
@@ -527,7 +589,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
527
589
|
|
528
590
|
SECTION("merge min and max values from other") {
|
529
591
|
kll_float_sketch sketch1(200, 0);
|
530
|
-
for (int i = 0; i < 1000000; i++) sketch1.update(i);
|
592
|
+
for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
|
531
593
|
kll_float_sketch sketch2(200, 0);
|
532
594
|
sketch2.merge(sketch1);
|
533
595
|
REQUIRE(sketch2.get_min_value() == 0.0f);
|
@@ -540,7 +602,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
540
602
|
REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
|
541
603
|
REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
|
542
604
|
|
543
|
-
const int n
|
605
|
+
const int n = 1000;
|
544
606
|
for (int i = 0; i < n; i++) sketch.update(i);
|
545
607
|
|
546
608
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
@@ -679,6 +741,31 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
679
741
|
}
|
680
742
|
}
|
681
743
|
|
744
|
+
SECTION("max serialized size arithmetic type") {
|
745
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
|
746
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
|
747
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
|
748
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
|
749
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
|
750
|
+
}
|
751
|
+
|
752
|
+
SECTION("max serialized size non-arithmetic type") {
|
753
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
|
754
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
|
755
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
|
756
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
|
757
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
|
758
|
+
}
|
759
|
+
|
760
|
+
SECTION("issue #236") {
|
761
|
+
kll_sketch<int8_t> kll;
|
762
|
+
kll.update(1);
|
763
|
+
kll.update(2);
|
764
|
+
kll.update(3);
|
765
|
+
auto blob = kll.serialize();
|
766
|
+
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
767
|
+
}
|
768
|
+
|
682
769
|
// cleanup
|
683
770
|
if (test_allocator_total_bytes != 0) {
|
684
771
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -0,0 +1,111 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch.hpp>
|
21
|
+
|
22
|
+
#include <random>
|
23
|
+
|
24
|
+
#include <kll_sketch.hpp>
|
25
|
+
#include <kolmogorov_smirnov.hpp>
|
26
|
+
|
27
|
+
namespace datasketches {
|
28
|
+
|
29
|
+
TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
|
30
|
+
const uint16_t k = 200;
|
31
|
+
kll_sketch<double> sketch1(k);
|
32
|
+
kll_sketch<double> sketch2(k);
|
33
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == 0);
|
34
|
+
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
35
|
+
}
|
36
|
+
|
37
|
+
TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
|
38
|
+
const uint16_t k = 200;
|
39
|
+
kll_sketch<double> sketch1(k);
|
40
|
+
kll_sketch<double> sketch2(k);
|
41
|
+
std::default_random_engine rand;
|
42
|
+
std::normal_distribution<double> distr;
|
43
|
+
const int n = k * 3 - 1;
|
44
|
+
for (int i = 0; i < n; ++i) {
|
45
|
+
const double x = distr(rand);
|
46
|
+
sketch1.update(x);
|
47
|
+
sketch2.update(x);
|
48
|
+
}
|
49
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
|
50
|
+
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
51
|
+
}
|
52
|
+
|
53
|
+
TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
|
54
|
+
const uint16_t k = 200;
|
55
|
+
kll_sketch<double> sketch1(k);
|
56
|
+
kll_sketch<double> sketch2(k);
|
57
|
+
std::default_random_engine rand;
|
58
|
+
std::normal_distribution<double> distr;
|
59
|
+
const int n = k * 3 - 1;
|
60
|
+
for (int i = 0; i < n; ++i) {
|
61
|
+
const double x = distr(rand);
|
62
|
+
sketch1.update(x + 100.0);
|
63
|
+
sketch2.update(x);
|
64
|
+
}
|
65
|
+
const auto delta = kolmogorov_smirnov::delta(sketch1, sketch2);
|
66
|
+
REQUIRE(delta == Approx(1.0).margin(1e-6));
|
67
|
+
REQUIRE(delta <= 1);
|
68
|
+
REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
|
69
|
+
}
|
70
|
+
|
71
|
+
TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
|
72
|
+
const uint16_t k = 2000;
|
73
|
+
kll_sketch<double> sketch1(k);
|
74
|
+
kll_sketch<double> sketch2(k);
|
75
|
+
std::default_random_engine rand;
|
76
|
+
std::normal_distribution<double> distr;
|
77
|
+
const int n = k * 3 - 1;
|
78
|
+
for (int i = 0; i < n; ++i) {
|
79
|
+
const double x = distr(rand);
|
80
|
+
sketch1.update(x + 0.05);
|
81
|
+
sketch2.update(x);
|
82
|
+
}
|
83
|
+
const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
|
84
|
+
REQUIRE(delta == Approx(0.02).margin(0.01));
|
85
|
+
const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
|
86
|
+
//std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
|
87
|
+
REQUIRE_FALSE(delta > threshold);
|
88
|
+
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
|
89
|
+
}
|
90
|
+
|
91
|
+
TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
|
92
|
+
const uint16_t k = 8000;
|
93
|
+
kll_sketch<double> sketch1(k);
|
94
|
+
kll_sketch<double> sketch2(k);
|
95
|
+
std::default_random_engine rand;
|
96
|
+
std::normal_distribution<double> distr;
|
97
|
+
const int n = k * 3 - 1;
|
98
|
+
for (int i = 0; i < n; ++i) {
|
99
|
+
const double x = distr(rand);
|
100
|
+
sketch1.update(x + 0.05);
|
101
|
+
sketch2.update(x);
|
102
|
+
}
|
103
|
+
const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
|
104
|
+
REQUIRE(delta == Approx(0.02).margin(0.01));
|
105
|
+
const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
|
106
|
+
//std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
|
107
|
+
REQUIRE(delta > threshold);
|
108
|
+
REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
|
109
|
+
}
|
110
|
+
|
111
|
+
} /* namespace datasketches */
|
@@ -15,16 +15,20 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
18
|
+
find_package(Python3 COMPONENTS Interpreter Development)
|
19
|
+
|
20
|
+
# only Windows+MSVC seems to have trouble locating pybind11
|
19
21
|
if (MSVC)
|
20
|
-
|
21
|
-
|
22
|
-
|
22
|
+
execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}"
|
23
|
+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
24
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
25
|
+
OUTPUT_VARIABLE EXTRA_PACKAGE_PATH)
|
26
|
+
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH})
|
23
27
|
endif()
|
24
28
|
|
25
|
-
|
29
|
+
find_package(pybind11 CONFIG REQUIRED)
|
26
30
|
|
27
|
-
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL
|
31
|
+
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO)
|
28
32
|
|
29
33
|
target_link_libraries(python
|
30
34
|
PRIVATE
|
@@ -1,76 +1,57 @@
|
|
1
|
-
|
1
|
+
<img src="https://raw.githubusercontent.com/apache/datasketches-website/master/logos/svg/datasketches-HorizontalColor-TM.svg" width="75%" alt="Apache DataSketches Logo">
|
2
2
|
|
3
|
-
|
3
|
+
# The Apache DataSketches Library for Python
|
4
4
|
|
5
|
-
|
6
|
-
from a relase package, you must ensure that the pybind11 directory points to a local copy of pybind11.
|
5
|
+
This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library.
|
7
6
|
|
8
|
-
|
7
|
+
In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis.
|
9
8
|
|
10
|
-
If
|
11
|
-
```pip install git+https://github.com/apache/datasketches-cpp.git```
|
9
|
+
If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms, or sketches, that can produce results orders of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution.
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
### Building
|
16
|
-
|
17
|
-
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
|
18
|
-
```
|
19
|
-
git clone --recursive https://github.com/apache/datasketches-cpp.git
|
20
|
-
cd datasketches-cpp
|
21
|
-
python -m pip install --upgrade pip setuptools wheel numpy
|
22
|
-
python setup.py build
|
23
|
-
```
|
11
|
+
This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness).
|
24
12
|
|
25
|
-
|
13
|
+
## Building and Installation
|
26
14
|
|
27
|
-
|
15
|
+
Once cloned, the library can be installed by running `python -m pip install .` in the project root directory, which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
|
28
16
|
|
29
|
-
|
30
|
-
line of the build command with `python setup.py install`.
|
17
|
+
If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
31
18
|
|
32
|
-
|
33
|
-
|
34
|
-
The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
35
|
-
```
|
36
|
-
python -m pip install --upgrade pip setuptools wheel numpy tox
|
37
|
-
tox
|
38
|
-
```
|
19
|
+
The library is also available from PyPI via `python -m pip install datasketches`.
|
39
20
|
|
40
21
|
## Usage
|
41
22
|
|
42
|
-
Having installed the library, loading the Apache Datasketches
|
23
|
+
Having installed the library, loading the Apache DataSketches Library in Python is simple: `import datasketches`.
|
43
24
|
|
44
25
|
## Available Sketch Classes
|
45
26
|
|
46
27
|
- KLL (Absolute Error Quantiles)
|
47
|
-
|
48
|
-
|
28
|
+
- `kll_ints_sketch`
|
29
|
+
- `kll_floats_sketch`
|
49
30
|
- REQ (Relative Error Quantiles)
|
50
|
-
|
51
|
-
|
31
|
+
- `req_ints_sketch`
|
32
|
+
- `req_floats_sketch`
|
52
33
|
- Frequent Items
|
53
|
-
|
54
|
-
|
34
|
+
- `frequent_strings_sketch`
|
35
|
+
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
55
36
|
- Theta
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
37
|
+
- `update_theta_sketch`
|
38
|
+
- `compact_theta_sketch` (cannot be instantiated directly)
|
39
|
+
- `theta_union`
|
40
|
+
- `theta_intersection`
|
41
|
+
- `theta_a_not_b`
|
61
42
|
- HLL
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
- `hll_sketch`
|
44
|
+
- `hll_union`
|
45
|
+
- Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
|
65
46
|
- CPC
|
66
|
-
|
67
|
-
|
47
|
+
- `cpc_sketch`
|
48
|
+
- `cpc_union`
|
68
49
|
- VarOpt Sampling
|
69
|
-
|
70
|
-
|
50
|
+
- `var_opt_sketch`
|
51
|
+
- `var_opt_union`
|
71
52
|
- Vector of KLL
|
72
|
-
|
73
|
-
|
53
|
+
- `vector_of_kll_ints_sketches`
|
54
|
+
- `vector_of_kll_floats_sketches`
|
74
55
|
|
75
56
|
## Known Differences from C++
|
76
57
|
|
@@ -79,3 +60,22 @@ The Python API largely mirrors the C++ API, with a few minor exceptions: The pri
|
|
79
60
|
The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
|
80
61
|
|
81
62
|
We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
|
63
|
+
|
64
|
+
## Developer Instructions
|
65
|
+
|
66
|
+
The only developer-specific instructions relate to running unit tests.
|
67
|
+
|
68
|
+
### Unit tests
|
69
|
+
|
70
|
+
The Python unit tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
71
|
+
|
72
|
+
```bash
|
73
|
+
python -m pip install --upgrade tox
|
74
|
+
tox
|
75
|
+
```
|
76
|
+
|
77
|
+
## License
|
78
|
+
|
79
|
+
The Apache DataSketches Library is distributed under an Apache 2.0 License.
|
80
|
+
|
81
|
+
Precompiled binaries may be provided as a convenience and distributed through PyPI via [https://pypi.org/project/datasketches/](https://pypi.org/project/datasketches/). These binaries contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license.
|
@@ -53,7 +53,7 @@ void init_cpc(py::module &m) {
|
|
53
53
|
using namespace datasketches;
|
54
54
|
|
55
55
|
py::class_<cpc_sketch>(m, "cpc_sketch")
|
56
|
-
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=
|
56
|
+
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
|
57
57
|
.def(py::init<const cpc_sketch&>())
|
58
58
|
.def("__str__", &cpc_sketch::to_string,
|
59
59
|
"Produces a string summary of the sketch")
|