datasketches 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -26,7 +26,8 @@
|
|
26
26
|
|
27
27
|
namespace datasketches {
|
28
28
|
|
29
|
-
|
29
|
+
using kll_test_type_sketch = kll_sketch<test_type, test_type_less, test_type_serde, test_allocator<test_type>>;
|
30
|
+
using alloc = test_allocator<test_type>;
|
30
31
|
|
31
32
|
TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
32
33
|
|
@@ -34,7 +35,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
34
35
|
test_allocator_total_bytes = 0;
|
35
36
|
|
36
37
|
SECTION("compact level zero") {
|
37
|
-
kll_test_type_sketch sketch(8);
|
38
|
+
kll_test_type_sketch sketch(8, 0);
|
38
39
|
REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
|
39
40
|
REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
|
40
41
|
REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
|
@@ -59,10 +60,10 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
59
60
|
}
|
60
61
|
|
61
62
|
SECTION("merge small") {
|
62
|
-
kll_test_type_sketch sketch1(8);
|
63
|
+
kll_test_type_sketch sketch1(8, 0);
|
63
64
|
sketch1.update(1);
|
64
65
|
|
65
|
-
kll_test_type_sketch sketch2(8);
|
66
|
+
kll_test_type_sketch sketch2(8, 0);
|
66
67
|
sketch2.update(2);
|
67
68
|
|
68
69
|
sketch2.merge(sketch1);
|
@@ -76,7 +77,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
76
77
|
}
|
77
78
|
|
78
79
|
SECTION("merge higher levels") {
|
79
|
-
kll_test_type_sketch sketch1(8);
|
80
|
+
kll_test_type_sketch sketch1(8, 0);
|
80
81
|
sketch1.update(1);
|
81
82
|
sketch1.update(2);
|
82
83
|
sketch1.update(3);
|
@@ -87,7 +88,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
87
88
|
sketch1.update(8);
|
88
89
|
sketch1.update(9);
|
89
90
|
|
90
|
-
kll_test_type_sketch sketch2(8);
|
91
|
+
kll_test_type_sketch sketch2(8, 0);
|
91
92
|
sketch2.update(10);
|
92
93
|
sketch2.update(11);
|
93
94
|
sketch2.update(12);
|
@@ -109,7 +110,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
109
110
|
}
|
110
111
|
|
111
112
|
SECTION("serialize deserialize") {
|
112
|
-
kll_test_type_sketch sketch1;
|
113
|
+
kll_test_type_sketch sketch1(200, 0);
|
113
114
|
|
114
115
|
const int n = 1000;
|
115
116
|
for (int i = 0; i < n; i++) sketch1.update(i);
|
@@ -117,7 +118,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
117
118
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
118
119
|
sketch1.serialize(s);
|
119
120
|
REQUIRE((size_t) s.tellp() == sketch1.get_serialized_size_bytes());
|
120
|
-
auto sketch2 = kll_test_type_sketch::deserialize(s);
|
121
|
+
auto sketch2 = kll_test_type_sketch::deserialize(s, alloc(0));
|
121
122
|
REQUIRE((size_t) s.tellg() == sketch2.get_serialized_size_bytes());
|
122
123
|
REQUIRE(s.tellg() == s.tellp());
|
123
124
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
@@ -135,9 +136,9 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
135
136
|
}
|
136
137
|
|
137
138
|
SECTION("moving merge") {
|
138
|
-
kll_test_type_sketch sketch1(8);
|
139
|
+
kll_test_type_sketch sketch1(8, 0);
|
139
140
|
for (int i = 0; i < 10; i++) sketch1.update(i);
|
140
|
-
kll_test_type_sketch sketch2(8);
|
141
|
+
kll_test_type_sketch sketch2(8, 0);
|
141
142
|
sketch2.update(10);
|
142
143
|
sketch2.merge(std::move(sketch1));
|
143
144
|
REQUIRE(sketch2.get_min_value().get_value() == 0);
|
@@ -48,14 +48,14 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
48
48
|
test_allocator_total_bytes = 0;
|
49
49
|
|
50
50
|
SECTION("k limits") {
|
51
|
-
kll_float_sketch sketch1(kll_float_sketch::MIN_K); // this should work
|
52
|
-
kll_float_sketch sketch2(kll_float_sketch::MAX_K); // this should work
|
53
|
-
REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1), std::invalid_argument);
|
51
|
+
kll_float_sketch sketch1(kll_float_sketch::MIN_K, 0); // this should work
|
52
|
+
kll_float_sketch sketch2(kll_float_sketch::MAX_K, 0); // this should work
|
53
|
+
REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1, 0), std::invalid_argument);
|
54
54
|
// MAX_K + 1 makes no sense because k is uint16_t
|
55
55
|
}
|
56
56
|
|
57
57
|
SECTION("empty") {
|
58
|
-
kll_float_sketch sketch;
|
58
|
+
kll_float_sketch sketch(200, 0);
|
59
59
|
REQUIRE(sketch.is_empty());
|
60
60
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
61
61
|
REQUIRE(sketch.get_n() == 0);
|
@@ -79,13 +79,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
79
79
|
}
|
80
80
|
|
81
81
|
SECTION("get bad quantile") {
|
82
|
-
kll_float_sketch sketch;
|
82
|
+
kll_float_sketch sketch(200, 0);
|
83
83
|
sketch.update(0); // has to be non-empty to reach the check
|
84
84
|
REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
|
85
85
|
}
|
86
86
|
|
87
87
|
SECTION("one item") {
|
88
|
-
kll_float_sketch sketch;
|
88
|
+
kll_float_sketch sketch(200, 0);
|
89
89
|
sketch.update(1);
|
90
90
|
REQUIRE_FALSE(sketch.is_empty());
|
91
91
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
@@ -112,7 +112,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
112
112
|
}
|
113
113
|
|
114
114
|
SECTION("NaN") {
|
115
|
-
kll_float_sketch sketch;
|
115
|
+
kll_float_sketch sketch(200, 0);
|
116
116
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
117
117
|
REQUIRE(sketch.is_empty());
|
118
118
|
|
@@ -122,7 +122,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
122
122
|
}
|
123
123
|
|
124
124
|
SECTION("many items, exact mode") {
|
125
|
-
kll_float_sketch sketch;
|
125
|
+
kll_float_sketch sketch(200, 0);
|
126
126
|
const uint32_t n(200);
|
127
127
|
for (uint32_t i = 0; i < n; i++) {
|
128
128
|
sketch.update(i);
|
@@ -157,7 +157,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
157
157
|
}
|
158
158
|
|
159
159
|
SECTION("10 items") {
|
160
|
-
kll_float_sketch sketch;
|
160
|
+
kll_float_sketch sketch(200, 0);
|
161
161
|
sketch.update(1);
|
162
162
|
sketch.update(2);
|
163
163
|
sketch.update(3);
|
@@ -175,7 +175,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
175
175
|
}
|
176
176
|
|
177
177
|
SECTION("100 items") {
|
178
|
-
kll_float_sketch sketch;
|
178
|
+
kll_float_sketch sketch(200, 0);
|
179
179
|
for (int i = 0; i < 100; ++i) sketch.update(i);
|
180
180
|
REQUIRE(sketch.get_quantile(0) == 0);
|
181
181
|
REQUIRE(sketch.get_quantile(0.01) == 1);
|
@@ -185,7 +185,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
185
185
|
}
|
186
186
|
|
187
187
|
SECTION("many items, estimation mode") {
|
188
|
-
kll_float_sketch sketch;
|
188
|
+
kll_float_sketch sketch(200, 0);
|
189
189
|
const int n(1000000);
|
190
190
|
for (int i = 0; i < n; i++) {
|
191
191
|
sketch.update(i);
|
@@ -227,7 +227,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
227
227
|
}
|
228
228
|
|
229
229
|
SECTION("consistency between get_rank adn get_PMF/CDF") {
|
230
|
-
kll_float_sketch sketch;
|
230
|
+
kll_float_sketch sketch(200, 0);
|
231
231
|
const int n = 1000;
|
232
232
|
float values[n];
|
233
233
|
for (int i = 0; i < n; i++) {
|
@@ -256,7 +256,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
256
256
|
std::ifstream is;
|
257
257
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
258
258
|
is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
|
259
|
-
auto sketch = kll_float_sketch::deserialize(is);
|
259
|
+
auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
|
260
260
|
REQUIRE_FALSE(sketch.is_empty());
|
261
261
|
REQUIRE(sketch.is_estimation_mode());
|
262
262
|
REQUIRE(sketch.get_n() == 1000000);
|
@@ -266,11 +266,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
266
266
|
}
|
267
267
|
|
268
268
|
SECTION("stream serialize deserialize empty") {
|
269
|
-
kll_float_sketch sketch;
|
269
|
+
kll_float_sketch sketch(200, 0);
|
270
270
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
271
271
|
sketch.serialize(s);
|
272
272
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
273
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
273
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
274
274
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
275
275
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
276
276
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -283,9 +283,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
283
283
|
}
|
284
284
|
|
285
285
|
SECTION("bytes serialize deserialize empty") {
|
286
|
-
kll_float_sketch sketch;
|
286
|
+
kll_float_sketch sketch(200, 0);
|
287
287
|
auto bytes = sketch.serialize();
|
288
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
|
288
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
289
289
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
290
290
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
291
291
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -298,12 +298,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
298
298
|
}
|
299
299
|
|
300
300
|
SECTION("serialize deserialize one item") {
|
301
|
-
kll_float_sketch sketch;
|
301
|
+
kll_float_sketch sketch(200, 0);
|
302
302
|
sketch.update(1);
|
303
303
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
304
304
|
sketch.serialize(s);
|
305
305
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
306
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
306
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
307
307
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
308
308
|
REQUIRE(s.tellg() == s.tellp());
|
309
309
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -321,7 +321,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
321
321
|
std::ifstream is;
|
322
322
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
323
323
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
324
|
-
auto sketch = kll_float_sketch::deserialize(is);
|
324
|
+
auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
|
325
325
|
REQUIRE_FALSE(sketch.is_empty());
|
326
326
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
327
327
|
REQUIRE(sketch.get_n() == 1);
|
@@ -331,13 +331,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
331
331
|
}
|
332
332
|
|
333
333
|
SECTION("stream serialize deserialize many floats") {
|
334
|
-
kll_float_sketch sketch;
|
334
|
+
kll_float_sketch sketch(200, 0);
|
335
335
|
const int n(1000);
|
336
336
|
for (int i = 0; i < n; i++) sketch.update(i);
|
337
337
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
338
338
|
sketch.serialize(s);
|
339
339
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
340
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
340
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
341
341
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
342
342
|
REQUIRE(s.tellg() == s.tellp());
|
343
343
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
@@ -354,12 +354,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
354
354
|
}
|
355
355
|
|
356
356
|
SECTION("bytes serialize deserialize many floats") {
|
357
|
-
kll_float_sketch sketch;
|
357
|
+
kll_float_sketch sketch(200, 0);
|
358
358
|
const int n(1000);
|
359
359
|
for (int i = 0; i < n; i++) sketch.update(i);
|
360
360
|
auto bytes = sketch.serialize();
|
361
361
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
362
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
|
362
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
363
363
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
364
364
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
365
365
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -414,7 +414,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
414
414
|
}
|
415
415
|
|
416
416
|
SECTION("out of order split points, float") {
|
417
|
-
kll_float_sketch sketch;
|
417
|
+
kll_float_sketch sketch(200, 0);
|
418
418
|
sketch.update(0); // has to be non-empty to reach the check
|
419
419
|
float split_points[2] = {1, 0};
|
420
420
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
|
@@ -428,15 +428,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
428
428
|
}
|
429
429
|
|
430
430
|
SECTION("NaN split point") {
|
431
|
-
kll_float_sketch sketch;
|
431
|
+
kll_float_sketch sketch(200, 0);
|
432
432
|
sketch.update(0); // has to be non-empty to reach the check
|
433
433
|
float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
|
434
434
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
|
435
435
|
}
|
436
436
|
|
437
437
|
SECTION("merge") {
|
438
|
-
kll_float_sketch sketch1;
|
439
|
-
kll_float_sketch sketch2;
|
438
|
+
kll_float_sketch sketch1(200, 0);
|
439
|
+
kll_float_sketch sketch2(200, 0);
|
440
440
|
const int n = 10000;
|
441
441
|
for (int i = 0; i < n; i++) {
|
442
442
|
sketch1.update(i);
|
@@ -458,8 +458,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
458
458
|
}
|
459
459
|
|
460
460
|
SECTION("merge lower k") {
|
461
|
-
kll_float_sketch sketch1(256);
|
462
|
-
kll_float_sketch sketch2(128);
|
461
|
+
kll_float_sketch sketch1(256, 0);
|
462
|
+
kll_float_sketch sketch2(128, 0);
|
463
463
|
const int n = 10000;
|
464
464
|
for (int i = 0; i < n; i++) {
|
465
465
|
sketch1.update(i);
|
@@ -471,6 +471,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
471
471
|
REQUIRE(sketch2.get_min_value() == n);
|
472
472
|
REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
|
473
473
|
|
474
|
+
REQUIRE(sketch1.get_k() == 256);
|
475
|
+
REQUIRE(sketch2.get_k() == 128);
|
476
|
+
|
474
477
|
REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
|
475
478
|
REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
|
476
479
|
|
@@ -488,8 +491,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
488
491
|
}
|
489
492
|
|
490
493
|
SECTION("merge exact mode, lower k") {
|
491
|
-
kll_float_sketch sketch1(256);
|
492
|
-
kll_float_sketch sketch2(128);
|
494
|
+
kll_float_sketch sketch1(256, 0);
|
495
|
+
kll_float_sketch sketch2(128, 0);
|
493
496
|
const int n = 10000;
|
494
497
|
for (int i = 0; i < n; i++) {
|
495
498
|
sketch1.update(i);
|
@@ -513,8 +516,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
513
516
|
}
|
514
517
|
|
515
518
|
SECTION("merge min value from other") {
|
516
|
-
kll_float_sketch sketch1;
|
517
|
-
kll_float_sketch sketch2;
|
519
|
+
kll_float_sketch sketch1(200, 0);
|
520
|
+
kll_float_sketch sketch2(200, 0);
|
518
521
|
sketch1.update(1);
|
519
522
|
sketch2.update(2);
|
520
523
|
sketch2.merge(sketch1);
|
@@ -523,9 +526,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
523
526
|
}
|
524
527
|
|
525
528
|
SECTION("merge min and max values from other") {
|
526
|
-
kll_float_sketch sketch1;
|
529
|
+
kll_float_sketch sketch1(200, 0);
|
527
530
|
for (int i = 0; i < 1000000; i++) sketch1.update(i);
|
528
|
-
kll_float_sketch sketch2;
|
531
|
+
kll_float_sketch sketch2(200, 0);
|
529
532
|
sketch2.merge(sketch1);
|
530
533
|
REQUIRE(sketch2.get_min_value() == 0.0f);
|
531
534
|
REQUIRE(sketch2.get_max_value() == 999999.0f);
|
@@ -560,7 +563,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
560
563
|
}
|
561
564
|
|
562
565
|
SECTION("sketch of strings stream") {
|
563
|
-
kll_string_sketch sketch1;
|
566
|
+
kll_string_sketch sketch1(200, 0);
|
564
567
|
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
|
565
568
|
REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
|
566
569
|
REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
|
@@ -575,7 +578,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
575
578
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
576
579
|
sketch1.serialize(s);
|
577
580
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes());
|
578
|
-
auto sketch2 = kll_string_sketch::deserialize(s);
|
581
|
+
auto sketch2 = kll_string_sketch::deserialize(s, test_allocator<std::string>(0));
|
579
582
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
580
583
|
REQUIRE(s.tellg() == s.tellp());
|
581
584
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
@@ -599,7 +602,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
599
602
|
}
|
600
603
|
|
601
604
|
SECTION("sketch of strings bytes") {
|
602
|
-
kll_string_sketch sketch1;
|
605
|
+
kll_string_sketch sketch1(200, 0);
|
603
606
|
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
|
604
607
|
REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
|
605
608
|
REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
|
@@ -613,7 +616,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
613
616
|
|
614
617
|
auto bytes = sketch1.serialize();
|
615
618
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
616
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
|
619
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
617
620
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
618
621
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
619
622
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
@@ -630,11 +633,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
630
633
|
|
631
634
|
|
632
635
|
SECTION("sketch of strings, single item, bytes") {
|
633
|
-
kll_string_sketch sketch1;
|
636
|
+
kll_string_sketch sketch1(200, 0);
|
634
637
|
sketch1.update("a");
|
635
638
|
auto bytes = sketch1.serialize();
|
636
639
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
637
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
|
640
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
638
641
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
639
642
|
}
|
640
643
|
|
@@ -35,6 +35,7 @@ target_link_libraries(python
|
|
35
35
|
fi
|
36
36
|
theta
|
37
37
|
sampling
|
38
|
+
req
|
38
39
|
pybind11::module
|
39
40
|
)
|
40
41
|
|
@@ -57,5 +58,6 @@ target_sources(python
|
|
57
58
|
src/fi_wrapper.cpp
|
58
59
|
src/theta_wrapper.cpp
|
59
60
|
src/vo_wrapper.cpp
|
61
|
+
src/req_wrapper.cpp
|
60
62
|
src/vector_of_kll.cpp
|
61
63
|
)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Python Wrapper for
|
1
|
+
# Python Wrapper for Apache DataSketches
|
2
2
|
|
3
3
|
## Installation
|
4
4
|
|
@@ -39,13 +39,16 @@ tox
|
|
39
39
|
|
40
40
|
## Usage
|
41
41
|
|
42
|
-
Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
|
42
|
+
Having installed the library, loading the Apache Datasketches library in Python is simple: `import datasketches`.
|
43
43
|
|
44
44
|
## Available Sketch Classes
|
45
45
|
|
46
|
-
- KLL
|
46
|
+
- KLL (Absolute Error Quantiles)
|
47
47
|
- `kll_ints_sketch`
|
48
48
|
- `kll_floats_sketch`
|
49
|
+
- REQ (Relative Error Quantiles)
|
50
|
+
- `req_ints_sketch`
|
51
|
+
- `req_floats_sketch`
|
49
52
|
- Frequent Items
|
50
53
|
- `frequent_strings_sketch`
|
51
54
|
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
@@ -27,6 +27,7 @@ void init_fi(py::module& m);
|
|
27
27
|
void init_cpc(py::module& m);
|
28
28
|
void init_theta(py::module& m);
|
29
29
|
void init_vo(py::module& m);
|
30
|
+
void init_req(py::module& m);
|
30
31
|
void init_vector_of_kll(py::module& m);
|
31
32
|
|
32
33
|
PYBIND11_MODULE(datasketches, m) {
|
@@ -36,5 +37,6 @@ PYBIND11_MODULE(datasketches, m) {
|
|
36
37
|
init_cpc(m);
|
37
38
|
init_theta(m);
|
38
39
|
init_vo(m);
|
40
|
+
init_req(m);
|
39
41
|
init_vector_of_kll(m);
|
40
42
|
}
|
@@ -113,8 +113,6 @@ void init_hll(py::module &m) {
|
|
113
113
|
"Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
|
114
114
|
.def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
|
115
115
|
"Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
|
116
|
-
.def("is_compact", &hll_union::is_compact,
|
117
|
-
"True if the union is compact, otherwise False")
|
118
116
|
.def("is_empty", &hll_union::is_empty,
|
119
117
|
"True if the union is empty, otherwise False")
|
120
118
|
.def("reset", &hll_union::reset,
|
@@ -130,6 +130,8 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
130
130
|
"Produces a string summary of the sketch")
|
131
131
|
.def("is_empty", &kll_sketch<T>::is_empty,
|
132
132
|
"Returns True if the sketch is empty, otherwise False")
|
133
|
+
.def("get_k", &kll_sketch<T>::get_k,
|
134
|
+
"Returns the configured parameter k")
|
133
135
|
.def("get_n", &kll_sketch<T>::get_n,
|
134
136
|
"Returns the length of the input stream")
|
135
137
|
.def("get_num_retained", &kll_sketch<T>::get_num_retained,
|
@@ -198,7 +200,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
198
200
|
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
199
201
|
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
200
202
|
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
201
|
-
.def("serialize", &dspy::kll_sketch_serialize<T>, "
|
203
|
+
.def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
202
204
|
.def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
|
203
205
|
;
|
204
206
|
}
|