datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -26,7 +26,8 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
using kll_test_type_sketch = kll_sketch<test_type, test_type_less, test_type_serde, test_allocator<test_type>>;
|
|
30
|
+
using alloc = test_allocator<test_type>;
|
|
30
31
|
|
|
31
32
|
TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
32
33
|
|
|
@@ -34,7 +35,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
34
35
|
test_allocator_total_bytes = 0;
|
|
35
36
|
|
|
36
37
|
SECTION("compact level zero") {
|
|
37
|
-
kll_test_type_sketch sketch(8);
|
|
38
|
+
kll_test_type_sketch sketch(8, 0);
|
|
38
39
|
REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
|
|
39
40
|
REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
|
|
40
41
|
REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
|
|
@@ -59,10 +60,10 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
59
60
|
}
|
|
60
61
|
|
|
61
62
|
SECTION("merge small") {
|
|
62
|
-
kll_test_type_sketch sketch1(8);
|
|
63
|
+
kll_test_type_sketch sketch1(8, 0);
|
|
63
64
|
sketch1.update(1);
|
|
64
65
|
|
|
65
|
-
kll_test_type_sketch sketch2(8);
|
|
66
|
+
kll_test_type_sketch sketch2(8, 0);
|
|
66
67
|
sketch2.update(2);
|
|
67
68
|
|
|
68
69
|
sketch2.merge(sketch1);
|
|
@@ -76,7 +77,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
76
77
|
}
|
|
77
78
|
|
|
78
79
|
SECTION("merge higher levels") {
|
|
79
|
-
kll_test_type_sketch sketch1(8);
|
|
80
|
+
kll_test_type_sketch sketch1(8, 0);
|
|
80
81
|
sketch1.update(1);
|
|
81
82
|
sketch1.update(2);
|
|
82
83
|
sketch1.update(3);
|
|
@@ -87,7 +88,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
87
88
|
sketch1.update(8);
|
|
88
89
|
sketch1.update(9);
|
|
89
90
|
|
|
90
|
-
kll_test_type_sketch sketch2(8);
|
|
91
|
+
kll_test_type_sketch sketch2(8, 0);
|
|
91
92
|
sketch2.update(10);
|
|
92
93
|
sketch2.update(11);
|
|
93
94
|
sketch2.update(12);
|
|
@@ -109,7 +110,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
109
110
|
}
|
|
110
111
|
|
|
111
112
|
SECTION("serialize deserialize") {
|
|
112
|
-
kll_test_type_sketch sketch1;
|
|
113
|
+
kll_test_type_sketch sketch1(200, 0);
|
|
113
114
|
|
|
114
115
|
const int n = 1000;
|
|
115
116
|
for (int i = 0; i < n; i++) sketch1.update(i);
|
|
@@ -117,7 +118,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
117
118
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
118
119
|
sketch1.serialize(s);
|
|
119
120
|
REQUIRE((size_t) s.tellp() == sketch1.get_serialized_size_bytes());
|
|
120
|
-
auto sketch2 = kll_test_type_sketch::deserialize(s);
|
|
121
|
+
auto sketch2 = kll_test_type_sketch::deserialize(s, alloc(0));
|
|
121
122
|
REQUIRE((size_t) s.tellg() == sketch2.get_serialized_size_bytes());
|
|
122
123
|
REQUIRE(s.tellg() == s.tellp());
|
|
123
124
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
|
@@ -135,9 +136,9 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
|
135
136
|
}
|
|
136
137
|
|
|
137
138
|
SECTION("moving merge") {
|
|
138
|
-
kll_test_type_sketch sketch1(8);
|
|
139
|
+
kll_test_type_sketch sketch1(8, 0);
|
|
139
140
|
for (int i = 0; i < 10; i++) sketch1.update(i);
|
|
140
|
-
kll_test_type_sketch sketch2(8);
|
|
141
|
+
kll_test_type_sketch sketch2(8, 0);
|
|
141
142
|
sketch2.update(10);
|
|
142
143
|
sketch2.merge(std::move(sketch1));
|
|
143
144
|
REQUIRE(sketch2.get_min_value().get_value() == 0);
|
|
@@ -48,14 +48,14 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
48
48
|
test_allocator_total_bytes = 0;
|
|
49
49
|
|
|
50
50
|
SECTION("k limits") {
|
|
51
|
-
kll_float_sketch sketch1(kll_float_sketch::MIN_K); // this should work
|
|
52
|
-
kll_float_sketch sketch2(kll_float_sketch::MAX_K); // this should work
|
|
53
|
-
REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1), std::invalid_argument);
|
|
51
|
+
kll_float_sketch sketch1(kll_float_sketch::MIN_K, 0); // this should work
|
|
52
|
+
kll_float_sketch sketch2(kll_float_sketch::MAX_K, 0); // this should work
|
|
53
|
+
REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1, 0), std::invalid_argument);
|
|
54
54
|
// MAX_K + 1 makes no sense because k is uint16_t
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
SECTION("empty") {
|
|
58
|
-
kll_float_sketch sketch;
|
|
58
|
+
kll_float_sketch sketch(200, 0);
|
|
59
59
|
REQUIRE(sketch.is_empty());
|
|
60
60
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
61
61
|
REQUIRE(sketch.get_n() == 0);
|
|
@@ -79,13 +79,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
SECTION("get bad quantile") {
|
|
82
|
-
kll_float_sketch sketch;
|
|
82
|
+
kll_float_sketch sketch(200, 0);
|
|
83
83
|
sketch.update(0); // has to be non-empty to reach the check
|
|
84
84
|
REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
|
|
85
85
|
}
|
|
86
86
|
|
|
87
87
|
SECTION("one item") {
|
|
88
|
-
kll_float_sketch sketch;
|
|
88
|
+
kll_float_sketch sketch(200, 0);
|
|
89
89
|
sketch.update(1);
|
|
90
90
|
REQUIRE_FALSE(sketch.is_empty());
|
|
91
91
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
@@ -112,7 +112,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
112
112
|
}
|
|
113
113
|
|
|
114
114
|
SECTION("NaN") {
|
|
115
|
-
kll_float_sketch sketch;
|
|
115
|
+
kll_float_sketch sketch(200, 0);
|
|
116
116
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
|
117
117
|
REQUIRE(sketch.is_empty());
|
|
118
118
|
|
|
@@ -122,7 +122,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
122
122
|
}
|
|
123
123
|
|
|
124
124
|
SECTION("many items, exact mode") {
|
|
125
|
-
kll_float_sketch sketch;
|
|
125
|
+
kll_float_sketch sketch(200, 0);
|
|
126
126
|
const uint32_t n(200);
|
|
127
127
|
for (uint32_t i = 0; i < n; i++) {
|
|
128
128
|
sketch.update(i);
|
|
@@ -157,7 +157,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
157
157
|
}
|
|
158
158
|
|
|
159
159
|
SECTION("10 items") {
|
|
160
|
-
kll_float_sketch sketch;
|
|
160
|
+
kll_float_sketch sketch(200, 0);
|
|
161
161
|
sketch.update(1);
|
|
162
162
|
sketch.update(2);
|
|
163
163
|
sketch.update(3);
|
|
@@ -175,7 +175,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
175
175
|
}
|
|
176
176
|
|
|
177
177
|
SECTION("100 items") {
|
|
178
|
-
kll_float_sketch sketch;
|
|
178
|
+
kll_float_sketch sketch(200, 0);
|
|
179
179
|
for (int i = 0; i < 100; ++i) sketch.update(i);
|
|
180
180
|
REQUIRE(sketch.get_quantile(0) == 0);
|
|
181
181
|
REQUIRE(sketch.get_quantile(0.01) == 1);
|
|
@@ -185,7 +185,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
185
185
|
}
|
|
186
186
|
|
|
187
187
|
SECTION("many items, estimation mode") {
|
|
188
|
-
kll_float_sketch sketch;
|
|
188
|
+
kll_float_sketch sketch(200, 0);
|
|
189
189
|
const int n(1000000);
|
|
190
190
|
for (int i = 0; i < n; i++) {
|
|
191
191
|
sketch.update(i);
|
|
@@ -227,7 +227,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
227
227
|
}
|
|
228
228
|
|
|
229
229
|
SECTION("consistency between get_rank adn get_PMF/CDF") {
|
|
230
|
-
kll_float_sketch sketch;
|
|
230
|
+
kll_float_sketch sketch(200, 0);
|
|
231
231
|
const int n = 1000;
|
|
232
232
|
float values[n];
|
|
233
233
|
for (int i = 0; i < n; i++) {
|
|
@@ -256,7 +256,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
256
256
|
std::ifstream is;
|
|
257
257
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
258
258
|
is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
|
|
259
|
-
auto sketch = kll_float_sketch::deserialize(is);
|
|
259
|
+
auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
|
|
260
260
|
REQUIRE_FALSE(sketch.is_empty());
|
|
261
261
|
REQUIRE(sketch.is_estimation_mode());
|
|
262
262
|
REQUIRE(sketch.get_n() == 1000000);
|
|
@@ -266,11 +266,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
266
266
|
}
|
|
267
267
|
|
|
268
268
|
SECTION("stream serialize deserialize empty") {
|
|
269
|
-
kll_float_sketch sketch;
|
|
269
|
+
kll_float_sketch sketch(200, 0);
|
|
270
270
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
271
271
|
sketch.serialize(s);
|
|
272
272
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
273
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
|
273
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
|
274
274
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
275
275
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
276
276
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
|
@@ -283,9 +283,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
283
283
|
}
|
|
284
284
|
|
|
285
285
|
SECTION("bytes serialize deserialize empty") {
|
|
286
|
-
kll_float_sketch sketch;
|
|
286
|
+
kll_float_sketch sketch(200, 0);
|
|
287
287
|
auto bytes = sketch.serialize();
|
|
288
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
|
|
288
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
289
289
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
290
290
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
291
291
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
|
@@ -298,12 +298,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
298
298
|
}
|
|
299
299
|
|
|
300
300
|
SECTION("serialize deserialize one item") {
|
|
301
|
-
kll_float_sketch sketch;
|
|
301
|
+
kll_float_sketch sketch(200, 0);
|
|
302
302
|
sketch.update(1);
|
|
303
303
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
304
304
|
sketch.serialize(s);
|
|
305
305
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
306
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
|
306
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
|
307
307
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
308
308
|
REQUIRE(s.tellg() == s.tellp());
|
|
309
309
|
REQUIRE_FALSE(sketch2.is_empty());
|
|
@@ -321,7 +321,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
321
321
|
std::ifstream is;
|
|
322
322
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
323
323
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
|
324
|
-
auto sketch = kll_float_sketch::deserialize(is);
|
|
324
|
+
auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
|
|
325
325
|
REQUIRE_FALSE(sketch.is_empty());
|
|
326
326
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
327
327
|
REQUIRE(sketch.get_n() == 1);
|
|
@@ -331,13 +331,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
331
331
|
}
|
|
332
332
|
|
|
333
333
|
SECTION("stream serialize deserialize many floats") {
|
|
334
|
-
kll_float_sketch sketch;
|
|
334
|
+
kll_float_sketch sketch(200, 0);
|
|
335
335
|
const int n(1000);
|
|
336
336
|
for (int i = 0; i < n; i++) sketch.update(i);
|
|
337
337
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
338
338
|
sketch.serialize(s);
|
|
339
339
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
340
|
-
auto sketch2 = kll_float_sketch::deserialize(s);
|
|
340
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
|
341
341
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
342
342
|
REQUIRE(s.tellg() == s.tellp());
|
|
343
343
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
@@ -354,12 +354,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
354
354
|
}
|
|
355
355
|
|
|
356
356
|
SECTION("bytes serialize deserialize many floats") {
|
|
357
|
-
kll_float_sketch sketch;
|
|
357
|
+
kll_float_sketch sketch(200, 0);
|
|
358
358
|
const int n(1000);
|
|
359
359
|
for (int i = 0; i < n; i++) sketch.update(i);
|
|
360
360
|
auto bytes = sketch.serialize();
|
|
361
361
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
362
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
|
|
362
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
363
363
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
364
364
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
365
365
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
|
@@ -414,7 +414,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
414
414
|
}
|
|
415
415
|
|
|
416
416
|
SECTION("out of order split points, float") {
|
|
417
|
-
kll_float_sketch sketch;
|
|
417
|
+
kll_float_sketch sketch(200, 0);
|
|
418
418
|
sketch.update(0); // has to be non-empty to reach the check
|
|
419
419
|
float split_points[2] = {1, 0};
|
|
420
420
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
|
|
@@ -428,15 +428,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
428
428
|
}
|
|
429
429
|
|
|
430
430
|
SECTION("NaN split point") {
|
|
431
|
-
kll_float_sketch sketch;
|
|
431
|
+
kll_float_sketch sketch(200, 0);
|
|
432
432
|
sketch.update(0); // has to be non-empty to reach the check
|
|
433
433
|
float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
|
|
434
434
|
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
|
|
435
435
|
}
|
|
436
436
|
|
|
437
437
|
SECTION("merge") {
|
|
438
|
-
kll_float_sketch sketch1;
|
|
439
|
-
kll_float_sketch sketch2;
|
|
438
|
+
kll_float_sketch sketch1(200, 0);
|
|
439
|
+
kll_float_sketch sketch2(200, 0);
|
|
440
440
|
const int n = 10000;
|
|
441
441
|
for (int i = 0; i < n; i++) {
|
|
442
442
|
sketch1.update(i);
|
|
@@ -458,8 +458,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
458
458
|
}
|
|
459
459
|
|
|
460
460
|
SECTION("merge lower k") {
|
|
461
|
-
kll_float_sketch sketch1(256);
|
|
462
|
-
kll_float_sketch sketch2(128);
|
|
461
|
+
kll_float_sketch sketch1(256, 0);
|
|
462
|
+
kll_float_sketch sketch2(128, 0);
|
|
463
463
|
const int n = 10000;
|
|
464
464
|
for (int i = 0; i < n; i++) {
|
|
465
465
|
sketch1.update(i);
|
|
@@ -471,6 +471,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
471
471
|
REQUIRE(sketch2.get_min_value() == n);
|
|
472
472
|
REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
|
|
473
473
|
|
|
474
|
+
REQUIRE(sketch1.get_k() == 256);
|
|
475
|
+
REQUIRE(sketch2.get_k() == 128);
|
|
476
|
+
|
|
474
477
|
REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
|
|
475
478
|
REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
|
|
476
479
|
|
|
@@ -488,8 +491,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
488
491
|
}
|
|
489
492
|
|
|
490
493
|
SECTION("merge exact mode, lower k") {
|
|
491
|
-
kll_float_sketch sketch1(256);
|
|
492
|
-
kll_float_sketch sketch2(128);
|
|
494
|
+
kll_float_sketch sketch1(256, 0);
|
|
495
|
+
kll_float_sketch sketch2(128, 0);
|
|
493
496
|
const int n = 10000;
|
|
494
497
|
for (int i = 0; i < n; i++) {
|
|
495
498
|
sketch1.update(i);
|
|
@@ -513,8 +516,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
513
516
|
}
|
|
514
517
|
|
|
515
518
|
SECTION("merge min value from other") {
|
|
516
|
-
kll_float_sketch sketch1;
|
|
517
|
-
kll_float_sketch sketch2;
|
|
519
|
+
kll_float_sketch sketch1(200, 0);
|
|
520
|
+
kll_float_sketch sketch2(200, 0);
|
|
518
521
|
sketch1.update(1);
|
|
519
522
|
sketch2.update(2);
|
|
520
523
|
sketch2.merge(sketch1);
|
|
@@ -523,9 +526,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
523
526
|
}
|
|
524
527
|
|
|
525
528
|
SECTION("merge min and max values from other") {
|
|
526
|
-
kll_float_sketch sketch1;
|
|
529
|
+
kll_float_sketch sketch1(200, 0);
|
|
527
530
|
for (int i = 0; i < 1000000; i++) sketch1.update(i);
|
|
528
|
-
kll_float_sketch sketch2;
|
|
531
|
+
kll_float_sketch sketch2(200, 0);
|
|
529
532
|
sketch2.merge(sketch1);
|
|
530
533
|
REQUIRE(sketch2.get_min_value() == 0.0f);
|
|
531
534
|
REQUIRE(sketch2.get_max_value() == 999999.0f);
|
|
@@ -560,7 +563,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
560
563
|
}
|
|
561
564
|
|
|
562
565
|
SECTION("sketch of strings stream") {
|
|
563
|
-
kll_string_sketch sketch1;
|
|
566
|
+
kll_string_sketch sketch1(200, 0);
|
|
564
567
|
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
|
|
565
568
|
REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
|
|
566
569
|
REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
|
|
@@ -575,7 +578,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
575
578
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
576
579
|
sketch1.serialize(s);
|
|
577
580
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes());
|
|
578
|
-
auto sketch2 = kll_string_sketch::deserialize(s);
|
|
581
|
+
auto sketch2 = kll_string_sketch::deserialize(s, test_allocator<std::string>(0));
|
|
579
582
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
580
583
|
REQUIRE(s.tellg() == s.tellp());
|
|
581
584
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
|
@@ -599,7 +602,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
599
602
|
}
|
|
600
603
|
|
|
601
604
|
SECTION("sketch of strings bytes") {
|
|
602
|
-
kll_string_sketch sketch1;
|
|
605
|
+
kll_string_sketch sketch1(200, 0);
|
|
603
606
|
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
|
|
604
607
|
REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
|
|
605
608
|
REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
|
|
@@ -613,7 +616,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
613
616
|
|
|
614
617
|
auto bytes = sketch1.serialize();
|
|
615
618
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
|
616
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
|
|
619
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
617
620
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
618
621
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
|
619
622
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
|
@@ -630,11 +633,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
630
633
|
|
|
631
634
|
|
|
632
635
|
SECTION("sketch of strings, single item, bytes") {
|
|
633
|
-
kll_string_sketch sketch1;
|
|
636
|
+
kll_string_sketch sketch1(200, 0);
|
|
634
637
|
sketch1.update("a");
|
|
635
638
|
auto bytes = sketch1.serialize();
|
|
636
639
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
|
637
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
|
|
640
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
638
641
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
639
642
|
}
|
|
640
643
|
|
|
@@ -35,6 +35,7 @@ target_link_libraries(python
|
|
|
35
35
|
fi
|
|
36
36
|
theta
|
|
37
37
|
sampling
|
|
38
|
+
req
|
|
38
39
|
pybind11::module
|
|
39
40
|
)
|
|
40
41
|
|
|
@@ -57,5 +58,6 @@ target_sources(python
|
|
|
57
58
|
src/fi_wrapper.cpp
|
|
58
59
|
src/theta_wrapper.cpp
|
|
59
60
|
src/vo_wrapper.cpp
|
|
61
|
+
src/req_wrapper.cpp
|
|
60
62
|
src/vector_of_kll.cpp
|
|
61
63
|
)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Python Wrapper for
|
|
1
|
+
# Python Wrapper for Apache DataSketches
|
|
2
2
|
|
|
3
3
|
## Installation
|
|
4
4
|
|
|
@@ -39,13 +39,16 @@ tox
|
|
|
39
39
|
|
|
40
40
|
## Usage
|
|
41
41
|
|
|
42
|
-
Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
|
|
42
|
+
Having installed the library, loading the Apache Datasketches library in Python is simple: `import datasketches`.
|
|
43
43
|
|
|
44
44
|
## Available Sketch Classes
|
|
45
45
|
|
|
46
|
-
- KLL
|
|
46
|
+
- KLL (Absolute Error Quantiles)
|
|
47
47
|
- `kll_ints_sketch`
|
|
48
48
|
- `kll_floats_sketch`
|
|
49
|
+
- REQ (Relative Error Quantiles)
|
|
50
|
+
- `req_ints_sketch`
|
|
51
|
+
- `req_floats_sketch`
|
|
49
52
|
- Frequent Items
|
|
50
53
|
- `frequent_strings_sketch`
|
|
51
54
|
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
|
@@ -27,6 +27,7 @@ void init_fi(py::module& m);
|
|
|
27
27
|
void init_cpc(py::module& m);
|
|
28
28
|
void init_theta(py::module& m);
|
|
29
29
|
void init_vo(py::module& m);
|
|
30
|
+
void init_req(py::module& m);
|
|
30
31
|
void init_vector_of_kll(py::module& m);
|
|
31
32
|
|
|
32
33
|
PYBIND11_MODULE(datasketches, m) {
|
|
@@ -36,5 +37,6 @@ PYBIND11_MODULE(datasketches, m) {
|
|
|
36
37
|
init_cpc(m);
|
|
37
38
|
init_theta(m);
|
|
38
39
|
init_vo(m);
|
|
40
|
+
init_req(m);
|
|
39
41
|
init_vector_of_kll(m);
|
|
40
42
|
}
|
|
@@ -113,8 +113,6 @@ void init_hll(py::module &m) {
|
|
|
113
113
|
"Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
|
|
114
114
|
.def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
|
|
115
115
|
"Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
|
|
116
|
-
.def("is_compact", &hll_union::is_compact,
|
|
117
|
-
"True if the union is compact, otherwise False")
|
|
118
116
|
.def("is_empty", &hll_union::is_empty,
|
|
119
117
|
"True if the union is empty, otherwise False")
|
|
120
118
|
.def("reset", &hll_union::reset,
|
|
@@ -130,6 +130,8 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
130
130
|
"Produces a string summary of the sketch")
|
|
131
131
|
.def("is_empty", &kll_sketch<T>::is_empty,
|
|
132
132
|
"Returns True if the sketch is empty, otherwise False")
|
|
133
|
+
.def("get_k", &kll_sketch<T>::get_k,
|
|
134
|
+
"Returns the configured parameter k")
|
|
133
135
|
.def("get_n", &kll_sketch<T>::get_n,
|
|
134
136
|
"Returns the length of the input stream")
|
|
135
137
|
.def("get_num_retained", &kll_sketch<T>::get_num_retained,
|
|
@@ -198,7 +200,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
198
200
|
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
|
199
201
|
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
|
200
202
|
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
|
201
|
-
.def("serialize", &dspy::kll_sketch_serialize<T>, "
|
|
203
|
+
.def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
|
202
204
|
.def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
|
|
203
205
|
;
|
|
204
206
|
}
|