datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +18 -10
@@ -0,0 +1,60 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # test executable for the bloom filter and bit array operations
19
+
20
+ # BLOOM FILTER
21
+ add_executable(bloom_filter_test) # sources are attached further down via target_sources
22
+
23
+ target_link_libraries(bloom_filter_test filters common_test_lib)
24
+
25
+ set_target_properties(bloom_filter_test PROPERTIES
26
+ CXX_STANDARD_REQUIRED YES
27
+ )
28
+
29
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" FILTERS_TEST_BINARY_PATH) # convert this directory's path to CMake-style forward slashes
30
+ string(APPEND FILTERS_TEST_BINARY_PATH "/") # trailing slash so test code can append a relative path directly
31
+ target_compile_definitions(bloom_filter_test
32
+ PRIVATE
33
+ TEST_BINARY_INPUT_PATH="${FILTERS_TEST_BINARY_PATH}"
34
+ )
35
+
36
+ add_test(
37
+ NAME bloom_filter_test
38
+ COMMAND bloom_filter_test
39
+ )
40
+
41
+ target_sources(bloom_filter_test
42
+ PRIVATE
43
+ bit_array_ops_test.cpp
44
+ bloom_filter_test.cpp
45
+ bloom_filter_allocation_test.cpp
46
+ )
47
+
48
+ if (SERDE_COMPAT) # the Java-compatibility deserialization test is compiled only when SERDE_COMPAT is set
49
+ target_sources(bloom_filter_test
50
+ PRIVATE
51
+ bloom_filter_deserialize_from_java_test.cpp
52
+ )
53
+ endif()
54
+
55
+ if (GENERATE) # the test that writes serialized filters to disk is compiled only when GENERATE is set
56
+ target_sources(bloom_filter_test
57
+ PRIVATE
58
+ bloom_filter_serialize_for_java.cpp
59
+ )
60
+ endif()
@@ -0,0 +1,107 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <algorithm>
22
+
23
+ #include "bit_array_ops.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("bit_array: basic operation", "[bit_array]") { // single-bit get/set/clear/assign on a zeroed 16-byte buffer
28
+ uint8_t* data = new uint8_t[16];
29
+ std::fill_n(data, 16, 0);
30
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 1) == false); // buffer is all zero, so no bit may be reported as already set
31
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 2) == false);
32
+ for (int i = 4; i < 64; i <<= 1) { // i = 4, 8, 16, 32 -> bit indices 68, 72, 80, 96
33
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 64 + i) == false);
34
+ }
35
+
36
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6); // bits 1 and 2 plus the four set in the loop
37
+ REQUIRE(bit_array_ops::get_bit(data, 68));
38
+
39
+ REQUIRE(bit_array_ops::get_bit(data, 5) == false);
40
+ bit_array_ops::set_bit(data, 5);
41
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 5)); // bit 5 was set on the previous line, so this is expected to be true
42
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 7);
43
+
44
+ bit_array_ops::clear_bit(data, 5);
45
+ REQUIRE(bit_array_ops::get_bit(data, 5) == false);
46
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6);
47
+
48
+ std::fill(data, data + 16, 0); // wipe the buffer before the assign_bit checks
49
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 0);
50
+
51
+ bit_array_ops::set_bit(data, 35);
52
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 35));
53
+ bit_array_ops::assign_bit(data, 35, false);
54
+ REQUIRE(bit_array_ops::get_bit(data, 35) == false);
55
+ bit_array_ops::assign_bit(data, 35, true);
56
+ REQUIRE(bit_array_ops::get_bit(data, 35));
57
+
58
+ delete [] data; // NOTE(review): not reached if a REQUIRE above fails, so the buffer leaks in that case
59
+ }
60
+
61
+ TEST_CASE("bit_array: inversion", "[bit_array]") { // invert() must flip every bit of a 1024-bit buffer
62
+ size_t num_bits = 1024;
63
+ uint8_t* data = new uint8_t[num_bits / 8];
64
+ std::fill_n(data, num_bits / 8, 0);
65
+ for (size_t i = 0; i < num_bits; i += num_bits / 8) { // step of 128 -> 8 bits set, starting with bit 0
66
+ bit_array_ops::get_and_set_bit(data, i);
67
+ }
68
+ REQUIRE(bit_array_ops::get_bit(data, 0));
69
+
70
+ size_t num_bits_set = bit_array_ops::count_num_bits_set(data, num_bits / 8);
71
+ bit_array_ops::invert(data, num_bits / 8);
72
+ REQUIRE(bit_array_ops::count_num_bits_set(data, num_bits / 8) == num_bits - num_bits_set); // set count must be the complement of the earlier count
73
+ REQUIRE(bit_array_ops::get_bit(data, 0) == false);
74
+
75
+ delete [] data;
76
+ }
77
+
78
+ TEST_CASE("bit_array: intersection and union", "[bit_array]") { // set operations on three 64-bit buffers with known overlaps
79
+ uint8_t* data1 = new uint8_t[8];
80
+ uint8_t* data2 = new uint8_t[8];
81
+ uint8_t* data3 = new uint8_t[8];
82
+ std::fill_n(data1, 8, 0);
83
+ std::fill_n(data2, 8, 0);
84
+ std::fill_n(data3, 8, 0);
85
+
86
+ size_t n = 10;
87
+ for (size_t i = 0; i < n; ++i) { // data1: bits 0..9, data2: bits 5..14, data3: even bits 0..18
88
+ bit_array_ops::get_and_set_bit(data1, i);
89
+ bit_array_ops::get_and_set_bit(data2, i + (n / 2));
90
+ bit_array_ops::get_and_set_bit(data3, 2 * i);
91
+ }
92
+ REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n);
93
+ REQUIRE(bit_array_ops::count_num_bits_set(data2, 8) == n);
94
+ REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == n);
95
+
96
+ bit_array_ops::intersect(data1, data2, 8); // afterwards data1 should hold only the overlap, bits 5..9
97
+ REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n / 2);
98
+
99
+ bit_array_ops::union_with(data3, data2, 8); // afterwards data3 should hold 10 even bits + 10 bits of data2 - 5 shared (6, 8, 10, 12, 14)
100
+ REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == 3 * n / 2);
101
+
102
+ delete [] data1;
103
+ delete [] data2;
104
+ delete [] data3;
105
+ }
106
+
107
+ } // namespace datasketches
@@ -0,0 +1,75 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <sstream>
21
+
22
+ #include <catch2/catch.hpp>
23
+
24
+ #include "bloom_filter.hpp"
25
+ #include "test_type.hpp"
26
+ #include "test_allocator.hpp"
27
+
28
+ namespace datasketches {
29
+
30
+ using bloom_filter_test_alloc = bloom_filter_alloc<test_allocator<test_type>>;
31
+ using alloc = test_allocator<test_type>;
32
+
33
+ TEST_CASE("bloom filter allocation test", "[bloom_filter][test_type]") { // checks that the test_allocator counters return to zero after the filters are destroyed
34
+ test_allocator_total_bytes = 0;
35
+ test_allocator_net_allocations = 0;
36
+ { // inner scope: every filter and byte buffer is destroyed before the counters are checked
37
+ int64_t num_items = 10000;
38
+ double fpp = 0.01;
39
+ uint64_t seed = bloom_filter_test_alloc::builder::generate_random_seed();
40
+ auto bf1 = bloom_filter_test_alloc::builder::create_by_accuracy(num_items,
41
+ fpp,
42
+ seed,
43
+ alloc(0));
44
+ for (int i = 0; i < num_items; ++i) {
45
+ if (i % 2 == 0) { // alternate on the loop index so both the string and the integer update actually run
46
+ bf1.update(std::to_string(i));
47
+ } else {
48
+ bf1.update(i);
49
+ }
50
+ }
51
+ auto bytes1 = bf1.serialize(0);
52
+ auto bf2 = bloom_filter_test_alloc::deserialize(bytes1.data(), bytes1.size(), 0);
53
+
54
+ std::stringstream ss; // same round trip again, this time through a stream
55
+ bf1.serialize(ss);
56
+ auto bf3 = bloom_filter_test_alloc::deserialize(ss, alloc(0));
57
+
58
+ bf3.reset();
59
+ for (int i = 0; i < num_items; ++i) {
60
+ bf1.update(-1.0 * i); // floating-point update
61
+ }
62
+
63
+ bf3.union_with(bf1);
64
+
65
+ auto bytes2 = bf3.serialize(0);
66
+ auto bf4 = bloom_filter_test_alloc::deserialize(bytes2.data(), bytes2.size(), 0);
67
+
68
+ auto bf5 = bloom_filter_test_alloc::wrap(bytes2.data(), bytes2.size(), 0); // both wrappers are built over bytes2's buffer
69
+ auto bf6 = bloom_filter_test_alloc::writable_wrap(bytes2.data(), bytes2.size(), 0);
70
+ }
71
+ REQUIRE(test_allocator_total_bytes == 0);
72
+ REQUIRE(test_allocator_net_allocations == 0);
73
+ }
74
+
75
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "bloom_filter.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
30
+
31
+ TEST_CASE("bloom_filter", "[serde_compat]") { // deserializes one Java-generated file per (n, num_hashes) pair and queries it
32
+ const uint64_t n_arr[] = {0, 10000, 2000000, 30000000};
33
+ const uint16_t h_arr[] = {3, 5};
34
+ for (const uint64_t n: n_arr) {
35
+ for (const uint16_t num_hashes: h_arr) {
36
+ std::ifstream is;
37
+ is.exceptions(std::ios::failbit | std::ios::badbit); // enabled before open() so a missing or unreadable file throws
38
+ is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_java.sk", std::ios::binary);
39
+ auto bf = bloom_filter::deserialize(is);
40
+ REQUIRE(bf.is_empty() == (n == 0)); // only the n == 0 file is expected to hold an empty filter
41
+ REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));
42
+
43
+ for (uint64_t i = 0; i < n / 10; ++i) { // presumably the Java side inserted the integers 0 .. n/10 - 1; verify against the generator
44
+ REQUIRE(bf.query(i));
45
+ }
46
+ if (n > 0) REQUIRE(bf.query(std::nan("1"))); // non-empty files are also expected to contain a NaN
47
+ }
48
+ }
49
+ }
50
+
51
+ } /* namespace datasketches */
@@ -0,0 +1,45 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <algorithm>
22
+ #include <fstream>
23
+
24
+ #include "bloom_filter.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("bloom filter generate", "[serialize_for_java]") { // writes one serialized filter file per (n, num_hashes) pair
29
+ const uint64_t n_arr[] = {0, 10000, 2000000, 30000000};
30
+ const uint16_t h_arr[] = {3, 5};
31
+ for (const uint64_t n: n_arr) {
32
+ for (const uint16_t num_hashes: h_arr) {
33
+ const uint64_t config_bits = std::max(n, static_cast<uint64_t>(1000)); // so empty still has valid bit size
34
+ bloom_filter bf = bloom_filter::builder::create_by_size(config_bits, num_hashes);
35
+ for (uint64_t i = 0; i < n / 10; ++i) bf.update(i); // note: n / 10 items into n bits
36
+ if (n > 0) bf.update(std::nan("1")); // include a NaN if non-empty
37
+ REQUIRE(bf.is_empty() == (n == 0));
38
+ REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));
39
+ std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk", std::ios::binary); // NOTE(review): path is relative to the current working directory and the open result is not checked
40
+ bf.serialize(os);
41
+ }
42
+ }
43
+ }
44
+
45
+ } /* namespace datasketches */