datasketches 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +17 -9
@@ -0,0 +1,60 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # single test executable for the filters module (bit_array_ops and bloom_filter)
19
+
20
+ # BLOOM FILTER
21
+ add_executable(bloom_filter_test) # declared without sources; they are attached below via target_sources
22
+
23
+ target_link_libraries(bloom_filter_test filters common_test_lib) # links the filters target and common_test_lib
24
+
25
+ set_target_properties(bloom_filter_test PROPERTIES
26
+ CXX_STANDARD_REQUIRED YES # NOTE(review): no CXX_STANDARD is set in this file; presumably inherited from a parent scope - confirm
27
+ )
28
+
29
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" FILTERS_TEST_BINARY_PATH) # this directory, as a CMake-style path
30
+ string(APPEND FILTERS_TEST_BINARY_PATH "/") # trailing slash so test code can append a relative path directly
31
+ target_compile_definitions(bloom_filter_test
32
+ PRIVATE
33
+ TEST_BINARY_INPUT_PATH="${FILTERS_TEST_BINARY_PATH}" # macro read by bloom_filter_deserialize_from_java_test.cpp
34
+ )
35
+
36
+ add_test( # registers the executable as a test named bloom_filter_test
37
+ NAME bloom_filter_test
38
+ COMMAND bloom_filter_test
39
+ )
40
+
41
+ target_sources(bloom_filter_test # sources that are always compiled into the test binary
42
+ PRIVATE
43
+ bit_array_ops_test.cpp
44
+ bloom_filter_test.cpp
45
+ bloom_filter_allocation_test.cpp
46
+ )
47
+
48
+ if (SERDE_COMPAT) # only when SERDE_COMPAT is truthy: add the test that reads Java-generated files
49
+ target_sources(bloom_filter_test
50
+ PRIVATE
51
+ bloom_filter_deserialize_from_java_test.cpp
52
+ )
53
+ endif()
54
+
55
+ if (GENERATE) # only when GENERATE is truthy: add the test that writes *_cpp.sk files
56
+ target_sources(bloom_filter_test
57
+ PRIVATE
58
+ bloom_filter_serialize_for_java.cpp
59
+ )
60
+ endif()
@@ -0,0 +1,107 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <algorithm>
22
+
23
+ #include "bit_array_ops.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("bit_array: basic operation", "[bit_array]") { // exercises get/set/clear/assign/count on a zeroed 16-byte buffer
28
+ uint8_t* data = new uint8_t[16]; // NOTE(review): not released if a REQUIRE below fails; an owning container would avoid that
29
+ std::fill_n(data, 16, 0);
30
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 1) == false); // asserted to report the bit's value from before the call
31
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 2) == false);
32
+ for (int i = 4; i < 64; i <<= 1) { // i = 4, 8, 16, 32 -> bit indices 68, 72, 80, 96
33
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 64 + i) == false);
34
+ }
35
+
36
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6); // 2 bits set above + 4 set in the loop
37
+ REQUIRE(bit_array_ops::get_bit(data, 68));
38
+
39
+ REQUIRE(bit_array_ops::get_bit(data, 5) == false);
40
+ bit_array_ops::set_bit(data, 5);
41
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 5)); // already set, so the previous value is true
42
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 7);
43
+
44
+ bit_array_ops::clear_bit(data, 5);
45
+ REQUIRE(bit_array_ops::get_bit(data, 5) == false);
46
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6);
47
+
48
+ std::fill(data, data + 16, 0); // wipe the buffer before the assign_bit checks
49
+ REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 0);
50
+
51
+ bit_array_ops::set_bit(data, 35);
52
+ REQUIRE(bit_array_ops::get_and_set_bit(data, 35));
53
+ bit_array_ops::assign_bit(data, 35, false); // assign_bit is checked in both directions
54
+ REQUIRE(bit_array_ops::get_bit(data, 35) == false);
55
+ bit_array_ops::assign_bit(data, 35, true);
56
+ REQUIRE(bit_array_ops::get_bit(data, 35));
57
+
58
+ delete [] data;
59
+ }
60
+
61
+ TEST_CASE("bit_array: inversion", "[bit_array]") { // checks that invert() complements the set-bit count of the buffer
62
+ size_t num_bits = 1024;
63
+ uint8_t* data = new uint8_t[num_bits / 8]; // NOTE(review): not released if a REQUIRE below fails
64
+ std::fill_n(data, num_bits / 8, 0);
65
+ for (size_t i = 0; i < num_bits; i += num_bits / 8) { // step of 128 -> sets bits 0, 128, ..., 896 (8 bits)
66
+ bit_array_ops::get_and_set_bit(data, i);
67
+ }
68
+ REQUIRE(bit_array_ops::get_bit(data, 0));
69
+
70
+ size_t num_bits_set = bit_array_ops::count_num_bits_set(data, num_bits / 8); // count before inversion
71
+ bit_array_ops::invert(data, num_bits / 8);
72
+ REQUIRE(bit_array_ops::count_num_bits_set(data, num_bits / 8) == num_bits - num_bits_set); // every bit must have flipped
73
+ REQUIRE(bit_array_ops::get_bit(data, 0) == false);
74
+
75
+ delete [] data;
76
+ }
77
+
78
+ TEST_CASE("bit_array: intersection and union", "[bit_array]") { // checks intersect() and union_with() on three 8-byte buffers
79
+ uint8_t* data1 = new uint8_t[8]; // NOTE(review): these three buffers are not released if a REQUIRE below fails
80
+ uint8_t* data2 = new uint8_t[8];
81
+ uint8_t* data3 = new uint8_t[8];
82
+ std::fill_n(data1, 8, 0);
83
+ std::fill_n(data2, 8, 0);
84
+ std::fill_n(data3, 8, 0);
85
+
86
+ size_t n = 10;
87
+ for (size_t i = 0; i < n; ++i) {
88
+ bit_array_ops::get_and_set_bit(data1, i); // data1: bits 0..9
89
+ bit_array_ops::get_and_set_bit(data2, i + (n / 2)); // data2: bits 5..14
90
+ bit_array_ops::get_and_set_bit(data3, 2 * i); // data3: even bits 0..18
91
+ }
92
+ REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n);
93
+ REQUIRE(bit_array_ops::count_num_bits_set(data2, 8) == n);
94
+ REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == n);
95
+
96
+ bit_array_ops::intersect(data1, data2, 8); // result is checked in data1 (first argument)
97
+ REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n / 2); // overlap of 0..9 and 5..14 is bits 5..9
98
+
99
+ bit_array_ops::union_with(data3, data2, 8); // result is checked in data3 (first argument)
100
+ REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == 3 * n / 2); // 10 + 10 - 5 shared even bits (6, 8, 10, 12, 14)
101
+
102
+ delete [] data1;
103
+ delete [] data2;
104
+ delete [] data3;
105
+ }
106
+
107
+ } // namespace datasketches
@@ -0,0 +1,75 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <sstream>
21
+
22
+ #include <catch2/catch.hpp>
23
+
24
+ #include "bloom_filter.hpp"
25
+ #include "test_type.hpp"
26
+ #include "test_allocator.hpp"
27
+
28
+ namespace datasketches {
29
+
30
+ using bloom_filter_test_alloc = bloom_filter_alloc<test_allocator<test_type>>; // bloom filter instantiated with the test allocator
31
+ using alloc = test_allocator<test_type>; // shorthand for the allocator instances passed to the filter below
32
+
33
+ TEST_CASE("bloom filter allocation test", "[bloom_filter][test_type]") { // verifies the test allocator's counters return to zero after filters are destroyed
34
+ test_allocator_total_bytes = 0; // reset both counters so only this test's allocations are measured
35
+ test_allocator_net_allocations = 0;
36
+ { // inner scope: everything created here is destroyed before the final REQUIREs
37
+ int64_t num_items = 10000;
38
+ double fpp = 0.01;
39
+ uint64_t seed = bloom_filter_test_alloc::builder::generate_random_seed();
40
+ auto bf1 = bloom_filter_test_alloc::builder::create_by_accuracy(num_items,
41
+ fpp,
42
+ seed,
43
+ alloc(0));
44
+ for (int i = 0; i < num_items; ++i) {
45
+ if (i % 2 == 0) { // alternate on the loop index; the former `num_items % 1 == 0` was always true, leaving the else branch dead
46
+ bf1.update(std::to_string(i)); // even i: string update
47
+ } else {
48
+ bf1.update(i); // odd i: integer update
49
+ }
50
+ }
51
+ auto bytes1 = bf1.serialize(0); // byte-buffer round trip
52
+ auto bf2 = bloom_filter_test_alloc::deserialize(bytes1.data(), bytes1.size(), 0);
53
+
54
+ std::stringstream ss; // stream round trip
55
+ bf1.serialize(ss);
56
+ auto bf3 = bloom_filter_test_alloc::deserialize(ss, alloc(0));
57
+
58
+ bf3.reset();
59
+ for (int i = 0; i < num_items; ++i) {
60
+ bf1.update(-1.0 * i); // floating-point updates
61
+ }
62
+
63
+ bf3.union_with(bf1);
64
+
65
+ auto bytes2 = bf3.serialize(0);
66
+ auto bf4 = bloom_filter_test_alloc::deserialize(bytes2.data(), bytes2.size(), 0);
67
+
68
+ auto bf5 = bloom_filter_test_alloc::wrap(bytes2.data(), bytes2.size(), 0); // wrapped filters refer to bytes2, which lives until the end of this scope
69
+ auto bf6 = bloom_filter_test_alloc::writable_wrap(bytes2.data(), bytes2.size(), 0);
70
+ }
71
+ REQUIRE(test_allocator_total_bytes == 0); // nothing may remain allocated once the scope has closed
72
+ REQUIRE(test_allocator_net_allocations == 0);
73
+ }
74
+
75
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "bloom_filter.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; // TEST_BINARY_INPUT_PATH is a macro expected from the build; "../../java/" is appended to it
30
+
31
+ TEST_CASE("bloom_filter", "[serde_compat]") { // reads bf_n<N>_h<H>_java.sk files and checks their contents
32
+ const uint64_t n_arr[] = {0, 10000, 2000000, 30000000}; // size parameter used in the file name and in the checks below
33
+ const uint16_t h_arr[] = {3, 5}; // hash counts used in the file name
34
+ for (const uint64_t n: n_arr) {
35
+ for (const uint16_t num_hashes: h_arr) {
36
+ std::ifstream is;
37
+ is.exceptions(std::ios::failbit | std::ios::badbit); // set before open() so a missing file throws instead of failing silently
38
+ is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_java.sk", std::ios::binary);
39
+ auto bf = bloom_filter::deserialize(is);
40
+ REQUIRE(bf.is_empty() == (n == 0)); // only the n == 0 files are expected to be empty
41
+ REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));
42
+
43
+ for (uint64_t i = 0; i < n / 10; ++i) { // values 0 .. n/10 - 1 are expected to be present
44
+ REQUIRE(bf.query(i));
45
+ }
46
+ if (n > 0) REQUIRE(bf.query(std::nan("1"))); // non-empty files are expected to contain a NaN; NOTE(review): std::nan needs <cmath>, which is not included directly here
47
+ }
48
+ }
49
+ }
50
+
51
+ } /* namespace datasketches */
@@ -0,0 +1,45 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <algorithm>
22
+ #include <fstream>
23
+
24
+ #include "bloom_filter.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("bloom filter generate", "[serialize_for_java]") { // writes bf_n<N>_h<H>_cpp.sk files, one per (n, num_hashes) pair
29
+ const uint64_t n_arr[] = {0, 10000, 2000000, 30000000};
30
+ const uint16_t h_arr[] = {3, 5};
31
+ for (const uint64_t n: n_arr) {
32
+ for (const uint16_t num_hashes: h_arr) {
33
+ const uint64_t config_bits = std::max(n, static_cast<uint64_t>(1000)); // so empty still has valid bit size
34
+ bloom_filter bf = bloom_filter::builder::create_by_size(config_bits, num_hashes);
35
+ for (uint64_t i = 0; i < n / 10; ++i) bf.update(i); // note: n / 10 items into n bits
36
+ if (n > 0) bf.update(std::nan("1")); // include a NaN if non-empty
37
+ REQUIRE(bf.is_empty() == (n == 0));
38
+ REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));
39
+ std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk", std::ios::binary); // relative file name; NOTE(review): open failure is never checked, so a failed write would go unnoticed
40
+ bf.serialize(os);
41
+ }
42
+ }
43
+ }
44
+
45
+ } /* namespace datasketches */