datasketches 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +17 -9
@@ -0,0 +1,60 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
# single test executable covering the filters module (bit array operations and Bloom filter)
|
19
|
+
|
20
|
+
# BLOOM FILTER
# The target is declared without sources; they are attached below with target_sources().
add_executable(bloom_filter_test)

target_link_libraries(bloom_filter_test filters common_test_lib)

set_property(TARGET bloom_filter_test PROPERTY CXX_STANDARD_REQUIRED YES)

# Hand this source directory (CMake path form, with a trailing slash) to the
# test code as the TEST_BINARY_INPUT_PATH preprocessor definition.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" FILTERS_TEST_BINARY_PATH)
set(FILTERS_TEST_BINARY_PATH "${FILTERS_TEST_BINARY_PATH}/")
target_compile_definitions(bloom_filter_test PRIVATE TEST_BINARY_INPUT_PATH="${FILTERS_TEST_BINARY_PATH}")

add_test(NAME bloom_filter_test COMMAND bloom_filter_test)

# Sources that are always part of the test executable.
target_sources(bloom_filter_test PRIVATE
  bit_array_ops_test.cpp
  bloom_filter_test.cpp
  bloom_filter_allocation_test.cpp
)

# Added only when SERDE_COMPAT is set.
if (SERDE_COMPAT)
  target_sources(bloom_filter_test PRIVATE bloom_filter_deserialize_from_java_test.cpp)
endif()

# Added only when GENERATE is set.
if (GENERATE)
  target_sources(bloom_filter_test PRIVATE bloom_filter_serialize_for_java.cpp)
endif()
|
@@ -0,0 +1,107 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <algorithm>
|
22
|
+
|
23
|
+
#include "bit_array_ops.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// Exercises the single-bit primitives of bit_array_ops (get, set, clear,
// assign, get-and-set) and the set-bit count over a 16-byte buffer.
TEST_CASE("bit_array: basic operation", "[bit_array]") {
  // Zero-initialized automatic storage instead of new[]/delete[]: a failing
  // REQUIRE aborts the test case, which would skip a trailing delete[] and
  // leak a heap buffer.
  constexpr size_t num_bytes = 16;
  uint8_t data[num_bytes] = {};

  REQUIRE(bit_array_ops::get_and_set_bit(data, 1) == false);
  REQUIRE(bit_array_ops::get_and_set_bit(data, 2) == false);
  // i takes the values 4, 8, 16, 32, so bits 68, 72, 80 and 96 are touched
  for (int i = 4; i < 64; i <<= 1) {
    REQUIRE(bit_array_ops::get_and_set_bit(data, 64 + i) == false);
  }

  // two bits from the direct calls plus four from the loop
  REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6);
  REQUIRE(bit_array_ops::get_bit(data, 68));

  REQUIRE(bit_array_ops::get_bit(data, 5) == false);
  bit_array_ops::set_bit(data, 5);
  REQUIRE(bit_array_ops::get_and_set_bit(data, 5));
  REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 7);

  bit_array_ops::clear_bit(data, 5);
  REQUIRE(bit_array_ops::get_bit(data, 5) == false);
  REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6);

  // wipe the buffer before the assign_bit checks
  std::fill(data, data + num_bytes, 0);
  REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 0);

  bit_array_ops::set_bit(data, 35);
  REQUIRE(bit_array_ops::get_and_set_bit(data, 35));
  bit_array_ops::assign_bit(data, 35, false);
  REQUIRE(bit_array_ops::get_bit(data, 35) == false);
  bit_array_ops::assign_bit(data, 35, true);
  REQUIRE(bit_array_ops::get_bit(data, 35));
}
|
60
|
+
|
61
|
+
// Sets a few bits in a 1024-bit buffer, inverts the whole buffer and checks
// that the set-bit count becomes the complement of what it was.
TEST_CASE("bit_array: inversion", "[bit_array]") {
  constexpr size_t num_bits = 1024;
  // Zero-initialized automatic storage instead of new[]/delete[]: a failing
  // REQUIRE aborts the test case, which would skip a trailing delete[] and
  // leak a heap buffer.
  uint8_t data[num_bits / 8] = {};

  // touch every (num_bits / 8)-th bit, starting with bit 0
  for (size_t i = 0; i < num_bits; i += num_bits / 8) {
    bit_array_ops::get_and_set_bit(data, i);
  }
  REQUIRE(bit_array_ops::get_bit(data, 0));

  size_t num_bits_set = bit_array_ops::count_num_bits_set(data, num_bits / 8);
  bit_array_ops::invert(data, num_bits / 8);
  REQUIRE(bit_array_ops::count_num_bits_set(data, num_bits / 8) == num_bits - num_bits_set);
  REQUIRE(bit_array_ops::get_bit(data, 0) == false);
}
|
77
|
+
|
78
|
+
// Builds three 64-bit buffers with known bit patterns and checks the set-bit
// counts after intersecting and after unioning them.
TEST_CASE("bit_array: intersection and union", "[bit_array]") {
  // Zero-initialized automatic storage instead of new[]/delete[]: a failing
  // REQUIRE aborts the test case, which would skip the trailing delete[]
  // calls and leak all three heap buffers.
  uint8_t data1[8] = {};
  uint8_t data2[8] = {};
  uint8_t data3[8] = {};

  size_t n = 10;
  for (size_t i = 0; i < n; ++i) {
    bit_array_ops::get_and_set_bit(data1, i);            // bits 0 .. n-1
    bit_array_ops::get_and_set_bit(data2, i + (n / 2));  // bits n/2 .. 3n/2-1
    bit_array_ops::get_and_set_bit(data3, 2 * i);        // even bits 0 .. 2n-2
  }
  REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n);
  REQUIRE(bit_array_ops::count_num_bits_set(data2, 8) == n);
  REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == n);

  // data1 and data2 overlap on bits n/2 .. n-1
  bit_array_ops::intersect(data1, data2, 8);
  REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n / 2);

  // data2 contributes its n/2 odd-numbered bits to data3
  bit_array_ops::union_with(data3, data2, 8);
  REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == 3 * n / 2);
}
|
106
|
+
|
107
|
+
} // namespace datasketches
|
@@ -0,0 +1,75 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <sstream>
|
21
|
+
|
22
|
+
#include <catch2/catch.hpp>
|
23
|
+
|
24
|
+
#include "bloom_filter.hpp"
|
25
|
+
#include "test_type.hpp"
|
26
|
+
#include "test_allocator.hpp"
|
27
|
+
|
28
|
+
namespace datasketches {
|
29
|
+
|
30
|
+
using bloom_filter_test_alloc = bloom_filter_alloc<test_allocator<test_type>>;
|
31
|
+
using alloc = test_allocator<test_type>;
|
32
|
+
|
33
|
+
TEST_CASE("bloom filter allocation test", "[bloom_filter][test_type]") {
|
34
|
+
test_allocator_total_bytes = 0;
|
35
|
+
test_allocator_net_allocations = 0;
|
36
|
+
{
|
37
|
+
int64_t num_items = 10000;
|
38
|
+
double fpp = 0.01;
|
39
|
+
uint64_t seed = bloom_filter_test_alloc::builder::generate_random_seed();
|
40
|
+
auto bf1 = bloom_filter_test_alloc::builder::create_by_accuracy(num_items,
|
41
|
+
fpp,
|
42
|
+
seed,
|
43
|
+
alloc(0));
|
44
|
+
for (int i = 0; i < num_items; ++i) {
|
45
|
+
if (num_items % 1 == 0) {
|
46
|
+
bf1.update(std::to_string(i));
|
47
|
+
} else {
|
48
|
+
bf1.update(i);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
auto bytes1 = bf1.serialize(0);
|
52
|
+
auto bf2 = bloom_filter_test_alloc::deserialize(bytes1.data(), bytes1.size(), 0);
|
53
|
+
|
54
|
+
std::stringstream ss;
|
55
|
+
bf1.serialize(ss);
|
56
|
+
auto bf3 = bloom_filter_test_alloc::deserialize(ss, alloc(0));
|
57
|
+
|
58
|
+
bf3.reset();
|
59
|
+
for (int i = 0; i < num_items; ++i) {
|
60
|
+
bf1.update(-1.0 * i);
|
61
|
+
}
|
62
|
+
|
63
|
+
bf3.union_with(bf1);
|
64
|
+
|
65
|
+
auto bytes2 = bf3.serialize(0);
|
66
|
+
auto bf4 = bloom_filter_test_alloc::deserialize(bytes2.data(), bytes2.size(), 0);
|
67
|
+
|
68
|
+
auto bf5 = bloom_filter_test_alloc::wrap(bytes2.data(), bytes2.size(), 0);
|
69
|
+
auto bf6 = bloom_filter_test_alloc::writable_wrap(bytes2.data(), bytes2.size(), 0);
|
70
|
+
}
|
71
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
72
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
73
|
+
}
|
74
|
+
|
75
|
+
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
|
23
|
+
#include "bloom_filter.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
28
|
+
// in the subdirectory called "java" in the root directory of this project
|
29
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
30
|
+
|
31
|
+
TEST_CASE("bloom_filter", "[serde_compat]") {
|
32
|
+
const uint64_t n_arr[] = {0, 10000, 2000000, 30000000};
|
33
|
+
const uint16_t h_arr[] = {3, 5};
|
34
|
+
for (const uint64_t n: n_arr) {
|
35
|
+
for (const uint16_t num_hashes: h_arr) {
|
36
|
+
std::ifstream is;
|
37
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
38
|
+
is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_java.sk", std::ios::binary);
|
39
|
+
auto bf = bloom_filter::deserialize(is);
|
40
|
+
REQUIRE(bf.is_empty() == (n == 0));
|
41
|
+
REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));
|
42
|
+
|
43
|
+
for (uint64_t i = 0; i < n / 10; ++i) {
|
44
|
+
REQUIRE(bf.query(i));
|
45
|
+
}
|
46
|
+
if (n > 0) REQUIRE(bf.query(std::nan("1")));
|
47
|
+
}
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,45 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <algorithm>
|
22
|
+
#include <fstream>
|
23
|
+
|
24
|
+
#include "bloom_filter.hpp"
|
25
|
+
|
26
|
+
namespace datasketches {
|
27
|
+
|
28
|
+
// For every combination of n and number of hashes, builds a filter, feeds it
// the values 0 .. n/10 - 1 (plus a NaN when n > 0) and writes it to a
// "_cpp.sk" file named after the combination.
TEST_CASE("bloom filter generate", "[serialize_for_java]") {
  const uint64_t item_counts[] = {0, 10000, 2000000, 30000000};
  const uint16_t hash_counts[] = {3, 5};
  for (const uint64_t n: item_counts) {
    for (const uint16_t num_hashes: hash_counts) {
      // never configure fewer than 1000 bits, so the empty case still has a valid bit size
      const uint64_t config_bits = std::max<uint64_t>(n, 1000);
      bloom_filter bf = bloom_filter::builder::create_by_size(config_bits, num_hashes);

      // note: n / 10 items into n bits
      uint64_t i = 0;
      while (i < n / 10) {
        bf.update(i);
        ++i;
      }
      // include a NaN if non-empty
      if (n > 0) {
        bf.update(std::nan("1"));
      }

      REQUIRE(bf.is_empty() == (n == 0));
      REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10)));

      const std::string file_name = "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk";
      std::ofstream os(file_name, std::ios::binary);
      bf.serialize(os);
    }
  }
}
|
44
|
+
|
45
|
+
} /* namespace datasketches */
|