datasketches 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +18 -10
@@ -0,0 +1,132 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _BLOOM_FILTER_BUILDER_IMPL_HPP_
|
21
|
+
#define _BLOOM_FILTER_BUILDER_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <cmath>
|
24
|
+
#include <memory>
|
25
|
+
#include <vector>
|
26
|
+
|
27
|
+
#include "common_defs.hpp"
|
28
|
+
#include "xxhash64.h"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
template<typename A>
|
33
|
+
uint64_t bloom_filter_alloc<A>::builder::generate_random_seed() {
|
34
|
+
return random_utils::next_uint64(random_utils::rand);
|
35
|
+
}
|
36
|
+
|
37
|
+
template<typename A>
|
38
|
+
uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(uint64_t max_distinct_items,
|
39
|
+
uint64_t num_filter_bits) {
|
40
|
+
if (max_distinct_items == 0) {
|
41
|
+
throw std::invalid_argument("maximum number of distinct items must be strictly positive");
|
42
|
+
}
|
43
|
+
if (num_filter_bits == 0) {
|
44
|
+
throw std::invalid_argument("number of bits in the filter must be strictly positive");
|
45
|
+
} else if (num_filter_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
|
46
|
+
throw std::invalid_argument("number of bits in the filter must be less than 2^63");
|
47
|
+
}
|
48
|
+
return static_cast<uint16_t>(std::ceil(static_cast<double>(num_filter_bits) / max_distinct_items * log(2.0)));
|
49
|
+
}
|
50
|
+
|
51
|
+
template<typename A>
|
52
|
+
uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(double target_false_positive_prob) {
|
53
|
+
validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value
|
54
|
+
return static_cast<uint16_t>(std::ceil(-log(target_false_positive_prob) / log(2.0)));
|
55
|
+
}
|
56
|
+
|
57
|
+
template<typename A>
|
58
|
+
uint64_t bloom_filter_alloc<A>::builder::suggest_num_filter_bits(uint64_t max_distinct_items,
|
59
|
+
double target_false_positive_prob) {
|
60
|
+
validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
|
61
|
+
return static_cast<uint64_t>(std::ceil(-static_cast<double>(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0))));
|
62
|
+
}
|
63
|
+
|
64
|
+
template<typename A>
|
65
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_accuracy(uint64_t max_distinct_items,
|
66
|
+
double target_false_positive_prob,
|
67
|
+
uint64_t seed,
|
68
|
+
const A& allocator) {
|
69
|
+
validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
|
70
|
+
const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
|
71
|
+
const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
|
72
|
+
return bloom_filter_alloc<A>(num_filter_bits, num_hashes, seed, allocator);
|
73
|
+
}
|
74
|
+
|
75
|
+
template<typename A>
|
76
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_size(uint64_t num_bits,
|
77
|
+
uint16_t num_hashes,
|
78
|
+
uint64_t seed,
|
79
|
+
const A& allocator) {
|
80
|
+
validate_size_inputs(num_bits, num_hashes);
|
81
|
+
return bloom_filter_alloc<A>(num_bits, num_hashes, seed, allocator);
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename A>
|
85
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_accuracy(void* memory,
|
86
|
+
size_t length_bytes,
|
87
|
+
uint64_t max_distinct_items,
|
88
|
+
double target_false_positive_prob,
|
89
|
+
uint64_t seed,
|
90
|
+
const A& allocator) {
|
91
|
+
validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
|
92
|
+
const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
|
93
|
+
const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
|
94
|
+
return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator);
|
95
|
+
}
|
96
|
+
|
97
|
+
template<typename A>
|
98
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_size(void* memory,
|
99
|
+
size_t length_bytes,
|
100
|
+
uint64_t num_bits,
|
101
|
+
uint16_t num_hashes,
|
102
|
+
uint64_t seed,
|
103
|
+
const A& allocator) {
|
104
|
+
validate_size_inputs(num_bits, num_hashes);
|
105
|
+
return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_bits, num_hashes, seed, allocator);
|
106
|
+
}
|
107
|
+
|
108
|
+
template<typename A>
|
109
|
+
void bloom_filter_alloc<A>::builder::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) {
|
110
|
+
if (num_bits == 0) {
|
111
|
+
throw std::invalid_argument("number of bits in the filter must be strictly positive");
|
112
|
+
} else if (num_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
|
113
|
+
throw std::invalid_argument("number of bits in the filter must be less than 2^63");
|
114
|
+
}
|
115
|
+
if (num_hashes == 0) {
|
116
|
+
throw std::invalid_argument("number of hashes for the filter must be strictly positive");
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
template<typename A>
|
121
|
+
void bloom_filter_alloc<A>::builder::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) {
|
122
|
+
if (max_distinct_items == 0) {
|
123
|
+
throw std::invalid_argument("maximum number of distinct items must be strictly positive");
|
124
|
+
}
|
125
|
+
if (target_false_positive_prob <= 0.0 || target_false_positive_prob > 1.0) {
|
126
|
+
throw std::invalid_argument("target false positive probability must be a valid probability strictly greater than 0.0");
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
} // namespace datasketches
|
131
|
+
|
132
|
+
#endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_
|