datasketches 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +17 -9
@@ -0,0 +1,132 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BLOOM_FILTER_BUILDER_IMPL_HPP_
21
+ #define _BLOOM_FILTER_BUILDER_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <memory>
25
+ #include <vector>
26
+
27
+ #include "common_defs.hpp"
28
+ #include "xxhash64.h"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename A>
33
+ uint64_t bloom_filter_alloc<A>::builder::generate_random_seed() {
34
+ return random_utils::next_uint64(random_utils::rand);
35
+ }
36
+
37
+ template<typename A>
38
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(uint64_t max_distinct_items,
39
+ uint64_t num_filter_bits) {
40
+ if (max_distinct_items == 0) {
41
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
42
+ }
43
+ if (num_filter_bits == 0) {
44
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
45
+ } else if (num_filter_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
46
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
47
+ }
48
+ return static_cast<uint16_t>(std::ceil(static_cast<double>(num_filter_bits) / max_distinct_items * log(2.0)));
49
+ }
50
+
51
+ template<typename A>
52
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(double target_false_positive_prob) {
53
+ validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value
54
+ return static_cast<uint16_t>(std::ceil(-log(target_false_positive_prob) / log(2.0)));
55
+ }
56
+
57
+ template<typename A>
58
+ uint64_t bloom_filter_alloc<A>::builder::suggest_num_filter_bits(uint64_t max_distinct_items,
59
+ double target_false_positive_prob) {
60
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
61
+ return static_cast<uint64_t>(std::ceil(-static_cast<double>(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0))));
62
+ }
63
+
64
+ template<typename A>
65
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_accuracy(uint64_t max_distinct_items,
66
+ double target_false_positive_prob,
67
+ uint64_t seed,
68
+ const A& allocator) {
69
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
70
+ const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
71
+ const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
72
+ return bloom_filter_alloc<A>(num_filter_bits, num_hashes, seed, allocator);
73
+ }
74
+
75
+ template<typename A>
76
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_size(uint64_t num_bits,
77
+ uint16_t num_hashes,
78
+ uint64_t seed,
79
+ const A& allocator) {
80
+ validate_size_inputs(num_bits, num_hashes);
81
+ return bloom_filter_alloc<A>(num_bits, num_hashes, seed, allocator);
82
+ }
83
+
84
+ template<typename A>
85
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_accuracy(void* memory,
86
+ size_t length_bytes,
87
+ uint64_t max_distinct_items,
88
+ double target_false_positive_prob,
89
+ uint64_t seed,
90
+ const A& allocator) {
91
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
92
+ const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
93
+ const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
94
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator);
95
+ }
96
+
97
+ template<typename A>
98
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_size(void* memory,
99
+ size_t length_bytes,
100
+ uint64_t num_bits,
101
+ uint16_t num_hashes,
102
+ uint64_t seed,
103
+ const A& allocator) {
104
+ validate_size_inputs(num_bits, num_hashes);
105
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_bits, num_hashes, seed, allocator);
106
+ }
107
+
108
+ template<typename A>
109
+ void bloom_filter_alloc<A>::builder::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) {
110
+ if (num_bits == 0) {
111
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
112
+ } else if (num_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
113
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
114
+ }
115
+ if (num_hashes == 0) {
116
+ throw std::invalid_argument("number of hashes for the filter must be strictly positive");
117
+ }
118
+ }
119
+
120
+ template<typename A>
121
+ void bloom_filter_alloc<A>::builder::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) {
122
+ if (max_distinct_items == 0) {
123
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
124
+ }
125
+ if (target_false_positive_prob <= 0.0 || target_false_positive_prob > 1.0) {
126
+ throw std::invalid_argument("target false positive probability must be a valid probability strictly greater than 0.0");
127
+ }
128
+ }
129
+
130
+ } // namespace datasketches
131
+
132
+ #endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_