datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +18 -10
@@ -0,0 +1,132 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BLOOM_FILTER_BUILDER_IMPL_HPP_
21
+ #define _BLOOM_FILTER_BUILDER_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <memory>
25
+ #include <vector>
26
+
27
+ #include "common_defs.hpp"
28
+ #include "xxhash64.h"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename A> // Returns a 64-bit value obtained from the shared random utilities; per its name, intended for use as a filter hash seed.
33
+ uint64_t bloom_filter_alloc<A>::builder::generate_random_seed() { // takes no arguments
34
+ return random_utils::next_uint64(random_utils::rand); // NOTE(review): random_utils is declared outside this file (presumably common_defs.hpp) - confirm its seeding and thread-safety there
35
+ }
36
+
37
+ template<typename A> // Suggests a hash-function count for a filter of num_filter_bits bits that is expected to hold max_distinct_items distinct items.
38
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(uint64_t max_distinct_items, // expected number of distinct items; 0 throws std::invalid_argument
39
+ uint64_t num_filter_bits) { // filter size in bits; 0 or a value above MAX_FILTER_SIZE_BITS throws std::invalid_argument
40
+ if (max_distinct_items == 0) {
41
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
42
+ }
43
+ if (num_filter_bits == 0) {
44
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
45
+ } else if (num_filter_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) { // NOTE(review): the message below cites 2^63 while the bound is MAX_FILTER_SIZE_BITS (defined outside this file) - confirm they agree
46
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
47
+ }
48
+ return static_cast<uint16_t>(std::fmin(std::ceil(static_cast<double>(num_filter_bits) / max_distinct_items * std::log(2.0)), static_cast<double>(UINT16_MAX))); // ceil(bits / items * ln 2), clamped to UINT16_MAX first because converting a larger double to uint16_t is undefined behavior
49
+ }
50
+
51
+ template<typename A> // Suggests a hash-function count from a target false positive probability alone.
52
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(double target_false_positive_prob) { // probabilities <= 0.0 or > 1.0 make the validator throw std::invalid_argument
53
+ validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value
54
+ return static_cast<uint16_t>(std::ceil(-std::log(target_false_positive_prob) / std::log(2.0))); // ceil(-log2(p)), so p == 1.0 yields 0; std::log is qualified because <cmath> only guarantees the std:: declarations
55
+ }
56
+
57
+ template<typename A> // Suggests a filter size in bits for an expected item count and a target false positive probability.
58
+ uint64_t bloom_filter_alloc<A>::builder::suggest_num_filter_bits(uint64_t max_distinct_items, // expected number of distinct items; 0 is rejected by the validator
59
+ double target_false_positive_prob) { // probabilities <= 0.0 or > 1.0 are rejected by the validator
60
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); // throws std::invalid_argument on bad inputs
61
+ return static_cast<uint64_t>(std::ceil(-static_cast<double>(max_distinct_items) * std::log(target_false_positive_prob) / (std::log(2.0) * std::log(2.0)))); // ceil(-n * ln(p) / (ln 2)^2); std::log is qualified because <cmath> only guarantees the std:: declarations
62
+ }
63
+
64
+ template<typename A> // Creates a filter whose bit count and hash count are derived from the expected item count and target false positive probability.
65
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_accuracy(uint64_t max_distinct_items, // expected number of distinct items
66
+ double target_false_positive_prob, // desired false positive probability
67
+ uint64_t seed, // handed to the filter constructor as-is
68
+ const A& allocator) { // handed to the filter constructor as-is
69
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); // throws std::invalid_argument before any sizing is attempted
70
+ const uint16_t hash_count = suggest_num_hashes(target_false_positive_prob);
71
+ const uint64_t bit_count = suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
72
+ return bloom_filter_alloc<A>(bit_count, hash_count, seed, allocator);
73
+ }
74
+
75
+ template<typename A> // Creates a filter from an explicitly chosen bit count and hash count.
76
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_size(uint64_t num_bits, // filter size in bits; checked by validate_size_inputs
77
+ uint16_t num_hashes, // number of hash functions; checked by validate_size_inputs
78
+ uint64_t seed, // forwarded unchanged to the filter constructor
79
+ const A& allocator) { // forwarded unchanged to the filter constructor
80
+ validate_size_inputs(num_bits, num_hashes); // throws std::invalid_argument when either size parameter is out of range
81
+ return bloom_filter_alloc<A>(num_bits, num_hashes, seed, allocator);
82
+ }
83
+
84
+ template<typename A> // Constructs a filter over a caller-supplied buffer, sizing it from the expected item count and target false positive probability.
85
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_accuracy(void* memory, // caller's buffer; passed on as uint8_t*
86
+ size_t length_bytes, // size of that buffer in bytes
87
+ uint64_t max_distinct_items, // expected number of distinct items
88
+ double target_false_positive_prob, // desired false positive probability
89
+ uint64_t seed, // handed to the filter constructor as-is
90
+ const A& allocator) { // handed to the filter constructor as-is
91
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); // throws std::invalid_argument before any sizing is attempted
92
+ const uint16_t hash_count = suggest_num_hashes(target_false_positive_prob);
93
+ const uint64_t bit_count = suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
94
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, bit_count, hash_count, seed, allocator); // NOTE(review): how the constructor uses the buffer and checks length_bytes is not visible in this file
95
+ }
96
+
97
+ template<typename A> // Constructs a filter over a caller-supplied buffer from an explicitly chosen bit count and hash count.
98
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_size(void* memory, // caller's buffer; cast to uint8_t* before being passed on
99
+ size_t length_bytes, // size of that buffer in bytes
100
+ uint64_t num_bits, // filter size in bits; checked by validate_size_inputs
101
+ uint16_t num_hashes, // number of hash functions; checked by validate_size_inputs
102
+ uint64_t seed, // forwarded unchanged to the filter constructor
103
+ const A& allocator) { // forwarded unchanged to the filter constructor
104
+ validate_size_inputs(num_bits, num_hashes); // throws std::invalid_argument when either size parameter is out of range
105
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_bits, num_hashes, seed, allocator); // NOTE(review): how the constructor uses the buffer and checks length_bytes is not visible in this file
106
+ }
107
+
108
+ template<typename A> // Shared argument check for the size-based factories; returns normally only when both parameters are acceptable.
109
+ void bloom_filter_alloc<A>::builder::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) { // throws std::invalid_argument on the first failed check
110
+ if (num_bits == 0) {
111
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
112
+ } else if (num_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) { // NOTE(review): the message below cites 2^63 while the bound is MAX_FILTER_SIZE_BITS (defined outside this file) - confirm they agree
113
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
114
+ }
115
+ if (num_hashes == 0) { // the bit-count checks run first, so a bad num_bits is reported even when num_hashes is also 0
116
+ throw std::invalid_argument("number of hashes for the filter must be strictly positive");
117
+ }
118
+ }
119
+
120
+ template<typename A> // Shared argument check for the accuracy-based helpers; returns normally only when both parameters are acceptable.
121
+ void bloom_filter_alloc<A>::builder::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) { // throws std::invalid_argument on the first failed check
122
+ if (max_distinct_items == 0) {
123
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
124
+ }
125
+ if (!(target_false_positive_prob > 0.0 && target_false_positive_prob <= 1.0)) { // accepts only (0.0, 1.0]; written as a negated range test so that NaN, which fails every ordered comparison, is rejected as well
126
+ throw std::invalid_argument("target false positive probability must be a valid probability strictly greater than 0.0");
127
+ }
128
+ }
129
+
130
+ } // namespace datasketches
131
+
132
+ #endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_