datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,132 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BLOOM_FILTER_BUILDER_IMPL_HPP_
21
+ #define _BLOOM_FILTER_BUILDER_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <memory>
25
+ #include <vector>
26
+
27
+ #include "common_defs.hpp"
28
+ #include "xxhash64.h"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename A>
33
+ uint64_t bloom_filter_alloc<A>::builder::generate_random_seed() {
34
+ return random_utils::next_uint64(random_utils::rand);
35
+ }
36
+
37
+ template<typename A>
38
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(uint64_t max_distinct_items,
39
+ uint64_t num_filter_bits) {
40
+ if (max_distinct_items == 0) {
41
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
42
+ }
43
+ if (num_filter_bits == 0) {
44
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
45
+ } else if (num_filter_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
46
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
47
+ }
48
+ return static_cast<uint16_t>(std::ceil(static_cast<double>(num_filter_bits) / max_distinct_items * log(2.0)));
49
+ }
50
+
51
+ template<typename A>
52
+ uint16_t bloom_filter_alloc<A>::builder::suggest_num_hashes(double target_false_positive_prob) {
53
+ validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value
54
+ return static_cast<uint16_t>(std::ceil(-log(target_false_positive_prob) / log(2.0)));
55
+ }
56
+
57
+ template<typename A>
58
+ uint64_t bloom_filter_alloc<A>::builder::suggest_num_filter_bits(uint64_t max_distinct_items,
59
+ double target_false_positive_prob) {
60
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
61
+ return static_cast<uint64_t>(std::ceil(-static_cast<double>(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0))));
62
+ }
63
+
64
+ template<typename A>
65
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_accuracy(uint64_t max_distinct_items,
66
+ double target_false_positive_prob,
67
+ uint64_t seed,
68
+ const A& allocator) {
69
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
70
+ const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
71
+ const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
72
+ return bloom_filter_alloc<A>(num_filter_bits, num_hashes, seed, allocator);
73
+ }
74
+
75
+ template<typename A>
76
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::create_by_size(uint64_t num_bits,
77
+ uint16_t num_hashes,
78
+ uint64_t seed,
79
+ const A& allocator) {
80
+ validate_size_inputs(num_bits, num_hashes);
81
+ return bloom_filter_alloc<A>(num_bits, num_hashes, seed, allocator);
82
+ }
83
+
84
+ template<typename A>
85
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_accuracy(void* memory,
86
+ size_t length_bytes,
87
+ uint64_t max_distinct_items,
88
+ double target_false_positive_prob,
89
+ uint64_t seed,
90
+ const A& allocator) {
91
+ validate_accuracy_inputs(max_distinct_items, target_false_positive_prob);
92
+ const uint64_t num_filter_bits = bloom_filter_alloc<A>::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob);
93
+ const uint16_t num_hashes = bloom_filter_alloc<A>::builder::suggest_num_hashes(target_false_positive_prob);
94
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator);
95
+ }
96
+
97
+ template<typename A>
98
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::builder::initialize_by_size(void* memory,
99
+ size_t length_bytes,
100
+ uint64_t num_bits,
101
+ uint16_t num_hashes,
102
+ uint64_t seed,
103
+ const A& allocator) {
104
+ validate_size_inputs(num_bits, num_hashes);
105
+ return bloom_filter_alloc<A>(static_cast<uint8_t*>(memory), length_bytes, num_bits, num_hashes, seed, allocator);
106
+ }
107
+
108
+ template<typename A>
109
+ void bloom_filter_alloc<A>::builder::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) {
110
+ if (num_bits == 0) {
111
+ throw std::invalid_argument("number of bits in the filter must be strictly positive");
112
+ } else if (num_bits > bloom_filter_alloc<A>::MAX_FILTER_SIZE_BITS) {
113
+ throw std::invalid_argument("number of bits in the filter must be less than 2^63");
114
+ }
115
+ if (num_hashes == 0) {
116
+ throw std::invalid_argument("number of hashes for the filter must be strictly positive");
117
+ }
118
+ }
119
+
120
+ template<typename A>
121
+ void bloom_filter_alloc<A>::builder::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) {
122
+ if (max_distinct_items == 0) {
123
+ throw std::invalid_argument("maximum number of distinct items must be strictly positive");
124
+ }
125
+ if (target_false_positive_prob <= 0.0 || target_false_positive_prob > 1.0) {
126
+ throw std::invalid_argument("target false positive probability must be a valid probability strictly greater than 0.0");
127
+ }
128
+ }
129
+
130
+ } // namespace datasketches
131
+
132
+ #endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_