datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,236 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef DENSITY_SKETCH_HPP_
21
+ #define DENSITY_SKETCH_HPP_
22
+
23
+ #include <type_traits>
24
+ #include <vector>
25
+ #include <functional>
26
+ #include <numeric>
27
+ #include <cmath>
28
+
29
+ #include "common_defs.hpp"
30
+
31
+ /*
32
+ * Based on the following paper:
33
+ * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
34
+ * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
35
+ *
36
+ * Inspired by the following implementation:
37
+ * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
38
+ */
39
+
40
+ namespace datasketches {
41
+
42
/**
 * Gaussian kernel functor: returns exp(-d), where d is the sum of squared
 * coordinate differences between the two given vectors.
 *
 * Precondition: v2 must hold at least as many elements as v1. Only the first
 * v1.size() coordinates of v2 are read and no bounds check is performed here.
 *
 * @tparam T floating point type of the coordinates and of the result
 */
template<typename T>
struct gaussian_kernel {
  /**
   * Evaluates the kernel for a pair of points.
   * @param v1 first point
   * @param v2 second point (at least v1.size() coordinates)
   * @return exp(-sum_i (v1[i] - v2[i])^2); 1 for two empty vectors
   */
  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
    // Seed the accumulator with T(0) rather than the double literal 0.0:
    // inner_product keeps its running sum in the type of the seed, so a double
    // seed would truncate every partial sum when T is long double.
    const T squared_distance = std::inner_product(
      v1.begin(), v1.end(), v2.begin(), T(0), std::plus<T>(),
      [](T a, T b) { const T diff = a - b; return diff * diff; }
    );
    // Qualified call: picks the std::exp overload matching T and does not
    // depend on <cmath> also exposing exp in the global namespace.
    return std::exp(-squared_distance);
  }
};
48
+
49
+ template<
50
+ typename T,
51
+ typename Kernel = gaussian_kernel<T>,
52
+ typename Allocator = std::allocator<T>
53
+ >
54
+ class density_sketch {
55
+ static_assert(std::is_floating_point<T>::value, "Floating point type expected");
56
+
57
+ public:
58
+ using Vector = std::vector<T, Allocator>;
59
+ using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
60
+ using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
61
+
62
+ /**
63
+ * Constructor
64
+ * @param k controls the size and error of the sketch.
65
+ * @param dim dimension of the input domain
66
+ * @param kernel to use by this instance
67
+ * @param allocator to use by this instance
68
+ */
69
+ density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator());
70
+
71
+ /**
72
+ * Returns configured parameter K
73
+ * @return parameter K
74
+ */
75
+ uint16_t get_k() const;
76
+
77
+ /**
78
+ * Returns configured dimensions
79
+ * @return dimensions
80
+ */
81
+ uint32_t get_dim() const;
82
+
83
+ /**
84
+ * Returns true if this sketch is empty.
85
+ * @return empty flag
86
+ */
87
+ bool is_empty() const;
88
+
89
+ /**
90
+ * Returns the length of the input stream (number of points observed by this sketch).
91
+ * @return stream length
92
+ */
93
+ uint64_t get_n() const;
94
+
95
+ /**
96
+ * Returns the number of retained points in the sketch.
97
+ * @return number of retained points
98
+ */
99
+ uint32_t get_num_retained() const;
100
+
101
+ /**
102
+ * Returns true if this sketch is in estimation mode.
103
+ * @return estimation mode flag
104
+ */
105
+ bool is_estimation_mode() const;
106
+
107
+ /**
108
+ * Updates this sketch with a given point.
109
+ * @param point given point
110
+ */
111
+ template<typename FwdVector>
112
+ void update(FwdVector&& point);
113
+
114
+ /**
115
+ * Merges another sketch into this one.
116
+ * @param other sketch to merge into this one
117
+ */
118
+ template<typename FwdSketch>
119
+ void merge(FwdSketch&& other);
120
+
121
+ T get_estimate(const std::vector<T>& point) const;
122
+
123
+ /**
124
+ * Returns an instance of the allocator for this sketch.
125
+ * @return allocator
126
+ */
127
+ Allocator get_allocator() const;
128
+
129
+ /**
130
+ * This method serializes the sketch into a given stream in a binary form
131
+ * @param os output stream
132
+ */
133
+ void serialize(std::ostream& os) const;
134
+
135
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
136
+
137
+ /**
138
+ * This method serializes the sketch as a vector of bytes.
139
+ * An optional header can be reserved in front of the sketch.
140
+ * It is an uninitialized space of a given size.
141
+ * This header is used in Datasketches PostgreSQL extension.
142
+ * @param header_size_bytes space to reserve in front of the sketch
143
+ */
144
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
145
+
146
+ /**
147
+ * This method deserializes a sketch from a given stream.
148
+ * @param is input stream
149
+ * @param kernel the kernel function to use for this sketch
150
+ * @param allocator the memory allocator to use with this sketch
151
+ * @return an instance of the sketch
152
+ */
153
+ static density_sketch deserialize(std::istream& is,
154
+ const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
155
+
156
+ /**
157
+ * This method deserializes a sketch from a given array of bytes.
158
+ * @param bytes pointer to the array of bytes
159
+ * @param size the size of the array
160
+ * @param kernel the kernel function to use for this sketch
161
+ * @param allocator the memory allocator to use with this sketch
162
+ * @return an instance of the sketch
163
+ */
164
+ static density_sketch deserialize(const void* bytes, size_t size,
165
+ const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
166
+
167
+ /**
168
+ * Prints a summary of the sketch.
169
+ * @param print_levels if true include information about levels
170
+ * @param print_items if true include sketch data
171
+ */
172
+ string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
173
+
174
+ class const_iterator;
175
+ const_iterator begin() const;
176
+ const_iterator end() const;
177
+
178
+ private:
179
+ enum flags { RESERVED0, RESERVED1, IS_EMPTY };
180
+ static const uint8_t PREAMBLE_INTS_SHORT = 3;
181
+ static const uint8_t PREAMBLE_INTS_LONG = 6;
182
+ static const uint8_t FAMILY_ID = 19;
183
+ static const uint8_t SERIAL_VERSION = 1;
184
+ static const size_t LEVELS_ARRAY_START = 5;
185
+
186
+ Allocator allocator_;
187
+ Kernel kernel_;
188
+ uint16_t k_;
189
+ uint32_t dim_;
190
+ uint32_t num_retained_;
191
+ uint64_t n_;
192
+ Levels levels_;
193
+
194
+ void compact();
195
+ void compact_level(unsigned height);
196
+
197
+ static void check_k(uint16_t k);
198
+ static void check_serial_version(uint8_t serial_version);
199
+ static void check_family_id(uint8_t family_id);
200
+ static void check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version);
201
+
202
+ density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n, Levels&& levels,
203
+ const Kernel& kernel = Kernel());
204
+ };
205
+
206
+ template<typename T, typename K, typename A>
207
+ class density_sketch<T, K, A>::const_iterator {
208
+ public:
209
+ using Vector = density_sketch<T, K, A>::Vector;
210
+ using iterator_category = std::input_iterator_tag;
211
+ using value_type = std::pair<const Vector&, const uint64_t>;
212
+ using difference_type = void;
213
+ using pointer = return_value_holder<value_type>;
214
+ using reference = const value_type;
215
+ const_iterator& operator++();
216
+ const_iterator& operator++(int);
217
+ bool operator==(const const_iterator& other) const;
218
+ bool operator!=(const const_iterator& other) const;
219
+ const value_type operator*() const;
220
+ const return_value_holder<value_type> operator->() const;
221
+ private:
222
+ using LevelsIterator = typename density_sketch<T, K, A>::Levels::const_iterator;
223
+ using LevelIterator = typename density_sketch<T, K, A>::Level::const_iterator;
224
+ LevelsIterator levels_it_;
225
+ LevelsIterator levels_end_;
226
+ LevelIterator level_it_;
227
+ unsigned height_;
228
+ friend class density_sketch<T, K, A>;
229
+ const_iterator(LevelsIterator begin, LevelsIterator end);
230
+ };
231
+
232
+ } /* namespace datasketches */
233
+
234
+ #include "density_sketch_impl.hpp"
235
+
236
+ #endif