datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,236 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef DENSITY_SKETCH_HPP_
21
+ #define DENSITY_SKETCH_HPP_
22
+
23
+ #include <type_traits>
24
+ #include <vector>
25
+ #include <functional>
26
+ #include <numeric>
27
+ #include <cmath>
28
+
29
+ #include "common_defs.hpp"
30
+
31
+ /*
32
+ * Based on the following paper:
33
+ * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
34
+ * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
35
+ *
36
+ * Inspired by the following implementation:
37
+ * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
38
+ */
39
+
40
+ namespace datasketches {
41
+
42
/**
 * Default kernel: a Gaussian of the squared Euclidean distance between two points,
 * i.e. exp(-sum_i (v1[i] - v2[i])^2).
 *
 * @tparam T floating point type of the coordinates and of the result
 */
template<typename T>
struct gaussian_kernel {
  /**
   * Evaluates the kernel for a pair of points.
   * All of v1 is traversed while v2 is read in lockstep from its beginning,
   * so v2 must hold at least as many coordinates as v1.
   * @param v1 first point
   * @param v2 second point
   * @return exp of the negated sum of squared coordinate differences
   */
  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
    // Seed the accumulator with T instead of the double literal 0.0: std::inner_product
    // keeps the running sum in the type of the seed, which would narrow long double sums.
    const T squared_distance = std::inner_product(
      v1.begin(), v1.end(), v2.begin(), static_cast<T>(0),
      std::plus<T>(),
      [](T a, T b) { return (a - b) * (a - b); }
    );
    // std::exp picks the overload that matches T; an unqualified exp depends on <cmath>
    // exporting ::exp and may resolve to the double overload only.
    return std::exp(-squared_distance);
  }
};
48
+
49
+ template<
50
+ typename T,
51
+ typename Kernel = gaussian_kernel<T>,
52
+ typename Allocator = std::allocator<T>
53
+ >
54
+ class density_sketch {
55
+ static_assert(std::is_floating_point<T>::value, "Floating point type expected");
56
+
57
+ public:
58
+ using Vector = std::vector<T, Allocator>;
59
+ using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
60
+ using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
61
+
62
+ /**
63
+ * Constructor
64
+ * @param k controls the size and error of the sketch.
65
+ * @param dim dimension of the input domain
66
+ * @param kernel to use by this instance
67
+ * @param allocator to use by this instance
68
+ */
69
+ density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator());
70
+
71
+ /**
72
+ * Returns configured parameter K
73
+ * @return parameter K
74
+ */
75
+ uint16_t get_k() const;
76
+
77
+ /**
78
+ * Returns configured dimensions
79
+ * @return dimensions
80
+ */
81
+ uint32_t get_dim() const;
82
+
83
+ /**
84
+ * Returns true if this sketch is empty.
85
+ * @return empty flag
86
+ */
87
+ bool is_empty() const;
88
+
89
+ /**
90
+ * Returns the length of the input stream (number of points observed by this sketch).
91
+ * @return stream length
92
+ */
93
+ uint64_t get_n() const;
94
+
95
+ /**
96
+ * Returns the number of retained points in the sketch.
97
+ * @return number of retained points
98
+ */
99
+ uint32_t get_num_retained() const;
100
+
101
+ /**
102
+ * Returns true if this sketch is in estimation mode.
103
+ * @return estimation mode flag
104
+ */
105
+ bool is_estimation_mode() const;
106
+
107
+ /**
108
+ * Updates this sketch with a given point.
109
+ * @param point given point
110
+ */
111
+ template<typename FwdVector>
112
+ void update(FwdVector&& point);
113
+
114
+ /**
115
+ * Merges another sketch into this one.
116
+ * @param other sketch to merge into this one
117
+ */
118
+ template<typename FwdSketch>
119
+ void merge(FwdSketch&& other);
120
+
121
+ T get_estimate(const std::vector<T>& point) const;
122
+
123
+ /**
124
+ * Returns an instance of the allocator for this sketch.
125
+ * @return allocator
126
+ */
127
+ Allocator get_allocator() const;
128
+
129
+ /**
130
+ * This method serializes the sketch into a given stream in a binary form
131
+ * @param os output stream
132
+ */
133
+ void serialize(std::ostream& os) const;
134
+
135
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
136
+
137
+ /**
138
+ * This method serializes the sketch as a vector of bytes.
139
+ * An optional header can be reserved in front of the sketch.
140
+ * It is an uninitialized space of a given size.
141
+ * This header is used in Datasketches PostgreSQL extension.
142
+ * @param header_size_bytes space to reserve in front of the sketch
143
+ */
144
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
145
+
146
+ /**
147
+ * This method deserializes a sketch from a given stream.
148
+ * @param is input stream
149
+ * @param kernel the kernel function to use for this sketch
150
+ * @param allocator the memory allocator to use with this sketch
151
+ * @return an instance of the sketch
152
+ */
153
+ static density_sketch deserialize(std::istream& is,
154
+ const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
155
+
156
+ /**
157
+ * This method deserializes a sketch from a given array of bytes.
158
+ * @param bytes pointer to the array of bytes
159
+ * @param size the size of the array
160
+ * @param kernel the kernel function to use for this sketch
161
+ * @param allocator the memory allocator to use with this sketch
162
+ * @return an instance of the sketch
163
+ */
164
+ static density_sketch deserialize(const void* bytes, size_t size,
165
+ const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
166
+
167
+ /**
168
+ * Prints a summary of the sketch.
169
+ * @param print_levels if true include information about levels
170
+ * @param print_items if true include sketch data
171
+ */
172
+ string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
173
+
174
+ class const_iterator;
175
+ const_iterator begin() const;
176
+ const_iterator end() const;
177
+
178
+ private:
179
+ enum flags { RESERVED0, RESERVED1, IS_EMPTY };
180
+ static const uint8_t PREAMBLE_INTS_SHORT = 3;
181
+ static const uint8_t PREAMBLE_INTS_LONG = 6;
182
+ static const uint8_t FAMILY_ID = 19;
183
+ static const uint8_t SERIAL_VERSION = 1;
184
+ static const size_t LEVELS_ARRAY_START = 5;
185
+
186
+ Allocator allocator_;
187
+ Kernel kernel_;
188
+ uint16_t k_;
189
+ uint32_t dim_;
190
+ uint32_t num_retained_;
191
+ uint64_t n_;
192
+ Levels levels_;
193
+
194
+ void compact();
195
+ void compact_level(unsigned height);
196
+
197
+ static void check_k(uint16_t k);
198
+ static void check_serial_version(uint8_t serial_version);
199
+ static void check_family_id(uint8_t family_id);
200
+ static void check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version);
201
+
202
+ density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n, Levels&& levels,
203
+ const Kernel& kernel = Kernel());
204
+ };
205
+
206
+ template<typename T, typename K, typename A>
207
+ class density_sketch<T, K, A>::const_iterator {
208
+ public:
209
+ using Vector = density_sketch<T, K, A>::Vector;
210
+ using iterator_category = std::input_iterator_tag;
211
+ using value_type = std::pair<const Vector&, const uint64_t>;
212
+ using difference_type = void;
213
+ using pointer = return_value_holder<value_type>;
214
+ using reference = const value_type;
215
+ const_iterator& operator++();
216
+ const_iterator& operator++(int);
217
+ bool operator==(const const_iterator& other) const;
218
+ bool operator!=(const const_iterator& other) const;
219
+ const value_type operator*() const;
220
+ const return_value_holder<value_type> operator->() const;
221
+ private:
222
+ using LevelsIterator = typename density_sketch<T, K, A>::Levels::const_iterator;
223
+ using LevelIterator = typename density_sketch<T, K, A>::Level::const_iterator;
224
+ LevelsIterator levels_it_;
225
+ LevelsIterator levels_end_;
226
+ LevelIterator level_it_;
227
+ unsigned height_;
228
+ friend class density_sketch<T, K, A>;
229
+ const_iterator(LevelsIterator begin, LevelsIterator end);
230
+ };
231
+
232
+ } /* namespace datasketches */
233
+
234
+ #include "density_sketch_impl.hpp"
235
+
236
+ #endif