datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef DENSITY_SKETCH_HPP_
|
|
21
|
+
#define DENSITY_SKETCH_HPP_
|
|
22
|
+
|
|
23
|
+
#include <type_traits>
|
|
24
|
+
#include <vector>
|
|
25
|
+
#include <functional>
|
|
26
|
+
#include <numeric>
|
|
27
|
+
#include <cmath>
|
|
28
|
+
|
|
29
|
+
#include "common_defs.hpp"
|
|
30
|
+
|
|
31
|
+
/*
|
|
32
|
+
* Based on the following paper:
|
|
33
|
+
* Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
|
|
34
|
+
* https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
|
|
35
|
+
*
|
|
36
|
+
* Inspired by the following implementation:
|
|
37
|
+
* https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
|
|
38
|
+
*/
|
|
39
|
+
|
|
40
|
+
namespace datasketches {
|
|
41
|
+
|
|
42
|
+
/**
 * Gaussian kernel: exp(-||v1 - v2||^2), where ||.|| is the Euclidean norm.
 * This is the default Kernel template argument of density_sketch.
 */
template<typename T>
struct gaussian_kernel {
  /**
   * Evaluates the kernel on a pair of points.
   * @param v1 first point
   * @param v2 second point; only its first v1.size() coordinates are read, so it
   *        must hold at least v1.size() elements (not checked here)
   * @return exp of the negated squared Euclidean distance between v1 and v2
   */
  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
    // Seed the accumulator with a T, not the double literal 0.0: std::inner_product
    // accumulates in the type of its initial value, which would otherwise truncate
    // a long double sum to double.
    const T squared_distance = std::inner_product(
        v1.begin(), v1.end(), v2.begin(), static_cast<T>(0), std::plus<T>(),
        [](T a, T b) { return (a - b) * (a - b); });
    // Qualified call so that the std::exp overload matching T is selected.
    return std::exp(-squared_distance);
  }
};
|
|
48
|
+
|
|
49
|
+
template<
|
|
50
|
+
typename T,
|
|
51
|
+
typename Kernel = gaussian_kernel<T>,
|
|
52
|
+
typename Allocator = std::allocator<T>
|
|
53
|
+
>
|
|
54
|
+
class density_sketch {
|
|
55
|
+
static_assert(std::is_floating_point<T>::value, "Floating point type expected");
|
|
56
|
+
|
|
57
|
+
public:
|
|
58
|
+
using Vector = std::vector<T, Allocator>;
|
|
59
|
+
using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
|
|
60
|
+
using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Constructor
|
|
64
|
+
* @param k controls the size and error of the sketch.
|
|
65
|
+
* @param dim dimension of the input domain
|
|
66
|
+
* @param kernel to use by this instance
|
|
67
|
+
* @param allocator to use by this instance
|
|
68
|
+
*/
|
|
69
|
+
density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator());
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Returns configured parameter K
|
|
73
|
+
* @return parameter K
|
|
74
|
+
*/
|
|
75
|
+
uint16_t get_k() const;
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Returns configured dimensions
|
|
79
|
+
* @return dimensions
|
|
80
|
+
*/
|
|
81
|
+
uint32_t get_dim() const;
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Returns true if this sketch is empty.
|
|
85
|
+
* @return empty flag
|
|
86
|
+
*/
|
|
87
|
+
bool is_empty() const;
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Returns the length of the input stream (number of points observed by this sketch).
|
|
91
|
+
* @return stream length
|
|
92
|
+
*/
|
|
93
|
+
uint64_t get_n() const;
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Returns the number of retained points in the sketch.
|
|
97
|
+
* @return number of retained points
|
|
98
|
+
*/
|
|
99
|
+
uint32_t get_num_retained() const;
|
|
100
|
+
|
|
101
|
+
/**
|
|
102
|
+
* Returns true if this sketch is in estimation mode.
|
|
103
|
+
* @return estimation mode flag
|
|
104
|
+
*/
|
|
105
|
+
bool is_estimation_mode() const;
|
|
106
|
+
|
|
107
|
+
/**
|
|
108
|
+
* Updates this sketch with a given point.
|
|
109
|
+
* @param point given point
|
|
110
|
+
*/
|
|
111
|
+
template<typename FwdVector>
|
|
112
|
+
void update(FwdVector&& point);
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Merges another sketch into this one.
|
|
116
|
+
* @param other sketch to merge into this one
|
|
117
|
+
*/
|
|
118
|
+
template<typename FwdSketch>
|
|
119
|
+
void merge(FwdSketch&& other);
|
|
120
|
+
|
|
121
|
+
T get_estimate(const std::vector<T>& point) const;
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Returns an instance of the allocator for this sketch.
|
|
125
|
+
* @return allocator
|
|
126
|
+
*/
|
|
127
|
+
Allocator get_allocator() const;
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
131
|
+
* @param os output stream
|
|
132
|
+
*/
|
|
133
|
+
void serialize(std::ostream& os) const;
|
|
134
|
+
|
|
135
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* This method serializes the sketch as a vector of bytes.
|
|
139
|
+
* An optional header can be reserved in front of the sketch.
|
|
140
|
+
* It is an uninitialized space of a given size.
|
|
141
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
142
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
143
|
+
*/
|
|
144
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* This method deserializes a sketch from a given stream.
|
|
148
|
+
* @param is input stream
|
|
149
|
+
* @param kernel the kernel function to use for this sketch
|
|
150
|
+
* @param allocator the memory allocator to use with this sketch
|
|
151
|
+
* @return an instance of the sketch
|
|
152
|
+
*/
|
|
153
|
+
static density_sketch deserialize(std::istream& is,
|
|
154
|
+
const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
158
|
+
* @param bytes pointer to the array of bytes
|
|
159
|
+
* @param size the size of the array
|
|
160
|
+
* @param kernel the kernel function to use for this sketch
|
|
161
|
+
* @param allocator the memory allocator to use with this sketch
|
|
162
|
+
* @return an instance of the sketch
|
|
163
|
+
*/
|
|
164
|
+
static density_sketch deserialize(const void* bytes, size_t size,
|
|
165
|
+
const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Prints a summary of the sketch.
|
|
169
|
+
* @param print_levels if true include information about levels
|
|
170
|
+
* @param print_items if true include sketch data
|
|
171
|
+
*/
|
|
172
|
+
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
|
|
173
|
+
|
|
174
|
+
class const_iterator;
|
|
175
|
+
const_iterator begin() const;
|
|
176
|
+
const_iterator end() const;
|
|
177
|
+
|
|
178
|
+
private:
|
|
179
|
+
enum flags { RESERVED0, RESERVED1, IS_EMPTY };
|
|
180
|
+
static const uint8_t PREAMBLE_INTS_SHORT = 3;
|
|
181
|
+
static const uint8_t PREAMBLE_INTS_LONG = 6;
|
|
182
|
+
static const uint8_t FAMILY_ID = 19;
|
|
183
|
+
static const uint8_t SERIAL_VERSION = 1;
|
|
184
|
+
static const size_t LEVELS_ARRAY_START = 5;
|
|
185
|
+
|
|
186
|
+
Allocator allocator_;
|
|
187
|
+
Kernel kernel_;
|
|
188
|
+
uint16_t k_;
|
|
189
|
+
uint32_t dim_;
|
|
190
|
+
uint32_t num_retained_;
|
|
191
|
+
uint64_t n_;
|
|
192
|
+
Levels levels_;
|
|
193
|
+
|
|
194
|
+
void compact();
|
|
195
|
+
void compact_level(unsigned height);
|
|
196
|
+
|
|
197
|
+
static void check_k(uint16_t k);
|
|
198
|
+
static void check_serial_version(uint8_t serial_version);
|
|
199
|
+
static void check_family_id(uint8_t family_id);
|
|
200
|
+
static void check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version);
|
|
201
|
+
|
|
202
|
+
density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n, Levels&& levels,
|
|
203
|
+
const Kernel& kernel = Kernel());
|
|
204
|
+
};
|
|
205
|
+
|
|
206
|
+
template<typename T, typename K, typename A>
|
|
207
|
+
class density_sketch<T, K, A>::const_iterator {
|
|
208
|
+
public:
|
|
209
|
+
using Vector = density_sketch<T, K, A>::Vector;
|
|
210
|
+
using iterator_category = std::input_iterator_tag;
|
|
211
|
+
using value_type = std::pair<const Vector&, const uint64_t>;
|
|
212
|
+
using difference_type = void;
|
|
213
|
+
using pointer = return_value_holder<value_type>;
|
|
214
|
+
using reference = const value_type;
|
|
215
|
+
const_iterator& operator++();
|
|
216
|
+
const_iterator& operator++(int);
|
|
217
|
+
bool operator==(const const_iterator& other) const;
|
|
218
|
+
bool operator!=(const const_iterator& other) const;
|
|
219
|
+
const value_type operator*() const;
|
|
220
|
+
const return_value_holder<value_type> operator->() const;
|
|
221
|
+
private:
|
|
222
|
+
using LevelsIterator = typename density_sketch<T, K, A>::Levels::const_iterator;
|
|
223
|
+
using LevelIterator = typename density_sketch<T, K, A>::Level::const_iterator;
|
|
224
|
+
LevelsIterator levels_it_;
|
|
225
|
+
LevelsIterator levels_end_;
|
|
226
|
+
LevelIterator level_it_;
|
|
227
|
+
unsigned height_;
|
|
228
|
+
friend class density_sketch<T, K, A>;
|
|
229
|
+
const_iterator(LevelsIterator begin, LevelsIterator end);
|
|
230
|
+
};
|
|
231
|
+
|
|
232
|
+
} /* namespace datasketches */
|
|
233
|
+
|
|
234
|
+
#include "density_sketch_impl.hpp"
|
|
235
|
+
|
|
236
|
+
#endif
|