datasketches 0.4.2 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 427f1c95c4dc0782a7003f24b155b39d0b01560e57ff2046785d3254e79250c0
|
4
|
+
data.tar.gz: 820d72e4c907cb7d59bd808d2b837692032f038dac06cf96d0767531cc68370e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03abee1fa7222628c3e3b00c9fed91218abf17a9dd8116aa6c55e37112a33506c11333533580a3804ce764bbce17ac4db08450e3cf82a52a36e924abd2391762
|
7
|
+
data.tar.gz: e838d14d5f43706dd85a51a45074e224131bc0ecd9157dfc350b2f990c6d54b9338dfb48fc0ad14d86f2d74ef2b855a385e4a67aa3be6e173bf5610887e18e04
|
data/CHANGELOG.md
CHANGED
data/NOTICE
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[DataSketches](https://datasketches.apache.org/) - sketch data structures - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/datasketches-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/datasketches-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/datasketches-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -8,7 +8,7 @@ using datasketches::var_opt_sketch;
|
|
8
8
|
|
9
9
|
template<typename T>
|
10
10
|
void bind_vo_sketch(Rice::Module &m, const char* name) {
|
11
|
-
Rice::define_class_under<var_opt_sketch<T>>(m,
|
11
|
+
Rice::define_class_under<var_opt_sketch<T>>(m, name)
|
12
12
|
.define_constructor(Rice::Constructor<var_opt_sketch<T>, uint32_t>())
|
13
13
|
.define_method("k", &var_opt_sketch<T>::get_k)
|
14
14
|
.define_method("n", &var_opt_sketch<T>::get_n)
|
data/lib/datasketches/version.rb
CHANGED
@@ -207,9 +207,9 @@ APPENDIX A: How to apply the Apache License to your work.
|
|
207
207
|
|
208
208
|
APPENDIX B: Additional licenses relevant to this product.
|
209
209
|
|
210
|
-
This product includes a number of source files with code that has been
|
211
|
-
adapted from 3rd party sources including sources that may be subject
|
212
|
-
to different copyright notices and license terms. Your use of
|
210
|
+
This product includes a number of source files with code that has been
|
211
|
+
adapted from 3rd party sources including sources that may be subject
|
212
|
+
to different copyright notices and license terms. Your use of
|
213
213
|
the source code for these subcomponents is subject to the terms and
|
214
214
|
conditions of the following licenses.
|
215
215
|
|
@@ -221,7 +221,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
221
221
|
https://github.com/catchorg/Catch2/blob/v2.x/LICENSE.txt
|
222
222
|
|
223
223
|
Boost Software License - Version 1.0 - August 17th, 2003
|
224
|
-
|
224
|
+
|
225
225
|
Permission is hereby granted, free of charge, to any person or organization
|
226
226
|
obtaining a copy of the software and accompanying documentation covered by
|
227
227
|
this license (the "Software") to use, reproduce, display, distribute,
|
@@ -248,6 +248,35 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
248
248
|
Found in the Catch2 unit test code that is downloaded from github.com as part
|
249
249
|
of CMake configuration if configured to build tests.
|
250
250
|
|
251
|
+
=============================================================
|
252
|
+
MIT License
|
253
|
+
=============================================================
|
254
|
+
|
255
|
+
Original source:
|
256
|
+
https://github.com/stbrumme/xxhash/blob/master/LICENSE
|
257
|
+
|
258
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
259
|
+
of this software and associated documentation files (the "Software"),
|
260
|
+
to deal in the Software without restriction, including without limitation
|
261
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
262
|
+
and/or sell copies of the Software, and to permit persons to whom the Software
|
263
|
+
is furnished to do so, subject to the following conditions:
|
264
|
+
|
265
|
+
The above copyright notice and this permission notice shall be included
|
266
|
+
in all copies or substantial portions of the Software.
|
267
|
+
|
268
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
269
|
+
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
270
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
271
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
272
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
273
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
274
|
+
|
275
|
+
Code Location:
|
276
|
+
common/include/xxhash64.h
|
277
|
+
Original source code:
|
278
|
+
Copyright (c) 2018 Stephan Brumme
|
279
|
+
https://github.com/stbrumme/xxhash/blob/master/xxhash64.h
|
251
280
|
|
252
281
|
=============================================================
|
253
282
|
Public Domain
|
@@ -255,7 +284,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
255
284
|
Original source code:
|
256
285
|
https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
|
257
286
|
Placed in the Public Domain by Austin Appleby
|
258
|
-
|
287
|
+
|
259
288
|
Code Locations:
|
260
289
|
common/include/MurmurHash3.h
|
261
290
|
that is adapted from the above.
|
@@ -263,8 +292,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
263
292
|
Original source code:
|
264
293
|
* https://graphics.stanford.edu/~seander/bithacks.html
|
265
294
|
* Placed in the Public Domain by Sean Eron Anderson
|
266
|
-
|
295
|
+
|
267
296
|
Code Locations:
|
268
297
|
* common/include/ceiling_power_of_2.hpp
|
269
298
|
that is adapted from the above.
|
270
|
-
|
@@ -1,9 +1,9 @@
|
|
1
|
-
Apache DataSketches C++
|
2
|
-
Copyright
|
1
|
+
Apache DataSketches C++
|
2
|
+
Copyright 2025 The Apache Software Foundation
|
3
3
|
|
4
4
|
Copyright 2015-2018 Yahoo Inc.
|
5
5
|
Copyright 2019-2020 Verizon Media
|
6
|
-
Copyright 2021 Yahoo Inc.
|
6
|
+
Copyright 2021- Yahoo Inc.
|
7
7
|
|
8
8
|
This product includes software developed at
|
9
9
|
The Apache Software Foundation (http://www.apache.org/).
|
@@ -3,8 +3,7 @@ This is the core C++ component of the Apache DataSketches library. It contains
|
|
3
3
|
|
4
4
|
This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
|
5
5
|
|
6
|
-
Note that we have a parallel core component for Java implementations of the same sketch algorithms
|
7
|
-
[datasketches-java](https://github.com/apache/datasketches-java).
|
6
|
+
Note that we have parallel core components for [Java](https://github.com/apache/datasketches-java) and [Python](https://github.com/apache/datasketches-python) implementations of the same sketch algorithms.
|
8
7
|
|
9
8
|
Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
|
10
9
|
|
@@ -104,4 +103,4 @@ from GitHub using CMake's `ExternalProject` module. The code would look somethin
|
|
104
103
|
target_include_directories(my_dependent_target
|
105
104
|
PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
|
106
105
|
add_dependencies(my_dependent_target datasketches)
|
107
|
-
```
|
106
|
+
```
|
@@ -29,11 +29,9 @@ target_include_directories(common
|
|
29
29
|
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
30
30
|
)
|
31
31
|
|
32
|
-
target_compile_features(common INTERFACE cxx_std_11)
|
33
|
-
|
34
32
|
install(TARGETS common EXPORT ${PROJECT_NAME})
|
35
33
|
|
36
|
-
install(FILES
|
34
|
+
install(FILES
|
37
35
|
${CMAKE_CURRENT_BINARY_DIR}/include/version.hpp
|
38
36
|
include/binomial_bounds.hpp
|
39
37
|
include/bounds_binomial_proportions.hpp
|
@@ -51,4 +49,5 @@ install(FILES
|
|
51
49
|
include/quantiles_sorted_view_impl.hpp
|
52
50
|
include/quantiles_sorted_view.hpp
|
53
51
|
include/serde.hpp
|
52
|
+
include/xxhash64.h
|
54
53
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
@@ -71,10 +71,10 @@ typedef struct {
|
|
71
71
|
// Block read - if your platform needs to do endian-swapping or can only
|
72
72
|
// handle aligned reads, do the conversion here
|
73
73
|
|
74
|
-
MURMUR3_FORCE_INLINE uint64_t getblock64 ( const
|
74
|
+
MURMUR3_FORCE_INLINE uint64_t getblock64 ( const uint8_t * p, size_t i )
|
75
75
|
{
|
76
76
|
uint64_t res;
|
77
|
-
memcpy(&res, p + i, sizeof(res));
|
77
|
+
memcpy(&res, p + i * sizeof(uint64_t), sizeof(res));
|
78
78
|
return res;
|
79
79
|
}
|
80
80
|
|
@@ -104,13 +104,12 @@ MURMUR3_FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes,
|
|
104
104
|
|
105
105
|
// Number of full 128-bit blocks of 16 bytes.
|
106
106
|
// Possible exclusion of a remainder of up to 15 bytes.
|
107
|
-
const size_t nblocks = lenBytes >> 4; // bytes / 16
|
107
|
+
const size_t nblocks = lenBytes >> 4; // bytes / 16
|
108
108
|
|
109
109
|
// Process the 128-bit blocks (the body) into the hash
|
110
|
-
const uint64_t* blocks = (const uint64_t*)(data);
|
111
110
|
for (size_t i = 0; i < nblocks; ++i) { // 16 bytes per block
|
112
|
-
uint64_t k1 = getblock64(
|
113
|
-
uint64_t k2 = getblock64(
|
111
|
+
uint64_t k1 = getblock64(data, i * 2 + 0);
|
112
|
+
uint64_t k2 = getblock64(data, i * 2 + 1);
|
114
113
|
|
115
114
|
k1 *= c1; k1 = MURMUR3_ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
|
116
115
|
out.h1 = MURMUR3_ROTL64(out.h1,27);
|
@@ -42,6 +42,7 @@ namespace random_utils {
|
|
42
42
|
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
|
43
43
|
static thread_local std::mt19937_64 rand(rd());
|
44
44
|
static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
|
45
|
+
static thread_local std::uniform_int_distribution<uint64_t> next_uint64(0, UINT64_MAX);
|
45
46
|
|
46
47
|
// thread-safe random bit
|
47
48
|
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
@@ -91,6 +92,23 @@ static inline void write(std::ostream& os, const T* ptr, size_t size_bytes) {
|
|
91
92
|
os.write(reinterpret_cast<const char*>(ptr), size_bytes);
|
92
93
|
}
|
93
94
|
|
95
|
+
template<typename T>
|
96
|
+
T byteswap(T value) {
|
97
|
+
char* ptr = static_cast<char*>(static_cast<void*>(&value));
|
98
|
+
const int len = sizeof(T);
|
99
|
+
for (size_t i = 0; i < len / 2; ++i) {
|
100
|
+
std::swap(ptr[i], ptr[len - i - 1]);
|
101
|
+
}
|
102
|
+
return value;
|
103
|
+
}
|
104
|
+
|
105
|
+
template<typename T>
|
106
|
+
static inline T read_big_endian(std::istream& is) {
|
107
|
+
T value;
|
108
|
+
is.read(reinterpret_cast<char*>(&value), sizeof(T));
|
109
|
+
return byteswap(value);
|
110
|
+
}
|
111
|
+
|
94
112
|
// wrapper for iterators to implement operator-> returning temporary value
|
95
113
|
template<typename T>
|
96
114
|
class return_value_holder {
|
@@ -86,19 +86,17 @@ auto quantiles_sorted_view<T, C, A>::get_quantile(double rank, bool inclusive) c
|
|
86
86
|
template<typename T, typename C, typename A>
|
87
87
|
auto quantiles_sorted_view<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
|
88
88
|
if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
89
|
-
vector_double buckets(entries_.get_allocator());
|
90
|
-
if (entries_.size() == 0) return buckets;
|
91
89
|
check_split_points(split_points, size);
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
90
|
+
vector_double ranks(entries_.get_allocator());
|
91
|
+
ranks.reserve(size + 1);
|
92
|
+
for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i], inclusive));
|
93
|
+
ranks.push_back(1);
|
94
|
+
return ranks;
|
96
95
|
}
|
97
96
|
|
98
97
|
template<typename T, typename C, typename A>
|
99
98
|
auto quantiles_sorted_view<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
|
100
99
|
auto buckets = get_CDF(split_points, size, inclusive);
|
101
|
-
if (buckets.size() == 0) return buckets;
|
102
100
|
for (uint32_t i = size; i > 0; --i) {
|
103
101
|
buckets[i] -= buckets[i - 1];
|
104
102
|
}
|
@@ -0,0 +1,202 @@
|
|
1
|
+
// //////////////////////////////////////////////////////////
|
2
|
+
// xxhash64.h
|
3
|
+
// Copyright (c) 2016 Stephan Brumme. All rights reserved.
|
4
|
+
// see http://create.stephan-brumme.com/disclaimer.html
|
5
|
+
//
|
6
|
+
|
7
|
+
#pragma once
|
8
|
+
#include <stdint.h> // for uint32_t and uint64_t
|
9
|
+
|
10
|
+
/// XXHash (64 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
|
11
|
+
/** How to use:
|
12
|
+
uint64_t myseed = 0;
|
13
|
+
XXHash64 myhash(myseed);
|
14
|
+
myhash.add(pointerToSomeBytes, numberOfBytes);
|
15
|
+
myhash.add(pointerToSomeMoreBytes, numberOfMoreBytes); // call add() as often as you like to ...
|
16
|
+
// and compute hash:
|
17
|
+
uint64_t result = myhash.hash();
|
18
|
+
|
19
|
+
// or all of the above in one single line:
|
20
|
+
uint64_t result2 = XXHash64::hash(mypointer, numBytes, myseed);
|
21
|
+
|
22
|
+
Note: my code is NOT endian-aware !
|
23
|
+
**/
|
24
|
+
class XXHash64
|
25
|
+
{
|
26
|
+
public:
|
27
|
+
/// create new XXHash (64 bit)
|
28
|
+
/** @param seed your seed value, even zero is a valid seed **/
|
29
|
+
explicit XXHash64(uint64_t seed)
|
30
|
+
{
|
31
|
+
state[0] = seed + Prime1 + Prime2;
|
32
|
+
state[1] = seed + Prime2;
|
33
|
+
state[2] = seed;
|
34
|
+
state[3] = seed - Prime1;
|
35
|
+
bufferSize = 0;
|
36
|
+
totalLength = 0;
|
37
|
+
}
|
38
|
+
|
39
|
+
/// add a chunk of bytes
|
40
|
+
/** @param input pointer to a continuous block of data
|
41
|
+
@param length number of bytes
|
42
|
+
@return false if parameters are invalid / zero **/
|
43
|
+
bool add(const void* input, uint64_t length)
|
44
|
+
{
|
45
|
+
// no data ?
|
46
|
+
if (!input || length == 0)
|
47
|
+
return false;
|
48
|
+
|
49
|
+
totalLength += length;
|
50
|
+
// byte-wise access
|
51
|
+
const unsigned char* data = (const unsigned char*)input;
|
52
|
+
|
53
|
+
// unprocessed old data plus new data still fit in temporary buffer ?
|
54
|
+
if (bufferSize + length < MaxBufferSize)
|
55
|
+
{
|
56
|
+
// just add new data
|
57
|
+
while (length-- > 0)
|
58
|
+
buffer[bufferSize++] = *data++;
|
59
|
+
return true;
|
60
|
+
}
|
61
|
+
|
62
|
+
// point beyond last byte
|
63
|
+
const unsigned char* stop = data + length;
|
64
|
+
const unsigned char* stopBlock = stop - MaxBufferSize;
|
65
|
+
|
66
|
+
// some data left from previous update ?
|
67
|
+
if (bufferSize > 0)
|
68
|
+
{
|
69
|
+
// make sure temporary buffer is full (32 bytes)
|
70
|
+
while (bufferSize < MaxBufferSize)
|
71
|
+
buffer[bufferSize++] = *data++;
|
72
|
+
|
73
|
+
// process these 32 bytes (4x8)
|
74
|
+
process(buffer, state[0], state[1], state[2], state[3]);
|
75
|
+
}
|
76
|
+
|
77
|
+
// copying state to local variables helps optimizer A LOT
|
78
|
+
uint64_t s0 = state[0], s1 = state[1], s2 = state[2], s3 = state[3];
|
79
|
+
// 32 bytes at once
|
80
|
+
while (data <= stopBlock)
|
81
|
+
{
|
82
|
+
// local variables s0..s3 instead of state[0]..state[3] are much faster
|
83
|
+
process(data, s0, s1, s2, s3);
|
84
|
+
data += 32;
|
85
|
+
}
|
86
|
+
// copy back
|
87
|
+
state[0] = s0; state[1] = s1; state[2] = s2; state[3] = s3;
|
88
|
+
|
89
|
+
// copy remainder to temporary buffer
|
90
|
+
bufferSize = stop - data;
|
91
|
+
for (uint64_t i = 0; i < bufferSize; i++)
|
92
|
+
buffer[i] = data[i];
|
93
|
+
|
94
|
+
// done
|
95
|
+
return true;
|
96
|
+
}
|
97
|
+
|
98
|
+
/// get current hash
|
99
|
+
/** @return 64 bit XXHash **/
|
100
|
+
uint64_t hash() const
|
101
|
+
{
|
102
|
+
// fold 256 bit state into one single 64 bit value
|
103
|
+
uint64_t result;
|
104
|
+
if (totalLength >= MaxBufferSize)
|
105
|
+
{
|
106
|
+
result = rotateLeft(state[0], 1) +
|
107
|
+
rotateLeft(state[1], 7) +
|
108
|
+
rotateLeft(state[2], 12) +
|
109
|
+
rotateLeft(state[3], 18);
|
110
|
+
result = (result ^ processSingle(0, state[0])) * Prime1 + Prime4;
|
111
|
+
result = (result ^ processSingle(0, state[1])) * Prime1 + Prime4;
|
112
|
+
result = (result ^ processSingle(0, state[2])) * Prime1 + Prime4;
|
113
|
+
result = (result ^ processSingle(0, state[3])) * Prime1 + Prime4;
|
114
|
+
}
|
115
|
+
else
|
116
|
+
{
|
117
|
+
// internal state wasn't set in add(), therefore original seed is still stored in state2
|
118
|
+
result = state[2] + Prime5;
|
119
|
+
}
|
120
|
+
|
121
|
+
result += totalLength;
|
122
|
+
|
123
|
+
// process remaining bytes in temporary buffer
|
124
|
+
const unsigned char* data = buffer;
|
125
|
+
// point beyond last byte
|
126
|
+
const unsigned char* stop = data + bufferSize;
|
127
|
+
|
128
|
+
// at least 8 bytes left ? => eat 8 bytes per step
|
129
|
+
for (; data + 8 <= stop; data += 8)
|
130
|
+
result = rotateLeft(result ^ processSingle(0, *(uint64_t*)data), 27) * Prime1 + Prime4;
|
131
|
+
|
132
|
+
// 4 bytes left ? => eat those
|
133
|
+
if (data + 4 <= stop)
|
134
|
+
{
|
135
|
+
result = rotateLeft(result ^ (*(uint32_t*)data) * Prime1, 23) * Prime2 + Prime3;
|
136
|
+
data += 4;
|
137
|
+
}
|
138
|
+
|
139
|
+
// take care of remaining 0..3 bytes, eat 1 byte per step
|
140
|
+
while (data != stop)
|
141
|
+
result = rotateLeft(result ^ (*data++) * Prime5, 11) * Prime1;
|
142
|
+
|
143
|
+
// mix bits
|
144
|
+
result ^= result >> 33;
|
145
|
+
result *= Prime2;
|
146
|
+
result ^= result >> 29;
|
147
|
+
result *= Prime3;
|
148
|
+
result ^= result >> 32;
|
149
|
+
return result;
|
150
|
+
}
|
151
|
+
|
152
|
+
|
153
|
+
/// combine constructor, add() and hash() in one static function (C style)
|
154
|
+
/** @param input pointer to a continuous block of data
|
155
|
+
@param length number of bytes
|
156
|
+
@param seed your seed value, e.g. zero is a valid seed
|
157
|
+
@return 64 bit XXHash **/
|
158
|
+
static uint64_t hash(const void* input, uint64_t length, uint64_t seed)
|
159
|
+
{
|
160
|
+
XXHash64 hasher(seed);
|
161
|
+
hasher.add(input, length);
|
162
|
+
return hasher.hash();
|
163
|
+
}
|
164
|
+
|
165
|
+
private:
|
166
|
+
/// magic constants :-)
|
167
|
+
static const uint64_t Prime1 = 11400714785074694791ULL;
|
168
|
+
static const uint64_t Prime2 = 14029467366897019727ULL;
|
169
|
+
static const uint64_t Prime3 = 1609587929392839161ULL;
|
170
|
+
static const uint64_t Prime4 = 9650029242287828579ULL;
|
171
|
+
static const uint64_t Prime5 = 2870177450012600261ULL;
|
172
|
+
|
173
|
+
/// temporarily store up to 31 bytes between multiple add() calls
|
174
|
+
static const uint64_t MaxBufferSize = 31+1;
|
175
|
+
|
176
|
+
uint64_t state[4];
|
177
|
+
unsigned char buffer[MaxBufferSize];
|
178
|
+
uint64_t bufferSize;
|
179
|
+
uint64_t totalLength;
|
180
|
+
|
181
|
+
/// rotate bits, should compile to a single CPU instruction (ROL)
|
182
|
+
static inline uint64_t rotateLeft(uint64_t x, unsigned char bits)
|
183
|
+
{
|
184
|
+
return (x << bits) | (x >> (64 - bits));
|
185
|
+
}
|
186
|
+
|
187
|
+
/// process a single 64 bit value
|
188
|
+
static inline uint64_t processSingle(uint64_t previous, uint64_t input)
|
189
|
+
{
|
190
|
+
return rotateLeft(previous + input * Prime2, 31) * Prime1;
|
191
|
+
}
|
192
|
+
|
193
|
+
/// process a block of 4x8 bytes, this is the main part of the XXHash64 algorithm
|
194
|
+
static inline void process(const void* data, uint64_t& state0, uint64_t& state1, uint64_t& state2, uint64_t& state3)
|
195
|
+
{
|
196
|
+
const uint64_t* block = (const uint64_t*) data;
|
197
|
+
state0 = processSingle(state0, block[0]);
|
198
|
+
state1 = processSingle(state1, block[1]);
|
199
|
+
state2 = processSingle(state2, block[2]);
|
200
|
+
state3 = processSingle(state3, block[3]);
|
201
|
+
}
|
202
|
+
};
|
@@ -44,6 +44,10 @@ template<typename A> class cpc_compressor;
|
|
44
44
|
template<typename A>
|
45
45
|
inline cpc_compressor<A>& get_compressor();
|
46
46
|
|
47
|
+
// function called atexit to clean up compression tables
|
48
|
+
template<typename A>
|
49
|
+
void destroy_compressor();
|
50
|
+
|
47
51
|
template<typename A>
|
48
52
|
class cpc_compressor {
|
49
53
|
public:
|
@@ -109,8 +113,10 @@ private:
|
|
109
113
|
};
|
110
114
|
|
111
115
|
cpc_compressor();
|
112
|
-
|
116
|
+
friend cpc_compressor& get_compressor<A>();
|
117
|
+
|
113
118
|
~cpc_compressor();
|
119
|
+
friend void destroy_compressor<A>();
|
114
120
|
|
115
121
|
void make_decoding_tables(); // call this at startup
|
116
122
|
void free_decoding_tables(); // call this at the end
|
@@ -22,9 +22,11 @@
|
|
22
22
|
#ifndef CPC_COMPRESSOR_IMPL_HPP_
|
23
23
|
#define CPC_COMPRESSOR_IMPL_HPP_
|
24
24
|
|
25
|
+
#include <cstdlib>
|
25
26
|
#include <memory>
|
26
27
|
#include <stdexcept>
|
27
28
|
|
29
|
+
#include "common_defs.hpp"
|
28
30
|
#include "compression_data.hpp"
|
29
31
|
#include "cpc_util.hpp"
|
30
32
|
#include "cpc_common.hpp"
|
@@ -36,9 +38,17 @@ namespace datasketches {
|
|
36
38
|
template<typename A>
|
37
39
|
cpc_compressor<A>& get_compressor() {
|
38
40
|
static cpc_compressor<A>* instance = new cpc_compressor<A>(); // use new for global initialization
|
41
|
+
static int reg_result = std::atexit(destroy_compressor<A>); // just to clean up a little more nicely; don't worry if it fails
|
42
|
+
unused(reg_result);
|
39
43
|
return *instance;
|
40
44
|
}
|
41
45
|
|
46
|
+
// register to call compressor destructor at exit
|
47
|
+
template<typename A>
|
48
|
+
void destroy_compressor() {
|
49
|
+
delete std::addressof(get_compressor<A>());
|
50
|
+
}
|
51
|
+
|
42
52
|
template<typename A>
|
43
53
|
cpc_compressor<A>::cpc_compressor() {
|
44
54
|
make_decoding_tables();
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
add_library(filters INTERFACE)
|
19
|
+
|
20
|
+
add_library(${PROJECT_NAME}::FILTERS ALIAS filters)
|
21
|
+
|
22
|
+
if (BUILD_TESTS)
|
23
|
+
add_subdirectory(test)
|
24
|
+
endif()
|
25
|
+
|
26
|
+
target_include_directories(filters
|
27
|
+
INTERFACE
|
28
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
29
|
+
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
30
|
+
)
|
31
|
+
|
32
|
+
target_link_libraries(filters INTERFACE common)
|
33
|
+
|
34
|
+
install(TARGETS filters
|
35
|
+
EXPORT ${PROJECT_NAME}
|
36
|
+
)
|
37
|
+
|
38
|
+
install(FILES
|
39
|
+
include/bloom_filter.hpp
|
40
|
+
include/bloom_filter_impl.hpp
|
41
|
+
include/bloom_filter_builder_impl.hpp
|
42
|
+
include/bit_array_ops.hpp
|
43
|
+
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|