datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
#include <var_opt_sketch.hpp>
|
|
4
4
|
|
|
5
|
-
#include
|
|
6
|
-
#include <rice/Constructor.hpp>
|
|
7
|
-
#include <rice/Module.hpp>
|
|
5
|
+
#include "ext.h"
|
|
8
6
|
|
|
9
7
|
using datasketches::var_opt_sketch;
|
|
10
8
|
|
|
@@ -19,7 +17,7 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
|
|
|
19
17
|
.define_method("reset", &var_opt_sketch<T>::reset)
|
|
20
18
|
.define_method(
|
|
21
19
|
"samples",
|
|
22
|
-
|
|
20
|
+
[](var_opt_sketch<T>& self) {
|
|
23
21
|
auto a = Rice::Array();
|
|
24
22
|
for (auto item : self) {
|
|
25
23
|
auto t = Rice::Array();
|
|
@@ -31,9 +29,9 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
|
|
|
31
29
|
})
|
|
32
30
|
.define_method(
|
|
33
31
|
"update",
|
|
34
|
-
|
|
32
|
+
[](var_opt_sketch<T>& self, const T item) {
|
|
35
33
|
self.update(item);
|
|
36
|
-
});
|
|
34
|
+
}, Rice::Arg("item").keepAlive());
|
|
37
35
|
}
|
|
38
36
|
|
|
39
37
|
void init_vo(Rice::Module& m) {
|
data/lib/datasketches/version.rb
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
# DataSketches Core C++ Library Component
|
|
2
|
-
This is the core C++ component of the DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
|
|
1
|
+
# Apache DataSketches Core C++ Library Component
|
|
2
|
+
This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
|
|
3
3
|
|
|
4
4
|
This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
|
|
5
5
|
|
|
6
6
|
Note that we have a parallel core component for Java implementations of the same sketch algorithms,
|
|
7
7
|
[datasketches-java](https://github.com/apache/datasketches-java).
|
|
8
8
|
|
|
9
|
-
Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
|
|
9
|
+
Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
|
|
10
10
|
|
|
11
11
|
If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
|
|
12
12
|
|
|
13
13
|
---
|
|
14
14
|
|
|
15
|
-
This code requires C++11.
|
|
15
|
+
This code requires C++11.
|
|
16
16
|
|
|
17
17
|
This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
|
|
18
18
|
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// * Changed input seed in MurmurHash3_x64_128 to uint64_t
|
|
4
4
|
// * Define and use HashState reference to return result
|
|
5
5
|
// * Made entire hash function defined inline
|
|
6
|
+
// * Added compute_seed_hash
|
|
6
7
|
//-----------------------------------------------------------------------------
|
|
7
8
|
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
|
8
9
|
// domain. The author hereby disclaims copyright to this source code.
|
|
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
|
|
|
170
171
|
|
|
171
172
|
//-----------------------------------------------------------------------------
|
|
172
173
|
|
|
174
|
+
FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
|
|
175
|
+
HashState hashes;
|
|
176
|
+
MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
|
|
177
|
+
return static_cast<uint16_t>(hashes.h1 & 0xffff);
|
|
178
|
+
}
|
|
179
|
+
|
|
173
180
|
#endif // _MURMURHASH3_H_
|
|
@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
|
|
|
52
52
|
return size;
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
template<typename T>
|
|
56
|
+
static inline size_t copy_to_mem(const T& item, void* dst) {
|
|
57
|
+
memcpy(dst, &item, sizeof(T));
|
|
58
|
+
return sizeof(T);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
template<typename T>
|
|
62
|
+
static inline size_t copy_from_mem(const void* src, T& item) {
|
|
63
|
+
memcpy(&item, src, sizeof(T));
|
|
64
|
+
return sizeof(T);
|
|
65
|
+
}
|
|
66
|
+
|
|
55
67
|
} // namespace
|
|
56
68
|
|
|
57
69
|
#endif // _MEMORY_OPERATIONS_HPP_
|
|
@@ -15,6 +15,10 @@
|
|
|
15
15
|
# specific language governing permissions and limitations
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
|
+
# two parts here, the common test code for other parts to use,
|
|
19
|
+
# and an integration test using the other parts of the library.
|
|
20
|
+
|
|
21
|
+
# common dependencies for tests
|
|
18
22
|
add_library(common_test OBJECT "")
|
|
19
23
|
|
|
20
24
|
set_target_properties(common_test PROPERTIES
|
|
@@ -36,3 +40,23 @@ target_sources(common_test
|
|
|
36
40
|
${CMAKE_CURRENT_SOURCE_DIR}/catch_runner.cpp
|
|
37
41
|
${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
|
|
38
42
|
)
|
|
43
|
+
|
|
44
|
+
# now the integration test part
|
|
45
|
+
add_executable(integration_test)
|
|
46
|
+
|
|
47
|
+
target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test)
|
|
48
|
+
|
|
49
|
+
set_target_properties(integration_test PROPERTIES
|
|
50
|
+
CXX_STANDARD 11
|
|
51
|
+
CXX_STANDARD_REQUIRED YES
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
add_test(
|
|
55
|
+
NAME integration_test
|
|
56
|
+
COMMAND integration_test
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
target_sources(integration_test
|
|
60
|
+
PRIVATE
|
|
61
|
+
integration_test.cpp
|
|
62
|
+
)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
|
|
22
|
+
#include "cpc_sketch.hpp"
|
|
23
|
+
#include "cpc_union.hpp"
|
|
24
|
+
#include "frequent_items_sketch.hpp"
|
|
25
|
+
#include "hll.hpp"
|
|
26
|
+
#include "kll_sketch.hpp"
|
|
27
|
+
#include "req_sketch.hpp"
|
|
28
|
+
#include "var_opt_sketch.hpp"
|
|
29
|
+
#include "var_opt_union.hpp"
|
|
30
|
+
#include "theta_sketch.hpp"
|
|
31
|
+
#include "theta_union.hpp"
|
|
32
|
+
#include "theta_intersection.hpp"
|
|
33
|
+
#include "theta_a_not_b.hpp"
|
|
34
|
+
#include "tuple_sketch.hpp"
|
|
35
|
+
#include "tuple_union.hpp"
|
|
36
|
+
#include "tuple_intersection.hpp"
|
|
37
|
+
#include "tuple_a_not_b.hpp"
|
|
38
|
+
|
|
39
|
+
namespace datasketches {
|
|
40
|
+
|
|
41
|
+
template<typename Summary>
|
|
42
|
+
struct subtracting_intersection_policy {
|
|
43
|
+
void operator()(Summary& summary, const Summary& other) const {
|
|
44
|
+
summary -= other;
|
|
45
|
+
}
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
using tuple_intersection_float = tuple_intersection<float, subtracting_intersection_policy<float>>;
|
|
49
|
+
|
|
50
|
+
TEST_CASE("integration: declare all sketches", "[integration]") {
|
|
51
|
+
cpc_sketch cpc(12);
|
|
52
|
+
cpc_union cpc_u(12);
|
|
53
|
+
|
|
54
|
+
frequent_items_sketch<std::string> fi(100);
|
|
55
|
+
|
|
56
|
+
hll_sketch hll(13);
|
|
57
|
+
hll_union hll_u(13);
|
|
58
|
+
|
|
59
|
+
kll_sketch<double> kll(200);
|
|
60
|
+
|
|
61
|
+
req_sketch<double> req(12);
|
|
62
|
+
|
|
63
|
+
var_opt_sketch<std::string> vo(100);
|
|
64
|
+
var_opt_union<std::string> vo_u(100);
|
|
65
|
+
|
|
66
|
+
update_theta_sketch theta = update_theta_sketch::builder().build();
|
|
67
|
+
theta_union theta_u = theta_union::builder().build();
|
|
68
|
+
theta_intersection theta_i;
|
|
69
|
+
theta_a_not_b theta_anb;
|
|
70
|
+
|
|
71
|
+
auto tuple = update_tuple_sketch<float>::builder().build();
|
|
72
|
+
auto tuple_u = tuple_union<float>::builder().build();
|
|
73
|
+
tuple_intersection_float tuple_i;
|
|
74
|
+
tuple_a_not_b<float> tuple_anb;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
} /* namespace datasketches */
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
|
|
23
23
|
#include <new>
|
|
24
24
|
#include <utility>
|
|
25
|
+
#include <stdexcept>
|
|
25
26
|
|
|
26
27
|
// this allocator keeps the total allocated size in a global variable for testing
|
|
27
28
|
|
|
@@ -43,7 +44,14 @@ public:
|
|
|
43
44
|
template <class U>
|
|
44
45
|
struct rebind { typedef test_allocator<U> other; };
|
|
45
46
|
|
|
46
|
-
|
|
47
|
+
// this is to test that a given instance of an allocator is used instead of instantiating
|
|
48
|
+
static const bool DISALLOW_DEFAULT_CONSTRUCTOR = true;
|
|
49
|
+
test_allocator() {
|
|
50
|
+
if (DISALLOW_DEFAULT_CONSTRUCTOR) throw std::runtime_error("test_allocator: default constructor");
|
|
51
|
+
}
|
|
52
|
+
// call this constructor in tests and pass an allocator instance
|
|
53
|
+
test_allocator(int) {}
|
|
54
|
+
|
|
47
55
|
test_allocator(const test_allocator&) {}
|
|
48
56
|
template <class U>
|
|
49
57
|
test_allocator(const test_allocator<U>&) {}
|
|
@@ -44,6 +44,8 @@ template<typename A> class u32_table;
|
|
|
44
44
|
|
|
45
45
|
template<typename A>
|
|
46
46
|
struct compressed_state {
|
|
47
|
+
explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
|
|
48
|
+
window_data(allocator), window_data_words(0) {}
|
|
47
49
|
vector_u32<A> table_data;
|
|
48
50
|
uint32_t table_data_words;
|
|
49
51
|
uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
|
|
@@ -53,6 +55,7 @@ struct compressed_state {
|
|
|
53
55
|
|
|
54
56
|
template<typename A>
|
|
55
57
|
struct uncompressed_state {
|
|
58
|
+
explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
|
|
56
59
|
u32_table<A> table;
|
|
57
60
|
vector_u8<A> window;
|
|
58
61
|
};
|
|
@@ -129,14 +129,14 @@ private:
|
|
|
129
129
|
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
|
130
130
|
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
|
|
131
131
|
|
|
132
|
-
vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
|
|
132
|
+
vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
|
|
133
133
|
void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
|
|
134
134
|
|
|
135
135
|
static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
|
|
136
136
|
static size_t safe_length_for_compressed_window_buf(uint64_t k);
|
|
137
137
|
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
|
|
138
138
|
|
|
139
|
-
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
|
|
139
|
+
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
|
|
140
140
|
static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
|
|
141
141
|
};
|
|
142
142
|
|
|
@@ -160,7 +160,7 @@ template<typename A>
|
|
|
160
160
|
void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
|
|
161
161
|
switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
|
|
162
162
|
case cpc_sketch_alloc<A>::flavor::EMPTY:
|
|
163
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
|
163
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
|
164
164
|
break;
|
|
165
165
|
case cpc_sketch_alloc<A>::flavor::SPARSE:
|
|
166
166
|
uncompress_sparse_flavor(source, target, lg_k);
|
|
@@ -191,8 +191,9 @@ template<typename A>
|
|
|
191
191
|
void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
|
192
192
|
if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
|
|
193
193
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
194
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
195
|
-
|
|
194
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
195
|
+
lg_k, source.table_data.get_allocator());
|
|
196
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
|
|
196
197
|
}
|
|
197
198
|
|
|
198
199
|
// This is complicated because it effectively builds a Sparse version
|
|
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
|
|
|
206
207
|
if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
|
|
207
208
|
const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
|
|
208
209
|
|
|
209
|
-
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
|
|
210
|
+
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
|
|
210
211
|
|
|
211
212
|
u32_table<A>::merge(
|
|
212
213
|
pairs_from_table.data(), 0, pairs_from_table.size(),
|
|
@@ -221,7 +222,8 @@ template<typename A>
|
|
|
221
222
|
void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
|
222
223
|
if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
|
|
223
224
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
224
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
225
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
226
|
+
lg_k, source.table_data.get_allocator());
|
|
225
227
|
|
|
226
228
|
// In the hybrid flavor, some of these pairs actually
|
|
227
229
|
// belong in the window, so we will separate them out,
|
|
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
|
|
|
240
242
|
pairs[next_true_pair++] = row_col; // move true pair down
|
|
241
243
|
}
|
|
242
244
|
}
|
|
243
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
|
|
245
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
|
|
244
246
|
}
|
|
245
247
|
|
|
246
248
|
template<typename A>
|
|
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
|
|
|
264
266
|
}
|
|
265
267
|
|
|
266
268
|
template<typename A>
|
|
267
|
-
void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
|
269
|
+
void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
|
270
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
|
268
271
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
|
269
272
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
|
270
273
|
const size_t num_pairs = source.table_num_entries;
|
|
271
274
|
if (num_pairs == 0) {
|
|
272
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
|
275
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
|
273
276
|
} else {
|
|
274
277
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
275
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
278
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
279
|
+
lg_k, source.table_data.get_allocator());
|
|
276
280
|
// undo the compressor's 8-column shift
|
|
277
281
|
for (size_t i = 0; i < num_pairs; i++) {
|
|
278
282
|
if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
|
|
279
283
|
pairs[i] += 8;
|
|
280
284
|
}
|
|
281
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
|
|
285
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
|
|
282
286
|
}
|
|
283
287
|
}
|
|
284
288
|
|
|
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
|
|
|
314
318
|
}
|
|
315
319
|
|
|
316
320
|
template<typename A>
|
|
317
|
-
void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
|
321
|
+
void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
|
322
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
|
318
323
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
|
319
324
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
|
320
325
|
const size_t num_pairs = source.table_num_entries;
|
|
321
326
|
if (num_pairs == 0) {
|
|
322
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
|
327
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
|
323
328
|
} else {
|
|
324
329
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
325
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
330
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
331
|
+
lg_k, source.table_data.get_allocator());
|
|
326
332
|
|
|
327
333
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
|
328
334
|
if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
|
|
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
|
342
348
|
pairs[i] = (row << 6) | col;
|
|
343
349
|
}
|
|
344
350
|
|
|
345
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
|
|
351
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
|
|
346
352
|
}
|
|
347
353
|
}
|
|
348
354
|
|
|
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
|
|
|
364
370
|
}
|
|
365
371
|
|
|
366
372
|
template<typename A>
|
|
367
|
-
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
|
|
373
|
+
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
|
|
374
|
+
uint8_t lg_k, const A& allocator) const {
|
|
368
375
|
const size_t k = 1 << lg_k;
|
|
369
|
-
vector_u32<A> pairs(num_pairs);
|
|
376
|
+
vector_u32<A> pairs(num_pairs, 0, allocator);
|
|
370
377
|
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
|
371
378
|
low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
|
|
372
379
|
return pairs;
|
|
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
|
|
|
388
395
|
}
|
|
389
396
|
|
|
390
397
|
template<typename A>
|
|
391
|
-
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
|
|
398
|
+
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
|
|
399
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
|
392
400
|
const size_t k = 1 << lg_k;
|
|
393
401
|
window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
|
|
394
402
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
|
@@ -710,9 +718,10 @@ void write_unary(
|
|
|
710
718
|
// The empty space that this leaves at the beginning of the output array
|
|
711
719
|
// will be filled in later by the caller.
|
|
712
720
|
template<typename A>
|
|
713
|
-
vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
|
|
721
|
+
vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
|
|
722
|
+
uint32_t empty_space, const A& allocator) {
|
|
714
723
|
const size_t output_length = empty_space + num_pairs_to_get;
|
|
715
|
-
vector_u32<A> pairs(output_length);
|
|
724
|
+
vector_u32<A> pairs(output_length, 0, allocator);
|
|
716
725
|
size_t pair_index = empty_space;
|
|
717
726
|
for (unsigned row_index = 0; row_index < k; row_index++) {
|
|
718
727
|
uint8_t byte = window[row_index];
|
|
@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
|
|
|
49
49
|
template<typename A> class cpc_union_alloc;
|
|
50
50
|
|
|
51
51
|
// alias with default allocator for convenience
|
|
52
|
-
|
|
52
|
+
using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
|
|
53
53
|
|
|
54
54
|
// allocation and initialization of global decompression (decoding) tables
|
|
55
55
|
// call this before anything else if you want to control the initialization time
|
|
@@ -67,7 +67,10 @@ public:
|
|
|
67
67
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
|
68
68
|
* @param seed for hash function
|
|
69
69
|
*/
|
|
70
|
-
explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
|
|
70
|
+
explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
71
|
+
|
|
72
|
+
using allocator_type = A;
|
|
73
|
+
A get_allocator() const;
|
|
71
74
|
|
|
72
75
|
/**
|
|
73
76
|
* @return configured lg_k of this sketch
|
|
@@ -204,7 +207,7 @@ public:
|
|
|
204
207
|
|
|
205
208
|
// This is a convenience alias for users
|
|
206
209
|
// The type returned by the following serialize method
|
|
207
|
-
|
|
210
|
+
using vector_bytes = vector_u8<A>;
|
|
208
211
|
|
|
209
212
|
/**
|
|
210
213
|
* This method serializes the sketch as a vector of bytes.
|
|
@@ -221,7 +224,7 @@ public:
|
|
|
221
224
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
222
225
|
* @return an instance of a sketch
|
|
223
226
|
*/
|
|
224
|
-
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
|
227
|
+
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
225
228
|
|
|
226
229
|
/**
|
|
227
230
|
* This method deserializes a sketch from a given array of bytes.
|
|
@@ -230,7 +233,7 @@ public:
|
|
|
230
233
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
231
234
|
* @return an instance of the sketch
|
|
232
235
|
*/
|
|
233
|
-
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
|
236
|
+
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
234
237
|
|
|
235
238
|
// for internal use
|
|
236
239
|
uint32_t get_num_coupons() const;
|