datasketches 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -2,9 +2,7 @@
|
|
2
2
|
|
3
3
|
#include <var_opt_sketch.hpp>
|
4
4
|
|
5
|
-
#include <rice/Array.hpp>
|
6
|
-
#include <rice/Constructor.hpp>
|
7
|
-
#include <rice/Module.hpp>
|
5
|
+
#include "ext.h"
|
8
6
|
|
9
7
|
using datasketches::var_opt_sketch;
|
10
8
|
|
@@ -19,7 +17,7 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
|
|
19
17
|
.define_method("reset", &var_opt_sketch<T>::reset)
|
20
18
|
.define_method(
|
21
19
|
"samples",
|
22
|
-
|
20
|
+
[](var_opt_sketch<T>& self) {
|
23
21
|
auto a = Rice::Array();
|
24
22
|
for (auto item : self) {
|
25
23
|
auto t = Rice::Array();
|
@@ -31,9 +29,9 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
|
|
31
29
|
})
|
32
30
|
.define_method(
|
33
31
|
"update",
|
34
|
-
|
32
|
+
[](var_opt_sketch<T>& self, const T item) {
|
35
33
|
self.update(item);
|
36
|
-
});
|
34
|
+
}, Rice::Arg("item").keepAlive());
|
37
35
|
}
|
38
36
|
|
39
37
|
void init_vo(Rice::Module& m) {
|
data/lib/datasketches/version.rb
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
# DataSketches Core C++ Library Component
|
2
|
-
This is the core C++ component of the DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
|
1
|
+
# Apache DataSketches Core C++ Library Component
|
2
|
+
This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
|
3
3
|
|
4
4
|
This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
|
5
5
|
|
6
6
|
Note that we have a parallel core component for Java implementations of the same sketch algorithms,
|
7
7
|
[datasketches-java](https://github.com/apache/datasketches-java).
|
8
8
|
|
9
|
-
Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
|
9
|
+
Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
|
10
10
|
|
11
11
|
If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
|
12
12
|
|
13
13
|
---
|
14
14
|
|
15
|
-
This code requires C++11.
|
15
|
+
This code requires C++11.
|
16
16
|
|
17
17
|
This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
|
18
18
|
|
@@ -3,6 +3,7 @@
|
|
3
3
|
// * Changed input seed in MurmurHash3_x64_128 to uint64_t
|
4
4
|
// * Define and use HashState reference to return result
|
5
5
|
// * Made entire hash function defined inline
|
6
|
+
// * Added compute_seed_hash
|
6
7
|
//-----------------------------------------------------------------------------
|
7
8
|
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
8
9
|
// domain. The author hereby disclaims copyright to this source code.
|
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
|
|
170
171
|
|
171
172
|
//-----------------------------------------------------------------------------
|
172
173
|
|
174
|
+
FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
|
175
|
+
HashState hashes;
|
176
|
+
MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
|
177
|
+
return static_cast<uint16_t>(hashes.h1 & 0xffff);
|
178
|
+
}
|
179
|
+
|
173
180
|
#endif // _MURMURHASH3_H_
|
@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
|
|
52
52
|
return size;
|
53
53
|
}
|
54
54
|
|
55
|
+
template<typename T>
|
56
|
+
static inline size_t copy_to_mem(const T& item, void* dst) {
|
57
|
+
memcpy(dst, &item, sizeof(T));
|
58
|
+
return sizeof(T);
|
59
|
+
}
|
60
|
+
|
61
|
+
template<typename T>
|
62
|
+
static inline size_t copy_from_mem(const void* src, T& item) {
|
63
|
+
memcpy(&item, src, sizeof(T));
|
64
|
+
return sizeof(T);
|
65
|
+
}
|
66
|
+
|
55
67
|
} // namespace
|
56
68
|
|
57
69
|
#endif // _MEMORY_OPERATIONS_HPP_
|
@@ -15,6 +15,10 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
+
# two parts here, the common test code for other parts to use,
|
19
|
+
# and an integration test using the other parts of the library.
|
20
|
+
|
21
|
+
# common dependencies for tests
|
18
22
|
add_library(common_test OBJECT "")
|
19
23
|
|
20
24
|
set_target_properties(common_test PROPERTIES
|
@@ -36,3 +40,23 @@ target_sources(common_test
|
|
36
40
|
${CMAKE_CURRENT_SOURCE_DIR}/catch_runner.cpp
|
37
41
|
${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
|
38
42
|
)
|
43
|
+
|
44
|
+
# now the integration test part
|
45
|
+
add_executable(integration_test)
|
46
|
+
|
47
|
+
target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test)
|
48
|
+
|
49
|
+
set_target_properties(integration_test PROPERTIES
|
50
|
+
CXX_STANDARD 11
|
51
|
+
CXX_STANDARD_REQUIRED YES
|
52
|
+
)
|
53
|
+
|
54
|
+
add_test(
|
55
|
+
NAME integration_test
|
56
|
+
COMMAND integration_test
|
57
|
+
)
|
58
|
+
|
59
|
+
target_sources(integration_test
|
60
|
+
PRIVATE
|
61
|
+
integration_test.cpp
|
62
|
+
)
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch.hpp>
|
21
|
+
|
22
|
+
#include "cpc_sketch.hpp"
|
23
|
+
#include "cpc_union.hpp"
|
24
|
+
#include "frequent_items_sketch.hpp"
|
25
|
+
#include "hll.hpp"
|
26
|
+
#include "kll_sketch.hpp"
|
27
|
+
#include "req_sketch.hpp"
|
28
|
+
#include "var_opt_sketch.hpp"
|
29
|
+
#include "var_opt_union.hpp"
|
30
|
+
#include "theta_sketch.hpp"
|
31
|
+
#include "theta_union.hpp"
|
32
|
+
#include "theta_intersection.hpp"
|
33
|
+
#include "theta_a_not_b.hpp"
|
34
|
+
#include "tuple_sketch.hpp"
|
35
|
+
#include "tuple_union.hpp"
|
36
|
+
#include "tuple_intersection.hpp"
|
37
|
+
#include "tuple_a_not_b.hpp"
|
38
|
+
|
39
|
+
namespace datasketches {
|
40
|
+
|
41
|
+
template<typename Summary>
|
42
|
+
struct subtracting_intersection_policy {
|
43
|
+
void operator()(Summary& summary, const Summary& other) const {
|
44
|
+
summary -= other;
|
45
|
+
}
|
46
|
+
};
|
47
|
+
|
48
|
+
using tuple_intersection_float = tuple_intersection<float, subtracting_intersection_policy<float>>;
|
49
|
+
|
50
|
+
TEST_CASE("integration: declare all sketches", "[integration]") {
|
51
|
+
cpc_sketch cpc(12);
|
52
|
+
cpc_union cpc_u(12);
|
53
|
+
|
54
|
+
frequent_items_sketch<std::string> fi(100);
|
55
|
+
|
56
|
+
hll_sketch hll(13);
|
57
|
+
hll_union hll_u(13);
|
58
|
+
|
59
|
+
kll_sketch<double> kll(200);
|
60
|
+
|
61
|
+
req_sketch<double> req(12);
|
62
|
+
|
63
|
+
var_opt_sketch<std::string> vo(100);
|
64
|
+
var_opt_union<std::string> vo_u(100);
|
65
|
+
|
66
|
+
update_theta_sketch theta = update_theta_sketch::builder().build();
|
67
|
+
theta_union theta_u = theta_union::builder().build();
|
68
|
+
theta_intersection theta_i;
|
69
|
+
theta_a_not_b theta_anb;
|
70
|
+
|
71
|
+
auto tuple = update_tuple_sketch<float>::builder().build();
|
72
|
+
auto tuple_u = tuple_union<float>::builder().build();
|
73
|
+
tuple_intersection_float tuple_i;
|
74
|
+
tuple_a_not_b<float> tuple_anb;
|
75
|
+
}
|
76
|
+
|
77
|
+
} /* namespace datasketches */
|
@@ -22,6 +22,7 @@
|
|
22
22
|
|
23
23
|
#include <new>
|
24
24
|
#include <utility>
|
25
|
+
#include <stdexcept>
|
25
26
|
|
26
27
|
// this allocator keeps the total allocated size in a global variable for testing
|
27
28
|
|
@@ -43,7 +44,14 @@ public:
|
|
43
44
|
template <class U>
|
44
45
|
struct rebind { typedef test_allocator<U> other; };
|
45
46
|
|
46
|
-
|
47
|
+
// this is to test that a given instance of an allocator is used instead of instantiating
|
48
|
+
static const bool DISALLOW_DEFAULT_CONSTRUCTOR = true;
|
49
|
+
test_allocator() {
|
50
|
+
if (DISALLOW_DEFAULT_CONSTRUCTOR) throw std::runtime_error("test_allocator: default constructor");
|
51
|
+
}
|
52
|
+
// call this constructor in tests and pass an allocator instance
|
53
|
+
test_allocator(int) {}
|
54
|
+
|
47
55
|
test_allocator(const test_allocator&) {}
|
48
56
|
template <class U>
|
49
57
|
test_allocator(const test_allocator<U>&) {}
|
@@ -44,6 +44,8 @@ template<typename A> class u32_table;
|
|
44
44
|
|
45
45
|
template<typename A>
|
46
46
|
struct compressed_state {
|
47
|
+
explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
|
48
|
+
window_data(allocator), window_data_words(0) {}
|
47
49
|
vector_u32<A> table_data;
|
48
50
|
uint32_t table_data_words;
|
49
51
|
uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
|
@@ -53,6 +55,7 @@ struct compressed_state {
|
|
53
55
|
|
54
56
|
template<typename A>
|
55
57
|
struct uncompressed_state {
|
58
|
+
explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
|
56
59
|
u32_table<A> table;
|
57
60
|
vector_u8<A> window;
|
58
61
|
};
|
@@ -129,14 +129,14 @@ private:
|
|
129
129
|
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
130
130
|
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
|
131
131
|
|
132
|
-
vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
|
132
|
+
vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
|
133
133
|
void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
|
134
134
|
|
135
135
|
static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
|
136
136
|
static size_t safe_length_for_compressed_window_buf(uint64_t k);
|
137
137
|
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
|
138
138
|
|
139
|
-
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
|
139
|
+
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
|
140
140
|
static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
|
141
141
|
};
|
142
142
|
|
@@ -160,7 +160,7 @@ template<typename A>
|
|
160
160
|
void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
|
161
161
|
switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
|
162
162
|
case cpc_sketch_alloc<A>::flavor::EMPTY:
|
163
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
163
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
164
164
|
break;
|
165
165
|
case cpc_sketch_alloc<A>::flavor::SPARSE:
|
166
166
|
uncompress_sparse_flavor(source, target, lg_k);
|
@@ -191,8 +191,9 @@ template<typename A>
|
|
191
191
|
void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
192
192
|
if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
|
193
193
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
194
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
195
|
-
|
194
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
195
|
+
lg_k, source.table_data.get_allocator());
|
196
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
|
196
197
|
}
|
197
198
|
|
198
199
|
// This is complicated because it effectively builds a Sparse version
|
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
|
|
206
207
|
if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
|
207
208
|
const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
|
208
209
|
|
209
|
-
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
|
210
|
+
vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
|
210
211
|
|
211
212
|
u32_table<A>::merge(
|
212
213
|
pairs_from_table.data(), 0, pairs_from_table.size(),
|
@@ -221,7 +222,8 @@ template<typename A>
|
|
221
222
|
void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
222
223
|
if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
|
223
224
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
224
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
225
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
226
|
+
lg_k, source.table_data.get_allocator());
|
225
227
|
|
226
228
|
// In the hybrid flavor, some of these pairs actually
|
227
229
|
// belong in the window, so we will separate them out,
|
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
|
|
240
242
|
pairs[next_true_pair++] = row_col; // move true pair down
|
241
243
|
}
|
242
244
|
}
|
243
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
|
245
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
|
244
246
|
}
|
245
247
|
|
246
248
|
template<typename A>
|
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
|
|
264
266
|
}
|
265
267
|
|
266
268
|
template<typename A>
|
267
|
-
void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
269
|
+
void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
270
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
268
271
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
269
272
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
270
273
|
const size_t num_pairs = source.table_num_entries;
|
271
274
|
if (num_pairs == 0) {
|
272
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
275
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
273
276
|
} else {
|
274
277
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
275
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
278
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
279
|
+
lg_k, source.table_data.get_allocator());
|
276
280
|
// undo the compressor's 8-column shift
|
277
281
|
for (size_t i = 0; i < num_pairs; i++) {
|
278
282
|
if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
|
279
283
|
pairs[i] += 8;
|
280
284
|
}
|
281
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
|
285
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
|
282
286
|
}
|
283
287
|
}
|
284
288
|
|
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
|
|
314
318
|
}
|
315
319
|
|
316
320
|
template<typename A>
|
317
|
-
void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
321
|
+
void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
|
322
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
318
323
|
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
|
319
324
|
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
|
320
325
|
const size_t num_pairs = source.table_num_entries;
|
321
326
|
if (num_pairs == 0) {
|
322
|
-
target.table = u32_table<A>(2, 6 + lg_k);
|
327
|
+
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
323
328
|
} else {
|
324
329
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
325
|
-
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
330
|
+
vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
331
|
+
lg_k, source.table_data.get_allocator());
|
326
332
|
|
327
333
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
328
334
|
if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
|
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
342
348
|
pairs[i] = (row << 6) | col;
|
343
349
|
}
|
344
350
|
|
345
|
-
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
|
351
|
+
target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
|
346
352
|
}
|
347
353
|
}
|
348
354
|
|
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
|
|
364
370
|
}
|
365
371
|
|
366
372
|
template<typename A>
|
367
|
-
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
|
373
|
+
vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
|
374
|
+
uint8_t lg_k, const A& allocator) const {
|
368
375
|
const size_t k = 1 << lg_k;
|
369
|
-
vector_u32<A> pairs(num_pairs);
|
376
|
+
vector_u32<A> pairs(num_pairs, 0, allocator);
|
370
377
|
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
371
378
|
low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
|
372
379
|
return pairs;
|
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
|
|
388
395
|
}
|
389
396
|
|
390
397
|
template<typename A>
|
391
|
-
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
|
398
|
+
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
|
399
|
+
uint8_t lg_k, uint32_t num_coupons) const {
|
392
400
|
const size_t k = 1 << lg_k;
|
393
401
|
window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
|
394
402
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
@@ -710,9 +718,10 @@ void write_unary(
|
|
710
718
|
// The empty space that this leaves at the beginning of the output array
|
711
719
|
// will be filled in later by the caller.
|
712
720
|
template<typename A>
|
713
|
-
vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
|
721
|
+
vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
|
722
|
+
uint32_t empty_space, const A& allocator) {
|
714
723
|
const size_t output_length = empty_space + num_pairs_to_get;
|
715
|
-
vector_u32<A> pairs(output_length);
|
724
|
+
vector_u32<A> pairs(output_length, 0, allocator);
|
716
725
|
size_t pair_index = empty_space;
|
717
726
|
for (unsigned row_index = 0; row_index < k; row_index++) {
|
718
727
|
uint8_t byte = window[row_index];
|
@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
|
|
49
49
|
template<typename A> class cpc_union_alloc;
|
50
50
|
|
51
51
|
// alias with default allocator for convenience
|
52
|
-
|
52
|
+
using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
|
53
53
|
|
54
54
|
// allocation and initialization of global decompression (decoding) tables
|
55
55
|
// call this before anything else if you want to control the initialization time
|
@@ -67,7 +67,10 @@ public:
|
|
67
67
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
68
68
|
* @param seed for hash function
|
69
69
|
*/
|
70
|
-
explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
|
70
|
+
explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
71
|
+
|
72
|
+
using allocator_type = A;
|
73
|
+
A get_allocator() const;
|
71
74
|
|
72
75
|
/**
|
73
76
|
* @return configured lg_k of this sketch
|
@@ -204,7 +207,7 @@ public:
|
|
204
207
|
|
205
208
|
// This is a convenience alias for users
|
206
209
|
// The type returned by the following serialize method
|
207
|
-
|
210
|
+
using vector_bytes = vector_u8<A>;
|
208
211
|
|
209
212
|
/**
|
210
213
|
* This method serializes the sketch as a vector of bytes.
|
@@ -221,7 +224,7 @@ public:
|
|
221
224
|
* @param seed the seed for the hash function that was used to create the sketch
|
222
225
|
* @return an instance of a sketch
|
223
226
|
*/
|
224
|
-
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
227
|
+
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
225
228
|
|
226
229
|
/**
|
227
230
|
* This method deserializes a sketch from a given array of bytes.
|
@@ -230,7 +233,7 @@ public:
|
|
230
233
|
* @param seed the seed for the hash function that was used to create the sketch
|
231
234
|
* @return an instance of the sketch
|
232
235
|
*/
|
233
|
-
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
236
|
+
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
234
237
|
|
235
238
|
// for internal use
|
236
239
|
uint32_t get_num_coupons() const;
|