datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -51,7 +51,7 @@ class var_opt_union {
|
|
|
51
51
|
public:
|
|
52
52
|
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
|
53
53
|
|
|
54
|
-
explicit var_opt_union(uint32_t max_k);
|
|
54
|
+
explicit var_opt_union(uint32_t max_k, const A& allocator = A());
|
|
55
55
|
var_opt_union(const var_opt_union& other);
|
|
56
56
|
var_opt_union(var_opt_union&& other) noexcept;
|
|
57
57
|
|
|
@@ -119,16 +119,16 @@ public:
|
|
|
119
119
|
* @param is input stream
|
|
120
120
|
* @return an instance of a union
|
|
121
121
|
*/
|
|
122
|
-
static var_opt_union deserialize(std::istream& is);
|
|
122
|
+
static var_opt_union deserialize(std::istream& is, const A& allocator = A());
|
|
123
123
|
|
|
124
124
|
/**
|
|
125
125
|
* NOTE: This method may be deprecated in a future version.
|
|
126
|
-
* This method deserializes a
|
|
126
|
+
* This method deserializes a union from a given array of bytes.
|
|
127
127
|
* @param bytes pointer to the array of bytes
|
|
128
128
|
* @param size the size of the array
|
|
129
129
|
* @return an instance of a union
|
|
130
130
|
*/
|
|
131
|
-
static var_opt_union deserialize(const void* bytes, size_t size);
|
|
131
|
+
static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
|
|
132
132
|
|
|
133
133
|
/**
|
|
134
134
|
* Prints a summary of the union as a string.
|
|
@@ -236,4 +236,4 @@ private:
|
|
|
236
236
|
|
|
237
237
|
#include "var_opt_union_impl.hpp"
|
|
238
238
|
|
|
239
|
-
#endif // _VAR_OPT_UNION_HPP_
|
|
239
|
+
#endif // _VAR_OPT_UNION_HPP_
|
|
@@ -28,12 +28,12 @@
|
|
|
28
28
|
namespace datasketches {
|
|
29
29
|
|
|
30
30
|
template<typename T, typename S, typename A>
|
|
31
|
-
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
|
|
31
|
+
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
32
32
|
n_(0),
|
|
33
33
|
outer_tau_numer_(0),
|
|
34
34
|
outer_tau_denom_(0.0),
|
|
35
35
|
max_k_(max_k),
|
|
36
|
-
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
|
|
36
|
+
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
|
37
37
|
{}
|
|
38
38
|
|
|
39
39
|
template<typename T, typename S, typename A>
|
|
@@ -128,7 +128,7 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
|
128
128
|
*/
|
|
129
129
|
|
|
130
130
|
template<typename T, typename S, typename A>
|
|
131
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
131
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
132
132
|
uint8_t preamble_longs;
|
|
133
133
|
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
134
134
|
uint8_t serial_version;
|
|
@@ -163,7 +163,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
|
163
163
|
uint64_t outer_tau_denom;
|
|
164
164
|
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
|
165
165
|
|
|
166
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
|
|
166
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
|
167
167
|
|
|
168
168
|
if (!is.good())
|
|
169
169
|
throw std::runtime_error("error reading from std::istream");
|
|
@@ -172,7 +172,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
|
172
172
|
}
|
|
173
173
|
|
|
174
174
|
template<typename T, typename S, typename A>
|
|
175
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
|
|
175
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
176
176
|
ensure_minimum_memory(size, 8);
|
|
177
177
|
const char* ptr = static_cast<const char*>(bytes);
|
|
178
178
|
uint8_t preamble_longs;
|
|
@@ -207,7 +207,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
207
207
|
ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
|
|
208
208
|
|
|
209
209
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
|
210
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
|
|
210
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
|
211
211
|
|
|
212
212
|
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
|
213
213
|
}
|
|
@@ -255,7 +255,7 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
|
255
255
|
template<typename T, typename S, typename A>
|
|
256
256
|
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
|
|
257
257
|
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
|
258
|
-
std::vector<uint8_t, AllocU8<A>> bytes(size);
|
|
258
|
+
std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
|
|
259
259
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
260
260
|
|
|
261
261
|
const bool empty = n_ == 0;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <var_opt_sketch.hpp>
|
|
21
|
+
#include <var_opt_union.hpp>
|
|
22
|
+
#include <test_type.hpp>
|
|
23
|
+
#include <test_allocator.hpp>
|
|
24
|
+
|
|
25
|
+
#include <catch.hpp>
|
|
26
|
+
|
|
27
|
+
#include <sstream>
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
using var_opt_test_sketch = var_opt_sketch<test_type, test_type_serde, test_allocator<test_type>>;
|
|
32
|
+
using var_opt_test_union = var_opt_union<test_type, test_type_serde, test_allocator<test_type>>;
|
|
33
|
+
using alloc = test_allocator<test_type>;
|
|
34
|
+
|
|
35
|
+
TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
|
36
|
+
test_allocator_total_bytes = 0;
|
|
37
|
+
test_allocator_net_allocations = 0;
|
|
38
|
+
{
|
|
39
|
+
var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
|
40
|
+
for (int i = 0; i < 100; ++i) sk1.update(i);
|
|
41
|
+
auto bytes1 = sk1.serialize();
|
|
42
|
+
auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
|
|
43
|
+
|
|
44
|
+
std::stringstream ss;
|
|
45
|
+
sk1.serialize(ss);
|
|
46
|
+
auto sk3 = var_opt_test_sketch::deserialize(ss, alloc(0));
|
|
47
|
+
|
|
48
|
+
var_opt_test_union u1(10, 0);
|
|
49
|
+
u1.update(sk1);
|
|
50
|
+
u1.update(sk2);
|
|
51
|
+
u1.update(sk3);
|
|
52
|
+
|
|
53
|
+
auto bytes2 = u1.serialize();
|
|
54
|
+
auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
|
|
55
|
+
}
|
|
56
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
57
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
|
|
61
|
+
test_allocator_total_bytes = 0;
|
|
62
|
+
test_allocator_net_allocations = 0;
|
|
63
|
+
{
|
|
64
|
+
uint32_t n = 20;
|
|
65
|
+
uint32_t k = 5;
|
|
66
|
+
var_opt_test_union u(k, 0);
|
|
67
|
+
var_opt_test_sketch sk1(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
|
68
|
+
var_opt_test_sketch sk2(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
|
69
|
+
|
|
70
|
+
// move updates
|
|
71
|
+
for (int i = 0; i < (int) n; ++i) {
|
|
72
|
+
sk1.update(i);
|
|
73
|
+
sk2.update(-i);
|
|
74
|
+
}
|
|
75
|
+
REQUIRE(sk1.get_n() == n);
|
|
76
|
+
REQUIRE(sk2.get_n() == n);
|
|
77
|
+
|
|
78
|
+
// move unions
|
|
79
|
+
u.update(std::move(sk2));
|
|
80
|
+
u.update(std::move(sk1));
|
|
81
|
+
REQUIRE(u.get_result().get_n() == 2 * n);
|
|
82
|
+
|
|
83
|
+
// move constructor
|
|
84
|
+
var_opt_test_union u2(std::move(u));
|
|
85
|
+
REQUIRE(u2.get_result().get_n() == 2 * n);
|
|
86
|
+
|
|
87
|
+
// move assignment
|
|
88
|
+
var_opt_test_union u3(k, 0);
|
|
89
|
+
u3 = std::move(u2);
|
|
90
|
+
REQUIRE(u3.get_result().get_n() == 2 * n);
|
|
91
|
+
}
|
|
92
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
|
93
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
}
|
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
20
|
#include <var_opt_union.hpp>
|
|
21
|
-
#include "test_type.hpp"
|
|
22
21
|
|
|
23
22
|
#include <catch.hpp>
|
|
24
23
|
|
|
@@ -325,34 +324,4 @@ TEST_CASE("varopt union: deserialize from java", "[var_opt_union]") {
|
|
|
325
324
|
REQUIRE(result.get_k() < 128);
|
|
326
325
|
}
|
|
327
326
|
|
|
328
|
-
TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
|
|
329
|
-
uint32_t n = 20;
|
|
330
|
-
uint32_t k = 5;
|
|
331
|
-
var_opt_union<test_type> u(k);
|
|
332
|
-
var_opt_sketch<test_type> sk1(k);
|
|
333
|
-
var_opt_sketch<test_type> sk2(k);
|
|
334
|
-
|
|
335
|
-
// move udpates
|
|
336
|
-
for (int i = 0; i < (int) n; ++i) {
|
|
337
|
-
sk1.update(i);
|
|
338
|
-
sk2.update(-i);
|
|
339
|
-
}
|
|
340
|
-
REQUIRE(sk1.get_n() == n);
|
|
341
|
-
REQUIRE(sk2.get_n() == n);
|
|
342
|
-
|
|
343
|
-
// move unions
|
|
344
|
-
u.update(std::move(sk2));
|
|
345
|
-
u.update(std::move(sk1));
|
|
346
|
-
REQUIRE(u.get_result().get_n() == 2 * n);
|
|
347
|
-
|
|
348
|
-
// move constructor
|
|
349
|
-
var_opt_union<test_type> u2(std::move(u));
|
|
350
|
-
REQUIRE(u2.get_result().get_n() == 2 * n);
|
|
351
|
-
|
|
352
|
-
// move assignment
|
|
353
|
-
var_opt_union<test_type> u3(k);
|
|
354
|
-
u3 = std::move(u2);
|
|
355
|
-
REQUIRE(u3.get_result().get_n() == 2 * n);
|
|
356
|
-
}
|
|
357
|
-
|
|
358
327
|
}
|
|
@@ -49,6 +49,8 @@ class CMakeBuild(build_ext):
|
|
|
49
49
|
os.path.dirname(self.get_ext_fullpath(ext.name)))
|
|
50
50
|
cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
|
|
51
51
|
cmake_args += ['-DWITH_PYTHON=True']
|
|
52
|
+
# ensure we use a consistent python version
|
|
53
|
+
cmake_args += ['-DPYTHON_EXECUTABLE=' + sys.executable]
|
|
52
54
|
cfg = 'Debug' if self.debug else 'Release'
|
|
53
55
|
build_args = ['--config', cfg]
|
|
54
56
|
|
|
@@ -77,10 +79,10 @@ class CMakeBuild(build_ext):
|
|
|
77
79
|
|
|
78
80
|
setup(
|
|
79
81
|
name='datasketches',
|
|
80
|
-
version='
|
|
81
|
-
author='
|
|
82
|
+
version='3.0.0',
|
|
83
|
+
author='Apache DataSketches Developers',
|
|
82
84
|
author_email='dev@datasketches.apache.org',
|
|
83
|
-
description='A wrapper for the C++
|
|
85
|
+
description='A wrapper for the C++ Apache DataSketches library',
|
|
84
86
|
license='Apache License 2.0',
|
|
85
87
|
url='http://datasketches.apache.org',
|
|
86
88
|
long_description=open('python/README.md').read(),
|
|
@@ -33,9 +33,21 @@ target_link_libraries(theta INTERFACE common)
|
|
|
33
33
|
target_compile_features(theta INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
35
|
set(theta_HEADERS "")
|
|
36
|
-
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/
|
|
37
|
-
list(APPEND theta_HEADERS "include/
|
|
38
|
-
list(APPEND theta_HEADERS "include/
|
|
36
|
+
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
|
|
37
|
+
list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
|
|
38
|
+
list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
|
|
39
|
+
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
|
|
40
|
+
list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
|
|
41
|
+
list(APPEND theta_HEADERS "include/theta_comparators.hpp")
|
|
42
|
+
list(APPEND theta_HEADERS "include/theta_constants.hpp")
|
|
43
|
+
list(APPEND theta_HEADERS "include/theta_helpers.hpp")
|
|
44
|
+
list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
|
|
45
|
+
list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
|
|
46
|
+
list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
|
|
47
|
+
list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
|
|
48
|
+
list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
|
|
49
|
+
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
|
|
50
|
+
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
|
|
39
51
|
|
|
40
52
|
install(TARGETS theta
|
|
41
53
|
EXPORT ${PROJECT_NAME}
|
|
@@ -54,4 +66,19 @@ target_sources(theta
|
|
|
54
66
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
|
55
67
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
|
56
68
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
|
69
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
|
|
70
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
|
|
71
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
|
|
72
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
|
|
73
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
|
|
74
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
|
|
75
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
|
|
76
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
|
|
77
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
|
|
78
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
|
|
79
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
|
|
80
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
|
|
81
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
|
|
82
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
|
|
83
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
57
84
|
)
|
|
@@ -20,51 +20,34 @@
|
|
|
20
20
|
#ifndef THETA_A_NOT_B_HPP_
|
|
21
21
|
#define THETA_A_NOT_B_HPP_
|
|
22
22
|
|
|
23
|
-
#include <memory>
|
|
24
|
-
#include <functional>
|
|
25
|
-
#include <climits>
|
|
26
|
-
|
|
27
23
|
#include "theta_sketch.hpp"
|
|
28
|
-
#include "
|
|
24
|
+
#include "theta_set_difference_base.hpp"
|
|
29
25
|
|
|
30
26
|
namespace datasketches {
|
|
31
27
|
|
|
32
|
-
|
|
33
|
-
* author Alexander Saydakov
|
|
34
|
-
* author Lee Rhodes
|
|
35
|
-
* author Kevin Lang
|
|
36
|
-
*/
|
|
37
|
-
|
|
38
|
-
template<typename A>
|
|
28
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
39
29
|
class theta_a_not_b_alloc {
|
|
40
30
|
public:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
31
|
+
using Entry = uint64_t;
|
|
32
|
+
using ExtractKey = trivial_extract_key;
|
|
33
|
+
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
34
|
+
using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
|
|
35
|
+
|
|
36
|
+
explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
46
37
|
|
|
47
38
|
/**
|
|
48
39
|
* Computes the a-not-b set operation given two sketches.
|
|
49
40
|
* @return the result of a-not-b
|
|
50
41
|
*/
|
|
51
|
-
|
|
42
|
+
template<typename FwdSketch, typename Sketch>
|
|
43
|
+
CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
|
|
52
44
|
|
|
53
45
|
private:
|
|
54
|
-
|
|
55
|
-
uint16_t seed_hash_;
|
|
56
|
-
|
|
57
|
-
class less_than {
|
|
58
|
-
public:
|
|
59
|
-
explicit less_than(uint64_t value): value(value) {}
|
|
60
|
-
bool operator()(uint64_t value) const { return value < this->value; }
|
|
61
|
-
private:
|
|
62
|
-
uint64_t value;
|
|
63
|
-
};
|
|
46
|
+
State state_;
|
|
64
47
|
};
|
|
65
48
|
|
|
66
49
|
// alias with default allocator for convenience
|
|
67
|
-
|
|
50
|
+
using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
|
|
68
51
|
|
|
69
52
|
} /* namespace datasketches */
|
|
70
53
|
|
|
@@ -26,56 +26,15 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
/*
|
|
30
|
-
* author Alexander Saydakov
|
|
31
|
-
* author Lee Rhodes
|
|
32
|
-
* author Kevin Lang
|
|
33
|
-
*/
|
|
34
|
-
|
|
35
29
|
template<typename A>
|
|
36
|
-
theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
|
|
37
|
-
|
|
30
|
+
theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator):
|
|
31
|
+
state_(seed, allocator)
|
|
38
32
|
{}
|
|
39
33
|
|
|
40
34
|
template<typename A>
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
|
45
|
-
|
|
46
|
-
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
|
|
47
|
-
vector_u64<A> keys;
|
|
48
|
-
bool is_empty = a.is_empty();
|
|
49
|
-
|
|
50
|
-
if (b.get_num_retained() == 0) {
|
|
51
|
-
std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
|
|
52
|
-
} else {
|
|
53
|
-
if (a.is_ordered() && b.is_ordered()) { // sort-based
|
|
54
|
-
std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
|
|
55
|
-
} else { // hash-based
|
|
56
|
-
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
57
|
-
vector_u64<A> b_hash_table(1 << lg_size, 0);
|
|
58
|
-
for (auto key: b) {
|
|
59
|
-
if (key < theta) {
|
|
60
|
-
update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
|
|
61
|
-
} else if (b.is_ordered()) {
|
|
62
|
-
break; // early stop
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
// scan A lookup B
|
|
67
|
-
for (auto key: a) {
|
|
68
|
-
if (key < theta) {
|
|
69
|
-
if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
|
|
70
|
-
} else if (a.is_ordered()) {
|
|
71
|
-
break; // early stop
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
|
|
77
|
-
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
|
|
78
|
-
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
|
|
35
|
+
template<typename FwdSketch, typename Sketch>
|
|
36
|
+
auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
|
|
37
|
+
return state_.compute(std::forward<FwdSketch>(a), b, ordered);
|
|
79
38
|
}
|
|
80
39
|
|
|
81
40
|
} /* namespace datasketches */
|