datasketches 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -51,7 +51,7 @@ class var_opt_union {
|
|
51
51
|
public:
|
52
52
|
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
53
53
|
|
54
|
-
explicit var_opt_union(uint32_t max_k);
|
54
|
+
explicit var_opt_union(uint32_t max_k, const A& allocator = A());
|
55
55
|
var_opt_union(const var_opt_union& other);
|
56
56
|
var_opt_union(var_opt_union&& other) noexcept;
|
57
57
|
|
@@ -119,16 +119,16 @@ public:
|
|
119
119
|
* @param is input stream
|
120
120
|
* @return an instance of a union
|
121
121
|
*/
|
122
|
-
static var_opt_union deserialize(std::istream& is);
|
122
|
+
static var_opt_union deserialize(std::istream& is, const A& allocator = A());
|
123
123
|
|
124
124
|
/**
|
125
125
|
* NOTE: This method may be deprecated in a future version.
|
126
|
-
* This method deserializes a
|
126
|
+
* This method deserializes a union from a given array of bytes.
|
127
127
|
* @param bytes pointer to the array of bytes
|
128
128
|
* @param size the size of the array
|
129
129
|
* @return an instance of a union
|
130
130
|
*/
|
131
|
-
static var_opt_union deserialize(const void* bytes, size_t size);
|
131
|
+
static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
|
132
132
|
|
133
133
|
/**
|
134
134
|
* Prints a summary of the union as a string.
|
@@ -236,4 +236,4 @@ private:
|
|
236
236
|
|
237
237
|
#include "var_opt_union_impl.hpp"
|
238
238
|
|
239
|
-
#endif // _VAR_OPT_UNION_HPP_
|
239
|
+
#endif // _VAR_OPT_UNION_HPP_
|
@@ -28,12 +28,12 @@
|
|
28
28
|
namespace datasketches {
|
29
29
|
|
30
30
|
template<typename T, typename S, typename A>
|
31
|
-
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
|
31
|
+
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
32
32
|
n_(0),
|
33
33
|
outer_tau_numer_(0),
|
34
34
|
outer_tau_denom_(0.0),
|
35
35
|
max_k_(max_k),
|
36
|
-
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
|
36
|
+
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
37
37
|
{}
|
38
38
|
|
39
39
|
template<typename T, typename S, typename A>
|
@@ -128,7 +128,7 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
128
128
|
*/
|
129
129
|
|
130
130
|
template<typename T, typename S, typename A>
|
131
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
131
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
132
132
|
uint8_t preamble_longs;
|
133
133
|
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
134
134
|
uint8_t serial_version;
|
@@ -163,7 +163,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
163
163
|
uint64_t outer_tau_denom;
|
164
164
|
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
165
165
|
|
166
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
|
166
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
167
167
|
|
168
168
|
if (!is.good())
|
169
169
|
throw std::runtime_error("error reading from std::istream");
|
@@ -172,7 +172,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
172
172
|
}
|
173
173
|
|
174
174
|
template<typename T, typename S, typename A>
|
175
|
-
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
|
175
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
176
176
|
ensure_minimum_memory(size, 8);
|
177
177
|
const char* ptr = static_cast<const char*>(bytes);
|
178
178
|
uint8_t preamble_longs;
|
@@ -207,7 +207,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
207
207
|
ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
|
208
208
|
|
209
209
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
210
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
|
210
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
211
211
|
|
212
212
|
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
213
213
|
}
|
@@ -255,7 +255,7 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
255
255
|
template<typename T, typename S, typename A>
|
256
256
|
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
|
257
257
|
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
258
|
-
std::vector<uint8_t, AllocU8<A>> bytes(size);
|
258
|
+
std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
|
259
259
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
260
260
|
|
261
261
|
const bool empty = n_ == 0;
|
@@ -0,0 +1,96 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <var_opt_sketch.hpp>
|
21
|
+
#include <var_opt_union.hpp>
|
22
|
+
#include <test_type.hpp>
|
23
|
+
#include <test_allocator.hpp>
|
24
|
+
|
25
|
+
#include <catch.hpp>
|
26
|
+
|
27
|
+
#include <sstream>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
using var_opt_test_sketch = var_opt_sketch<test_type, test_type_serde, test_allocator<test_type>>;
|
32
|
+
using var_opt_test_union = var_opt_union<test_type, test_type_serde, test_allocator<test_type>>;
|
33
|
+
using alloc = test_allocator<test_type>;
|
34
|
+
|
35
|
+
TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
36
|
+
test_allocator_total_bytes = 0;
|
37
|
+
test_allocator_net_allocations = 0;
|
38
|
+
{
|
39
|
+
var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
40
|
+
for (int i = 0; i < 100; ++i) sk1.update(i);
|
41
|
+
auto bytes1 = sk1.serialize();
|
42
|
+
auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
|
43
|
+
|
44
|
+
std::stringstream ss;
|
45
|
+
sk1.serialize(ss);
|
46
|
+
auto sk3 = var_opt_test_sketch::deserialize(ss, alloc(0));
|
47
|
+
|
48
|
+
var_opt_test_union u1(10, 0);
|
49
|
+
u1.update(sk1);
|
50
|
+
u1.update(sk2);
|
51
|
+
u1.update(sk3);
|
52
|
+
|
53
|
+
auto bytes2 = u1.serialize();
|
54
|
+
auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
|
55
|
+
}
|
56
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
57
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
58
|
+
}
|
59
|
+
|
60
|
+
TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
|
61
|
+
test_allocator_total_bytes = 0;
|
62
|
+
test_allocator_net_allocations = 0;
|
63
|
+
{
|
64
|
+
uint32_t n = 20;
|
65
|
+
uint32_t k = 5;
|
66
|
+
var_opt_test_union u(k, 0);
|
67
|
+
var_opt_test_sketch sk1(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
68
|
+
var_opt_test_sketch sk2(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
69
|
+
|
70
|
+
// move updates
|
71
|
+
for (int i = 0; i < (int) n; ++i) {
|
72
|
+
sk1.update(i);
|
73
|
+
sk2.update(-i);
|
74
|
+
}
|
75
|
+
REQUIRE(sk1.get_n() == n);
|
76
|
+
REQUIRE(sk2.get_n() == n);
|
77
|
+
|
78
|
+
// move unions
|
79
|
+
u.update(std::move(sk2));
|
80
|
+
u.update(std::move(sk1));
|
81
|
+
REQUIRE(u.get_result().get_n() == 2 * n);
|
82
|
+
|
83
|
+
// move constructor
|
84
|
+
var_opt_test_union u2(std::move(u));
|
85
|
+
REQUIRE(u2.get_result().get_n() == 2 * n);
|
86
|
+
|
87
|
+
// move assignment
|
88
|
+
var_opt_test_union u3(k, 0);
|
89
|
+
u3 = std::move(u2);
|
90
|
+
REQUIRE(u3.get_result().get_n() == 2 * n);
|
91
|
+
}
|
92
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
93
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
94
|
+
}
|
95
|
+
|
96
|
+
}
|
@@ -18,7 +18,6 @@
|
|
18
18
|
*/
|
19
19
|
|
20
20
|
#include <var_opt_union.hpp>
|
21
|
-
#include "test_type.hpp"
|
22
21
|
|
23
22
|
#include <catch.hpp>
|
24
23
|
|
@@ -325,34 +324,4 @@ TEST_CASE("varopt union: deserialize from java", "[var_opt_union]") {
|
|
325
324
|
REQUIRE(result.get_k() < 128);
|
326
325
|
}
|
327
326
|
|
328
|
-
TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
|
329
|
-
uint32_t n = 20;
|
330
|
-
uint32_t k = 5;
|
331
|
-
var_opt_union<test_type> u(k);
|
332
|
-
var_opt_sketch<test_type> sk1(k);
|
333
|
-
var_opt_sketch<test_type> sk2(k);
|
334
|
-
|
335
|
-
// move updates
|
336
|
-
for (int i = 0; i < (int) n; ++i) {
|
337
|
-
sk1.update(i);
|
338
|
-
sk2.update(-i);
|
339
|
-
}
|
340
|
-
REQUIRE(sk1.get_n() == n);
|
341
|
-
REQUIRE(sk2.get_n() == n);
|
342
|
-
|
343
|
-
// move unions
|
344
|
-
u.update(std::move(sk2));
|
345
|
-
u.update(std::move(sk1));
|
346
|
-
REQUIRE(u.get_result().get_n() == 2 * n);
|
347
|
-
|
348
|
-
// move constructor
|
349
|
-
var_opt_union<test_type> u2(std::move(u));
|
350
|
-
REQUIRE(u2.get_result().get_n() == 2 * n);
|
351
|
-
|
352
|
-
// move assignment
|
353
|
-
var_opt_union<test_type> u3(k);
|
354
|
-
u3 = std::move(u2);
|
355
|
-
REQUIRE(u3.get_result().get_n() == 2 * n);
|
356
|
-
}
|
357
|
-
|
358
327
|
}
|
@@ -49,6 +49,8 @@ class CMakeBuild(build_ext):
|
|
49
49
|
os.path.dirname(self.get_ext_fullpath(ext.name)))
|
50
50
|
cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
|
51
51
|
cmake_args += ['-DWITH_PYTHON=True']
|
52
|
+
# ensure we use a consistent python version
|
53
|
+
cmake_args += ['-DPYTHON_EXECUTABLE=' + sys.executable]
|
52
54
|
cfg = 'Debug' if self.debug else 'Release'
|
53
55
|
build_args = ['--config', cfg]
|
54
56
|
|
@@ -77,10 +79,10 @@ class CMakeBuild(build_ext):
|
|
77
79
|
|
78
80
|
setup(
|
79
81
|
name='datasketches',
|
80
|
-
version='
|
81
|
-
author='
|
82
|
+
version='3.0.0',
|
83
|
+
author='Apache DataSketches Developers',
|
82
84
|
author_email='dev@datasketches.apache.org',
|
83
|
-
description='A wrapper for the C++
|
85
|
+
description='A wrapper for the C++ Apache DataSketches library',
|
84
86
|
license='Apache License 2.0',
|
85
87
|
url='http://datasketches.apache.org',
|
86
88
|
long_description=open('python/README.md').read(),
|
@@ -33,9 +33,21 @@ target_link_libraries(theta INTERFACE common)
|
|
33
33
|
target_compile_features(theta INTERFACE cxx_std_11)
|
34
34
|
|
35
35
|
set(theta_HEADERS "")
|
36
|
-
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/
|
37
|
-
list(APPEND theta_HEADERS "include/
|
38
|
-
list(APPEND theta_HEADERS "include/
|
36
|
+
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
|
37
|
+
list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
|
38
|
+
list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
|
39
|
+
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
|
40
|
+
list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
|
41
|
+
list(APPEND theta_HEADERS "include/theta_comparators.hpp")
|
42
|
+
list(APPEND theta_HEADERS "include/theta_constants.hpp")
|
43
|
+
list(APPEND theta_HEADERS "include/theta_helpers.hpp")
|
44
|
+
list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
|
45
|
+
list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
|
46
|
+
list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
|
47
|
+
list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
|
48
|
+
list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
|
49
|
+
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
|
50
|
+
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
|
39
51
|
|
40
52
|
install(TARGETS theta
|
41
53
|
EXPORT ${PROJECT_NAME}
|
@@ -54,4 +66,19 @@ target_sources(theta
|
|
54
66
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
55
67
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
56
68
|
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
69
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
|
70
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
|
71
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
|
72
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
|
73
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
|
74
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
|
75
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
|
76
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
|
77
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
|
78
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
|
79
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
|
80
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
|
81
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
|
82
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
|
83
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
57
84
|
)
|
@@ -20,51 +20,34 @@
|
|
20
20
|
#ifndef THETA_A_NOT_B_HPP_
|
21
21
|
#define THETA_A_NOT_B_HPP_
|
22
22
|
|
23
|
-
#include <memory>
|
24
|
-
#include <functional>
|
25
|
-
#include <climits>
|
26
|
-
|
27
23
|
#include "theta_sketch.hpp"
|
28
|
-
#include "
|
24
|
+
#include "theta_set_difference_base.hpp"
|
29
25
|
|
30
26
|
namespace datasketches {
|
31
27
|
|
32
|
-
|
33
|
-
* author Alexander Saydakov
|
34
|
-
* author Lee Rhodes
|
35
|
-
* author Kevin Lang
|
36
|
-
*/
|
37
|
-
|
38
|
-
template<typename A>
|
28
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
39
29
|
class theta_a_not_b_alloc {
|
40
30
|
public:
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
31
|
+
using Entry = uint64_t;
|
32
|
+
using ExtractKey = trivial_extract_key;
|
33
|
+
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
34
|
+
using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
|
35
|
+
|
36
|
+
explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
46
37
|
|
47
38
|
/**
|
48
39
|
* Computes the a-not-b set operation given two sketches.
|
49
40
|
* @return the result of a-not-b
|
50
41
|
*/
|
51
|
-
|
42
|
+
template<typename FwdSketch, typename Sketch>
|
43
|
+
CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
|
52
44
|
|
53
45
|
private:
|
54
|
-
|
55
|
-
uint16_t seed_hash_;
|
56
|
-
|
57
|
-
class less_than {
|
58
|
-
public:
|
59
|
-
explicit less_than(uint64_t value): value(value) {}
|
60
|
-
bool operator()(uint64_t value) const { return value < this->value; }
|
61
|
-
private:
|
62
|
-
uint64_t value;
|
63
|
-
};
|
46
|
+
State state_;
|
64
47
|
};
|
65
48
|
|
66
49
|
// alias with default allocator for convenience
|
67
|
-
|
50
|
+
using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
|
68
51
|
|
69
52
|
} /* namespace datasketches */
|
70
53
|
|
@@ -26,56 +26,15 @@
|
|
26
26
|
|
27
27
|
namespace datasketches {
|
28
28
|
|
29
|
-
/*
|
30
|
-
* author Alexander Saydakov
|
31
|
-
* author Lee Rhodes
|
32
|
-
* author Kevin Lang
|
33
|
-
*/
|
34
|
-
|
35
29
|
template<typename A>
|
36
|
-
theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
|
37
|
-
|
30
|
+
theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator):
|
31
|
+
state_(seed, allocator)
|
38
32
|
{}
|
39
33
|
|
40
34
|
template<typename A>
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
45
|
-
|
46
|
-
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
|
47
|
-
vector_u64<A> keys;
|
48
|
-
bool is_empty = a.is_empty();
|
49
|
-
|
50
|
-
if (b.get_num_retained() == 0) {
|
51
|
-
std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
|
52
|
-
} else {
|
53
|
-
if (a.is_ordered() && b.is_ordered()) { // sort-based
|
54
|
-
std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
|
55
|
-
} else { // hash-based
|
56
|
-
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
57
|
-
vector_u64<A> b_hash_table(1 << lg_size, 0);
|
58
|
-
for (auto key: b) {
|
59
|
-
if (key < theta) {
|
60
|
-
update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
|
61
|
-
} else if (b.is_ordered()) {
|
62
|
-
break; // early stop
|
63
|
-
}
|
64
|
-
}
|
65
|
-
|
66
|
-
// scan A lookup B
|
67
|
-
for (auto key: a) {
|
68
|
-
if (key < theta) {
|
69
|
-
if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
|
70
|
-
} else if (a.is_ordered()) {
|
71
|
-
break; // early stop
|
72
|
-
}
|
73
|
-
}
|
74
|
-
}
|
75
|
-
}
|
76
|
-
if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
|
77
|
-
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
|
78
|
-
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
|
35
|
+
template<typename FwdSketch, typename Sketch>
|
36
|
+
auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
|
37
|
+
return state_.compute(std::forward<FwdSketch>(a), b, ordered);
|
79
38
|
}
|
80
39
|
|
81
40
|
} /* namespace datasketches */
|