datasketches 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -346,14 +346,21 @@ class var_opt_sketch {
|
|
346
346
|
};
|
347
347
|
|
348
348
|
template<typename T, typename A>
|
349
|
-
class var_opt_sketch<T, A>::const_iterator
|
349
|
+
class var_opt_sketch<T, A>::const_iterator {
|
350
350
|
public:
|
351
|
+
using iterator_category = std::input_iterator_tag;
|
352
|
+
using value_type = std::pair<const T&, const double>;
|
353
|
+
using difference_type = void;
|
354
|
+
using pointer = const return_value_holder<value_type>;
|
355
|
+
using reference = const value_type;
|
356
|
+
|
351
357
|
const_iterator(const const_iterator& other);
|
352
358
|
const_iterator& operator++();
|
353
359
|
const_iterator& operator++(int);
|
354
360
|
bool operator==(const const_iterator& other) const;
|
355
361
|
bool operator!=(const const_iterator& other) const;
|
356
|
-
|
362
|
+
reference operator*() const;
|
363
|
+
pointer operator->() const;
|
357
364
|
|
358
365
|
private:
|
359
366
|
friend class var_opt_sketch<T, A>;
|
@@ -362,8 +369,8 @@ private:
|
|
362
369
|
// default iterator over full sketch
|
363
370
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
|
364
371
|
|
365
|
-
// iterates over only one of the H or R
|
366
|
-
//
|
372
|
+
// iterates over only one of the H or R regions
|
373
|
+
// does not apply weight correction
|
367
374
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
|
368
375
|
|
369
376
|
bool get_mark() const;
|
@@ -377,14 +384,21 @@ private:
|
|
377
384
|
|
378
385
|
// non-const iterator for internal use
|
379
386
|
template<typename T, typename A>
|
380
|
-
class var_opt_sketch<T, A>::iterator
|
387
|
+
class var_opt_sketch<T, A>::iterator {
|
381
388
|
public:
|
389
|
+
using iterator_category = std::input_iterator_tag;
|
390
|
+
using value_type = std::pair<T&, double>;
|
391
|
+
using difference_type = void;
|
392
|
+
using pointer = return_value_holder<value_type>;
|
393
|
+
using reference = value_type;
|
394
|
+
|
382
395
|
iterator(const iterator& other);
|
383
396
|
iterator& operator++();
|
384
397
|
iterator& operator++(int);
|
385
398
|
bool operator==(const iterator& other) const;
|
386
399
|
bool operator!=(const iterator& other) const;
|
387
|
-
|
400
|
+
reference operator*();
|
401
|
+
pointer operator->();
|
388
402
|
|
389
403
|
private:
|
390
404
|
friend class var_opt_sketch<T, A>;
|
@@ -189,16 +189,16 @@ var_opt_sketch<T, A>::~var_opt_sketch() {
|
|
189
189
|
// destroy everything
|
190
190
|
const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
|
191
191
|
for (size_t i = 0; i < num_to_destroy; ++i) {
|
192
|
-
|
192
|
+
data_[i].~T();
|
193
193
|
}
|
194
194
|
} else {
|
195
195
|
// skip gap or anything unused at the end
|
196
196
|
for (size_t i = 0; i < h_; ++i) {
|
197
|
-
|
197
|
+
data_[i].~T();
|
198
198
|
}
|
199
199
|
|
200
200
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
|
201
|
-
|
201
|
+
data_[i].~T();
|
202
202
|
}
|
203
203
|
}
|
204
204
|
allocator_.deallocate(data_, curr_items_alloc_);
|
@@ -658,14 +658,14 @@ void var_opt_sketch<T, A>::reset() {
|
|
658
658
|
// destroy everything
|
659
659
|
const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
|
660
660
|
for (size_t i = 0; i < num_to_destroy; ++i)
|
661
|
-
|
661
|
+
data_[i].~T();
|
662
662
|
} else {
|
663
663
|
// skip gap or anything unused at the end
|
664
664
|
for (size_t i = 0; i < h_; ++i)
|
665
|
-
|
665
|
+
data_[i].~T();
|
666
666
|
|
667
667
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
668
|
-
|
668
|
+
data_[i].~T();
|
669
669
|
}
|
670
670
|
|
671
671
|
if (curr_items_alloc_ < prev_alloc) {
|
@@ -754,10 +754,10 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
|
|
754
754
|
const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
|
755
755
|
for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
|
756
756
|
if (i == h_ && print_gap) {
|
757
|
-
os <<
|
757
|
+
os << display_idx << ": GAP" << std::endl;
|
758
758
|
++display_idx;
|
759
759
|
} else {
|
760
|
-
os <<
|
760
|
+
os << display_idx << ": " << data_[i] << "\twt = ";
|
761
761
|
if (weights_[i] == -1.0) {
|
762
762
|
os << get_tau() << "\t(-1.0)" << std::endl;
|
763
763
|
} else {
|
@@ -990,7 +990,7 @@ void var_opt_sketch<T, A>::grow_data_arrays() {
|
|
990
990
|
|
991
991
|
for (uint32_t i = 0; i < prev_size; ++i) {
|
992
992
|
new (&tmp_data[i]) T(std::move(data_[i]));
|
993
|
-
|
993
|
+
data_[i].~T();
|
994
994
|
tmp_weights[i] = weights_[i];
|
995
995
|
}
|
996
996
|
|
@@ -1531,7 +1531,6 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const var_opt_sketch& sk, b
|
|
1531
1531
|
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1532
1532
|
}
|
1533
1533
|
|
1534
|
-
|
1535
1534
|
template<typename T, typename A>
|
1536
1535
|
var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other) :
|
1537
1536
|
sk_(other.sk_),
|
@@ -1543,6 +1542,9 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other
|
|
1543
1542
|
|
1544
1543
|
template<typename T, typename A>
|
1545
1544
|
typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_iterator::operator++() {
|
1545
|
+
// accumulate weight already visited
|
1546
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1547
|
+
|
1546
1548
|
++idx_;
|
1547
1549
|
|
1548
1550
|
if (idx_ == final_idx_) {
|
@@ -1551,7 +1553,6 @@ typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_itera
|
|
1551
1553
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1552
1554
|
++idx_;
|
1553
1555
|
}
|
1554
|
-
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1555
1556
|
return *this;
|
1556
1557
|
}
|
1557
1558
|
|
@@ -1575,14 +1576,19 @@ bool var_opt_sketch<T, A>::const_iterator::operator!=(const const_iterator& othe
|
|
1575
1576
|
}
|
1576
1577
|
|
1577
1578
|
template<typename T, typename A>
|
1578
|
-
|
1579
|
+
auto var_opt_sketch<T, A>::const_iterator::operator*() const -> reference {
|
1579
1580
|
double wt;
|
1580
1581
|
if (idx_ < sk_->h_) {
|
1581
1582
|
wt = sk_->weights_[idx_];
|
1582
1583
|
} else {
|
1583
1584
|
wt = r_item_wt_;
|
1584
1585
|
}
|
1585
|
-
return
|
1586
|
+
return value_type(sk_->data_[idx_], wt);
|
1587
|
+
}
|
1588
|
+
|
1589
|
+
template<typename T, typename A>
|
1590
|
+
auto var_opt_sketch<T, A>::const_iterator::operator->() const -> pointer {
|
1591
|
+
return **this;
|
1586
1592
|
}
|
1587
1593
|
|
1588
1594
|
template<typename T, typename A>
|
@@ -1622,6 +1628,9 @@ var_opt_sketch<T, A>::iterator::iterator(const iterator& other) :
|
|
1622
1628
|
|
1623
1629
|
template<typename T, typename A>
|
1624
1630
|
typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operator++() {
|
1631
|
+
// accumulate weight already visited
|
1632
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1633
|
+
|
1625
1634
|
++idx_;
|
1626
1635
|
|
1627
1636
|
if (idx_ == final_idx_) {
|
@@ -1630,7 +1639,7 @@ typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operato
|
|
1630
1639
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1631
1640
|
++idx_;
|
1632
1641
|
}
|
1633
|
-
|
1642
|
+
|
1634
1643
|
return *this;
|
1635
1644
|
}
|
1636
1645
|
|
@@ -1654,7 +1663,7 @@ bool var_opt_sketch<T, A>::iterator::operator!=(const iterator& other) const {
|
|
1654
1663
|
}
|
1655
1664
|
|
1656
1665
|
template<typename T, typename A>
|
1657
|
-
|
1666
|
+
auto var_opt_sketch<T, A>::iterator::operator*() -> reference {
|
1658
1667
|
double wt;
|
1659
1668
|
if (idx_ < sk_->h_) {
|
1660
1669
|
wt = sk_->weights_[idx_];
|
@@ -1663,7 +1672,12 @@ std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
|
|
1663
1672
|
} else {
|
1664
1673
|
wt = r_item_wt_;
|
1665
1674
|
}
|
1666
|
-
return
|
1675
|
+
return value_type(sk_->data_[idx_], wt);
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
template<typename T, typename A>
|
1679
|
+
auto var_opt_sketch<T, A>::iterator::operator->() -> pointer {
|
1680
|
+
return **this;
|
1667
1681
|
}
|
1668
1682
|
|
1669
1683
|
template<typename T, typename A>
|
@@ -153,6 +153,8 @@ public:
|
|
153
153
|
|
154
154
|
private:
|
155
155
|
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
|
156
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
|
157
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
|
156
158
|
|
157
159
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
158
160
|
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
@@ -170,10 +172,12 @@ private:
|
|
170
172
|
|
171
173
|
uint32_t max_k_;
|
172
174
|
|
175
|
+
A allocator_;
|
176
|
+
|
173
177
|
var_opt_sketch<T, A> gadget_;
|
174
178
|
|
175
179
|
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
176
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget);
|
180
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
|
177
181
|
|
178
182
|
/*
|
179
183
|
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
@@ -34,6 +34,7 @@ var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
34
34
|
outer_tau_numer_(0.0),
|
35
35
|
outer_tau_denom_(0),
|
36
36
|
max_k_(max_k),
|
37
|
+
allocator_(allocator),
|
37
38
|
gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
38
39
|
{}
|
39
40
|
|
@@ -43,6 +44,7 @@ var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
|
|
43
44
|
outer_tau_numer_(other.outer_tau_numer_),
|
44
45
|
outer_tau_denom_(other.outer_tau_denom_),
|
45
46
|
max_k_(other.max_k_),
|
47
|
+
allocator_(other.allocator_),
|
46
48
|
gadget_(other.gadget_)
|
47
49
|
{}
|
48
50
|
|
@@ -52,16 +54,18 @@ var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
|
|
52
54
|
outer_tau_numer_(other.outer_tau_numer_),
|
53
55
|
outer_tau_denom_(other.outer_tau_denom_),
|
54
56
|
max_k_(other.max_k_),
|
57
|
+
allocator_(other.allocator_),
|
55
58
|
gadget_(std::move(other.gadget_))
|
56
59
|
{}
|
57
60
|
|
58
61
|
template<typename T, typename A>
|
59
62
|
var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
60
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
|
63
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator) :
|
61
64
|
n_(n),
|
62
65
|
outer_tau_numer_(outer_tau_numer),
|
63
66
|
outer_tau_denom_(outer_tau_denom),
|
64
67
|
max_k_(max_k),
|
68
|
+
allocator_(allocator),
|
65
69
|
gadget_(gadget)
|
66
70
|
{}
|
67
71
|
|
@@ -75,6 +79,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other)
|
|
75
79
|
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
76
80
|
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
77
81
|
std::swap(max_k_, union_copy.max_k_);
|
82
|
+
std::swap(allocator_, other.allocator_);
|
78
83
|
std::swap(gadget_, union_copy.gadget_);
|
79
84
|
return *this;
|
80
85
|
}
|
@@ -85,6 +90,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
|
|
85
90
|
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
86
91
|
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
87
92
|
std::swap(max_k_, other.max_k_);
|
93
|
+
std::swap(allocator_, other.allocator_);
|
88
94
|
std::swap(gadget_, other.gadget_);
|
89
95
|
return *this;
|
90
96
|
}
|
@@ -162,7 +168,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
|
|
162
168
|
if (!is.good())
|
163
169
|
throw std::runtime_error("error reading from std::istream");
|
164
170
|
|
165
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
171
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
166
172
|
}
|
167
173
|
|
168
174
|
template<typename T, typename A>
|
@@ -204,7 +210,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
|
|
204
210
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
205
211
|
var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
|
206
212
|
|
207
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
213
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
208
214
|
}
|
209
215
|
|
210
216
|
template<typename T, typename A>
|
@@ -508,9 +514,8 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
508
514
|
uint32_t result_r = 0;
|
509
515
|
size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
|
510
516
|
|
511
|
-
|
512
|
-
|
513
|
-
T* data = A().allocate(result_k + 1);
|
517
|
+
double* wts = AllocDouble(allocator_).allocate(result_k + 1);
|
518
|
+
T* data = A(allocator_).allocate(result_k + 1);
|
514
519
|
|
515
520
|
// insert R region items, ignoring weights
|
516
521
|
// Currently (May 2017) this next block is unreachable; this coercer is used only in the
|
@@ -519,7 +524,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
519
524
|
// Addendum (Jan 2020): Cleanup at end of method assumes R count is 0
|
520
525
|
const size_t final_idx = gadget_.get_num_samples();
|
521
526
|
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
|
522
|
-
|
527
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
523
528
|
wts[next_r_pos] = gadget_.weights_[idx];
|
524
529
|
++result_r;
|
525
530
|
--next_r_pos;
|
@@ -530,20 +535,20 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
530
535
|
// insert H region items
|
531
536
|
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
|
532
537
|
if (gadget_.marks_[idx]) {
|
533
|
-
|
538
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
534
539
|
wts[next_r_pos] = -1.0;
|
535
540
|
transferred_weight += gadget_.weights_[idx];
|
536
541
|
++result_r;
|
537
542
|
--next_r_pos;
|
538
543
|
} else {
|
539
|
-
|
544
|
+
new (&data[result_h]) T(gadget_.data_[idx]);
|
540
545
|
wts[result_h] = gadget_.weights_[idx];
|
541
546
|
++result_h;
|
542
547
|
}
|
543
548
|
}
|
544
549
|
|
545
550
|
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
546
|
-
if (
|
551
|
+
if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
547
552
|
throw std::logic_error("uexpected mismatch in transferred weight");
|
548
553
|
}
|
549
554
|
|
@@ -554,11 +559,10 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
554
559
|
wts[result_h] = -1.0;
|
555
560
|
|
556
561
|
// clean up arrays in input sketch, replace with new values
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
A().deallocate(sk.data_, sk.curr_items_alloc_);
|
562
|
+
AllocBool(allocator_).deallocate(sk.marks_, sk.curr_items_alloc_);
|
563
|
+
AllocDouble(allocator_).deallocate(sk.weights_, sk.curr_items_alloc_);
|
564
|
+
for (size_t i = 0; i < result_k; ++i) { sk.data_[i].~T(); } // assumes everything in H region, no gap
|
565
|
+
A(allocator_).deallocate(sk.data_, sk.curr_items_alloc_);
|
562
566
|
|
563
567
|
sk.data_ = data;
|
564
568
|
sk.weights_ = wts;
|
@@ -52,17 +52,15 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2)
|
|
52
52
|
REQUIRE(sk1.get_k() == sk2.get_k());
|
53
53
|
REQUIRE(sk1.get_n() == sk2.get_n());
|
54
54
|
REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
|
55
|
-
|
55
|
+
|
56
56
|
auto it1 = sk1.begin();
|
57
57
|
auto it2 = sk2.begin();
|
58
|
-
size_t i = 0;
|
59
58
|
|
60
59
|
while ((it1 != sk1.end()) && (it2 != sk2.end())) {
|
61
|
-
|
62
|
-
|
60
|
+
auto p1 = *it1;
|
61
|
+
auto p2 = *it2;
|
63
62
|
REQUIRE(p1.first == p2.first); // data values
|
64
63
|
REQUIRE(p1.second == p2.second); // weights
|
65
|
-
++i;
|
66
64
|
++it1;
|
67
65
|
++it2;
|
68
66
|
}
|
@@ -182,7 +180,7 @@ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
|
|
182
180
|
var_opt_sketch<std::string> sk(100, resize_factor::X2);
|
183
181
|
REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
|
184
182
|
|
185
|
-
// should not throw but sketch
|
183
|
+
// should not throw but sketch should still be empty
|
186
184
|
sk.update("zero weight", 0.0);
|
187
185
|
REQUIRE(sk.is_empty());
|
188
186
|
}
|
@@ -213,7 +211,7 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
213
211
|
|
214
212
|
double input_sum = 0.0;
|
215
213
|
for (size_t i = 0; i < n; ++i) {
|
216
|
-
// generate weights
|
214
|
+
// generate weights above and below 1.0 using w ~ exp(5*N(0,1))
|
217
215
|
// which covers about 10 orders of magnitude
|
218
216
|
double w = std::exp(5 * N(rand));
|
219
217
|
input_sum += w;
|
@@ -221,12 +219,12 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
221
219
|
}
|
222
220
|
|
223
221
|
double output_sum = 0.0;
|
224
|
-
for (auto
|
225
|
-
output_sum +=
|
222
|
+
for (auto pair : sk) { // std::pair<int, weight>
|
223
|
+
output_sum += pair.second;
|
226
224
|
}
|
227
225
|
|
228
226
|
double weight_ratio = output_sum / input_sum;
|
229
|
-
REQUIRE(
|
227
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
230
228
|
}
|
231
229
|
|
232
230
|
TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") {
|
@@ -275,26 +273,38 @@ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") {
|
|
275
273
|
sk.update(100, 100.0);
|
276
274
|
sk.update(101, 101.0);
|
277
275
|
|
276
|
+
subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
|
277
|
+
double total_weight = summary.total_sketch_weight;
|
278
|
+
double cum_weight = 0.0;
|
279
|
+
for (auto pair : sk) {
|
280
|
+
cum_weight += pair.second;
|
281
|
+
}
|
282
|
+
double weight_ratio = cum_weight / total_weight;
|
283
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
284
|
+
|
278
285
|
// first 2 entries should be heavy and in heap order (smallest at root)
|
279
286
|
auto it = sk.begin();
|
280
|
-
|
287
|
+
auto p1 = *it;
|
281
288
|
++it;
|
282
|
-
|
289
|
+
auto p2 = *it;
|
283
290
|
REQUIRE(p1.second == Approx(100.0).margin(EPS));
|
284
291
|
REQUIRE(p2.second == Approx(101.0).margin(EPS));
|
285
292
|
REQUIRE(p1.first == 100);
|
286
293
|
REQUIRE(p2.first == 101);
|
294
|
+
// using operator ->
|
295
|
+
REQUIRE(it->first == p2.first);
|
296
|
+
REQUIRE(it->second == p2.second);
|
287
297
|
|
288
298
|
// check for 4 preamble longs
|
289
299
|
auto bytes = sk.serialize();
|
290
300
|
REQUIRE((bytes.data()[0] & 0x3f) == 4);; // PREAMBLE_LONGS_WARMUP
|
291
301
|
|
292
|
-
|
302
|
+
auto sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
|
293
303
|
check_if_equal(sk, sk_from_bytes);
|
294
304
|
|
295
305
|
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
296
306
|
sk.serialize(ss);
|
297
|
-
|
307
|
+
auto sk_from_stream = var_opt_sketch<int>::deserialize(ss);
|
298
308
|
check_if_equal(sk, sk_from_stream);
|
299
309
|
|
300
310
|
// ensure we unroll properly
|
@@ -340,6 +350,15 @@ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") {
|
|
340
350
|
auto it = sk.begin();
|
341
351
|
double wt = (*it).second;
|
342
352
|
REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
|
353
|
+
|
354
|
+
subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
|
355
|
+
double total_weight = summary.total_sketch_weight;
|
356
|
+
double cum_weight = 0.0;
|
357
|
+
for (auto pair : sk) {
|
358
|
+
cum_weight += pair.second;
|
359
|
+
}
|
360
|
+
double weight_ratio = cum_weight / total_weight;
|
361
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
343
362
|
}
|
344
363
|
|
345
364
|
TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
|
@@ -57,7 +57,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
|
|
57
57
|
|
58
58
|
auto it1 = sk1.begin();
|
59
59
|
auto it2 = sk2.begin();
|
60
|
-
size_t i = 0;
|
61
60
|
|
62
61
|
while ((it1 != sk1.end()) && (it2 != sk2.end())) {
|
63
62
|
const std::pair<const T&, const double> p1 = *it1;
|
@@ -66,7 +65,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
|
|
66
65
|
REQUIRE(p1.first == p2.first); // data values
|
67
66
|
}
|
68
67
|
REQUIRE(p1.second == p2.second); // weight values
|
69
|
-
++i;
|
70
68
|
++it1;
|
71
69
|
++it2;
|
72
70
|
}
|
@@ -100,7 +100,7 @@ setup(
|
|
100
100
|
url='http://datasketches.apache.org',
|
101
101
|
long_description=open('python/README.md').read(),
|
102
102
|
long_description_content_type='text/markdown',
|
103
|
-
packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the .so
|
103
|
+
packages=find_packages(where='python',exclude=['src','include','*tests*']), # src not needed if only the .so
|
104
104
|
package_dir={'':'python'},
|
105
105
|
# may need to add all source paths for sdist packages w/o MANIFEST.in
|
106
106
|
ext_modules=[CMakeExtension('datasketches')],
|