datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -346,14 +346,21 @@ class var_opt_sketch {
|
|
346
346
|
};
|
347
347
|
|
348
348
|
template<typename T, typename A>
|
349
|
-
class var_opt_sketch<T, A>::const_iterator
|
349
|
+
class var_opt_sketch<T, A>::const_iterator {
|
350
350
|
public:
|
351
|
+
using iterator_category = std::input_iterator_tag;
|
352
|
+
using value_type = std::pair<const T&, const double>;
|
353
|
+
using difference_type = void;
|
354
|
+
using pointer = const return_value_holder<value_type>;
|
355
|
+
using reference = const value_type;
|
356
|
+
|
351
357
|
const_iterator(const const_iterator& other);
|
352
358
|
const_iterator& operator++();
|
353
359
|
const_iterator& operator++(int);
|
354
360
|
bool operator==(const const_iterator& other) const;
|
355
361
|
bool operator!=(const const_iterator& other) const;
|
356
|
-
|
362
|
+
reference operator*() const;
|
363
|
+
pointer operator->() const;
|
357
364
|
|
358
365
|
private:
|
359
366
|
friend class var_opt_sketch<T, A>;
|
@@ -362,8 +369,8 @@ private:
|
|
362
369
|
// default iterator over full sketch
|
363
370
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
|
364
371
|
|
365
|
-
// iterates over only one of the H or R
|
366
|
-
//
|
372
|
+
// iterates over only one of the H or R regions
|
373
|
+
// does not apply weight correction
|
367
374
|
const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
|
368
375
|
|
369
376
|
bool get_mark() const;
|
@@ -377,14 +384,21 @@ private:
|
|
377
384
|
|
378
385
|
// non-const iterator for internal use
|
379
386
|
template<typename T, typename A>
|
380
|
-
class var_opt_sketch<T, A>::iterator
|
387
|
+
class var_opt_sketch<T, A>::iterator {
|
381
388
|
public:
|
389
|
+
using iterator_category = std::input_iterator_tag;
|
390
|
+
using value_type = std::pair<T&, double>;
|
391
|
+
using difference_type = void;
|
392
|
+
using pointer = return_value_holder<value_type>;
|
393
|
+
using reference = value_type;
|
394
|
+
|
382
395
|
iterator(const iterator& other);
|
383
396
|
iterator& operator++();
|
384
397
|
iterator& operator++(int);
|
385
398
|
bool operator==(const iterator& other) const;
|
386
399
|
bool operator!=(const iterator& other) const;
|
387
|
-
|
400
|
+
reference operator*();
|
401
|
+
pointer operator->();
|
388
402
|
|
389
403
|
private:
|
390
404
|
friend class var_opt_sketch<T, A>;
|
@@ -189,16 +189,16 @@ var_opt_sketch<T, A>::~var_opt_sketch() {
|
|
189
189
|
// destroy everything
|
190
190
|
const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
|
191
191
|
for (size_t i = 0; i < num_to_destroy; ++i) {
|
192
|
-
|
192
|
+
data_[i].~T();
|
193
193
|
}
|
194
194
|
} else {
|
195
195
|
// skip gap or anything unused at the end
|
196
196
|
for (size_t i = 0; i < h_; ++i) {
|
197
|
-
|
197
|
+
data_[i].~T();
|
198
198
|
}
|
199
199
|
|
200
200
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
|
201
|
-
|
201
|
+
data_[i].~T();
|
202
202
|
}
|
203
203
|
}
|
204
204
|
allocator_.deallocate(data_, curr_items_alloc_);
|
@@ -658,14 +658,14 @@ void var_opt_sketch<T, A>::reset() {
|
|
658
658
|
// destroy everything
|
659
659
|
const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
|
660
660
|
for (size_t i = 0; i < num_to_destroy; ++i)
|
661
|
-
|
661
|
+
data_[i].~T();
|
662
662
|
} else {
|
663
663
|
// skip gap or anything unused at the end
|
664
664
|
for (size_t i = 0; i < h_; ++i)
|
665
|
-
|
665
|
+
data_[i].~T();
|
666
666
|
|
667
667
|
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
668
|
-
|
668
|
+
data_[i].~T();
|
669
669
|
}
|
670
670
|
|
671
671
|
if (curr_items_alloc_ < prev_alloc) {
|
@@ -754,10 +754,10 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
|
|
754
754
|
const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
|
755
755
|
for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
|
756
756
|
if (i == h_ && print_gap) {
|
757
|
-
os <<
|
757
|
+
os << display_idx << ": GAP" << std::endl;
|
758
758
|
++display_idx;
|
759
759
|
} else {
|
760
|
-
os <<
|
760
|
+
os << display_idx << ": " << data_[i] << "\twt = ";
|
761
761
|
if (weights_[i] == -1.0) {
|
762
762
|
os << get_tau() << "\t(-1.0)" << std::endl;
|
763
763
|
} else {
|
@@ -990,7 +990,7 @@ void var_opt_sketch<T, A>::grow_data_arrays() {
|
|
990
990
|
|
991
991
|
for (uint32_t i = 0; i < prev_size; ++i) {
|
992
992
|
new (&tmp_data[i]) T(std::move(data_[i]));
|
993
|
-
|
993
|
+
data_[i].~T();
|
994
994
|
tmp_weights[i] = weights_[i];
|
995
995
|
}
|
996
996
|
|
@@ -1531,7 +1531,6 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const var_opt_sketch& sk, b
|
|
1531
1531
|
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1532
1532
|
}
|
1533
1533
|
|
1534
|
-
|
1535
1534
|
template<typename T, typename A>
|
1536
1535
|
var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other) :
|
1537
1536
|
sk_(other.sk_),
|
@@ -1543,6 +1542,9 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other
|
|
1543
1542
|
|
1544
1543
|
template<typename T, typename A>
|
1545
1544
|
typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_iterator::operator++() {
|
1545
|
+
// accumulate weight already visited
|
1546
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1547
|
+
|
1546
1548
|
++idx_;
|
1547
1549
|
|
1548
1550
|
if (idx_ == final_idx_) {
|
@@ -1551,7 +1553,6 @@ typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_itera
|
|
1551
1553
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1552
1554
|
++idx_;
|
1553
1555
|
}
|
1554
|
-
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1555
1556
|
return *this;
|
1556
1557
|
}
|
1557
1558
|
|
@@ -1575,14 +1576,19 @@ bool var_opt_sketch<T, A>::const_iterator::operator!=(const const_iterator& othe
|
|
1575
1576
|
}
|
1576
1577
|
|
1577
1578
|
template<typename T, typename A>
|
1578
|
-
|
1579
|
+
auto var_opt_sketch<T, A>::const_iterator::operator*() const -> reference {
|
1579
1580
|
double wt;
|
1580
1581
|
if (idx_ < sk_->h_) {
|
1581
1582
|
wt = sk_->weights_[idx_];
|
1582
1583
|
} else {
|
1583
1584
|
wt = r_item_wt_;
|
1584
1585
|
}
|
1585
|
-
return
|
1586
|
+
return value_type(sk_->data_[idx_], wt);
|
1587
|
+
}
|
1588
|
+
|
1589
|
+
template<typename T, typename A>
|
1590
|
+
auto var_opt_sketch<T, A>::const_iterator::operator->() const -> pointer {
|
1591
|
+
return **this;
|
1586
1592
|
}
|
1587
1593
|
|
1588
1594
|
template<typename T, typename A>
|
@@ -1622,6 +1628,9 @@ var_opt_sketch<T, A>::iterator::iterator(const iterator& other) :
|
|
1622
1628
|
|
1623
1629
|
template<typename T, typename A>
|
1624
1630
|
typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operator++() {
|
1631
|
+
// accumulate weight already visited
|
1632
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1633
|
+
|
1625
1634
|
++idx_;
|
1626
1635
|
|
1627
1636
|
if (idx_ == final_idx_) {
|
@@ -1630,7 +1639,7 @@ typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operato
|
|
1630
1639
|
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1631
1640
|
++idx_;
|
1632
1641
|
}
|
1633
|
-
|
1642
|
+
|
1634
1643
|
return *this;
|
1635
1644
|
}
|
1636
1645
|
|
@@ -1654,7 +1663,7 @@ bool var_opt_sketch<T, A>::iterator::operator!=(const iterator& other) const {
|
|
1654
1663
|
}
|
1655
1664
|
|
1656
1665
|
template<typename T, typename A>
|
1657
|
-
|
1666
|
+
auto var_opt_sketch<T, A>::iterator::operator*() -> reference {
|
1658
1667
|
double wt;
|
1659
1668
|
if (idx_ < sk_->h_) {
|
1660
1669
|
wt = sk_->weights_[idx_];
|
@@ -1663,7 +1672,12 @@ std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
|
|
1663
1672
|
} else {
|
1664
1673
|
wt = r_item_wt_;
|
1665
1674
|
}
|
1666
|
-
return
|
1675
|
+
return value_type(sk_->data_[idx_], wt);
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
template<typename T, typename A>
|
1679
|
+
auto var_opt_sketch<T, A>::iterator::operator->() -> pointer {
|
1680
|
+
return **this;
|
1667
1681
|
}
|
1668
1682
|
|
1669
1683
|
template<typename T, typename A>
|
@@ -153,6 +153,8 @@ public:
|
|
153
153
|
|
154
154
|
private:
|
155
155
|
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
|
156
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
|
157
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
|
156
158
|
|
157
159
|
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
158
160
|
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
@@ -170,10 +172,12 @@ private:
|
|
170
172
|
|
171
173
|
uint32_t max_k_;
|
172
174
|
|
175
|
+
A allocator_;
|
176
|
+
|
173
177
|
var_opt_sketch<T, A> gadget_;
|
174
178
|
|
175
179
|
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
176
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget);
|
180
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
|
177
181
|
|
178
182
|
/*
|
179
183
|
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
@@ -34,6 +34,7 @@ var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
34
34
|
outer_tau_numer_(0.0),
|
35
35
|
outer_tau_denom_(0),
|
36
36
|
max_k_(max_k),
|
37
|
+
allocator_(allocator),
|
37
38
|
gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
38
39
|
{}
|
39
40
|
|
@@ -43,6 +44,7 @@ var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
|
|
43
44
|
outer_tau_numer_(other.outer_tau_numer_),
|
44
45
|
outer_tau_denom_(other.outer_tau_denom_),
|
45
46
|
max_k_(other.max_k_),
|
47
|
+
allocator_(other.allocator_),
|
46
48
|
gadget_(other.gadget_)
|
47
49
|
{}
|
48
50
|
|
@@ -52,16 +54,18 @@ var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
|
|
52
54
|
outer_tau_numer_(other.outer_tau_numer_),
|
53
55
|
outer_tau_denom_(other.outer_tau_denom_),
|
54
56
|
max_k_(other.max_k_),
|
57
|
+
allocator_(other.allocator_),
|
55
58
|
gadget_(std::move(other.gadget_))
|
56
59
|
{}
|
57
60
|
|
58
61
|
template<typename T, typename A>
|
59
62
|
var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
60
|
-
uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
|
63
|
+
uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator) :
|
61
64
|
n_(n),
|
62
65
|
outer_tau_numer_(outer_tau_numer),
|
63
66
|
outer_tau_denom_(outer_tau_denom),
|
64
67
|
max_k_(max_k),
|
68
|
+
allocator_(allocator),
|
65
69
|
gadget_(gadget)
|
66
70
|
{}
|
67
71
|
|
@@ -75,6 +79,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other)
|
|
75
79
|
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
76
80
|
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
77
81
|
std::swap(max_k_, union_copy.max_k_);
|
82
|
+
std::swap(allocator_, other.allocator_);
|
78
83
|
std::swap(gadget_, union_copy.gadget_);
|
79
84
|
return *this;
|
80
85
|
}
|
@@ -85,6 +90,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
|
|
85
90
|
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
86
91
|
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
87
92
|
std::swap(max_k_, other.max_k_);
|
93
|
+
std::swap(allocator_, other.allocator_);
|
88
94
|
std::swap(gadget_, other.gadget_);
|
89
95
|
return *this;
|
90
96
|
}
|
@@ -162,7 +168,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
|
|
162
168
|
if (!is.good())
|
163
169
|
throw std::runtime_error("error reading from std::istream");
|
164
170
|
|
165
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
171
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
166
172
|
}
|
167
173
|
|
168
174
|
template<typename T, typename A>
|
@@ -204,7 +210,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
|
|
204
210
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
205
211
|
var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
|
206
212
|
|
207
|
-
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
213
|
+
return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
|
208
214
|
}
|
209
215
|
|
210
216
|
template<typename T, typename A>
|
@@ -508,9 +514,8 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
508
514
|
uint32_t result_r = 0;
|
509
515
|
size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
|
510
516
|
|
511
|
-
|
512
|
-
|
513
|
-
T* data = A().allocate(result_k + 1);
|
517
|
+
double* wts = AllocDouble(allocator_).allocate(result_k + 1);
|
518
|
+
T* data = A(allocator_).allocate(result_k + 1);
|
514
519
|
|
515
520
|
// insert R region items, ignoring weights
|
516
521
|
// Currently (May 2017) this next block is unreachable; this coercer is used only in the
|
@@ -519,7 +524,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
519
524
|
// Addendum (Jan 2020): Cleanup at end of method assumes R count is 0
|
520
525
|
const size_t final_idx = gadget_.get_num_samples();
|
521
526
|
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
|
522
|
-
|
527
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
523
528
|
wts[next_r_pos] = gadget_.weights_[idx];
|
524
529
|
++result_r;
|
525
530
|
--next_r_pos;
|
@@ -530,20 +535,20 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
530
535
|
// insert H region items
|
531
536
|
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
|
532
537
|
if (gadget_.marks_[idx]) {
|
533
|
-
|
538
|
+
new (&data[next_r_pos]) T(gadget_.data_[idx]);
|
534
539
|
wts[next_r_pos] = -1.0;
|
535
540
|
transferred_weight += gadget_.weights_[idx];
|
536
541
|
++result_r;
|
537
542
|
--next_r_pos;
|
538
543
|
} else {
|
539
|
-
|
544
|
+
new (&data[result_h]) T(gadget_.data_[idx]);
|
540
545
|
wts[result_h] = gadget_.weights_[idx];
|
541
546
|
++result_h;
|
542
547
|
}
|
543
548
|
}
|
544
549
|
|
545
550
|
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
546
|
-
if (
|
551
|
+
if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
547
552
|
throw std::logic_error("uexpected mismatch in transferred weight");
|
548
553
|
}
|
549
554
|
|
@@ -554,11 +559,10 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
|
|
554
559
|
wts[result_h] = -1.0;
|
555
560
|
|
556
561
|
// clean up arrays in input sketch, replace with new values
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
A().deallocate(sk.data_, sk.curr_items_alloc_);
|
562
|
+
AllocBool(allocator_).deallocate(sk.marks_, sk.curr_items_alloc_);
|
563
|
+
AllocDouble(allocator_).deallocate(sk.weights_, sk.curr_items_alloc_);
|
564
|
+
for (size_t i = 0; i < result_k; ++i) { sk.data_[i].~T(); } // assumes everything in H region, no gap
|
565
|
+
A(allocator_).deallocate(sk.data_, sk.curr_items_alloc_);
|
562
566
|
|
563
567
|
sk.data_ = data;
|
564
568
|
sk.weights_ = wts;
|
@@ -52,17 +52,15 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2)
|
|
52
52
|
REQUIRE(sk1.get_k() == sk2.get_k());
|
53
53
|
REQUIRE(sk1.get_n() == sk2.get_n());
|
54
54
|
REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
|
55
|
-
|
55
|
+
|
56
56
|
auto it1 = sk1.begin();
|
57
57
|
auto it2 = sk2.begin();
|
58
|
-
size_t i = 0;
|
59
58
|
|
60
59
|
while ((it1 != sk1.end()) && (it2 != sk2.end())) {
|
61
|
-
|
62
|
-
|
60
|
+
auto p1 = *it1;
|
61
|
+
auto p2 = *it2;
|
63
62
|
REQUIRE(p1.first == p2.first); // data values
|
64
63
|
REQUIRE(p1.second == p2.second); // weights
|
65
|
-
++i;
|
66
64
|
++it1;
|
67
65
|
++it2;
|
68
66
|
}
|
@@ -182,7 +180,7 @@ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
|
|
182
180
|
var_opt_sketch<std::string> sk(100, resize_factor::X2);
|
183
181
|
REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
|
184
182
|
|
185
|
-
// should not throw but sketch
|
183
|
+
// should not throw but sketch should still be empty
|
186
184
|
sk.update("zero weight", 0.0);
|
187
185
|
REQUIRE(sk.is_empty());
|
188
186
|
}
|
@@ -213,7 +211,7 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
213
211
|
|
214
212
|
double input_sum = 0.0;
|
215
213
|
for (size_t i = 0; i < n; ++i) {
|
216
|
-
// generate weights
|
214
|
+
// generate weights above and below 1.0 using w ~ exp(5*N(0,1))
|
217
215
|
// which covers about 10 orders of magnitude
|
218
216
|
double w = std::exp(5 * N(rand));
|
219
217
|
input_sum += w;
|
@@ -221,12 +219,12 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
221
219
|
}
|
222
220
|
|
223
221
|
double output_sum = 0.0;
|
224
|
-
for (auto
|
225
|
-
output_sum +=
|
222
|
+
for (auto pair : sk) { // std::pair<int, weight>
|
223
|
+
output_sum += pair.second;
|
226
224
|
}
|
227
225
|
|
228
226
|
double weight_ratio = output_sum / input_sum;
|
229
|
-
REQUIRE(
|
227
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
230
228
|
}
|
231
229
|
|
232
230
|
TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") {
|
@@ -275,26 +273,38 @@ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") {
|
|
275
273
|
sk.update(100, 100.0);
|
276
274
|
sk.update(101, 101.0);
|
277
275
|
|
276
|
+
subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
|
277
|
+
double total_weight = summary.total_sketch_weight;
|
278
|
+
double cum_weight = 0.0;
|
279
|
+
for (auto pair : sk) {
|
280
|
+
cum_weight += pair.second;
|
281
|
+
}
|
282
|
+
double weight_ratio = cum_weight / total_weight;
|
283
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
284
|
+
|
278
285
|
// first 2 entries should be heavy and in heap order (smallest at root)
|
279
286
|
auto it = sk.begin();
|
280
|
-
|
287
|
+
auto p1 = *it;
|
281
288
|
++it;
|
282
|
-
|
289
|
+
auto p2 = *it;
|
283
290
|
REQUIRE(p1.second == Approx(100.0).margin(EPS));
|
284
291
|
REQUIRE(p2.second == Approx(101.0).margin(EPS));
|
285
292
|
REQUIRE(p1.first == 100);
|
286
293
|
REQUIRE(p2.first == 101);
|
294
|
+
// using operator ->
|
295
|
+
REQUIRE(it->first == p2.first);
|
296
|
+
REQUIRE(it->second == p2.second);
|
287
297
|
|
288
298
|
// check for 4 preamble longs
|
289
299
|
auto bytes = sk.serialize();
|
290
300
|
REQUIRE((bytes.data()[0] & 0x3f) == 4);; // PREAMBLE_LONGS_WARMUP
|
291
301
|
|
292
|
-
|
302
|
+
auto sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
|
293
303
|
check_if_equal(sk, sk_from_bytes);
|
294
304
|
|
295
305
|
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
296
306
|
sk.serialize(ss);
|
297
|
-
|
307
|
+
auto sk_from_stream = var_opt_sketch<int>::deserialize(ss);
|
298
308
|
check_if_equal(sk, sk_from_stream);
|
299
309
|
|
300
310
|
// ensure we unroll properly
|
@@ -340,6 +350,15 @@ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") {
|
|
340
350
|
auto it = sk.begin();
|
341
351
|
double wt = (*it).second;
|
342
352
|
REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
|
353
|
+
|
354
|
+
subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
|
355
|
+
double total_weight = summary.total_sketch_weight;
|
356
|
+
double cum_weight = 0.0;
|
357
|
+
for (auto pair : sk) {
|
358
|
+
cum_weight += pair.second;
|
359
|
+
}
|
360
|
+
double weight_ratio = cum_weight / total_weight;
|
361
|
+
REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
|
343
362
|
}
|
344
363
|
|
345
364
|
TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
|
@@ -57,7 +57,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
|
|
57
57
|
|
58
58
|
auto it1 = sk1.begin();
|
59
59
|
auto it2 = sk2.begin();
|
60
|
-
size_t i = 0;
|
61
60
|
|
62
61
|
while ((it1 != sk1.end()) && (it2 != sk2.end())) {
|
63
62
|
const std::pair<const T&, const double> p1 = *it1;
|
@@ -66,7 +65,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
|
|
66
65
|
REQUIRE(p1.first == p2.first); // data values
|
67
66
|
}
|
68
67
|
REQUIRE(p1.second == p2.second); // weight values
|
69
|
-
++i;
|
70
68
|
++it1;
|
71
69
|
++it2;
|
72
70
|
}
|
@@ -100,7 +100,7 @@ setup(
|
|
100
100
|
url='http://datasketches.apache.org',
|
101
101
|
long_description=open('python/README.md').read(),
|
102
102
|
long_description_content_type='text/markdown',
|
103
|
-
packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the .so
|
103
|
+
packages=find_packages(where='python',exclude=['src','include','*tests*']), # src not needed if only the .so
|
104
104
|
package_dir={'':'python'},
|
105
105
|
# may need to add all source paths for sdist packages w/o MANIFEST.in
|
106
106
|
ext_modules=[CMakeExtension('datasketches')],
|