datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -346,14 +346,21 @@ class var_opt_sketch {
346
346
  };
347
347
 
348
348
  template<typename T, typename A>
349
- class var_opt_sketch<T, A>::const_iterator : public std::iterator<std::input_iterator_tag, T> {
349
+ class var_opt_sketch<T, A>::const_iterator {
350
350
  public:
351
+ using iterator_category = std::input_iterator_tag;
352
+ using value_type = std::pair<const T&, const double>;
353
+ using difference_type = void;
354
+ using pointer = const return_value_holder<value_type>;
355
+ using reference = const value_type;
356
+
351
357
  const_iterator(const const_iterator& other);
352
358
  const_iterator& operator++();
353
359
  const_iterator& operator++(int);
354
360
  bool operator==(const const_iterator& other) const;
355
361
  bool operator!=(const const_iterator& other) const;
356
- const std::pair<const T&, const double> operator*() const;
362
+ reference operator*() const;
363
+ pointer operator->() const;
357
364
 
358
365
  private:
359
366
  friend class var_opt_sketch<T, A>;
@@ -362,8 +369,8 @@ private:
362
369
  // default iterator over full sketch
363
370
  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
364
371
 
365
- // iterates over only one of the H or R region, optionally applying weight correction
366
- // to R region (can correct for numerical precision issues)
372
+ // iterates over only one of the H or R regions
373
+ // does not apply weight correction
367
374
  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
368
375
 
369
376
  bool get_mark() const;
@@ -377,14 +384,21 @@ private:
377
384
 
378
385
  // non-const iterator for internal use
379
386
  template<typename T, typename A>
380
- class var_opt_sketch<T, A>::iterator : public std::iterator<std::input_iterator_tag, T> {
387
+ class var_opt_sketch<T, A>::iterator {
381
388
  public:
389
+ using iterator_category = std::input_iterator_tag;
390
+ using value_type = std::pair<T&, double>;
391
+ using difference_type = void;
392
+ using pointer = return_value_holder<value_type>;
393
+ using reference = value_type;
394
+
382
395
  iterator(const iterator& other);
383
396
  iterator& operator++();
384
397
  iterator& operator++(int);
385
398
  bool operator==(const iterator& other) const;
386
399
  bool operator!=(const iterator& other) const;
387
- std::pair<T&, double> operator*();
400
+ reference operator*();
401
+ pointer operator->();
388
402
 
389
403
  private:
390
404
  friend class var_opt_sketch<T, A>;
@@ -189,16 +189,16 @@ var_opt_sketch<T, A>::~var_opt_sketch() {
189
189
  // destroy everything
190
190
  const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
191
191
  for (size_t i = 0; i < num_to_destroy; ++i) {
192
- allocator_.destroy(data_ + i);
192
+ data_[i].~T();
193
193
  }
194
194
  } else {
195
195
  // skip gap or anything unused at the end
196
196
  for (size_t i = 0; i < h_; ++i) {
197
- allocator_.destroy(data_+ i);
197
+ data_[i].~T();
198
198
  }
199
199
 
200
200
  for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
201
- allocator_.destroy(data_ + i);
201
+ data_[i].~T();
202
202
  }
203
203
  }
204
204
  allocator_.deallocate(data_, curr_items_alloc_);
@@ -658,14 +658,14 @@ void var_opt_sketch<T, A>::reset() {
658
658
  // destroy everything
659
659
  const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
660
660
  for (size_t i = 0; i < num_to_destroy; ++i)
661
- allocator_.destroy(data_ + i);
661
+ data_[i].~T();
662
662
  } else {
663
663
  // skip gap or anything unused at the end
664
664
  for (size_t i = 0; i < h_; ++i)
665
- allocator_.destroy(data_+ i);
665
+ data_[i].~T();
666
666
 
667
667
  for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
668
- allocator_.destroy(data_ + i);
668
+ data_[i].~T();
669
669
  }
670
670
 
671
671
  if (curr_items_alloc_ < prev_alloc) {
@@ -754,10 +754,10 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
754
754
  const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
755
755
  for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
756
756
  if (i == h_ && print_gap) {
757
- os << i << ": GAP" << std::endl;
757
+ os << display_idx << ": GAP" << std::endl;
758
758
  ++display_idx;
759
759
  } else {
760
- os << i << ": " << data_[i] << "\twt = ";
760
+ os << display_idx << ": " << data_[i] << "\twt = ";
761
761
  if (weights_[i] == -1.0) {
762
762
  os << get_tau() << "\t(-1.0)" << std::endl;
763
763
  } else {
@@ -990,7 +990,7 @@ void var_opt_sketch<T, A>::grow_data_arrays() {
990
990
 
991
991
  for (uint32_t i = 0; i < prev_size; ++i) {
992
992
  new (&tmp_data[i]) T(std::move(data_[i]));
993
- allocator_.destroy(data_ + i);
993
+ data_[i].~T();
994
994
  tmp_weights[i] = weights_[i];
995
995
  }
996
996
 
@@ -1531,7 +1531,6 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const var_opt_sketch& sk, b
1531
1531
  if (idx_ == final_idx_) { sk_ = nullptr; }
1532
1532
  }
1533
1533
 
1534
-
1535
1534
  template<typename T, typename A>
1536
1535
  var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other) :
1537
1536
  sk_(other.sk_),
@@ -1543,6 +1542,9 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other
1543
1542
 
1544
1543
  template<typename T, typename A>
1545
1544
  typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_iterator::operator++() {
1545
+ // accumulate weight already visited
1546
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1547
+
1546
1548
  ++idx_;
1547
1549
 
1548
1550
  if (idx_ == final_idx_) {
@@ -1551,7 +1553,6 @@ typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_itera
1551
1553
  } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1552
1554
  ++idx_;
1553
1555
  }
1554
- if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1555
1556
  return *this;
1556
1557
  }
1557
1558
 
@@ -1575,14 +1576,19 @@ bool var_opt_sketch<T, A>::const_iterator::operator!=(const const_iterator& othe
1575
1576
  }
1576
1577
 
1577
1578
  template<typename T, typename A>
1578
- const std::pair<const T&, const double> var_opt_sketch<T, A>::const_iterator::operator*() const {
1579
+ auto var_opt_sketch<T, A>::const_iterator::operator*() const -> reference {
1579
1580
  double wt;
1580
1581
  if (idx_ < sk_->h_) {
1581
1582
  wt = sk_->weights_[idx_];
1582
1583
  } else {
1583
1584
  wt = r_item_wt_;
1584
1585
  }
1585
- return std::pair<const T&, const double>(sk_->data_[idx_], wt);
1586
+ return value_type(sk_->data_[idx_], wt);
1587
+ }
1588
+
1589
+ template<typename T, typename A>
1590
+ auto var_opt_sketch<T, A>::const_iterator::operator->() const -> pointer {
1591
+ return **this;
1586
1592
  }
1587
1593
 
1588
1594
  template<typename T, typename A>
@@ -1622,6 +1628,9 @@ var_opt_sketch<T, A>::iterator::iterator(const iterator& other) :
1622
1628
 
1623
1629
  template<typename T, typename A>
1624
1630
  typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operator++() {
1631
+ // accumulate weight already visited
1632
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1633
+
1625
1634
  ++idx_;
1626
1635
 
1627
1636
  if (idx_ == final_idx_) {
@@ -1630,7 +1639,7 @@ typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operato
1630
1639
  } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1631
1640
  ++idx_;
1632
1641
  }
1633
- if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1642
+
1634
1643
  return *this;
1635
1644
  }
1636
1645
 
@@ -1654,7 +1663,7 @@ bool var_opt_sketch<T, A>::iterator::operator!=(const iterator& other) const {
1654
1663
  }
1655
1664
 
1656
1665
  template<typename T, typename A>
1657
- std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
1666
+ auto var_opt_sketch<T, A>::iterator::operator*() -> reference {
1658
1667
  double wt;
1659
1668
  if (idx_ < sk_->h_) {
1660
1669
  wt = sk_->weights_[idx_];
@@ -1663,7 +1672,12 @@ std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
1663
1672
  } else {
1664
1673
  wt = r_item_wt_;
1665
1674
  }
1666
- return std::pair<T&, double>(sk_->data_[idx_], wt);
1675
+ return value_type(sk_->data_[idx_], wt);
1676
+ }
1677
+
1678
+ template<typename T, typename A>
1679
+ auto var_opt_sketch<T, A>::iterator::operator->() -> pointer {
1680
+ return **this;
1667
1681
  }
1668
1682
 
1669
1683
  template<typename T, typename A>
@@ -153,6 +153,8 @@ public:
153
153
 
154
154
  private:
155
155
  typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
156
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
157
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
156
158
 
157
159
  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
158
160
  static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
@@ -170,10 +172,12 @@ private:
170
172
 
171
173
  uint32_t max_k_;
172
174
 
175
+ A allocator_;
176
+
173
177
  var_opt_sketch<T, A> gadget_;
174
178
 
175
179
  var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
176
- uint32_t max_k, var_opt_sketch<T, A>&& gadget);
180
+ uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
177
181
 
178
182
  /*
179
183
  IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
@@ -34,6 +34,7 @@ var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
34
34
  outer_tau_numer_(0.0),
35
35
  outer_tau_denom_(0),
36
36
  max_k_(max_k),
37
+ allocator_(allocator),
37
38
  gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
38
39
  {}
39
40
 
@@ -43,6 +44,7 @@ var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
43
44
  outer_tau_numer_(other.outer_tau_numer_),
44
45
  outer_tau_denom_(other.outer_tau_denom_),
45
46
  max_k_(other.max_k_),
47
+ allocator_(other.allocator_),
46
48
  gadget_(other.gadget_)
47
49
  {}
48
50
 
@@ -52,16 +54,18 @@ var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
52
54
  outer_tau_numer_(other.outer_tau_numer_),
53
55
  outer_tau_denom_(other.outer_tau_denom_),
54
56
  max_k_(other.max_k_),
57
+ allocator_(other.allocator_),
55
58
  gadget_(std::move(other.gadget_))
56
59
  {}
57
60
 
58
61
  template<typename T, typename A>
59
62
  var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
60
- uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
63
+ uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator) :
61
64
  n_(n),
62
65
  outer_tau_numer_(outer_tau_numer),
63
66
  outer_tau_denom_(outer_tau_denom),
64
67
  max_k_(max_k),
68
+ allocator_(allocator),
65
69
  gadget_(gadget)
66
70
  {}
67
71
 
@@ -75,6 +79,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other)
75
79
  std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
76
80
  std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
77
81
  std::swap(max_k_, union_copy.max_k_);
82
+ std::swap(allocator_, other.allocator_);
78
83
  std::swap(gadget_, union_copy.gadget_);
79
84
  return *this;
80
85
  }
@@ -85,6 +90,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
85
90
  std::swap(outer_tau_numer_, other.outer_tau_numer_);
86
91
  std::swap(outer_tau_denom_, other.outer_tau_denom_);
87
92
  std::swap(max_k_, other.max_k_);
93
+ std::swap(allocator_, other.allocator_);
88
94
  std::swap(gadget_, other.gadget_);
89
95
  return *this;
90
96
  }
@@ -162,7 +168,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
162
168
  if (!is.good())
163
169
  throw std::runtime_error("error reading from std::istream");
164
170
 
165
- return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
171
+ return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
166
172
  }
167
173
 
168
174
  template<typename T, typename A>
@@ -204,7 +210,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
204
210
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
205
211
  var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
206
212
 
207
- return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
+ return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
208
214
  }
209
215
 
210
216
  template<typename T, typename A>
@@ -508,9 +514,8 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
508
514
  uint32_t result_r = 0;
509
515
  size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
510
516
 
511
- typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
512
- double* wts = AllocDouble().allocate(result_k + 1);
513
- T* data = A().allocate(result_k + 1);
517
+ double* wts = AllocDouble(allocator_).allocate(result_k + 1);
518
+ T* data = A(allocator_).allocate(result_k + 1);
514
519
 
515
520
  // insert R region items, ignoring weights
516
521
  // Currently (May 2017) this next block is unreachable; this coercer is used only in the
@@ -519,7 +524,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
519
524
  // Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
520
525
  const size_t final_idx = gadget_.get_num_samples();
521
526
  for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
522
- A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
527
+ new (&data[next_r_pos]) T(gadget_.data_[idx]);
523
528
  wts[next_r_pos] = gadget_.weights_[idx];
524
529
  ++result_r;
525
530
  --next_r_pos;
@@ -530,20 +535,20 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
530
535
  // insert H region items
531
536
  for (size_t idx = 0; idx < gadget_.h_; ++idx) {
532
537
  if (gadget_.marks_[idx]) {
533
- A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
538
+ new (&data[next_r_pos]) T(gadget_.data_[idx]);
534
539
  wts[next_r_pos] = -1.0;
535
540
  transferred_weight += gadget_.weights_[idx];
536
541
  ++result_r;
537
542
  --next_r_pos;
538
543
  } else {
539
- A().construct(&data[result_h], T(gadget_.data_[idx]));
544
+ new (&data[result_h]) T(gadget_.data_[idx]);
540
545
  wts[result_h] = gadget_.weights_[idx];
541
546
  ++result_h;
542
547
  }
543
548
  }
544
549
 
545
550
  if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
546
- if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
551
+ if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
547
552
  throw std::logic_error("uexpected mismatch in transferred weight");
548
553
  }
549
554
 
@@ -554,11 +559,10 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
554
559
  wts[result_h] = -1.0;
555
560
 
556
561
  // clean up arrays in input sketch, replace with new values
557
- typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
558
- AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
559
- AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
560
- for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
561
- A().deallocate(sk.data_, sk.curr_items_alloc_);
562
+ AllocBool(allocator_).deallocate(sk.marks_, sk.curr_items_alloc_);
563
+ AllocDouble(allocator_).deallocate(sk.weights_, sk.curr_items_alloc_);
564
+ for (size_t i = 0; i < result_k; ++i) { sk.data_[i].~T(); } // assumes everything in H region, no gap
565
+ A(allocator_).deallocate(sk.data_, sk.curr_items_alloc_);
562
566
 
563
567
  sk.data_ = data;
564
568
  sk.weights_ = wts;
@@ -52,17 +52,15 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2)
52
52
  REQUIRE(sk1.get_k() == sk2.get_k());
53
53
  REQUIRE(sk1.get_n() == sk2.get_n());
54
54
  REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
55
-
55
+
56
56
  auto it1 = sk1.begin();
57
57
  auto it2 = sk2.begin();
58
- size_t i = 0;
59
58
 
60
59
  while ((it1 != sk1.end()) && (it2 != sk2.end())) {
61
- const std::pair<const T&, const double> p1 = *it1;
62
- const std::pair<const T&, const double> p2 = *it2;
60
+ auto p1 = *it1;
61
+ auto p2 = *it2;
63
62
  REQUIRE(p1.first == p2.first); // data values
64
63
  REQUIRE(p1.second == p2.second); // weights
65
- ++i;
66
64
  ++it1;
67
65
  ++it2;
68
66
  }
@@ -182,7 +180,7 @@ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
182
180
  var_opt_sketch<std::string> sk(100, resize_factor::X2);
183
181
  REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
184
182
 
185
- // should not throw but sketch shoulds till be empty
183
+ // should not throw but sketch should still be empty
186
184
  sk.update("zero weight", 0.0);
187
185
  REQUIRE(sk.is_empty());
188
186
  }
@@ -213,7 +211,7 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
213
211
 
214
212
  double input_sum = 0.0;
215
213
  for (size_t i = 0; i < n; ++i) {
216
- // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
214
+ // generate weights above and below 1.0 using w ~ exp(5*N(0,1))
217
215
  // which covers about 10 orders of magnitude
218
216
  double w = std::exp(5 * N(rand));
219
217
  input_sum += w;
@@ -221,12 +219,12 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
221
219
  }
222
220
 
223
221
  double output_sum = 0.0;
224
- for (auto it : sk) { // std::pair<int, weight>
225
- output_sum += it.second;
222
+ for (auto pair : sk) { // std::pair<int, weight>
223
+ output_sum += pair.second;
226
224
  }
227
225
 
228
226
  double weight_ratio = output_sum / input_sum;
229
- REQUIRE(std::abs(weight_ratio - 1.0) == Approx(0).margin(EPS));
227
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
230
228
  }
231
229
 
232
230
  TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") {
@@ -275,26 +273,38 @@ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") {
275
273
  sk.update(100, 100.0);
276
274
  sk.update(101, 101.0);
277
275
 
276
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
277
+ double total_weight = summary.total_sketch_weight;
278
+ double cum_weight = 0.0;
279
+ for (auto pair : sk) {
280
+ cum_weight += pair.second;
281
+ }
282
+ double weight_ratio = cum_weight / total_weight;
283
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
284
+
278
285
  // first 2 entries should be heavy and in heap order (smallest at root)
279
286
  auto it = sk.begin();
280
- const std::pair<const int, const double> p1 = *it;
287
+ auto p1 = *it;
281
288
  ++it;
282
- const std::pair<const int, const double> p2 = *it;
289
+ auto p2 = *it;
283
290
  REQUIRE(p1.second == Approx(100.0).margin(EPS));
284
291
  REQUIRE(p2.second == Approx(101.0).margin(EPS));
285
292
  REQUIRE(p1.first == 100);
286
293
  REQUIRE(p2.first == 101);
294
+ // using operator ->
295
+ REQUIRE(it->first == p2.first);
296
+ REQUIRE(it->second == p2.second);
287
297
 
288
298
  // check for 4 preamble longs
289
299
  auto bytes = sk.serialize();
290
300
  REQUIRE((bytes.data()[0] & 0x3f) == 4);; // PREAMBLE_LONGS_WARMUP
291
301
 
292
- var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
302
+ auto sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
293
303
  check_if_equal(sk, sk_from_bytes);
294
304
 
295
305
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
296
306
  sk.serialize(ss);
297
- var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
307
+ auto sk_from_stream = var_opt_sketch<int>::deserialize(ss);
298
308
  check_if_equal(sk, sk_from_stream);
299
309
 
300
310
  // ensure we unroll properly
@@ -340,6 +350,15 @@ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") {
340
350
  auto it = sk.begin();
341
351
  double wt = (*it).second;
342
352
  REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
353
+
354
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
355
+ double total_weight = summary.total_sketch_weight;
356
+ double cum_weight = 0.0;
357
+ for (auto pair : sk) {
358
+ cum_weight += pair.second;
359
+ }
360
+ double weight_ratio = cum_weight / total_weight;
361
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
343
362
  }
344
363
 
345
364
  TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
@@ -57,7 +57,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
57
57
 
58
58
  auto it1 = sk1.begin();
59
59
  auto it2 = sk2.begin();
60
- size_t i = 0;
61
60
 
62
61
  while ((it1 != sk1.end()) && (it2 != sk2.end())) {
63
62
  const std::pair<const T&, const double> p1 = *it1;
@@ -66,7 +65,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
66
65
  REQUIRE(p1.first == p2.first); // data values
67
66
  }
68
67
  REQUIRE(p1.second == p2.second); // weight values
69
- ++i;
70
68
  ++it1;
71
69
  ++it2;
72
70
  }
@@ -100,7 +100,7 @@ setup(
100
100
  url='http://datasketches.apache.org',
101
101
  long_description=open('python/README.md').read(),
102
102
  long_description_content_type='text/markdown',
103
- packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the .so
103
+ packages=find_packages(where='python',exclude=['src','include','*tests*']), # src not needed if only the .so
104
104
  package_dir={'':'python'},
105
105
  # may need to add all source paths for sdist packages w/o MANIFEST.in
106
106
  ext_modules=[CMakeExtension('datasketches')],
@@ -62,4 +62,5 @@ install(FILES
62
62
  include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
63
  include/compact_theta_sketch_parser.hpp
64
64
  include/compact_theta_sketch_parser_impl.hpp
65
+ include/bit_packing.hpp
65
66
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")