datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -346,14 +346,21 @@ class var_opt_sketch {
346
346
  };
347
347
 
348
348
  template<typename T, typename A>
349
- class var_opt_sketch<T, A>::const_iterator : public std::iterator<std::input_iterator_tag, T> {
349
+ class var_opt_sketch<T, A>::const_iterator {
350
350
  public:
351
+ using iterator_category = std::input_iterator_tag;
352
+ using value_type = std::pair<const T&, const double>;
353
+ using difference_type = void;
354
+ using pointer = const return_value_holder<value_type>;
355
+ using reference = const value_type;
356
+
351
357
  const_iterator(const const_iterator& other);
352
358
  const_iterator& operator++();
353
359
  const_iterator& operator++(int);
354
360
  bool operator==(const const_iterator& other) const;
355
361
  bool operator!=(const const_iterator& other) const;
356
- const std::pair<const T&, const double> operator*() const;
362
+ reference operator*() const;
363
+ pointer operator->() const;
357
364
 
358
365
  private:
359
366
  friend class var_opt_sketch<T, A>;
@@ -362,8 +369,8 @@ private:
362
369
  // default iterator over full sketch
363
370
  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
364
371
 
365
- // iterates over only one of the H or R region, optionally applying weight correction
366
- // to R region (can correct for numerical precision issues)
372
+ // iterates over only one of the H or R regions
373
+ // does not apply weight correction
367
374
  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
368
375
 
369
376
  bool get_mark() const;
@@ -377,14 +384,21 @@ private:
377
384
 
378
385
  // non-const iterator for internal use
379
386
  template<typename T, typename A>
380
- class var_opt_sketch<T, A>::iterator : public std::iterator<std::input_iterator_tag, T> {
387
+ class var_opt_sketch<T, A>::iterator {
381
388
  public:
389
+ using iterator_category = std::input_iterator_tag;
390
+ using value_type = std::pair<T&, double>;
391
+ using difference_type = void;
392
+ using pointer = return_value_holder<value_type>;
393
+ using reference = value_type;
394
+
382
395
  iterator(const iterator& other);
383
396
  iterator& operator++();
384
397
  iterator& operator++(int);
385
398
  bool operator==(const iterator& other) const;
386
399
  bool operator!=(const iterator& other) const;
387
- std::pair<T&, double> operator*();
400
+ reference operator*();
401
+ pointer operator->();
388
402
 
389
403
  private:
390
404
  friend class var_opt_sketch<T, A>;
@@ -189,16 +189,16 @@ var_opt_sketch<T, A>::~var_opt_sketch() {
189
189
  // destroy everything
190
190
  const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
191
191
  for (size_t i = 0; i < num_to_destroy; ++i) {
192
- allocator_.destroy(data_ + i);
192
+ data_[i].~T();
193
193
  }
194
194
  } else {
195
195
  // skip gap or anything unused at the end
196
196
  for (size_t i = 0; i < h_; ++i) {
197
- allocator_.destroy(data_+ i);
197
+ data_[i].~T();
198
198
  }
199
199
 
200
200
  for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
201
- allocator_.destroy(data_ + i);
201
+ data_[i].~T();
202
202
  }
203
203
  }
204
204
  allocator_.deallocate(data_, curr_items_alloc_);
@@ -658,14 +658,14 @@ void var_opt_sketch<T, A>::reset() {
658
658
  // destroy everything
659
659
  const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
660
660
  for (size_t i = 0; i < num_to_destroy; ++i)
661
- allocator_.destroy(data_ + i);
661
+ data_[i].~T();
662
662
  } else {
663
663
  // skip gap or anything unused at the end
664
664
  for (size_t i = 0; i < h_; ++i)
665
- allocator_.destroy(data_+ i);
665
+ data_[i].~T();
666
666
 
667
667
  for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
668
- allocator_.destroy(data_ + i);
668
+ data_[i].~T();
669
669
  }
670
670
 
671
671
  if (curr_items_alloc_ < prev_alloc) {
@@ -754,10 +754,10 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
754
754
  const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
755
755
  for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
756
756
  if (i == h_ && print_gap) {
757
- os << i << ": GAP" << std::endl;
757
+ os << display_idx << ": GAP" << std::endl;
758
758
  ++display_idx;
759
759
  } else {
760
- os << i << ": " << data_[i] << "\twt = ";
760
+ os << display_idx << ": " << data_[i] << "\twt = ";
761
761
  if (weights_[i] == -1.0) {
762
762
  os << get_tau() << "\t(-1.0)" << std::endl;
763
763
  } else {
@@ -990,7 +990,7 @@ void var_opt_sketch<T, A>::grow_data_arrays() {
990
990
 
991
991
  for (uint32_t i = 0; i < prev_size; ++i) {
992
992
  new (&tmp_data[i]) T(std::move(data_[i]));
993
- allocator_.destroy(data_ + i);
993
+ data_[i].~T();
994
994
  tmp_weights[i] = weights_[i];
995
995
  }
996
996
 
@@ -1531,7 +1531,6 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const var_opt_sketch& sk, b
1531
1531
  if (idx_ == final_idx_) { sk_ = nullptr; }
1532
1532
  }
1533
1533
 
1534
-
1535
1534
  template<typename T, typename A>
1536
1535
  var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other) :
1537
1536
  sk_(other.sk_),
@@ -1543,6 +1542,9 @@ var_opt_sketch<T, A>::const_iterator::const_iterator(const const_iterator& other
1543
1542
 
1544
1543
  template<typename T, typename A>
1545
1544
  typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_iterator::operator++() {
1545
+ // accumulate weight already visited
1546
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1547
+
1546
1548
  ++idx_;
1547
1549
 
1548
1550
  if (idx_ == final_idx_) {
@@ -1551,7 +1553,6 @@ typename var_opt_sketch<T, A>::const_iterator& var_opt_sketch<T, A>::const_itera
1551
1553
  } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1552
1554
  ++idx_;
1553
1555
  }
1554
- if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1555
1556
  return *this;
1556
1557
  }
1557
1558
 
@@ -1575,14 +1576,19 @@ bool var_opt_sketch<T, A>::const_iterator::operator!=(const const_iterator& othe
1575
1576
  }
1576
1577
 
1577
1578
  template<typename T, typename A>
1578
- const std::pair<const T&, const double> var_opt_sketch<T, A>::const_iterator::operator*() const {
1579
+ auto var_opt_sketch<T, A>::const_iterator::operator*() const -> reference {
1579
1580
  double wt;
1580
1581
  if (idx_ < sk_->h_) {
1581
1582
  wt = sk_->weights_[idx_];
1582
1583
  } else {
1583
1584
  wt = r_item_wt_;
1584
1585
  }
1585
- return std::pair<const T&, const double>(sk_->data_[idx_], wt);
1586
+ return value_type(sk_->data_[idx_], wt);
1587
+ }
1588
+
1589
+ template<typename T, typename A>
1590
+ auto var_opt_sketch<T, A>::const_iterator::operator->() const -> pointer {
1591
+ return **this;
1586
1592
  }
1587
1593
 
1588
1594
  template<typename T, typename A>
@@ -1622,6 +1628,9 @@ var_opt_sketch<T, A>::iterator::iterator(const iterator& other) :
1622
1628
 
1623
1629
  template<typename T, typename A>
1624
1630
  typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operator++() {
1631
+ // accumulate weight already visited
1632
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1633
+
1625
1634
  ++idx_;
1626
1635
 
1627
1636
  if (idx_ == final_idx_) {
@@ -1630,7 +1639,7 @@ typename var_opt_sketch<T, A>::iterator& var_opt_sketch<T, A>::iterator::operato
1630
1639
  } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1631
1640
  ++idx_;
1632
1641
  }
1633
- if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1642
+
1634
1643
  return *this;
1635
1644
  }
1636
1645
 
@@ -1654,7 +1663,7 @@ bool var_opt_sketch<T, A>::iterator::operator!=(const iterator& other) const {
1654
1663
  }
1655
1664
 
1656
1665
  template<typename T, typename A>
1657
- std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
1666
+ auto var_opt_sketch<T, A>::iterator::operator*() -> reference {
1658
1667
  double wt;
1659
1668
  if (idx_ < sk_->h_) {
1660
1669
  wt = sk_->weights_[idx_];
@@ -1663,7 +1672,12 @@ std::pair<T&, double> var_opt_sketch<T, A>::iterator::operator*() {
1663
1672
  } else {
1664
1673
  wt = r_item_wt_;
1665
1674
  }
1666
- return std::pair<T&, double>(sk_->data_[idx_], wt);
1675
+ return value_type(sk_->data_[idx_], wt);
1676
+ }
1677
+
1678
+ template<typename T, typename A>
1679
+ auto var_opt_sketch<T, A>::iterator::operator->() -> pointer {
1680
+ return **this;
1667
1681
  }
1668
1682
 
1669
1683
  template<typename T, typename A>
@@ -153,6 +153,8 @@ public:
153
153
 
154
154
  private:
155
155
  typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>> AllocSketch;
156
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
157
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
156
158
 
157
159
  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
158
160
  static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
@@ -170,10 +172,12 @@ private:
170
172
 
171
173
  uint32_t max_k_;
172
174
 
175
+ A allocator_;
176
+
173
177
  var_opt_sketch<T, A> gadget_;
174
178
 
175
179
  var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
176
- uint32_t max_k, var_opt_sketch<T, A>&& gadget);
180
+ uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
177
181
 
178
182
  /*
179
183
  IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
@@ -34,6 +34,7 @@ var_opt_union<T, A>::var_opt_union(uint32_t max_k, const A& allocator) :
34
34
  outer_tau_numer_(0.0),
35
35
  outer_tau_denom_(0),
36
36
  max_k_(max_k),
37
+ allocator_(allocator),
37
38
  gadget_(max_k, var_opt_sketch<T, A>::DEFAULT_RESIZE_FACTOR, true, allocator)
38
39
  {}
39
40
 
@@ -43,6 +44,7 @@ var_opt_union<T, A>::var_opt_union(const var_opt_union& other) :
43
44
  outer_tau_numer_(other.outer_tau_numer_),
44
45
  outer_tau_denom_(other.outer_tau_denom_),
45
46
  max_k_(other.max_k_),
47
+ allocator_(other.allocator_),
46
48
  gadget_(other.gadget_)
47
49
  {}
48
50
 
@@ -52,16 +54,18 @@ var_opt_union<T, A>::var_opt_union(var_opt_union&& other) noexcept :
52
54
  outer_tau_numer_(other.outer_tau_numer_),
53
55
  outer_tau_denom_(other.outer_tau_denom_),
54
56
  max_k_(other.max_k_),
57
+ allocator_(other.allocator_),
55
58
  gadget_(std::move(other.gadget_))
56
59
  {}
57
60
 
58
61
  template<typename T, typename A>
59
62
  var_opt_union<T, A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
60
- uint32_t max_k, var_opt_sketch<T, A>&& gadget) :
63
+ uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator) :
61
64
  n_(n),
62
65
  outer_tau_numer_(outer_tau_numer),
63
66
  outer_tau_denom_(outer_tau_denom),
64
67
  max_k_(max_k),
68
+ allocator_(allocator),
65
69
  gadget_(gadget)
66
70
  {}
67
71
 
@@ -75,6 +79,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(const var_opt_union& other)
75
79
  std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
76
80
  std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
77
81
  std::swap(max_k_, union_copy.max_k_);
82
+ std::swap(allocator_, other.allocator_);
78
83
  std::swap(gadget_, union_copy.gadget_);
79
84
  return *this;
80
85
  }
@@ -85,6 +90,7 @@ var_opt_union<T, A>& var_opt_union<T, A>::operator=(var_opt_union&& other) {
85
90
  std::swap(outer_tau_numer_, other.outer_tau_numer_);
86
91
  std::swap(outer_tau_denom_, other.outer_tau_denom_);
87
92
  std::swap(max_k_, other.max_k_);
93
+ std::swap(allocator_, other.allocator_);
88
94
  std::swap(gadget_, other.gadget_);
89
95
  return *this;
90
96
  }
@@ -162,7 +168,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(std::istream& is, const Ser
162
168
  if (!is.good())
163
169
  throw std::runtime_error("error reading from std::istream");
164
170
 
165
- return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
171
+ return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
166
172
  }
167
173
 
168
174
  template<typename T, typename A>
@@ -204,7 +210,7 @@ var_opt_union<T, A> var_opt_union<T, A>::deserialize(const void* bytes, size_t s
204
210
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
205
211
  var_opt_sketch<T, A> gadget = var_opt_sketch<T, A>::deserialize(ptr, gadget_size, sd, allocator);
206
212
 
207
- return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
+ return var_opt_union(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget), allocator);
208
214
  }
209
215
 
210
216
  template<typename T, typename A>
@@ -508,9 +514,8 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
508
514
  uint32_t result_r = 0;
509
515
  size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
510
516
 
511
- typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
512
- double* wts = AllocDouble().allocate(result_k + 1);
513
- T* data = A().allocate(result_k + 1);
517
+ double* wts = AllocDouble(allocator_).allocate(result_k + 1);
518
+ T* data = A(allocator_).allocate(result_k + 1);
514
519
 
515
520
  // insert R region items, ignoring weights
516
521
  // Currently (May 2017) this next block is unreachable; this coercer is used only in the
@@ -519,7 +524,7 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
519
524
  // Addendum (Jan 2020): Cleanup at end of method assumes R count is 0
520
525
  const size_t final_idx = gadget_.get_num_samples();
521
526
  for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
522
- A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
527
+ new (&data[next_r_pos]) T(gadget_.data_[idx]);
523
528
  wts[next_r_pos] = gadget_.weights_[idx];
524
529
  ++result_r;
525
530
  --next_r_pos;
@@ -530,20 +535,20 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
530
535
  // insert H region items
531
536
  for (size_t idx = 0; idx < gadget_.h_; ++idx) {
532
537
  if (gadget_.marks_[idx]) {
533
- A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
538
+ new (&data[next_r_pos]) T(gadget_.data_[idx]);
534
539
  wts[next_r_pos] = -1.0;
535
540
  transferred_weight += gadget_.weights_[idx];
536
541
  ++result_r;
537
542
  --next_r_pos;
538
543
  } else {
539
- A().construct(&data[result_h], T(gadget_.data_[idx]));
544
+ new (&data[result_h]) T(gadget_.data_[idx]);
540
545
  wts[result_h] = gadget_.weights_[idx];
541
546
  ++result_h;
542
547
  }
543
548
  }
544
549
 
545
550
  if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
546
- if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
551
+ if (std::abs(transferred_weight - outer_tau_numer_) > 1e-10) {
547
552
  throw std::logic_error("uexpected mismatch in transferred weight");
548
553
  }
549
554
 
@@ -554,11 +559,10 @@ void var_opt_union<T, A>::mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) c
554
559
  wts[result_h] = -1.0;
555
560
 
556
561
  // clean up arrays in input sketch, replace with new values
557
- typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
558
- AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
559
- AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
560
- for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
561
- A().deallocate(sk.data_, sk.curr_items_alloc_);
562
+ AllocBool(allocator_).deallocate(sk.marks_, sk.curr_items_alloc_);
563
+ AllocDouble(allocator_).deallocate(sk.weights_, sk.curr_items_alloc_);
564
+ for (size_t i = 0; i < result_k; ++i) { sk.data_[i].~T(); } // assumes everything in H region, no gap
565
+ A(allocator_).deallocate(sk.data_, sk.curr_items_alloc_);
562
566
 
563
567
  sk.data_ = data;
564
568
  sk.weights_ = wts;
@@ -52,17 +52,15 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2)
52
52
  REQUIRE(sk1.get_k() == sk2.get_k());
53
53
  REQUIRE(sk1.get_n() == sk2.get_n());
54
54
  REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
55
-
55
+
56
56
  auto it1 = sk1.begin();
57
57
  auto it2 = sk2.begin();
58
- size_t i = 0;
59
58
 
60
59
  while ((it1 != sk1.end()) && (it2 != sk2.end())) {
61
- const std::pair<const T&, const double> p1 = *it1;
62
- const std::pair<const T&, const double> p2 = *it2;
60
+ auto p1 = *it1;
61
+ auto p2 = *it2;
63
62
  REQUIRE(p1.first == p2.first); // data values
64
63
  REQUIRE(p1.second == p2.second); // weights
65
- ++i;
66
64
  ++it1;
67
65
  ++it2;
68
66
  }
@@ -182,7 +180,7 @@ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
182
180
  var_opt_sketch<std::string> sk(100, resize_factor::X2);
183
181
  REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
184
182
 
185
- // should not throw but sketch shoulds till be empty
183
+ // should not throw but sketch should still be empty
186
184
  sk.update("zero weight", 0.0);
187
185
  REQUIRE(sk.is_empty());
188
186
  }
@@ -213,7 +211,7 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
213
211
 
214
212
  double input_sum = 0.0;
215
213
  for (size_t i = 0; i < n; ++i) {
216
- // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
214
+ // generate weights above and below 1.0 using w ~ exp(5*N(0,1))
217
215
  // which covers about 10 orders of magnitude
218
216
  double w = std::exp(5 * N(rand));
219
217
  input_sum += w;
@@ -221,12 +219,12 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
221
219
  }
222
220
 
223
221
  double output_sum = 0.0;
224
- for (auto it : sk) { // std::pair<int, weight>
225
- output_sum += it.second;
222
+ for (auto pair : sk) { // std::pair<int, weight>
223
+ output_sum += pair.second;
226
224
  }
227
225
 
228
226
  double weight_ratio = output_sum / input_sum;
229
- REQUIRE(std::abs(weight_ratio - 1.0) == Approx(0).margin(EPS));
227
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
230
228
  }
231
229
 
232
230
  TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") {
@@ -275,26 +273,38 @@ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") {
275
273
  sk.update(100, 100.0);
276
274
  sk.update(101, 101.0);
277
275
 
276
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
277
+ double total_weight = summary.total_sketch_weight;
278
+ double cum_weight = 0.0;
279
+ for (auto pair : sk) {
280
+ cum_weight += pair.second;
281
+ }
282
+ double weight_ratio = cum_weight / total_weight;
283
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
284
+
278
285
  // first 2 entries should be heavy and in heap order (smallest at root)
279
286
  auto it = sk.begin();
280
- const std::pair<const int, const double> p1 = *it;
287
+ auto p1 = *it;
281
288
  ++it;
282
- const std::pair<const int, const double> p2 = *it;
289
+ auto p2 = *it;
283
290
  REQUIRE(p1.second == Approx(100.0).margin(EPS));
284
291
  REQUIRE(p2.second == Approx(101.0).margin(EPS));
285
292
  REQUIRE(p1.first == 100);
286
293
  REQUIRE(p2.first == 101);
294
+ // using operator ->
295
+ REQUIRE(it->first == p2.first);
296
+ REQUIRE(it->second == p2.second);
287
297
 
288
298
  // check for 4 preamble longs
289
299
  auto bytes = sk.serialize();
290
300
  REQUIRE((bytes.data()[0] & 0x3f) == 4);; // PREAMBLE_LONGS_WARMUP
291
301
 
292
- var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
302
+ auto sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
293
303
  check_if_equal(sk, sk_from_bytes);
294
304
 
295
305
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
296
306
  sk.serialize(ss);
297
- var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
307
+ auto sk_from_stream = var_opt_sketch<int>::deserialize(ss);
298
308
  check_if_equal(sk, sk_from_stream);
299
309
 
300
310
  // ensure we unroll properly
@@ -340,6 +350,15 @@ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") {
340
350
  auto it = sk.begin();
341
351
  double wt = (*it).second;
342
352
  REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
353
+
354
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
355
+ double total_weight = summary.total_sketch_weight;
356
+ double cum_weight = 0.0;
357
+ for (auto pair : sk) {
358
+ cum_weight += pair.second;
359
+ }
360
+ double weight_ratio = cum_weight / total_weight;
361
+ REQUIRE(weight_ratio == Approx(1.0).margin(EPS));
343
362
  }
344
363
 
345
364
  TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
@@ -57,7 +57,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
57
57
 
58
58
  auto it1 = sk1.begin();
59
59
  auto it2 = sk2.begin();
60
- size_t i = 0;
61
60
 
62
61
  while ((it1 != sk1.end()) && (it2 != sk2.end())) {
63
62
  const std::pair<const T&, const double> p1 = *it1;
@@ -66,7 +65,6 @@ static void check_if_equal(var_opt_sketch<T, A>& sk1, var_opt_sketch<T, A>& sk2,
66
65
  REQUIRE(p1.first == p2.first); // data values
67
66
  }
68
67
  REQUIRE(p1.second == p2.second); // weight values
69
- ++i;
70
68
  ++it1;
71
69
  ++it2;
72
70
  }
@@ -100,7 +100,7 @@ setup(
100
100
  url='http://datasketches.apache.org',
101
101
  long_description=open('python/README.md').read(),
102
102
  long_description_content_type='text/markdown',
103
- packages=find_packages(where='python',exclude=['src','*tests*']), # src not needed if only the .so
103
+ packages=find_packages(where='python',exclude=['src','include','*tests*']), # src not needed if only the .so
104
104
  package_dir={'':'python'},
105
105
  # may need to add all source paths for sdist packages w/o MANIFEST.in
106
106
  ext_modules=[CMakeExtension('datasketches')],
@@ -62,4 +62,5 @@ install(FILES
62
62
  include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
63
  include/compact_theta_sketch_parser.hpp
64
64
  include/compact_theta_sketch_parser_impl.hpp
65
+ include/bit_packing.hpp
65
66
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")