datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,244 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <cmath>
21
+ #include <catch2/catch.hpp>
22
+
23
+ #include <density_sketch.hpp>
24
+
25
+ namespace datasketches {
26
+
// Test: a freshly constructed sketch must report is_empty(), and calling
// get_estimate() on it must throw std::runtime_error.
// The constructor arguments are (k, dim): the "serialize bytes" test in this
// file passes variables named k and dim in that order.
27
+ TEST_CASE("density sketch: empty", "[density_sketch]") {
28
+ density_sketch<float> sketch(10, 3);
29
+ REQUIRE(sketch.is_empty());
30
+ REQUIRE_THROWS_AS(sketch.get_estimate({0, 0, 0}), std::runtime_error);
31
+ }
32
+
// Test: behaviour of a 3-dimensional sketch holding exactly one point (the origin).
// Checks that update() rejects a vector of the wrong length, that the sketch is
// neither empty nor in estimation mode after one update, and that the estimate
// is exactly 1 at the stored point, above 0.95 very close to it, and below 0.05
// at (1, 1, 1).
// NOTE(review): the numeric thresholds depend on the sketch's default kernel,
// which is not visible in this diff — confirm against density_sketch.hpp.
33
+ TEST_CASE("density sketch: one item", "[density_sketch]") {
34
+ density_sketch<float> sketch(10, 3);
35
+
36
+ // dimension mismatch: a 2-element vector must be rejected by a dim-3 sketch
37
+ REQUIRE_THROWS_AS(sketch.update(std::vector<float>({0, 0})), std::invalid_argument);
38
+
39
+ sketch.update(std::vector<float>({0, 0, 0}));
40
+ REQUIRE_FALSE(sketch.is_empty());
41
+ REQUIRE_FALSE(sketch.is_estimation_mode());
42
+ REQUIRE(sketch.get_estimate({0, 0, 0}) == 1);
43
+ REQUIRE(sketch.get_estimate({0.01, 0.01, 0.01}) > 0.95);
44
+ REQUIRE(sketch.get_estimate({1, 1, 1}) < 0.05);
45
+ }
46
+
// Test: merging two small 4-dimensional sketches.
// sketch1 receives two points and sketch2 one point; after sketch1.merge(sketch2)
// the test expects sketch1 to report n == 3 and to retain all 3 points.
47
+ TEST_CASE("density sketch: merge", "[density_sketch]") {
48
+ density_sketch<float> sketch1(10, 4);
49
+ sketch1.update(std::vector<float>({0, 0, 0, 0}));
50
+ sketch1.update(std::vector<float>({1, 2, 3, 4}));
51
+
52
+ density_sketch<float> sketch2(10, 4);
53
+ sketch2.update(std::vector<float>({5, 6, 7, 8}));
54
+
55
+ sketch1.merge(sketch2);
56
+
57
+ REQUIRE(sketch1.get_n() == 3);
58
+ REQUIRE(sketch1.get_num_retained() == 3);
59
+ }
60
+
// Test: iterating over a sketch that has been pushed into estimation mode.
// Feeds 1000 points; std::vector<float>(3, i) is the fill constructor, i.e. the
// point (i, i, i). The test then walks the sketch with a range-for, checks that
// the first member of every yielded pair has get_dim() elements, and that the
// number of yielded pairs equals get_num_retained().
61
+ TEST_CASE("density sketch: iterator", "[density_sketch]") {
62
+ density_sketch<float> sketch(10, 3);
63
+ unsigned n = 1000;
64
+ for (unsigned i = 1; i <= n; ++i) sketch.update(std::vector<float>(3, i));
65
+ REQUIRE(sketch.get_n() == n);
66
+ REQUIRE(sketch.is_estimation_mode());
67
+ //std::cout << sketch.to_string(true, true);
68
+ unsigned count = 0;
69
+ for (auto pair: sketch) {
70
+ ++count;
71
+ // just to assert something about the output
72
+ REQUIRE(pair.first.size() == sketch.get_dim());
73
+ }
74
+ REQUIRE(count == sketch.get_num_retained());
75
+ }
76
+
// Test-only kernel functor used by the "custom kernel" test case.
// The constructor stores radius * radius so operator() can compare squared
// distances without taking a square root.
// operator() uses std::inner_product with std::plus as the reduction and a
// squared-difference lambda as the pairwise operation, so the accumulated
// value is the squared Euclidean distance between v1 and v2. The initial value
// is the literal 0.0 (a double), so the accumulation is carried out in double
// even when T is float. The result is 1.0 when the distance is <= the stored
// squared radius and 0.0 otherwise, converted to T on return.
// v2 is read for v1.size() elements with no size check; callers must pass
// vectors of equal length.
// NOTE(review): std::inner_product is declared in <numeric>, std::plus in
// <functional> and std::vector in <vector>; this file includes only <cmath>,
// <catch2/catch.hpp> and <density_sketch.hpp>, so it presumably relies on
// transitive includes — confirm.
77
+ // spherical kernel for testing, returns 1 for vectors within radius and 0 otherwise
78
+ template<typename T>
79
+ struct spherical_kernel {
80
+ spherical_kernel(T radius = 1.0) : _radius_squared(radius * radius) {}
81
+ T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
82
+ return std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}) <= _radius_squared ? 1.0 : 0.0;
83
+ }
84
+ private:
85
+ T _radius_squared;
86
+ };
87
+
// Test: a density_sketch instantiated with a user-supplied kernel type.
// The kernel is spherical_kernel<float> with radius 0.5, passed as the third
// constructor argument. With the single stored point (1, 1, 1), a query at
// (1.001, 1.001, 1.001) must give exactly 1.0 and a query at (2, 2, 2) exactly 0.0.
// The second half adds points (i, i, i) for i = 2..1000, bringing the total
// number of updates to n, and repeats the iterator checks: estimation mode,
// per-item vector length, and yielded count == get_num_retained().
88
+ TEST_CASE("custom kernel", "[density_sketch]") {
89
+ density_sketch<float, spherical_kernel<float>> sketch(10, 3, spherical_kernel<float>(0.5));
90
+
91
+ // update with (1,1,1) and test points inside and outside the kernel
92
+ sketch.update(std::vector<float>(3, 1.0));
93
+ REQUIRE(sketch.get_estimate(std::vector<float>(3, 1.001)) == 1.0);
94
+ REQUIRE(sketch.get_estimate(std::vector<float>(3, 2.0)) == 0.0);
95
+
96
+ // rest of test follows iterator test above
97
+ unsigned n = 1000;
98
+ for (unsigned i = 2; i <= n; ++i) sketch.update(std::vector<float>(3, i));
99
+ REQUIRE(sketch.get_n() == n);
100
+ REQUIRE(sketch.is_estimation_mode());
101
+ unsigned count = 0;
102
+ for (auto pair: sketch) {
103
+ ++count;
104
+ // just to assert something about the output
105
+ REQUIRE(pair.first.size() == sketch.get_dim());
106
+ }
107
+ REQUIRE(count == sketch.get_num_retained());
108
+ }
109
+
// Test: serialization round trip of an empty sketch (k = 10, dim = 2, double).
// First half: serialize() to a byte container and deserialize(data, size).
// Second half: serialize(stream) into a binary std::stringstream and
// deserialize(stream).
// In both cases the restored sketch must be empty, not in estimation mode, and
// agree with the original on k, dim, n and num_retained.
// NOTE(review): std::stringstream is declared in <sstream>, which this file does
// not include directly — presumably pulled in transitively; confirm.
110
+ TEST_CASE("serialize empty", "[density_sketch]") {
111
+ density_sketch<double> sk(10, 2);
112
+ auto bytes = sk.serialize();
113
+ auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
114
+ REQUIRE(sk2.is_empty());
115
+ REQUIRE(!sk2.is_estimation_mode());
116
+ REQUIRE(sk.get_k() == sk2.get_k());
117
+ REQUIRE(sk.get_dim() == sk2.get_dim());
118
+ REQUIRE(sk.get_n() == sk2.get_n());
119
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
120
+
121
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
122
+ sk.serialize(s);
123
+ auto sk3 = density_sketch<double>::deserialize(s);
124
+ REQUIRE(sk3.is_empty());
125
+ REQUIRE(!sk3.is_estimation_mode());
126
+ REQUIRE(sk.get_k() == sk3.get_k());
127
+ REQUIRE(sk.get_dim() == sk3.get_dim());
128
+ REQUIRE(sk.get_n() == sk3.get_n());
129
+ REQUIRE(sk.get_num_retained() == sk3.get_num_retained());
130
+ }
131
+
// Test: byte-array serialization round trip of a non-empty density_sketch<double>,
// once while the sketch is still exact and once after it enters estimation mode.
// Phase 1 inserts k points (i, sqrt(i), -i) for i = 0..k-1 and asserts the sketch
// is not in estimation mode. Phase 2 continues with i = k..n-1 (n = 1031 updates
// in total) and asserts estimation mode.
// After each round trip the test compares k, dim, n and num_retained, then walks
// both sketches in lockstep comparing the first coordinate of each retained
// vector and the pair's second member.
// NOTE(review): the lockstep loops are bounded by it1 only; it2 is never compared
// with sk2.end(), so they rely on the preceding num_retained equality check.
// Only first[0] is compared, not the remaining coordinates.
132
+ TEST_CASE("serialize bytes", "[density_sketch]") {
133
+ uint16_t k = 10;
134
+ uint32_t dim = 3;
135
+ density_sketch<double> sk(k, dim);
136
+
137
+ for (uint16_t i = 0; i < k; ++i) {
138
+ double val = static_cast<double>(i);
139
+ sk.update(std::vector<double>({val, std::sqrt(val), -val}));
140
+ }
141
+ REQUIRE(!sk.is_estimation_mode());
142
+
143
+ // exact mode
144
+ auto bytes = sk.serialize();
145
+ auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
146
+ REQUIRE(!sk2.is_empty());
147
+ REQUIRE(!sk2.is_estimation_mode());
148
+ REQUIRE(sk.get_k() == sk2.get_k());
149
+ REQUIRE(sk.get_dim() == sk2.get_dim());
150
+ REQUIRE(sk.get_n() == sk2.get_n());
151
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
152
+ auto it1 = sk.begin();
153
+ auto it2 = sk2.begin();
154
+ while (it1 != sk.end()) {
155
+ REQUIRE(it1->first[0] == it2->first[0]);
156
+ REQUIRE(it1->second == it2->second);
157
+ ++it1;
158
+ ++it2;
159
+ }
160
+
161
+ // estimation mode
162
+ size_t n = 1031;
163
+ for (uint32_t i = k; i < n; ++i) {
164
+ double val = static_cast<double>(i);
165
+ sk.update(std::vector<double>({val, std::sqrt(val), -val}));
166
+ }
167
+ REQUIRE(sk.is_estimation_mode());
168
+
169
+ bytes = sk.serialize();
170
+ sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
171
+ REQUIRE(!sk2.is_empty());
172
+ REQUIRE(sk2.is_estimation_mode());
173
+ REQUIRE(sk.get_k() == sk2.get_k());
174
+ REQUIRE(sk.get_dim() == sk2.get_dim());
175
+ REQUIRE(sk.get_n() == sk2.get_n());
176
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
177
+ it1 = sk.begin();
178
+ it2 = sk2.begin();
179
+ while (it1 != sk.end()) {
180
+ REQUIRE(it1->first[0] == it2->first[0]);
181
+ REQUIRE(it1->second == it2->second);
182
+ ++it1;
183
+ ++it2;
184
+ }
185
+ }
186
+
// Test: stream serialization round trip of a non-empty density_sketch<float>,
// once in exact mode and once in estimation mode.
// Phase 1 inserts k points (i, sin(i), cos(i)); phase 2 continues with points
// (i, sqrt(i), -i) for i = k..n-1, so the sketch has seen n = 1031 updates.
// Each phase serializes into its own binary std::stringstream (s, then s2) and
// deserializes from it, then compares k, dim, n, num_retained and walks both
// sketches in lockstep comparing first[0] and the pair's second member.
// NOTE(review): as in the byte-array test, the lockstep loops are bounded by it1
// only and compare only the first coordinate. std::stringstream comes from
// <sstream>, which this file does not include directly — confirm it is
// available transitively.
187
+ TEST_CASE("serialize stream", "[density_sketch]") {
188
+ uint16_t k = 10;
189
+ uint32_t dim = 3;
190
+ density_sketch<float> sk(k, dim);
191
+
192
+ for (uint16_t i = 0; i < k; ++i) {
193
+ float val = static_cast<float>(i);
194
+ sk.update(std::vector<float>({val, std::sin(val), std::cos(val)}));
195
+ }
196
+ REQUIRE(!sk.is_estimation_mode());
197
+
198
+ // exact mode
199
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
200
+ sk.serialize(s);
201
+ auto sk2 = density_sketch<float>::deserialize(s);
202
+ REQUIRE(!sk2.is_empty());
203
+ REQUIRE(!sk2.is_estimation_mode());
204
+ REQUIRE(sk.get_k() == sk2.get_k());
205
+ REQUIRE(sk.get_dim() == sk2.get_dim());
206
+ REQUIRE(sk.get_n() == sk2.get_n());
207
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
208
+ auto it1 = sk.begin();
209
+ auto it2 = sk2.begin();
210
+ while (it1 != sk.end()) {
211
+ REQUIRE(it1->first[0] == it2->first[0]);
212
+ REQUIRE(it1->second == it2->second);
213
+ ++it1;
214
+ ++it2;
215
+ }
216
+
217
+ // estimation mode
218
+ size_t n = 1031;
219
+ for (uint32_t i = k; i < n; ++i) {
220
+ float val = static_cast<float>(i);
221
+ sk.update(std::vector<float>({val, std::sqrt(val), -val}));
222
+ }
223
+ REQUIRE(sk.is_estimation_mode());
224
+
225
+ std::stringstream s2(std::ios::in | std::ios::out | std::ios::binary);
226
+ sk.serialize(s2);
227
+ sk2 = density_sketch<float>::deserialize(s2);
228
+ REQUIRE(!sk2.is_empty());
229
+ REQUIRE(sk2.is_estimation_mode());
230
+ REQUIRE(sk.get_k() == sk2.get_k());
231
+ REQUIRE(sk.get_dim() == sk2.get_dim());
232
+ REQUIRE(sk.get_n() == sk2.get_n());
233
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
234
+ it1 = sk.begin();
235
+ it2 = sk2.begin();
236
+ while (it1 != sk.end()) {
237
+ REQUIRE(it1->first[0] == it2->first[0]);
238
+ REQUIRE(it1->second == it2->second);
239
+ ++it1;
240
+ ++it2;
241
+ }
242
+ }
243
+
244
+ } /* namespace datasketches */
@@ -91,8 +91,14 @@ private:
91
91
 
92
92
  // This iterator uses strides based on golden ratio to avoid clustering during merge
93
93
  template<typename K, typename V, typename H, typename E, typename A>
94
- class reverse_purge_hash_map<K, V, H, E, A>::iterator: public std::iterator<std::input_iterator_tag, K> {
94
+ class reverse_purge_hash_map<K, V, H, E, A>::iterator {
95
95
  public:
96
+ using iterator_category = std::input_iterator_tag;
97
+ using value_type = std::pair<K&, V>;
98
+ using difference_type = void;
99
+ using pointer = void;
100
+ using reference = const value_type;
101
+
96
102
  friend class reverse_purge_hash_map<K, V, H, E, A>;
97
103
  iterator& operator++() {
98
104
  ++count;
@@ -107,8 +113,8 @@ public:
107
113
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
108
114
  bool operator==(const iterator& rhs) const { return count == rhs.count; }
109
115
  bool operator!=(const iterator& rhs) const { return count != rhs.count; }
110
- const std::pair<K&, V> operator*() const {
111
- return std::pair<K&, V>(map->keys_[index], map->values_[index]);
116
+ reference operator*() const {
117
+ return value_type(map->keys_[index], map->values_[index]);
112
118
  }
113
119
  private:
114
120
  static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
@@ -51,6 +51,22 @@ Hll4Array<A>::Hll4Array(const Hll4Array<A>& that) :
51
51
  }
52
52
  }
53
53
 
// Added converting constructor: builds an HLL_4 array from any HllArray.
// - The base HllArray is initialised with other's lgConfigK, start-full-size
//   flag and allocator, with target type HLL_4; auxHashMap_ starts as nullptr.
// - hllByteArr_ is resized to hll4ArrBytes(lgConfigK_) bytes, zero-filled.
// - The out-of-order flag is copied from other.
// - Every coupon yielded by iterating `other` is replayed through
//   internalCouponUpdate().
// - Afterwards hipAccum_ is overwritten with other's value and
//   rebuild_kxq_curmin_ is cleared.
// NOTE(review): unlike the Hll6Array/Hll8Array converting constructors in this
// diff, numAtCurMin_ is not assigned here; presumably the HLL_4 update path
// maintains it during the replay — confirm.
54
+ template<typename A>
55
+ Hll4Array<A>::Hll4Array(const HllArray<A>& other) :
56
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_4, other.isStartFullSize(), other.getAllocator()),
57
+ auxHashMap_(nullptr)
58
+ {
59
+ const int numBytes = this->hll4ArrBytes(this->lgConfigK_);
60
+ this->hllByteArr_.resize(numBytes, 0);
61
+ this->oooFlag_ = other.isOutOfOrderFlag();
62
+
63
+ for (const auto coupon : other) { // all = false, so skip empty values
64
+ internalCouponUpdate(coupon); // updates KxQ registers
65
+ }
66
+ this->hipAccum_ = other.getHipAccum();
67
+ this->rebuild_kxq_curmin_ = false;
68
+ }
69
+
54
70
  template<typename A>
55
71
  Hll4Array<A>::~Hll4Array() {
56
72
  // hllByteArr deleted in parent
@@ -114,10 +130,9 @@ uint8_t Hll4Array<A>::getSlot(uint32_t slotNo) const {
114
130
  }
115
131
 
// This hunk replaces get_value(index) with adjustRawValue(slot, value).
// The removed version read the raw 4-bit value itself via getSlot(index); the
// added version receives the raw value from the caller.
// In both versions: if the raw value is not hll_constants::AUX_TOKEN the result
// is value + curMin_; otherwise the result is looked up in auxHashMap_ with
// mustFindValueFor() using the slot number.
116
132
  template<typename A>
117
- uint8_t Hll4Array<A>::get_value(uint32_t index) const {
118
- const uint8_t value = getSlot(index);
133
+ uint8_t Hll4Array<A>::adjustRawValue(uint32_t slot, uint8_t value) const {
119
134
  if (value != hll_constants::AUX_TOKEN) return value + this->curMin_;
120
- return auxHashMap_->mustFindValueFor(index);
135
+ return auxHashMap_->mustFindValueFor(slot);
121
136
  }
122
137
 
123
138
  template<typename A>
@@ -210,7 +225,7 @@ void Hll4Array<A>::internalHll4Update(uint32_t slotNo, uint8_t newVal) {
210
225
 
211
226
  // we just increased a pair value, so it might be time to change curMin
212
227
  if (actualOldValue == this->curMin_) { // 908
213
- this->decNumAtCurMin();
228
+ --(this->numAtCurMin_);
214
229
  while (this->numAtCurMin_ == 0) {
215
230
  shiftToBiggerCurMin(); // increases curMin by 1, builds a new aux table
216
231
  // shifts values in 4-bit table and recounts curMin
@@ -328,13 +343,6 @@ typename HllArray<A>::const_iterator Hll4Array<A>::end() const {
328
343
  this->tgtHllType_, auxHashMap_, this->curMin_, false);
329
344
  }
330
345
 
// Removed in this version: Hll4Array<A>::mergeHll, which replayed every coupon
// yielded by iterating src through internalCouponUpdate(). The same
// coupon-replay loop appears in the Hll4Array(const HllArray<A>&) constructor
// added earlier in this diff.
331
- template<typename A>
332
- void Hll4Array<A>::mergeHll(const HllArray<A>& src) {
333
- for (const auto coupon: src) {
334
- internalCouponUpdate(coupon);
335
- }
336
- }
337
-
338
346
  }
339
347
 
340
348
  #endif // _HLL4ARRAY_INTERNAL_HPP_
@@ -25,14 +25,12 @@
25
25
 
26
26
  namespace datasketches {
27
27
 
28
- template<typename A>
29
- class Hll4Iterator;
30
-
31
28
  template<typename A>
32
29
  class Hll4Array final : public HllArray<A> {
33
30
  public:
34
31
  explicit Hll4Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
35
32
  explicit Hll4Array(const Hll4Array<A>& that);
33
+ explicit Hll4Array(const HllArray<A>& that);
36
34
 
37
35
  virtual ~Hll4Array();
38
36
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -41,13 +39,12 @@ class Hll4Array final : public HllArray<A> {
41
39
 
42
40
  inline uint8_t getSlot(uint32_t slotNo) const;
43
41
  inline void putSlot(uint32_t slotNo, uint8_t value);
44
- inline uint8_t get_value(uint32_t index) const;
42
+ inline uint8_t adjustRawValue(uint32_t index, uint8_t value) const;
45
43
 
46
44
  virtual uint32_t getUpdatableSerializationBytes() const;
47
45
  virtual uint32_t getHllByteArrBytes() const;
48
46
 
49
47
  virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
50
- void mergeHll(const HllArray<A>& src);
51
48
 
52
49
  virtual AuxHashMap<A>* getAuxHashMap() const;
53
50
  // does *not* delete old map if overwriting
@@ -34,6 +34,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_6, startFullSize, allocator)
34
34
  this->hllByteArr_.resize(numBytes, 0);
35
35
  }
36
36
 
// Added converting constructor: builds an HLL_6 array from any HllArray.
// - The base HllArray is initialised with other's lgConfigK, start-full-size
//   flag and allocator, with target type HLL_6.
// - hllByteArr_ is resized to hll6ArrBytes(lgConfigK_) bytes, zero-filled, and
//   the out-of-order flag is copied from other.
// - num_zeros starts at 2^lgConfigK_ (the number of slots) and is decremented
//   once per coupon yielded by iterating `other`; each coupon is replayed
//   through internalCouponUpdate().
// - After the loop numAtCurMin_ is set to num_zeros, hipAccum_ is overwritten
//   with other's value and rebuild_kxq_curmin_ is cleared.
// NOTE(review): `1 << this->lgConfigK_` is evaluated in int before being stored
// in a uint32_t; presumably lgConfigK_ is bounded well below 31 — confirm.
37
+ template<typename A>
38
+ Hll6Array<A>::Hll6Array(const HllArray<A>& other) :
39
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_6, other.isStartFullSize(), other.getAllocator())
40
+ {
41
+ const int numBytes = this->hll6ArrBytes(this->lgConfigK_);
42
+ this->hllByteArr_.resize(numBytes, 0);
43
+ this->oooFlag_ = other.isOutOfOrderFlag();
44
+ uint32_t num_zeros = 1 << this->lgConfigK_;
45
+
46
+ for (const auto coupon : other) { // all = false, so skip empty values
47
+ num_zeros--;
48
+ internalCouponUpdate(coupon); // updates KxQ registers
49
+ }
50
+
51
+ this->numAtCurMin_ = num_zeros;
52
+ this->hipAccum_ = other.getHipAccum();
53
+ this->rebuild_kxq_curmin_ = false;
54
+ }
55
+
37
56
  template<typename A>
38
57
  std::function<void(HllSketchImpl<A>*)> Hll6Array<A>::get_deleter() const {
39
58
  return [](HllSketchImpl<A>* ptr) {
@@ -101,13 +120,6 @@ void Hll6Array<A>::internalCouponUpdate(uint32_t coupon) {
101
120
  }
102
121
  }
103
122
 
// Removed in this version: Hll6Array<A>::mergeHll, which replayed every coupon
// yielded by iterating src through internalCouponUpdate(). The same
// coupon-replay loop appears in the Hll6Array(const HllArray<A>&) constructor
// added earlier in this diff.
104
- template<typename A>
105
- void Hll6Array<A>::mergeHll(const HllArray<A>& src) {
106
- for (const auto coupon: src) {
107
- internalCouponUpdate(coupon);
108
- }
109
- }
110
-
111
123
  }
112
124
 
113
125
  #endif // _HLL6ARRAY_INTERNAL_HPP_
@@ -31,6 +31,7 @@ template<typename A>
31
31
  class Hll6Array final : public HllArray<A> {
32
32
  public:
33
33
  Hll6Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
34
+ explicit Hll6Array(const HllArray<A>& that);
34
35
 
35
36
  virtual ~Hll6Array() = default;
36
37
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -41,7 +42,6 @@ class Hll6Array final : public HllArray<A> {
41
42
  inline void putSlot(uint32_t slotNo, uint8_t value);
42
43
 
43
44
  virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
44
- void mergeHll(const HllArray<A>& src);
45
45
 
46
46
  virtual uint32_t getHllByteArrBytes() const;
47
47
 
@@ -32,6 +32,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_8, startFullSize, allocator)
32
32
  this->hllByteArr_.resize(numBytes, 0);
33
33
  }
34
34
 
// Added converting constructor: builds an HLL_8 array from any HllArray.
// - The base HllArray is initialised with other's lgConfigK, start-full-size
//   flag and allocator, with target type HLL_8.
// - hllByteArr_ is resized to hll8ArrBytes(lgConfigK_) bytes, zero-filled, and
//   the out-of-order flag is copied from other.
// - num_zeros starts at 2^lgConfigK_ (the number of slots) and is decremented
//   once per coupon yielded by iterating `other`; each coupon is replayed
//   through internalCouponUpdate().
// - After the loop numAtCurMin_ is set to num_zeros (internalCouponUpdate in
//   this file treats numAtCurMin_ as the count of zero-valued slots), hipAccum_
//   is overwritten with other's value and rebuild_kxq_curmin_ is cleared.
// NOTE(review): `1 << this->lgConfigK_` is evaluated in int before being stored
// in a uint32_t; presumably lgConfigK_ is bounded well below 31 — confirm.
35
+ template<typename A>
36
+ Hll8Array<A>::Hll8Array(const HllArray<A>& other):
37
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_8, other.isStartFullSize(), other.getAllocator())
38
+ {
39
+ const int numBytes = this->hll8ArrBytes(this->lgConfigK_);
40
+ this->hllByteArr_.resize(numBytes, 0);
41
+ this->oooFlag_ = other.isOutOfOrderFlag();
42
+ uint32_t num_zeros = 1 << this->lgConfigK_;
43
+
44
+ for (const auto coupon : other) { // all = false, so skip empty values
45
+ num_zeros--;
46
+ internalCouponUpdate(coupon); // updates KxQ registers
47
+ }
48
+
49
+ this->numAtCurMin_ = num_zeros;
50
+ this->hipAccum_ = other.getHipAccum();
51
+ this->rebuild_kxq_curmin_ = false;
52
+ }
53
+
35
54
  template<typename A>
36
55
  std::function<void(HllSketchImpl<A>*)> Hll8Array<A>::get_deleter() const {
37
56
  return [](HllSketchImpl<A>* ptr) {
@@ -77,13 +96,11 @@ void Hll8Array<A>::internalCouponUpdate(uint32_t coupon) {
77
96
  const uint32_t slotNo = HllUtil<A>::getLow26(coupon) & configKmask;
78
97
  const uint8_t newVal = HllUtil<A>::getValue(coupon);
79
98
 
80
- const uint8_t curVal = getSlot(slotNo);
99
+ const uint8_t curVal = this->hllByteArr_[slotNo];
81
100
  if (newVal > curVal) {
82
- putSlot(slotNo, newVal);
101
+ this->hllByteArr_[slotNo] = newVal;
83
102
  this->hipAndKxQIncrementalUpdate(curVal, newVal);
84
- if (curVal == 0) {
85
- this->numAtCurMin_--; // interpret numAtCurMin as num zeros
86
- }
103
+ this->numAtCurMin_ -= curVal == 0; // interpret numAtCurMin as num zeros
87
104
  }
88
105
  }
89
106
 
@@ -97,49 +114,88 @@ void Hll8Array<A>::mergeList(const CouponList<A>& src) {
// Hll8Array<A>::mergeHll — folds the slot values of src into this HLL_8 array by
// keeping the larger value per destination slot. This hunk interleaves the
// removed ("-") and added ("+") bodies.
//
// Removed version: one loop per source type (HLL_8 / HLL_6 / HLL_4). Each read
// source slot i, mapped it to destination slot j = i & dst_mask, and on every
// increase called hipAndKxQIncrementalUpdate(old_v, new_v) and decremented
// numAtCurMin_ when the old value was 0.
//
// Added version:
//   * equal lgConfigK: writes std::max(current, incoming) straight into
//     hllByteArr_[i], with no index masking;
//   * otherwise (src_k > dst_k, per the comment in the code): every value goes
//     through processValue(slot, dst_mask, value), which masks the slot index;
//   * there is no per-slot HIP/KxQ/numAtCurMin_ bookkeeping; instead
//     setRebuildKxqCurminFlag(true) is called once at the end.
//
// Decoding of src.getHllArray() in the added code:
//   HLL_8: each byte is one slot value.
//   HLL_6: every three bytes b0, b1, b2 yield four 6-bit values:
//            b0 & 0x3f,
//            (b0 >> 6) | ((b1 & 0x0f) << 2),
//            (b1 >> 4) | ((b2 & 3) << 4),
//            b2 >> 2.
//   HLL_4: each byte yields two values, low nibble first
//          (byte & hll_constants::loNibbleMask, then byte >> 4); each raw nibble
//          is converted with src4.adjustRawValue(i, nibble), where i is the
//          source slot number.
97
114
  template<typename A>
98
115
  void Hll8Array<A>::mergeHll(const HllArray<A>& src) {
99
116
  // at this point src_k >= dst_k
100
- const uint32_t src_k = 1 << src.getLgConfigK();
101
- const uint32_t dst_mask = (1 << this->getLgConfigK()) - 1;
102
- // duplication below is to avoid a virtual method call in a loop
103
- if (src.getTgtHllType() == target_hll_type::HLL_8) {
104
- for (uint32_t i = 0; i < src_k; i++) {
105
- const uint8_t new_v = static_cast<const Hll8Array<A>&>(src).getSlot(i);
106
- const uint32_t j = i & dst_mask;
107
- const uint8_t old_v = this->hllByteArr_[j];
108
- if (new_v > old_v) {
109
- this->hllByteArr_[j] = new_v;
110
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
111
- if (old_v == 0) {
112
- this->numAtCurMin_--;
113
- }
117
+ // we can optimize further when the k values are equal
118
+ if (this->getLgConfigK() == src.getLgConfigK()) {
119
+ if (src.getTgtHllType() == target_hll_type::HLL_8) {
120
+ uint32_t i = 0;
121
+ for (const auto value: src.getHllArray()) {
122
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
123
+ ++i;
114
124
  }
115
- }
116
- } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
117
- for (uint32_t i = 0; i < src_k; i++) {
118
- const uint8_t new_v = static_cast<const Hll6Array<A>&>(src).getSlot(i);
119
- const uint32_t j = i & dst_mask;
120
- const uint8_t old_v = this->hllByteArr_[j];
121
- if (new_v > old_v) {
122
- this->hllByteArr_[j] = new_v;
123
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
124
- if (old_v == 0) {
125
- this->numAtCurMin_--;
126
- }
125
+ } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
126
+ const uint32_t src_k = 1 << src.getLgConfigK();
127
+ uint32_t i = 0;
128
+ const uint8_t* ptr = src.getHllArray().data();
129
+ while (i < src_k) {
130
+ uint8_t value = *ptr & 0x3f;
131
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
132
+ ++i;
133
+ value = *ptr++ >> 6;
134
+ value |= (*ptr & 0x0f) << 2;
135
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
136
+ ++i;
137
+ value = *ptr++ >> 4;
138
+ value |= (*ptr & 3) << 4;
139
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
140
+ ++i;
141
+ value = *ptr++ >> 2;
142
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
143
+ ++i;
144
+ }
145
+ } else { // HLL_4
146
+ const auto& src4 = static_cast<const Hll4Array<A>&>(src);
147
+ uint32_t i = 0;
148
+ for (const auto byte: src.getHllArray()) {
149
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
150
+ ++i;
151
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte >> 4));
152
+ ++i;
127
153
  }
128
154
  }
129
- } else { // HLL_4
130
- for (uint32_t i = 0; i < src_k; i++) {
131
- const uint8_t new_v = static_cast<const Hll4Array<A>&>(src).get_value(i);
132
- const uint32_t j = i & dst_mask;
133
- const uint8_t old_v = this->hllByteArr_[j];
134
- if (new_v > old_v) {
135
- this->hllByteArr_[j] = new_v;
136
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
137
- if (old_v == 0) {
138
- this->numAtCurMin_--;
139
- }
155
+ } else {
156
+ // src_k > dst_k
157
+ const uint32_t dst_mask = (1 << this->getLgConfigK()) - 1;
158
+ // special treatment below to optimize performance
159
+ if (src.getTgtHllType() == target_hll_type::HLL_8) {
160
+ uint32_t i = 0;
161
+ for (const auto value: src.getHllArray()) {
162
+ processValue(i++, dst_mask, value);
163
+ }
164
+ } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
165
+ const uint32_t src_k = 1 << src.getLgConfigK();
166
+ uint32_t i = 0;
167
+ const uint8_t* ptr = src.getHllArray().data();
168
+ while (i < src_k) {
169
+ uint8_t value = *ptr & 0x3f;
170
+ processValue(i++, dst_mask, value);
171
+ value = *ptr++ >> 6;
172
+ value |= (*ptr & 0x0f) << 2;
173
+ processValue(i++, dst_mask, value);
174
+ value = *ptr++ >> 4;
175
+ value |= (*ptr & 3) << 4;
176
+ processValue(i++, dst_mask, value);
177
+ value = *ptr++ >> 2;
178
+ processValue(i++, dst_mask, value);
179
+ }
180
+ } else { // HLL_4
181
+ const auto& src4 = static_cast<const Hll4Array<A>&>(src);
182
+ uint32_t i = 0;
183
+ for (const auto byte: src.getHllArray()) {
184
+ processValue(i, dst_mask, src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
185
+ ++i;
186
+ processValue(i, dst_mask, src4.adjustRawValue(i, byte >> 4));
187
+ ++i;
140
188
  }
141
189
  }
142
190
  }
191
+ this->setRebuildKxqCurminFlag(true);
192
+ }
193
+
194
+
// Added helper used by mergeHll when the source has more slots than this array.
// Maps the source slot to a destination index with `slot & mask` and stores the
// larger of the current and incoming value there. It does no HIP / KxQ /
// numAtCurMin_ bookkeeping itself.
// In this rendering the closing brace is a context line (old line 143) that
// previously closed mergeHll.
195
+ template<typename A>
196
+ void Hll8Array<A>::processValue(uint32_t slot, uint32_t mask, uint8_t new_val) {
197
+ const size_t index = slot & mask;
198
+ this->hllByteArr_[index] = std::max(this->hllByteArr_[index], new_val);
143
199
  }
144
200
 
145
201
  }
@@ -31,6 +31,7 @@ template<typename A>
31
31
  class Hll8Array final : public HllArray<A> {
32
32
  public:
33
33
  Hll8Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
34
+ explicit Hll8Array(const HllArray<A>& that);
34
35
 
35
36
  virtual ~Hll8Array() = default;
36
37
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -48,6 +49,7 @@ class Hll8Array final : public HllArray<A> {
48
49
 
49
50
  private:
50
51
  inline void internalCouponUpdate(uint32_t coupon);
52
+ inline void processValue(uint32_t slot, uint32_t mask, uint8_t new_val);
51
53
  };
52
54
 
53
55
  }