datasketches 0.3.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  7. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  8. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  9. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  10. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  11. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  13. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  14. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  15. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  16. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  74. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  75. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  76. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  77. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  78. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  79. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  80. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  81. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  82. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  83. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  84. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  86. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  87. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  88. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  89. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  90. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  99. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  101. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  102. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  105. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  107. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  108. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  109. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  110. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  111. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  112. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  113. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  114. metadata +31 -3
@@ -0,0 +1,244 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <cmath>
21
+ #include <catch2/catch.hpp>
22
+
23
+ #include <density_sketch.hpp>
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("density sketch: empty", "[density_sketch]") {
28
+ density_sketch<float> sketch(10, 3);
29
+ REQUIRE(sketch.is_empty());
30
+ REQUIRE_THROWS_AS(sketch.get_estimate({0, 0, 0}), std::runtime_error);
31
+ }
32
+
33
+ TEST_CASE("density sketch: one item", "[density_sketch]") {
34
+ density_sketch<float> sketch(10, 3);
35
+
36
+ // dimension mismatch
37
+ REQUIRE_THROWS_AS(sketch.update(std::vector<float>({0, 0})), std::invalid_argument);
38
+
39
+ sketch.update(std::vector<float>({0, 0, 0}));
40
+ REQUIRE_FALSE(sketch.is_empty());
41
+ REQUIRE_FALSE(sketch.is_estimation_mode());
42
+ REQUIRE(sketch.get_estimate({0, 0, 0}) == 1);
43
+ REQUIRE(sketch.get_estimate({0.01, 0.01, 0.01}) > 0.95);
44
+ REQUIRE(sketch.get_estimate({1, 1, 1}) < 0.05);
45
+ }
46
+
47
+ TEST_CASE("density sketch: merge", "[density_sketch]") {
48
+ density_sketch<float> sketch1(10, 4);
49
+ sketch1.update(std::vector<float>({0, 0, 0, 0}));
50
+ sketch1.update(std::vector<float>({1, 2, 3, 4}));
51
+
52
+ density_sketch<float> sketch2(10, 4);
53
+ sketch2.update(std::vector<float>({5, 6, 7, 8}));
54
+
55
+ sketch1.merge(sketch2);
56
+
57
+ REQUIRE(sketch1.get_n() == 3);
58
+ REQUIRE(sketch1.get_num_retained() == 3);
59
+ }
60
+
61
+ TEST_CASE("density sketch: iterator", "[density_sketch]") {
62
+ density_sketch<float> sketch(10, 3);
63
+ unsigned n = 1000;
64
+ for (unsigned i = 1; i <= n; ++i) sketch.update(std::vector<float>(3, i));
65
+ REQUIRE(sketch.get_n() == n);
66
+ REQUIRE(sketch.is_estimation_mode());
67
+ //std::cout << sketch.to_string(true, true);
68
+ unsigned count = 0;
69
+ for (auto pair: sketch) {
70
+ ++count;
71
+ // just to assert something about the output
72
+ REQUIRE(pair.first.size() == sketch.get_dim());
73
+ }
74
+ REQUIRE(count == sketch.get_num_retained());
75
+ }
76
+
77
// Spherical kernel for testing: evaluates to 1 for two vectors whose Euclidean
// distance is within the radius and to 0 otherwise.
// Both vectors are expected to have the same length: v2 is read for exactly
// v1.size() elements and no size check is performed.
template<typename T>
struct spherical_kernel {
  // radius: kernel radius; it is stored squared, so evaluation needs no square root
  spherical_kernel(T radius = 1.0) : _radius_squared(radius * radius) {}

  // Returns 1 if the squared distance between v1 and v2 is <= radius^2, otherwise 0.
  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
    // std::inner_product accumulates in the type of its initial value, so the
    // seed must be a T; a double literal would force a double accumulator
    // regardless of T.
    const T distance_squared = std::inner_product(
      v1.begin(), v1.end(), v2.begin(), T(0), std::plus<T>(),
      [](T a, T b) { return (a - b) * (a - b); }
    );
    return distance_squared <= _radius_squared ? T(1) : T(0);
  }
private:
  T _radius_squared;
};
87
+
88
+ TEST_CASE("custom kernel", "[density_sketch]") {
89
+ density_sketch<float, spherical_kernel<float>> sketch(10, 3, spherical_kernel<float>(0.5));
90
+
91
+ // update with (1,1,1) and test points inside and outside the kernel
92
+ sketch.update(std::vector<float>(3, 1.0));
93
+ REQUIRE(sketch.get_estimate(std::vector<float>(3, 1.001)) == 1.0);
94
+ REQUIRE(sketch.get_estimate(std::vector<float>(3, 2.0)) == 0.0);
95
+
96
+ // rest of test follows iterator test above
97
+ unsigned n = 1000;
98
+ for (unsigned i = 2; i <= n; ++i) sketch.update(std::vector<float>(3, i));
99
+ REQUIRE(sketch.get_n() == n);
100
+ REQUIRE(sketch.is_estimation_mode());
101
+ unsigned count = 0;
102
+ for (auto pair: sketch) {
103
+ ++count;
104
+ // just to assert something about the output
105
+ REQUIRE(pair.first.size() == sketch.get_dim());
106
+ }
107
+ REQUIRE(count == sketch.get_num_retained());
108
+ }
109
+
110
+ TEST_CASE("serialize empty", "[density_sketch]") {
111
+ density_sketch<double> sk(10, 2);
112
+ auto bytes = sk.serialize();
113
+ auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
114
+ REQUIRE(sk2.is_empty());
115
+ REQUIRE(!sk2.is_estimation_mode());
116
+ REQUIRE(sk.get_k() == sk2.get_k());
117
+ REQUIRE(sk.get_dim() == sk2.get_dim());
118
+ REQUIRE(sk.get_n() == sk2.get_n());
119
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
120
+
121
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
122
+ sk.serialize(s);
123
+ auto sk3 = density_sketch<double>::deserialize(s);
124
+ REQUIRE(sk3.is_empty());
125
+ REQUIRE(!sk3.is_estimation_mode());
126
+ REQUIRE(sk.get_k() == sk3.get_k());
127
+ REQUIRE(sk.get_dim() == sk3.get_dim());
128
+ REQUIRE(sk.get_n() == sk3.get_n());
129
+ REQUIRE(sk.get_num_retained() == sk3.get_num_retained());
130
+ }
131
+
132
+ TEST_CASE("serialize bytes", "[density_sketch]") {
133
+ uint16_t k = 10;
134
+ uint32_t dim = 3;
135
+ density_sketch<double> sk(k, dim);
136
+
137
+ for (uint16_t i = 0; i < k; ++i) {
138
+ double val = static_cast<double>(i);
139
+ sk.update(std::vector<double>({val, std::sqrt(val), -val}));
140
+ }
141
+ REQUIRE(!sk.is_estimation_mode());
142
+
143
+ // exact mode
144
+ auto bytes = sk.serialize();
145
+ auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
146
+ REQUIRE(!sk2.is_empty());
147
+ REQUIRE(!sk2.is_estimation_mode());
148
+ REQUIRE(sk.get_k() == sk2.get_k());
149
+ REQUIRE(sk.get_dim() == sk2.get_dim());
150
+ REQUIRE(sk.get_n() == sk2.get_n());
151
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
152
+ auto it1 = sk.begin();
153
+ auto it2 = sk2.begin();
154
+ while (it1 != sk.end()) {
155
+ REQUIRE(it1->first[0] == it2->first[0]);
156
+ REQUIRE(it1->second == it2->second);
157
+ ++it1;
158
+ ++it2;
159
+ }
160
+
161
+ // estimation mode
162
+ size_t n = 1031;
163
+ for (uint32_t i = k; i < n; ++i) {
164
+ double val = static_cast<double>(i);
165
+ sk.update(std::vector<double>({val, std::sqrt(val), -val}));
166
+ }
167
+ REQUIRE(sk.is_estimation_mode());
168
+
169
+ bytes = sk.serialize();
170
+ sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
171
+ REQUIRE(!sk2.is_empty());
172
+ REQUIRE(sk2.is_estimation_mode());
173
+ REQUIRE(sk.get_k() == sk2.get_k());
174
+ REQUIRE(sk.get_dim() == sk2.get_dim());
175
+ REQUIRE(sk.get_n() == sk2.get_n());
176
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
177
+ it1 = sk.begin();
178
+ it2 = sk2.begin();
179
+ while (it1 != sk.end()) {
180
+ REQUIRE(it1->first[0] == it2->first[0]);
181
+ REQUIRE(it1->second == it2->second);
182
+ ++it1;
183
+ ++it2;
184
+ }
185
+ }
186
+
187
+ TEST_CASE("serialize stream", "[density_sketch]") {
188
+ uint16_t k = 10;
189
+ uint32_t dim = 3;
190
+ density_sketch<float> sk(k, dim);
191
+
192
+ for (uint16_t i = 0; i < k; ++i) {
193
+ float val = static_cast<float>(i);
194
+ sk.update(std::vector<float>({val, std::sin(val), std::cos(val)}));
195
+ }
196
+ REQUIRE(!sk.is_estimation_mode());
197
+
198
+ // exact mode
199
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
200
+ sk.serialize(s);
201
+ auto sk2 = density_sketch<float>::deserialize(s);
202
+ REQUIRE(!sk2.is_empty());
203
+ REQUIRE(!sk2.is_estimation_mode());
204
+ REQUIRE(sk.get_k() == sk2.get_k());
205
+ REQUIRE(sk.get_dim() == sk2.get_dim());
206
+ REQUIRE(sk.get_n() == sk2.get_n());
207
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
208
+ auto it1 = sk.begin();
209
+ auto it2 = sk2.begin();
210
+ while (it1 != sk.end()) {
211
+ REQUIRE(it1->first[0] == it2->first[0]);
212
+ REQUIRE(it1->second == it2->second);
213
+ ++it1;
214
+ ++it2;
215
+ }
216
+
217
+ // estimation mode
218
+ size_t n = 1031;
219
+ for (uint32_t i = k; i < n; ++i) {
220
+ float val = static_cast<float>(i);
221
+ sk.update(std::vector<float>({val, std::sqrt(val), -val}));
222
+ }
223
+ REQUIRE(sk.is_estimation_mode());
224
+
225
+ std::stringstream s2(std::ios::in | std::ios::out | std::ios::binary);
226
+ sk.serialize(s2);
227
+ sk2 = density_sketch<float>::deserialize(s2);
228
+ REQUIRE(!sk2.is_empty());
229
+ REQUIRE(sk2.is_estimation_mode());
230
+ REQUIRE(sk.get_k() == sk2.get_k());
231
+ REQUIRE(sk.get_dim() == sk2.get_dim());
232
+ REQUIRE(sk.get_n() == sk2.get_n());
233
+ REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
234
+ it1 = sk.begin();
235
+ it2 = sk2.begin();
236
+ while (it1 != sk.end()) {
237
+ REQUIRE(it1->first[0] == it2->first[0]);
238
+ REQUIRE(it1->second == it2->second);
239
+ ++it1;
240
+ ++it2;
241
+ }
242
+ }
243
+
244
+ } /* namespace datasketches */
@@ -91,8 +91,14 @@ private:
91
91
 
92
92
  // This iterator uses strides based on golden ratio to avoid clustering during merge
93
93
  template<typename K, typename V, typename H, typename E, typename A>
94
- class reverse_purge_hash_map<K, V, H, E, A>::iterator: public std::iterator<std::input_iterator_tag, K> {
94
+ class reverse_purge_hash_map<K, V, H, E, A>::iterator {
95
95
  public:
96
+ using iterator_category = std::input_iterator_tag;
97
+ using value_type = std::pair<K&, V>;
98
+ using difference_type = void;
99
+ using pointer = void;
100
+ using reference = const value_type;
101
+
96
102
  friend class reverse_purge_hash_map<K, V, H, E, A>;
97
103
  iterator& operator++() {
98
104
  ++count;
@@ -107,8 +113,8 @@ public:
107
113
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
108
114
  bool operator==(const iterator& rhs) const { return count == rhs.count; }
109
115
  bool operator!=(const iterator& rhs) const { return count != rhs.count; }
110
- const std::pair<K&, V> operator*() const {
111
- return std::pair<K&, V>(map->keys_[index], map->values_[index]);
116
+ reference operator*() const {
117
+ return value_type(map->keys_[index], map->values_[index]);
112
118
  }
113
119
  private:
114
120
  static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
@@ -51,6 +51,22 @@ Hll4Array<A>::Hll4Array(const Hll4Array<A>& that) :
51
51
  }
52
52
  }
53
53
 
54
+ template<typename A>
55
+ Hll4Array<A>::Hll4Array(const HllArray<A>& other) :
56
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_4, other.isStartFullSize(), other.getAllocator()),
57
+ auxHashMap_(nullptr)
58
+ {
59
+ const int numBytes = this->hll4ArrBytes(this->lgConfigK_);
60
+ this->hllByteArr_.resize(numBytes, 0);
61
+ this->oooFlag_ = other.isOutOfOrderFlag();
62
+
63
+ for (const auto coupon : other) { // all = false, so skip empty values
64
+ internalCouponUpdate(coupon); // updates KxQ registers
65
+ }
66
+ this->hipAccum_ = other.getHipAccum();
67
+ this->rebuild_kxq_curmin_ = false;
68
+ }
69
+
54
70
  template<typename A>
55
71
  Hll4Array<A>::~Hll4Array() {
56
72
  // hllByteArr deleted in parent
@@ -114,10 +130,9 @@ uint8_t Hll4Array<A>::getSlot(uint32_t slotNo) const {
114
130
  }
115
131
 
116
132
  template<typename A>
117
- uint8_t Hll4Array<A>::get_value(uint32_t index) const {
118
- const uint8_t value = getSlot(index);
133
+ uint8_t Hll4Array<A>::adjustRawValue(uint32_t slot, uint8_t value) const {
119
134
  if (value != hll_constants::AUX_TOKEN) return value + this->curMin_;
120
- return auxHashMap_->mustFindValueFor(index);
135
+ return auxHashMap_->mustFindValueFor(slot);
121
136
  }
122
137
 
123
138
  template<typename A>
@@ -210,7 +225,7 @@ void Hll4Array<A>::internalHll4Update(uint32_t slotNo, uint8_t newVal) {
210
225
 
211
226
  // we just increased a pair value, so it might be time to change curMin
212
227
  if (actualOldValue == this->curMin_) { // 908
213
- this->decNumAtCurMin();
228
+ --(this->numAtCurMin_);
214
229
  while (this->numAtCurMin_ == 0) {
215
230
  shiftToBiggerCurMin(); // increases curMin by 1, builds a new aux table
216
231
  // shifts values in 4-bit table and recounts curMin
@@ -328,13 +343,6 @@ typename HllArray<A>::const_iterator Hll4Array<A>::end() const {
328
343
  this->tgtHllType_, auxHashMap_, this->curMin_, false);
329
344
  }
330
345
 
331
- template<typename A>
332
- void Hll4Array<A>::mergeHll(const HllArray<A>& src) {
333
- for (const auto coupon: src) {
334
- internalCouponUpdate(coupon);
335
- }
336
- }
337
-
338
346
  }
339
347
 
340
348
  #endif // _HLL4ARRAY_INTERNAL_HPP_
@@ -25,14 +25,12 @@
25
25
 
26
26
  namespace datasketches {
27
27
 
28
- template<typename A>
29
- class Hll4Iterator;
30
-
31
28
  template<typename A>
32
29
  class Hll4Array final : public HllArray<A> {
33
30
  public:
34
31
  explicit Hll4Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
35
32
  explicit Hll4Array(const Hll4Array<A>& that);
33
+ explicit Hll4Array(const HllArray<A>& that);
36
34
 
37
35
  virtual ~Hll4Array();
38
36
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -41,13 +39,12 @@ class Hll4Array final : public HllArray<A> {
41
39
 
42
40
  inline uint8_t getSlot(uint32_t slotNo) const;
43
41
  inline void putSlot(uint32_t slotNo, uint8_t value);
44
- inline uint8_t get_value(uint32_t index) const;
42
+ inline uint8_t adjustRawValue(uint32_t index, uint8_t value) const;
45
43
 
46
44
  virtual uint32_t getUpdatableSerializationBytes() const;
47
45
  virtual uint32_t getHllByteArrBytes() const;
48
46
 
49
47
  virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
50
- void mergeHll(const HllArray<A>& src);
51
48
 
52
49
  virtual AuxHashMap<A>* getAuxHashMap() const;
53
50
  // does *not* delete old map if overwriting
@@ -34,6 +34,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_6, startFullSize, allocator)
34
34
  this->hllByteArr_.resize(numBytes, 0);
35
35
  }
36
36
 
37
+ template<typename A>
38
+ Hll6Array<A>::Hll6Array(const HllArray<A>& other) :
39
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_6, other.isStartFullSize(), other.getAllocator())
40
+ {
41
+ const int numBytes = this->hll6ArrBytes(this->lgConfigK_);
42
+ this->hllByteArr_.resize(numBytes, 0);
43
+ this->oooFlag_ = other.isOutOfOrderFlag();
44
+ uint32_t num_zeros = 1 << this->lgConfigK_;
45
+
46
+ for (const auto coupon : other) { // all = false, so skip empty values
47
+ num_zeros--;
48
+ internalCouponUpdate(coupon); // updates KxQ registers
49
+ }
50
+
51
+ this->numAtCurMin_ = num_zeros;
52
+ this->hipAccum_ = other.getHipAccum();
53
+ this->rebuild_kxq_curmin_ = false;
54
+ }
55
+
37
56
  template<typename A>
38
57
  std::function<void(HllSketchImpl<A>*)> Hll6Array<A>::get_deleter() const {
39
58
  return [](HllSketchImpl<A>* ptr) {
@@ -101,13 +120,6 @@ void Hll6Array<A>::internalCouponUpdate(uint32_t coupon) {
101
120
  }
102
121
  }
103
122
 
104
- template<typename A>
105
- void Hll6Array<A>::mergeHll(const HllArray<A>& src) {
106
- for (const auto coupon: src) {
107
- internalCouponUpdate(coupon);
108
- }
109
- }
110
-
111
123
  }
112
124
 
113
125
  #endif // _HLL6ARRAY_INTERNAL_HPP_
@@ -31,6 +31,7 @@ template<typename A>
31
31
  class Hll6Array final : public HllArray<A> {
32
32
  public:
33
33
  Hll6Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
34
+ explicit Hll6Array(const HllArray<A>& that);
34
35
 
35
36
  virtual ~Hll6Array() = default;
36
37
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -41,7 +42,6 @@ class Hll6Array final : public HllArray<A> {
41
42
  inline void putSlot(uint32_t slotNo, uint8_t value);
42
43
 
43
44
  virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
44
- void mergeHll(const HllArray<A>& src);
45
45
 
46
46
  virtual uint32_t getHllByteArrBytes() const;
47
47
 
@@ -32,6 +32,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_8, startFullSize, allocator)
32
32
  this->hllByteArr_.resize(numBytes, 0);
33
33
  }
34
34
 
35
+ template<typename A>
36
+ Hll8Array<A>::Hll8Array(const HllArray<A>& other):
37
+ HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_8, other.isStartFullSize(), other.getAllocator())
38
+ {
39
+ const int numBytes = this->hll8ArrBytes(this->lgConfigK_);
40
+ this->hllByteArr_.resize(numBytes, 0);
41
+ this->oooFlag_ = other.isOutOfOrderFlag();
42
+ uint32_t num_zeros = 1 << this->lgConfigK_;
43
+
44
+ for (const auto coupon : other) { // all = false, so skip empty values
45
+ num_zeros--;
46
+ internalCouponUpdate(coupon); // updates KxQ registers
47
+ }
48
+
49
+ this->numAtCurMin_ = num_zeros;
50
+ this->hipAccum_ = other.getHipAccum();
51
+ this->rebuild_kxq_curmin_ = false;
52
+ }
53
+
35
54
  template<typename A>
36
55
  std::function<void(HllSketchImpl<A>*)> Hll8Array<A>::get_deleter() const {
37
56
  return [](HllSketchImpl<A>* ptr) {
@@ -77,13 +96,11 @@ void Hll8Array<A>::internalCouponUpdate(uint32_t coupon) {
77
96
  const uint32_t slotNo = HllUtil<A>::getLow26(coupon) & configKmask;
78
97
  const uint8_t newVal = HllUtil<A>::getValue(coupon);
79
98
 
80
- const uint8_t curVal = getSlot(slotNo);
99
+ const uint8_t curVal = this->hllByteArr_[slotNo];
81
100
  if (newVal > curVal) {
82
- putSlot(slotNo, newVal);
101
+ this->hllByteArr_[slotNo] = newVal;
83
102
  this->hipAndKxQIncrementalUpdate(curVal, newVal);
84
- if (curVal == 0) {
85
- this->numAtCurMin_--; // interpret numAtCurMin as num zeros
86
- }
103
+ this->numAtCurMin_ -= curVal == 0; // interpret numAtCurMin as num zeros
87
104
  }
88
105
  }
89
106
 
@@ -97,49 +114,88 @@ void Hll8Array<A>::mergeList(const CouponList<A>& src) {
97
114
  template<typename A>
98
115
  void Hll8Array<A>::mergeHll(const HllArray<A>& src) {
99
116
  // at this point src_k >= dst_k
100
- const uint32_t src_k = 1 << src.getLgConfigK();
101
- const uint32_t dst_mask = (1 << this->getLgConfigK()) - 1;
102
- // duplication below is to avoid a virtual method call in a loop
103
- if (src.getTgtHllType() == target_hll_type::HLL_8) {
104
- for (uint32_t i = 0; i < src_k; i++) {
105
- const uint8_t new_v = static_cast<const Hll8Array<A>&>(src).getSlot(i);
106
- const uint32_t j = i & dst_mask;
107
- const uint8_t old_v = this->hllByteArr_[j];
108
- if (new_v > old_v) {
109
- this->hllByteArr_[j] = new_v;
110
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
111
- if (old_v == 0) {
112
- this->numAtCurMin_--;
113
- }
117
+ // we can optimize further when the k values are equal
118
+ if (this->getLgConfigK() == src.getLgConfigK()) {
119
+ if (src.getTgtHllType() == target_hll_type::HLL_8) {
120
+ uint32_t i = 0;
121
+ for (const auto value: src.getHllArray()) {
122
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
123
+ ++i;
114
124
  }
115
- }
116
- } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
117
- for (uint32_t i = 0; i < src_k; i++) {
118
- const uint8_t new_v = static_cast<const Hll6Array<A>&>(src).getSlot(i);
119
- const uint32_t j = i & dst_mask;
120
- const uint8_t old_v = this->hllByteArr_[j];
121
- if (new_v > old_v) {
122
- this->hllByteArr_[j] = new_v;
123
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
124
- if (old_v == 0) {
125
- this->numAtCurMin_--;
126
- }
125
+ } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
126
+ const uint32_t src_k = 1 << src.getLgConfigK();
127
+ uint32_t i = 0;
128
+ const uint8_t* ptr = src.getHllArray().data();
129
+ while (i < src_k) {
130
+ uint8_t value = *ptr & 0x3f;
131
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
132
+ ++i;
133
+ value = *ptr++ >> 6;
134
+ value |= (*ptr & 0x0f) << 2;
135
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
136
+ ++i;
137
+ value = *ptr++ >> 4;
138
+ value |= (*ptr & 3) << 4;
139
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
140
+ ++i;
141
+ value = *ptr++ >> 2;
142
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
143
+ ++i;
144
+ }
145
+ } else { // HLL_4
146
+ const auto& src4 = static_cast<const Hll4Array<A>&>(src);
147
+ uint32_t i = 0;
148
+ for (const auto byte: src.getHllArray()) {
149
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
150
+ ++i;
151
+ this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte >> 4));
152
+ ++i;
127
153
  }
128
154
  }
129
- } else { // HLL_4
130
- for (uint32_t i = 0; i < src_k; i++) {
131
- const uint8_t new_v = static_cast<const Hll4Array<A>&>(src).get_value(i);
132
- const uint32_t j = i & dst_mask;
133
- const uint8_t old_v = this->hllByteArr_[j];
134
- if (new_v > old_v) {
135
- this->hllByteArr_[j] = new_v;
136
- this->hipAndKxQIncrementalUpdate(old_v, new_v);
137
- if (old_v == 0) {
138
- this->numAtCurMin_--;
139
- }
155
+ } else {
156
+ // src_k > dst_k
157
+ const uint32_t dst_mask = (1 << this->getLgConfigK()) - 1;
158
+ // special treatment below to optimize performance
159
+ if (src.getTgtHllType() == target_hll_type::HLL_8) {
160
+ uint32_t i = 0;
161
+ for (const auto value: src.getHllArray()) {
162
+ processValue(i++, dst_mask, value);
163
+ }
164
+ } else if (src.getTgtHllType() == target_hll_type::HLL_6) {
165
+ const uint32_t src_k = 1 << src.getLgConfigK();
166
+ uint32_t i = 0;
167
+ const uint8_t* ptr = src.getHllArray().data();
168
+ while (i < src_k) {
169
+ uint8_t value = *ptr & 0x3f;
170
+ processValue(i++, dst_mask, value);
171
+ value = *ptr++ >> 6;
172
+ value |= (*ptr & 0x0f) << 2;
173
+ processValue(i++, dst_mask, value);
174
+ value = *ptr++ >> 4;
175
+ value |= (*ptr & 3) << 4;
176
+ processValue(i++, dst_mask, value);
177
+ value = *ptr++ >> 2;
178
+ processValue(i++, dst_mask, value);
179
+ }
180
+ } else { // HLL_4
181
+ const auto& src4 = static_cast<const Hll4Array<A>&>(src);
182
+ uint32_t i = 0;
183
+ for (const auto byte: src.getHllArray()) {
184
+ processValue(i, dst_mask, src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
185
+ ++i;
186
+ processValue(i, dst_mask, src4.adjustRawValue(i, byte >> 4));
187
+ ++i;
140
188
  }
141
189
  }
142
190
  }
191
+ this->setRebuildKxqCurminFlag(true);
192
+ }
193
+
194
+
195
+ template<typename A>
196
+ void Hll8Array<A>::processValue(uint32_t slot, uint32_t mask, uint8_t new_val) {
197
+ const size_t index = slot & mask;
198
+ this->hllByteArr_[index] = std::max(this->hllByteArr_[index], new_val);
143
199
  }
144
200
 
145
201
  }
@@ -31,6 +31,7 @@ template<typename A>
31
31
  class Hll8Array final : public HllArray<A> {
32
32
  public:
33
33
  Hll8Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
34
+ explicit Hll8Array(const HllArray<A>& that);
34
35
 
35
36
  virtual ~Hll8Array() = default;
36
37
  virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
@@ -48,6 +49,7 @@ class Hll8Array final : public HllArray<A> {
48
49
 
49
50
  private:
50
51
  inline void internalCouponUpdate(uint32_t coupon);
52
+ inline void processValue(uint32_t slot, uint32_t mask, uint8_t new_val);
51
53
  };
52
54
 
53
55
  }