datasketches 0.2.7 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -30,40 +30,42 @@
30
30
  namespace datasketches {
31
31
 
32
32
  // clang++ seems to require this declaration for CMAKE_BUILD_TYPE='Debug'
33
- template<typename T, typename W, typename H, typename E, typename S, typename A>
34
- const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
33
+ template<typename T, typename W, typename H, typename E, typename A>
34
+ const uint8_t frequent_items_sketch<T, W, H, E, A>::LG_MIN_MAP_SIZE;
35
35
 
36
- template<typename T, typename W, typename H, typename E, typename S, typename A>
37
- frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
36
+ template<typename T, typename W, typename H, typename E, typename A>
37
+ frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size,
38
+ const E& equal, const A& allocator):
38
39
  total_weight(0),
39
40
  offset(0),
40
41
  map(
41
42
  std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
42
43
  std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
44
+ equal,
43
45
  allocator
44
46
  )
45
47
  {
46
48
  if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
47
49
  }
48
50
 
49
- template<typename T, typename W, typename H, typename E, typename S, typename A>
50
- void frequent_items_sketch<T, W, H, E, S, A>::update(const T& item, W weight) {
51
+ template<typename T, typename W, typename H, typename E, typename A>
52
+ void frequent_items_sketch<T, W, H, E, A>::update(const T& item, W weight) {
51
53
  check_weight(weight);
52
54
  if (weight == 0) return;
53
55
  total_weight += weight;
54
56
  offset += map.adjust_or_insert(item, weight);
55
57
  }
56
58
 
57
- template<typename T, typename W, typename H, typename E, typename S, typename A>
58
- void frequent_items_sketch<T, W, H, E, S, A>::update(T&& item, W weight) {
59
+ template<typename T, typename W, typename H, typename E, typename A>
60
+ void frequent_items_sketch<T, W, H, E, A>::update(T&& item, W weight) {
59
61
  check_weight(weight);
60
62
  if (weight == 0) return;
61
63
  total_weight += weight;
62
64
  offset += map.adjust_or_insert(std::move(item), weight);
63
65
  }
64
66
 
65
- template<typename T, typename W, typename H, typename E, typename S, typename A>
66
- void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
67
+ template<typename T, typename W, typename H, typename E, typename A>
68
+ void frequent_items_sketch<T, W, H, E, A>::merge(const frequent_items_sketch& other) {
67
69
  if (other.is_empty()) return;
68
70
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
69
71
  for (auto it: other.map) {
@@ -73,8 +75,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch&
73
75
  total_weight = merged_total_weight;
74
76
  }
75
77
 
76
- template<typename T, typename W, typename H, typename E, typename S, typename A>
77
- void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
78
+ template<typename T, typename W, typename H, typename E, typename A>
79
+ void frequent_items_sketch<T, W, H, E, A>::merge(frequent_items_sketch&& other) {
78
80
  if (other.is_empty()) return;
79
81
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
80
82
  for (auto it: other.map) {
@@ -84,69 +86,67 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& othe
84
86
  total_weight = merged_total_weight;
85
87
  }
86
88
 
87
- template<typename T, typename W, typename H, typename E, typename S, typename A>
88
- bool frequent_items_sketch<T, W, H, E, S, A>::is_empty() const {
89
+ template<typename T, typename W, typename H, typename E, typename A>
90
+ bool frequent_items_sketch<T, W, H, E, A>::is_empty() const {
89
91
  return map.get_num_active() == 0;
90
92
  }
91
93
 
92
- template<typename T, typename W, typename H, typename E, typename S, typename A>
93
- uint32_t frequent_items_sketch<T, W, H, E, S, A>::get_num_active_items() const {
94
+ template<typename T, typename W, typename H, typename E, typename A>
95
+ uint32_t frequent_items_sketch<T, W, H, E, A>::get_num_active_items() const {
94
96
  return map.get_num_active();
95
97
  }
96
98
 
97
- template<typename T, typename W, typename H, typename E, typename S, typename A>
98
- W frequent_items_sketch<T, W, H, E, S, A>::get_total_weight() const {
99
+ template<typename T, typename W, typename H, typename E, typename A>
100
+ W frequent_items_sketch<T, W, H, E, A>::get_total_weight() const {
99
101
  return total_weight;
100
102
  }
101
103
 
102
- template<typename T, typename W, typename H, typename E, typename S, typename A>
103
- W frequent_items_sketch<T, W, H, E, S, A>::get_estimate(const T& item) const {
104
+ template<typename T, typename W, typename H, typename E, typename A>
105
+ W frequent_items_sketch<T, W, H, E, A>::get_estimate(const T& item) const {
104
106
  // if the item is tracked, estimate = weight + offset; otherwise 0
105
107
  const W weight = map.get(item);
106
108
  if (weight > 0) return weight + offset;
107
109
  return 0;
108
110
  }
109
111
 
110
- template<typename T, typename W, typename H, typename E, typename S, typename A>
111
- W frequent_items_sketch<T, W, H, E, S, A>::get_lower_bound(const T& item) const {
112
+ template<typename T, typename W, typename H, typename E, typename A>
113
+ W frequent_items_sketch<T, W, H, E, A>::get_lower_bound(const T& item) const {
112
114
  return map.get(item);
113
115
  }
114
116
 
115
- template<typename T, typename W, typename H, typename E, typename S, typename A>
116
- W frequent_items_sketch<T, W, H, E, S, A>::get_upper_bound(const T& item) const {
117
+ template<typename T, typename W, typename H, typename E, typename A>
118
+ W frequent_items_sketch<T, W, H, E, A>::get_upper_bound(const T& item) const {
117
119
  return map.get(item) + offset;
118
120
  }
119
121
 
120
- template<typename T, typename W, typename H, typename E, typename S, typename A>
121
- W frequent_items_sketch<T, W, H, E, S, A>::get_maximum_error() const {
122
+ template<typename T, typename W, typename H, typename E, typename A>
123
+ W frequent_items_sketch<T, W, H, E, A>::get_maximum_error() const {
122
124
  return offset;
123
125
  }
124
126
 
125
- template<typename T, typename W, typename H, typename E, typename S, typename A>
126
- double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon() const {
127
+ template<typename T, typename W, typename H, typename E, typename A>
128
+ double frequent_items_sketch<T, W, H, E, A>::get_epsilon() const {
127
129
  return EPSILON_FACTOR / (1 << map.get_lg_max_size());
128
130
  }
129
131
 
130
- template<typename T, typename W, typename H, typename E, typename S, typename A>
131
- double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon(uint8_t lg_max_map_size) {
132
+ template<typename T, typename W, typename H, typename E, typename A>
133
+ double frequent_items_sketch<T, W, H, E, A>::get_epsilon(uint8_t lg_max_map_size) {
132
134
  return EPSILON_FACTOR / (1 << lg_max_map_size);
133
135
  }
134
136
 
135
- template<typename T, typename W, typename H, typename E, typename S, typename A>
136
- double frequent_items_sketch<T, W, H, E, S, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
137
+ template<typename T, typename W, typename H, typename E, typename A>
138
+ double frequent_items_sketch<T, W, H, E, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
137
139
  return get_epsilon(lg_max_map_size) * estimated_total_weight;
138
140
  }
139
141
 
140
142
 
141
- template<typename T, typename W, typename H, typename E, typename S, typename A>
142
- typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
143
- frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type) const {
143
+ template<typename T, typename W, typename H, typename E, typename A>
144
+ auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type) const -> vector_row {
144
145
  return get_frequent_items(err_type, get_maximum_error());
145
146
  }
146
147
 
147
- template<typename T, typename W, typename H, typename E, typename S, typename A>
148
- typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
149
- frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
148
+ template<typename T, typename W, typename H, typename E, typename A>
149
+ auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const -> vector_row {
150
150
  vector_row items(map.get_allocator());
151
151
  for (auto it: map) {
152
152
  const W lb = it.second;
@@ -160,9 +160,9 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
160
160
  return items;
161
161
  }
162
162
 
163
- template<typename T, typename W, typename H, typename E, typename S, typename A>
163
+ template<typename T, typename W, typename H, typename E, typename A>
164
164
  template<typename SerDe>
165
- void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
165
+ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const SerDe& sd) const {
166
166
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
167
167
  write(os, preamble_longs);
168
168
  const uint8_t serial_version = SERIAL_VERSION;
@@ -206,18 +206,18 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const
206
206
  }
207
207
  }
208
208
 
209
- template<typename T, typename W, typename H, typename E, typename S, typename A>
209
+ template<typename T, typename W, typename H, typename E, typename A>
210
210
  template<typename SerDe>
211
- size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
211
+ size_t frequent_items_sketch<T, W, H, E, A>::get_serialized_size_bytes(const SerDe& sd) const {
212
212
  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
213
213
  size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
214
214
  for (auto it: map) size += sd.size_of_item(it.first);
215
215
  return size;
216
216
  }
217
217
 
218
- template<typename T, typename W, typename H, typename E, typename S, typename A>
218
+ template<typename T, typename W, typename H, typename E, typename A>
219
219
  template<typename SerDe>
220
- auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
220
+ auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
221
221
  const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
222
222
  vector_bytes bytes(size, 0, map.get_allocator());
223
223
  uint8_t* ptr = bytes.data() + header_size_bytes;
@@ -266,8 +266,8 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
266
266
  return bytes;
267
267
  }
268
268
 
269
- template<typename T, typename W, typename H, typename E, typename S, typename A>
270
- class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
269
+ template<typename T, typename W, typename H, typename E, typename A>
270
+ class frequent_items_sketch<T, W, H, E, A>::items_deleter {
271
271
  public:
272
272
  items_deleter(uint32_t num, bool destroy, const A& allocator):
273
273
  allocator_(allocator), num_(num), destroy_(destroy) {}
@@ -286,14 +286,10 @@ private:
286
286
  bool destroy_;
287
287
  };
288
288
 
289
- template<typename T, typename W, typename H, typename E, typename S, typename A>
290
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
291
- return deserialize(is, S(), allocator);
292
- }
293
-
294
- template<typename T, typename W, typename H, typename E, typename S, typename A>
289
+ template<typename T, typename W, typename H, typename E, typename A>
295
290
  template<typename SerDe>
296
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
291
+ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is,
292
+ const SerDe& sd, const E& equal, const A& allocator) {
297
293
  const auto preamble_longs = read<uint8_t>(is);
298
294
  const auto serial_version = read<uint8_t>(is);
299
295
  const auto family_id = read<uint8_t>(is);
@@ -309,7 +305,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
309
305
  check_family_id(family_id);
310
306
  check_size(lg_cur_size, lg_max_size);
311
307
 
312
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
308
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
313
309
  if (!is_empty) {
314
310
  const auto num_items = read<uint32_t>(is);
315
311
  read<uint32_t>(is); // unused
@@ -335,14 +331,10 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
335
331
  return sketch;
336
332
  }
337
333
 
338
- template<typename T, typename W, typename H, typename E, typename S, typename A>
339
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
340
- return deserialize(bytes, size, S(), allocator);
341
- }
342
-
343
- template<typename T, typename W, typename H, typename E, typename S, typename A>
334
+ template<typename T, typename W, typename H, typename E, typename A>
344
335
  template<typename SerDe>
345
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
336
+ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size,
337
+ const SerDe& sd, const E& equal, const A& allocator) {
346
338
  ensure_minimum_memory(size, 8);
347
339
  const char* ptr = static_cast<const char*>(bytes);
348
340
  const char* base = static_cast<const char*>(bytes);
@@ -368,7 +360,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
368
360
  check_size(lg_cur_size, lg_max_size);
369
361
  ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
370
362
 
371
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
363
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
372
364
  if (!is_empty) {
373
365
  uint32_t num_items;
374
366
  ptr += copy_from_mem(ptr, num_items);
@@ -398,8 +390,8 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
398
390
  return sketch;
399
391
  }
400
392
 
401
- template<typename T, typename W, typename H, typename E, typename S, typename A>
402
- void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
393
+ template<typename T, typename W, typename H, typename E, typename A>
394
+ void frequent_items_sketch<T, W, H, E, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
403
395
  if (is_empty) {
404
396
  if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
405
397
  throw std::invalid_argument("Possible corruption: preamble longs of an empty sketch must be " + std::to_string(PREAMBLE_LONGS_EMPTY) + ": " + std::to_string(preamble_longs));
@@ -411,22 +403,22 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t pream
411
403
  }
412
404
  }
413
405
 
414
- template<typename T, typename W, typename H, typename E, typename S, typename A>
415
- void frequent_items_sketch<T, W, H, E, S, A>::check_serial_version(uint8_t serial_version) {
406
+ template<typename T, typename W, typename H, typename E, typename A>
407
+ void frequent_items_sketch<T, W, H, E, A>::check_serial_version(uint8_t serial_version) {
416
408
  if (serial_version != SERIAL_VERSION) {
417
409
  throw std::invalid_argument("Possible corruption: serial version must be " + std::to_string(SERIAL_VERSION) + ": " + std::to_string(serial_version));
418
410
  }
419
411
  }
420
412
 
421
- template<typename T, typename W, typename H, typename E, typename S, typename A>
422
- void frequent_items_sketch<T, W, H, E, S, A>::check_family_id(uint8_t family_id) {
413
+ template<typename T, typename W, typename H, typename E, typename A>
414
+ void frequent_items_sketch<T, W, H, E, A>::check_family_id(uint8_t family_id) {
423
415
  if (family_id != FAMILY_ID) {
424
416
  throw std::invalid_argument("Possible corruption: family ID must be " + std::to_string(FAMILY_ID) + ": " + std::to_string(family_id));
425
417
  }
426
418
  }
427
419
 
428
- template<typename T, typename W, typename H, typename E, typename S, typename A>
429
- void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
420
+ template<typename T, typename W, typename H, typename E, typename A>
421
+ void frequent_items_sketch<T, W, H, E, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
430
422
  if (lg_cur_size > lg_max_size) {
431
423
  throw std::invalid_argument("Possible corruption: expected lg_cur_size <= lg_max_size: " + std::to_string(lg_cur_size) + " <= " + std::to_string(lg_max_size));
432
424
  }
@@ -435,8 +427,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, ui
435
427
  }
436
428
  }
437
429
 
438
- template<typename T, typename W, typename H, typename E, typename S, typename A>
439
- string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) const {
430
+ template<typename T, typename W, typename H, typename E, typename A>
431
+ string<A> frequent_items_sketch<T, W, H, E, A>::to_string(bool print_items) const {
440
432
  // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
441
433
  // The stream does not support passing an allocator instance, and alternatives are complicated.
442
434
  std::ostringstream os;
@@ -466,23 +458,23 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
466
458
  }
467
459
 
468
460
  // version for integral signed type
469
- template<typename T, typename W, typename H, typename E, typename S, typename A>
461
+ template<typename T, typename W, typename H, typename E, typename A>
470
462
  template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type>
471
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
463
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
472
464
  if (weight < 0) {
473
465
  throw std::invalid_argument("weight must be non-negative");
474
466
  }
475
467
  }
476
468
 
477
469
  // version for integral unsigned type - no-op
478
- template<typename T, typename W, typename H, typename E, typename S, typename A>
470
+ template<typename T, typename W, typename H, typename E, typename A>
479
471
  template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type>
480
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW) {}
472
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW) {}
481
473
 
482
474
  // version for floating point type
483
- template<typename T, typename W, typename H, typename E, typename S, typename A>
475
+ template<typename T, typename W, typename H, typename E, typename A>
484
476
  template<typename WW, typename std::enable_if<std::is_floating_point<WW>::value, int>::type>
485
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
477
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
486
478
  if (weight < 0) {
487
479
  throw std::invalid_argument("weight must be non-negative");
488
480
  }
@@ -29,21 +29,27 @@ namespace datasketches {
29
29
  * This is a specialized linear-probing hash map with a reverse purge operation
30
30
  * that removes all entries in the map with values that are less than zero.
31
31
  * Based on Java implementation here:
32
- * https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ReversePurgeItemHashMap.java
32
+ * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ReversePurgeItemHashMap.java
33
33
  * author Alexander Saydakov
34
34
  */
35
35
 
36
- template<typename K, typename V = uint64_t, typename H = std::hash<K>, typename E = std::equal_to<K>, typename A = std::allocator<K>>
36
+ template<
37
+ typename K,
38
+ typename V = uint64_t,
39
+ typename H = std::hash<K>,
40
+ typename E = std::equal_to<K>,
41
+ typename A = std::allocator<K>
42
+ >
37
43
  class reverse_purge_hash_map {
38
44
  public:
39
45
  using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
40
46
  using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
41
47
 
42
- reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
48
+ reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const E& equal, const A& allocator);
43
49
  reverse_purge_hash_map(const reverse_purge_hash_map& other);
44
50
  reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
45
51
  ~reverse_purge_hash_map();
46
- reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
52
+ reverse_purge_hash_map& operator=(const reverse_purge_hash_map& other);
47
53
  reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
48
54
 
49
55
  template<typename FwdK>
@@ -65,6 +71,7 @@ private:
65
71
  static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
66
72
  static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
67
73
 
74
+ E equal_;
68
75
  A allocator_;
69
76
  uint8_t lg_cur_size_;
70
77
  uint8_t lg_max_size_;
@@ -34,7 +34,9 @@ template<typename K, typename V, typename H, typename E, typename A>
34
34
  constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
35
35
 
36
36
  template<typename K, typename V, typename H, typename E, typename A>
37
- reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size, const A& allocator):
37
+ reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
38
+ const E& equal, const A& allocator):
39
+ equal_(equal),
38
40
  allocator_(allocator),
39
41
  lg_cur_size_(lg_cur_size),
40
42
  lg_max_size_(lg_max_size),
@@ -52,6 +54,7 @@ states_(nullptr)
52
54
 
53
55
  template<typename K, typename V, typename H, typename E, typename A>
54
56
  reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
57
+ equal_(other.equal_),
55
58
  allocator_(other.allocator_),
56
59
  lg_cur_size_(other.lg_cur_size_),
57
60
  lg_max_size_(other.lg_max_size_),
@@ -80,6 +83,7 @@ states_(nullptr)
80
83
 
81
84
  template<typename K, typename V, typename H, typename E, typename A>
82
85
  reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
86
+ equal_(std::move(other.equal_)),
83
87
  allocator_(std::move(other.allocator_)),
84
88
  lg_cur_size_(other.lg_cur_size_),
85
89
  lg_max_size_(other.lg_max_size_),
@@ -119,19 +123,22 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
119
123
  }
120
124
 
121
125
  template<typename K, typename V, typename H, typename E, typename A>
122
- reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A> other) {
123
- std::swap(allocator_, other.allocator_);
124
- std::swap(lg_cur_size_, other.lg_cur_size_);
125
- std::swap(lg_max_size_, other.lg_max_size_);
126
- std::swap(num_active_, other.num_active_);
127
- std::swap(keys_, other.keys_);
128
- std::swap(values_, other.values_);
129
- std::swap(states_, other.states_);
126
+ reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(const reverse_purge_hash_map<K, V, H, E, A>& other) {
127
+ reverse_purge_hash_map copy(other);
128
+ std::swap(equal_, copy.equal_);
129
+ std::swap(allocator_, copy.allocator_);
130
+ std::swap(lg_cur_size_, copy.lg_cur_size_);
131
+ std::swap(lg_max_size_, copy.lg_max_size_);
132
+ std::swap(num_active_, copy.num_active_);
133
+ std::swap(keys_, copy.keys_);
134
+ std::swap(values_, copy.values_);
135
+ std::swap(states_, copy.states_);
130
136
  return *this;
131
137
  }
132
138
 
133
139
  template<typename K, typename V, typename H, typename E, typename A>
134
140
  reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
141
+ std::swap(equal_, other.equal_);
135
142
  std::swap(allocator_, other.allocator_);
136
143
  std::swap(lg_cur_size_, other.lg_cur_size_);
137
144
  std::swap(lg_max_size_, other.lg_max_size_);
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(fi_test)
19
19
 
20
- target_link_libraries(fi_test fi common_test)
20
+ target_link_libraries(fi_test fi common_test_lib)
21
21
 
22
22
  set_target_properties(fi_test PROPERTIES
23
23
  CXX_STANDARD 11
@@ -27,62 +27,75 @@
27
27
 
28
28
  namespace datasketches {
29
29
 
30
- using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal, test_type_serde, test_allocator<test_type>>;
30
+ using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal, test_allocator<test_type>>;
31
31
  using alloc = test_allocator<test_type>;
32
32
 
33
33
  TEST_CASE("frequent items: custom type", "[frequent_items_sketch]") {
34
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
35
- sketch.update(1, 10); // should survive the purge
36
- sketch.update(2);
37
- sketch.update(3);
38
- sketch.update(4);
39
- sketch.update(5);
40
- sketch.update(6);
41
- sketch.update(7);
42
- test_type a8(8);
43
- sketch.update(a8);
44
- REQUIRE_FALSE(sketch.is_empty());
45
- REQUIRE(sketch.get_total_weight() == 17);
46
- REQUIRE(sketch.get_estimate(1) == 10);
34
+ test_allocator_total_bytes = 0;
35
+ {
36
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
37
+ sketch.update(1, 10); // should survive the purge
38
+ sketch.update(2);
39
+ sketch.update(3);
40
+ sketch.update(4);
41
+ sketch.update(5);
42
+ sketch.update(6);
43
+ sketch.update(7);
44
+ test_type a8(8);
45
+ sketch.update(a8);
46
+ REQUIRE_FALSE(sketch.is_empty());
47
+ REQUIRE(sketch.get_total_weight() == 17);
48
+ REQUIRE(sketch.get_estimate(1) == 10);
47
49
 
48
- auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
49
- REQUIRE(items.size() == 1); // only 1 item should be above threshold
50
- REQUIRE(items[0].get_item().get_value() == 1);
51
- REQUIRE(items[0].get_estimate() == 10);
50
+ auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
51
+ REQUIRE(items.size() == 1); // only 1 item should be above threshold
52
+ REQUIRE(items[0].get_item().get_value() == 1);
53
+ REQUIRE(items[0].get_estimate() == 10);
52
54
 
53
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
54
- sketch.serialize(s);
55
- auto sketch2 = frequent_test_type_sketch::deserialize(s, alloc(0));
56
- REQUIRE_FALSE(sketch2.is_empty());
57
- REQUIRE(sketch2.get_total_weight() == 17);
58
- REQUIRE(sketch2.get_estimate(1) == 10);
59
- REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
60
- REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
55
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
56
+ sketch.serialize(s, test_type_serde());
57
+ auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), test_type_equal(), 0);
58
+ REQUIRE_FALSE(sketch2.is_empty());
59
+ REQUIRE(sketch2.get_total_weight() == 17);
60
+ REQUIRE(sketch2.get_estimate(1) == 10);
61
+ REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
62
+ REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
61
63
 
62
- auto bytes = sketch.serialize();
63
- auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), alloc(0));
64
- REQUIRE_FALSE(sketch3.is_empty());
65
- REQUIRE(sketch3.get_total_weight() == 17);
66
- REQUIRE(sketch3.get_estimate(1) == 10);
67
- REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
68
- REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
64
+ auto bytes = sketch.serialize(0, test_type_serde());
65
+ auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(),
66
+ test_type_equal(), 0);
67
+ REQUIRE_FALSE(sketch3.is_empty());
68
+ REQUIRE(sketch3.get_total_weight() == 17);
69
+ REQUIRE(sketch3.get_estimate(1) == 10);
70
+ REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
71
+ REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
72
+ }
73
+ REQUIRE(test_allocator_total_bytes == 0);
69
74
  }
70
75
 
71
76
  // this is to see the debug print from test_type if enabled there to make sure items are moved
72
77
  TEST_CASE("frequent items: moving merge", "[frequent_items_sketch]") {
73
- frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
74
- sketch1.update(1);
78
+ test_allocator_total_bytes = 0;
79
+ {
80
+ frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
81
+ sketch1.update(1);
75
82
 
76
- frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
77
- sketch2.update(2);
83
+ frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
84
+ sketch2.update(2);
78
85
 
79
- sketch2.merge(std::move(sketch1));
80
- REQUIRE(sketch2.get_total_weight() == 2);
86
+ sketch2.merge(std::move(sketch1));
87
+ REQUIRE(sketch2.get_total_weight() == 2);
88
+ }
89
+ REQUIRE(test_allocator_total_bytes == 0);
81
90
  }
82
91
 
83
92
  TEST_CASE("frequent items: negative weight", "[frequent_items_sketch]") {
84
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
85
- REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
93
+ test_allocator_total_bytes = 0;
94
+ {
95
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
96
+ REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
97
+ }
98
+ REQUIRE(test_allocator_total_bytes == 0);
86
99
  }
87
100
 
88
101
  } /* namespace datasketches */
@@ -24,20 +24,20 @@
24
24
  namespace datasketches {
25
25
 
26
26
  TEST_CASE("reverse purge hash map: empty", "[frequent_items_sketch]") {
27
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
27
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
28
28
  REQUIRE(map.get_num_active() == 0);
29
29
  REQUIRE(map.get_lg_cur_size() == 3); // static_cast<uint8_t>(3)
30
30
  }
31
31
 
32
32
  TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
33
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
33
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
34
34
  map.adjust_or_insert(1, 1);
35
35
  REQUIRE(map.get_num_active() == 1);
36
36
  REQUIRE(map.get(1) == 1);
37
37
  }
38
38
 
39
39
  TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
40
- reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
40
+ reverse_purge_hash_map<int> map(3, 4, std::equal_to<int>(), std::allocator<int>());
41
41
  for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
42
42
  uint64_t sum = 0;
43
43
  for (auto it: map) sum += it.second;
@@ -267,10 +267,10 @@ void Hll4Array<A>::shiftToBiggerCurMin() {
267
267
  for (const auto coupon: *auxHashMap_) {
268
268
  slotNum = HllUtil<A>::getLow26(coupon) & configKmask;
269
269
  oldActualVal = HllUtil<A>::getValue(coupon);
270
- newShiftedVal = oldActualVal - newCurMin;
271
- if (newShiftedVal < 0) {
270
+ if (oldActualVal < newCurMin) {
272
271
  throw std::logic_error("oldActualVal < newCurMin when incrementing curMin");
273
272
  }
273
+ newShiftedVal = oldActualVal - newCurMin;
274
274
 
275
275
  if (getSlot(slotNum) != hll_constants::AUX_TOKEN) {
276
276
  throw std::logic_error("getSlot(slotNum) != AUX_TOKEN for item in auxiliary hash map");
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(hll_test)
19
19
 
20
- target_link_libraries(hll_test hll common_test)
20
+ target_link_libraries(hll_test hll common_test_lib)
21
21
 
22
22
  set_target_properties(hll_test PROPERTIES
23
23
  CXX_STANDARD 11