datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -30,40 +30,42 @@
30
30
  namespace datasketches {
31
31
 
32
32
  // clang++ seems to require this declaration for CMAKE_BUILD_TYPE='Debug'
33
- template<typename T, typename W, typename H, typename E, typename S, typename A>
34
- const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
33
+ template<typename T, typename W, typename H, typename E, typename A>
34
+ const uint8_t frequent_items_sketch<T, W, H, E, A>::LG_MIN_MAP_SIZE;
35
35
 
36
- template<typename T, typename W, typename H, typename E, typename S, typename A>
37
- frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
36
+ template<typename T, typename W, typename H, typename E, typename A>
37
+ frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size,
38
+ const E& equal, const A& allocator):
38
39
  total_weight(0),
39
40
  offset(0),
40
41
  map(
41
42
  std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
42
43
  std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
44
+ equal,
43
45
  allocator
44
46
  )
45
47
  {
46
48
  if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
47
49
  }
48
50
 
49
- template<typename T, typename W, typename H, typename E, typename S, typename A>
50
- void frequent_items_sketch<T, W, H, E, S, A>::update(const T& item, W weight) {
51
+ template<typename T, typename W, typename H, typename E, typename A>
52
+ void frequent_items_sketch<T, W, H, E, A>::update(const T& item, W weight) {
51
53
  check_weight(weight);
52
54
  if (weight == 0) return;
53
55
  total_weight += weight;
54
56
  offset += map.adjust_or_insert(item, weight);
55
57
  }
56
58
 
57
- template<typename T, typename W, typename H, typename E, typename S, typename A>
58
- void frequent_items_sketch<T, W, H, E, S, A>::update(T&& item, W weight) {
59
+ template<typename T, typename W, typename H, typename E, typename A>
60
+ void frequent_items_sketch<T, W, H, E, A>::update(T&& item, W weight) {
59
61
  check_weight(weight);
60
62
  if (weight == 0) return;
61
63
  total_weight += weight;
62
64
  offset += map.adjust_or_insert(std::move(item), weight);
63
65
  }
64
66
 
65
- template<typename T, typename W, typename H, typename E, typename S, typename A>
66
- void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
67
+ template<typename T, typename W, typename H, typename E, typename A>
68
+ void frequent_items_sketch<T, W, H, E, A>::merge(const frequent_items_sketch& other) {
67
69
  if (other.is_empty()) return;
68
70
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
69
71
  for (auto it: other.map) {
@@ -73,8 +75,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch&
73
75
  total_weight = merged_total_weight;
74
76
  }
75
77
 
76
- template<typename T, typename W, typename H, typename E, typename S, typename A>
77
- void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
78
+ template<typename T, typename W, typename H, typename E, typename A>
79
+ void frequent_items_sketch<T, W, H, E, A>::merge(frequent_items_sketch&& other) {
78
80
  if (other.is_empty()) return;
79
81
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
80
82
  for (auto it: other.map) {
@@ -84,69 +86,67 @@ void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& othe
84
86
  total_weight = merged_total_weight;
85
87
  }
86
88
 
87
- template<typename T, typename W, typename H, typename E, typename S, typename A>
88
- bool frequent_items_sketch<T, W, H, E, S, A>::is_empty() const {
89
+ template<typename T, typename W, typename H, typename E, typename A>
90
+ bool frequent_items_sketch<T, W, H, E, A>::is_empty() const {
89
91
  return map.get_num_active() == 0;
90
92
  }
91
93
 
92
- template<typename T, typename W, typename H, typename E, typename S, typename A>
93
- uint32_t frequent_items_sketch<T, W, H, E, S, A>::get_num_active_items() const {
94
+ template<typename T, typename W, typename H, typename E, typename A>
95
+ uint32_t frequent_items_sketch<T, W, H, E, A>::get_num_active_items() const {
94
96
  return map.get_num_active();
95
97
  }
96
98
 
97
- template<typename T, typename W, typename H, typename E, typename S, typename A>
98
- W frequent_items_sketch<T, W, H, E, S, A>::get_total_weight() const {
99
+ template<typename T, typename W, typename H, typename E, typename A>
100
+ W frequent_items_sketch<T, W, H, E, A>::get_total_weight() const {
99
101
  return total_weight;
100
102
  }
101
103
 
102
- template<typename T, typename W, typename H, typename E, typename S, typename A>
103
- W frequent_items_sketch<T, W, H, E, S, A>::get_estimate(const T& item) const {
104
+ template<typename T, typename W, typename H, typename E, typename A>
105
+ W frequent_items_sketch<T, W, H, E, A>::get_estimate(const T& item) const {
104
106
  // if item is tracked estimate = weight + offset, otherwise 0
105
107
  const W weight = map.get(item);
106
108
  if (weight > 0) return weight + offset;
107
109
  return 0;
108
110
  }
109
111
 
110
- template<typename T, typename W, typename H, typename E, typename S, typename A>
111
- W frequent_items_sketch<T, W, H, E, S, A>::get_lower_bound(const T& item) const {
112
+ template<typename T, typename W, typename H, typename E, typename A>
113
+ W frequent_items_sketch<T, W, H, E, A>::get_lower_bound(const T& item) const {
112
114
  return map.get(item);
113
115
  }
114
116
 
115
- template<typename T, typename W, typename H, typename E, typename S, typename A>
116
- W frequent_items_sketch<T, W, H, E, S, A>::get_upper_bound(const T& item) const {
117
+ template<typename T, typename W, typename H, typename E, typename A>
118
+ W frequent_items_sketch<T, W, H, E, A>::get_upper_bound(const T& item) const {
117
119
  return map.get(item) + offset;
118
120
  }
119
121
 
120
- template<typename T, typename W, typename H, typename E, typename S, typename A>
121
- W frequent_items_sketch<T, W, H, E, S, A>::get_maximum_error() const {
122
+ template<typename T, typename W, typename H, typename E, typename A>
123
+ W frequent_items_sketch<T, W, H, E, A>::get_maximum_error() const {
122
124
  return offset;
123
125
  }
124
126
 
125
- template<typename T, typename W, typename H, typename E, typename S, typename A>
126
- double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon() const {
127
+ template<typename T, typename W, typename H, typename E, typename A>
128
+ double frequent_items_sketch<T, W, H, E, A>::get_epsilon() const {
127
129
  return EPSILON_FACTOR / (1 << map.get_lg_max_size());
128
130
  }
129
131
 
130
- template<typename T, typename W, typename H, typename E, typename S, typename A>
131
- double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon(uint8_t lg_max_map_size) {
132
+ template<typename T, typename W, typename H, typename E, typename A>
133
+ double frequent_items_sketch<T, W, H, E, A>::get_epsilon(uint8_t lg_max_map_size) {
132
134
  return EPSILON_FACTOR / (1 << lg_max_map_size);
133
135
  }
134
136
 
135
- template<typename T, typename W, typename H, typename E, typename S, typename A>
136
- double frequent_items_sketch<T, W, H, E, S, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
137
+ template<typename T, typename W, typename H, typename E, typename A>
138
+ double frequent_items_sketch<T, W, H, E, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
137
139
  return get_epsilon(lg_max_map_size) * estimated_total_weight;
138
140
  }
139
141
 
140
142
 
141
- template<typename T, typename W, typename H, typename E, typename S, typename A>
142
- typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
143
- frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type) const {
143
+ template<typename T, typename W, typename H, typename E, typename A>
144
+ auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type) const -> vector_row {
144
145
  return get_frequent_items(err_type, get_maximum_error());
145
146
  }
146
147
 
147
- template<typename T, typename W, typename H, typename E, typename S, typename A>
148
- typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
149
- frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
148
+ template<typename T, typename W, typename H, typename E, typename A>
149
+ auto frequent_items_sketch<T, W, H, E, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const -> vector_row {
150
150
  vector_row items(map.get_allocator());
151
151
  for (auto it: map) {
152
152
  const W lb = it.second;
@@ -160,9 +160,9 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
160
160
  return items;
161
161
  }
162
162
 
163
- template<typename T, typename W, typename H, typename E, typename S, typename A>
163
+ template<typename T, typename W, typename H, typename E, typename A>
164
164
  template<typename SerDe>
165
- void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
165
+ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const SerDe& sd) const {
166
166
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
167
167
  write(os, preamble_longs);
168
168
  const uint8_t serial_version = SERIAL_VERSION;
@@ -206,18 +206,18 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os, const
206
206
  }
207
207
  }
208
208
 
209
- template<typename T, typename W, typename H, typename E, typename S, typename A>
209
+ template<typename T, typename W, typename H, typename E, typename A>
210
210
  template<typename SerDe>
211
- size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
211
+ size_t frequent_items_sketch<T, W, H, E, A>::get_serialized_size_bytes(const SerDe& sd) const {
212
212
  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
213
213
  size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
214
214
  for (auto it: map) size += sd.size_of_item(it.first);
215
215
  return size;
216
216
  }
217
217
 
218
- template<typename T, typename W, typename H, typename E, typename S, typename A>
218
+ template<typename T, typename W, typename H, typename E, typename A>
219
219
  template<typename SerDe>
220
- auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
220
+ auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
221
221
  const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
222
222
  vector_bytes bytes(size, 0, map.get_allocator());
223
223
  uint8_t* ptr = bytes.data() + header_size_bytes;
@@ -266,8 +266,8 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
266
266
  return bytes;
267
267
  }
268
268
 
269
- template<typename T, typename W, typename H, typename E, typename S, typename A>
270
- class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
269
+ template<typename T, typename W, typename H, typename E, typename A>
270
+ class frequent_items_sketch<T, W, H, E, A>::items_deleter {
271
271
  public:
272
272
  items_deleter(uint32_t num, bool destroy, const A& allocator):
273
273
  allocator_(allocator), num_(num), destroy_(destroy) {}
@@ -286,14 +286,10 @@ private:
286
286
  bool destroy_;
287
287
  };
288
288
 
289
- template<typename T, typename W, typename H, typename E, typename S, typename A>
290
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
291
- return deserialize(is, S(), allocator);
292
- }
293
-
294
- template<typename T, typename W, typename H, typename E, typename S, typename A>
289
+ template<typename T, typename W, typename H, typename E, typename A>
295
290
  template<typename SerDe>
296
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
291
+ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is,
292
+ const SerDe& sd, const E& equal, const A& allocator) {
297
293
  const auto preamble_longs = read<uint8_t>(is);
298
294
  const auto serial_version = read<uint8_t>(is);
299
295
  const auto family_id = read<uint8_t>(is);
@@ -309,7 +305,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
309
305
  check_family_id(family_id);
310
306
  check_size(lg_cur_size, lg_max_size);
311
307
 
312
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
308
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
313
309
  if (!is_empty) {
314
310
  const auto num_items = read<uint32_t>(is);
315
311
  read<uint32_t>(is); // unused
@@ -335,14 +331,10 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
335
331
  return sketch;
336
332
  }
337
333
 
338
- template<typename T, typename W, typename H, typename E, typename S, typename A>
339
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
340
- return deserialize(bytes, size, S(), allocator);
341
- }
342
-
343
- template<typename T, typename W, typename H, typename E, typename S, typename A>
334
+ template<typename T, typename W, typename H, typename E, typename A>
344
335
  template<typename SerDe>
345
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
336
+ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size,
337
+ const SerDe& sd, const E& equal, const A& allocator) {
346
338
  ensure_minimum_memory(size, 8);
347
339
  const char* ptr = static_cast<const char*>(bytes);
348
340
  const char* base = static_cast<const char*>(bytes);
@@ -368,7 +360,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
368
360
  check_size(lg_cur_size, lg_max_size);
369
361
  ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
370
362
 
371
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
363
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
372
364
  if (!is_empty) {
373
365
  uint32_t num_items;
374
366
  ptr += copy_from_mem(ptr, num_items);
@@ -398,8 +390,8 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
398
390
  return sketch;
399
391
  }
400
392
 
401
- template<typename T, typename W, typename H, typename E, typename S, typename A>
402
- void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
393
+ template<typename T, typename W, typename H, typename E, typename A>
394
+ void frequent_items_sketch<T, W, H, E, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
403
395
  if (is_empty) {
404
396
  if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
405
397
  throw std::invalid_argument("Possible corruption: preamble longs of an empty sketch must be " + std::to_string(PREAMBLE_LONGS_EMPTY) + ": " + std::to_string(preamble_longs));
@@ -411,22 +403,22 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t pream
411
403
  }
412
404
  }
413
405
 
414
- template<typename T, typename W, typename H, typename E, typename S, typename A>
415
- void frequent_items_sketch<T, W, H, E, S, A>::check_serial_version(uint8_t serial_version) {
406
+ template<typename T, typename W, typename H, typename E, typename A>
407
+ void frequent_items_sketch<T, W, H, E, A>::check_serial_version(uint8_t serial_version) {
416
408
  if (serial_version != SERIAL_VERSION) {
417
409
  throw std::invalid_argument("Possible corruption: serial version must be " + std::to_string(SERIAL_VERSION) + ": " + std::to_string(serial_version));
418
410
  }
419
411
  }
420
412
 
421
- template<typename T, typename W, typename H, typename E, typename S, typename A>
422
- void frequent_items_sketch<T, W, H, E, S, A>::check_family_id(uint8_t family_id) {
413
+ template<typename T, typename W, typename H, typename E, typename A>
414
+ void frequent_items_sketch<T, W, H, E, A>::check_family_id(uint8_t family_id) {
423
415
  if (family_id != FAMILY_ID) {
424
416
  throw std::invalid_argument("Possible corruption: family ID must be " + std::to_string(FAMILY_ID) + ": " + std::to_string(family_id));
425
417
  }
426
418
  }
427
419
 
428
- template<typename T, typename W, typename H, typename E, typename S, typename A>
429
- void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
420
+ template<typename T, typename W, typename H, typename E, typename A>
421
+ void frequent_items_sketch<T, W, H, E, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
430
422
  if (lg_cur_size > lg_max_size) {
431
423
  throw std::invalid_argument("Possible corruption: expected lg_cur_size <= lg_max_size: " + std::to_string(lg_cur_size) + " <= " + std::to_string(lg_max_size));
432
424
  }
@@ -435,8 +427,8 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, ui
435
427
  }
436
428
  }
437
429
 
438
- template<typename T, typename W, typename H, typename E, typename S, typename A>
439
- string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) const {
430
+ template<typename T, typename W, typename H, typename E, typename A>
431
+ string<A> frequent_items_sketch<T, W, H, E, A>::to_string(bool print_items) const {
440
432
  // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
441
433
  // The stream does not support passing an allocator instance, and alternatives are complicated.
442
434
  std::ostringstream os;
@@ -466,23 +458,23 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
466
458
  }
467
459
 
468
460
  // version for integral signed type
469
- template<typename T, typename W, typename H, typename E, typename S, typename A>
461
+ template<typename T, typename W, typename H, typename E, typename A>
470
462
  template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type>
471
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
463
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
472
464
  if (weight < 0) {
473
465
  throw std::invalid_argument("weight must be non-negative");
474
466
  }
475
467
  }
476
468
 
477
469
  // version for integral unsigned type - no-op
478
- template<typename T, typename W, typename H, typename E, typename S, typename A>
470
+ template<typename T, typename W, typename H, typename E, typename A>
479
471
  template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type>
480
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW) {}
472
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW) {}
481
473
 
482
474
  // version for floating point type
483
- template<typename T, typename W, typename H, typename E, typename S, typename A>
475
+ template<typename T, typename W, typename H, typename E, typename A>
484
476
  template<typename WW, typename std::enable_if<std::is_floating_point<WW>::value, int>::type>
485
- void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
477
+ void frequent_items_sketch<T, W, H, E, A>::check_weight(WW weight) {
486
478
  if (weight < 0) {
487
479
  throw std::invalid_argument("weight must be non-negative");
488
480
  }
@@ -29,21 +29,27 @@ namespace datasketches {
29
29
  * This is a specialized linear-probing hash map with a reverse purge operation
30
30
  * that removes all entries in the map with values that are less than zero.
31
31
  * Based on Java implementation here:
32
- * https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ReversePurgeItemHashMap.java
32
+ * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ReversePurgeItemHashMap.java
33
33
  * author Alexander Saydakov
34
34
  */
35
35
 
36
- template<typename K, typename V = uint64_t, typename H = std::hash<K>, typename E = std::equal_to<K>, typename A = std::allocator<K>>
36
+ template<
37
+ typename K,
38
+ typename V = uint64_t,
39
+ typename H = std::hash<K>,
40
+ typename E = std::equal_to<K>,
41
+ typename A = std::allocator<K>
42
+ >
37
43
  class reverse_purge_hash_map {
38
44
  public:
39
45
  using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
40
46
  using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
41
47
 
42
- reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
48
+ reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const E& equal, const A& allocator);
43
49
  reverse_purge_hash_map(const reverse_purge_hash_map& other);
44
50
  reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
45
51
  ~reverse_purge_hash_map();
46
- reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
52
+ reverse_purge_hash_map& operator=(const reverse_purge_hash_map& other);
47
53
  reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
48
54
 
49
55
  template<typename FwdK>
@@ -65,6 +71,7 @@ private:
65
71
  static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
66
72
  static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
67
73
 
74
+ E equal_;
68
75
  A allocator_;
69
76
  uint8_t lg_cur_size_;
70
77
  uint8_t lg_max_size_;
@@ -34,7 +34,9 @@ template<typename K, typename V, typename H, typename E, typename A>
34
34
  constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
35
35
 
36
36
  template<typename K, typename V, typename H, typename E, typename A>
37
- reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size, const A& allocator):
37
+ reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
38
+ const E& equal, const A& allocator):
39
+ equal_(equal),
38
40
  allocator_(allocator),
39
41
  lg_cur_size_(lg_cur_size),
40
42
  lg_max_size_(lg_max_size),
@@ -52,6 +54,7 @@ states_(nullptr)
52
54
 
53
55
  template<typename K, typename V, typename H, typename E, typename A>
54
56
  reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
57
+ equal_(other.equal_),
55
58
  allocator_(other.allocator_),
56
59
  lg_cur_size_(other.lg_cur_size_),
57
60
  lg_max_size_(other.lg_max_size_),
@@ -80,6 +83,7 @@ states_(nullptr)
80
83
 
81
84
  template<typename K, typename V, typename H, typename E, typename A>
82
85
  reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
86
+ equal_(std::move(other.equal_)),
83
87
  allocator_(std::move(other.allocator_)),
84
88
  lg_cur_size_(other.lg_cur_size_),
85
89
  lg_max_size_(other.lg_max_size_),
@@ -119,19 +123,22 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
119
123
  }
120
124
 
121
125
  template<typename K, typename V, typename H, typename E, typename A>
122
- reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A> other) {
123
- std::swap(allocator_, other.allocator_);
124
- std::swap(lg_cur_size_, other.lg_cur_size_);
125
- std::swap(lg_max_size_, other.lg_max_size_);
126
- std::swap(num_active_, other.num_active_);
127
- std::swap(keys_, other.keys_);
128
- std::swap(values_, other.values_);
129
- std::swap(states_, other.states_);
126
+ reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(const reverse_purge_hash_map<K, V, H, E, A>& other) {
127
+ reverse_purge_hash_map copy(other);
128
+ std::swap(equal_, copy.equal_);
129
+ std::swap(allocator_, copy.allocator_);
130
+ std::swap(lg_cur_size_, copy.lg_cur_size_);
131
+ std::swap(lg_max_size_, copy.lg_max_size_);
132
+ std::swap(num_active_, copy.num_active_);
133
+ std::swap(keys_, copy.keys_);
134
+ std::swap(values_, copy.values_);
135
+ std::swap(states_, copy.states_);
130
136
  return *this;
131
137
  }
132
138
 
133
139
  template<typename K, typename V, typename H, typename E, typename A>
134
140
  reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
141
+ std::swap(equal_, other.equal_);
135
142
  std::swap(allocator_, other.allocator_);
136
143
  std::swap(lg_cur_size_, other.lg_cur_size_);
137
144
  std::swap(lg_max_size_, other.lg_max_size_);
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(fi_test)
19
19
 
20
- target_link_libraries(fi_test fi common_test)
20
+ target_link_libraries(fi_test fi common_test_lib)
21
21
 
22
22
  set_target_properties(fi_test PROPERTIES
23
23
  CXX_STANDARD 11
@@ -27,62 +27,75 @@
27
27
 
28
28
  namespace datasketches {
29
29
 
30
- using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal, test_type_serde, test_allocator<test_type>>;
30
+ using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_type_hash, test_type_equal, test_allocator<test_type>>;
31
31
  using alloc = test_allocator<test_type>;
32
32
 
33
33
  TEST_CASE("frequent items: custom type", "[frequent_items_sketch]") {
34
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
35
- sketch.update(1, 10); // should survive the purge
36
- sketch.update(2);
37
- sketch.update(3);
38
- sketch.update(4);
39
- sketch.update(5);
40
- sketch.update(6);
41
- sketch.update(7);
42
- test_type a8(8);
43
- sketch.update(a8);
44
- REQUIRE_FALSE(sketch.is_empty());
45
- REQUIRE(sketch.get_total_weight() == 17);
46
- REQUIRE(sketch.get_estimate(1) == 10);
34
+ test_allocator_total_bytes = 0;
35
+ {
36
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
37
+ sketch.update(1, 10); // should survive the purge
38
+ sketch.update(2);
39
+ sketch.update(3);
40
+ sketch.update(4);
41
+ sketch.update(5);
42
+ sketch.update(6);
43
+ sketch.update(7);
44
+ test_type a8(8);
45
+ sketch.update(a8);
46
+ REQUIRE_FALSE(sketch.is_empty());
47
+ REQUIRE(sketch.get_total_weight() == 17);
48
+ REQUIRE(sketch.get_estimate(1) == 10);
47
49
 
48
- auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
49
- REQUIRE(items.size() == 1); // only 1 item should be above threshold
50
- REQUIRE(items[0].get_item().get_value() == 1);
51
- REQUIRE(items[0].get_estimate() == 10);
50
+ auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
51
+ REQUIRE(items.size() == 1); // only 1 item should be above threshold
52
+ REQUIRE(items[0].get_item().get_value() == 1);
53
+ REQUIRE(items[0].get_estimate() == 10);
52
54
 
53
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
54
- sketch.serialize(s);
55
- auto sketch2 = frequent_test_type_sketch::deserialize(s, alloc(0));
56
- REQUIRE_FALSE(sketch2.is_empty());
57
- REQUIRE(sketch2.get_total_weight() == 17);
58
- REQUIRE(sketch2.get_estimate(1) == 10);
59
- REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
60
- REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
55
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
56
+ sketch.serialize(s, test_type_serde());
57
+ auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), test_type_equal(), 0);
58
+ REQUIRE_FALSE(sketch2.is_empty());
59
+ REQUIRE(sketch2.get_total_weight() == 17);
60
+ REQUIRE(sketch2.get_estimate(1) == 10);
61
+ REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
62
+ REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
61
63
 
62
- auto bytes = sketch.serialize();
63
- auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), alloc(0));
64
- REQUIRE_FALSE(sketch3.is_empty());
65
- REQUIRE(sketch3.get_total_weight() == 17);
66
- REQUIRE(sketch3.get_estimate(1) == 10);
67
- REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
68
- REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
64
+ auto bytes = sketch.serialize(0, test_type_serde());
65
+ auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(),
66
+ test_type_equal(), 0);
67
+ REQUIRE_FALSE(sketch3.is_empty());
68
+ REQUIRE(sketch3.get_total_weight() == 17);
69
+ REQUIRE(sketch3.get_estimate(1) == 10);
70
+ REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
71
+ REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
72
+ }
73
+ REQUIRE(test_allocator_total_bytes == 0);
69
74
  }
70
75
 
71
76
  // this is to see the debug print from test_type if enabled there to make sure items are moved
72
77
  TEST_CASE("frequent items: moving merge", "[frequent_items_sketch]") {
73
- frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
74
- sketch1.update(1);
78
+ test_allocator_total_bytes = 0;
79
+ {
80
+ frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
81
+ sketch1.update(1);
75
82
 
76
- frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
77
- sketch2.update(2);
83
+ frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
84
+ sketch2.update(2);
78
85
 
79
- sketch2.merge(std::move(sketch1));
80
- REQUIRE(sketch2.get_total_weight() == 2);
86
+ sketch2.merge(std::move(sketch1));
87
+ REQUIRE(sketch2.get_total_weight() == 2);
88
+ }
89
+ REQUIRE(test_allocator_total_bytes == 0);
81
90
  }
82
91
 
83
92
  TEST_CASE("frequent items: negative weight", "[frequent_items_sketch]") {
84
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
85
- REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
93
+ test_allocator_total_bytes = 0;
94
+ {
95
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
96
+ REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
97
+ }
98
+ REQUIRE(test_allocator_total_bytes == 0);
86
99
  }
87
100
 
88
101
  } /* namespace datasketches */
@@ -24,20 +24,20 @@
24
24
  namespace datasketches {
25
25
 
26
26
  TEST_CASE("reverse purge hash map: empty", "[frequent_items_sketch]") {
27
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
27
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
28
28
  REQUIRE(map.get_num_active() == 0);
29
29
  REQUIRE(map.get_lg_cur_size() == 3); // static_cast<uint8_t>(3)
30
30
  }
31
31
 
32
32
  TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
33
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
33
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
34
34
  map.adjust_or_insert(1, 1);
35
35
  REQUIRE(map.get_num_active() == 1);
36
36
  REQUIRE(map.get(1) == 1);
37
37
  }
38
38
 
39
39
  TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
40
- reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
40
+ reverse_purge_hash_map<int> map(3, 4, std::equal_to<int>(), std::allocator<int>());
41
41
  for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
42
42
  uint64_t sum = 0;
43
43
  for (auto it: map) sum += it.second;
@@ -267,10 +267,10 @@ void Hll4Array<A>::shiftToBiggerCurMin() {
267
267
  for (const auto coupon: *auxHashMap_) {
268
268
  slotNum = HllUtil<A>::getLow26(coupon) & configKmask;
269
269
  oldActualVal = HllUtil<A>::getValue(coupon);
270
- newShiftedVal = oldActualVal - newCurMin;
271
- if (newShiftedVal < 0) {
270
+ if (oldActualVal < newCurMin) {
272
271
  throw std::logic_error("oldActualVal < newCurMin when incrementing curMin");
273
272
  }
273
+ newShiftedVal = oldActualVal - newCurMin;
274
274
 
275
275
  if (getSlot(slotNum) != hll_constants::AUX_TOKEN) {
276
276
  throw std::logic_error("getSlot(slotNum) != AUX_TOKEN for item in auxiliary hash map");
@@ -17,7 +17,7 @@
17
17
 
18
18
  add_executable(hll_test)
19
19
 
20
- target_link_libraries(hll_test hll common_test)
20
+ target_link_libraries(hll_test hll common_test_lib)
21
21
 
22
22
  set_target_properties(hll_test PROPERTIES
23
23
  CXX_STANDARD 11