datasketches 0.1.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -0,0 +1,237 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+
21
+ #include <cstring>
22
+ #include <sstream>
23
+ #include <fstream>
24
+
25
+ #include <catch.hpp>
26
+
27
+ #include "cpc_sketch.hpp"
28
+ #include "test_allocator.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ using cpc_sketch_test_alloc = cpc_sketch_alloc<test_allocator<uint8_t>>;
33
+ using alloc = test_allocator<uint8_t>;
34
+
35
+ TEST_CASE("cpc sketch allocation: serialize deserialize empty", "[cpc_sketch]") {
36
+ test_allocator_total_bytes = 0;
37
+ test_allocator_net_allocations = 0;
38
+ {
39
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
40
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
41
+ sketch.serialize(s);
42
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
43
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
44
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
45
+ REQUIRE(deserialized.validate());
46
+ }
47
+ REQUIRE(test_allocator_total_bytes == 0);
48
+ REQUIRE(test_allocator_net_allocations == 0);
49
+ }
50
+
51
+ TEST_CASE("cpc sketch allocation: serialize deserialize sparse", "[cpc_sketch]") {
52
+ test_allocator_total_bytes = 0;
53
+ test_allocator_net_allocations = 0;
54
+ {
55
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
56
+ const int n(100);
57
+ for (int i = 0; i < n; i++) sketch.update(i);
58
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
59
+ sketch.serialize(s);
60
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
61
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
62
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
63
+ REQUIRE(deserialized.validate());
64
+ }
65
+ REQUIRE(test_allocator_total_bytes == 0);
66
+ REQUIRE(test_allocator_net_allocations == 0);
67
+ }
68
+
69
+ TEST_CASE("cpc sketch allocation: serialize deserialize hybrid", "[cpc_sketch]") {
70
+ test_allocator_total_bytes = 0;
71
+ test_allocator_net_allocations = 0;
72
+ {
73
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
74
+ const int n(200);
75
+ for (int i = 0; i < n; i++) sketch.update(i);
76
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
77
+ sketch.serialize(s);
78
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
79
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
80
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
81
+ REQUIRE(deserialized.validate());
82
+ }
83
+ REQUIRE(test_allocator_total_bytes == 0);
84
+ REQUIRE(test_allocator_net_allocations == 0);
85
+ }
86
+
87
+ TEST_CASE("cpc sketch allocation: serialize deserialize pinned", "[cpc_sketch]") {
88
+ test_allocator_total_bytes = 0;
89
+ test_allocator_net_allocations = 0;
90
+ {
91
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
92
+ const int n(2000);
93
+ for (int i = 0; i < n; i++) sketch.update(i);
94
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
95
+ sketch.serialize(s);
96
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
97
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
98
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
99
+ REQUIRE(deserialized.validate());
100
+ }
101
+ REQUIRE(test_allocator_total_bytes == 0);
102
+ REQUIRE(test_allocator_net_allocations == 0);
103
+ }
104
+
105
+ TEST_CASE("cpc sketch allocation: serialize deserialize sliding", "[cpc_sketch]") {
106
+ test_allocator_total_bytes = 0;
107
+ test_allocator_net_allocations = 0;
108
+ {
109
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
110
+ const int n(20000);
111
+ for (int i = 0; i < n; i++) sketch.update(i);
112
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
113
+ sketch.serialize(s);
114
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
115
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
116
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
117
+ REQUIRE(deserialized.validate());
118
+ }
119
+ REQUIRE(test_allocator_total_bytes == 0);
120
+ REQUIRE(test_allocator_net_allocations == 0);
121
+ }
122
+
123
+ TEST_CASE("cpc sketch allocation: serializing deserialize sliding large", "[cpc_sketch]") {
124
+ test_allocator_total_bytes = 0;
125
+ test_allocator_net_allocations = 0;
126
+ {
127
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
128
+ const int n(3000000);
129
+ for (int i = 0; i < n; i++) sketch.update(i);
130
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
131
+ sketch.serialize(s);
132
+ auto deserialized = cpc_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
133
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
134
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
135
+ REQUIRE(deserialized.validate());
136
+ }
137
+ REQUIRE(test_allocator_total_bytes == 0);
138
+ REQUIRE(test_allocator_net_allocations == 0);
139
+ }
140
+
141
+ TEST_CASE("cpc sketch allocation: serialize deserialize empty, bytes", "[cpc_sketch]") {
142
+ test_allocator_total_bytes = 0;
143
+ test_allocator_net_allocations = 0;
144
+ {
145
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
146
+ auto bytes = sketch.serialize();
147
+ auto deserialized = cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, 0);
148
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
149
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
150
+ REQUIRE(deserialized.validate());
151
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, DEFAULT_SEED, 0), std::out_of_range);
152
+ }
153
+ REQUIRE(test_allocator_total_bytes == 0);
154
+ REQUIRE(test_allocator_net_allocations == 0);
155
+ }
156
+
157
+ TEST_CASE("cpc sketch allocation: serialize deserialize sparse, bytes", "[cpc_sketch]") {
158
+ test_allocator_total_bytes = 0;
159
+ test_allocator_net_allocations = 0;
160
+ {
161
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
162
+ const int n(100);
163
+ for (int i = 0; i < n; i++) sketch.update(i);
164
+ auto bytes = sketch.serialize();
165
+ auto deserialized = cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, 0);
166
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
167
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
168
+ REQUIRE(deserialized.validate());
169
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 7, DEFAULT_SEED, 0), std::out_of_range);
170
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 15, DEFAULT_SEED, 0), std::out_of_range);
171
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, DEFAULT_SEED, 0), std::out_of_range);
172
+ }
173
+ REQUIRE(test_allocator_total_bytes == 0);
174
+ REQUIRE(test_allocator_net_allocations == 0);
175
+ }
176
+
177
+ TEST_CASE("cpc sketch allocation: serialize deserialize hybrid, bytes", "[cpc_sketch]") {
178
+ test_allocator_total_bytes = 0;
179
+ test_allocator_net_allocations = 0;
180
+ {
181
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
182
+ const int n(200);
183
+ for (int i = 0; i < n; i++) sketch.update(i);
184
+ auto bytes = sketch.serialize();
185
+ auto deserialized = cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, 0);
186
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
187
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
188
+ REQUIRE(deserialized.validate());
189
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 7, DEFAULT_SEED, 0), std::out_of_range);
190
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 15, DEFAULT_SEED, 0), std::out_of_range);
191
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, DEFAULT_SEED, 0), std::out_of_range);
192
+ }
193
+ REQUIRE(test_allocator_total_bytes == 0);
194
+ REQUIRE(test_allocator_net_allocations == 0);
195
+ }
196
+
197
+ TEST_CASE("cpc sketch allocation: serialize deserialize pinned, bytes", "[cpc_sketch]") {
198
+ test_allocator_total_bytes = 0;
199
+ test_allocator_net_allocations = 0;
200
+ {
201
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
202
+ const int n(2000);
203
+ for (int i = 0; i < n; i++) sketch.update(i);
204
+ auto bytes = sketch.serialize();
205
+ auto deserialized = cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, 0);
206
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
207
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
208
+ REQUIRE(deserialized.validate());
209
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 7, DEFAULT_SEED, 0), std::out_of_range);
210
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 15, DEFAULT_SEED, 0), std::out_of_range);
211
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, DEFAULT_SEED, 0), std::out_of_range);
212
+ }
213
+ REQUIRE(test_allocator_total_bytes == 0);
214
+ REQUIRE(test_allocator_net_allocations == 0);
215
+ }
216
+
217
+ TEST_CASE("cpc sketch allocation: serialize deserialize sliding, bytes", "[cpc_sketch]") {
218
+ test_allocator_total_bytes = 0;
219
+ test_allocator_net_allocations = 0;
220
+ {
221
+ cpc_sketch_test_alloc sketch(11, DEFAULT_SEED, 0);
222
+ const int n(20000);
223
+ for (int i = 0; i < n; i++) sketch.update(i);
224
+ auto bytes = sketch.serialize();
225
+ auto deserialized = cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, 0);
226
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
227
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
228
+ REQUIRE(deserialized.validate());
229
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 7, DEFAULT_SEED, 0), std::out_of_range);
230
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), 15, DEFAULT_SEED, 0), std::out_of_range);
231
+ REQUIRE_THROWS_AS(cpc_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, DEFAULT_SEED, 0), std::out_of_range);
232
+ }
233
+ REQUIRE(test_allocator_total_bytes == 0);
234
+ REQUIRE(test_allocator_net_allocations == 0);
235
+ }
236
+
237
+ } /* namespace datasketches */
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
283
283
  REQUIRE(deserialized.validate());
284
284
  }
285
285
 
286
+ TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
287
+ cpc_sketch sketch(26);
288
+ const int n = 10000000;
289
+ for (int i = 0; i < n; i++) sketch.update(i);
290
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
291
+ auto bytes = sketch.serialize();
292
+ cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
293
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
294
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
295
+ REQUIRE(deserialized.validate());
296
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
297
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
298
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
299
+
300
+ // updating again with the same values should not change the sketch
301
+ for (int i = 0; i < n; i++) deserialized.update(i);
302
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
303
+ REQUIRE(deserialized.validate());
304
+ }
305
+
286
306
  TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
287
307
  cpc_sketch s1(11);
288
308
  s1.update(1);
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
378
398
  REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
379
399
  }
380
400
 
401
// Pins the value returned by cpc_sketch::get_max_serialized_size_bytes() for
// two arguments, 4 and 26 (presumably lg_k values; confirm against cpc_sketch.hpp).
+ TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
402
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40); // expected value spelled out as 24 + 40
403
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40)); // expected value: 0.6 * 2^26 + 40, truncated to size_t
404
+ }
405
+
381
406
  } /* namespace datasketches */
@@ -81,7 +81,7 @@ TEST_CASE("cpc union: large", "[cpc_union]") {
81
81
  cpc_union u(11);
82
82
  for (int i = 0; i < 1000; i++) {
83
83
  cpc_sketch tmp(11);
84
- for (int i = 0; i < 10000; i++) {
84
+ for (int j = 0; j < 10000; j++) {
85
85
  s.update(key);
86
86
  tmp.update(key);
87
87
  key++;
@@ -40,15 +40,20 @@ namespace datasketches {
40
40
 
41
41
  enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
42
42
 
43
- // for serialization as raw bytes
44
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
45
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
46
-
47
43
  // type W for weight must be an arithmetic type (integral or floating point)
48
- template<typename T, typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename S = serde<T>, typename A = std::allocator<T>>
44
+ template<
45
+ typename T,
46
+ typename W = uint64_t,
47
+ typename H = std::hash<T>,
48
+ typename E = std::equal_to<T>,
49
+ typename S = serde<T>,
50
+ typename A = std::allocator<T>
51
+ >
49
52
  class frequent_items_sketch {
50
53
  public:
51
54
 
55
+ static const uint8_t LG_MIN_MAP_SIZE = 3;
56
+
52
57
  /**
53
58
  * Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
54
59
  *
@@ -59,7 +64,7 @@ public:
59
64
  * @param lg_start_map_size Log2 of the starting physical size of the internal hash
60
65
  * map managed by this sketch.
61
66
  */
62
- explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE);
67
+ explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE, const A& allocator = A());
63
68
 
64
69
  /**
65
70
  * Update this sketch with an item and a positive weight (frequency count).
@@ -232,7 +237,8 @@ public:
232
237
 
233
238
  // This is a convenience alias for users
234
239
  // The type returned by the following serialize method
235
- typedef vector_u8<A> vector_bytes;
240
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
241
+
236
242
 
237
243
  /**
238
244
  * This method serializes the sketch as a vector of bytes.
@@ -249,7 +255,7 @@ public:
249
255
  * @param is input stream
250
256
  * @return an instance of the sketch
251
257
  */
252
- static frequent_items_sketch deserialize(std::istream& is);
258
+ static frequent_items_sketch deserialize(std::istream& is, const A& allocator = A());
253
259
 
254
260
  /**
255
261
  * This method deserializes a sketch from a given array of bytes.
@@ -257,7 +263,7 @@ public:
257
263
  * @param size the size of the array
258
264
  * @return an instance of the sketch
259
265
  */
260
- static frequent_items_sketch deserialize(const void* bytes, size_t size);
266
+ static frequent_items_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
261
267
 
262
268
  /**
263
269
  * Returns a human readable summary of this sketch
@@ -266,7 +272,6 @@ public:
266
272
  string<A> to_string(bool print_items = false) const;
267
273
 
268
274
  private:
269
- static const uint8_t LG_MIN_MAP_SIZE = 3;
270
275
  static const uint8_t SERIAL_VERSION = 1;
271
276
  static const uint8_t FAMILY_ID = 10;
272
277
  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;