datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,43 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_executable(sampling_test)
19
+
20
+ target_link_libraries(sampling_test sampling common_test)
21
+
22
+ set_target_properties(sampling_test PROPERTIES
23
+ CXX_STANDARD 11
24
+ CXX_STANDARD_REQUIRED YES
25
+ )
26
+
27
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" SAMPLING_TEST_BINARY_PATH)
28
+ string(APPEND SAMPLING_TEST_BINARY_PATH "/")
29
+ target_compile_definitions(sampling_test
30
+ PRIVATE
31
+ TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
32
+ )
33
+
34
+ add_test(
35
+ NAME sampling_test
36
+ COMMAND sampling_test
37
+ )
38
+
39
+ target_sources(sampling_test
40
+ PRIVATE
41
+ var_opt_sketch_test.cpp
42
+ var_opt_union_test.cpp
43
+ )
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ Code snippets used to generate the binary images from Java.
21
+ Heavy items have negative weights to allow a simple predicate to filter
22
+ heavy vs light sketch entries.
23
+
24
+
25
+ varopt_sketch_string_exact.sk:
26
+ final VarOptItemsSketch<String> sk = VarOptItemsSketch.newInstance(1024);
27
+ for (int i = 1; i <= 200; ++i) {
28
+ sk.update(Integer.toString(i), 1000.0 / i);
29
+ }
30
+ byte[] bytes = sk.toByteArray(new ArrayOfStringsSerDe());
31
+
32
+
33
+ varopt_sketch_long_sampling.sk:
34
+ final VarOptItemsSketch<Long> sk = VarOptItemsSketch.newInstance(1024);
35
+ for (long i = 0; i < 2000; ++i) {
36
+ sk.update(i, 1.0);
37
+ }
38
+ sk.update(-1L, 100000.0);
39
+ sk.update(-2L, 110000.0);
40
+ sk.update(-3L, 120000.0);
41
+ byte[] bytes = sk.toByteArray(new ArrayOfLongsSerDe());
42
+
43
+
44
+ varopt_union_double_sampling.sk:
45
+ // parallels small sampling sketch test
46
+ final int kSmall = 16;
47
+ final int n1 = 32;
48
+ final int n2 = 64;
49
+ final int kMax = 128;
50
+
51
+ // small k sketch, but sampling
52
+ VarOptItemsSketch<Double> sketch = VarOptItemsSketch.newInstance(kSmall);
53
+ for (int i = 0; i < n1; ++i) {
54
+ sketch.update(1.0 * i, 1.0);
55
+ }
56
+ sketch.update(-1.0, n1 * n1); // add a heavy item
57
+
58
+ final VarOptItemsUnion<Double> union = VarOptItemsUnion.newInstance(kMax);
59
+ union.update(sketch);
60
+
61
+ // another one, but different n to get a different per-item weight
62
+ sketch = VarOptItemsSketch.newInstance(kSmall);
63
+ for (int i = 0; i < n2; ++i) {
64
+ sketch.update(1.0 * i, 1.0);
65
+ }
66
+ union.update(sketch);
67
+ byte[] bytes = union.toByteArray(new ArrayOfDoublesSerDe());
@@ -0,0 +1,509 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <var_opt_sketch.hpp>
21
+
22
+ #include <catch.hpp>
23
+
24
+ #include <vector>
25
+ #include <string>
26
+ #include <sstream>
27
+ #include <fstream>
28
+ #include <cmath>
29
+ #include <random>
30
+
31
+ #ifdef TEST_BINARY_INPUT_PATH
32
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
33
+ #else
34
+ static std::string testBinaryInputPath = "test/";
35
+ #endif
36
+
37
+ namespace datasketches {
38
+
39
+ static constexpr double EPS = 1e-13;
40
+
41
+ static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
+ var_opt_sketch<int> sk(k);
43
+ for (uint64_t i = 0; i < n; ++i) {
44
+ sk.update(i, 1.0);
45
+ }
46
+ return sk;
47
+ }
48
+
49
+ template<typename T, typename S, typename A>
50
+ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
51
+ REQUIRE(sk1.get_k() == sk2.get_k());
52
+ REQUIRE(sk1.get_n() == sk2.get_n());
53
+ REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
54
+
55
+ auto it1 = sk1.begin();
56
+ auto it2 = sk2.begin();
57
+ size_t i = 0;
58
+
59
+ while ((it1 != sk1.end()) && (it2 != sk2.end())) {
60
+ const std::pair<const T&, const double> p1 = *it1;
61
+ const std::pair<const T&, const double> p2 = *it2;
62
+ REQUIRE(p1.first == p2.first); // data values
63
+ REQUIRE(p1.second == p2.second); // weights
64
+ ++i;
65
+ ++it1;
66
+ ++it2;
67
+ }
68
+
69
+ REQUIRE((it1 == sk1.end() && it2 == sk2.end())); // iterators must end at the same time
70
+ }
71
+
72
+ TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
73
+ REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
74
+ REQUIRE_THROWS_AS(var_opt_sketch<int>(1 << 31), std::invalid_argument); // aka k < 0
75
+ }
76
+
77
+ TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
78
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
79
+ std::vector<uint8_t> bytes = sk.serialize();
80
+ bytes[1] = 0; // corrupt the serialization version byte
81
+
82
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
83
+
84
+ // create a stringstream to check the same
85
+ std::stringstream ss;
86
+ std::string str(bytes.begin(), bytes.end());
87
+ ss.str(str);
88
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
89
+ }
90
+
91
+ TEST_CASE("varopt sketch: bad family", "[var_opt_sketch]") {
92
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
93
+ std::vector<uint8_t> bytes = sk.serialize();
94
+ bytes[2] = 0; // corrupt the family byte
95
+
96
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
97
+
98
+ // create a stringstream to check the same
99
+ std::stringstream ss;
100
+ std::string str(bytes.begin(), bytes.end());
101
+ ss.str(str);
102
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
103
+ }
104
+
105
+ TEST_CASE("varopt sketch: bad prelongs", "[var_opt_sketch]") {
106
+ // The number of preamble longs shares bits with resize_factor, but the latter
107
+ // has no invalid values as it gets 2 bits for 4 enum values.
108
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
109
+ std::vector<uint8_t> bytes = sk.serialize();
110
+
111
+ bytes[0] = 0; // corrupt the preamble longs byte to be too small
112
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
113
+
114
+ bytes[0] = 2; // corrupt the preamble longs byte to 2
115
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
116
+
117
+ bytes[0] = 5; // corrupt the preamble longs byte to be too large
118
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
119
+ }
120
+
121
+ TEST_CASE("varopt sketch: malformed preamble", "[var_opt_sketch]") {
122
+ uint32_t k = 50;
123
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
124
+ const std::vector<uint8_t> src_bytes = sk.serialize();
125
+
126
+ // we'll re-use the same bytes several times so we'll use copies
127
+ std::vector<uint8_t> bytes(src_bytes);
128
+
129
+ // no items in R, but preamble longs indicates full
130
+ bytes[0] = 4; // PREAMBLE_LONGS_FULL
131
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
132
+
133
+ // k = 0
134
+ bytes = src_bytes;
135
+ *reinterpret_cast<int32_t*>(&bytes[4]) = 0;
136
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
137
+
138
+ // negative H region count in Java (signed ints)
139
+ // throws due to H count != n in exact mode
140
+ bytes = src_bytes;
141
+ *reinterpret_cast<int32_t*>(&bytes[16]) = -1;
142
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
143
+
144
+ // negative R region count in Java (signed ints)
145
+ // throws due to non-zero R in sampling mode
146
+ bytes = src_bytes;
147
+ *reinterpret_cast<int32_t*>(&bytes[20]) = -128;
148
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
149
+ }
150
+
151
+ TEST_CASE("varopt sketch: empty sketch", "[var_opt_sketch]") {
152
+ var_opt_sketch<std::string> sk(5);
153
+ REQUIRE(sk.get_n() == 0);
154
+ REQUIRE(sk.get_num_samples() == 0);
155
+
156
+ std::vector<uint8_t> bytes = sk.serialize();
157
+ REQUIRE(bytes.size() == (1 << 3)); // num bytes in PREAMBLE_LONGS_EMPTY
158
+
159
+ var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
160
+ REQUIRE(loaded_sk.get_n() == 0);
161
+ REQUIRE(loaded_sk.get_num_samples() == 0);
162
+ }
163
+
164
+ TEST_CASE("varopt sketch: non-empty degenerate sketch", "[var_opt_sketch]") {
165
+ // Make an empty serialized sketch, then extend it to a
166
+ // PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
167
+ // Then clear the empty flag so it will try to load the rest.
168
+ var_opt_sketch<std::string> sk(12, resize_factor::X2);
169
+ std::vector<uint8_t> bytes = sk.serialize();
170
+ while (bytes.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
171
+ bytes.push_back((uint8_t) 0);
172
+ }
173
+
174
+ // ensure non-empty -- H and R region sizes already set to 0
175
+ bytes[3] = 0; // set flags bit to not-empty (other bits should already be 0)
176
+
177
+ REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
178
+ }
179
+
180
+ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
181
+ var_opt_sketch<std::string> sk(100, resize_factor::X2);
182
+ REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
183
+
184
+ // should not throw but sketch should still be empty
185
+ sk.update("zero weight", 0.0);
186
+ REQUIRE(sk.is_empty());
187
+ }
188
+
189
+ TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") {
190
+ var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
191
+ auto bytes = sk.serialize();
192
+
193
+ // weights are in the first double after the preamble
194
+ size_t preamble_bytes = (bytes[0] & 0x3f) << 3;
195
+ *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;
196
+
197
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
198
+
199
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
200
+ for (auto& b : bytes) { ss >> b; }
201
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
202
+ }
203
+
204
+ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
205
+ uint32_t k = 256;
206
+ uint64_t n = 10 * k;
207
+ var_opt_sketch<int> sk(k);
208
+
209
+ std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
210
+ std::mt19937_64 rand(rd());
211
+ std::normal_distribution<double> N(0.0, 1.0);
212
+
213
+ double input_sum = 0.0;
214
+ for (size_t i = 0; i < n; ++i) {
215
+ // generate weights above and below 1.0 using w ~ exp(5*N(0,1))
216
+ // which covers about 10 orders of magnitude
217
+ double w = std::exp(5 * N(rand));
218
+ input_sum += w;
219
+ sk.update(i, w);
220
+ }
221
+
222
+ double output_sum = 0.0;
223
+ for (auto& it : sk) { // std::pair<int, weight>
224
+ output_sum += it.second;
225
+ }
226
+
227
+ double weight_ratio = output_sum / input_sum;
228
+ REQUIRE(std::abs(weight_ratio - 1.0) == Approx(0).margin(EPS));
229
+ }
230
+
231
+ TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") { // round-trips a sketch with n < k through bytes and through a stream
232
+ var_opt_sketch<int> sk = create_unweighted_sketch(100, 10); // need n < k
233
+
234
+ auto bytes = sk.serialize();
235
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
236
+ check_if_equal(sk, sk_from_bytes);
237
+
238
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
239
+ sk.serialize(ss);
240
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
241
+ check_if_equal(sk, sk_from_stream);
242
+
243
+ // ensure we unroll properly: input truncated by one byte must throw (out_of_range for the buffer, runtime_error for the stream)
244
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
245
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 1);
246
+ ss.str(str_trunc); // replace the stream contents with the truncated image
247
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error);
248
+ }
249
+
250
+ TEST_CASE("varopt sketch: end-of-warmup sketch serialization", "[var_opt_sketch]") { // round-trips a sketch with n == k through bytes and through a stream
251
+ var_opt_sketch<int> sk = create_unweighted_sketch(2843, 2843); // need n == k
252
+ auto bytes = sk.serialize();
253
+
254
+ // ensure still only 3 preamble longs (count is stored in the low 6 bits of byte 0)
255
+ REQUIRE((bytes.data()[0] & 0x3f) == 3); // PREAMBLE_LONGS_WARMUP
256
+
257
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
258
+ check_if_equal(sk, sk_from_bytes);
259
+
260
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
261
+ sk.serialize(ss);
262
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
263
+ check_if_equal(sk, sk_from_stream);
264
+
265
+ // ensure we unroll properly: truncated input must throw. NOTE(review): the buffer is cut by 1000 bytes but the stream by 100 -- confirm the difference is intentional
266
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 1000), std::out_of_range);
267
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 100);
268
+ ss.str(str_trunc); // replace the stream contents with the truncated image
269
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error);
270
+ }
271
+
272
+ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") { // round-trips a sketch holding both heavy and light items through bytes and through a stream
273
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 32);
274
+ sk.update(100, 100.0);
275
+ sk.update(101, 101.0);
276
+
277
+ // first 2 entries should be heavy and in heap order (smallest at root)
278
+ auto it = sk.begin();
279
+ const std::pair<const int, const double> p1 = *it;
280
+ ++it;
281
+ const std::pair<const int, const double> p2 = *it;
282
+ REQUIRE(p1.second == Approx(100.0).margin(EPS));
283
+ REQUIRE(p2.second == Approx(101.0).margin(EPS));
284
+ REQUIRE(p1.first == 100);
285
+ REQUIRE(p2.first == 101);
286
+
287
+ // check for 4 preamble longs (count is stored in the low 6 bits of byte 0)
288
+ auto bytes = sk.serialize();
289
+ REQUIRE((bytes.data()[0] & 0x3f) == 4); // PREAMBLE_LONGS_FULL -- one more long than the 3-long warmup preamble
290
+
291
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
292
+ check_if_equal(sk, sk_from_bytes);
293
+
294
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
295
+ sk.serialize(ss);
296
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
297
+ check_if_equal(sk, sk_from_stream);
298
+
299
+ // ensure we unroll properly: input truncated by 100 bytes must throw (out_of_range for the buffer, runtime_error for the stream)
300
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 100), std::out_of_range);
301
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 100);
302
+ ss.str(str_trunc); // replace the stream contents with the truncated image
303
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error);
304
+ }
305
+
306
+ TEST_CASE("varopt sketch: string serialization", "[var_opt_sketch]") { // round-trips a sketch of variable-length strings through bytes and through a stream
307
+ var_opt_sketch<std::string> sk(5);
308
+ sk.update("a", 1.0);
309
+ sk.update("bc", 1.0);
310
+ sk.update("def", 1.0);
311
+ sk.update("ghij", 1.0);
312
+ sk.update("klmno", 1.0);
313
+ sk.update("heavy item", 100.0); // sixth update into a k = 5 sketch, with a much larger weight
314
+
315
+ auto bytes = sk.serialize();
316
+ var_opt_sketch<std::string> sk_from_bytes = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
317
+ check_if_equal(sk, sk_from_bytes);
318
+
319
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
320
+ sk.serialize(ss);
321
+ var_opt_sketch<std::string> sk_from_stream = var_opt_sketch<std::string>::deserialize(ss);
322
+ check_if_equal(sk, sk_from_stream);
323
+
324
+ // ensure we unroll properly: input truncated by 12 bytes must throw (out_of_range for the buffer, runtime_error for the stream)
325
+ REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size() - 12), std::out_of_range);
326
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 12);
327
+ ss.str(str_trunc); // replace the stream contents with the truncated image
328
+ REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(ss), std::runtime_error);
329
+ }
330
+
331
+ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") { // checks the first sample's weight after k+2 unit-weight updates
332
+ uint32_t k = 1024;
333
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
334
+ sk.update(0, 1.0); // k+2nd update
335
+
336
+ // check the first weight, assuming all k items are unweighted
337
+ // (and consequently in R).
338
+ // Expected: (k + 2) / |R| = (k + 2) / k
339
+ auto it = sk.begin();
340
+ double wt = (*it).second; // .second is the weight of the first sample
341
+ REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
342
+ }
343
+
344
+ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") { // checks first and last sample weights after k heavy updates on a sketch already in sampling mode
345
+ uint32_t k = 1024;
346
+ double wt_scale = 10.0 * k;
347
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
348
+
349
+ // Next k-1 updates should be update_pseudo_heavy_general()
350
+ // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
351
+ // added k-1 heavy items, leaving only 1 item left in R
352
+ for (uint32_t i = 1; i <= k; ++i) {
353
+ sk.update(-static_cast<int>(i), k + (i * wt_scale)); // cast before negating: -i on an unsigned value wraps instead of going negative
354
+ }
355
+
356
+ auto it = sk.begin();
357
+
358
+ // Expected: lightest "heavy" item (first one out): k + 2*wt_scale
359
+ double wt = (*it).second;
360
+ REQUIRE(wt == Approx(1.0 * (k + (2 * wt_scale))).margin(EPS));
361
+
362
+ // we don't know which R item is left, but there should be only one, at the end
363
+ // of the sample set.
364
+ // Expected: k+1 + (min "heavy" item) / |R| = ((k+1) + (k + wt_scale)) / 1 = wt_scale + 2k + 1
365
+ while (it != sk.end()) { // walk to the end, remembering the last weight seen
366
+ wt = (*it).second;
367
+ ++it;
368
+ }
369
+ REQUIRE(wt == Approx(1.0 + wt_scale + (2 * k)).margin(EPS));
370
+ }
371
+
372
+ TEST_CASE("varopt sketch: reset", "[var_opt_sketch]") { // reset() must zero n and keep k, whether the sketch had more or fewer than k updates
373
+ uint32_t k = 1024;
374
+ uint64_t n1 = 20; // fewer than k updates
375
+ uint64_t n2 = 2 * k; // more than k updates
376
+ var_opt_sketch<std::string> sk(k);
377
+
378
+ // reset from sampling mode
379
+ for (uint64_t i = 0; i < n2; ++i) {
380
+ sk.update(std::to_string(i), 100.0 + i);
381
+ }
382
+ REQUIRE(sk.get_n() == n2);
383
+ REQUIRE(sk.get_k() == k);
384
+
385
+ sk.reset();
386
+ REQUIRE(sk.get_n() == 0);
387
+ REQUIRE(sk.get_k() == k);
388
+
389
+ // reset from exact mode
390
+ for (uint64_t i = 0; i < n1; ++i)
391
+ sk.update(std::to_string(i)); // no explicit weight passed here
392
+ REQUIRE(sk.get_n() == n1);
393
+ REQUIRE(sk.get_k() == k);
394
+
395
+ sk.reset();
396
+ REQUIRE(sk.get_n() == 0);
397
+ REQUIRE(sk.get_k() == k);
398
+ }
399
+
400
+ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") { // exercises estimate_subset_sum() on empty, exact-mode and sampling-mode sketches
401
+ uint32_t k = 10;
402
+ var_opt_sketch<int> sk(k);
403
+
404
+ // empty sketch -- all zeros
405
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
406
+ REQUIRE(summary.estimate == 0.0);
407
+ REQUIRE(summary.total_sketch_weight == 0.0);
408
+
409
+ // add items, keeping in exact mode
410
+ double total_weight = 0.0;
411
+ for (uint32_t i = 1; i <= (k - 1); ++i) {
412
+ sk.update(i, 1.0 * i);
413
+ total_weight += 1.0 * i;
414
+ }
415
+
416
+ summary = sk.estimate_subset_sum([](int){ return true; });
417
+ REQUIRE(summary.estimate == total_weight);
418
+ REQUIRE(summary.lower_bound == total_weight);
419
+ REQUIRE(summary.upper_bound == total_weight);
420
+ REQUIRE(summary.total_sketch_weight == total_weight);
421
+
422
+ // add a few more items, pushing to sampling mode
423
+ for (uint32_t i = k; i <= (k + 1); ++i) {
424
+ sk.update(i, 1.0 * i);
425
+ total_weight += 1.0 * i;
426
+ }
427
+
428
+ // predicate always true so estimate == upper bound
429
+ summary = sk.estimate_subset_sum([](int){ return true; });
430
+ REQUIRE(summary.estimate == Approx(total_weight).margin(EPS));
431
+ REQUIRE(summary.upper_bound == Approx(total_weight).margin(EPS));
432
+ REQUIRE(summary.lower_bound < total_weight);
433
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
434
+
435
+ // predicate always false so estimate == lower bound == 0.0
436
+ summary = sk.estimate_subset_sum([](int){ return false; });
437
+ REQUIRE(summary.estimate == 0.0);
438
+ REQUIRE(summary.lower_bound == 0.0);
439
+ REQUIRE(summary.upper_bound > 0.0);
440
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
441
+
442
+ // finally, a non-degenerate predicate
443
+ // insert negative items with identical weights, filter for negative items only
444
+ for (uint32_t i = 1; i <= (k + 1); ++i) {
445
+ sk.update(static_cast<int32_t>(-i), 1.0 * i);
446
+ total_weight += 1.0 * i;
447
+ }
448
+
449
+ summary = sk.estimate_subset_sum([](int x) { return x < 0; });
450
+ REQUIRE(summary.estimate >= summary.lower_bound);
451
+ REQUIRE(summary.estimate <= summary.upper_bound);
452
+
453
+ // allow pretty generous bounds when testing
454
+ REQUIRE(summary.lower_bound < (total_weight / 1.4));
455
+ REQUIRE(summary.upper_bound > (total_weight / 2.6));
456
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
457
+
458
+ // and another data type, keeping it in exact mode for simplicity
459
+ var_opt_sketch<bool> sk2(k);
460
+ total_weight = 0.0;
461
+ for (uint32_t i = 1; i <= (k - 1); ++i) {
462
+ sk2.update((i % 2) == 0, 1.0 * i); // item is true for even i, false for odd i
463
+ total_weight += i;
464
+ }
465
+
466
+ summary = sk2.estimate_subset_sum([](bool b){ return !b; }); // selects the false (odd-i) items
467
+ REQUIRE(summary.estimate == summary.lower_bound);
468
+ REQUIRE(summary.estimate == summary.upper_bound);
469
+ REQUIRE(summary.estimate < total_weight); // exact mode, so know it must be strictly less
470
+ }
471
+
472
+ TEST_CASE("varopt sketch: deserialize exact from java", "[var_opt_sketch]") { // reads a stored binary string sketch and checks k, n, sample count and total weight
473
+ std::ifstream is;
474
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw
475
+ is.open(testBinaryInputPath + "varopt_sketch_string_exact.sk", std::ios::binary);
476
+ var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
477
+ REQUIRE_FALSE(sketch.is_empty());
478
+ REQUIRE(sketch.get_k() == 1024);
479
+ REQUIRE(sketch.get_n() == 200);
480
+ REQUIRE(sketch.get_num_samples() == 200);
481
+ subset_summary ss = sketch.estimate_subset_sum([](std::string){ return true; });
482
+
483
+ double tgt_wt = 0.0; // expected total weight: sum of 1000/i for i = 1..200
484
+ for (int i = 1; i <= 200; ++i) { tgt_wt += 1000.0 / i; }
485
+ REQUIRE(ss.total_sketch_weight == Approx(tgt_wt).margin(EPS));
486
+ }
487
+
488
+
489
+ TEST_CASE("varopt sketch: deserialize sampling from java", "[var_opt_sketch]") { // reads a stored binary int64 sketch and checks k, n, sample count and subset sums
490
+ std::ifstream is;
491
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw
492
+ is.open(testBinaryInputPath + "varopt_sketch_long_sampling.sk", std::ios::binary);
493
+ var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
494
+ REQUIRE_FALSE(sketch.is_empty());
495
+ REQUIRE(sketch.get_k() == 1024);
496
+ REQUIRE(sketch.get_n() == 2003);
497
+ REQUIRE(sketch.get_num_samples() == sketch.get_k());
498
+ subset_summary ss = sketch.estimate_subset_sum([](int64_t){ return true; }); // all items
499
+ REQUIRE(ss.estimate == Approx(332000.0).margin(EPS));
500
+ REQUIRE(ss.total_sketch_weight == Approx(332000.0).margin(EPS));
501
+
502
+ ss = sketch.estimate_subset_sum([](int64_t x){ return x < 0; }); // negative items only
503
+ REQUIRE(ss.estimate == 330000.0); // heavy item, weight is exact
504
+
505
+ ss = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; }); // non-negative items only
506
+ REQUIRE(ss.estimate == Approx(2000.0).margin(EPS));
507
+ }
508
+
509
+ }