datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,43 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_executable(sampling_test) # declared without sources; they are attached by target_sources() at the end
19
+
20
+ target_link_libraries(sampling_test sampling common_test) # targets the test executable links against
21
+
22
+ set_target_properties(sampling_test PROPERTIES
23
+ CXX_STANDARD 11
24
+ CXX_STANDARD_REQUIRED YES
25
+ )
26
+
27
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" SAMPLING_TEST_BINARY_PATH) # this directory, as a CMake-style path
28
+ string(APPEND SAMPLING_TEST_BINARY_PATH "/") # trailing separator so a file name can be appended directly
29
+ target_compile_definitions(sampling_test
30
+ PRIVATE
31
+ TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}" # string macro the C++ tests use to initialise their input path
32
+ )
33
+
34
+ add_test( # register the executable as a CTest test of the same name
35
+ NAME sampling_test
36
+ COMMAND sampling_test
37
+ )
38
+
39
+ target_sources(sampling_test
40
+ PRIVATE
41
+ var_opt_sketch_test.cpp
42
+ var_opt_union_test.cpp
43
+ )
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ Code snippets used to generate the binary images from Java.
21
+ Heavy items have negative values to allow a simple predicate to filter
22
+ heavy vs light sketch entries.
23
+
24
+
25
+ varopt_sketch_string_exact.bin:
26
+ final VarOptItemsSketch<String> sk = VarOptItemsSketch.newInstance(1024);
27
+ for (int i = 1; i <= 200; ++i) {
28
+ sk.update(Integer.toString(i), 1000.0 / i);
29
+ }
30
+ byte[] bytes = sk.toByteArray(new ArrayOfStringsSerDe());
31
+
32
+
33
+ varopt_sketch_long_sampling.bin:
34
+ final VarOptItemsSketch<Long> sk = VarOptItemsSketch.newInstance(1024);
35
+ for (long i = 0; i < 2000; ++i) {
36
+ sk.update(i, 1.0);
37
+ }
38
+ sk.update(-1L, 100000.0);
39
+ sk.update(-2L, 110000.0);
40
+ sk.update(-3L, 120000.0);
41
+ byte[] bytes = sk.toByteArray(new ArrayOfLongsSerDe());
42
+
43
+
44
+ varopt_union_double_sampling.bin:
45
+ // parallels small sampling sketch test
46
+ final int kSmall = 16;
47
+ final int n1 = 32;
48
+ final int n2 = 64;
49
+ final int kMax = 128;
50
+
51
+ // small k sketch, but sampling
52
+ VarOptItemsSketch<Double> sketch = VarOptItemsSketch.newInstance(kSmall);
53
+ for (int i = 0; i < n1; ++i) {
54
+ sketch.update(1.0 * i, 1.0);
55
+ }
56
+ sketch.update(-1.0, n1 * n1); // add a heavy item
57
+
58
+ final VarOptItemsUnion<Double> union = VarOptItemsUnion.newInstance(kMax);
59
+ union.update(sketch);
60
+
61
+ // another one, but different n to get a different per-item weight
62
+ sketch = VarOptItemsSketch.newInstance(kSmall);
63
+ for (int i = 0; i < n2; ++i) {
64
+ sketch.update(1.0 * i, 1.0);
65
+ }
66
+ union.update(sketch);
67
+ byte[] bytes = union.toByteArray(new ArrayOfDoublesSerDe());
@@ -0,0 +1,509 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <var_opt_sketch.hpp>
21
+
22
+ #include <catch.hpp>
23
+
24
+ #include <vector>
25
+ #include <string>
26
+ #include <sstream>
27
+ #include <fstream>
28
+ #include <cmath>
29
+ #include <random>
30
+
31
+ #ifdef TEST_BINARY_INPUT_PATH
32
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
33
+ #else
34
+ static std::string testBinaryInputPath = "test/";
35
+ #endif
36
+
37
+ namespace datasketches {
38
+
39
+ static constexpr double EPS = 1e-13;
40
+
41
+ static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) { // sketch built with k, fed the items 0..n-1, each with weight 1.0
42
+   var_opt_sketch<int> sketch(k);
43
+   for (uint64_t item = 0; item != n; ++item) {
44
+     sketch.update(static_cast<int>(item), 1.0); // spells out the uint64_t -> int conversion
45
+   }
46
+   return sketch;
47
+ }
48
+
49
+ template<typename T, typename S, typename A>
50
+ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
51
+   // Two sketches are treated as equal when k, n and the sample count agree and
52
+   // iterating both in lockstep yields identical (item, weight) pairs.
53
+   REQUIRE(sk1.get_k() == sk2.get_k());
54
+   REQUIRE(sk1.get_n() == sk2.get_n());
55
+   REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
56
+
57
+   auto it1 = sk1.begin();
58
+   auto it2 = sk2.begin();
59
+   while ((it1 != sk1.end()) && (it2 != sk2.end())) {
60
+     const std::pair<const T&, const double> p1 = *it1;
61
+     const std::pair<const T&, const double> p2 = *it2;
62
+     REQUIRE(p1.first == p2.first); // data values
63
+     REQUIRE(p1.second == p2.second); // weights
64
+     ++it1;
65
+     ++it2;
66
+   }
67
+
68
+   // the loop stops as soon as either iterator is exhausted, so check both are
69
+   REQUIRE((it1 == sk1.end() && it2 == sk2.end())); // iterators must end at the same time
70
+ }
71
+
72
+ TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") { // the constructor must reject out-of-range values of k
73
+   REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
74
+   REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // 2^31, aka k < 0 as a signed 32-bit int; 1U keeps the shift out of the sign bit
75
+ }
76
+
77
+ TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
78
+   var_opt_sketch<int> sketch = create_unweighted_sketch(16, 16);
79
+   std::vector<uint8_t> image = sketch.serialize();
80
+   image[1] = 0; // overwrite the serialization version byte with an invalid value
81
+
82
+   // the byte-array overload has to reject the corrupted image ...
83
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size()), std::invalid_argument);
84
+
85
+   // ... and so does the stream overload when handed the same bytes
86
+   const std::string image_str(image.begin(), image.end());
87
+   std::stringstream stream(image_str);
88
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(stream), std::invalid_argument);
89
+ }
90
+
91
+ TEST_CASE("varopt sketch: bad family", "[var_opt_sketch]") {
92
+   var_opt_sketch<int> sketch = create_unweighted_sketch(16, 16);
93
+   std::vector<uint8_t> image = sketch.serialize();
94
+   image[2] = 0; // overwrite the family byte with an invalid value
95
+
96
+   // deserializing from the raw bytes must fail ...
97
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size()), std::invalid_argument);
98
+
99
+   // ... and so must deserializing the same bytes from a stream
100
+   const std::string image_str(image.begin(), image.end());
101
+   std::stringstream stream(image_str);
102
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(stream), std::invalid_argument);
103
+ }
104
+
105
+ TEST_CASE("varopt sketch: bad prelongs", "[var_opt_sketch]") {
106
+   // Byte 0 carries the number of preamble longs alongside resize_factor. Only the
107
+   // former can be invalid: resize_factor uses 2 bits for its 4 enum values.
108
+   var_opt_sketch<int> sketch = create_unweighted_sketch(32, 33);
109
+   std::vector<uint8_t> image = sketch.serialize();
110
+
111
+   image[0] = 0; // preamble longs value that is too small
112
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size()), std::invalid_argument);
113
+
114
+   image[0] = 2; // preamble longs value of 2
115
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size()), std::invalid_argument);
116
+
117
+   image[0] = 5; // preamble longs value that is too large
118
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size()), std::invalid_argument);
119
+ }
120
+
121
+ TEST_CASE("varopt sketch: malformed preamble", "[var_opt_sketch]") {
122
+   const uint32_t k = 50;
123
+   var_opt_sketch<int> sketch = create_unweighted_sketch(k, k);
124
+   const std::vector<uint8_t> pristine = sketch.serialize();
125
+
126
+   // every scenario below starts over from a fresh copy of the valid image
127
+   std::vector<uint8_t> corrupted(pristine);
128
+
129
+   // preamble longs claims a full sketch although there are no items in R
130
+   corrupted[0] = 4; // PREAMBLE_LONGS_FULL
131
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(corrupted.data(), corrupted.size()), std::invalid_argument);
132
+
133
+   // k = 0
134
+   corrupted = pristine;
135
+   *reinterpret_cast<int32_t*>(corrupted.data() + 4) = 0;
136
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(corrupted.data(), corrupted.size()), std::invalid_argument);
137
+
138
+   // H region count that would be negative in Java (signed ints);
139
+   // throws due to H count != n in exact mode
140
+   corrupted = pristine;
141
+   *reinterpret_cast<int32_t*>(corrupted.data() + 16) = -1;
142
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(corrupted.data(), corrupted.size()), std::invalid_argument);
143
+
144
+   // R region count that would be negative in Java (signed ints);
145
+   // throws due to non-zero R in sampling mode
146
+   corrupted = pristine;
147
+   *reinterpret_cast<int32_t*>(corrupted.data() + 20) = -128;
148
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(corrupted.data(), corrupted.size()), std::invalid_argument);
149
+ }
150
+
151
+ TEST_CASE("varopt sketch: empty sketch", "[var_opt_sketch]") {
152
+   var_opt_sketch<std::string> sketch(5);
153
+   REQUIRE(sketch.get_n() == 0);
154
+   REQUIRE(sketch.get_num_samples() == 0);
155
+
156
+   std::vector<uint8_t> image = sketch.serialize();
157
+   REQUIRE(image.size() == (1 << 3)); // an empty sketch serializes to 8 bytes: num bytes in PREAMBLE_LONGS_EMPTY
158
+
159
+   var_opt_sketch<std::string> restored = var_opt_sketch<std::string>::deserialize(image.data(), image.size());
160
+   REQUIRE(restored.get_n() == 0);
161
+   REQUIRE(restored.get_num_samples() == 0);
162
+ }
163
+
164
+ TEST_CASE("varopt sketch: non-empty degenerate sketch", "[var_opt_sketch]") {
165
+   // Serialize an empty sketch, zero-pad the image up to the size of a
166
+   // PREAMBLE_LONGS_WARMUP preamble (still without items), then clear the
167
+   // empty flag so that deserialization goes on to read the remainder.
168
+   var_opt_sketch<std::string> sketch(12, resize_factor::X2);
169
+   std::vector<uint8_t> image = sketch.serialize();
170
+   if (image.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
171
+     image.resize(24, 0);
172
+   }
173
+
174
+   // mark as non-empty -- the H and R region sizes are already 0
175
+   image[3] = 0; // zero the flags byte so it reads as not-empty (other bits should already be 0)
176
+
177
+   REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(image.data(), image.size()), std::invalid_argument);
178
+ }
179
+
180
+ TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
181
+   var_opt_sketch<std::string> sketch(100, resize_factor::X2);
182
+   REQUIRE_THROWS_AS(sketch.update("invalid_weight", -1.0), std::invalid_argument); // a negative weight is an error
183
+
184
+   // a zero weight should not throw, but the sketch should still be empty afterwards
185
+   sketch.update("zero weight", 0.0);
186
+   REQUIRE(sketch.is_empty());
187
+ }
188
+
189
+ TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") {
190
+   var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
191
+   auto bytes = sk.serialize();
192
+
193
+   // weights are in the first double after the preamble; make that one negative
194
+   size_t preamble_bytes = (bytes[0] & 0x3f) << 3; // low 6 bits of byte 0 = preamble longs, 8 bytes each
195
+   *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;
196
+
197
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
198
+
199
+   std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
200
+   ss.write(reinterpret_cast<const char*>(bytes.data()), static_cast<std::streamsize>(bytes.size())); // put the corrupted image INTO the stream so the stream overload parses the same bytes
201
+   REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
202
+ }
203
+
204
+ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
205
+   const uint32_t k = 256;
206
+   const uint64_t n = 10 * k;
207
+   var_opt_sketch<int> sketch(k);
208
+
209
+   std::random_device seed_source; // possibly unsafe in MinGW with GCC < 9.2
210
+   std::mt19937_64 rng(seed_source());
211
+   std::normal_distribution<double> gauss(0.0, 1.0);
212
+
213
+   double total_in = 0.0;
214
+   for (size_t i = 0; i < n; ++i) {
215
+     // draw weights above and below 1.0 as w ~ exp(5*N(0,1)),
216
+     // which covers about 10 orders of magnitude
217
+     const double weight = std::exp(5 * gauss(rng));
218
+     total_in += weight;
219
+     sketch.update(static_cast<int>(i), weight);
220
+   }
221
+
222
+   double total_out = 0.0;
223
+   for (const auto& sample : sketch) { // (item, weight) pair
224
+     total_out += sample.second;
225
+   }
226
+
227
+   const double ratio = total_out / total_in; // weights held by the sketch vs. weights fed in
228
+   REQUIRE(std::abs(ratio - 1.0) == Approx(0).margin(EPS));
229
+ }
230
+
231
+ TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") { // round-trips a sketch with n < k through bytes and through a stream
232
+ var_opt_sketch<int> sk = create_unweighted_sketch(100, 10); // need n < k
233
+
234
+ auto bytes = sk.serialize();
235
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
236
+ check_if_equal(sk, sk_from_bytes);
237
+
238
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
239
+ sk.serialize(ss);
240
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
241
+ check_if_equal(sk, sk_from_stream);
242
+
243
+ // ensure we unroll properly: deserializing a truncated image must throw
244
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); // byte-array input, one byte short
245
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 1);
246
+ ss.str(str_trunc); // replace the stream contents with the truncated image
247
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error); // stream input reports truncation with a different exception type
248
+ }
249
+
250
+ TEST_CASE("varopt sketch: end-of-warmup sketch serialization", "[var_opt_sketch]") { // round-trips a sketch with exactly n == k items
251
+ var_opt_sketch<int> sk = create_unweighted_sketch(2843, 2843); // need n == k
252
+ auto bytes = sk.serialize();
253
+
254
+ // ensure still only 3 preamble longs
255
+ REQUIRE((bytes.data()[0] & 0x3f) == 3); // PREAMBLE_LONGS_WARMUP
256
+
257
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
258
+ check_if_equal(sk, sk_from_bytes);
259
+
260
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
261
+ sk.serialize(ss);
262
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
263
+ check_if_equal(sk, sk_from_stream);
264
+
265
+ // ensure we unroll properly: deserializing a truncated image must throw
266
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 1000), std::out_of_range); // NOTE(review): 1000 bytes dropped here but only 100 on the stream path below -- confirm the difference is intentional
267
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 100);
268
+ ss.str(str_trunc); // replace the stream contents with the truncated image
269
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error);
270
+ }
271
+
272
+ TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") { // round-trips a sketch that has received more than k items
273
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 32);
274
+ sk.update(100, 100.0);
275
+ sk.update(101, 101.0);
276
+
277
+ // first 2 entries should be heavy and in heap order (smallest at root)
278
+ auto it = sk.begin();
279
+ const std::pair<const int, const double> p1 = *it;
280
+ ++it;
281
+ const std::pair<const int, const double> p2 = *it;
282
+ REQUIRE(p1.second == Approx(100.0).margin(EPS));
283
+ REQUIRE(p2.second == Approx(101.0).margin(EPS));
284
+ REQUIRE(p1.first == 100);
285
+ REQUIRE(p2.first == 101);
286
+
287
+ // check for 4 preamble longs
288
+ auto bytes = sk.serialize();
289
+ REQUIRE((bytes.data()[0] & 0x3f) == 4); // PREAMBLE_LONGS_FULL (one more long than the 3 used during warmup)
290
+
291
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
292
+ check_if_equal(sk, sk_from_bytes);
293
+
294
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
295
+ sk.serialize(ss);
296
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
297
+ check_if_equal(sk, sk_from_stream);
298
+
299
+ // ensure we unroll properly: deserializing a truncated image must throw
300
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size() - 100), std::out_of_range);
301
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 100);
302
+ ss.str(str_trunc); // replace the stream contents with the truncated image
303
+ REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::runtime_error);
304
+ }
305
+
306
+ TEST_CASE("varopt sketch: string serialization", "[var_opt_sketch]") { // round-trips a sketch of variable-length string items
307
+ var_opt_sketch<std::string> sk(5);
308
+ sk.update("a", 1.0);
309
+ sk.update("bc", 1.0);
310
+ sk.update("def", 1.0);
311
+ sk.update("ghij", 1.0);
312
+ sk.update("klmno", 1.0);
313
+ sk.update("heavy item", 100.0); // sixth update into a sketch constructed with k = 5
314
+
315
+ auto bytes = sk.serialize();
316
+ var_opt_sketch<std::string> sk_from_bytes = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
317
+ check_if_equal(sk, sk_from_bytes);
318
+
319
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
320
+ sk.serialize(ss);
321
+ var_opt_sketch<std::string> sk_from_stream = var_opt_sketch<std::string>::deserialize(ss);
322
+ check_if_equal(sk, sk_from_stream);
323
+
324
+ // ensure we unroll properly: deserializing a truncated image must throw
325
+ REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size() - 12), std::out_of_range);
326
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 12);
327
+ ss.str(str_trunc); // replace the stream contents with the truncated image
328
+ REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(ss), std::runtime_error);
329
+ }
330
+
331
+ TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") { // checks the reported per-item weight after k + 2 unit-weight updates
332
+ uint32_t k = 1024;
333
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1); // presumably k + 1 items of weight 1.0 -- helper is defined outside this view
334
+ sk.update(0, 1.0); // k+2nd update
335
+
336
+ // check the first weight, assuming all k items are unweighted
337
+ // (and consequently in R).
338
+ // Expected: (k + 2) / |R| = (k + 2) / k
339
+ auto it = sk.begin();
340
+ double wt = (*it).second;
341
+ REQUIRE(wt == Approx((k + 2.0) / k).margin(EPS));
342
+ }
343
+
344
+ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") { // checks the reported weights after k heavy updates on top of k + 1 unweighted items
345
+ uint32_t k = 1024;
346
+ double wt_scale = 10.0 * k;
347
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
348
+
349
+ // Next k-1 updates should be update_pseudo_heavy_general()
350
+ // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
351
+ // added k-1 heavy items, leaving only 1 item left in R
352
+ for (uint32_t i = 1; i <= k; ++i) {
353
+ sk.update(-static_cast<int32_t>(i), k + (i * wt_scale)); // cast before negating: unary minus on the unsigned index would wrap around
354
+ }
355
+
356
+ auto it = sk.begin();
357
+
358
+ // Expected: lightest "heavy" item (first one out): k + 2*wt_scale
359
+ double wt = (*it).second;
360
+ REQUIRE(wt == Approx(1.0 * (k + (2 * wt_scale))).margin(EPS));
361
+
362
+ // we don't know which R item is left, but there should be only one, at the end
363
+ // of the sample set.
364
+ // Expected: ((k+1) + (min "heavy" item)) / |R| = ((k+1) + (k + wt_scale)) / 1 = wt_scale + 2k + 1
365
+ while (it != sk.end()) { // walk to the end so wt holds the weight of the last sample
366
+ wt = (*it).second;
367
+ ++it;
368
+ }
369
+ REQUIRE(wt == Approx(1.0 + wt_scale + (2 * k)).margin(EPS));
370
+ }
371
+
372
+ TEST_CASE("varopt sketch: reset", "[var_opt_sketch]") { // reset() must zero n and keep k, whether the sketch held more or fewer than k items
373
+ uint32_t k = 1024;
374
+ uint64_t n1 = 20; // fewer than k items
375
+ uint64_t n2 = 2 * k; // more than k items
376
+ var_opt_sketch<std::string> sk(k);
377
+
378
+ // reset from sampling mode
379
+ for (uint64_t i = 0; i < n2; ++i) {
380
+ sk.update(std::to_string(i), 100.0 + i);
381
+ }
382
+ REQUIRE(sk.get_n() == n2);
383
+ REQUIRE(sk.get_k() == k);
384
+
385
+ sk.reset();
386
+ REQUIRE(sk.get_n() == 0);
387
+ REQUIRE(sk.get_k() == k);
388
+
389
+ // reset from exact mode
390
+ for (uint64_t i = 0; i < n1; ++i)
391
+ sk.update(std::to_string(i)); // no weight argument given here
392
+ REQUIRE(sk.get_n() == n1);
393
+ REQUIRE(sk.get_k() == k);
394
+
395
+ sk.reset();
396
+ REQUIRE(sk.get_n() == 0);
397
+ REQUIRE(sk.get_k() == k);
398
+ }
399
+
400
+ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") { // checks estimate, bounds and total weight from estimate_subset_sum() for empty, exact and sampling sketches
401
+ uint32_t k = 10;
402
+ var_opt_sketch<int> sk(k);
403
+
404
+ // empty sketch -- all zeros
405
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
406
+ REQUIRE(summary.estimate == 0.0);
407
+ REQUIRE(summary.total_sketch_weight == 0.0);
408
+
409
+ // add items, keeping in exact mode
410
+ double total_weight = 0.0;
411
+ for (uint32_t i = 1; i <= (k - 1); ++i) {
412
+ sk.update(i, 1.0 * i); // item i carries weight i
413
+ total_weight += 1.0 * i;
414
+ }
415
+
416
+ summary = sk.estimate_subset_sum([](int){ return true; });
417
+ REQUIRE(summary.estimate == total_weight); // exact comparisons: nothing has been sampled yet
418
+ REQUIRE(summary.lower_bound == total_weight);
419
+ REQUIRE(summary.upper_bound == total_weight);
420
+ REQUIRE(summary.total_sketch_weight == total_weight);
421
+
422
+ // add a few more items, pushing to sampling mode
423
+ for (uint32_t i = k; i <= (k + 1); ++i) {
424
+ sk.update(i, 1.0 * i);
425
+ total_weight += 1.0 * i;
426
+ }
427
+
428
+ // predicate always true so estimate == upper bound
429
+ summary = sk.estimate_subset_sum([](int){ return true; });
430
+ REQUIRE(summary.estimate == Approx(total_weight).margin(EPS));
431
+ REQUIRE(summary.upper_bound == Approx(total_weight).margin(EPS));
432
+ REQUIRE(summary.lower_bound < total_weight);
433
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
434
+
435
+ // predicate always false so estimate == lower bound == 0.0
436
+ summary = sk.estimate_subset_sum([](int){ return false; });
437
+ REQUIRE(summary.estimate == 0.0);
438
+ REQUIRE(summary.lower_bound == 0.0);
439
+ REQUIRE(summary.upper_bound > 0.0);
440
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
441
+
442
+ // finally, a non-degenerate predicate
443
+ // insert negative items with the same weights as the positive items, filter for negative items only
444
+ for (uint32_t i = 1; i <= (k + 1); ++i) {
445
+ sk.update(static_cast<int32_t>(-i), 1.0 * i);
446
+ total_weight += 1.0 * i;
447
+ }
448
+
449
+ summary = sk.estimate_subset_sum([](int x) { return x < 0; });
450
+ REQUIRE(summary.estimate >= summary.lower_bound);
451
+ REQUIRE(summary.estimate <= summary.upper_bound);
452
+
453
+ // allow pretty generous bounds when testing
454
+ REQUIRE(summary.lower_bound < (total_weight / 1.4)); // negative items carry half of the inserted weight, so the bounds should bracket total_weight / 2
455
+ REQUIRE(summary.upper_bound > (total_weight / 2.6));
456
+ REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));
457
+
458
+ // and another data type, keeping it in exact mode for simplicity
459
+ var_opt_sketch<bool> sk2(k);
460
+ total_weight = 0.0;
461
+ for (uint32_t i = 1; i <= (k - 1); ++i) {
462
+ sk2.update((i % 2) == 0, 1.0 * i); // item is true for even i, false for odd i
463
+ total_weight += i;
464
+ }
465
+
466
+ summary = sk2.estimate_subset_sum([](bool b){ return !b; }); // selects only the false (odd i) items
467
+ REQUIRE(summary.estimate == summary.lower_bound);
468
+ REQUIRE(summary.estimate == summary.upper_bound);
469
+ REQUIRE(summary.estimate < total_weight); // exact mode, so know it must be strictly less
470
+ }
471
+
472
+ TEST_CASE("varopt sketch: deserialize exact from java", "[var_opt_sketch]") { // reads a stored string sketch with n < k from a binary test file
473
+ std::ifstream is;
474
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of failing silently
475
+ is.open(testBinaryInputPath + "varopt_sketch_string_exact.sk", std::ios::binary);
476
+ var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
477
+ REQUIRE_FALSE(sketch.is_empty());
478
+ REQUIRE(sketch.get_k() == 1024);
479
+ REQUIRE(sketch.get_n() == 200);
480
+ REQUIRE(sketch.get_num_samples() == 200);
481
+ subset_summary ss = sketch.estimate_subset_sum([](std::string){ return true; });
482
+
483
+ double tgt_wt = 0.0;
484
+ for (int i = 1; i <= 200; ++i) { tgt_wt += 1000.0 / i; } // expected total: sum of 1000/i for i = 1..200 (presumably the weights used when the file was generated -- confirm)
485
+ REQUIRE(ss.total_sketch_weight == Approx(tgt_wt).margin(EPS));
486
+ }
487
+
488
+
489
+ TEST_CASE("varopt sketch: deserialize sampling from java", "[var_opt_sketch]") { // reads a stored int64 sketch with n > k from a binary test file
490
+ std::ifstream is;
491
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of failing silently
492
+ is.open(testBinaryInputPath + "varopt_sketch_long_sampling.sk", std::ios::binary);
493
+ var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
494
+ REQUIRE_FALSE(sketch.is_empty());
495
+ REQUIRE(sketch.get_k() == 1024);
496
+ REQUIRE(sketch.get_n() == 2003);
497
+ REQUIRE(sketch.get_num_samples() == sketch.get_k());
498
+ subset_summary ss = sketch.estimate_subset_sum([](int64_t){ return true; });
499
+ REQUIRE(ss.estimate == Approx(332000.0).margin(EPS));
500
+ REQUIRE(ss.total_sketch_weight == Approx(332000.0).margin(EPS));
501
+
502
+ ss = sketch.estimate_subset_sum([](int64_t x){ return x < 0; }); // negative items only
503
+ REQUIRE(ss.estimate == 330000.0); // heavy item, weight is exact
504
+
505
+ ss = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; }); // non-negative items: the remaining 332000 - 330000 of weight
506
+ REQUIRE(ss.estimate == Approx(2000.0).margin(EPS));
507
+ }
508
+
509
+ }