datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,44 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_executable(kll_test) # test executable; its sources are attached by target_sources() at the end of this file
19
+
20
+ target_link_libraries(kll_test kll common_test) # kll and common_test are targets defined outside this file
21
+
22
+ set_target_properties(kll_test PROPERTIES # build the tests as C++11 and do not fall back to an older standard
23
+ CXX_STANDARD 11
24
+ CXX_STANDARD_REQUIRED YES
25
+ )
26
+
27
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" KLL_TEST_BINARY_PATH) # this directory converted to a CMake-style path
28
+ string(APPEND KLL_TEST_BINARY_PATH "/") # trailing slash, presumably so the tests can append a file name directly
29
+ target_compile_definitions(kll_test # hand the directory to the test sources as the TEST_BINARY_INPUT_PATH string macro
30
+ PRIVATE
31
+ TEST_BINARY_INPUT_PATH="${KLL_TEST_BINARY_PATH}"
32
+ )
33
+
34
+ add_test( # register the executable as a test named kll_test
35
+ NAME kll_test
36
+ COMMAND kll_test
37
+ )
38
+
39
+ target_sources(kll_test # the three translation units that make up the test executable
40
+ PRIVATE
41
+ kll_sketch_test.cpp
42
+ kll_sketch_custom_type_test.cpp
43
+ kll_sketch_validation.cpp
44
+ )
@@ -0,0 +1,154 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <sstream>
22
+
23
+ #include <kll_sketch.hpp>
24
+ #include <test_allocator.hpp>
25
+ #include <test_type.hpp>
26
+
27
+ namespace datasketches { // Catch tests that exercise kll_sketch with a user-defined item type
28
+
29
+ typedef kll_sketch<test_type, test_type_less, test_type_serde, test_allocator<test_type>> kll_test_type_sketch; // item type, comparator, serde and allocator are test helpers; presumably declared in test_type.hpp / test_allocator.hpp (not visible here)
30
+
31
+ TEST_CASE("kll sketch custom type", "[kll_sketch]") { // Catch runs this body once per SECTION, so the setup and cleanup lines bracket every section
32
+
33
+ // setup: zero the allocator byte counter so the cleanup check at the bottom only reflects the section that just ran
34
+ test_allocator_total_bytes = 0;
35
+
36
+ SECTION("compact level zero") { // 9 updates into a sketch constructed with 8; the asserts expect estimation mode afterwards
37
+ kll_test_type_sketch sketch(8);
38
+ REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error); // while empty, the value getters are expected to throw
39
+ REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
40
+ REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
41
+ REQUIRE(sketch.get_serialized_size_bytes() == 8); // an empty sketch is expected to report 8 bytes
42
+
43
+ sketch.update(1);
44
+ sketch.update(2);
45
+ sketch.update(3);
46
+ sketch.update(4);
47
+ sketch.update(5);
48
+ sketch.update(6);
49
+ sketch.update(7);
50
+ sketch.update(8);
51
+ sketch.update(9);
52
+
53
+ //sketch.to_stream(std::cout);
54
+
55
+ REQUIRE(sketch.is_estimation_mode());
56
+ REQUIRE(sketch.get_n() > sketch.get_num_retained()); // more items seen than kept
57
+ REQUIRE(sketch.get_min_value().get_value() == 1); // min and max must still be the exact extremes of the input
58
+ REQUIRE(sketch.get_max_value().get_value() == 9);
59
+ }
60
+
61
+ SECTION("merge small") { // merging two single-item sketches must keep both items
62
+ kll_test_type_sketch sketch1(8);
63
+ sketch1.update(1);
64
+
65
+ kll_test_type_sketch sketch2(8);
66
+ sketch2.update(2);
67
+
68
+ sketch2.merge(sketch1);
69
+
70
+ //sketch2.to_stream(std::cout);
71
+
72
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
73
+ REQUIRE(sketch2.get_num_retained() == sketch2.get_n()); // nothing was discarded by the merge
74
+ REQUIRE(sketch2.get_min_value().get_value() == 1);
75
+ REQUIRE(sketch2.get_max_value().get_value() == 2);
76
+ }
77
+
78
+ SECTION("merge higher levels") { // both inputs receive 9 items before the merge; the result is expected to be in estimation mode
79
+ kll_test_type_sketch sketch1(8);
80
+ sketch1.update(1);
81
+ sketch1.update(2);
82
+ sketch1.update(3);
83
+ sketch1.update(4);
84
+ sketch1.update(5);
85
+ sketch1.update(6);
86
+ sketch1.update(7);
87
+ sketch1.update(8);
88
+ sketch1.update(9);
89
+
90
+ kll_test_type_sketch sketch2(8);
91
+ sketch2.update(10);
92
+ sketch2.update(11);
93
+ sketch2.update(12);
94
+ sketch2.update(13);
95
+ sketch2.update(14);
96
+ sketch2.update(15);
97
+ sketch2.update(16);
98
+ sketch2.update(17);
99
+ sketch2.update(18);
100
+
101
+ sketch2.merge(sketch1);
102
+
103
+ //sketch2.to_stream(std::cout);
104
+
105
+ REQUIRE(sketch2.is_estimation_mode());
106
+ REQUIRE(sketch2.get_n() > sketch2.get_num_retained());
107
+ REQUIRE(sketch2.get_min_value().get_value() == 1); // extremes come from both inputs: min from sketch1, max from sketch2
108
+ REQUIRE(sketch2.get_max_value().get_value() == 18);
109
+ }
110
+
111
+ SECTION("serialize deserialize") { // round trip through an in-memory binary stream; the copy must report the same state
112
+ kll_test_type_sketch sketch1;
113
+
114
+ const int n = 1000;
115
+ for (int i = 0; i < n; i++) sketch1.update(i);
116
+
117
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
118
+ sketch1.serialize(s);
119
+ REQUIRE((size_t) s.tellp() == sketch1.get_serialized_size_bytes()); // bytes written match the reported size
120
+ auto sketch2 = kll_test_type_sketch::deserialize(s);
121
+ REQUIRE((size_t) s.tellg() == sketch2.get_serialized_size_bytes()); // bytes read match the reported size
122
+ REQUIRE(s.tellg() == s.tellp()); // the read position caught up with the write position, i.e. everything written was consumed
123
+ REQUIRE(sketch2.is_empty() == sketch1.is_empty());
124
+ REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
125
+ REQUIRE(sketch2.get_n() == sketch1.get_n());
126
+ REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
127
+ REQUIRE(sketch2.get_min_value().get_value() == sketch1.get_min_value().get_value());
128
+ REQUIRE(sketch2.get_max_value().get_value() == sketch1.get_max_value().get_value());
129
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
130
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
131
+ REQUIRE(sketch2.get_quantile(0.5).get_value() == sketch1.get_quantile(0.5).get_value());
132
+ REQUIRE(sketch2.get_rank(0) == sketch1.get_rank(0)); // ranks compared at the low end, just past the high end, and the midpoint of the input
133
+ REQUIRE(sketch2.get_rank(n) == sketch1.get_rank(n));
134
+ REQUIRE(sketch2.get_rank(n / 2) == sketch1.get_rank(n / 2));
135
+ }
136
+
137
+ SECTION("moving merge") { // merge taking its argument as an rvalue
138
+ kll_test_type_sketch sketch1(8);
139
+ for (int i = 0; i < 10; i++) sketch1.update(i);
140
+ kll_test_type_sketch sketch2(8);
141
+ sketch2.update(10);
142
+ sketch2.merge(std::move(sketch1)); // sketch1 is not used again after being moved from
143
+ REQUIRE(sketch2.get_min_value().get_value() == 0);
144
+ REQUIRE(sketch2.get_max_value().get_value() == 10);
145
+ REQUIRE(sketch2.get_n() == 11); // 10 items from sketch1 plus the one already in sketch2
146
+ }
147
+
148
+ // cleanup: the section's sketches are out of scope here; a nonzero counter presumably means memory obtained through test_allocator was not released
149
+ if (test_allocator_total_bytes != 0) { // guard so the REQUIRE is only evaluated (and reported) when the counter is nonzero
150
+ REQUIRE(test_allocator_total_bytes == 0);
151
+ }
152
+ }
153
+
154
+ } /* namespace datasketches */
@@ -0,0 +1,685 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <cmath>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include <kll_sketch.hpp>
27
+ #include <test_allocator.hpp>
28
+
29
+ namespace datasketches {
30
+
31
+ static const double RANK_EPS_FOR_K_200 = 0.0133; // absolute margin for comparing estimated ranks with true ranks in the estimation-mode test; presumably the rank error expected at k = 200
32
+ static const double NUMERIC_NOISE_TOLERANCE = 1E-6; // small floating-point tolerance; its use is not visible in this part of the file
33
+
34
+ #ifdef TEST_BINARY_INPUT_PATH
35
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH; // directory prefix supplied by the build through the TEST_BINARY_INPUT_PATH definition
36
+ #else
37
+ static std::string testBinaryInputPath = "test/"; // fallback prefix when the build does not define the macro
38
+ #endif
39
+
40
+ // typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
41
+ typedef kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>> kll_float_sketch;
42
+ // the std::string items themselves keep the default allocator for simplicity (only the sketch uses test_allocator); otherwise we would need to define custom "less" and "serde"
43
+ typedef kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>> kll_string_sketch;
44
+
45
+ TEST_CASE("kll sketch", "[kll_sketch]") {
46
+
47
+ // setup
48
+ test_allocator_total_bytes = 0;
49
+
50
+ SECTION("k limits") {
51
+ kll_float_sketch sketch1(kll_float_sketch::MIN_K); // this should work
52
+ kll_float_sketch sketch2(kll_float_sketch::MAX_K); // this should work
53
+ REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1), std::invalid_argument);
54
+ // MAX_K + 1 makes no sense because k is uint16_t
55
+ }
56
+
57
+ SECTION("empty") {
58
+ kll_float_sketch sketch;
59
+ REQUIRE(sketch.is_empty());
60
+ REQUIRE_FALSE(sketch.is_estimation_mode());
61
+ REQUIRE(sketch.get_n() == 0);
62
+ REQUIRE(sketch.get_num_retained() == 0);
63
+ REQUIRE(std::isnan(sketch.get_rank(0)));
64
+ REQUIRE(std::isnan(sketch.get_min_value()));
65
+ REQUIRE(std::isnan(sketch.get_max_value()));
66
+ REQUIRE(std::isnan(sketch.get_quantile(0.5)));
67
+ const double fractions[3] {0, 0.5, 1};
68
+ REQUIRE(sketch.get_quantiles(fractions, 3).size() == 0);
69
+ const float split_points[1] {0};
70
+ REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
71
+ REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
72
+
73
+ int count = 0;
74
+ for (auto& it: sketch) {
75
+ (void) it; // to suppress "unused" warning
76
+ ++count;
77
+ }
78
+ REQUIRE(count == 0);
79
+ }
80
+
81
+ SECTION("get bad quantile") {
82
+ kll_float_sketch sketch;
83
+ sketch.update(0); // has to be non-empty to reach the check
84
+ REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
85
+ }
86
+
87
+ SECTION("one item") {
88
+ kll_float_sketch sketch;
89
+ sketch.update(1);
90
+ REQUIRE_FALSE(sketch.is_empty());
91
+ REQUIRE_FALSE(sketch.is_estimation_mode());
92
+ REQUIRE(sketch.get_n() == 1);
93
+ REQUIRE(sketch.get_num_retained() == 1);
94
+ REQUIRE(sketch.get_rank(1) == 0.0);
95
+ REQUIRE(sketch.get_rank(2) == 1.0);
96
+ REQUIRE(sketch.get_min_value() == 1.0);
97
+ REQUIRE(sketch.get_max_value() == 1.0);
98
+ REQUIRE(sketch.get_quantile(0.5) == 1.0);
99
+ const double fractions[3] {0, 0.5, 1};
100
+ auto quantiles = sketch.get_quantiles(fractions, 3);
101
+ REQUIRE(quantiles.size() == 3);
102
+ REQUIRE(quantiles[0] == 1.0);
103
+ REQUIRE(quantiles[1] == 1.0);
104
+ REQUIRE(quantiles[2] == 1.0);
105
+
106
+ int count = 0;
107
+ for (auto& it: sketch) {
108
+ REQUIRE(it.second == 1);
109
+ ++count;
110
+ }
111
+ REQUIRE(count == 1);
112
+ }
113
+
114
+ SECTION("NaN") {
115
+ kll_float_sketch sketch;
116
+ sketch.update(std::numeric_limits<float>::quiet_NaN());
117
+ REQUIRE(sketch.is_empty());
118
+
119
+ sketch.update(0.0);
120
+ sketch.update(std::numeric_limits<float>::quiet_NaN());
121
+ REQUIRE(sketch.get_n() == 1);
122
+ }
123
+
124
+ SECTION("many items, exact mode") {
125
+ kll_float_sketch sketch;
126
+ const uint32_t n(200);
127
+ for (uint32_t i = 0; i < n; i++) {
128
+ sketch.update(i);
129
+ REQUIRE(sketch.get_n() == i + 1);
130
+ }
131
+ REQUIRE_FALSE(sketch.is_empty());
132
+ REQUIRE_FALSE(sketch.is_estimation_mode());
133
+ REQUIRE(sketch.get_num_retained() == n);
134
+ REQUIRE(sketch.get_min_value() == 0.0);
135
+ REQUIRE(sketch.get_quantile(0) == 0.0);
136
+ REQUIRE(sketch.get_max_value() == n - 1);
137
+ REQUIRE(sketch.get_quantile(1) == n - 1);
138
+
139
+ const double fractions[3] {0, 0.5, 1};
140
+ auto quantiles = sketch.get_quantiles(fractions, 3);
141
+ REQUIRE(quantiles.size() == 3);
142
+ REQUIRE(quantiles[0] == 0.0);
143
+ REQUIRE(quantiles[1] == n / 2);
144
+ REQUIRE(quantiles[2] == n - 1 );
145
+
146
+ for (uint32_t i = 0; i < n; i++) {
147
+ const double trueRank = (double) i / n;
148
+ REQUIRE(sketch.get_rank(i) == trueRank);
149
+ }
150
+
151
+ // the alternative method must produce the same result
152
+ auto quantiles2 = sketch.get_quantiles(3);
153
+ REQUIRE(quantiles2.size() == 3);
154
+ REQUIRE(quantiles[0] == quantiles2[0]);
155
+ REQUIRE(quantiles[1] == quantiles2[1]);
156
+ REQUIRE(quantiles[2] == quantiles2[2]);
157
+ }
158
+
159
+ SECTION("10 items") {
160
+ kll_float_sketch sketch;
161
+ sketch.update(1);
162
+ sketch.update(2);
163
+ sketch.update(3);
164
+ sketch.update(4);
165
+ sketch.update(5);
166
+ sketch.update(6);
167
+ sketch.update(7);
168
+ sketch.update(8);
169
+ sketch.update(9);
170
+ sketch.update(10);
171
+ REQUIRE(sketch.get_quantile(0) == 1.0);
172
+ REQUIRE(sketch.get_quantile(0.5) == 6.0);
173
+ REQUIRE(sketch.get_quantile(0.99) == 10.0);
174
+ REQUIRE(sketch.get_quantile(1) == 10.0);
175
+ }
176
+
177
+ SECTION("100 items") {
178
+ kll_float_sketch sketch;
179
+ for (int i = 0; i < 100; ++i) sketch.update(i);
180
+ REQUIRE(sketch.get_quantile(0) == 0);
181
+ REQUIRE(sketch.get_quantile(0.01) == 1);
182
+ REQUIRE(sketch.get_quantile(0.5) == 50);
183
+ REQUIRE(sketch.get_quantile(0.99) == 99.0);
184
+ REQUIRE(sketch.get_quantile(1) == 99.0);
185
+ }
186
+
187
+ SECTION("many items, estimation mode") {
188
+ kll_float_sketch sketch;
189
+ const int n(1000000);
190
+ for (int i = 0; i < n; i++) {
191
+ sketch.update(i);
192
+ REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
193
+ }
194
+ REQUIRE_FALSE(sketch.is_empty());
195
+ REQUIRE(sketch.is_estimation_mode());
196
+ REQUIRE(sketch.get_min_value() == 0.0); // min value is exact
197
+ REQUIRE(sketch.get_quantile(0) == 0.0); // min value is exact
198
+ REQUIRE(sketch.get_max_value() == n - 1); // max value is exact
199
+ REQUIRE(sketch.get_quantile(1) == n - 1); // max value is exact
200
+
201
+ // test rank
202
+ for (int i = 0; i < n; i++) {
203
+ const double trueRank = (double) i / n;
204
+ REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
205
+ }
206
+
207
+ // test quantiles at every 0.1 percentage point
208
+ double fractions[1001];
209
+ double reverse_fractions[1001]; // check that ordering does not matter
210
+ for (int i = 0; i < 1001; i++) {
211
+ fractions[i] = (double) i / 1000;
212
+ reverse_fractions[1000 - i] = fractions[i];
213
+ }
214
+ auto quantiles = sketch.get_quantiles(fractions, 1001);
215
+ auto reverse_quantiles = sketch.get_quantiles(reverse_fractions, 1001);
216
+ float previous_quantile(0);
217
+ for (int i = 0; i < 1001; i++) {
218
+ // expensive in a loop, just to check the equivalence here, not advised for real code
219
+ const float quantile = sketch.get_quantile(fractions[i]);
220
+ REQUIRE(quantiles[i] == quantile);
221
+ REQUIRE(reverse_quantiles[1000 - i] == quantile);
222
+ REQUIRE(previous_quantile <= quantile);
223
+ previous_quantile = quantile;
224
+ }
225
+
226
+ //std::cout << sketch.to_string();
227
+ }
228
+
229
+ SECTION("consistency between get_rank adn get_PMF/CDF") {
230
+ kll_float_sketch sketch;
231
+ const int n = 1000;
232
+ float values[n];
233
+ for (int i = 0; i < n; i++) {
234
+ sketch.update(i);
235
+ values[i] = i;
236
+ }
237
+
238
+ const auto ranks(sketch.get_CDF(values, n));
239
+ const auto pmf(sketch.get_PMF(values, n));
240
+
241
+ double subtotal_pmf(0);
242
+ for (int i = 0; i < n; i++) {
243
+ if (sketch.get_rank(values[i]) != ranks[i]) {
244
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
245
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
246
+ }
247
+ subtotal_pmf += pmf[i];
248
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
249
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
250
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
251
+ }
252
+ }
253
+ }
254
+
255
+ SECTION("deserialize from java") {
256
+ std::ifstream is;
257
+ is.exceptions(std::ios::failbit | std::ios::badbit);
258
+ is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
259
+ auto sketch = kll_float_sketch::deserialize(is);
260
+ REQUIRE_FALSE(sketch.is_empty());
261
+ REQUIRE(sketch.is_estimation_mode());
262
+ REQUIRE(sketch.get_n() == 1000000);
263
+ REQUIRE(sketch.get_num_retained() == 614);
264
+ REQUIRE(sketch.get_min_value() == 0.0);
265
+ REQUIRE(sketch.get_max_value() == 999999.0);
266
+ }
267
+
268
+ SECTION("stream serialize deserialize empty") {
269
+ kll_float_sketch sketch;
270
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
271
+ sketch.serialize(s);
272
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
273
+ auto sketch2 = kll_float_sketch::deserialize(s);
274
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
275
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
276
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
277
+ REQUIRE(sketch2.get_n() == sketch.get_n());
278
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
279
+ REQUIRE(std::isnan(sketch2.get_min_value()));
280
+ REQUIRE(std::isnan(sketch2.get_max_value()));
281
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
282
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
283
+ }
284
+
285
+ SECTION("bytes serialize deserialize empty") {
286
+ kll_float_sketch sketch;
287
+ auto bytes = sketch.serialize();
288
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
289
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
290
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
291
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
292
+ REQUIRE(sketch2.get_n() == sketch.get_n());
293
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
294
+ REQUIRE(std::isnan(sketch2.get_min_value()));
295
+ REQUIRE(std::isnan(sketch2.get_max_value()));
296
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
297
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
298
+ }
299
+
300
+ SECTION("serialize deserialize one item") {
301
+ kll_float_sketch sketch;
302
+ sketch.update(1);
303
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
304
+ sketch.serialize(s);
305
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
306
+ auto sketch2 = kll_float_sketch::deserialize(s);
307
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
308
+ REQUIRE(s.tellg() == s.tellp());
309
+ REQUIRE_FALSE(sketch2.is_empty());
310
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
311
+ REQUIRE(sketch2.get_n() == 1);
312
+ REQUIRE(sketch2.get_num_retained() == 1);
313
+ REQUIRE(sketch2.get_min_value() == 1.0);
314
+ REQUIRE(sketch2.get_max_value() == 1.0);
315
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
316
+ REQUIRE(sketch2.get_rank(1) == 0.0);
317
+ REQUIRE(sketch2.get_rank(2) == 1.0);
318
+ }
319
+
320
+ SECTION("deserialize one item v1") {
321
+ std::ifstream is;
322
+ is.exceptions(std::ios::failbit | std::ios::badbit);
323
+ is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
324
+ auto sketch = kll_float_sketch::deserialize(is);
325
+ REQUIRE_FALSE(sketch.is_empty());
326
+ REQUIRE_FALSE(sketch.is_estimation_mode());
327
+ REQUIRE(sketch.get_n() == 1);
328
+ REQUIRE(sketch.get_num_retained() == 1);
329
+ REQUIRE(sketch.get_min_value() == 1.0);
330
+ REQUIRE(sketch.get_max_value() == 1.0);
331
+ }
332
+
333
+ SECTION("stream serialize deserialize many floats") {
334
+ kll_float_sketch sketch;
335
+ const int n(1000);
336
+ for (int i = 0; i < n; i++) sketch.update(i);
337
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
338
+ sketch.serialize(s);
339
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
340
+ auto sketch2 = kll_float_sketch::deserialize(s);
341
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
342
+ REQUIRE(s.tellg() == s.tellp());
343
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
344
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
345
+ REQUIRE(sketch2.get_n() == sketch.get_n());
346
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
347
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
348
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
349
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
350
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
351
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
352
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
353
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
354
+ }
355
+
356
+ SECTION("bytes serialize deserialize many floats") {
357
+ kll_float_sketch sketch;
358
+ const int n(1000);
359
+ for (int i = 0; i < n; i++) sketch.update(i);
360
+ auto bytes = sketch.serialize();
361
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
362
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
363
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
364
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
365
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
366
+ REQUIRE(sketch2.get_n() == sketch.get_n());
367
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
368
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
369
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
370
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
371
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
372
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
373
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
374
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
375
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
376
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
377
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
378
+ }
379
+
380
+ SECTION("bytes serialize deserialize many ints") {
381
+ kll_sketch<int> sketch;
382
+ const int n(1000);
383
+ for (int i = 0; i < n; i++) sketch.update(i);
384
+ auto bytes = sketch.serialize();
385
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
386
+ auto sketch2 = kll_sketch<int>::deserialize(bytes.data(), bytes.size());
387
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
388
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
389
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
390
+ REQUIRE(sketch2.get_n() == sketch.get_n());
391
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
392
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
393
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
394
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
395
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
396
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
397
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
398
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
399
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
400
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
401
+ REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
402
+ }
403
+
404
+ SECTION("floor of log2 of fraction") {
405
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(0, 1) == 0);
406
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(1, 2) == 0);
407
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(2, 2) == 0);
408
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(3, 2) == 0);
409
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(4, 2) == 1);
410
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(5, 2) == 1);
411
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(6, 2) == 1);
412
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(7, 2) == 1);
413
+ REQUIRE(kll_helper::floor_of_log2_of_fraction(8, 2) == 2);
414
+ }
415
+
416
+ SECTION("out of order split points, float") {
417
+ kll_float_sketch sketch;
418
+ sketch.update(0); // has too be non-empty to reach the check
419
+ float split_points[2] = {1, 0};
420
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
421
+ }
422
+
423
+ SECTION("out of order split points, int") {
424
+ kll_sketch<int> sketch;
425
+ sketch.update(0); // has too be non-empty to reach the check
426
+ int split_points[2] = {1, 0};
427
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
428
+ }
429
+
430
+ SECTION("NaN split point") {
431
+ kll_float_sketch sketch;
432
+ sketch.update(0); // has too be non-empty to reach the check
433
+ float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
434
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
435
+ }
436
+
437
+ SECTION("merge") {
438
+ kll_float_sketch sketch1;
439
+ kll_float_sketch sketch2;
440
+ const int n = 10000;
441
+ for (int i = 0; i < n; i++) {
442
+ sketch1.update(i);
443
+ sketch2.update((2 * n) - i - 1);
444
+ }
445
+
446
+ REQUIRE(sketch1.get_min_value() == 0.0f);
447
+ REQUIRE(sketch1.get_max_value() == n - 1);
448
+ REQUIRE(sketch2.get_min_value() == n);
449
+ REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
450
+
451
+ sketch1.merge(sketch2);
452
+
453
+ REQUIRE_FALSE(sketch1.is_empty());
454
+ REQUIRE(sketch1.get_n() == 2 * n);
455
+ REQUIRE(sketch1.get_min_value() == 0.0f);
456
+ REQUIRE(sketch1.get_max_value() == 2.0f * n - 1);
457
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_200));
458
+ }
459
+
460
+ SECTION("merge lower k") {
461
+ kll_float_sketch sketch1(256);
462
+ kll_float_sketch sketch2(128);
463
+ const int n = 10000;
464
+ for (int i = 0; i < n; i++) {
465
+ sketch1.update(i);
466
+ sketch2.update((2 * n) - i - 1);
467
+ }
468
+
469
+ REQUIRE(sketch1.get_min_value() == 0.0f);
470
+ REQUIRE(sketch1.get_max_value() == n - 1);
471
+ REQUIRE(sketch2.get_min_value() == n);
472
+ REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
473
+
474
+ REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
475
+ REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
476
+
477
+ sketch1.merge(sketch2);
478
+
479
+ // sketch1 must get "contaminated" by the lower K in sketch2
480
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
481
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
482
+
483
+ REQUIRE_FALSE(sketch1.is_empty());
484
+ REQUIRE(sketch1.get_n() == 2 * n);
485
+ REQUIRE(sketch1.get_min_value() == 0.0f);
486
+ REQUIRE(sketch1.get_max_value() == 2.0f * n - 1);
487
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_200));
488
+ }
489
+
490
+ SECTION("merge exact mode, lower k") {
491
+ kll_float_sketch sketch1(256);
492
+ kll_float_sketch sketch2(128);
493
+ const int n = 10000;
494
+ for (int i = 0; i < n; i++) {
495
+ sketch1.update(i);
496
+ }
497
+
498
+ // rank error should not be affected by a merge with an empty sketch with lower k
499
+ const double rank_error_before_merge = sketch1.get_normalized_rank_error(true);
500
+ sketch1.merge(sketch2);
501
+ REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
502
+
503
+ REQUIRE_FALSE(sketch1.is_empty());
504
+ REQUIRE(sketch1.get_n() == n);
505
+ REQUIRE(sketch1.get_min_value() == 0.0f);
506
+ REQUIRE(sketch1.get_max_value() == n - 1);
507
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n / 2).margin(n / 2 * RANK_EPS_FOR_K_200));
508
+
509
+ sketch2.update(0);
510
+ sketch1.merge(sketch2);
511
+ // rank error should not be affected by a merge with a sketch in exact mode with lower k
512
+ REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
513
+ }
514
+
515
+ SECTION("merge min value from other") {
516
+ kll_float_sketch sketch1;
517
+ kll_float_sketch sketch2;
518
+ sketch1.update(1);
519
+ sketch2.update(2);
520
+ sketch2.merge(sketch1);
521
+ REQUIRE(sketch2.get_min_value() == 1.0f);
522
+ REQUIRE(sketch2.get_max_value() == 2.0f);
523
+ }
524
+
525
+ SECTION("merge min and max values from other") {
526
+ kll_float_sketch sketch1;
527
+ for (int i = 0; i < 1000000; i++) sketch1.update(i);
528
+ kll_float_sketch sketch2;
529
+ sketch2.merge(sketch1);
530
+ REQUIRE(sketch2.get_min_value() == 0.0f);
531
+ REQUIRE(sketch2.get_max_value() == 999999.0f);
532
+ }
533
+
534
+ SECTION("sketch of ints") {
535
+ kll_sketch<int> sketch;
536
+ REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
537
+ REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
538
+ REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
539
+
540
+ const int n(1000);
541
+ for (int i = 0; i < n; i++) sketch.update(i);
542
+
543
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
544
+ sketch.serialize(s);
545
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
546
+ auto sketch2 = kll_sketch<int>::deserialize(s);
547
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
548
+ REQUIRE(s.tellg() == s.tellp());
549
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
550
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
551
+ REQUIRE(sketch2.get_n() == sketch.get_n());
552
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
553
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
554
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
555
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
556
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
557
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
558
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
559
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
560
+ }
561
+
562
+ SECTION("sketch of strings stream") {
563
+ kll_string_sketch sketch1;
564
+ REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
565
+ REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
566
+ REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
567
+ REQUIRE(sketch1.get_serialized_size_bytes() == 8);
568
+
569
+ const int n = 1000;
570
+ for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
571
+
572
+ REQUIRE(sketch1.get_min_value() == std::string("0"));
573
+ REQUIRE(sketch1.get_max_value() == std::string("999"));
574
+
575
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
576
+ sketch1.serialize(s);
577
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes());
578
+ auto sketch2 = kll_string_sketch::deserialize(s);
579
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
580
+ REQUIRE(s.tellg() == s.tellp());
581
+ REQUIRE(sketch2.is_empty() == sketch1.is_empty());
582
+ REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
583
+ REQUIRE(sketch2.get_n() == sketch1.get_n());
584
+ REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
585
+ REQUIRE(sketch2.get_min_value() == sketch1.get_min_value());
586
+ REQUIRE(sketch2.get_max_value() == sketch1.get_max_value());
587
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
588
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
589
+ REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
590
+ REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
591
+ REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
592
+
593
+ // to take a look using hexdump
594
+ //std::ofstream os("kll-string.sk");
595
+ //sketch1.serialize(os);
596
+
597
+ // debug print
598
+ //sketch1.to_stream(std::cout);
599
+ }
600
+
601
+ SECTION("sketch of strings bytes") {
602
+ kll_string_sketch sketch1;
603
+ REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
604
+ REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
605
+ REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
606
+ REQUIRE(sketch1.get_serialized_size_bytes() == 8);
607
+
608
+ const int n = 1000;
609
+ for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
610
+
611
+ REQUIRE(sketch1.get_min_value() == std::string("0"));
612
+ REQUIRE(sketch1.get_max_value() == std::string("999"));
613
+
614
+ auto bytes = sketch1.serialize();
615
+ REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
616
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
617
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
618
+ REQUIRE(sketch2.is_empty() == sketch1.is_empty());
619
+ REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
620
+ REQUIRE(sketch2.get_n() == sketch1.get_n());
621
+ REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
622
+ REQUIRE(sketch2.get_min_value() == sketch1.get_min_value());
623
+ REQUIRE(sketch2.get_max_value() == sketch1.get_max_value());
624
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
625
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
626
+ REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
627
+ REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
628
+ REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
629
+ }
630
+
631
+
632
+ SECTION("sketch of strings, single item, bytes") {
633
+ kll_string_sketch sketch1;
634
+ sketch1.update("a");
635
+ auto bytes = sketch1.serialize();
636
+ REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
637
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
638
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
639
+ }
640
+
641
+ SECTION("copy") {
642
+ kll_sketch<int> sketch1;
643
+ const int n(1000);
644
+ for (int i = 0; i < n; i++) sketch1.update(i);
645
+
646
+ // copy constructor
647
+ kll_sketch<int> sketch2(sketch1);
648
+ for (int i = 0; i < n; i++) {
649
+ REQUIRE(sketch2.get_rank(i) == sketch1.get_rank(i));
650
+ }
651
+
652
+ // copy assignment
653
+ kll_sketch<int> sketch3;
654
+ sketch3 = sketch1;
655
+ for (int i = 0; i < n; i++) {
656
+ REQUIRE(sketch3.get_rank(i) == sketch1.get_rank(i));
657
+ }
658
+ }
659
+
660
+ SECTION("move") {
661
+ kll_sketch<int> sketch1;
662
+ const int n(100);
663
+ for (int i = 0; i < n; i++) sketch1.update(i);
664
+
665
+ // move constructor
666
+ kll_sketch<int> sketch2(std::move(sketch1));
667
+ for (int i = 0; i < n; i++) {
668
+ REQUIRE(sketch2.get_rank(i) == (double) i / n);
669
+ }
670
+
671
+ // move assignment
672
+ kll_sketch<int> sketch3;
673
+ sketch3 = std::move(sketch2);
674
+ for (int i = 0; i < n; i++) {
675
+ REQUIRE(sketch3.get_rank(i) == (double) i / n);
676
+ }
677
+ }
678
+
679
+ // cleanup
680
+ if (test_allocator_total_bytes != 0) {
681
+ REQUIRE(test_allocator_total_bytes == 0);
682
+ }
683
+ }
684
+
685
+ } /* namespace datasketches */