datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,57 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(theta INTERFACE)
19
+
20
+ add_library(${PROJECT_NAME}::THETA ALIAS theta)
21
+
22
+ if (BUILD_TESTS)
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(theta
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(theta INTERFACE common)
33
+ target_compile_features(theta INTERFACE cxx_std_11)
34
+
35
+ set(theta_HEADERS "")
36
+ list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
37
+ list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_sketch_impl.hpp")
38
+ list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
39
+
40
+ install(TARGETS theta
41
+ EXPORT ${PROJECT_NAME}
42
+ )
43
+
44
+ install(FILES ${theta_HEADERS}
45
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
46
+
47
+ target_sources(theta
48
+ INTERFACE
49
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
50
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
51
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
52
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
53
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
54
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
55
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
56
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
57
+ )
@@ -0,0 +1,73 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_A_NOT_B_HPP_
21
+ #define THETA_A_NOT_B_HPP_
22
+
23
+ #include <memory>
24
+ #include <functional>
25
+ #include <climits>
26
+
27
+ #include "theta_sketch.hpp"
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ /*
33
+ * author Alexander Saydakov
34
+ * author Lee Rhodes
35
+ * author Kevin Lang
36
+ */
37
+
38
+ template<typename A>
39
+ class theta_a_not_b_alloc {
40
+ public:
41
+ /**
42
+ * Creates an instance of the a-not-b operation (set difference) with a given hash seed.
43
+ * @param seed hash seed
44
+ */
45
+ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
46
+
47
+ /**
48
+ * Computes the a-not-b set operation given two sketches.
49
+ * @return the result of a-not-b
50
+ */
51
+ compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
52
+
53
+ private:
54
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
55
+ uint16_t seed_hash_;
56
+
57
+ class less_than {
58
+ public:
59
+ explicit less_than(uint64_t value): value(value) {}
60
+ bool operator()(uint64_t value) const { return value < this->value; }
61
+ private:
62
+ uint64_t value;
63
+ };
64
+ };
65
+
66
+ // alias with default allocator for convenience
67
+ typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
68
+
69
+ } /* namespace datasketches */
70
+
71
+ #include "theta_a_not_b_impl.hpp"
72
+
73
+ # endif
@@ -0,0 +1,83 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_A_NOT_B_IMPL_HPP_
21
+ #define THETA_A_NOT_B_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+
25
+ #include "conditional_back_inserter.hpp"
26
+
27
+ namespace datasketches {
28
+
29
+ /*
30
+ * author Alexander Saydakov
31
+ * author Lee Rhodes
32
+ * author Kevin Lang
33
+ */
34
+
35
+ template<typename A>
36
+ theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
37
+ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
38
+ {}
39
+
40
+ template<typename A>
41
+ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
42
+ if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
43
+ if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
44
+ if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
45
+
46
+ const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
47
+ vector_u64<A> keys;
48
+ bool is_empty = a.is_empty();
49
+
50
+ if (b.get_num_retained() == 0) {
51
+ std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
52
+ } else {
53
+ if (a.is_ordered() && b.is_ordered()) { // sort-based
54
+ std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
55
+ } else { // hash-based
56
+ const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
57
+ vector_u64<A> b_hash_table(1 << lg_size, 0);
58
+ for (auto key: b) {
59
+ if (key < theta) {
60
+ update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
61
+ } else if (b.is_ordered()) {
62
+ break; // early stop
63
+ }
64
+ }
65
+
66
+ // scan A lookup B
67
+ for (auto key: a) {
68
+ if (key < theta) {
69
+ if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
70
+ } else if (a.is_ordered()) {
71
+ break; // early stop
72
+ }
73
+ }
74
+ }
75
+ }
76
+ if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
77
+ if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
78
+ return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
79
+ }
80
+
81
+ } /* namespace datasketches */
82
+
83
+ # endif
@@ -0,0 +1,88 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_INTERSECTION_HPP_
21
+ #define THETA_INTERSECTION_HPP_
22
+
23
+ #include <memory>
24
+ #include <functional>
25
+ #include <climits>
26
+
27
+ #include "theta_sketch.hpp"
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ /*
33
+ * author Alexander Saydakov
34
+ * author Lee Rhodes
35
+ * author Kevin Lang
36
+ */
37
+
38
+ template<typename A>
39
+ class theta_intersection_alloc {
40
+ public:
41
+ /**
42
+ * Creates an instance of the intersection with a given hash seed.
43
+ * @param seed hash seed
44
+ */
45
+ explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
46
+
47
+ /**
48
+ * Updates the intersection with a given sketch.
49
+ * The intersection can be viewed as starting from the "universe" set, and every update
50
+ * can reduce the current set to leave the overlapping subset only.
51
+ * @param sketch represents input set for the intersection
52
+ */
53
+ void update(const theta_sketch_alloc<A>& sketch);
54
+
55
+ /**
56
+ * Produces a copy of the current state of the intersection.
57
+ * If update() was not called, the state is the infinite "universe",
58
+ * which is considered an undefined state; in that case calling this method throws an exception.
59
+ * @param ordered optional flag to specify if ordered sketch should be produced
60
+ * @return the result of the intersection
61
+ */
62
+ compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
63
+
64
+ /**
65
+ * Returns true if the state of the intersection is defined (not infinite "universe").
66
+ * @return true if the state is valid
67
+ */
68
+ bool has_result() const;
69
+
70
+ private:
71
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
72
+ bool is_valid_;
73
+ bool is_empty_;
74
+ uint64_t theta_;
75
+ uint8_t lg_size_;
76
+ vector_u64<A> keys_;
77
+ uint32_t num_keys_;
78
+ uint16_t seed_hash_;
79
+ };
80
+
81
+ // alias with default allocator for convenience
82
+ typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
83
+
84
+ } /* namespace datasketches */
85
+
86
+ #include "theta_intersection_impl.hpp"
87
+
88
+ # endif
@@ -0,0 +1,130 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_INTERSECTION_IMPL_HPP_
21
+ #define THETA_INTERSECTION_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+
25
+ namespace datasketches {
26
+
27
+ /*
28
+ * author Alexander Saydakov
29
+ * author Lee Rhodes
30
+ * author Kevin Lang
31
+ */
32
+
33
+ template<typename A>
34
+ theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
35
+ is_valid_(false),
36
+ is_empty_(false),
37
+ theta_(theta_sketch_alloc<A>::MAX_THETA),
38
+ lg_size_(0),
39
+ keys_(),
40
+ num_keys_(0),
41
+ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
42
+ {}
43
+
44
+ template<typename A>
45
+ void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
46
+ if (is_empty_) return;
47
+ if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
48
+ is_empty_ |= sketch.is_empty();
49
+ theta_ = std::min(theta_, sketch.get_theta64());
50
+ if (is_valid_ && num_keys_ == 0) return;
51
+ if (sketch.get_num_retained() == 0) {
52
+ is_valid_ = true;
53
+ if (keys_.size() > 0) {
54
+ keys_.resize(0);
55
+ lg_size_ = 0;
56
+ num_keys_ = 0;
57
+ }
58
+ return;
59
+ }
60
+ if (!is_valid_) { // first update, clone incoming sketch
61
+ is_valid_ = true;
62
+ lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
63
+ keys_.resize(1 << lg_size_, 0);
64
+ for (auto key: sketch) {
65
+ if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
66
+ throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
67
+ }
68
+ ++num_keys_;
69
+ }
70
+ if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
71
+ } else { // intersection
72
+ const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
73
+ vector_u64<A> matched_keys(max_matches);
74
+ uint32_t match_count = 0;
75
+ uint32_t count = 0;
76
+ for (auto key: sketch) {
77
+ if (key < theta_) {
78
+ if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
79
+ if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
80
+ matched_keys[match_count++] = key;
81
+ }
82
+ } else if (sketch.is_ordered()) {
83
+ break; // early stop
84
+ }
85
+ ++count;
86
+ }
87
+ if (count > sketch.get_num_retained()) {
88
+ throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
89
+ } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
90
+ throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
91
+ }
92
+ if (match_count == 0) {
93
+ keys_.resize(0);
94
+ lg_size_ = 0;
95
+ num_keys_ = 0;
96
+ if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
97
+ } else {
98
+ const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
99
+ if (lg_size != lg_size_) {
100
+ lg_size_ = lg_size;
101
+ keys_.resize(1 << lg_size_);
102
+ }
103
+ std::fill(keys_.begin(), keys_.end(), 0);
104
+ for (uint32_t i = 0; i < match_count; i++) {
105
+ update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
106
+ }
107
+ num_keys_ = match_count;
108
+ }
109
+ }
110
+ }
111
+
112
+ template<typename A>
113
+ compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
114
+ if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
115
+ vector_u64<A> keys(num_keys_);
116
+ if (num_keys_ > 0) {
117
+ std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
118
+ if (ordered) std::sort(keys.begin(), keys.end());
119
+ }
120
+ return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
121
+ }
122
+
123
+ template<typename A>
124
+ bool theta_intersection_alloc<A>::has_result() const {
125
+ return is_valid_;
126
+ }
127
+
128
+ } /* namespace datasketches */
129
+
130
+ # endif
@@ -0,0 +1,533 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_SKETCH_HPP_
21
+ #define THETA_SKETCH_HPP_
22
+
23
#include <climits>
#include <cstddef>
#include <functional>
#include <iterator>
#include <memory>
#include <vector>

#include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ /*
33
+ * author Alexander Saydakov
34
+ * author Lee Rhodes
35
+ * author Kevin Lang
36
+ */
37
+
38
// forward declarations: sketch types first, then the set operations
template<typename A> class theta_sketch_alloc;
template<typename A> class update_theta_sketch_alloc;
template<typename A> class compact_theta_sketch_alloc;
template<typename A> class theta_union_alloc;
template<typename A> class theta_intersection_alloc;
template<typename A> class theta_a_not_b_alloc;

// raw byte storage for serialization: allocator A rebound to uint8_t
template<typename A>
using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;

template<typename A>
using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
49
+
50
+ template<typename A>
51
+ class theta_sketch_alloc {
52
+ public:
53
+ static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
54
+ static const uint8_t SERIAL_VERSION = 3;
55
+
56
+ virtual ~theta_sketch_alloc() = default;
57
+
58
+ /**
59
+ * @return true if this sketch represents an empty set (not the same as no retained entries!)
60
+ */
61
+ bool is_empty() const;
62
+
63
+ /**
64
+ * @return estimate of the distinct count of the input stream
65
+ */
66
+ double get_estimate() const;
67
+
68
+ /**
69
+ * Returns the approximate lower error bound given a number of standard deviations.
70
+ * This parameter is similar to the number of standard deviations of the normal distribution
71
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
72
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
73
+ * @return the lower bound
74
+ */
75
+ double get_lower_bound(uint8_t num_std_devs) const;
76
+
77
+ /**
78
+ * Returns the approximate upper error bound given a number of standard deviations.
79
+ * This parameter is similar to the number of standard deviations of the normal distribution
80
+ * and corresponds to approximately 67%, 95% and 99% confidence intervals.
81
+ * @param num_std_devs number of Standard Deviations (1, 2 or 3)
82
+ * @return the upper bound
83
+ */
84
+ double get_upper_bound(uint8_t num_std_devs) const;
85
+
86
+ /**
87
+ * @return true if the sketch is in estimation mode (as opposed to exact mode)
88
+ */
89
+ bool is_estimation_mode() const;
90
+
91
+ /**
92
+ * @return theta as a fraction from 0 to 1 (effective sampling rate)
93
+ */
94
+ double get_theta() const;
95
+
96
+ /**
97
+ * @return theta as a positive integer between 0 and LLONG_MAX
98
+ */
99
+ uint64_t get_theta64() const;
100
+
101
+ /**
102
+ * @return the number of retained entries in the sketch
103
+ */
104
+ virtual uint32_t get_num_retained() const = 0;
105
+
106
+ virtual uint16_t get_seed_hash() const = 0;
107
+
108
+ /**
109
+ * @return true if retained entries are ordered
110
+ */
111
+ virtual bool is_ordered() const = 0;
112
+
113
+ /**
114
+ * Writes a human-readable summary of this sketch to a given stream
115
+ * @param print_items if true include the list of items retained by the sketch
116
+ */
117
+ virtual string<A> to_string(bool print_items = false) const = 0;
118
+
119
+ /**
120
+ * This method serializes the sketch into a given stream in a binary form
121
+ * @param os output stream
122
+ */
123
+ virtual void serialize(std::ostream& os) const = 0;
124
+
125
+ // This is a convenience alias for users
126
+ // The type returned by the following serialize method
127
+ typedef vector_u8<A> vector_bytes;
128
+
129
+ /**
130
+ * This method serializes the sketch as a vector of bytes.
131
+ * An optional header can be reserved in front of the sketch.
132
+ * It is an uninitialized space of a given size.
133
+ * This header is used in Datasketches PostgreSQL extension.
134
+ * @param header_size_bytes space to reserve in front of the sketch
135
+ */
136
+ virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
137
+
138
+ // This is a convenience alias for users
139
+ // The type returned by the following deserialize methods
140
+ // It is not possible to return instances of an abstract type, so this has to be a pointer
141
+ typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
142
+
143
+ /**
144
+ * This method deserializes a sketch from a given stream.
145
+ * @param is input stream
146
+ * @param seed the seed for the hash function that was used to create the sketch
147
+ * @return an instance of a sketch as a unique_ptr
148
+ */
149
+ static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
150
+
151
+ /**
152
+ * This method deserializes a sketch from a given array of bytes.
153
+ * @param bytes pointer to the array of bytes
154
+ * @param size the size of the array
155
+ * @param seed the seed for the hash function that was used to create the sketch
156
+ * @return an instance of the sketch
157
+ */
158
+ static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
159
+
160
+ class const_iterator;
161
+
162
+ /**
163
+ * Iterator over hash values in this sketch.
164
+ * @return begin iterator
165
+ */
166
+ virtual const_iterator begin() const = 0;
167
+
168
+ /**
169
+ * Iterator pointing past the valid range.
170
+ * Not to be incremented or dereferenced.
171
+ * @return end iterator
172
+ */
173
+ virtual const_iterator end() const = 0;
174
+
175
+ protected:
176
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
177
+
178
+ bool is_empty_;
179
+ uint64_t theta_;
180
+
181
+ theta_sketch_alloc(bool is_empty, uint64_t theta);
182
+
183
+ static uint16_t get_seed_hash(uint64_t seed);
184
+
185
+ static void check_sketch_type(uint8_t actual, uint8_t expected);
186
+ static void check_serial_version(uint8_t actual, uint8_t expected);
187
+ static void check_seed_hash(uint16_t actual, uint16_t expected);
188
+
189
+ friend theta_intersection_alloc<A>;
190
+ friend theta_a_not_b_alloc<A>;
191
+ };
192
+
193
+ // update sketch
194
+
195
// storage for 64-bit hash values: allocator A rebound to uint64_t
template<typename A>
using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;

template<typename A>
using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
197
+
198
+ template<typename A>
199
+ class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
200
+ public:
201
+ class builder;
202
+ enum resize_factor { X1, X2, X4, X8 };
203
+ static const uint8_t SKETCH_TYPE = 2;
204
+
205
+ // No constructor here. Use builder instead.
206
+
207
+ virtual ~update_theta_sketch_alloc() = default;
208
+
209
+ virtual uint32_t get_num_retained() const;
210
+ virtual uint16_t get_seed_hash() const;
211
+ virtual bool is_ordered() const;
212
+ virtual string<A> to_string(bool print_items = false) const;
213
+ virtual void serialize(std::ostream& os) const;
214
+ typedef vector_u8<A> vector_bytes; // alias for users
215
+ // header space is reserved, but not initialized
216
+ virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
217
+
218
+ /**
219
+ * Update this sketch with a given string.
220
+ * @param value string to update the sketch with
221
+ */
222
+ void update(const std::string& value);
223
+
224
+ /**
225
+ * Update this sketch with a given unsigned 64-bit integer.
226
+ * @param value uint64_t to update the sketch with
227
+ */
228
+ void update(uint64_t value);
229
+
230
+ /**
231
+ * Update this sketch with a given signed 64-bit integer.
232
+ * @param value int64_t to update the sketch with
233
+ */
234
+ void update(int64_t value);
235
+
236
+ /**
237
+ * Update this sketch with a given unsigned 32-bit integer.
238
+ * For compatibility with Java implementation.
239
+ * @param value uint32_t to update the sketch with
240
+ */
241
+ void update(uint32_t value);
242
+
243
+ /**
244
+ * Update this sketch with a given signed 32-bit integer.
245
+ * For compatibility with Java implementation.
246
+ * @param value int32_t to update the sketch with
247
+ */
248
+ void update(int32_t value);
249
+
250
+ /**
251
+ * Update this sketch with a given unsigned 16-bit integer.
252
+ * For compatibility with Java implementation.
253
+ * @param value uint16_t to update the sketch with
254
+ */
255
+ void update(uint16_t value);
256
+
257
+ /**
258
+ * Update this sketch with a given signed 16-bit integer.
259
+ * For compatibility with Java implementation.
260
+ * @param value int16_t to update the sketch with
261
+ */
262
+ void update(int16_t value);
263
+
264
+ /**
265
+ * Update this sketch with a given unsigned 8-bit integer.
266
+ * For compatibility with Java implementation.
267
+ * @param value uint8_t to update the sketch with
268
+ */
269
+ void update(uint8_t value);
270
+
271
+ /**
272
+ * Update this sketch with a given signed 8-bit integer.
273
+ * For compatibility with Java implementation.
274
+ * @param value int8_t to update the sketch with
275
+ */
276
+ void update(int8_t value);
277
+
278
+ /**
279
+ * Update this sketch with a given double-precision floating point value.
280
+ * For compatibility with Java implementation.
281
+ * @param value double to update the sketch with
282
+ */
283
+ void update(double value);
284
+
285
+ /**
286
+ * Update this sketch with a given floating point value.
287
+ * For compatibility with Java implementation.
288
+ * @param value float to update the sketch with
289
+ */
290
+ void update(float value);
291
+
292
+ /**
293
+ * Update this sketch with given data of any type.
294
+ * This is a "universal" update that covers all cases above,
295
+ * but may produce different hashes.
296
+ * Be very careful to hash input values consistently using the same approach
297
+ * both over time and on different platforms
298
+ * and while passing sketches between C++ environment and Java environment.
299
+ * Otherwise two sketches that should represent overlapping sets will be disjoint
300
+ * For instance, for signed 32-bit values call update(int32_t) method above,
301
+ * which does widening conversion to int64_t, if compatibility with Java is expected
302
+ * @param data pointer to the data
303
+ * @param length of the data in bytes
304
+ */
305
+ void update(const void* data, unsigned length);
306
+
307
+ /**
308
+ * Remove retained entries in excess of the nominal size k (if any)
309
+ */
310
+ void trim();
311
+
312
+ /**
313
+ * Converts this sketch to a compact sketch (ordered or unordered).
314
+ * @param ordered optional flag to specify if ordered sketch should be produced
315
+ * @return compact sketch
316
+ */
317
+ compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
318
+
319
+ virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
320
+ virtual typename theta_sketch_alloc<A>::const_iterator end() const;
321
+
322
+ /**
323
+ * This method deserializes a sketch from a given stream.
324
+ * @param is input stream
325
+ * @param seed the seed for the hash function that was used to create the sketch
326
+ * @return an instance of a sketch
327
+ */
328
+ static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
329
+
330
+ /**
331
+ * This method deserializes a sketch from a given array of bytes.
332
+ * @param bytes pointer to the array of bytes
333
+ * @param size the size of the array
334
+ * @param seed the seed for the hash function that was used to create the sketch
335
+ * @return an instance of the sketch
336
+ */
337
+ static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
338
+
339
+ private:
340
+ // resize threshold = 0.5 tuned for speed
341
+ static constexpr double RESIZE_THRESHOLD = 0.5;
342
+ // hash table rebuild threshold = 15/16
343
+ static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
344
+
345
+ static constexpr uint8_t STRIDE_HASH_BITS = 7;
346
+ static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
347
+
348
+ uint8_t lg_cur_size_;
349
+ uint8_t lg_nom_size_;
350
+ vector_u64<A> keys_;
351
+ uint32_t num_keys_;
352
+ resize_factor rf_;
353
+ float p_;
354
+ uint64_t seed_;
355
+ uint32_t capacity_;
356
+
357
+ // for builder
358
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
359
+
360
+ // for deserialize
361
+ update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
362
+
363
+ void resize();
364
+ void rebuild();
365
+
366
+ friend theta_union_alloc<A>;
367
+ void internal_update(uint64_t hash);
368
+
369
+ friend theta_intersection_alloc<A>;
370
+ friend theta_a_not_b_alloc<A>;
371
+ static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
372
+ static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
373
+ static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
374
+ static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
375
+
376
+ friend theta_sketch_alloc<A>;
377
+ static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
378
+ static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
379
+ };
380
+
381
+ // compact sketch
382
+
383
+ template<typename A>
384
+ class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
385
+ public:
386
+ static const uint8_t SKETCH_TYPE = 3;
387
+
388
+ // No constructor here.
389
+ // Instances of this type can be obtained:
390
+ // - by compacting an update_theta_sketch
391
+ // - as a result of a set operation
392
+ // - by deserializing a previously serialized compact sketch
393
+
394
+ compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
395
+ virtual ~compact_theta_sketch_alloc() = default;
396
+
397
+ virtual uint32_t get_num_retained() const;
398
+ virtual uint16_t get_seed_hash() const;
399
+ virtual bool is_ordered() const;
400
+ virtual string<A> to_string(bool print_items = false) const;
401
+ virtual void serialize(std::ostream& os) const;
402
+ typedef vector_u8<A> vector_bytes; // alias for users
403
+ // header space is reserved, but not initialized
404
+ virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
405
+
406
+ virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
407
+ virtual typename theta_sketch_alloc<A>::const_iterator end() const;
408
+
409
+ /**
410
+ * This method deserializes a sketch from a given stream.
411
+ * @param is input stream
412
+ * @param seed the seed for the hash function that was used to create the sketch
413
+ * @return an instance of a sketch
414
+ */
415
+ static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
416
+
417
+ /**
418
+ * This method deserializes a sketch from a given array of bytes.
419
+ * @param bytes pointer to the array of bytes
420
+ * @param size the size of the array
421
+ * @param seed the seed for the hash function that was used to create the sketch
422
+ * @return an instance of the sketch
423
+ */
424
+ static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
425
+
426
+ private:
427
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
428
+
429
+ vector_u64<A> keys_;
430
+ uint16_t seed_hash_;
431
+ bool is_ordered_;
432
+
433
+ friend theta_sketch_alloc<A>;
434
+ friend update_theta_sketch_alloc<A>;
435
+ friend theta_union_alloc<A>;
436
+ friend theta_intersection_alloc<A>;
437
+ friend theta_a_not_b_alloc<A>;
438
+ compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
439
+ static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
440
+ static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
441
+ };
442
+
443
+ // builder
444
+
445
+ template<typename A>
446
+ class update_theta_sketch_alloc<A>::builder {
447
+ public:
448
+ static const uint8_t MIN_LG_K = 5;
449
+ static const uint8_t DEFAULT_LG_K = 12;
450
+ static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
451
+
452
+ /**
453
+ * Creates and instance of the builder with default parameters.
454
+ */
455
+ builder();
456
+
457
+ /**
458
+ * Set log2(k), where k is a nominal number of entries in the sketch
459
+ * @param lg_k base 2 logarithm of nominal number of entries
460
+ * @return this builder
461
+ */
462
+ builder& set_lg_k(uint8_t lg_k);
463
+
464
+ /**
465
+ * Set resize factor for the internal hash table (defaults to 8)
466
+ * @param rf resize factor
467
+ * @return this builder
468
+ */
469
+ builder& set_resize_factor(resize_factor rf);
470
+
471
+ /**
472
+ * Set sampling probability (initial theta). The default is 1, so the sketch retains
473
+ * all entries until it reaches the limit, at which point it goes into the estimation mode
474
+ * and reduces the effective sampling probability (theta) as necessary.
475
+ * @param p sampling probability
476
+ * @return this builder
477
+ */
478
+ builder& set_p(float p);
479
+
480
+ /**
481
+ * Set the seed for the hash function. Should be used carefully if needed.
482
+ * Sketches produced with different seed are not compatible
483
+ * and cannot be mixed in set operations.
484
+ * @param seed hash seed
485
+ * @return this builder
486
+ */
487
+ builder& set_seed(uint64_t seed);
488
+
489
+ /**
490
+ * This is to create an instance of the sketch with predefined parameters.
491
+ * @return and instance of the sketch
492
+ */
493
+ update_theta_sketch_alloc<A> build() const;
494
+
495
+ private:
496
+ uint8_t lg_k_;
497
+ resize_factor rf_;
498
+ float p_;
499
+ uint64_t seed_;
500
+
501
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
502
+ };
503
+
504
+ // iterator
505
+ template<typename A>
506
+ class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
507
+ public:
508
+ const_iterator& operator++();
509
+ const_iterator operator++(int);
510
+ bool operator==(const const_iterator& other) const;
511
+ bool operator!=(const const_iterator& other) const;
512
+ uint64_t operator*() const;
513
+
514
+ private:
515
+ const uint64_t* keys_;
516
+ uint32_t size_;
517
+ uint32_t index_;
518
+ const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
519
+ friend class update_theta_sketch_alloc<A>;
520
+ friend class compact_theta_sketch_alloc<A>;
521
+ };
522
+
523
+
524
+ // aliases with default allocator for convenience
525
+ typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
526
+ typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
527
+ typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
528
+
529
+ } /* namespace datasketches */
530
+
531
+ #include "theta_sketch_impl.hpp"
532
+
533
+ #endif