datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -151,8 +151,8 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
151
151
  const uint32_t old_size = 1 << lg_size;
152
152
  const uint32_t new_size = 1 << new_lg_size;
153
153
  if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
154
- vector_u32<A> old_slots = std::move(slots);
155
- slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
154
+ vector_u32 old_slots = std::move(slots);
155
+ slots = vector_u32(new_size, UINT32_MAX, old_slots.get_allocator());
156
156
  lg_size = new_lg_size;
157
157
  for (uint32_t i = 0; i < old_size; i++) {
158
158
  if (old_slots[i] != UINT32_MAX) {
@@ -168,10 +168,10 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
168
168
  // and even then the subsequent sort would fix things up.
169
169
  // The result is nearly sorted, so make sure to use an efficient sort for that case
170
170
  template<typename A>
171
- vector_u32<A> u32_table<A>::unwrapping_get_items() const {
172
- if (num_items == 0) return vector_u32<A>(slots.get_allocator());
171
+ auto u32_table<A>::unwrapping_get_items() const -> vector_u32 {
172
+ if (num_items == 0) return vector_u32(slots.get_allocator());
173
173
  const uint32_t table_size = 1 << lg_size;
174
- vector_u32<A> result(num_items, 0, slots.get_allocator());
174
+ vector_u32 result(num_items, 0, slots.get_allocator());
175
175
  size_t i = 0;
176
176
  size_t l = 0;
177
177
  size_t r = num_items - 1;
@@ -20,16 +20,15 @@ add_executable(cpc_test)
20
20
  target_link_libraries(cpc_test cpc common_test_lib)
21
21
 
22
22
  set_target_properties(cpc_test PROPERTIES
23
- CXX_STANDARD 11
24
23
  CXX_STANDARD_REQUIRED YES
25
24
  )
26
25
 
27
- #file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" CPC_TEST_BINARY_PATH)
28
- #string(APPEND CPC_TEST_BINARY_PATH "/")
29
- #target_compile_definitions(cpc_test
30
- # PRIVATE
31
- # TEST_BINARY_INPUT_PATH="${CPC_TEST_BINARY_PATH}"
32
- #)
26
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" CPC_TEST_BINARY_PATH)
27
+ string(APPEND CPC_TEST_BINARY_PATH "/")
28
+ target_compile_definitions(cpc_test
29
+ PRIVATE
30
+ TEST_BINARY_INPUT_PATH="${CPC_TEST_BINARY_PATH}"
31
+ )
33
32
 
34
33
  add_test(
35
34
  NAME cpc_test
@@ -43,3 +42,17 @@ target_sources(cpc_test
43
42
  compression_test.cpp
44
43
  cpc_sketch_allocation_test.cpp
45
44
  )
45
+
46
+ if (SERDE_COMPAT)
47
+ target_sources(cpc_test
48
+ PRIVATE
49
+ cpc_sketch_deserialize_from_java_test.cpp
50
+ )
51
+ endif()
52
+
53
+ if (GENERATE)
54
+ target_sources(cpc_test
55
+ PRIVATE
56
+ cpc_sketch_serialize_for_java.cpp
57
+ )
58
+ endif()
@@ -0,0 +1,60 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <cpc_sketch.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ // assume the binary sketches for this test have been generated by datasketches-java code
27
+ // in the subdirectory called "java" in the root directory of this project
28
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
29
+
30
+ TEST_CASE("cpc sketch", "[serde_compat]") {
31
+ const unsigned n_arr[] = {0, 100, 200, 2000, 20000};
32
+ for (const unsigned n: n_arr) {
33
+ std::ifstream is;
34
+ is.exceptions(std::ios::failbit | std::ios::badbit);
35
+ is.open(testBinaryInputPath + "cpc_n" + std::to_string(n) + "_java.sk", std::ios::binary);
36
+ const auto sketch = cpc_sketch::deserialize(is);
37
+ REQUIRE(sketch.is_empty() == (n == 0));
38
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
39
+ }
40
+ }
41
+
42
+ TEST_CASE("cpc sketch negative one", "[serde_compat]") {
43
+ std::ifstream is;
44
+ is.exceptions(std::ios::failbit | std::ios::badbit);
45
+ is.open(testBinaryInputPath + "cpc_negative_one_java.sk", std::ios::binary);
46
+ auto sketch = cpc_sketch::deserialize(is);
47
+ REQUIRE_FALSE(sketch.is_empty());
48
+ REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));
49
+ sketch.update((uint64_t) -1);
50
+ sketch.update((int64_t) -1);
51
+ sketch.update((uint32_t) -1);
52
+ sketch.update((int32_t) -1);
53
+ sketch.update((uint16_t) -1);
54
+ sketch.update((int16_t) -1);
55
+ sketch.update((uint8_t) -1);
56
+ sketch.update((int8_t) -1);
57
+ REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));
58
+ }
59
+
60
+ } /* namespace datasketches */
@@ -17,21 +17,22 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #ifndef _PY_OBJECT_LT_HPP_
21
- #define _PY_OBJECT_LT_HPP_
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <cpc_sketch.hpp>
22
23
 
23
- #include <pybind11/pybind11.h>
24
+ namespace datasketches {
24
25
 
25
- /*
26
- This header defines a less than operator on generic python
27
- objects. The implementation calls the object's built-in __lt__()
28
- method. If that method is not defined, the call may fail.
29
- */
30
-
31
- struct py_object_lt {
32
- bool operator()(const pybind11::object& a, const pybind11::object& b) const {
33
- return a < b;
26
+ TEST_CASE("cpc sketch generate", "[serialize_for_java]") {
27
+ const unsigned n_arr[] = {0, 100, 200, 2000, 20000};
28
+ for (const unsigned n: n_arr) {
29
+ cpc_sketch sketch;
30
+ for (unsigned i = 1; i <= n; ++i) sketch.update(i);
31
+ REQUIRE(sketch.is_empty() == (n == 0));
32
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
33
+ std::ofstream os("cpc_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
34
+ sketch.serialize(os);
34
35
  }
35
- };
36
+ }
36
37
 
37
- #endif // _PY_OBJECT_LT_HPP_
38
+ } /* namespace datasketches */
@@ -32,10 +32,10 @@ namespace datasketches {
32
32
  static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
33
33
 
34
34
  TEST_CASE("cpc sketch: lg k limits", "[cpc_sketch]") {
35
- cpc_sketch s1(CPC_MIN_LG_K); // this should work
36
- cpc_sketch s2(CPC_MAX_LG_K); // this should work
37
- REQUIRE_THROWS_AS(cpc_sketch(CPC_MIN_LG_K - 1), std::invalid_argument);
38
- REQUIRE_THROWS_AS(cpc_sketch(CPC_MAX_LG_K + 1), std::invalid_argument);
35
+ cpc_sketch s1(cpc_constants::MIN_LG_K); // this should work
36
+ cpc_sketch s2(cpc_constants::MAX_LG_K); // this should work
37
+ REQUIRE_THROWS_AS(cpc_sketch(cpc_constants::MIN_LG_K - 1), std::invalid_argument);
38
+ REQUIRE_THROWS_AS(cpc_sketch(cpc_constants::MAX_LG_K + 1), std::invalid_argument);
39
39
  }
40
40
 
41
41
  TEST_CASE("cpc sketch: empty", "[cpc_sketch]") {
@@ -88,9 +88,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty", "[cpc_sketch]") {
88
88
  REQUIRE(deserialized.is_empty() == sketch.is_empty());
89
89
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
90
90
  REQUIRE(deserialized.validate());
91
-
92
- std::ofstream os("cpc-empty.bin");
93
- sketch.serialize(os);
94
91
  }
95
92
 
96
93
  TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
@@ -108,9 +105,6 @@ TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
108
105
  for (int i = 0; i < n; i++) deserialized.update(i);
109
106
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
110
107
  REQUIRE(deserialized.validate());
111
-
112
- std::ofstream os("cpc-sparse.bin");
113
- sketch.serialize(os);
114
108
  }
115
109
 
116
110
  TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
@@ -128,9 +122,6 @@ TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
128
122
  for (int i = 0; i < n; i++) deserialized.update(i);
129
123
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
130
124
  REQUIRE(deserialized.validate());
131
-
132
- std::ofstream os("cpc-hybrid.bin");
133
- sketch.serialize(os);
134
125
  }
135
126
 
136
127
  TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
@@ -148,9 +139,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
148
139
  for (int i = 0; i < n; i++) deserialized.update(i);
149
140
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
150
141
  REQUIRE(deserialized.validate());
151
-
152
- std::ofstream os("cpc-pinned.bin");
153
- sketch.serialize(os);
154
142
  }
155
143
 
156
144
  TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
@@ -168,9 +156,6 @@ TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
168
156
  for (int i = 0; i < n; i++) deserialized.update(i);
169
157
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
170
158
  REQUIRE(deserialized.validate());
171
-
172
- std::ofstream os("cpc-sliding.bin");
173
- sketch.serialize(os);
174
159
  }
175
160
 
176
161
  TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
@@ -188,9 +173,6 @@ TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
188
173
  for (int i = 0; i < n; i++) deserialized.update(i);
189
174
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
190
175
  REQUIRE(deserialized.validate());
191
-
192
- std::ofstream os("cpc-sliding-large.bin");
193
- sketch.serialize(os);
194
176
  }
195
177
 
196
178
  TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
@@ -201,9 +183,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
201
183
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
202
184
  REQUIRE(deserialized.validate());
203
185
  REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
204
-
205
- std::ofstream os("cpc-empty.bin");
206
- sketch.serialize(os);
207
186
  }
208
187
 
209
188
  TEST_CASE("cpc sketch: serialize deserialize sparse, bytes", "[cpc_sketch]") {
@@ -261,8 +240,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned, bytes", "[cpc_sketch]") {
261
240
  for (int i = 0; i < n; i++) deserialized.update(i);
262
241
  REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
263
242
  REQUIRE(deserialized.validate());
264
-
265
- std::cout << sketch.to_string();
266
243
  }
267
244
 
268
245
  TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
@@ -380,8 +357,6 @@ TEST_CASE("cpc sketch: update int equivalence", "[cpc_sketch]") {
380
357
  sketch.update((uint8_t) -1);
381
358
  sketch.update((int8_t) -1);
382
359
  REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
383
- std::ofstream os("cpc-negative-one.bin"); // to compare with Java
384
- sketch.serialize(os);
385
360
  }
386
361
 
387
362
  TEST_CASE("cpc sketch: update float equivalence", "[cpc_sketch]") {
@@ -28,10 +28,10 @@ namespace datasketches {
28
28
  static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
29
29
 
30
30
  TEST_CASE("cpc union: lg k limits", "[cpc_union]") {
31
- cpc_union u1(CPC_MIN_LG_K); // this should work
32
- cpc_union u2(CPC_MAX_LG_K); // this should work
33
- REQUIRE_THROWS_AS(cpc_union(CPC_MIN_LG_K - 1), std::invalid_argument);
34
- REQUIRE_THROWS_AS(cpc_union(CPC_MAX_LG_K + 1), std::invalid_argument);
31
+ cpc_union u1(cpc_constants::MIN_LG_K); // this should work
32
+ cpc_union u2(cpc_constants::MAX_LG_K); // this should work
33
+ REQUIRE_THROWS_AS(cpc_union(cpc_constants::MIN_LG_K - 1), std::invalid_argument);
34
+ REQUIRE_THROWS_AS(cpc_union(cpc_constants::MAX_LG_K + 1), std::invalid_argument);
35
35
  }
36
36
 
37
37
  TEST_CASE("cpc union: empty", "[cpc_union]") {
@@ -28,15 +28,6 @@
28
28
 
29
29
  #include "common_defs.hpp"
30
30
 
31
- /*
32
- * Based on the following paper:
33
- * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
34
- * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
35
- *
36
- * Inspired by the following implementation:
37
- * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
38
- */
39
-
40
31
  namespace datasketches {
41
32
 
42
33
  template<typename T>
@@ -46,6 +37,18 @@ struct gaussian_kernel {
46
37
  }
47
38
  };
48
39
 
40
+ /**
41
+ * Density sketch.
42
+ *
43
+ * Builds a coreset from the given set of input points. Provides density estimate at a given point.
44
+ *
45
+ * Based on the following paper:
46
+ * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
47
+ * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
48
+ *
49
+ * Inspired by the following implementation:
50
+ * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
51
+ */
49
52
  template<
50
53
  typename T,
51
54
  typename Kernel = gaussian_kernel<T>,
@@ -118,6 +121,10 @@ public:
118
121
  template<typename FwdSketch>
119
122
  void merge(FwdSketch&& other);
120
123
 
124
+ /**
125
+ * Density estimate at a given point
126
+ * @return density estimate at a given point
127
+ */
121
128
  T get_estimate(const std::vector<T>& point) const;
122
129
 
123
130
  /**
@@ -172,7 +179,20 @@ public:
172
179
  string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
173
180
 
174
181
  class const_iterator;
182
+
183
+ /**
184
+ * Iterator pointing to the first item in the sketch.
185
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
186
+ * @return iterator pointing to the first item in the sketch
187
+ */
175
188
  const_iterator begin() const;
189
+
190
+ /**
191
+ * Iterator pointing to the past-the-end item in the sketch.
192
+ * The past-the-end item is the hypothetical item that would follow the last item.
193
+ * It does not point to any item, and must not be dereferenced or incremented.
194
+ * @return iterator pointing to the past-the-end item in the sketch
195
+ */
176
196
  const_iterator end() const;
177
197
 
178
198
  private:
@@ -143,7 +143,7 @@ template<typename T, typename K, typename A>
143
143
  void density_sketch<T, K, A>::compact_level(unsigned height) {
144
144
  auto& level = levels_[height];
145
145
  std::vector<bool> bits(level.size());
146
- bits[0] = random_bit();
146
+ bits[0] = random_utils::random_bit();
147
147
  std::random_shuffle(level.begin(), level.end());
148
148
  for (unsigned i = 1; i < level.size(); ++i) {
149
149
  T delta = 0;
@@ -20,7 +20,6 @@ add_executable(density_test)
20
20
  target_link_libraries(density_test density common_test_lib)
21
21
 
22
22
  set_target_properties(density_test PROPERTIES
23
- CXX_STANDARD 11
24
23
  CXX_STANDARD_REQUIRED YES
25
24
  )
26
25
 
@@ -32,15 +32,19 @@
32
32
 
33
33
  namespace datasketches {
34
34
 
35
- /*
35
+ /// Frequent items error type
36
+ enum frequent_items_error_type {
37
+ NO_FALSE_POSITIVES, ///< include an item in the result list if get_lower_bound(item) &gt; threshold
38
+ NO_FALSE_NEGATIVES ///< include an item in the result list if get_upper_bound(item) &gt; threshold
39
+ };
40
+
41
+ /**
42
+ * Frequent Items sketch.
43
+ *
36
44
  * Based on Java implementation here:
37
45
  * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
38
- * author Alexander Saydakov
46
+ * @author Alexander Saydakov
39
47
  */
40
-
41
- enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
42
-
43
- // type W for weight must be an arithmetic type (integral or floating point)
44
48
  template<
45
49
  typename T,
46
50
  typename W = uint64_t,
@@ -49,6 +53,7 @@ template<
49
53
  typename A = std::allocator<T>
50
54
  >
51
55
  class frequent_items_sketch {
56
+ static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
52
57
  public:
53
58
 
54
59
  static const uint8_t LG_MIN_MAP_SIZE = 3;
@@ -194,7 +199,7 @@ public:
194
199
  * There may be items omitted from the set with true frequencies greater than the
195
200
  * threshold (false negatives).</p>
196
201
  *
197
- * @param error_type determines whether no false positives or no false negatives are desired.
202
+ * @param err_type determines whether no false positives or no false negatives are desired.
198
203
  * @return an array of frequent items
199
204
  */
200
205
  vector_row get_frequent_items(frequent_items_error_type err_type) const;
@@ -217,7 +222,7 @@ public:
217
222
  * There may be items omitted from the set with true frequencies greater than the
218
223
  * threshold (false negatives).</p>
219
224
  *
220
- * @param error_type determines whether no false positives or no false negatives are desired.
225
+ * @param err_type determines whether no false positives or no false negatives are desired.
221
226
  * @param threshold to include items in the result list
222
227
  * @return an array of frequent items
223
228
  */
@@ -293,7 +298,9 @@ private:
293
298
  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
294
299
  static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
295
300
  static constexpr double EPSILON_FACTOR = 3.5;
296
- enum flags { IS_EMPTY };
301
+ // due to a mistake different bits were used in C++ and Java to indicate empty sketch
302
+ // therefore both are set and checked for compatibility with historical binary format
303
+ enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
297
304
  W total_weight;
298
305
  W offset;
299
306
  reverse_purge_hash_map<T, W, H, E, A> map;
@@ -318,14 +325,19 @@ private:
318
325
  class items_deleter;
319
326
  };
320
327
 
328
+ /// Row in the output from #get_frequent_items
321
329
  template<typename T, typename W, typename H, typename E, typename A>
322
330
  class frequent_items_sketch<T, W, H, E, A>::row {
323
331
  public:
324
332
  row(const T* item, W weight, W offset):
325
333
  item(item), weight(weight), offset(offset) {}
334
+ /// @return item
326
335
  const T& get_item() const { return *item; }
336
+ /// @return frequency (weight) estimate
327
337
  W get_estimate() const { return weight + offset; }
338
+ /// @return estimate lower bound
328
339
  W get_lower_bound() const { return weight; }
340
+ /// @return estimate upper bound
329
341
  W get_upper_bound() const { return weight + offset; }
330
342
  private:
331
343
  const T* item;
@@ -174,7 +174,8 @@ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const Ser
174
174
  const uint8_t lg_cur_size = map.get_lg_cur_size();
175
175
  write(os, lg_cur_size);
176
176
  const uint8_t flags_byte(
177
- (is_empty() ? 1 << flags::IS_EMPTY : 0)
177
+ (is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
178
+ | (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
178
179
  );
179
180
  write(os, flags_byte);
180
181
  const uint16_t unused16 = 0;
@@ -234,7 +235,8 @@ auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes,
234
235
  const uint8_t lg_cur_size = map.get_lg_cur_size();
235
236
  ptr += copy_to_mem(lg_cur_size, ptr);
236
237
  const uint8_t flags_byte(
237
- (is_empty() ? 1 << flags::IS_EMPTY : 0)
238
+ (is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
239
+ | (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
238
240
  );
239
241
  ptr += copy_to_mem(flags_byte, ptr);
240
242
  ptr += sizeof(uint16_t); // unused
@@ -298,7 +300,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
298
300
  const auto flags_byte = read<uint8_t>(is);
299
301
  read<uint16_t>(is); // unused
300
302
 
301
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
303
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));
302
304
 
303
305
  check_preamble_longs(preamble_longs, is_empty);
304
306
  check_serial_version(serial_version);
@@ -352,7 +354,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
352
354
  ptr += copy_from_mem(ptr, flags_byte);
353
355
  ptr += sizeof(uint16_t); // unused
354
356
 
355
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
357
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));
356
358
 
357
359
  check_preamble_longs(preamble_longs, is_empty);
358
360
  check_serial_version(serial_version);
@@ -20,7 +20,6 @@ add_executable(fi_test)
20
20
  target_link_libraries(fi_test fi common_test_lib)
21
21
 
22
22
  set_target_properties(fi_test PROPERTIES
23
- CXX_STANDARD 11
24
23
  CXX_STANDARD_REQUIRED YES
25
24
  )
26
25
 
@@ -42,3 +41,17 @@ target_sources(fi_test
42
41
  frequent_items_sketch_test.cpp
43
42
  frequent_items_sketch_custom_type_test.cpp
44
43
  )
44
+
45
+ if (SERDE_COMPAT)
46
+ target_sources(fi_test
47
+ PRIVATE
48
+ frequent_items_sketch_deserialize_from_java_test.cpp
49
+ )
50
+ endif()
51
+
52
+ if (GENERATE)
53
+ target_sources(fi_test
54
+ PRIVATE
55
+ frequent_items_sketch_serialize_for_java.cpp
56
+ )
57
+ endif()
@@ -0,0 +1,95 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+ #include <frequent_items_sketch.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ // assume the binary sketches for this test have been generated by datasketches-java code
27
+ // in the subdirectory called "java" in the root directory of this project
28
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
29
+
30
+ TEST_CASE("frequent longs", "[serde_compat]") {
31
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
32
+ for (const unsigned n: n_arr) {
33
+ std::ifstream is;
34
+ is.exceptions(std::ios::failbit | std::ios::badbit);
35
+ is.open(testBinaryInputPath + "frequent_long_n" + std::to_string(n) + "_java.sk", std::ios::binary);
36
+ const auto sketch = frequent_items_sketch<int64_t>::deserialize(is);
37
+ REQUIRE(sketch.is_empty() == (n == 0));
38
+ if (n > 10) {
39
+ REQUIRE(sketch.get_maximum_error() > 0);
40
+ } else {
41
+ REQUIRE(sketch.get_maximum_error() == 0);
42
+ }
43
+ REQUIRE(sketch.get_total_weight() == n);
44
+ }
45
+ }
46
+
47
+ TEST_CASE("frequent strings", "[serde_compat]") {
48
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
49
+ for (const unsigned n: n_arr) {
50
+ std::ifstream is;
51
+ is.exceptions(std::ios::failbit | std::ios::badbit);
52
+ is.open(testBinaryInputPath + "frequent_string_n" + std::to_string(n) + "_java.sk", std::ios::binary);
53
+ const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
54
+ REQUIRE(sketch.is_empty() == (n == 0));
55
+ if (n > 10) {
56
+ REQUIRE(sketch.get_maximum_error() > 0);
57
+ } else {
58
+ REQUIRE(sketch.get_maximum_error() == 0);
59
+ }
60
+ REQUIRE(sketch.get_total_weight() == n);
61
+ }
62
+ }
63
+
64
+ TEST_CASE("frequent strings ascii", "[serde_compat]") {
65
+ std::ifstream is;
66
+ is.exceptions(std::ios::failbit | std::ios::badbit);
67
+ is.open(testBinaryInputPath + "frequent_string_ascii_java.sk", std::ios::binary);
68
+ const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
69
+ REQUIRE_FALSE(sketch.is_empty());
70
+ REQUIRE(sketch.get_maximum_error() == 0);
71
+ REQUIRE(sketch.get_total_weight() == 10);
72
+ REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
73
+ REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 2);
74
+ REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 3);
75
+ REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 4);
76
+ }
77
+
78
+ TEST_CASE("frequent strings utf8", "[serde_compat]") {
79
+ std::ifstream is;
80
+ is.exceptions(std::ios::failbit | std::ios::badbit);
81
+ is.open(testBinaryInputPath + "frequent_string_utf8_java.sk", std::ios::binary);
82
+ const auto sketch = frequent_items_sketch<std::string>::deserialize(is);
83
+ REQUIRE_FALSE(sketch.is_empty());
84
+ REQUIRE(sketch.get_maximum_error() == 0);
85
+ REQUIRE(sketch.get_total_weight() == 28);
86
+ REQUIRE(sketch.get_estimate("абвгд") == 1);
87
+ REQUIRE(sketch.get_estimate("еёжзи") == 2);
88
+ REQUIRE(sketch.get_estimate("йклмн") == 3);
89
+ REQUIRE(sketch.get_estimate("опрст") == 4);
90
+ REQUIRE(sketch.get_estimate("уфхцч") == 5);
91
+ REQUIRE(sketch.get_estimate("шщъыь") == 6);
92
+ REQUIRE(sketch.get_estimate("эюя") == 7);
93
+ }
94
+
95
+ } /* namespace datasketches */