datasketches 0.3.2 → 0.4.0

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (237):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -25,31 +25,32 @@
25
25
 
26
26
  namespace datasketches {
27
27
 
28
- /*
29
- * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
30
- * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
31
- * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
32
- * not the type of the input items themselves.
33
- * @author Charlie Dickens
34
- */
35
-
28
+ /**
29
+ * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
30
+ * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
31
+ * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
32
+ * not the type of the input items themselves.
33
+ * @author Charlie Dickens
34
+ */
36
35
  template <typename W,
37
36
  typename Allocator = std::allocator<W>>
38
37
  class count_min_sketch{
39
38
  static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
40
39
  public:
41
40
  using allocator_type = Allocator;
41
+ using const_iterator = typename std::vector<W, Allocator>::const_iterator;
42
42
 
43
43
  /**
44
44
  * Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`.
45
- * @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array
46
- * @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array
45
+ * @param num_hashes number of hash functions in the sketch. Equivalently the number of rows in the array
46
+ * @param num_buckets number of buckets that hash functions map into. Equivalently the number of columns in the array
47
47
  * @param seed for hash function
48
+ * @param allocator to acquire and release memory
48
49
  *
49
50
  * The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash.
50
51
  * Only update and estimate methods are added for uint64_t and string types.
51
52
  */
52
- count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()) ;
53
+ count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
53
54
 
54
55
  /**
55
56
  * @return configured _num_hashes of this sketch
@@ -67,59 +68,61 @@ public:
67
68
  uint64_t get_seed() const;
68
69
 
69
70
  /**
70
- * @return epsilon : double
71
+ * @return epsilon
71
72
  * The maximum permissible error for any frequency estimate query.
72
73
  * epsilon = ceil(e / _num_buckets)
73
74
  */
74
75
  double get_relative_error() const;
75
76
 
76
77
  /**
77
- * @return _total_weight : typename W
78
+ * @return _total_weight
78
79
  * The total weight currently inserted into the stream.
79
80
  */
80
81
  W get_total_weight() const;
81
82
 
82
- /*
83
- * @param relative_error : double -- the desired accuracy within which estimates should lie.
83
+ /**
84
+ * Suggests the number of buckets required to achieve the given relative error
85
+ * @param relative_error the desired accuracy within which estimates should lie.
84
86
  * For example, when relative_error = 0.05, then the returned frequency estimates satisfy the
85
87
  * `relative_error` guarantee that never overestimates the weights but may underestimate the weights
86
88
  * by 5% of the total weight in the sketch.
87
- * @return number_of_buckets : the number of hash buckets at every level of the
89
+ * @return the number of hash buckets at every level of the
88
90
  * sketch required in order to obtain the specified relative error.
89
91
  * [1] - Section 3 ``Data Structure'', page 6.
90
92
  */
91
- static uint32_t suggest_num_buckets(double relative_error) ;
93
+ static uint32_t suggest_num_buckets(double relative_error);
92
94
 
93
- /*
94
- * @param confidence : double -- the desired confidence with which estimates should be correct.
95
+ /**
96
+ * Suggests the number of hash functions required to achieve the given confidence
97
+ * @param confidence the desired confidence with which estimates should be correct.
95
98
  * For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee.
96
- * @return number_of_hashes : the number of hash functions that are required in
99
+ * @return the number of hash functions that are required in
97
100
  * order to achieve the specified confidence of the sketch.
98
101
  * confidence = 1 - delta, with delta denoting the sketch failure probability in the literature.
99
102
  * [1] - Section 3 ``Data Structure'', page 6.
100
103
  */
101
- static uint8_t suggest_num_hashes(double confidence) ;
104
+ static uint8_t suggest_num_hashes(double confidence);
102
105
 
103
106
  /**
104
107
  * Specific get_estimate function for uint64_t type
105
108
  * see generic get_estimate function
106
- * @param item : uint64_t type.
109
+ * @param item uint64_t type.
107
110
  * @return an estimate of the item's frequency.
108
111
  */
109
- W get_estimate(uint64_t item) const ;
112
+ W get_estimate(uint64_t item) const;
110
113
 
111
114
  /**
112
115
  * Specific get_estimate function for int64_t type
113
116
  * see generic get_estimate function
114
- * @param item : uint64_t type.
117
+ * @param item int64_t type.
115
118
  * @return an estimate of the item's frequency.
116
119
  */
117
- W get_estimate(int64_t item) const ;
120
+ W get_estimate(int64_t item) const;
118
121
 
119
122
  /**
120
123
  * Specific get_estimate function for std::string type
121
124
  * see generic get_estimate function
122
- * @param item : std::string type
125
+ * @param item std::string type
123
126
  * @return an estimate of the item's frequency.
124
127
  */
125
128
  W get_estimate(const std::string& item) const;
@@ -127,69 +130,115 @@ public:
127
130
  /**
128
131
  * This is the generic estimate query function for any of the given datatypes.
129
132
  * Query the sketch for the estimate of a given item.
130
- * @param item : pointer to the data item to be query from the sketch.
131
- * @param size : size_t
133
+ * @param item pointer to the data item to be query from the sketch.
134
+ * @param size size of the item in bytes
132
135
  * @return the estimated frequency of the item denoted f_est satisfying
133
136
  * f_true - relative_error*_total_weight <= f_est <= f_true
134
137
  */
135
- W get_estimate(const void* item, size_t size) const ;
138
+ W get_estimate(const void* item, size_t size) const;
136
139
 
137
140
  /**
138
141
  * Query the sketch for the upper bound of a given item.
139
- * @param item : uint64_t or std::string to query
142
+ * @param item to query
143
+ * @param size of the item in bytes
140
144
  * @return the upper bound on the true frequency of the item
141
145
  * f_true <= f_est + relative_error*_total_weight
142
146
  */
143
147
  W get_upper_bound(const void* item, size_t size) const;
144
- W get_upper_bound(int64_t) const ;
145
- W get_upper_bound(uint64_t) const ;
148
+
149
+ /**
150
+ * Query the sketch for the upper bound of a given item.
151
+ * @param item to query
152
+ * @return the upper bound on the true frequency of the item
153
+ * f_true <= f_est + relative_error*_total_weight
154
+ */
155
+ W get_upper_bound(int64_t item) const;
156
+
157
+ /**
158
+ * Query the sketch for the upper bound of a given item.
159
+ * @param item to query
160
+ * @return the upper bound on the true frequency of the item
161
+ * f_true <= f_est + relative_error*_total_weight
162
+ */
163
+ W get_upper_bound(uint64_t item) const;
164
+
165
+ /**
166
+ * Query the sketch for the upper bound of a given item.
167
+ * @param item to query
168
+ * @return the upper bound on the true frequency of the item
169
+ * f_true <= f_est + relative_error*_total_weight
170
+ */
146
171
  W get_upper_bound(const std::string& item) const;
147
172
 
148
173
  /**
149
174
  * Query the sketch for the lower bound of a given item.
150
- * @param item : uint64_t or std::string to query
175
+ * @param item to query
176
+ * @param size of the item in bytes
151
177
  * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
152
178
  * f_true - relative_error*_total_weight <= f_est
153
179
  */
154
- W get_lower_bound(const void* item, size_t size) const ;
155
- W get_lower_bound(int64_t) const ;
156
- W get_lower_bound(uint64_t) const ;
157
- W get_lower_bound(const std::string& item) const ;
180
+ W get_lower_bound(const void* item, size_t size) const;
158
181
 
159
- /*
182
+ /**
183
+ * Query the sketch for the lower bound of a given item.
184
+ * @param item to query
185
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
186
+ * f_true - relative_error*_total_weight <= f_est
187
+ */
188
+ W get_lower_bound(int64_t item) const;
189
+
190
+ /**
191
+ * Query the sketch for the lower bound of a given item.
192
+ * @param item to query
193
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
194
+ * f_true - relative_error*_total_weight <= f_est
195
+ */
196
+ W get_lower_bound(uint64_t item) const;
197
+
198
+ /**
199
+ * Query the sketch for the lower bound of a given item.
200
+ * @param item to query
201
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
202
+ * f_true - relative_error*_total_weight <= f_est
203
+ */
204
+ W get_lower_bound(const std::string& item) const;
205
+
206
+ /**
160
207
  * Update this sketch with given data of any type.
161
- * This is a "universal" update that covers all cases above,
162
- * but may produce different hashes.
208
+ * This is a "universal" update that covers all cases,
209
+ * but may produce different hashes compared to specialized update methods.
163
210
  * @param item pointer to the data item to be inserted into the sketch.
164
211
  * @param size of the data in bytes
165
- * @return vector of uint64_t which each represent the index to which `value' must update in the sketch
212
+ * @param weight arithmetic type
166
213
  */
167
- void update(const void* item, size_t size, W weight) ;
214
+ void update(const void* item, size_t size, W weight);
168
215
 
169
216
  /**
170
- * Update this sketch with a given uint64_t item.
171
- * @param item : uint64_t to update the sketch with
172
- * @param weight : arithmetic type
173
- * void function which inserts an item of type uint64_t into the sketch
217
+ * Update this sketch with a given item.
218
+ * @param item to update the sketch with
219
+ * @param weight arithmetic type
174
220
  */
175
- void update(uint64_t item, W weight) ;
176
- void update(uint64_t item) ;
177
- void update(int64_t item, W weight) ;
178
- void update(int64_t item) ;
221
+ void update(uint64_t item, W weight = 1);
222
+
223
+ /**
224
+ * Update this sketch with a given item.
225
+ * @param item to update the sketch with
226
+ * @param weight arithmetic type
227
+ */
228
+ void update(int64_t item, W weight = 1);
179
229
 
180
230
  /**
181
231
  * Update this sketch with a given string.
182
- * @param item : string to update the sketch with
183
- * @param weight : arithmetic type
184
- * void function which inserts an item of type std::string into the sketch
232
+ * @param item string to update the sketch with
233
+ * @param weight arithmetic type
185
234
  */
186
- void update(const std::string& item, W weight) ;
187
- void update(const std::string& item) ;
235
+ void update(const std::string& item, W weight = 1);
188
236
 
189
- /*
190
- * merges a separate count_min_sketch into this count_min_sketch.
237
+ /**
238
+ * Merges another count_min_sketch into this count_min_sketch.
239
+ * @param other_sketch
191
240
  */
192
- void merge(const count_min_sketch &other_sketch) ;
241
+ void merge(const count_min_sketch& other_sketch);
193
242
 
194
243
  /**
195
244
  * Returns true if this sketch is empty.
@@ -197,7 +246,7 @@ public:
197
246
  * This can only ever happen if all items inserted to the sketch have weights that cancel each other out.
198
247
  * @return empty flag
199
248
  */
200
- bool is_empty() const ;
249
+ bool is_empty() const;
201
250
 
202
251
  /**
203
252
  * @brief Returns a string describing the sketch
@@ -205,15 +254,23 @@ public:
205
254
  */
206
255
  string<Allocator> to_string() const;
207
256
 
208
- // Iterators
209
- using const_iterator = typename std::vector<W, Allocator>::const_iterator ;
257
+ /**
258
+ * Iterator pointing to the first item in the sketch.
259
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
260
+ * @return iterator pointing to the first item in the sketch
261
+ */
210
262
  const_iterator begin() const;
211
- const_iterator end() const;
212
263
 
213
264
  /**
214
- * This method serializes the sketch into a given stream in a binary form
215
- * @param os output stream
216
- * The byte output has the following structure
265
+ * Iterator pointing to the past-the-end item in the sketch.
266
+ * The past-the-end item is the hypothetical item that would follow the last item.
267
+ * It does not point to any item, and must not be dereferenced or incremented.
268
+ * @return iterator pointing to the past-the-end item in the sketch
269
+ */
270
+ const_iterator end() const;
271
+
272
+ /*
273
+ * The serialized sketch binary form has the following structure
217
274
  * Byte 0:
218
275
  * 1 - if and only if the sketch is empty
219
276
  * 0 - otherwise
@@ -254,8 +311,6 @@ public:
254
311
  ||---------------------------- sketch entries ---------------------------|
255
312
  ...
256
313
 
257
- *
258
- *
259
314
  */
260
315
 
261
316
 
@@ -266,7 +321,8 @@ public:
266
321
  size_t get_serialized_size_bytes() const;
267
322
 
268
323
  /**
269
- * This method serializes a binary image of the sketch to an output stream.
324
+ * This method serializes the sketch into a given stream in a binary form
325
+ * @param os output stream
270
326
  */
271
327
  void serialize(std::ostream& os) const;
272
328
 
@@ -287,6 +343,7 @@ public:
287
343
  * This method deserializes a sketch from a given stream.
288
344
  * @param is input stream
289
345
  * @param seed the seed for the hash function that was used to create the sketch
346
+ * @param allocator instance of an Allocator
290
347
  * @return an instance of a sketch
291
348
  */
292
349
  static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
@@ -296,24 +353,24 @@ public:
296
353
  * @param bytes pointer to the array of bytes
297
354
  * @param size the size of the array
298
355
  * @param seed the seed for the hash function that was used to create the sketch
356
+ * @param allocator instance of an Allocator
299
357
  * @return an instance of the sketch
300
358
  */
301
359
  static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
302
360
 
303
361
  /**
304
- * Returns the allocator for this sketch.
305
362
  * @return allocator
306
363
  */
307
364
  allocator_type get_allocator() const;
308
365
 
309
366
  private:
310
367
  Allocator _allocator;
311
- uint8_t _num_hashes ;
312
- uint32_t _num_buckets ;
313
- std::vector<W, Allocator> _sketch_array ; // the array stored by the sketch
314
- uint64_t _seed ;
315
- W _total_weight ;
316
- std::vector<uint64_t> hash_seeds ;
368
+ uint8_t _num_hashes;
369
+ uint32_t _num_buckets;
370
+ std::vector<W, Allocator> _sketch_array; // the array stored by the sketch
371
+ uint64_t _seed;
372
+ W _total_weight;
373
+ std::vector<uint64_t> hash_seeds;
317
374
 
318
375
  enum flags {IS_EMPTY};
319
376
  static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
@@ -331,9 +388,6 @@ private:
331
388
  */
332
389
  static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
333
390
 
334
-
335
-
336
-
337
391
  /*
338
392
  * Obtain the hash values when inserting an item into the sketch.
339
393
  * @param item pointer to the data item to be inserted into the sketch.