datasketches 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea00e444de6dc1bebc2b8cf878a250f08717d55eaa55f63f6bec28f4be2af00d
4
- data.tar.gz: 161b9089e3b8d0dbd99cfb6cc0af463934c42ba85f4788a08306369966f28571
3
+ metadata.gz: cf1ea0f9f2d12b0e46c2d4c7dec21f41992e711e73eca68ea1ef03a4bb711077
4
+ data.tar.gz: 92f56b63da0254962be47d8d3e00a6950a271053bf3152167f95e6fdb99528e6
5
5
  SHA512:
6
- metadata.gz: '09eede1e6e4c0fe57c0116c4e8873670192fea845783687ca34890bd9358af9dd19a535774ab7dd667055cf6acd0d3913f044dcf2274e0ec092b33307250a74a'
7
- data.tar.gz: b8bcaeb7af0d27e836f21941663229a2750922914c4f31f4ffbd6e3c3876320f9ce92916eb9730e02227b87e8f244bc08e5bc38541bf4ef4e3485203fff01942
6
+ metadata.gz: 5841d4a70f1e852faa150f57ebfefc7b975de020782c41eebdad87a01d016be9bdf86f86173600632bf6f56300df0c9c4196251aa5df02a47ecd357ac844ef80
7
+ data.tar.gz: d6ae7c811e0e2c2008b912e29f86d1b99491c74cd878790dfd800811a007f0dbf9c49bb59db30345450ff82673381f2c036a84a57dc44a6f6751610d9be2ee88
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.2.6 (2022-07-13)
2
+
3
+ - Updated DataSketches to 3.5.0
4
+
5
+ ## 0.2.5 (2022-05-21)
6
+
7
+ - Updated DataSketches to 3.4.0
8
+
9
+ ## 0.2.4 (2021-12-28)
10
+
11
+ - Updated DataSketches to 3.3.0
12
+
1
13
  ## 0.2.3 (2021-09-29)
2
14
 
3
15
  - Updated DataSketches to 3.2.0
data/README.md CHANGED
@@ -1,15 +1,15 @@
1
- # DataSketches
1
+ # DataSketches Ruby
2
2
 
3
3
  [DataSketches](https://datasketches.apache.org/) - sketch data structures - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/datasketches/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches/actions)
5
+ [![Build Status](https://github.com/ankane/datasketches-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
9
9
  Add this line to your application’s Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'datasketches'
12
+ gem "datasketches"
13
13
  ```
14
14
 
15
15
  ## Sketch Families
@@ -292,22 +292,22 @@ This library is modeled after the DataSketches [Python API](https://github.com/a
292
292
 
293
293
  ## History
294
294
 
295
- View the [changelog](https://github.com/ankane/datasketches/blob/master/CHANGELOG.md)
295
+ View the [changelog](https://github.com/ankane/datasketches-ruby/blob/master/CHANGELOG.md)
296
296
 
297
297
  ## Contributing
298
298
 
299
299
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
300
300
 
301
- - [Report bugs](https://github.com/ankane/datasketches/issues)
302
- - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches/pulls)
301
+ - [Report bugs](https://github.com/ankane/datasketches-ruby/issues)
302
+ - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches-ruby/pulls)
303
303
  - Write, clarify, or fix documentation
304
304
  - Suggest or add new features
305
305
 
306
306
  To get started with development:
307
307
 
308
308
  ```sh
309
- git clone --recursive https://github.com/ankane/datasketches.git
310
- cd datasketches
309
+ git clone --recursive https://github.com/ankane/datasketches-ruby.git
310
+ cd datasketches-ruby
311
311
  bundle install
312
312
  bundle exec rake compile
313
313
  bundle exec rake test
@@ -33,7 +33,11 @@ void bind_kll_sketch(Rice::Module& m, const char* name) {
33
33
  .define_method("estimation_mode?", &kll_sketch<T>::is_estimation_mode)
34
34
  .define_method("min_value", &kll_sketch<T>::get_min_value)
35
35
  .define_method("max_value", &kll_sketch<T>::get_max_value)
36
- .define_method("quantile", &kll_sketch<T>::get_quantile)
36
+ .define_method(
37
+ "quantile",
38
+ [](kll_sketch<T>& self, double fraction) {
39
+ return self.get_quantile(fraction);
40
+ })
37
41
  .define_method(
38
42
  "quantiles",
39
43
  [](kll_sketch<T>& self, Rice::Object obj) {
@@ -51,12 +55,12 @@ void bind_kll_sketch(Rice::Module& m, const char* name) {
51
55
  })
52
56
  .define_method(
53
57
  "pmf",
54
- [](kll_sketch<T>& self, std::vector<T> split_points) {
58
+ [](kll_sketch<T>& self, const std::vector<T>& split_points) {
55
59
  return self.get_PMF(&split_points[0], split_points.size());
56
60
  })
57
61
  .define_method(
58
62
  "cdf",
59
- [](kll_sketch<T>& self, std::vector<T> split_points) {
63
+ [](kll_sketch<T>& self, const std::vector<T>& split_points) {
60
64
  return self.get_CDF(&split_points[0], split_points.size());
61
65
  })
62
66
  .define_method(
@@ -20,10 +20,26 @@ using Rice::Arg;
20
20
 
21
21
  void init_theta(Rice::Module& m) {
22
22
  Rice::define_class_under<theta_sketch>(m, "ThetaSketch")
23
- .define_method("empty?", &theta_sketch::is_empty)
24
- .define_method("estimate", &theta_sketch::get_estimate)
25
- .define_method("lower_bound", &theta_sketch::get_lower_bound)
26
- .define_method("upper_bound", &theta_sketch::get_upper_bound);
23
+ .define_method(
24
+ "empty?",
25
+ [](theta_sketch& self) {
26
+ return self.is_empty();
27
+ })
28
+ .define_method(
29
+ "estimate",
30
+ [](theta_sketch& self) {
31
+ return self.get_estimate();
32
+ })
33
+ .define_method(
34
+ "lower_bound",
35
+ [](theta_sketch& self, uint8_t num_std_devs) {
36
+ return self.get_lower_bound(num_std_devs);
37
+ })
38
+ .define_method(
39
+ "upper_bound",
40
+ [](theta_sketch& self, uint8_t num_std_devs) {
41
+ return self.get_upper_bound(num_std_devs);
42
+ });
27
43
 
28
44
  Rice::define_class_under<compact_theta_sketch, theta_sketch>(m, "CompactThetaSketch")
29
45
  .define_singleton_function(
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.6"
3
3
  end
@@ -15,9 +15,9 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- cmake_minimum_required(VERSION 3.12.0)
18
+ cmake_minimum_required(VERSION 3.16.0)
19
19
  project(DataSketches
20
- VERSION 0.12.0
20
+ VERSION 3.5.0
21
21
  LANGUAGES CXX)
22
22
 
23
23
  include(GNUInstallDirs)
@@ -106,12 +106,13 @@ add_subdirectory(theta)
106
106
  add_subdirectory(sampling)
107
107
  add_subdirectory(tuple)
108
108
  add_subdirectory(req)
109
+ add_subdirectory(quantiles)
109
110
 
110
111
  if (WITH_PYTHON)
111
112
  add_subdirectory(python)
112
113
  endif()
113
114
 
114
- target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling)
115
+ target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling req quantiles)
115
116
 
116
117
  if (COVERAGE)
117
118
  find_program(LCOV_PATH NAMES "lcov")
@@ -126,11 +127,30 @@ endif()
126
127
 
127
128
  # # Installation
128
129
  install(TARGETS datasketches
129
- EXPORT ${PROJCT_NAME}
130
+ EXPORT ${PROJECT_NAME}
130
131
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
131
132
  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
132
133
  )
133
134
 
135
+ # Packaging
136
+ include(CMakePackageConfigHelpers)
137
+ write_basic_package_version_file(
138
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
139
+ VERSION ${PROJECT_VERSION}
140
+ COMPATIBILITY SameMajorVersion
141
+ )
142
+ configure_package_config_file(
143
+ cmake/DataSketchesConfig.cmake.in
144
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
145
+ INSTALL_DESTINATION lib/DataSketches/cmake
146
+ PATH_VARS CMAKE_INSTALL_INCLUDEDIR
147
+ )
148
+ install(EXPORT ${PROJECT_NAME} DESTINATION lib/DataSketches/cmake)
149
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
150
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
151
+ DESTINATION lib/DataSketches/cmake)
152
+
153
+
134
154
  #set(CPACK_PROJECT_NAME ${PROJECT_NAME})
135
155
  #set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
136
- #include(CPack)
156
+ include(CPack)
@@ -9,11 +9,14 @@ global-exclude .git*
9
9
 
10
10
  recursive-include python/pybind11 *
11
11
 
12
+ graft cmake
12
13
  graft common
13
14
  graft cpc
14
15
  graft fi
15
16
  graft hll
16
17
  graft kll
18
+ graft req
17
19
  graft theta
20
+ graft tuple
18
21
  graft sampling
19
22
  graft python
@@ -1,11 +1,12 @@
1
- Apache DataSketches-cpp
2
- Copyright 2020-2021 The Apache Software Foundation
1
+ Apache DataSketches C++ and Python
2
+ Copyright 2022 The Apache Software Foundation
3
3
 
4
- Copyright 2015-2018 Yahoo
5
- Copyright 2019 Verizon Media
4
+ Copyright 2015-2018 Yahoo Inc.
5
+ Copyright 2019-2020 Verizon Media
6
+ Copyright 2021 Yahoo Inc.
6
7
 
7
8
  This product includes software developed at
8
9
  The Apache Software Foundation (http://www.apache.org/).
9
10
 
10
11
  Prior to moving to ASF, the software for this project was developed at
11
- Yahoo (now Verizon Media) (https://developer.yahoo.com).
12
+ Yahoo Inc. (https://developer.yahoo.com).
@@ -25,18 +25,85 @@ Installing the latest cmake on OSX: brew install cmake
25
25
  Building and running unit tests using cmake for OSX and Linux:
26
26
 
27
27
  ```
28
- $ cd build
29
- $ cmake ..
30
- $ make
31
- $ make test
28
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release
29
+ $ cmake --build build/Release -t all test
32
30
  ```
33
31
 
34
32
  Building and running unit tests using cmake for Windows from the command line:
35
33
 
36
34
  ```
37
- $ cd build
38
- $ cmake ..
39
- $ cd ..
40
- $ cmake --build build --config Release
41
- $ cmake --build build --config Release --target RUN_TESTS
35
+ $ cd build
36
+ $ cmake ..
37
+ $ cd ..
38
+ $ cmake --build build --config Release
39
+ $ cmake --build build --config Release --target RUN_TESTS
42
40
  ```
41
+
42
+ To install a local distribution (OSX and Linux), use the following command. The
43
+ CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it
44
+ defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below,
45
+ the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include,
46
+ /tmp/install/DataSketches/lib, etc)
47
+
48
+ ```
49
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches
50
+ $ cmake --build build/Release -t install
51
+ ```
52
+
53
+ To generate an installable package using cmake's built-in cpack packaging tool,
54
+ use the following command. The type of packaging is controlled by the CPACK_GENERATOR
55
+ variable (semi-colon separated list). Cmake usually supports packaging types such as RPM,
56
+ DEB, STGZ, TGZ, TZ, ZIP, etc.
57
+
58
+ ```
59
+ $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ"
60
+ $ cmake3 --build build/Release -t package
61
+ ```
62
+
63
+ The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways.
64
+ If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some
65
+ other way), then CMake's `find_package` command can be used like this:
66
+
67
+ ```
68
+ find_package(DataSketches 3.2 REQUIRED)
69
+ target_link_libraries(my_dependent_target PUBLIC ${DATASKETCHES_LIB})
70
+ ```
71
+
72
+ When used with find_package, DataSketches exports several variables, including
73
+
74
+ - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported.
75
+ - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files.
76
+ Because cmake automatically includes the interface directories for included target libraries when
77
+ using `target_link_libraries`, under normal circumstances there will be no need to include this directly.
78
+ - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling
79
+ in DataSketches should reference this with `target_link_libraries` in order to set up all the correct dependencies
80
+ and include paths.
81
+
82
+ If you don't have DataSketches installed locally, dependent projects can pull it directly
83
+ from GitHub using CMake's `ExternalProject` module. The code would look something like this:
84
+
85
+ ```
86
+ cmake_policy(SET CMP0097 NEW)
87
+ include(ExternalProject)
88
+ ExternalProject_Add(datasketches
89
+ GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
90
+ GIT_TAG 3.2.0
91
+ GIT_SHALLOW true
92
+ GIT_SUBMODULES ""
93
+ INSTALL_DIR /tmp/datasketches-prefix
94
+ CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix
95
+
96
+ # Override the install command to add DESTDIR
97
+ # This is necessary to work around an oddity in the RPM (but not other) package
98
+ # generation, as CMake otherwise picks up the Datasketch files when building
99
+ # an RPM for a dependent package. (RPM scans the directory for files in addition to installing
100
+ # those files referenced in an "install" rule in the cmake file)
101
+ INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install
102
+ )
103
+ ExternalProject_Get_property(datasketches INSTALL_DIR)
104
+ set(datasketches_INSTALL_DIR ${INSTALL_DIR})
105
+ message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
106
+ target_include_directories(my_dependent_target
107
+ PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
108
+ add_dependencies(my_dependent_target datasketches)
109
+ ```
@@ -0,0 +1,10 @@
1
+ set(DATASKETCHES_VERSION "@PROJECT_VERSION@")
2
+
3
+ @PACKAGE_INIT@
4
+
5
+ include("${CMAKE_CURRENT_LIST_DIR}/DataSketches.cmake")
6
+
7
+ set_and_check(DATASKETCHES_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/DataSketches")
8
+ set(DATASKETCHES_LIB "datasketches")
9
+
10
+ check_required_components("@PROJECT_NAME@")
@@ -29,17 +29,22 @@ target_include_directories(common
29
29
 
30
30
  target_compile_features(common INTERFACE cxx_std_11)
31
31
 
32
- target_sources(common
33
- INTERFACE
34
- ${CMAKE_CURRENT_SOURCE_DIR}/include/common_defs.hpp
35
- ${CMAKE_CURRENT_SOURCE_DIR}/include/memory_operations.hpp
36
- ${CMAKE_CURRENT_SOURCE_DIR}/include/MurmurHash3.h
37
- ${CMAKE_CURRENT_SOURCE_DIR}/include/serde.hpp
38
- ${CMAKE_CURRENT_SOURCE_DIR}/include/count_zeros.hpp
39
- ${CMAKE_CURRENT_SOURCE_DIR}/include/inv_pow2_table.hpp
40
- ${CMAKE_CURRENT_SOURCE_DIR}/include/binomial_bounds.hpp
41
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
42
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
43
- ${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
44
- )
32
+ install(TARGETS common EXPORT ${PROJECT_NAME})
45
33
 
34
+ install(FILES
35
+ include/common_defs.hpp
36
+ include/memory_operations.hpp
37
+ include/MurmurHash3.h
38
+ include/serde.hpp
39
+ include/count_zeros.hpp
40
+ include/inv_pow2_table.hpp
41
+ include/binomial_bounds.hpp
42
+ include/conditional_back_inserter.hpp
43
+ include/conditional_forward.hpp
44
+ include/ceiling_power_of_2.hpp
45
+ include/bounds_binomial_proportions.hpp
46
+ include/quantile_sketch_sorted_view.hpp
47
+ include/quantile_sketch_sorted_view_impl.hpp
48
+ include/kolmogorov_smirnov.hpp
49
+ include/kolmogorov_smirnov_impl.hpp
50
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <algorithm>
24
24
  #include <cmath>
25
+ #include <stdexcept>
25
26
 
26
27
  /*
27
28
  * This class enables the estimation of error bounds given a sample set size, the sampling
@@ -24,6 +24,8 @@
24
24
  #include <string>
25
25
  #include <memory>
26
26
  #include <iostream>
27
+ #include <random>
28
+ #include <chrono>
27
29
 
28
30
  namespace datasketches {
29
31
 
@@ -34,6 +36,18 @@ enum resize_factor { X1 = 0, X2, X4, X8 };
34
36
  template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
35
37
  template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
36
38
 
39
+ // random bit
40
+ static std::independent_bits_engine<std::mt19937, 1, uint32_t>
41
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
42
+
43
+ // common random declarations
44
+ namespace random_utils {
45
+ static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
46
+ static std::mt19937_64 rand(rd());
47
+ static std::uniform_real_distribution<> next_double(0.0, 1.0);
48
+ }
49
+
50
+
37
51
  // utility function to hide unused compiler warning
38
52
  // usually has no additional cost
39
53
  template<typename T> void unused(T&&...) {}
@@ -25,7 +25,8 @@ namespace datasketches {
25
25
  class kolmogorov_smirnov {
26
26
  public:
27
27
  /**
28
- * Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
28
+ * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
29
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
29
30
  * @param sketch1 KLL sketch 1
30
31
  * @param sketch2 KLL sketch 2
31
32
  * @return the raw delta between two KLL quantile sketches
@@ -37,6 +38,7 @@ public:
37
38
  * Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
38
39
  * Adjusts the computed threshold by the error epsilons of the two given sketches.
39
40
  * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
41
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
40
42
  * @param sketch1 KLL sketch 1
41
43
  * @param sketch2 KLL sketch 2
42
44
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
@@ -46,7 +48,8 @@ public:
46
48
  static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
47
49
 
48
50
  /**
49
- * Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
51
+ * Performs the Kolmogorov-Smirnov Test between two quantile sketches.
52
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
50
53
  * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
51
54
  * this will return false.
52
55
  * @param sketch1 KLL sketch 1
@@ -57,7 +60,6 @@ public:
57
60
  */
58
61
  template<typename Sketch>
59
62
  static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
60
-
61
63
  };
62
64
 
63
65
  } /* namespace datasketches */
@@ -20,39 +20,36 @@
20
20
  #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21
21
  #define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22
22
 
23
- namespace datasketches {
23
+ #include <cmath>
24
+ #include <algorithm>
24
25
 
25
- // type resolver
26
- template<typename T, typename C, typename S, typename A>
27
- kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
- return kll_quantile_calculator<T, C, A>(sketch);
29
- }
26
+ namespace datasketches {
30
27
 
31
28
  template<typename Sketch>
32
29
  double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
- using Comparator = typename Sketch::comparator;
34
- auto calc1 = make_quantile_calculator(sketch1);
35
- auto calc2 = make_quantile_calculator(sketch2);
36
- auto it1 = calc1.begin();
37
- auto it2 = calc2.begin();
30
+ auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
31
+ auto view1 = sketch1.get_sorted_view(true);
32
+ auto view2 = sketch2.get_sorted_view(true);
33
+ auto it1 = view1.begin();
34
+ auto it2 = view2.begin();
38
35
  const auto n1 = sketch1.get_n();
39
36
  const auto n2 = sketch2.get_n();
40
37
  double delta = 0;
41
- while (it1 != calc1.end() && it2 != calc2.end()) {
38
+ while (it1 != view1.end() && it2 != view2.end()) {
42
39
  const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
40
  const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
41
  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
- if (Comparator()((*it1).first, (*it2).first)) {
42
+ if (comparator((*it1).first, (*it2).first)) {
46
43
  ++it1;
47
- } else if (Comparator()((*it2).first, (*it1).first)) {
44
+ } else if (comparator((*it2).first, (*it1).first)) {
48
45
  ++it2;
49
46
  } else {
50
47
  ++it1;
51
48
  ++it2;
52
49
  }
53
50
  }
54
- const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
- const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
51
+ const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>((*it1).second) / n1;
52
+ const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
53
  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
54
  return delta;
58
55
  }
@@ -0,0 +1,121 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef QUANTILE_SKETCH_SORTED_VIEW_HPP_
21
+ #define QUANTILE_SKETCH_SORTED_VIEW_HPP_
22
+
23
+ #include <functional>
24
+
25
+ namespace datasketches {
26
+
27
// Reviewer annotations for this hunk (not part of the package contents).
// quantile_sketch_sorted_view holds a vector of (item, weight) entries plus a total
// weight, and declares operations to add ranges of items, convert weights to
// cumulative form, iterate, and look up a quantile by rank. The out-of-line members
// are only declared here; their definitions are not visible in this header.
// NOTE(review): this header includes only <functional> yet names std::vector,
// std::pair, std::conditional, std::allocator_traits, std::addressof, uint64_t and
// size_t -- presumably it relies on its includers for those headers; confirm.
+ template<
28
+ typename T,
29
+ typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
30
+ typename Allocator
31
+ >
32
+ class quantile_sketch_sorted_view {
33
+ public:
// Entry pairs an item with a uint64_t weight; arithmetic T is stored by value,
// any other T is stored as a pointer to const T.
34
+ using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
35
+ using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>; // Allocator rebound to allocate Entry
36
+ using Container = std::vector<Entry, AllocEntry>;
37
+
38
+ quantile_sketch_sorted_view(uint32_t num, const Allocator& allocator); // NOTE(review): `num` is presumably the expected entry count -- confirm in the impl
39
+
40
+ template<typename Iterator>
41
+ void add(Iterator begin, Iterator end, uint64_t weight); // takes a range of items and a single weight value
42
+
43
+ template<bool inclusive>
44
+ void convert_to_cummulative(); // NOTE(review): "cummulative" is misspelled; renaming would break existing callers
45
+
46
+ class const_iterator;
47
+ const_iterator begin() const;
48
+ const_iterator end() const;
49
+
50
+ size_t size() const;
51
+
52
+ // makes sense only with cumulative weight
// quantile_return_type is T by value for arithmetic T and const T& otherwise.
53
+ using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
54
+ quantile_return_type get_quantile(double rank) const;
55
+
56
+ private:
// deref_helper overloads turn an Entry's first member back into an item:
// a pointer is dereferenced, a value is returned unchanged.
57
+ static inline const T& deref_helper(const T* t) { return *t; }
58
+ static inline T deref_helper(T t) { return t; }
59
+
// Orders two entries by their items using a default-constructed Comparator.
// NOTE(review): because Comparator is default-constructed on each call, any state held
// by a particular comparator instance is not used here -- confirm this is intended.
60
+ struct compare_pairs_by_first {
61
+ bool operator()(const Entry& a, const Entry& b) const {
62
+ return Comparator()(deref_helper(a.first), deref_helper(b.first));
63
+ }
64
+ };
65
+
// Orders two entries by their weights (the .second member) using operator<.
66
+ struct compare_pairs_by_second {
67
+ bool operator()(const Entry& a, const Entry& b) const {
68
+ return a.second < b.second;
69
+ }
70
+ };
71
+
// ref_helper converts an item into what Entry stores; enable_if selects the overload:
// a copy for arithmetic T, the item's address otherwise.
72
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
73
+ static inline T ref_helper(const T& t) { return t; }
74
+
75
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
76
+ static inline const T* ref_helper(const T& t) { return std::addressof(t); }
77
+
// make_dummy_entry builds an Entry carrying only a weight: the item is 0 for
// arithmetic T and nullptr otherwise.
78
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
79
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(0, weight); }
80
+
81
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
82
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
83
+
84
+ uint64_t total_weight_;
85
+ Container entries_;
86
+ };
87
+
88
// Reviewer annotations for this hunk (not part of the package contents).
// const_iterator derives from the entry container's const_iterator and replaces
// operator* and operator-> so that, for non-arithmetic T, callers receive the item by
// reference rather than the stored pointer.
// NOTE(review): deriving from Container::const_iterator assumes that type is a class
// rather than a raw pointer -- confirm for every standard library this must build with.
+ template<typename T, typename C, typename A>
89
+ class quantile_sketch_sorted_view<T, C, A>::const_iterator: public quantile_sketch_sorted_view<T, C, A>::Container::const_iterator {
90
+ public:
91
+ using Base = typename quantile_sketch_sorted_view<T, C, A>::Container::const_iterator;
// value_type is the container's own entry type for arithmetic T, and a pair of
// (const T&, const uint64_t) otherwise.
92
+ using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
93
+
94
+ const_iterator(const Base& it): Base(it) {} // non-explicit, so a Base iterator converts implicitly
95
+
// Arithmetic T: return the underlying entry, copied into the by-value return.
96
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
97
+ value_type operator*() const { return Base::operator*(); }
98
+
// Non-arithmetic T: dereference the stored pointer and pair the item reference with the weight.
99
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
100
+ value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
101
+
// return_value_holder keeps a value_type by value so that operator-> has an object to
// point at when the pair is built on the fly.
102
+ class return_value_holder {
103
+ public:
104
+ return_value_holder(value_type value): value_(value) {}
105
+ const value_type* operator->() const { return &value_; }
106
+ private:
107
+ value_type value_;
108
+ };
109
+
// Arithmetic T: forward to the underlying iterator's operator->.
110
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
111
+ const value_type* operator->() const { return Base::operator->(); }
112
+
// Non-arithmetic T: wrap the result of operator* in a return_value_holder.
113
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
114
+ return_value_holder operator->() const { return **this; }
115
+ };
116
+
117
+ } /* namespace datasketches */
118
+
119
+ #include "quantile_sketch_sorted_view_impl.hpp"
120
+
121
+ #endif