datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea00e444de6dc1bebc2b8cf878a250f08717d55eaa55f63f6bec28f4be2af00d
4
- data.tar.gz: 161b9089e3b8d0dbd99cfb6cc0af463934c42ba85f4788a08306369966f28571
3
+ metadata.gz: cf1ea0f9f2d12b0e46c2d4c7dec21f41992e711e73eca68ea1ef03a4bb711077
4
+ data.tar.gz: 92f56b63da0254962be47d8d3e00a6950a271053bf3152167f95e6fdb99528e6
5
5
  SHA512:
6
- metadata.gz: '09eede1e6e4c0fe57c0116c4e8873670192fea845783687ca34890bd9358af9dd19a535774ab7dd667055cf6acd0d3913f044dcf2274e0ec092b33307250a74a'
7
- data.tar.gz: b8bcaeb7af0d27e836f21941663229a2750922914c4f31f4ffbd6e3c3876320f9ce92916eb9730e02227b87e8f244bc08e5bc38541bf4ef4e3485203fff01942
6
+ metadata.gz: 5841d4a70f1e852faa150f57ebfefc7b975de020782c41eebdad87a01d016be9bdf86f86173600632bf6f56300df0c9c4196251aa5df02a47ecd357ac844ef80
7
+ data.tar.gz: d6ae7c811e0e2c2008b912e29f86d1b99491c74cd878790dfd800811a007f0dbf9c49bb59db30345450ff82673381f2c036a84a57dc44a6f6751610d9be2ee88
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.2.6 (2022-07-13)
2
+
3
+ - Updated DataSketches to 3.5.0
4
+
5
+ ## 0.2.5 (2022-05-21)
6
+
7
+ - Updated DataSketches to 3.4.0
8
+
9
+ ## 0.2.4 (2021-12-28)
10
+
11
+ - Updated DataSketches to 3.3.0
12
+
1
13
  ## 0.2.3 (2021-09-29)
2
14
 
3
15
  - Updated DataSketches to 3.2.0
data/README.md CHANGED
@@ -1,15 +1,15 @@
1
- # DataSketches
1
+ # DataSketches Ruby
2
2
 
3
3
  [DataSketches](https://datasketches.apache.org/) - sketch data structures - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/datasketches/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches/actions)
5
+ [![Build Status](https://github.com/ankane/datasketches-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
9
9
  Add this line to your application’s Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'datasketches'
12
+ gem "datasketches"
13
13
  ```
14
14
 
15
15
  ## Sketch Families
@@ -292,22 +292,22 @@ This library is modeled after the DataSketches [Python API](https://github.com/a
292
292
 
293
293
  ## History
294
294
 
295
- View the [changelog](https://github.com/ankane/datasketches/blob/master/CHANGELOG.md)
295
+ View the [changelog](https://github.com/ankane/datasketches-ruby/blob/master/CHANGELOG.md)
296
296
 
297
297
  ## Contributing
298
298
 
299
299
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
300
300
 
301
- - [Report bugs](https://github.com/ankane/datasketches/issues)
302
- - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches/pulls)
301
+ - [Report bugs](https://github.com/ankane/datasketches-ruby/issues)
302
+ - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches-ruby/pulls)
303
303
  - Write, clarify, or fix documentation
304
304
  - Suggest or add new features
305
305
 
306
306
  To get started with development:
307
307
 
308
308
  ```sh
309
- git clone --recursive https://github.com/ankane/datasketches.git
310
- cd datasketches
309
+ git clone --recursive https://github.com/ankane/datasketches-ruby.git
310
+ cd datasketches-ruby
311
311
  bundle install
312
312
  bundle exec rake compile
313
313
  bundle exec rake test
@@ -33,7 +33,11 @@ void bind_kll_sketch(Rice::Module& m, const char* name) {
33
33
  .define_method("estimation_mode?", &kll_sketch<T>::is_estimation_mode)
34
34
  .define_method("min_value", &kll_sketch<T>::get_min_value)
35
35
  .define_method("max_value", &kll_sketch<T>::get_max_value)
36
- .define_method("quantile", &kll_sketch<T>::get_quantile)
36
+ .define_method(
37
+ "quantile",
38
+ [](kll_sketch<T>& self, double fraction) {
39
+ return self.get_quantile(fraction);
40
+ })
37
41
  .define_method(
38
42
  "quantiles",
39
43
  [](kll_sketch<T>& self, Rice::Object obj) {
@@ -51,12 +55,12 @@ void bind_kll_sketch(Rice::Module& m, const char* name) {
51
55
  })
52
56
  .define_method(
53
57
  "pmf",
54
- [](kll_sketch<T>& self, std::vector<T> split_points) {
58
+ [](kll_sketch<T>& self, const std::vector<T>& split_points) {
55
59
  return self.get_PMF(&split_points[0], split_points.size());
56
60
  })
57
61
  .define_method(
58
62
  "cdf",
59
- [](kll_sketch<T>& self, std::vector<T> split_points) {
63
+ [](kll_sketch<T>& self, const std::vector<T>& split_points) {
60
64
  return self.get_CDF(&split_points[0], split_points.size());
61
65
  })
62
66
  .define_method(
@@ -20,10 +20,26 @@ using Rice::Arg;
20
20
 
21
21
  void init_theta(Rice::Module& m) {
22
22
  Rice::define_class_under<theta_sketch>(m, "ThetaSketch")
23
- .define_method("empty?", &theta_sketch::is_empty)
24
- .define_method("estimate", &theta_sketch::get_estimate)
25
- .define_method("lower_bound", &theta_sketch::get_lower_bound)
26
- .define_method("upper_bound", &theta_sketch::get_upper_bound);
23
+ .define_method(
24
+ "empty?",
25
+ [](theta_sketch& self) {
26
+ return self.is_empty();
27
+ })
28
+ .define_method(
29
+ "estimate",
30
+ [](theta_sketch& self) {
31
+ return self.get_estimate();
32
+ })
33
+ .define_method(
34
+ "lower_bound",
35
+ [](theta_sketch& self, uint8_t num_std_devs) {
36
+ return self.get_lower_bound(num_std_devs);
37
+ })
38
+ .define_method(
39
+ "upper_bound",
40
+ [](theta_sketch& self, uint8_t num_std_devs) {
41
+ return self.get_upper_bound(num_std_devs);
42
+ });
27
43
 
28
44
  Rice::define_class_under<compact_theta_sketch, theta_sketch>(m, "CompactThetaSketch")
29
45
  .define_singleton_function(
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.6"
3
3
  end
@@ -15,9 +15,9 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- cmake_minimum_required(VERSION 3.12.0)
18
+ cmake_minimum_required(VERSION 3.16.0)
19
19
  project(DataSketches
20
- VERSION 0.12.0
20
+ VERSION 3.5.0
21
21
  LANGUAGES CXX)
22
22
 
23
23
  include(GNUInstallDirs)
@@ -106,12 +106,13 @@ add_subdirectory(theta)
106
106
  add_subdirectory(sampling)
107
107
  add_subdirectory(tuple)
108
108
  add_subdirectory(req)
109
+ add_subdirectory(quantiles)
109
110
 
110
111
  if (WITH_PYTHON)
111
112
  add_subdirectory(python)
112
113
  endif()
113
114
 
114
- target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling)
115
+ target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling req quantiles)
115
116
 
116
117
  if (COVERAGE)
117
118
  find_program(LCOV_PATH NAMES "lcov")
@@ -126,11 +127,30 @@ endif()
126
127
 
127
128
  # # Installation
128
129
  install(TARGETS datasketches
129
- EXPORT ${PROJCT_NAME}
130
+ EXPORT ${PROJECT_NAME}
130
131
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
131
132
  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
132
133
  )
133
134
 
135
+ # Packaging
136
+ include(CMakePackageConfigHelpers)
137
+ write_basic_package_version_file(
138
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
139
+ VERSION ${PROJECT_VERSION}
140
+ COMPATIBILITY SameMajorVersion
141
+ )
142
+ configure_package_config_file(
143
+ cmake/DataSketchesConfig.cmake.in
144
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
145
+ INSTALL_DESTINATION lib/DataSketches/cmake
146
+ PATH_VARS CMAKE_INSTALL_INCLUDEDIR
147
+ )
148
+ install(EXPORT ${PROJECT_NAME} DESTINATION lib/DataSketches/cmake)
149
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
150
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
151
+ DESTINATION lib/DataSketches/cmake)
152
+
153
+
134
154
  #set(CPACK_PROJECT_NAME ${PROJECT_NAME})
135
155
  #set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
136
- #include(CPack)
156
+ include(CPack)
@@ -9,11 +9,14 @@ global-exclude .git*
9
9
 
10
10
  recursive-include python/pybind11 *
11
11
 
12
+ graft cmake
12
13
  graft common
13
14
  graft cpc
14
15
  graft fi
15
16
  graft hll
16
17
  graft kll
18
+ graft req
17
19
  graft theta
20
+ graft tuple
18
21
  graft sampling
19
22
  graft python
@@ -1,11 +1,12 @@
1
- Apache DataSketches-cpp
2
- Copyright 2020-2021 The Apache Software Foundation
1
+ Apache DataSketches C++ and Python
2
+ Copyright 2022 The Apache Software Foundation
3
3
 
4
- Copyright 2015-2018 Yahoo
5
- Copyright 2019 Verizon Media
4
+ Copyright 2015-2018 Yahoo Inc.
5
+ Copyright 2019-2020 Verizon Media
6
+ Copyright 2021 Yahoo Inc.
6
7
 
7
8
  This product includes software developed at
8
9
  The Apache Software Foundation (http://www.apache.org/).
9
10
 
10
11
  Prior to moving to ASF, the software for this project was developed at
11
- Yahoo (now Verizon Media) (https://developer.yahoo.com).
12
+ Yahoo Inc. (https://developer.yahoo.com).
@@ -25,18 +25,85 @@ Installing the latest cmake on OSX: brew install cmake
25
25
  Building and running unit tests using cmake for OSX and Linux:
26
26
 
27
27
  ```
28
- $ cd build
29
- $ cmake ..
30
- $ make
31
- $ make test
28
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release
29
+ $ cmake --build build/Release -t all test
32
30
  ```
33
31
 
34
32
  Building and running unit tests using cmake for Windows from the command line:
35
33
 
36
34
  ```
37
- $ cd build
38
- $ cmake ..
39
- $ cd ..
40
- $ cmake --build build --config Release
41
- $ cmake --build build --config Release --target RUN_TESTS
35
+ $ cd build
36
+ $ cmake ..
37
+ $ cd ..
38
+ $ cmake --build build --config Release
39
+ $ cmake --build build --config Release --target RUN_TESTS
42
40
  ```
41
+
42
+ To install a local distribution (OSX and Linux), use the following command. The
43
+ CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it
44
+ defaults to installing in /usr/local (/usr/local/include, /usr/local/lib, etc). In the command below,
45
+ the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include,
46
+ /tmp/install/DataSketches/lib, etc)
47
+
48
+ ```
49
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches
50
+ $ cmake --build build/Release -t install
51
+ ```
52
+
53
+ To generate an installable package using cmake's built in cpack packaging tool,
54
+ use the following command. The type of packaging is controlled by the CPACK_GENERATOR
55
+ variable (semicolon-separated list). CMake usually supports packaging types such as RPM,
56
+ DEB, STGZ, TGZ, TZ, ZIP, etc.
57
+
58
+ ```
59
+ $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ"
60
+ $ cmake3 --build build/Release -t package
61
+ ```
62
+
63
+ The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways.
64
+ If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some
65
+ other way), then CMake's `find_package` command can be used like this:
66
+
67
+ ```
68
+ find_package(DataSketches 3.2 REQUIRED)
69
+ target_link_libraries(my_dependent_target PUBLIC ${DATASKETCHES_LIB})
70
+ ```
71
+
72
+ When used with find_package, DataSketches exports several variables, including
73
+
74
+ - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported.
75
+ - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files.
76
+ Because cmake automatically includes the interface directories for included target libraries when
77
+ using `target_link_libraries`, under normal circumstances there will be no need to include this directly.
78
+ - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling
79
+ in DataSketches should reference this with `target_link_libraries` in order to set up all the correct dependencies
80
+ and include paths.
81
+
82
+ If you don't have DataSketches installed locally, dependent projects can pull it directly
83
+ from GitHub using CMake's `ExternalProject` module. The code would look something like this:
84
+
85
+ ```
86
+ cmake_policy(SET CMP0097 NEW)
87
+ include(ExternalProject)
88
+ ExternalProject_Add(datasketches
89
+ GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
90
+ GIT_TAG 3.2.0
91
+ GIT_SHALLOW true
92
+ GIT_SUBMODULES ""
93
+ INSTALL_DIR /tmp/datasketches-prefix
94
+ CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix
95
+
96
+ # Override the install command to add DESTDIR
97
+ # This is necessary to work around an oddity in the RPM (but not other) package
98
+ # generation, as CMake otherwise picks up the Datasketch files when building
99
+ # an RPM for a dependent package. (RPM scans the directory for files in addition to installing
100
+ # those files referenced in an "install" rule in the cmake file)
101
+ INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install
102
+ )
103
+ ExternalProject_Get_property(datasketches INSTALL_DIR)
104
+ set(datasketches_INSTALL_DIR ${INSTALL_DIR})
105
+ message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
106
+ target_include_directories(my_dependent_target
107
+ PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
108
+ add_dependencies(my_dependent_target datasketches)
109
+ ```
@@ -0,0 +1,10 @@
1
+ set(DATASKETCHES_VERSION "@PROJECT_VERSION@")
2
+
3
+ @PACKAGE_INIT@
4
+
5
+ include("${CMAKE_CURRENT_LIST_DIR}/DataSketches.cmake")
6
+
7
+ set_and_check(DATASKETCHES_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/DataSketches")
8
+ set(DATASKETCHES_LIB "datasketches")
9
+
10
+ check_required_components("@PROJECT_NAME@")
@@ -29,17 +29,22 @@ target_include_directories(common
29
29
 
30
30
  target_compile_features(common INTERFACE cxx_std_11)
31
31
 
32
- target_sources(common
33
- INTERFACE
34
- ${CMAKE_CURRENT_SOURCE_DIR}/include/common_defs.hpp
35
- ${CMAKE_CURRENT_SOURCE_DIR}/include/memory_operations.hpp
36
- ${CMAKE_CURRENT_SOURCE_DIR}/include/MurmurHash3.h
37
- ${CMAKE_CURRENT_SOURCE_DIR}/include/serde.hpp
38
- ${CMAKE_CURRENT_SOURCE_DIR}/include/count_zeros.hpp
39
- ${CMAKE_CURRENT_SOURCE_DIR}/include/inv_pow2_table.hpp
40
- ${CMAKE_CURRENT_SOURCE_DIR}/include/binomial_bounds.hpp
41
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
42
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
43
- ${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
44
- )
32
+ install(TARGETS common EXPORT ${PROJECT_NAME})
45
33
 
34
+ install(FILES
35
+ include/common_defs.hpp
36
+ include/memory_operations.hpp
37
+ include/MurmurHash3.h
38
+ include/serde.hpp
39
+ include/count_zeros.hpp
40
+ include/inv_pow2_table.hpp
41
+ include/binomial_bounds.hpp
42
+ include/conditional_back_inserter.hpp
43
+ include/conditional_forward.hpp
44
+ include/ceiling_power_of_2.hpp
45
+ include/bounds_binomial_proportions.hpp
46
+ include/quantile_sketch_sorted_view.hpp
47
+ include/quantile_sketch_sorted_view_impl.hpp
48
+ include/kolmogorov_smirnov.hpp
49
+ include/kolmogorov_smirnov_impl.hpp
50
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <algorithm>
24
24
  #include <cmath>
25
+ #include <stdexcept>
25
26
 
26
27
  /*
27
28
  * This class enables the estimation of error bounds given a sample set size, the sampling
@@ -24,6 +24,8 @@
24
24
  #include <string>
25
25
  #include <memory>
26
26
  #include <iostream>
27
+ #include <random>
28
+ #include <chrono>
27
29
 
28
30
  namespace datasketches {
29
31
 
@@ -34,6 +36,18 @@ enum resize_factor { X1 = 0, X2, X4, X8 };
34
36
  template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
35
37
  template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
36
38
 
39
+ // random bit
40
+ static std::independent_bits_engine<std::mt19937, 1, uint32_t>
41
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
42
+
43
+ // common random declarations
44
+ namespace random_utils {
45
+ static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
46
+ static std::mt19937_64 rand(rd());
47
+ static std::uniform_real_distribution<> next_double(0.0, 1.0);
48
+ }
49
+
50
+
37
51
  // utility function to hide unused compiler warning
38
52
  // usually has no additional cost
39
53
  template<typename T> void unused(T&&...) {}
@@ -25,7 +25,8 @@ namespace datasketches {
25
25
  class kolmogorov_smirnov {
26
26
  public:
27
27
  /**
28
- * Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
28
+ * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
29
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
29
30
  * @param sketch1 KLL sketch 1
30
31
  * @param sketch2 KLL sketch 2
31
32
  * @return the raw delta between two KLL quantile sketches
@@ -37,6 +38,7 @@ public:
37
38
  * Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
38
39
  * Adjusts the computed threshold by the error epsilons of the two given sketches.
39
40
  * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
41
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
40
42
  * @param sketch1 KLL sketch 1
41
43
  * @param sketch2 KLL sketch 2
42
44
  * @param p Target p-value. Typically .001 to .1, e.g., .05.
@@ -46,7 +48,8 @@ public:
46
48
  static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
47
49
 
48
50
  /**
49
- * Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
51
+ * Performs the Kolmogorov-Smirnov Test between two quantile sketches.
52
+ * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
50
53
  * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
51
54
  * this will return false.
52
55
  * @param sketch1 KLL sketch 1
@@ -57,7 +60,6 @@ public:
57
60
  */
58
61
  template<typename Sketch>
59
62
  static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
60
-
61
63
  };
62
64
 
63
65
  } /* namespace datasketches */
@@ -20,39 +20,36 @@
20
20
  #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21
21
  #define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22
22
 
23
- namespace datasketches {
23
+ #include <cmath>
24
+ #include <algorithm>
24
25
 
25
- // type resolver
26
- template<typename T, typename C, typename S, typename A>
27
- kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
- return kll_quantile_calculator<T, C, A>(sketch);
29
- }
26
+ namespace datasketches {
30
27
 
31
28
  template<typename Sketch>
32
29
  double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
- using Comparator = typename Sketch::comparator;
34
- auto calc1 = make_quantile_calculator(sketch1);
35
- auto calc2 = make_quantile_calculator(sketch2);
36
- auto it1 = calc1.begin();
37
- auto it2 = calc2.begin();
30
+ auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
31
+ auto view1 = sketch1.get_sorted_view(true);
32
+ auto view2 = sketch2.get_sorted_view(true);
33
+ auto it1 = view1.begin();
34
+ auto it2 = view2.begin();
38
35
  const auto n1 = sketch1.get_n();
39
36
  const auto n2 = sketch2.get_n();
40
37
  double delta = 0;
41
- while (it1 != calc1.end() && it2 != calc2.end()) {
38
+ while (it1 != view1.end() && it2 != view2.end()) {
42
39
  const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
40
  const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
41
  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
- if (Comparator()((*it1).first, (*it2).first)) {
42
+ if (comparator((*it1).first, (*it2).first)) {
46
43
  ++it1;
47
- } else if (Comparator()((*it2).first, (*it1).first)) {
44
+ } else if (comparator((*it2).first, (*it1).first)) {
48
45
  ++it2;
49
46
  } else {
50
47
  ++it1;
51
48
  ++it2;
52
49
  }
53
50
  }
54
- const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
- const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
51
+ const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>((*it1).second) / n1;
52
+ const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
53
  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
54
  return delta;
58
55
  }
@@ -0,0 +1,121 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef QUANTILE_SKETCH_SORTED_VIEW_HPP_
21
+ #define QUANTILE_SKETCH_SORTED_VIEW_HPP_
22
+
23
+ #include <functional>
24
+
25
+ namespace datasketches {
26
+
27
+ template<
28
+ typename T,
29
+ typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
30
+ typename Allocator
31
+ >
32
+ class quantile_sketch_sorted_view {
33
+ public:
34
+ using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
35
+ using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
36
+ using Container = std::vector<Entry, AllocEntry>;
37
+
38
+ quantile_sketch_sorted_view(uint32_t num, const Allocator& allocator);
39
+
40
+ template<typename Iterator>
41
+ void add(Iterator begin, Iterator end, uint64_t weight);
42
+
43
+ template<bool inclusive>
44
+ void convert_to_cummulative();
45
+
46
+ class const_iterator;
47
+ const_iterator begin() const;
48
+ const_iterator end() const;
49
+
50
+ size_t size() const;
51
+
52
+ // makes sense only with cumulative weight
53
+ using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
54
+ quantile_return_type get_quantile(double rank) const;
55
+
56
+ private:
57
+ static inline const T& deref_helper(const T* t) { return *t; }
58
+ static inline T deref_helper(T t) { return t; }
59
+
60
+ struct compare_pairs_by_first {
61
+ bool operator()(const Entry& a, const Entry& b) const {
62
+ return Comparator()(deref_helper(a.first), deref_helper(b.first));
63
+ }
64
+ };
65
+
66
+ struct compare_pairs_by_second {
67
+ bool operator()(const Entry& a, const Entry& b) const {
68
+ return a.second < b.second;
69
+ }
70
+ };
71
+
72
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
73
+ static inline T ref_helper(const T& t) { return t; }
74
+
75
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
76
+ static inline const T* ref_helper(const T& t) { return std::addressof(t); }
77
+
78
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
79
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(0, weight); }
80
+
81
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
82
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
83
+
84
+ uint64_t total_weight_;
85
+ Container entries_;
86
+ };
87
+
88
+ template<typename T, typename C, typename A>
89
+ class quantile_sketch_sorted_view<T, C, A>::const_iterator: public quantile_sketch_sorted_view<T, C, A>::Container::const_iterator {
90
+ public:
91
+ using Base = typename quantile_sketch_sorted_view<T, C, A>::Container::const_iterator;
92
+ using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
93
+
94
+ const_iterator(const Base& it): Base(it) {}
95
+
96
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
97
+ value_type operator*() const { return Base::operator*(); }
98
+
99
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
100
+ value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
101
+
102
+ class return_value_holder {
103
+ public:
104
+ return_value_holder(value_type value): value_(value) {}
105
+ const value_type* operator->() const { return &value_; }
106
+ private:
107
+ value_type value_;
108
+ };
109
+
110
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
111
+ const value_type* operator->() const { return Base::operator->(); }
112
+
113
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
114
+ return_value_holder operator->() const { return **this; }
115
+ };
116
+
117
+ } /* namespace datasketches */
118
+
119
+ #include "quantile_sketch_sorted_view_impl.hpp"
120
+
121
+ #endif