datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e62d3e98b646fb074ae5bccf506a3ec9659bfde99d98721e2e3d7377ea68a3ce
4
- data.tar.gz: ef7a1d11de9801e0d40d0071a46f695555a026a54c8e31d23624c01045faafe6
3
+ metadata.gz: 897dbc30f97ce17f0415630b6347a0092dac05196b0ef61e80939410d65cdf17
4
+ data.tar.gz: 61302f9cadde8a8badc97b455eb5c32d913c3b1fea8ed571e2da93a29e65afa9
5
5
  SHA512:
6
- metadata.gz: 8808ddf87374a66de48f0434f22b8813cd0170d40ce1dc518a480235f6bd359322fb1e354e224355911f5569a32dce99457515dfd6a8a591223ea6a1e834dfb8
7
- data.tar.gz: 5fdeee4c5b69a495d90242e2a3e6157fbca0931bebeb77ba025840c15f14e571c9a672b527082324b2e72e17ae6e0d301b7978cf21be6ae77766cd1b4f18c3d0
6
+ metadata.gz: 4d541ba7f96a86f3f8de44f069f6e39d51ba6f28fa5d8c8d1d99a8434a95c5fe1a26470e6b062f348808fe5c0a444134d0dc96385437b4cb946c4a92044a2a5c
7
+ data.tar.gz: bc1bdacb7cbe69f9bb1382fd2ac7019bec04baf444dc963d63a594e989fd201d9eb9aadd0e463ac4efef8f7ba53915a594d8fb00f74ae295674b9024269a0406
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.2.4 (2021-12-28)
2
+
3
+ - Updated DataSketches to 3.3.0
4
+
5
+ ## 0.2.3 (2021-09-29)
6
+
7
+ - Updated DataSketches to 3.2.0
8
+
9
+ ## 0.2.2 (2021-07-17)
10
+
11
+ - Updated DataSketches to 3.1.0
12
+
13
+ ## 0.2.1 (2021-05-23)
14
+
15
+ - Improved performance
16
+
1
17
  ## 0.2.0 (2021-05-17)
2
18
 
3
19
  - Updated DataSketches to 3.0.0
data/LICENSE CHANGED
@@ -284,11 +284,48 @@ APPENDIX B: Additional licenses relevant to this product.
284
284
  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
285
285
  DEALINGS IN THE SOFTWARE.
286
286
  -------------------------------------------------------------
287
- Code Locations
287
+ Code Locations:
288
288
  * https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
289
289
  that is adapted from the above.
290
290
 
291
291
 
292
+ =============================================================
293
+ BSD License
294
+ =============================================================
295
+ Original source code:
296
+ https://github.com/pybind/pybind11/blob/master/LICENSE
297
+
298
+ Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
299
+
300
+ Redistribution and use in source and binary forms, with or without
301
+ modification, are permitted provided that the following conditions are met:
302
+
303
+ 1. Redistributions of source code must retain the above copyright notice, this
304
+ list of conditions and the following disclaimer.
305
+
306
+ 2. Redistributions in binary form must reproduce the above copyright notice,
307
+ this list of conditions and the following disclaimer in the documentation
308
+ and/or other materials provided with the distribution.
309
+
310
+ 3. Neither the name of the copyright holder nor the names of its contributors
311
+ may be used to endorse or promote products derived from this software
312
+ without specific prior written permission.
313
+
314
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
315
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
316
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
317
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
318
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
319
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
320
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
321
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
322
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
323
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
324
+ -------------------------------------------------------------
325
+ Code Locations:
326
+ Found only in the convenience binaries distributed from PyPI, which rely
327
+ on pybind11 code during compilation.
328
+
292
329
 
293
330
  =============================================================
294
331
  Public Domain
@@ -297,7 +334,7 @@ APPENDIX B: Additional licenses relevant to this product.
297
334
  https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
298
335
  Placed in the Public Domain by Austin Appleby
299
336
 
300
- Code Locations
337
+ Code Locations:
301
338
  common/include/MurmurHash3.h
302
339
  that is adapted from the above.
303
340
  -------------------------------------------------------------
@@ -305,6 +342,6 @@ APPENDIX B: Additional licenses relevant to this product.
305
342
  * https://graphics.stanford.edu/~seander/bithacks.html
306
343
  * Placed in the Public Domain by Sean Eron Anderson
307
344
 
308
- Code Locations
345
+ Code Locations:
309
346
  * common/include/ceiling_power_of_2.hpp
310
347
  that is adapted from the above.
data/NOTICE CHANGED
@@ -1,5 +1,5 @@
1
1
  Apache DataSketches-cpp
2
- Copyright 2020 The Apache Software Foundation
2
+ Copyright 2020-2021 The Apache Software Foundation
3
3
 
4
4
  Copyright 2015-2018 Yahoo
5
5
  Copyright 2019 Verizon Media
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
- # DataSketches
1
+ # DataSketches Ruby
2
2
 
3
3
  [DataSketches](https://datasketches.apache.org/) - sketch data structures - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/datasketches/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches/actions)
5
+ [![Build Status](https://github.com/ankane/datasketches-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/datasketches-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -292,22 +292,22 @@ This library is modeled after the DataSketches [Python API](https://github.com/a
292
292
 
293
293
  ## History
294
294
 
295
- View the [changelog](https://github.com/ankane/datasketches/blob/master/CHANGELOG.md)
295
+ View the [changelog](https://github.com/ankane/datasketches-ruby/blob/master/CHANGELOG.md)
296
296
 
297
297
  ## Contributing
298
298
 
299
299
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
300
300
 
301
- - [Report bugs](https://github.com/ankane/datasketches/issues)
302
- - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches/pulls)
301
+ - [Report bugs](https://github.com/ankane/datasketches-ruby/issues)
302
+ - Fix bugs and [submit pull requests](https://github.com/ankane/datasketches-ruby/pulls)
303
303
  - Write, clarify, or fix documentation
304
304
  - Suggest or add new features
305
305
 
306
306
  To get started with development:
307
307
 
308
308
  ```sh
309
- git clone --recursive https://github.com/ankane/datasketches.git
310
- cd datasketches
309
+ git clone --recursive https://github.com/ankane/datasketches-ruby.git
310
+ cd datasketches-ruby
311
311
  bundle install
312
312
  bundle exec rake compile
313
313
  bundle exec rake test
@@ -1,6 +1,6 @@
1
1
  require "mkmf-rice"
2
2
 
3
- $CXXFLAGS += " -std=c++17"
3
+ $CXXFLAGS += " -std=c++17 $(optflags)"
4
4
 
5
5
  ext = File.expand_path(".", __dir__)
6
6
  datasketches = File.expand_path("../../vendor/datasketches-cpp", __dir__)
@@ -20,10 +20,26 @@ using Rice::Arg;
20
20
 
21
21
  void init_theta(Rice::Module& m) {
22
22
  Rice::define_class_under<theta_sketch>(m, "ThetaSketch")
23
- .define_method("empty?", &theta_sketch::is_empty)
24
- .define_method("estimate", &theta_sketch::get_estimate)
25
- .define_method("lower_bound", &theta_sketch::get_lower_bound)
26
- .define_method("upper_bound", &theta_sketch::get_upper_bound);
23
+ .define_method(
24
+ "empty?",
25
+ [](theta_sketch& self) {
26
+ return self.is_empty();
27
+ })
28
+ .define_method(
29
+ "estimate",
30
+ [](theta_sketch& self) {
31
+ return self.get_estimate();
32
+ })
33
+ .define_method(
34
+ "lower_bound",
35
+ [](theta_sketch& self, uint8_t num_std_devs) {
36
+ return self.get_lower_bound(num_std_devs);
37
+ })
38
+ .define_method(
39
+ "upper_bound",
40
+ [](theta_sketch& self, uint8_t num_std_devs) {
41
+ return self.get_upper_bound(num_std_devs);
42
+ });
27
43
 
28
44
  Rice::define_class_under<compact_theta_sketch, theta_sketch>(m, "CompactThetaSketch")
29
45
  .define_singleton_function(
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.4"
3
3
  end
@@ -17,7 +17,7 @@
17
17
 
18
18
  cmake_minimum_required(VERSION 3.12.0)
19
19
  project(DataSketches
20
- VERSION 0.12.0
20
+ VERSION 3.2.0
21
21
  LANGUAGES CXX)
22
22
 
23
23
  include(GNUInstallDirs)
@@ -35,6 +35,8 @@ set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
35
35
  #set(CMAKE_VERBOSE_MAKEFILE ON)
36
36
  set(CMAKE_MACOSX_RPATH ON)
37
37
 
38
+ set(CMAKE_CXX_STANDARD 11)
39
+
38
40
  # enable compiler warnings globally
39
41
  # derived from https://foonathan.net/blog/2018/10/17/cmake-warnings.html
40
42
  # and https://arne-mertz.de/2018/07/cmake-properties-options/
@@ -70,6 +72,13 @@ if(COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
70
72
  add_link_options(--coverage)
71
73
  endif()
72
74
 
75
+ option(SANITIZE "Run sanitization checks (g++/clang only)" OFF)
76
+ if(SANITIZE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
77
+ add_compile_options(-fsanitize=${SANITIZE})
78
+ add_link_options(-fsanitize=${SANITIZE})
79
+ endif()
80
+
81
+
73
82
  # set default build type to Release
74
83
  # Derived from: https://blog.kitware.com/cmake-and-the-default-build-type/
75
84
  set(default_build_type "Release")
@@ -117,11 +126,30 @@ endif()
117
126
 
118
127
  # # Installation
119
128
  install(TARGETS datasketches
120
- EXPORT ${PROJCT_NAME}
129
+ EXPORT ${PROJECT_NAME}
121
130
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
122
131
  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
123
132
  )
124
133
 
134
+ # Packaging
135
+ include(CMakePackageConfigHelpers)
136
+ write_basic_package_version_file(
137
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
138
+ VERSION ${PROJECT_VERSION}
139
+ COMPATIBILITY SameMajorVersion
140
+ )
141
+ configure_package_config_file(
142
+ cmake/DataSketchesConfig.cmake.in
143
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
144
+ INSTALL_DESTINATION lib/DataSketches/cmake
145
+ PATH_VARS CMAKE_INSTALL_INCLUDEDIR
146
+ )
147
+ install(EXPORT ${PROJECT_NAME} DESTINATION lib/DataSketches/cmake)
148
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfigVersion.cmake"
149
+ "${CMAKE_CURRENT_BINARY_DIR}/DataSketchesConfig.cmake"
150
+ DESTINATION lib/DataSketches/cmake)
151
+
152
+
125
153
  #set(CPACK_PROJECT_NAME ${PROJECT_NAME})
126
154
  #set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
127
- #include(CPack)
155
+ include(CPack)
@@ -284,11 +284,48 @@ APPENDIX B: Additional licenses relevant to this product.
284
284
  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
285
285
  DEALINGS IN THE SOFTWARE.
286
286
  -------------------------------------------------------------
287
- Code Locations
287
+ Code Locations:
288
288
  * https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
289
289
  that is adapted from the above.
290
290
 
291
291
 
292
+ =============================================================
293
+ BSD License
294
+ =============================================================
295
+ Original source code:
296
+ https://github.com/pybind/pybind11/blob/master/LICENSE
297
+
298
+ Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
299
+
300
+ Redistribution and use in source and binary forms, with or without
301
+ modification, are permitted provided that the following conditions are met:
302
+
303
+ 1. Redistributions of source code must retain the above copyright notice, this
304
+ list of conditions and the following disclaimer.
305
+
306
+ 2. Redistributions in binary form must reproduce the above copyright notice,
307
+ this list of conditions and the following disclaimer in the documentation
308
+ and/or other materials provided with the distribution.
309
+
310
+ 3. Neither the name of the copyright holder nor the names of its contributors
311
+ may be used to endorse or promote products derived from this software
312
+ without specific prior written permission.
313
+
314
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
315
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
316
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
317
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
318
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
319
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
320
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
321
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
322
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
323
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
324
+ -------------------------------------------------------------
325
+ Code Locations:
326
+ Found only in the convenience binaries distributed from PyPI, which rely
327
+ on pybind11 code during compilation.
328
+
292
329
 
293
330
  =============================================================
294
331
  Public Domain
@@ -297,7 +334,7 @@ APPENDIX B: Additional licenses relevant to this product.
297
334
  https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
298
335
  Placed in the Public Domain by Austin Appleby
299
336
 
300
- Code Locations
337
+ Code Locations:
301
338
  common/include/MurmurHash3.h
302
339
  that is adapted from the above.
303
340
  -------------------------------------------------------------
@@ -305,7 +342,7 @@ APPENDIX B: Additional licenses relevant to this product.
305
342
  * https://graphics.stanford.edu/~seander/bithacks.html
306
343
  * Placed in the Public Domain by Sean Eron Anderson
307
344
 
308
- Code Locations
345
+ Code Locations:
309
346
  * common/include/ceiling_power_of_2.hpp
310
347
  that is adapted from the above.
311
348
 
@@ -9,11 +9,14 @@ global-exclude .git*
9
9
 
10
10
  recursive-include python/pybind11 *
11
11
 
12
+ graft cmake
12
13
  graft common
13
14
  graft cpc
14
15
  graft fi
15
16
  graft hll
16
17
  graft kll
18
+ graft req
17
19
  graft theta
20
+ graft tuple
18
21
  graft sampling
19
22
  graft python
@@ -1,5 +1,5 @@
1
1
  Apache DataSketches-cpp
2
- Copyright 2020 The Apache Software Foundation
2
+ Copyright 2020-2021 The Apache Software Foundation
3
3
 
4
4
  Copyright 2015-2018 Yahoo
5
5
  Copyright 2019 Verizon Media
@@ -25,18 +25,85 @@ Installing the latest cmake on OSX: brew install cmake
25
25
  Building and running unit tests using cmake for OSX and Linux:
26
26
 
27
27
  ```
28
- $ cd build
29
- $ cmake ..
30
- $ make
31
- $ make test
28
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release
29
+ $ cmake --build build/Release -t all test
32
30
  ```
33
31
 
34
32
  Building and running unit tests using cmake for Windows from the command line:
35
33
 
36
34
  ```
37
- $ cd build
38
- $ cmake ..
39
- $ cd ..
40
- $ cmake --build build --config Release
41
- $ cmake --build build --config Release --target RUN_TESTS
35
+ $ cd build
36
+ $ cmake ..
37
+ $ cd ..
38
+ $ cmake --build build --config Release
39
+ $ cmake --build build --config Release --target RUN_TESTS
42
40
  ```
41
+
42
+ To install a local distribution (OSX and Linux), use the following command. The
43
+ CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it
44
+ defaults to installing in /usr/local (/usr/local/include, /usr/local/lib, etc). In the command below,
45
+ the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include,
46
+ /tmp/install/DataSketches/lib, etc)
47
+
48
+ ```
49
+ $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches
50
+ $ cmake --build build/Release -t install
51
+ ```
52
+
53
+ To generate an installable package using cmake's built-in cpack packaging tool,
54
+ use the following command. The type of packaging is controlled by the CPACK_GENERATOR
55
+ variable (semi-colon separated list). Cmake usually supports packaging types such as RPM,
56
+ DEB, STGZ, TGZ, TZ, ZIP, etc.
57
+
58
+ ```
59
+ $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ"
60
+ $ cmake3 --build build/Release -t package
61
+ ```
62
+
63
+ The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways.
64
+ If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some
65
+ other way), then CMake's `find_package` command can be used like this:
66
+
67
+ ```
68
+ find_package(DataSketches 3.2 REQUIRED)
69
+ target_link_libraries(my_dependent_target PUBLIC ${DATASKETCHES_LIB})
70
+ ```
71
+
72
+ When used with find_package, DataSketches exports several variables, including
73
+
74
+ - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported.
75
+ - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files.
76
+ Because cmake automatically includes the interface directories for included target libraries when
77
+ using `target_link_libraries`, under normal circumstances there will be no need to include this directly.
78
+ - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling
79
+ in DataSketches should reference this with `target_link_libraries` in order to set up all the correct dependencies
80
+ and include paths.
81
+
82
+ If you don't have DataSketches installed locally, dependent projects can pull it directly
83
+ from GitHub using CMake's `ExternalProject` module. The code would look something like this:
84
+
85
+ ```
86
+ cmake_policy(SET CMP0097 NEW)
87
+ include(ExternalProject)
88
+ ExternalProject_Add(datasketches
89
+ GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
90
+ GIT_TAG 3.2.0
91
+ GIT_SHALLOW true
92
+ GIT_SUBMODULES ""
93
+ INSTALL_DIR /tmp/datasketches-prefix
94
+ CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix
95
+
96
+ # Override the install command to add DESTDIR
97
+ # This is necessary to work around an oddity in the RPM (but not other) package
98
+ # generation, as CMake otherwise picks up the Datasketch files when building
99
+ # an RPM for a dependent package. (RPM scans the directory for files in addition to installing
100
+ # those files referenced in an "install" rule in the cmake file)
101
+ INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install
102
+ )
103
+ ExternalProject_Get_property(datasketches INSTALL_DIR)
104
+ set(datasketches_INSTALL_DIR ${INSTALL_DIR})
105
+ message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
106
+ target_include_directories(my_dependent_target
107
+ PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
108
+ add_dependencies(my_dependent_target datasketches)
109
+ ```
@@ -0,0 +1,10 @@
1
+ set(DATASKETCHES_VERSION "@PROJECT_VERSION@")
2
+
3
+ @PACKAGE_INIT@
4
+
5
+ include("${CMAKE_CURRENT_LIST_DIR}/DataSketches.cmake")
6
+
7
+ set_and_check(DATASKETCHES_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/DataSketches")
8
+ set(DATASKETCHES_LIB "datasketches")
9
+
10
+ check_required_components("@PROJECT_NAME@")
@@ -29,17 +29,18 @@ target_include_directories(common
29
29
 
30
30
  target_compile_features(common INTERFACE cxx_std_11)
31
31
 
32
- target_sources(common
33
- INTERFACE
34
- ${CMAKE_CURRENT_SOURCE_DIR}/include/common_defs.hpp
35
- ${CMAKE_CURRENT_SOURCE_DIR}/include/memory_operations.hpp
36
- ${CMAKE_CURRENT_SOURCE_DIR}/include/MurmurHash3.h
37
- ${CMAKE_CURRENT_SOURCE_DIR}/include/serde.hpp
38
- ${CMAKE_CURRENT_SOURCE_DIR}/include/count_zeros.hpp
39
- ${CMAKE_CURRENT_SOURCE_DIR}/include/inv_pow2_table.hpp
40
- ${CMAKE_CURRENT_SOURCE_DIR}/include/binomial_bounds.hpp
41
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
42
- ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
43
- ${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
44
- )
32
+ install(TARGETS common EXPORT ${PROJECT_NAME})
45
33
 
34
+ install(FILES
35
+ include/common_defs.hpp
36
+ include/memory_operations.hpp
37
+ include/MurmurHash3.h
38
+ include/serde.hpp
39
+ include/count_zeros.hpp
40
+ include/inv_pow2_table.hpp
41
+ include/binomial_bounds.hpp
42
+ include/conditional_back_inserter.hpp
43
+ include/conditional_forward.hpp
44
+ include/ceiling_power_of_2.hpp
45
+ include/bounds_binomial_proportions.hpp
46
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -16,6 +16,8 @@
16
16
  #ifndef _MURMURHASH3_H_
17
17
  #define _MURMURHASH3_H_
18
18
 
19
+ #include <cstring>
20
+
19
21
  //-----------------------------------------------------------------------------
20
22
  // Platform-specific functions and macros
21
23
 
@@ -76,9 +78,11 @@ typedef struct {
76
78
  // Block read - if your platform needs to do endian-swapping or can only
77
79
  // handle aligned reads, do the conversion here
78
80
 
79
- FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
81
+ FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, size_t i )
80
82
  {
81
- return p[i];
83
+ uint64_t res;
84
+ memcpy(&res, p + i, sizeof(res));
85
+ return res;
82
86
  }
83
87
 
84
88
  //-----------------------------------------------------------------------------
@@ -95,7 +99,7 @@ FORCE_INLINE uint64_t fmix64 ( uint64_t k )
95
99
  return k;
96
100
  }
97
101
 
98
- FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t seed, HashState& out) {
102
+ FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t seed, HashState& out) {
99
103
  static const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
100
104
  static const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
101
105
 
@@ -106,13 +110,13 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
106
110
 
107
111
  // Number of full 128-bit blocks of 16 bytes.
108
112
  // Possible exclusion of a remainder of up to 15 bytes.
109
- const int nblocks = lenBytes >> 4; // bytes / 16
113
+ const size_t nblocks = lenBytes >> 4; // bytes / 16
110
114
 
111
115
  // Process the 128-bit blocks (the body) into the hash
112
116
  const uint64_t* blocks = (const uint64_t*)(data);
113
- for (int i = 0; i < nblocks; ++i) { // 16 bytes per block
114
- uint64_t k1 = getblock64(blocks,i*2+0);
115
- uint64_t k2 = getblock64(blocks,i*2+1);
117
+ for (size_t i = 0; i < nblocks; ++i) { // 16 bytes per block
118
+ uint64_t k1 = getblock64(blocks, i * 2 + 0);
119
+ uint64_t k2 = getblock64(blocks, i * 2 + 1);
116
120
 
117
121
  k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
118
122
  out.h1 = ROTL64(out.h1,27);
@@ -381,7 +381,7 @@ private:
381
381
  // The following computes an approximation to the lower bound of a Frequentist
382
382
  // confidence interval based on the tails of the Binomial distribution.
383
383
  static double compute_approx_binomial_lower_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
384
- if (theta == 1) return num_samples;
384
+ if (theta == 1) return static_cast<double>(num_samples);
385
385
  if (num_samples == 0) return 0;
386
386
  if (num_samples == 1) {
387
387
  const double delta = delta_of_num_std_devs[num_std_devs];
@@ -395,24 +395,24 @@ private:
395
395
  }
396
396
  // at this point we know 2 <= num_samples <= 120
397
397
  if (theta > (1 - 1e-5)) { // empirically-determined threshold
398
- return num_samples;
398
+ return static_cast<double>(num_samples);
399
399
  }
400
400
  if (theta < (num_samples / 360.0)) { // empirically-determined threshold
401
401
  // here we use the Gaussian approximation, but with a modified num_std_devs
402
- const unsigned index = 3 * num_samples + (num_std_devs - 1);
402
+ const unsigned index = 3 * static_cast<unsigned>(num_samples) + (num_std_devs - 1);
403
403
  const double raw_lb = cont_classic_lb(num_samples, theta, lb_equiv_table[index]);
404
404
  return raw_lb - 0.5; // fake round down
405
405
  }
406
406
  // This is the most difficult range to approximate; we will compute an "exact" LB.
407
407
  // We know that est <= 360, so special_n_star() shouldn't be ridiculously slow.
408
408
  const double delta = delta_of_num_std_devs[num_std_devs];
409
- return special_n_star(num_samples, theta, delta); // no need to round
409
+ return static_cast<double>(special_n_star(num_samples, theta, delta)); // no need to round
410
410
  }
411
411
 
412
412
  // The following computes an approximation to the upper bound of a Frequentist
413
413
  // confidence interval based on the tails of the Binomial distribution.
414
414
  static double compute_approx_binomial_upper_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
415
- if (theta == 1) return num_samples;
415
+ if (theta == 1) return static_cast<double>(num_samples);
416
416
  if (num_samples == 0) {
417
417
  const double delta = delta_of_num_std_devs[num_std_devs];
418
418
  const double raw_ub = std::log(delta) / std::log(1 - theta);
@@ -425,18 +425,18 @@ private:
425
425
  }
426
426
  // at this point we know 2 <= num_samples <= 120
427
427
  if (theta > (1 - 1e-5)) { // empirically-determined threshold
428
- return num_samples + 1;
428
+ return static_cast<double>(num_samples + 1);
429
429
  }
430
430
  if (theta < (num_samples / 360.0)) { // empirically-determined threshold
431
431
  // here we use the Gaussian approximation, but with a modified num_std_devs
432
- const unsigned index = 3 * num_samples + (num_std_devs - 1);
432
+ const unsigned index = 3 * static_cast<unsigned>(num_samples) + (num_std_devs - 1);
433
433
  const double raw_ub = cont_classic_ub(num_samples, theta, ub_equiv_table[index]);
434
434
  return raw_ub + 0.5; // fake round up
435
435
  }
436
436
  // This is the most difficult range to approximate; we will compute an "exact" UB.
437
437
  // We know that est <= 360, so special_n_prime_f() shouldn't be ridiculously slow.
438
438
  const double delta = delta_of_num_std_devs[num_std_devs];
439
- return special_n_prime_f(num_samples, theta, delta); // no need to round
439
+ return static_cast<double>(special_n_prime_f(num_samples, theta, delta)); // no need to round
440
440
  }
441
441
 
442
442
  static void check_theta(double theta) {