datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -70,12 +70,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
70
70
  REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
71
71
  REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
72
72
 
73
- int count = 0;
74
- for (auto& it: sketch) {
73
+ for (auto it: sketch) {
75
74
  (void) it; // to suppress "unused" warning
76
- ++count;
75
+ FAIL("should be no iterations over an empty sketch");
77
76
  }
78
- REQUIRE(count == 0);
79
77
  }
80
78
 
81
79
  SECTION("get bad quantile") {
@@ -86,13 +84,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
86
84
 
87
85
  SECTION("one item") {
88
86
  kll_float_sketch sketch(200, 0);
89
- sketch.update(1);
87
+ sketch.update(1.0f);
90
88
  REQUIRE_FALSE(sketch.is_empty());
91
89
  REQUIRE_FALSE(sketch.is_estimation_mode());
92
90
  REQUIRE(sketch.get_n() == 1);
93
91
  REQUIRE(sketch.get_num_retained() == 1);
94
- REQUIRE(sketch.get_rank(1) == 0.0);
95
- REQUIRE(sketch.get_rank(2) == 1.0);
92
+ REQUIRE(sketch.get_rank(1.0f) == 0.0);
93
+ REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
94
  REQUIRE(sketch.get_min_value() == 1.0);
97
95
  REQUIRE(sketch.get_max_value() == 1.0);
98
96
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
104
102
  REQUIRE(quantiles[2] == 1.0);
105
103
 
106
104
  int count = 0;
107
- for (auto& it: sketch) {
105
+ for (auto it: sketch) {
108
106
  REQUIRE(it.second == 1);
109
107
  ++count;
110
108
  }
@@ -116,16 +114,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
116
114
  sketch.update(std::numeric_limits<float>::quiet_NaN());
117
115
  REQUIRE(sketch.is_empty());
118
116
 
119
- sketch.update(0.0);
117
+ sketch.update(0);
120
118
  sketch.update(std::numeric_limits<float>::quiet_NaN());
121
119
  REQUIRE(sketch.get_n() == 1);
122
120
  }
123
121
 
124
122
  SECTION("many items, exact mode") {
125
123
  kll_float_sketch sketch(200, 0);
126
- const uint32_t n(200);
124
+ const uint32_t n = 200;
127
125
  for (uint32_t i = 0; i < n; i++) {
128
- sketch.update(i);
126
+ sketch.update(static_cast<float>(i));
129
127
  REQUIRE(sketch.get_n() == i + 1);
130
128
  }
131
129
  REQUIRE_FALSE(sketch.is_empty());
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
145
143
 
146
144
  for (uint32_t i = 0; i < n; i++) {
147
145
  const double trueRank = (double) i / n;
148
- REQUIRE(sketch.get_rank(i) == trueRank);
146
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
149
147
  }
150
148
 
151
149
  // the alternative method must produce the same result
@@ -158,16 +156,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
158
156
 
159
157
  SECTION("10 items") {
160
158
  kll_float_sketch sketch(200, 0);
161
- sketch.update(1);
162
- sketch.update(2);
163
- sketch.update(3);
164
- sketch.update(4);
165
- sketch.update(5);
166
- sketch.update(6);
167
- sketch.update(7);
168
- sketch.update(8);
169
- sketch.update(9);
170
- sketch.update(10);
159
+ sketch.update(1.0f);
160
+ sketch.update(2.0f);
161
+ sketch.update(3.0f);
162
+ sketch.update(4.0f);
163
+ sketch.update(5.0f);
164
+ sketch.update(6.0f);
165
+ sketch.update(7.0f);
166
+ sketch.update(8.0f);
167
+ sketch.update(9.0f);
168
+ sketch.update(10.0f);
171
169
  REQUIRE(sketch.get_quantile(0) == 1.0);
172
170
  REQUIRE(sketch.get_quantile(0.5) == 6.0);
173
171
  REQUIRE(sketch.get_quantile(0.99) == 10.0);
@@ -176,7 +174,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
176
174
 
177
175
  SECTION("100 items") {
178
176
  kll_float_sketch sketch(200, 0);
179
- for (int i = 0; i < 100; ++i) sketch.update(i);
177
+ for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
180
178
  REQUIRE(sketch.get_quantile(0) == 0);
181
179
  REQUIRE(sketch.get_quantile(0.01) == 1);
182
180
  REQUIRE(sketch.get_quantile(0.5) == 50);
@@ -186,9 +184,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
186
184
 
187
185
  SECTION("many items, estimation mode") {
188
186
  kll_float_sketch sketch(200, 0);
189
- const int n(1000000);
187
+ const int n = 1000000;
190
188
  for (int i = 0; i < n; i++) {
191
- sketch.update(i);
189
+ sketch.update(static_cast<float>(i));
192
190
  REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
193
191
  }
194
192
  REQUIRE_FALSE(sketch.is_empty());
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
201
199
  // test rank
202
200
  for (int i = 0; i < n; i++) {
203
201
  const double trueRank = (double) i / n;
204
- REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
202
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
205
203
  }
206
204
 
207
205
  // test quantiles at every 0.1 percentage point
@@ -224,6 +222,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
224
222
  }
225
223
 
226
224
  //std::cout << sketch.to_string();
225
+
226
+ uint32_t count = 0;
227
+ uint64_t total_weight = 0;
228
+ for (auto it: sketch) {
229
+ ++count;
230
+ total_weight += it.second;
231
+ }
232
+ REQUIRE(count == sketch.get_num_retained());
233
+ REQUIRE(total_weight == sketch.get_n());
227
234
  }
228
235
 
229
236
  SECTION("consistency between get_rank adn get_PMF/CDF") {
@@ -231,8 +238,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
231
238
  const int n = 1000;
232
239
  float values[n];
233
240
  for (int i = 0; i < n; i++) {
234
- sketch.update(i);
235
- values[i] = i;
241
+ sketch.update(static_cast<float>(i));
242
+ values[i] = static_cast<float>(i);
236
243
  }
237
244
 
238
245
  const auto ranks(sketch.get_CDF(values, n));
@@ -272,6 +279,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
272
279
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
273
280
  auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
274
281
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
282
+ REQUIRE(s.tellg() == s.tellp());
275
283
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
276
284
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
277
285
  REQUIRE(sketch2.get_n() == sketch.get_n());
@@ -297,9 +305,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
297
305
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
298
306
  }
299
307
 
300
- SECTION("serialize deserialize one item") {
308
+ SECTION("stream serialize deserialize one item") {
301
309
  kll_float_sketch sketch(200, 0);
302
- sketch.update(1);
310
+ sketch.update(1.0f);
303
311
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
304
312
  sketch.serialize(s);
305
313
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -317,6 +325,24 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
317
325
  REQUIRE(sketch2.get_rank(2) == 1.0);
318
326
  }
319
327
 
328
+ SECTION("bytes serialize deserialize one item") {
329
+ kll_float_sketch sketch(200, 0);
330
+ sketch.update(1.0f);
331
+ auto bytes = sketch.serialize();
332
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
333
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
334
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
335
+ REQUIRE_FALSE(sketch2.is_empty());
336
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
337
+ REQUIRE(sketch2.get_n() == 1);
338
+ REQUIRE(sketch2.get_num_retained() == 1);
339
+ REQUIRE(sketch2.get_min_value() == 1.0);
340
+ REQUIRE(sketch2.get_max_value() == 1.0);
341
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
342
+ REQUIRE(sketch2.get_rank(1) == 0.0);
343
+ REQUIRE(sketch2.get_rank(2) == 1.0);
344
+ }
345
+
320
346
  SECTION("deserialize one item v1") {
321
347
  std::ifstream is;
322
348
  is.exceptions(std::ios::failbit | std::ios::badbit);
@@ -330,10 +356,46 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
330
356
  REQUIRE(sketch.get_max_value() == 1.0);
331
357
  }
332
358
 
359
+ SECTION("stream serialize deserialize three items") {
360
+ kll_float_sketch sketch(200, 0);
361
+ sketch.update(1.0f);
362
+ sketch.update(2.0f);
363
+ sketch.update(3.0f);
364
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
365
+ sketch.serialize(s);
366
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
367
+ auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
368
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
369
+ REQUIRE(s.tellg() == s.tellp());
370
+ REQUIRE_FALSE(sketch2.is_empty());
371
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
372
+ REQUIRE(sketch2.get_n() == 3);
373
+ REQUIRE(sketch2.get_num_retained() == 3);
374
+ REQUIRE(sketch2.get_min_value() == 1.0);
375
+ REQUIRE(sketch2.get_max_value() == 3.0);
376
+ }
377
+
378
+ SECTION("bytes serialize deserialize three items") {
379
+ kll_float_sketch sketch(200, 0);
380
+ sketch.update(1.0f);
381
+ sketch.update(2.0f);
382
+ sketch.update(3.0f);
383
+ auto bytes = sketch.serialize();
384
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
385
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
386
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
387
+ REQUIRE_FALSE(sketch2.is_empty());
388
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
389
+ REQUIRE(sketch2.get_n() == 3);
390
+ REQUIRE(sketch2.get_num_retained() == 3);
391
+ REQUIRE(sketch2.get_min_value() == 1.0);
392
+ REQUIRE(sketch2.get_max_value() == 3.0);
393
+ }
394
+
333
395
  SECTION("stream serialize deserialize many floats") {
334
396
  kll_float_sketch sketch(200, 0);
335
- const int n(1000);
336
- for (int i = 0; i < n; i++) sketch.update(i);
397
+ const int n = 1000;
398
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
337
399
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
338
400
  sketch.serialize(s);
339
401
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -350,13 +412,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
350
412
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
351
413
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
352
414
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
353
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
415
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
354
416
  }
355
417
 
356
418
  SECTION("bytes serialize deserialize many floats") {
357
419
  kll_float_sketch sketch(200, 0);
358
- const int n(1000);
359
- for (int i = 0; i < n; i++) sketch.update(i);
420
+ const int n = 1000;
421
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
360
422
  auto bytes = sketch.serialize();
361
423
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
362
424
  auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
@@ -371,7 +433,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
371
433
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
372
434
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
373
435
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
374
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
436
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
375
437
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
376
438
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
377
439
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
@@ -379,7 +441,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
379
441
 
380
442
  SECTION("bytes serialize deserialize many ints") {
381
443
  kll_sketch<int> sketch;
382
- const int n(1000);
444
+ const int n = 1000;
383
445
  for (int i = 0; i < n; i++) sketch.update(i);
384
446
  auto bytes = sketch.serialize();
385
447
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
@@ -439,8 +501,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
439
501
  kll_float_sketch sketch2(200, 0);
440
502
  const int n = 10000;
441
503
  for (int i = 0; i < n; i++) {
442
- sketch1.update(i);
443
- sketch2.update((2 * n) - i - 1);
504
+ sketch1.update(static_cast<float>(i));
505
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
444
506
  }
445
507
 
446
508
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -462,8 +524,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
462
524
  kll_float_sketch sketch2(128, 0);
463
525
  const int n = 10000;
464
526
  for (int i = 0; i < n; i++) {
465
- sketch1.update(i);
466
- sketch2.update((2 * n) - i - 1);
527
+ sketch1.update(static_cast<float>(i));
528
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
467
529
  }
468
530
 
469
531
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -495,7 +557,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
495
557
  kll_float_sketch sketch2(128, 0);
496
558
  const int n = 10000;
497
559
  for (int i = 0; i < n; i++) {
498
- sketch1.update(i);
560
+ sketch1.update(static_cast<float>(i));
499
561
  }
500
562
 
501
563
  // rank error should not be affected by a merge with an empty sketch with lower k
@@ -518,8 +580,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
518
580
  SECTION("merge min value from other") {
519
581
  kll_float_sketch sketch1(200, 0);
520
582
  kll_float_sketch sketch2(200, 0);
521
- sketch1.update(1);
522
- sketch2.update(2);
583
+ sketch1.update(1.0f);
584
+ sketch2.update(2.0f);
523
585
  sketch2.merge(sketch1);
524
586
  REQUIRE(sketch2.get_min_value() == 1.0f);
525
587
  REQUIRE(sketch2.get_max_value() == 2.0f);
@@ -527,7 +589,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
527
589
 
528
590
  SECTION("merge min and max values from other") {
529
591
  kll_float_sketch sketch1(200, 0);
530
- for (int i = 0; i < 1000000; i++) sketch1.update(i);
592
+ for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
531
593
  kll_float_sketch sketch2(200, 0);
532
594
  sketch2.merge(sketch1);
533
595
  REQUIRE(sketch2.get_min_value() == 0.0f);
@@ -540,7 +602,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
540
602
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
541
603
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
542
604
 
543
- const int n(1000);
605
+ const int n = 1000;
544
606
  for (int i = 0; i < n; i++) sketch.update(i);
545
607
 
546
608
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
@@ -679,6 +741,31 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
679
741
  }
680
742
  }
681
743
 
744
+ SECTION("max serialized size arithmetic type") {
745
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
746
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
747
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
748
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
749
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
750
+ }
751
+
752
+ SECTION("max serialized size non-arithmetic type") {
753
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
754
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
755
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
756
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
757
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
758
+ }
759
+
760
+ SECTION("issue #236") {
761
+ kll_sketch<int8_t> kll;
762
+ kll.update(1);
763
+ kll.update(2);
764
+ kll.update(3);
765
+ auto blob = kll.serialize();
766
+ auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
767
+ }
768
+
682
769
  // cleanup
683
770
  if (test_allocator_total_bytes != 0) {
684
771
  REQUIRE(test_allocator_total_bytes == 0);
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <random>
23
+
24
+ #include <kll_sketch.hpp>
25
+ #include <kolmogorov_smirnov.hpp>
26
+
27
+ namespace datasketches {
28
+
29
+ TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
30
+ const uint16_t k = 200;
31
+ kll_sketch<double> sketch1(k);
32
+ kll_sketch<double> sketch2(k);
33
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == 0);
34
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
35
+ }
36
+
37
+ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
38
+ const uint16_t k = 200;
39
+ kll_sketch<double> sketch1(k);
40
+ kll_sketch<double> sketch2(k);
41
+ std::default_random_engine rand;
42
+ std::normal_distribution<double> distr;
43
+ const int n = k * 3 - 1;
44
+ for (int i = 0; i < n; ++i) {
45
+ const double x = distr(rand);
46
+ sketch1.update(x);
47
+ sketch2.update(x);
48
+ }
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
50
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
+ }
52
+
53
+ TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
54
+ const uint16_t k = 200;
55
+ kll_sketch<double> sketch1(k);
56
+ kll_sketch<double> sketch2(k);
57
+ std::default_random_engine rand;
58
+ std::normal_distribution<double> distr;
59
+ const int n = k * 3 - 1;
60
+ for (int i = 0; i < n; ++i) {
61
+ const double x = distr(rand);
62
+ sketch1.update(x + 100.0);
63
+ sketch2.update(x);
64
+ }
65
+ const auto delta = kolmogorov_smirnov::delta(sketch1, sketch2);
66
+ REQUIRE(delta == Approx(1.0).margin(1e-6));
67
+ REQUIRE(delta <= 1);
68
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
69
+ }
70
+
71
+ TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
72
+ const uint16_t k = 2000;
73
+ kll_sketch<double> sketch1(k);
74
+ kll_sketch<double> sketch2(k);
75
+ std::default_random_engine rand;
76
+ std::normal_distribution<double> distr;
77
+ const int n = k * 3 - 1;
78
+ for (int i = 0; i < n; ++i) {
79
+ const double x = distr(rand);
80
+ sketch1.update(x + 0.05);
81
+ sketch2.update(x);
82
+ }
83
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
84
+ REQUIRE(delta == Approx(0.02).margin(0.01));
85
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
86
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
87
+ REQUIRE_FALSE(delta > threshold);
88
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
89
+ }
90
+
91
+ TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
92
+ const uint16_t k = 8000;
93
+ kll_sketch<double> sketch1(k);
94
+ kll_sketch<double> sketch2(k);
95
+ std::default_random_engine rand;
96
+ std::normal_distribution<double> distr;
97
+ const int n = k * 3 - 1;
98
+ for (int i = 0; i < n; ++i) {
99
+ const double x = distr(rand);
100
+ sketch1.update(x + 0.05);
101
+ sketch2.update(x);
102
+ }
103
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
104
+ REQUIRE(delta == Approx(0.02).margin(0.01));
105
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
106
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
107
+ REQUIRE(delta > threshold);
108
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
109
+ }
110
+
111
+ } /* namespace datasketches */
@@ -1,8 +1,10 @@
1
1
  [build-system]
2
2
  requires = ["wheel",
3
3
  "setuptools >= 30.3.0",
4
- "setuptools_scm",
5
- "cmake >= 3.12"]
4
+ "cmake >= 3.12",
5
+ "pip >= 10.0",
6
+ "pybind11[global] >= 2.6.0"]
7
+ build-backend = "setuptools.build_meta"
6
8
 
7
9
  [tool.tox]
8
10
  legacy_tox_ini = """
@@ -15,16 +15,20 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- # TODO: Can we force python version >= 3.0?
18
+ find_package(Python3 COMPONENTS Interpreter Development)
19
+
20
+ # only Windows+MSVC seems to have trouble locating pybind11
19
21
  if (MSVC)
20
- set(PYBIND11_CPP_STANDARD /std:c++11)
21
- else()
22
- set(PYBIND11_CPP_STANDARD -std=c++11)
22
+ execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}"
23
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
24
+ OUTPUT_STRIP_TRAILING_WHITESPACE
25
+ OUTPUT_VARIABLE EXTRA_PACKAGE_PATH)
26
+ set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH})
23
27
  endif()
24
28
 
25
- add_subdirectory(pybind11)
29
+ find_package(pybind11 CONFIG REQUIRED)
26
30
 
27
- pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO)
31
+ pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO)
28
32
 
29
33
  target_link_libraries(python
30
34
  PRIVATE
@@ -1,76 +1,57 @@
1
- # Python Wrapper for Apache DataSketches
1
+ <img src="https://raw.githubusercontent.com/apache/datasketches-website/master/logos/svg/datasketches-HorizontalColor-TM.svg" width="75%" alt="Apache DataSketches Logo">
2
2
 
3
- ## Installation
3
+ # The Apache DataSketches Library for Python
4
4
 
5
- The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
6
- from a relase package, you must ensure that the pybind11 directory points to a local copy of pybind11.
5
+ This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library.
7
6
 
8
- An official pypi build is eventually planned but not yet available.
7
+ In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis.
9
8
 
10
- If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
11
- ```pip install git+https://github.com/apache/datasketches-cpp.git```
9
+ If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms, or sketches, that can produce results orders of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution.
12
10
 
13
- ## Developer Instructions
14
-
15
- ### Building
16
-
17
- When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
18
- ```
19
- git clone --recursive https://github.com/apache/datasketches-cpp.git
20
- cd datasketches-cpp
21
- python -m pip install --upgrade pip setuptools wheel numpy
22
- python setup.py build
23
- ```
11
+ This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness).
24
12
 
25
- If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
13
+ ## Building and Installation
26
14
 
27
- ### Installing
15
+ Once cloned, the library can be installed by running `python -m pip install .` in the project root directory, which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
28
16
 
29
- Assuming you have already checked out the library and any dependent submodules, install by simply replacing the lsat
30
- line of the build command with `python setup.py install`.
17
+ If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
31
18
 
32
- ### Unit tests
33
-
34
- The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
35
- ```
36
- python -m pip install --upgrade pip setuptools wheel numpy tox
37
- tox
38
- ```
19
+ The library is also available from PyPI via `python -m pip install datasketches`.
39
20
 
40
21
  ## Usage
41
22
 
42
- Having installed the library, loading the Apache Datasketches library in Python is simple: `import datasketches`.
23
+ Having installed the library, loading the Apache DataSketches Library in Python is simple: `import datasketches`.
43
24
 
44
25
  ## Available Sketch Classes
45
26
 
46
27
  - KLL (Absolute Error Quantiles)
47
- - `kll_ints_sketch`
48
- - `kll_floats_sketch`
28
+ - `kll_ints_sketch`
29
+ - `kll_floats_sketch`
49
30
  - REQ (Relative Error Quantiles)
50
- - `req_ints_sketch`
51
- - `req_floats_sketch`
31
+ - `req_ints_sketch`
32
+ - `req_floats_sketch`
52
33
  - Frequent Items
53
- - `frequent_strings_sketch`
54
- - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
34
+ - `frequent_strings_sketch`
35
+ - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
55
36
  - Theta
56
- - `update_theta_sketch`
57
- - `compact_theta_sketch` (cannot be instantiated directly)
58
- - `theta_union`
59
- - `theta_intersection`
60
- - `theta_a_not_b`
37
+ - `update_theta_sketch`
38
+ - `compact_theta_sketch` (cannot be instantiated directly)
39
+ - `theta_union`
40
+ - `theta_intersection`
41
+ - `theta_a_not_b`
61
42
  - HLL
62
- - `hll_sketch`
63
- - `hll_union`
64
- - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
43
+ - `hll_sketch`
44
+ - `hll_union`
45
+ - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
65
46
  - CPC
66
- - `cpc_sketch`
67
- - `cpc_union`
47
+ - `cpc_sketch`
48
+ - `cpc_union`
68
49
  - VarOpt Sampling
69
- - `var_opt_sketch`
70
- - `var_opt_union`
50
+ - `var_opt_sketch`
51
+ - `var_opt_union`
71
52
  - Vector of KLL
72
- - `vector_of_kll_ints_sketches`
73
- - `vector_of_kll_floats_sketches`
53
+ - `vector_of_kll_ints_sketches`
54
+ - `vector_of_kll_floats_sketches`
74
55
 
75
56
  ## Known Differences from C++
76
57
 
@@ -79,3 +60,22 @@ The Python API largely mirrors the C++ API, with a few minor exceptions: The pri
79
60
  The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
80
61
 
81
62
  We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
63
+
64
+ ## Developer Instructions
65
+
66
+ The only developer-specific instructions relate to running unit tests.
67
+
68
+ ### Unit tests
69
+
70
+ The Python unit tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
71
+
72
+ ```bash
73
+ python -m pip install --upgrade tox
74
+ tox
75
+ ```
76
+
77
+ ## License
78
+
79
+ The Apache DataSketches Library is distributed under an Apache 2.0 License.
80
+
81
+ Precompiled binaries, which may be provided as a convenience and distributed through PyPI via [https://pypi.org/project/datasketches/](https://pypi.org/project/datasketches/), contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license.
@@ -0,0 +1,3 @@
1
+ @echo off
2
+ :: Takes path to the Python interpreter and returns the path to pybind11
3
+ %1 -m pip show pybind11 | %1 -c "import sys,re;[sys.stdout.write(re.sub('^Location:\\s+','',line)) for line in sys.stdin if re.search('^Location:\\s+',line)]"
@@ -53,7 +53,7 @@ void init_cpc(py::module &m) {
53
53
  using namespace datasketches;
54
54
 
55
55
  py::class_<cpc_sketch>(m, "cpc_sketch")
56
- .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=CPC_DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
56
+ .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
57
57
  .def(py::init<const cpc_sketch&>())
58
58
  .def("__str__", &cpc_sketch::to_string,
59
59
  "Produces a string summary of the sketch")