datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -417,6 +417,20 @@ public:
417
417
  virtual uint32_t get_num_retained() const;
418
418
  virtual uint16_t get_seed_hash() const;
419
419
 
420
+ /**
421
+ * Computes maximum serialized size in bytes
422
+ * @param lg_k base-2 logarithm of the nominal number of entries in the sketch
423
+ */
424
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
425
+
426
+ /**
427
+ * Computes size in bytes required to serialize the current state of the sketch.
428
+ * Computing compressed size is expensive. It takes iterating over all retained hashes,
429
+ * and the actual serialization will have to look at them again.
430
+ * @param compressed if true compressed size is returned (if applicable)
431
+ */
432
+ size_t get_serialized_size_bytes(bool compressed = false) const;
433
+
420
434
  /**
421
435
  * This method serializes the sketch into a given stream in a binary form
422
436
  * @param os output stream
@@ -486,8 +500,11 @@ private:
486
500
  uint64_t theta_;
487
501
  std::vector<uint64_t, Allocator> entries_;
488
502
 
503
+ uint8_t get_preamble_longs(bool compressed) const;
489
504
  bool is_suitable_for_compression() const;
490
- uint8_t compute_min_leading_zeros() const;
505
+ uint8_t compute_entry_bits() const;
506
+ uint8_t get_num_entries_bytes() const;
507
+ size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
491
508
  void serialize_version_4(std::ostream& os) const;
492
509
  vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
493
510
 
@@ -24,11 +24,11 @@
24
24
  #include <vector>
25
25
  #include <stdexcept>
26
26
 
27
- #include "serde.hpp"
28
27
  #include "binomial_bounds.hpp"
29
28
  #include "theta_helpers.hpp"
30
29
  #include "count_zeros.hpp"
31
30
  #include "bit_packing.hpp"
31
+ #include "memory_operations.hpp"
32
32
 
33
33
  namespace datasketches {
34
34
 
@@ -341,6 +341,39 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
341
341
  template<typename A>
342
342
  void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
343
343
 
344
+ template<typename A>
345
+ uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
346
+ if (compressed) {
347
+ return this->is_estimation_mode() ? 2 : 1;
348
+ }
349
+ return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
350
+ }
351
+
352
+ template<typename A>
353
+ size_t compact_theta_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
354
+ return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
355
+ }
356
+
357
+ template<typename A>
358
+ size_t compact_theta_sketch_alloc<A>::get_serialized_size_bytes(bool compressed) const {
359
+ if (compressed && is_suitable_for_compression()) {
360
+ return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
361
+ }
362
+ return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
363
+ }
364
+
365
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
366
+ template<typename A>
367
+ uint8_t compact_theta_sketch_alloc<A>::get_num_entries_bytes() const {
368
+ return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
369
+ }
370
+
371
+ template<typename A>
372
+ size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
373
+ const size_t compressed_bits = entry_bits * entries_.size();
374
+ return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
375
+ }
376
+
344
377
  template<typename A>
345
378
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
346
379
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
@@ -366,12 +399,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
366
399
 
367
400
  template<typename A>
368
401
  auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
369
- const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
370
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
371
- + sizeof(uint64_t) * entries_.size();
402
+ const size_t size = get_serialized_size_bytes() + header_size_bytes;
372
403
  vector_bytes bytes(size, 0, entries_.get_allocator());
373
404
  uint8_t* ptr = bytes.data() + header_size_bytes;
374
-
405
+ const uint8_t preamble_longs = get_preamble_longs(false);
375
406
  *ptr++ = preamble_longs;
376
407
  *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
377
408
  *ptr++ = SKETCH_TYPE;
@@ -413,7 +444,7 @@ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
413
444
  }
414
445
 
415
446
  template<typename A>
416
- uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
447
+ uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
417
448
  // compression is based on leading zeros in deltas between ordered hash values
418
449
  // assumes ordered sketch
419
450
  uint64_t previous = 0;
@@ -423,16 +454,14 @@ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
423
454
  ored |= delta;
424
455
  previous = entry;
425
456
  }
426
- return count_leading_zeros_in_u64(ored);
457
+ return 64 - count_leading_zeros_in_u64(ored);
427
458
  }
428
459
 
429
460
  template<typename A>
430
461
  void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
431
462
  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
432
- const uint8_t entry_bits = 64 - compute_min_leading_zeros();
433
-
434
- // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
435
- const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
463
+ const uint8_t entry_bits = compute_entry_bits();
464
+ const uint8_t num_entries_bytes = get_num_entries_bytes();
436
465
 
437
466
  write(os, preamble_longs);
438
467
  write(os, COMPRESSED_SERIAL_VERSION);
@@ -477,25 +506,20 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
477
506
  previous = entries_[i];
478
507
  offset = pack_bits(delta, entry_bits, ptr, offset);
479
508
  }
509
+ if (offset > 0) ++ptr;
480
510
  write(os, buffer.data(), ptr - buffer.data());
481
511
  }
482
512
  }
483
513
 
484
514
  template<typename A>
485
515
  auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
486
- const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
487
- const uint8_t entry_bits = 64 - compute_min_leading_zeros();
488
- const size_t compressed_bits = entry_bits * entries_.size();
489
-
490
- // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
491
- const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
492
-
493
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
494
- + whole_bytes_to_hold_bits(compressed_bits);
516
+ const uint8_t entry_bits = compute_entry_bits();
517
+ const uint8_t num_entries_bytes = get_num_entries_bytes();
518
+ const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
495
519
  vector_bytes bytes(size, 0, entries_.get_allocator());
496
520
  uint8_t* ptr = bytes.data() + header_size_bytes;
497
521
 
498
- *ptr++ = preamble_longs;
522
+ *ptr++ = get_preamble_longs(true);
499
523
  *ptr++ = COMPRESSED_SERIAL_VERSION;
500
524
  *ptr++ = SKETCH_TYPE;
501
525
  *ptr++ = entry_bits;
@@ -29,50 +29,53 @@ namespace datasketches {
29
29
  static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
30
30
 
31
31
  TEST_CASE("pack unpack bits") {
32
- for (uint8_t bits = 1; bits <= 63; ++bits) {
33
- const uint64_t mask = (1ULL << bits) - 1;
34
- std::vector<uint64_t> input(8, 0);
35
- const uint64_t igolden64 = IGOLDEN64;
36
- uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
37
- for (int i = 0; i < 8; ++i) {
38
- input[i] = value & mask;
39
- value += igolden64;
40
- }
41
- std::vector<uint8_t> bytes(8 * sizeof(uint64_t), 0);
42
- uint8_t offset = 0;
43
- uint8_t* ptr = bytes.data();
44
- for (int i = 0; i < 8; ++i) {
45
- offset = pack_bits(input[i], bits, ptr, offset);
46
- }
32
+ uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
33
+ for (int m = 0; m < 10000; ++m) {
34
+ for (uint8_t bits = 1; bits <= 63; ++bits) {
35
+ int n = 8;
36
+ const uint64_t mask = (1ULL << bits) - 1;
37
+ std::vector<uint64_t> input(n, 0);
38
+ for (int i = 0; i < n; ++i) {
39
+ input[i] = value & mask;
40
+ value += IGOLDEN64;
41
+ }
42
+ std::vector<uint8_t> bytes(n * sizeof(uint64_t), 0);
43
+ uint8_t offset = 0;
44
+ uint8_t* ptr = bytes.data();
45
+ for (int i = 0; i < n; ++i) {
46
+ offset = pack_bits(input[i], bits, ptr, offset);
47
+ }
47
48
 
48
- std::vector<uint64_t> output(8, 0);
49
- offset = 0;
50
- const uint8_t* cptr = bytes.data();
51
- for (int i = 0; i < 8; ++i) {
52
- offset = unpack_bits(output[i], bits, cptr, offset);
53
- }
54
- for (int i = 0; i < 8; ++i) {
55
- REQUIRE((input[i] & mask) == output[i]);
49
+ std::vector<uint64_t> output(n, 0);
50
+ offset = 0;
51
+ const uint8_t* cptr = bytes.data();
52
+ for (int i = 0; i < n; ++i) {
53
+ offset = unpack_bits(output[i], bits, cptr, offset);
54
+ }
55
+ for (int i = 0; i < n; ++i) {
56
+ REQUIRE(input[i] == output[i]);
57
+ }
56
58
  }
57
59
  }
58
60
  }
59
61
 
60
62
  TEST_CASE("pack unpack blocks") {
61
- for (uint8_t bits = 1; bits <= 63; ++bits) {
62
- const uint64_t mask = (1ULL << bits) - 1;
63
- std::vector<uint64_t> input(8, 0);
64
- const uint64_t igolden64 = IGOLDEN64;
65
- uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
66
- for (int i = 0; i < 8; ++i) {
67
- input[i] = value & mask;
68
- value += igolden64;
69
- }
70
- std::vector<uint8_t> bytes(8 * sizeof(uint64_t), 0);
71
- pack_bits_block8(input.data(), bytes.data(), bits);
72
- std::vector<uint64_t> output(8, 0);
73
- unpack_bits_block8(output.data(), bytes.data(), bits);
74
- for (int i = 0; i < 8; ++i) {
75
- REQUIRE((input[i] & mask) == output[i]);
63
+ uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
64
+ for (int n = 0; n < 10000; ++n) {
65
+ for (uint8_t bits = 1; bits <= 63; ++bits) {
66
+ const uint64_t mask = (1ULL << bits) - 1;
67
+ std::vector<uint64_t> input(8, 0);
68
+ for (int i = 0; i < 8; ++i) {
69
+ input[i] = value & mask;
70
+ value += IGOLDEN64;
71
+ }
72
+ std::vector<uint8_t> bytes(bits, 0);
73
+ pack_bits_block8(input.data(), bytes.data(), bits);
74
+ std::vector<uint64_t> output(8, 0);
75
+ unpack_bits_block8(output.data(), bytes.data(), bits);
76
+ for (int i = 0; i < 8; ++i) {
77
+ REQUIRE(input[i] == output[i]);
78
+ }
76
79
  }
77
80
  }
78
81
  }
@@ -45,6 +45,23 @@ TEST_CASE("theta sketch", "[serde_compat]") {
45
45
  }
46
46
  }
47
47
 
48
+ TEST_CASE("theta sketch compressed", "[serde_compat]") {
49
+ const unsigned n_arr[] = {10, 100, 1000, 10000, 100000, 1000000};
50
+ for (const unsigned n: n_arr) {
51
+ std::ifstream is;
52
+ is.exceptions(std::ios::failbit | std::ios::badbit);
53
+ is.open(testBinaryInputPath + "theta_compressed_n" + std::to_string(n) + "_java.sk", std::ios::binary);
54
+ const auto sketch = compact_theta_sketch::deserialize(is);
55
+ REQUIRE(sketch.is_estimation_mode() == (n > 1000));
56
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
57
+ for (const auto hash: sketch) {
58
+ REQUIRE(hash < sketch.get_theta64());
59
+ }
60
+ REQUIRE(sketch.is_ordered());
61
+ REQUIRE(std::is_sorted(sketch.begin(), sketch.end()));
62
+ }
63
+ }
64
+
48
65
  TEST_CASE("theta sketch non-empty no entries", "[serde_compat]") {
49
66
  std::ifstream is;
50
67
  is.exceptions(std::ios::failbit | std::ios::badbit);
@@ -43,7 +43,7 @@ TEST_CASE("theta sketch generate compressed", "[serialize_for_java]") {
43
43
  REQUIRE_FALSE(sketch.is_empty());
44
44
  REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
45
45
  std::ofstream os("theta_compressed_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
46
- sketch.compact().serialize(os);
46
+ sketch.compact().serialize_compressed(os);
47
47
  }
48
48
  }
49
49
 
@@ -273,9 +273,11 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
273
273
  for (int i = 0; i < n; i++) update_sketch.update(i);
274
274
 
275
275
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
276
- update_sketch.compact().serialize(s);
277
- auto bytes = update_sketch.compact().serialize();
276
+ auto compact_sketch = update_sketch.compact();
277
+ compact_sketch.serialize(s);
278
+ auto bytes = compact_sketch.serialize();
278
279
  REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
280
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes());
279
281
  for (size_t i = 0; i < bytes.size(); ++i) {
280
282
  REQUIRE(((char*)bytes.data())[i] == (char)s.get());
281
283
  }
@@ -515,12 +517,54 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
515
517
  }
516
518
  }
517
519
 
520
+ TEST_CASE("theta sketch: serialize deserialize small compressed", "[theta_sketch]") {
521
+ auto update_sketch = update_theta_sketch::builder().build();
522
+ for (int i = 0; i < 10; i++) update_sketch.update(i);
523
+ auto compact_sketch = update_sketch.compact();
524
+
525
+ auto bytes = compact_sketch.serialize_compressed();
526
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
527
+ { // deserialize bytes
528
+ auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
529
+ REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
530
+ REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
531
+ auto iter = deserialized_sketch.begin();
532
+ for (const auto key: compact_sketch) {
533
+ REQUIRE(*iter == key);
534
+ ++iter;
535
+ }
536
+ }
537
+ { // wrap bytes
538
+ auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
539
+ REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
540
+ REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
541
+ auto iter = wrapped_sketch.begin();
542
+ for (const auto key: compact_sketch) {
543
+ REQUIRE(*iter == key);
544
+ ++iter;
545
+ }
546
+ }
547
+
548
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
549
+ compact_sketch.serialize_compressed(s);
550
+ REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
551
+ auto deserialized_sketch = compact_theta_sketch::deserialize(s);
552
+ REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
553
+ REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
554
+ auto iter = deserialized_sketch.begin();
555
+ for (const auto key: compact_sketch) {
556
+ REQUIRE(*iter == key);
557
+ ++iter;
558
+ }
559
+ }
560
+
518
561
  TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
519
562
  auto update_sketch = update_theta_sketch::builder().build();
520
563
  for (int i = 0; i < 10000; i++) update_sketch.update(i);
521
564
  auto compact_sketch = update_sketch.compact();
522
565
 
523
566
  auto bytes = compact_sketch.serialize_compressed();
567
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
524
568
  { // deserialize bytes
525
569
  auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
526
570
  REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
@@ -544,6 +588,7 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
544
588
 
545
589
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
546
590
  compact_sketch.serialize_compressed(s);
591
+ REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
547
592
  auto deserialized_sketch = compact_theta_sketch::deserialize(s);
548
593
  REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
549
594
  REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
@@ -554,4 +599,30 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
554
599
  }
555
600
  }
556
601
 
602
+ // The sketch reaches capacity for the first time at 2 * K * 15/16,
603
+ // but at that point it is still in exact mode, so the serialized size is not the maximum
604
+ // (theta is not serialized in the exact mode).
605
+ // So we need to catch the second time, but some updates will be ignored in the estimation mode,
606
+ // so we update more than enough times keeping track of the maximum.
607
+ // Potentially the exact number of updates to reach the peak can be figured out given this particular sequence,
608
+ // but not assuming that might be even better (say, in case we change the load factor or hash function
609
+ // or just out of principle not to rely on implementation details too much).
610
+ TEST_CASE("max serialized size", "[theta_sketch]") {
611
+ const uint8_t lg_k = 10;
612
+ auto sketch = update_theta_sketch::builder().set_lg_k(lg_k).build();
613
+ int value = 0;
614
+
615
+ // this will go over the first peak, which is not the highest
616
+ for (int i = 0; i < (1 << lg_k) * 2; ++i) sketch.update(value++);
617
+
618
+ // this will go over the second peak, keeping track of the max size
619
+ size_t max_size_bytes = 0;
620
+ for (int i = 0; i < (1 << lg_k) * 2; ++i) {
621
+ sketch.update(value++);
622
+ auto bytes = sketch.compact().serialize();
623
+ max_size_bytes = std::max(max_size_bytes, bytes.size());
624
+ }
625
+ REQUIRE(max_size_bytes == compact_theta_sketch::get_max_serialized_size_bytes(lg_k));
626
+ }
627
+
557
628
  } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(tuple
30
30
  )
31
31
 
32
32
  target_link_libraries(tuple INTERFACE common theta)
33
- target_compile_features(tuple INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS tuple
36
35
  EXPORT ${PROJECT_NAME}
@@ -381,6 +381,15 @@ public:
381
381
  */
382
382
  compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
383
383
 
384
+ /**
385
+ * Produces a Compact Tuple sketch from this sketch
386
+ * by applying a given predicate to each entry.
387
+ * @param predicate should return true for the entries to keep
388
+ * @return compact sketch with the entries retained according to the predicate
389
+ */
390
+ template<typename Predicate>
391
+ compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
392
+
384
393
  virtual iterator begin();
385
394
  virtual iterator end();
386
395
  virtual const_iterator begin() const;
@@ -480,6 +489,25 @@ public:
480
489
  virtual uint32_t get_num_retained() const;
481
490
  virtual uint16_t get_seed_hash() const;
482
491
 
492
+ /**
493
+ * Produces a Compact Tuple sketch from this sketch
494
+ * by applying a given predicate to each entry.
495
+ * @param predicate should return true for the entries to keep
496
+ * @return compact sketch with the entries retained according to the predicate
497
+ */
498
+ template<typename Predicate>
499
+ compact_tuple_sketch filter(const Predicate& predicate) const;
500
+
501
+ /**
502
+ * Produces a Compact Tuple sketch from a given sketch (Update or Compact)
503
+ * by applying a given predicate to each entry.
504
+ * @param sketch input sketch
505
+ * @param predicate should return true for the entries to keep
506
+ * @return compact sketch with the entries retained according to the predicate
507
+ */
508
+ template<typename Sketch, typename Predicate>
509
+ static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
510
+
483
511
  /**
484
512
  * This method serializes the sketch into a given stream in a binary form
485
513
  * @param os output stream
@@ -579,7 +607,6 @@ protected:
579
607
  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
580
608
  template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
581
609
  compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
582
-
583
610
  };
584
611
 
585
612
  /// Tuple base builder
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
258
258
  return compact_tuple_sketch<S, A>(*this, ordered);
259
259
  }
260
260
 
261
+ template<typename S, typename U, typename P, typename A>
262
+ template<typename Predicate>
263
+ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
264
+ return compact_tuple_sketch<S, A>::filter(*this, predicate);
265
+ }
266
+
261
267
  template<typename S, typename U, typename P, typename A>
262
268
  void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
263
269
  os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
344
350
  return seed_hash_;
345
351
  }
346
352
 
353
+ template<typename S, typename A>
354
+ template<typename Predicate>
355
+ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
356
+ return filter(*this, predicate);
357
+ }
358
+
359
+ template<typename S, typename A>
360
+ template<typename Sketch, typename Predicate>
361
+ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
362
+ std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
363
+ entries.reserve(sketch.get_num_retained());
364
+ std::copy_if(
365
+ sketch.begin(),
366
+ sketch.end(),
367
+ std::back_inserter(entries),
368
+ [&predicate](const Entry& e) {return predicate(e.second);}
369
+ );
370
+ entries.shrink_to_fit();
371
+ return compact_tuple_sketch(
372
+ !sketch.is_estimation_mode() && entries.empty(),
373
+ sketch.is_ordered(),
374
+ sketch.get_seed_hash(),
375
+ sketch.get_theta64(),
376
+ std::move(entries)
377
+ );
378
+ }
379
+
347
380
  // implementation for fixed-size arithmetic types (integral and floating point)
348
381
  template<typename S, typename A>
349
382
  template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
310
310
  REQUIRE(sketch.get_num_retained() == 3);
311
311
  }
312
312
 
313
+ TEST_CASE("filter", "[tuple_sketch]") {
314
+ auto usk = update_tuple_sketch<int>::builder().build();
315
+
316
+ { // empty update sketch
317
+ auto sk = usk.filter([](int){return true;});
318
+ REQUIRE(sk.is_empty());
319
+ REQUIRE(sk.is_ordered());
320
+ REQUIRE(sk.get_num_retained() == 0);
321
+ }
322
+
323
+ { // empty compact sketch
324
+ auto sk = usk.compact().filter([](int){return true;});
325
+ REQUIRE(sk.is_empty());
326
+ REQUIRE(sk.is_ordered());
327
+ REQUIRE(sk.get_num_retained() == 0);
328
+ }
329
+
330
+ usk.update(1, 1);
331
+ usk.update(1, 1);
332
+ usk.update(2, 1);
333
+ usk.update(2, 1);
334
+ usk.update(3, 1);
335
+
336
+ { // exact mode update sketch
337
+ auto sk = usk.filter([](int v){return v > 1;});
338
+ REQUIRE_FALSE(sk.is_empty());
339
+ REQUIRE_FALSE(sk.is_ordered());
340
+ REQUIRE_FALSE(sk.is_estimation_mode());
341
+ REQUIRE(sk.get_num_retained() == 2);
342
+ }
343
+
344
+ { // exact mode compact sketch
345
+ auto sk = usk.compact().filter([](int v){return v > 1;});
346
+ REQUIRE_FALSE(sk.is_empty());
347
+ REQUIRE(sk.is_ordered());
348
+ REQUIRE_FALSE(sk.is_estimation_mode());
349
+ REQUIRE(sk.get_num_retained() == 2);
350
+ }
351
+
352
+ // only keys 1 and 2 had values of 2, which will become 3 after this update
353
+ // some entries are discarded in estimation mode, but these happen to survive
354
+ // the process is deterministic, so the test will always work
355
+ for (int i = 0; i < 10000; ++i) usk.update(i, 1);
356
+
357
+ { // estimation mode update sketch
358
+ auto sk = usk.filter([](int v){return v > 2;});
359
+ REQUIRE_FALSE(sk.is_empty());
360
+ REQUIRE_FALSE(sk.is_ordered());
361
+ REQUIRE(sk.is_estimation_mode());
362
+ REQUIRE(sk.get_num_retained() == 2);
363
+ }
364
+
365
+ { // estimation mode compact sketch
366
+ auto sk = usk.compact().filter([](int v){return v > 2;});
367
+ REQUIRE_FALSE(sk.is_empty());
368
+ REQUIRE(sk.is_ordered());
369
+ REQUIRE(sk.is_estimation_mode());
370
+ REQUIRE(sk.get_num_retained() == 2);
371
+ }
372
+ }
373
+
313
374
  } /* namespace datasketches */
@@ -1 +1 @@
1
- 5.0.2
1
+ 5.2.0
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datasketches
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-01-13 00:00:00.000000000 Z
10
+ date: 2025-01-16 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rice
@@ -16,15 +15,14 @@ dependencies:
16
15
  requirements:
17
16
  - - ">="
18
17
  - !ruby/object:Gem::Version
19
- version: '4.1'
18
+ version: 4.3.3
20
19
  type: :runtime
21
20
  prerelease: false
22
21
  version_requirements: !ruby/object:Gem::Requirement
23
22
  requirements:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
- version: '4.1'
27
- description:
25
+ version: 4.3.3
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions:
@@ -72,6 +70,7 @@ files:
72
70
  - vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp
73
71
  - vendor/datasketches-cpp/common/include/serde.hpp
74
72
  - vendor/datasketches-cpp/common/include/version.hpp.in
73
+ - vendor/datasketches-cpp/common/include/xxhash64.h
75
74
  - vendor/datasketches-cpp/common/test/CMakeLists.txt
76
75
  - vendor/datasketches-cpp/common/test/catch_runner.cpp
77
76
  - vendor/datasketches-cpp/common/test/integration_test.cpp
@@ -124,6 +123,17 @@ files:
124
123
  - vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp
125
124
  - vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp
126
125
  - vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp
126
+ - vendor/datasketches-cpp/filters/CMakeLists.txt
127
+ - vendor/datasketches-cpp/filters/include/bit_array_ops.hpp
128
+ - vendor/datasketches-cpp/filters/include/bloom_filter.hpp
129
+ - vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp
130
+ - vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp
131
+ - vendor/datasketches-cpp/filters/test/CMakeLists.txt
132
+ - vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp
133
+ - vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp
134
+ - vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp
135
+ - vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp
136
+ - vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp
127
137
  - vendor/datasketches-cpp/hll/CMakeLists.txt
128
138
  - vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp
129
139
  - vendor/datasketches-cpp/hll/include/AuxHashMap.hpp
@@ -231,6 +241,16 @@ files:
231
241
  - vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp
232
242
  - vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp
233
243
  - vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp
244
+ - vendor/datasketches-cpp/tdigest/CMakeLists.txt
245
+ - vendor/datasketches-cpp/tdigest/include/tdigest.hpp
246
+ - vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp
247
+ - vendor/datasketches-cpp/tdigest/test/CMakeLists.txt
248
+ - vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp
249
+ - vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp
250
+ - vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk
251
+ - vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk
252
+ - vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp
253
+ - vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp
234
254
  - vendor/datasketches-cpp/theta/CMakeLists.txt
235
255
  - vendor/datasketches-cpp/theta/include/bit_packing.hpp
236
256
  - vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
@@ -309,7 +329,6 @@ homepage: https://github.com/ankane/datasketches-ruby
309
329
  licenses:
310
330
  - Apache-2.0
311
331
  metadata: {}
312
- post_install_message:
313
332
  rdoc_options: []
314
333
  require_paths:
315
334
  - lib
@@ -324,8 +343,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
324
343
  - !ruby/object:Gem::Version
325
344
  version: '0'
326
345
  requirements: []
327
- rubygems_version: 3.5.3
328
- signing_key:
346
+ rubygems_version: 3.6.2
329
347
  specification_version: 4
330
348
  summary: Sketch data structures for Ruby
331
349
  test_files: []