datasketches 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -24,11 +24,11 @@
24
24
  #include <vector>
25
25
  #include <stdexcept>
26
26
 
27
- #include "serde.hpp"
28
27
  #include "binomial_bounds.hpp"
29
28
  #include "theta_helpers.hpp"
30
29
  #include "count_zeros.hpp"
31
30
  #include "bit_packing.hpp"
31
+ #include "memory_operations.hpp"
32
32
 
33
33
  namespace datasketches {
34
34
 
@@ -341,6 +341,39 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
341
341
  template<typename A>
342
342
  void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
343
343
 
344
+ template<typename A>
345
+ uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
346
+ if (compressed) {
347
+ return this->is_estimation_mode() ? 2 : 1;
348
+ }
349
+ return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
350
+ }
351
+
352
+ template<typename A>
353
+ size_t compact_theta_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
354
+ return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
355
+ }
356
+
357
+ template<typename A>
358
+ size_t compact_theta_sketch_alloc<A>::get_serialized_size_bytes(bool compressed) const {
359
+ if (compressed && is_suitable_for_compression()) {
360
+ return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
361
+ }
362
+ return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
363
+ }
364
+
365
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
366
+ template<typename A>
367
+ uint8_t compact_theta_sketch_alloc<A>::get_num_entries_bytes() const {
368
+ return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
369
+ }
370
+
371
+ template<typename A>
372
+ size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
373
+ const size_t compressed_bits = entry_bits * entries_.size();
374
+ return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
375
+ }
376
+
344
377
  template<typename A>
345
378
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
346
379
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
@@ -366,12 +399,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
366
399
 
367
400
  template<typename A>
368
401
  auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
369
- const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
370
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
371
- + sizeof(uint64_t) * entries_.size();
402
+ const size_t size = get_serialized_size_bytes() + header_size_bytes;
372
403
  vector_bytes bytes(size, 0, entries_.get_allocator());
373
404
  uint8_t* ptr = bytes.data() + header_size_bytes;
374
-
405
+ const uint8_t preamble_longs = get_preamble_longs(false);
375
406
  *ptr++ = preamble_longs;
376
407
  *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
377
408
  *ptr++ = SKETCH_TYPE;
@@ -413,7 +444,7 @@ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
413
444
  }
414
445
 
415
446
  template<typename A>
416
- uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
447
+ uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
417
448
  // compression is based on leading zeros in deltas between ordered hash values
418
449
  // assumes ordered sketch
419
450
  uint64_t previous = 0;
@@ -423,16 +454,14 @@ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
423
454
  ored |= delta;
424
455
  previous = entry;
425
456
  }
426
- return count_leading_zeros_in_u64(ored);
457
+ return 64 - count_leading_zeros_in_u64(ored);
427
458
  }
428
459
 
429
460
  template<typename A>
430
461
  void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
431
462
  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
432
- const uint8_t entry_bits = 64 - compute_min_leading_zeros();
433
-
434
- // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
435
- const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
463
+ const uint8_t entry_bits = compute_entry_bits();
464
+ const uint8_t num_entries_bytes = get_num_entries_bytes();
436
465
 
437
466
  write(os, preamble_longs);
438
467
  write(os, COMPRESSED_SERIAL_VERSION);
@@ -477,25 +506,20 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
477
506
  previous = entries_[i];
478
507
  offset = pack_bits(delta, entry_bits, ptr, offset);
479
508
  }
509
+ if (offset > 0) ++ptr;
480
510
  write(os, buffer.data(), ptr - buffer.data());
481
511
  }
482
512
  }
483
513
 
484
514
  template<typename A>
485
515
  auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
486
- const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
487
- const uint8_t entry_bits = 64 - compute_min_leading_zeros();
488
- const size_t compressed_bits = entry_bits * entries_.size();
489
-
490
- // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
491
- const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
492
-
493
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
494
- + whole_bytes_to_hold_bits(compressed_bits);
516
+ const uint8_t entry_bits = compute_entry_bits();
517
+ const uint8_t num_entries_bytes = get_num_entries_bytes();
518
+ const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
495
519
  vector_bytes bytes(size, 0, entries_.get_allocator());
496
520
  uint8_t* ptr = bytes.data() + header_size_bytes;
497
521
 
498
- *ptr++ = preamble_longs;
522
+ *ptr++ = get_preamble_longs(true);
499
523
  *ptr++ = COMPRESSED_SERIAL_VERSION;
500
524
  *ptr++ = SKETCH_TYPE;
501
525
  *ptr++ = entry_bits;
@@ -30,29 +30,30 @@ static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
30
30
 
31
31
  TEST_CASE("pack unpack bits") {
32
32
  for (uint8_t bits = 1; bits <= 63; ++bits) {
33
+ int n = 8;
33
34
  const uint64_t mask = (1ULL << bits) - 1;
34
- std::vector<uint64_t> input(8, 0);
35
+ std::vector<uint64_t> input(n, 0);
35
36
  const uint64_t igolden64 = IGOLDEN64;
36
37
  uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
37
- for (int i = 0; i < 8; ++i) {
38
+ for (int i = 0; i < n; ++i) {
38
39
  input[i] = value & mask;
39
40
  value += igolden64;
40
41
  }
41
- std::vector<uint8_t> bytes(8 * sizeof(uint64_t), 0);
42
+ std::vector<uint8_t> bytes(n * sizeof(uint64_t), 0);
42
43
  uint8_t offset = 0;
43
44
  uint8_t* ptr = bytes.data();
44
- for (int i = 0; i < 8; ++i) {
45
+ for (int i = 0; i < n; ++i) {
45
46
  offset = pack_bits(input[i], bits, ptr, offset);
46
47
  }
47
48
 
48
- std::vector<uint64_t> output(8, 0);
49
+ std::vector<uint64_t> output(n, 0);
49
50
  offset = 0;
50
51
  const uint8_t* cptr = bytes.data();
51
- for (int i = 0; i < 8; ++i) {
52
+ for (int i = 0; i < n; ++i) {
52
53
  offset = unpack_bits(output[i], bits, cptr, offset);
53
54
  }
54
- for (int i = 0; i < 8; ++i) {
55
- REQUIRE((input[i] & mask) == output[i]);
55
+ for (int i = 0; i < n; ++i) {
56
+ REQUIRE(input[i] == output[i]);
56
57
  }
57
58
  }
58
59
  }
@@ -45,6 +45,23 @@ TEST_CASE("theta sketch", "[serde_compat]") {
45
45
  }
46
46
  }
47
47
 
48
+ TEST_CASE("theta sketch compressed", "[serde_compat]") {
49
+ const unsigned n_arr[] = {10, 100, 1000, 10000, 100000, 1000000};
50
+ for (const unsigned n: n_arr) {
51
+ std::ifstream is;
52
+ is.exceptions(std::ios::failbit | std::ios::badbit);
53
+ is.open(testBinaryInputPath + "theta_compressed_n" + std::to_string(n) + "_java.sk", std::ios::binary);
54
+ const auto sketch = compact_theta_sketch::deserialize(is);
55
+ REQUIRE(sketch.is_estimation_mode() == (n > 1000));
56
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
57
+ for (const auto hash: sketch) {
58
+ REQUIRE(hash < sketch.get_theta64());
59
+ }
60
+ REQUIRE(sketch.is_ordered());
61
+ REQUIRE(std::is_sorted(sketch.begin(), sketch.end()));
62
+ }
63
+ }
64
+
48
65
  TEST_CASE("theta sketch non-empty no entries", "[serde_compat]") {
49
66
  std::ifstream is;
50
67
  is.exceptions(std::ios::failbit | std::ios::badbit);
@@ -43,7 +43,7 @@ TEST_CASE("theta sketch generate compressed", "[serialize_for_java]") {
43
43
  REQUIRE_FALSE(sketch.is_empty());
44
44
  REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
45
45
  std::ofstream os("theta_compressed_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
46
- sketch.compact().serialize(os);
46
+ sketch.compact().serialize_compressed(os);
47
47
  }
48
48
  }
49
49
 
@@ -273,9 +273,11 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
273
273
  for (int i = 0; i < n; i++) update_sketch.update(i);
274
274
 
275
275
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
276
- update_sketch.compact().serialize(s);
277
- auto bytes = update_sketch.compact().serialize();
276
+ auto compact_sketch = update_sketch.compact();
277
+ compact_sketch.serialize(s);
278
+ auto bytes = compact_sketch.serialize();
278
279
  REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
280
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes());
279
281
  for (size_t i = 0; i < bytes.size(); ++i) {
280
282
  REQUIRE(((char*)bytes.data())[i] == (char)s.get());
281
283
  }
@@ -515,12 +517,54 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
515
517
  }
516
518
  }
517
519
 
520
+ TEST_CASE("theta sketch: serialize deserialize small compressed", "[theta_sketch]") {
521
+ auto update_sketch = update_theta_sketch::builder().build();
522
+ for (int i = 0; i < 10; i++) update_sketch.update(i);
523
+ auto compact_sketch = update_sketch.compact();
524
+
525
+ auto bytes = compact_sketch.serialize_compressed();
526
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
527
+ { // deserialize bytes
528
+ auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
529
+ REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
530
+ REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
531
+ auto iter = deserialized_sketch.begin();
532
+ for (const auto key: compact_sketch) {
533
+ REQUIRE(*iter == key);
534
+ ++iter;
535
+ }
536
+ }
537
+ { // wrap bytes
538
+ auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
539
+ REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
540
+ REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
541
+ auto iter = wrapped_sketch.begin();
542
+ for (const auto key: compact_sketch) {
543
+ REQUIRE(*iter == key);
544
+ ++iter;
545
+ }
546
+ }
547
+
548
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
549
+ compact_sketch.serialize_compressed(s);
550
+ REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
551
+ auto deserialized_sketch = compact_theta_sketch::deserialize(s);
552
+ REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
553
+ REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
554
+ auto iter = deserialized_sketch.begin();
555
+ for (const auto key: compact_sketch) {
556
+ REQUIRE(*iter == key);
557
+ ++iter;
558
+ }
559
+ }
560
+
518
561
  TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
519
562
  auto update_sketch = update_theta_sketch::builder().build();
520
563
  for (int i = 0; i < 10000; i++) update_sketch.update(i);
521
564
  auto compact_sketch = update_sketch.compact();
522
565
 
523
566
  auto bytes = compact_sketch.serialize_compressed();
567
+ REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
524
568
  { // deserialize bytes
525
569
  auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
526
570
  REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
@@ -544,6 +588,7 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
544
588
 
545
589
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
546
590
  compact_sketch.serialize_compressed(s);
591
+ REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
547
592
  auto deserialized_sketch = compact_theta_sketch::deserialize(s);
548
593
  REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
549
594
  REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
@@ -554,4 +599,30 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
554
599
  }
555
600
  }
556
601
 
602
+ // The sketch reaches capacity for the first time at 2 * K * 15/16,
603
+ // but at that point it is still in exact mode, so the serialized size is not the maximum
604
+ // (theta is not serialized in the exact mode).
605
+ // So we need to catch the second time, but some updates will be ignored in the estimation mode,
606
+ // so we update more than enough times keeping track of the maximum.
607
+ // Potentially the exact number of updates to reach the peak can be figured out given this particular sequence,
608
+ // but not assuming that might be even better (say, in case we change the load factor or hash function
609
+ // or just out of principle not to rely on implementation details too much).
610
+ TEST_CASE("max serialized size", "[theta_sketch]") {
611
+ const uint8_t lg_k = 10;
612
+ auto sketch = update_theta_sketch::builder().set_lg_k(lg_k).build();
613
+ int value = 0;
614
+
615
+ // this will go over the first peak, which is not the highest
616
+ for (int i = 0; i < (1 << lg_k) * 2; ++i) sketch.update(value++);
617
+
618
+ // this will go over the second peak, keeping track of the max size
619
+ size_t max_size_bytes = 0;
620
+ for (int i = 0; i < (1 << lg_k) * 2; ++i) {
621
+ sketch.update(value++);
622
+ auto bytes = sketch.compact().serialize();
623
+ max_size_bytes = std::max(max_size_bytes, bytes.size());
624
+ }
625
+ REQUIRE(max_size_bytes == compact_theta_sketch::get_max_serialized_size_bytes(lg_k));
626
+ }
627
+
557
628
  } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(tuple
30
30
  )
31
31
 
32
32
  target_link_libraries(tuple INTERFACE common theta)
33
- target_compile_features(tuple INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS tuple
36
35
  EXPORT ${PROJECT_NAME}
@@ -381,6 +381,15 @@ public:
381
381
  */
382
382
  compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
383
383
 
384
+ /**
385
+ * Produces a Compact Tuple sketch from this sketch
386
+ * by applying a given predicate to each entry.
387
+ * @param predicate should return true for the entries to keep
388
+ * @return compact sketch with the entries retained according to the predicate
389
+ */
390
+ template<typename Predicate>
391
+ compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
392
+
384
393
  virtual iterator begin();
385
394
  virtual iterator end();
386
395
  virtual const_iterator begin() const;
@@ -480,6 +489,25 @@ public:
480
489
  virtual uint32_t get_num_retained() const;
481
490
  virtual uint16_t get_seed_hash() const;
482
491
 
492
+ /**
493
+ * Produces a Compact Tuple sketch from this sketch
494
+ * by applying a given predicate to each entry.
495
+ * @param predicate should return true for the entries to keep
496
+ * @return compact sketch with the entries retained according to the predicate
497
+ */
498
+ template<typename Predicate>
499
+ compact_tuple_sketch filter(const Predicate& predicate) const;
500
+
501
+ /**
502
+ * Produces a Compact Tuple sketch from a given sketch (Update or Compact)
503
+ * by applying a given predicate to each entry.
504
+ * @param sketch input sketch
505
+ * @param predicate should return true for the entries to keep
506
+ * @return compact sketch with the entries retained according to the predicate
507
+ */
508
+ template<typename Sketch, typename Predicate>
509
+ static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
510
+
483
511
  /**
484
512
  * This method serializes the sketch into a given stream in a binary form
485
513
  * @param os output stream
@@ -579,7 +607,6 @@ protected:
579
607
  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
580
608
  template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
581
609
  compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
582
-
583
610
  };
584
611
 
585
612
  /// Tuple base builder
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
258
258
  return compact_tuple_sketch<S, A>(*this, ordered);
259
259
  }
260
260
 
261
+ template<typename S, typename U, typename P, typename A>
262
+ template<typename Predicate>
263
+ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
264
+ return compact_tuple_sketch<S, A>::filter(*this, predicate);
265
+ }
266
+
261
267
  template<typename S, typename U, typename P, typename A>
262
268
  void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
263
269
  os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
344
350
  return seed_hash_;
345
351
  }
346
352
 
353
+ template<typename S, typename A>
354
+ template<typename Predicate>
355
+ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
356
+ return filter(*this, predicate);
357
+ }
358
+
359
+ template<typename S, typename A>
360
+ template<typename Sketch, typename Predicate>
361
+ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
362
+ std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
363
+ entries.reserve(sketch.get_num_retained());
364
+ std::copy_if(
365
+ sketch.begin(),
366
+ sketch.end(),
367
+ std::back_inserter(entries),
368
+ [&predicate](const Entry& e) {return predicate(e.second);}
369
+ );
370
+ entries.shrink_to_fit();
371
+ return compact_tuple_sketch(
372
+ !sketch.is_estimation_mode() && entries.empty(),
373
+ sketch.is_ordered(),
374
+ sketch.get_seed_hash(),
375
+ sketch.get_theta64(),
376
+ std::move(entries)
377
+ );
378
+ }
379
+
347
380
  // implementation for fixed-size arithmetic types (integral and floating point)
348
381
  template<typename S, typename A>
349
382
  template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
310
310
  REQUIRE(sketch.get_num_retained() == 3);
311
311
  }
312
312
 
313
+ TEST_CASE("filter", "[tuple_sketch]") {
314
+ auto usk = update_tuple_sketch<int>::builder().build();
315
+
316
+ { // empty update sketch
317
+ auto sk = usk.filter([](int){return true;});
318
+ REQUIRE(sk.is_empty());
319
+ REQUIRE(sk.is_ordered());
320
+ REQUIRE(sk.get_num_retained() == 0);
321
+ }
322
+
323
+ { // empty compact sketch
324
+ auto sk = usk.compact().filter([](int){return true;});
325
+ REQUIRE(sk.is_empty());
326
+ REQUIRE(sk.is_ordered());
327
+ REQUIRE(sk.get_num_retained() == 0);
328
+ }
329
+
330
+ usk.update(1, 1);
331
+ usk.update(1, 1);
332
+ usk.update(2, 1);
333
+ usk.update(2, 1);
334
+ usk.update(3, 1);
335
+
336
+ { // exact mode update sketch
337
+ auto sk = usk.filter([](int v){return v > 1;});
338
+ REQUIRE_FALSE(sk.is_empty());
339
+ REQUIRE_FALSE(sk.is_ordered());
340
+ REQUIRE_FALSE(sk.is_estimation_mode());
341
+ REQUIRE(sk.get_num_retained() == 2);
342
+ }
343
+
344
+ { // exact mode compact sketch
345
+ auto sk = usk.compact().filter([](int v){return v > 1;});
346
+ REQUIRE_FALSE(sk.is_empty());
347
+ REQUIRE(sk.is_ordered());
348
+ REQUIRE_FALSE(sk.is_estimation_mode());
349
+ REQUIRE(sk.get_num_retained() == 2);
350
+ }
351
+
352
+ // only keys 1 and 2 had values of 2, which will become 3 after this update
353
+ // some entries are discarded in estimation mode, but these happen to survive
354
+ // the process is deterministic, so the test will always work
355
+ for (int i = 0; i < 10000; ++i) usk.update(i, 1);
356
+
357
+ { // estimation mode update sketch
358
+ auto sk = usk.filter([](int v){return v > 2;});
359
+ REQUIRE_FALSE(sk.is_empty());
360
+ REQUIRE_FALSE(sk.is_ordered());
361
+ REQUIRE(sk.is_estimation_mode());
362
+ REQUIRE(sk.get_num_retained() == 2);
363
+ }
364
+
365
+ { // estimation mode compact sketch
366
+ auto sk = usk.compact().filter([](int v){return v > 2;});
367
+ REQUIRE_FALSE(sk.is_empty());
368
+ REQUIRE(sk.is_ordered());
369
+ REQUIRE(sk.is_estimation_mode());
370
+ REQUIRE(sk.get_num_retained() == 2);
371
+ }
372
+ }
373
+
313
374
  } /* namespace datasketches */
@@ -1 +1 @@
1
- 5.0.2
1
+ 5.1.0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datasketches
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-13 00:00:00.000000000 Z
11
+ date: 2024-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -231,6 +231,16 @@ files:
231
231
  - vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp
232
232
  - vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp
233
233
  - vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp
234
+ - vendor/datasketches-cpp/tdigest/CMakeLists.txt
235
+ - vendor/datasketches-cpp/tdigest/include/tdigest.hpp
236
+ - vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp
237
+ - vendor/datasketches-cpp/tdigest/test/CMakeLists.txt
238
+ - vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp
239
+ - vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp
240
+ - vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk
241
+ - vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk
242
+ - vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp
243
+ - vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp
234
244
  - vendor/datasketches-cpp/theta/CMakeLists.txt
235
245
  - vendor/datasketches-cpp/theta/include/bit_packing.hpp
236
246
  - vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
@@ -324,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
324
334
  - !ruby/object:Gem::Version
325
335
  version: '0'
326
336
  requirements: []
327
- rubygems_version: 3.5.3
337
+ rubygems_version: 3.5.11
328
338
  signing_key:
329
339
  specification_version: 4
330
340
  summary: Sketch data structures for Ruby