datasketches 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +4 -0
 - data/ext/datasketches/vo_wrapper.cpp +1 -1
 - data/lib/datasketches/version.rb +1 -1
 - data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
 - data/vendor/datasketches-cpp/LICENSE +35 -7
 - data/vendor/datasketches-cpp/NOTICE +2 -2
 - data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
 - data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
 - data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
 - data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
 - data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
 - data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
 - data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
 - data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
 - data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
 - data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
 - data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
 - data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
 - data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
 - data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
 - data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
 - data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
 - data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
 - data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
 - data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
 - data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
 - data/vendor/datasketches-cpp/version.cfg.in +1 -1
 - metadata +17 -9
 
| 
         @@ -0,0 +1,406 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            /*
         
     | 
| 
      
 2 
     | 
    
         
            +
             * Licensed to the Apache Software Foundation (ASF) under one
         
     | 
| 
      
 3 
     | 
    
         
            +
             * or more contributor license agreements.  See the NOTICE file
         
     | 
| 
      
 4 
     | 
    
         
            +
             * distributed with this work for additional information
         
     | 
| 
      
 5 
     | 
    
         
            +
             * regarding copyright ownership.  The ASF licenses this file
         
     | 
| 
      
 6 
     | 
    
         
            +
             * to you under the Apache License, Version 2.0 (the
         
     | 
| 
      
 7 
     | 
    
         
            +
             * "License"); you may not use this file except in compliance
         
     | 
| 
      
 8 
     | 
    
         
            +
             * with the License.  You may obtain a copy of the License at
         
     | 
| 
      
 9 
     | 
    
         
            +
             *
         
     | 
| 
      
 10 
     | 
    
         
            +
             *   http://www.apache.org/licenses/LICENSE-2.0
         
     | 
| 
      
 11 
     | 
    
         
            +
             *
         
     | 
| 
      
 12 
     | 
    
         
            +
             * Unless required by applicable law or agreed to in writing,
         
     | 
| 
      
 13 
     | 
    
         
            +
             * software distributed under the License is distributed on an
         
     | 
| 
      
 14 
     | 
    
         
            +
             * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
         
     | 
| 
      
 15 
     | 
    
         
            +
             * KIND, either express or implied.  See the License for the
         
     | 
| 
      
 16 
     | 
    
         
            +
             * specific language governing permissions and limitations
         
     | 
| 
      
 17 
     | 
    
         
            +
             * under the License.
         
     | 
| 
      
 18 
     | 
    
         
            +
             */
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            #include <vector>

            #include <catch2/catch.hpp>

            #include "bloom_filter.hpp"
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            // Directory prefix for binary test inputs: taken from the
            // TEST_BINARY_INPUT_PATH macro when it is defined, otherwise "test/".
            #ifndef TEST_BINARY_INPUT_PATH
            static std::string testBinaryInputPath = "test/";
            #else
            static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
            #endif
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            namespace datasketches {
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            // Verifies that create_by_size() rejects out-of-range arguments with
            // std::invalid_argument: zero bits, an oversized bit count, and zero hashes.
            TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
              REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
              // Shift a 64-bit unsigned value: `1L << 60` is undefined where long is 32 bits wide.
              REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(uint64_t{1} << 60, 4), std::invalid_argument);
              REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
            }
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            // Builds the same filter four ways (create_by_size, create_by_accuracy,
            // initialize_by_size, initialize_by_accuracy) and checks that each result
            // reports the expected capacity, hash count and seed, and starts out empty.
            TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
              uint64_t num_items = 4000;
              double fpp = 0.01;

              uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
              uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
              uint64_t seed = 89023;

              // round up to the nearest multiple of 64, using an explicitly 64-bit mask
              const uint64_t adjusted_num_bits = (num_bits + 63) & ~uint64_t{0x3F};

              // Backing storage for the in-place variants. Declared before bf so that it
              // outlives the filter wrapping it, and released automatically even when a
              // REQUIRE below fails.
              const size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
              std::vector<uint8_t> bytes(serialized_size_bytes);

              auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
              REQUIRE(bf.get_capacity() == adjusted_num_bits);
              REQUIRE(bf.get_num_hashes() == num_hashes);
              REQUIRE(bf.get_seed() == seed);
              REQUIRE(bf.is_empty());

              // should match above
              bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
              REQUIRE(bf.get_capacity() == adjusted_num_bits);
              REQUIRE(bf.get_num_hashes() == num_hashes);
              REQUIRE(bf.get_seed() == seed);
              REQUIRE(bf.is_empty());

              // same for initializing memory in-place
              bf = bloom_filter::builder::initialize_by_size(bytes.data(), serialized_size_bytes, num_bits, num_hashes, seed);
              REQUIRE(bf.get_capacity() == adjusted_num_bits);
              REQUIRE(bf.get_num_hashes() == num_hashes);
              REQUIRE(bf.get_seed() == seed);
              REQUIRE(bf.is_empty());

              bf = bloom_filter::builder::initialize_by_accuracy(bytes.data(), serialized_size_bytes, num_items, fpp, seed);
              REQUIRE(bf.get_capacity() == adjusted_num_bits);
              REQUIRE(bf.get_num_hashes() == num_hashes);
              REQUIRE(bf.get_seed() == seed);
              REQUIRE(bf.is_empty());
            }
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
            // Fills a heap-allocated filter and a filter initialized in caller-provided
            // memory with the same items and seed, then checks that both agree on bits
            // used, false-positive count and serialized bytes, and that both reset cleanly.
            TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
              uint64_t num_items = 5000;
              double fpp = 0.01;
              uint64_t seed = 4897301548054ULL;

              auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
              REQUIRE(bf.is_empty());
              REQUIRE(bf.get_bits_used() == 0);

              for (uint64_t i = 0; i < num_items; ++i) {
                bf.query_and_update(i);
              }

              REQUIRE(!bf.is_empty());
              // filter is about 50% full at target capacity
              // since seed is fixed we expect an exact value every time
              // but leaving the approximate test in since that's more the "expectation"
              REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
              REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice

              // count false positives among items that were never inserted
              uint32_t num_found = 0;
              for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
                if (bf.query(i)) {
                  ++num_found;
                }
              }
              // fpp is average with significant variance -- even at 12% it would fail occasionally
              REQUIRE(num_found == 423);
              //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
              auto bytes = bf.serialize();

              // initialize in memory and run the same tests
              // also checking against the results from the first part
              // The buffer is a std::vector declared before bf2 so it outlives the filter
              // wrapping it and is freed even if a REQUIRE below fails.
              std::vector<uint8_t> bf_memory(bytes.size());
              auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory.data(), bytes.size(), num_items, fpp, bf.get_seed());
              REQUIRE(bf2.is_empty());
              REQUIRE(bf2.get_bits_used() == 0);

              for (uint64_t i = 0; i < num_items; ++i) {
                bf2.query_and_update(i);
              }

              REQUIRE(!bf2.is_empty());
              REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above

              uint32_t num_found2 = 0;
              for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
                if (bf2.query(i)) {
                  ++num_found2;
                }
              }
              REQUIRE(num_found == num_found2); // should exactly match above
              auto bytes2 = bf2.serialize();

              REQUIRE(bytes.size() == bytes2.size());
              for (size_t i = 0; i < bytes.size(); ++i) {
                REQUIRE(bytes[i] == bytes2[i]);
              }

              // check that raw memory also matches serialized sketch
              const uint8_t* bf_bytes = bf2.get_wrapped_memory();
              REQUIRE(bf_bytes == bf_memory.data());
              for (size_t i = 0; i < bytes.size(); ++i) {
                REQUIRE(bf_bytes[i] == bytes[i]);
              }

              // ensure the filters reset properly
              bf.reset();
              REQUIRE(bf.is_empty());
              REQUIRE(bf.get_bits_used() == 0);

              bf2.reset();
              REQUIRE(bf2.is_empty());
              REQUIRE(bf2.get_bits_used() == 0);
            }
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
            // Inserts 500 items, inverts the filter, and checks that the used-bit count
            // is complemented, that the inserted items are now mostly reported absent,
            // and that many never-inserted items are now reported present.
            TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
              uint64_t filter_bits = 8192;
              uint16_t hash_count = 3;
              auto bf = bloom_filter::builder::create_by_size(filter_bits, hash_count);

              uint64_t inserted = 500;
              uint64_t item = 0;
              while (item < inserted) {
                bf.update(item);
                ++item;
              }

              const uint64_t bits_before = bf.get_bits_used();
              bf.invert();
              REQUIRE(bf.get_bits_used() == filter_bits - bits_before);

              // counts the items in [first, last) that the filter reports as present
              const auto count_present = [&bf](uint64_t first, uint64_t last) {
                uint32_t hits = 0;
                for (uint64_t candidate = first; candidate < last; ++candidate) {
                  if (bf.query(candidate)) {
                    ++hits;
                  }
                }
                return hits;
              };

              // original items should be mostly not-present
              const uint32_t originals_found = count_present(0, inserted);
              REQUIRE(originals_found < inserted / 10);

              // many other items should be "present"
              const uint32_t others_found = count_present(inserted, filter_bits);
              REQUIRE(others_found > inserted);
            }
         
     | 
| 
      
 189 
     | 
    
         
            +
             
     | 
| 
      
 190 
     | 
    
         
            +
            // Verifies that set operations between filters that differ in bit count,
            // hash count or seed are rejected with std::invalid_argument.
            TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
              uint64_t num_bits = 32768;
              uint16_t num_hashes = 4;

              auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
              // Reuse bf1's seed for bf2 and bf3 so each case differs from bf1 in exactly
              // one property (the compatible-filter tests in this file pass the seed
              // explicitly as well).
              const uint64_t seed = bf1.get_seed();

              // mismatched num bits
              auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes, seed);
              REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);

              // mismatched num hashes (must operate on bf3, not bf2)
              auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes, seed);
              REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument);

              // mismatched seed
              auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed + 1);
              REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
            }
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
            // Unions two compatible filters holding overlapping item ranges and checks
            // that every inserted item is reported present while the false-positive
            // count over never-inserted items stays low.
            TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
              const uint64_t num_bits = 12288;
              const uint16_t num_hashes = 4;

              auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
              auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());

              const uint64_t n = 1000;
              // largest item inserted into either filter: bf1 gets [0, n), bf2 gets [n/2, 3n/2)
              const uint64_t max_item = 3 * n / 2 - 1;
              for (uint64_t i = 0; i < n; ++i) {
                bf1.query_and_update(i);
                bf2.update(n / 2 + i);
              }

              bf1.union_with(bf2);
              // every inserted item, including max_item itself, must be present
              for (uint64_t i = 0; i <= max_item; ++i) {
                REQUIRE(bf1.query(i));
              }

              // count false positives strictly above the inserted range
              uint32_t num_found = 0;
              for (uint64_t i = max_item + 1; i < num_bits; ++i) {
                if (bf1.query(i)) {
                  ++num_found;
                }
              }
              REQUIRE(num_found < num_bits / 10); // not being super strict
            }
         
     | 
| 
      
 236 
     | 
    
         
            +
             
     | 
| 
      
 237 
     | 
    
         
            +
            // Intersects two compatible filters holding overlapping item ranges and
            // checks that every item in the overlap is reported present while few
            // items outside the overlap are.
            TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
              const uint64_t num_bits = 8192;
              const uint16_t num_hashes = 5;

              auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
              auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());

              // bf1 gets [0, n), bf2 gets [n/2, 3n/2); the overlap is [n/2, n)
              const uint64_t n = 1024;
              for (uint64_t i = 0; i < n; ++i) {
                bf1.update(i);
                bf2.update(n / 2 + i);
              }

              bf1.intersect(bf2);
              // overlap bits should all be set
              for (uint64_t i = n / 2; i < n; ++i) {
                REQUIRE(bf1.query(i));
              }

              // count hits everywhere outside the overlap: items only in bf1 ...
              uint32_t num_found = 0;
              for (uint64_t i = 0; i < n / 2; ++i) {
                if (bf1.query(i)) {
                  ++num_found;
                }
              }
              // ... and everything from n upward, which covers the items only in bf2
              // as well as items that were never inserted
              for (uint64_t i = n; i < num_bits; ++i) {
                if (bf1.query(i)) {
                  ++num_found;
                }
              }

              REQUIRE(num_found < num_bits / 10); // not being super strict
            }
         
     | 
| 
      
 271 
     | 
    
         
            +
             
     | 
| 
      
 272 
     | 
    
         
            +
            // Serializes a filter that has never been updated and verifies that every
            // supported way of reading it back (byte array, stream, read-only wrap)
            // reports the same configuration and is still empty.
            TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
              const uint64_t num_bits = 32769;
              const uint16_t num_hashes = 7;

              auto filter = bloom_filter::builder::create_by_size(num_bits, num_hashes);
              auto blob = filter.serialize();
              REQUIRE(blob.size() == filter.get_serialized_size_bytes());

              // round trip through a byte array
              auto from_bytes = bloom_filter::deserialize(blob.data(), blob.size());
              REQUIRE(filter.get_capacity() == from_bytes.get_capacity());
              REQUIRE(filter.get_seed() == from_bytes.get_seed());
              REQUIRE(filter.get_num_hashes() == from_bytes.get_num_hashes());
              REQUIRE(from_bytes.is_empty());

              // round trip through a binary stream
              std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
              filter.serialize(stream);
              auto from_stream = bloom_filter::deserialize(stream);
              REQUIRE(filter.get_capacity() == from_stream.get_capacity());
              REQUIRE(filter.get_seed() == from_stream.get_seed());
              REQUIRE(filter.get_num_hashes() == from_stream.get_num_hashes());
              REQUIRE(from_stream.is_empty());

              // wrapping the serialized image read-only is expected to succeed
              auto wrapped = bloom_filter::wrap(blob.data(), blob.size());
              REQUIRE(filter.get_capacity() == wrapped.get_capacity());
              REQUIRE(filter.get_seed() == wrapped.get_seed());
              REQUIRE(filter.get_num_hashes() == wrapped.get_num_hashes());
              REQUIRE(wrapped.is_empty());

              // wrapping the serialized image of an empty filter as writable is expected to be rejected
              REQUIRE_THROWS_AS(bloom_filter::writable_wrap(blob.data(), blob.size()), std::invalid_argument);
            }
         
     | 
| 
      
 304 
     | 
    
         
            +
             
     | 
| 
      
 305 
     | 
    
         
            +
            // Serializes a filter holding n floating-point items and verifies that every
            // supported way of reading it back (byte array, stream, read-only wrap,
            // writable wrap) reports the same configuration, contains all inserted items
            // and yields exactly the same false positives as the original filter.
            TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
              const uint64_t num_bits = 32768;
              const uint16_t num_hashes = 5;

              auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
              const uint64_t n = 1000;
              for (uint64_t i = 0; i < n; ++i) {
                bf.update(0.5 + i); // testing floats
              }

              // test more items without updating, assuming some false positives
              // so we can check that we get the same number of false positives
              // with the same query items
              uint64_t fp_count = 0;
              for (uint64_t i = n; i < num_bits; ++i) {
                fp_count += bf.query(0.5 + i) ? 1 : 0;
              }

              auto bytes = bf.serialize();
              REQUIRE(bytes.size() == bf.get_serialized_size_bytes());

              // round trip through a byte array
              auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
              REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
              REQUIRE(bf.get_seed() == bf_bytes.get_seed());
              REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
              REQUIRE(!bf_bytes.is_empty());
              // check the deserialized copy (not the original filter), mirroring the stream case below
              REQUIRE(bf_bytes.is_memory_owned());
              uint64_t fp_count_bytes = 0;
              for (uint64_t i = 0; i < num_bits; ++i) {
                bool val = bf_bytes.query(0.5 + i);
                if (i < n)
                  REQUIRE(val); // inserted items must always be found
                else if (val)
                  ++fp_count_bytes;
              }
              REQUIRE(fp_count_bytes == fp_count);

              // round trip through a binary stream
              std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
              bf.serialize(ss);
              auto bf_stream = bloom_filter::deserialize(ss);
              REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
              REQUIRE(bf.get_seed() == bf_stream.get_seed());
              REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
              REQUIRE(!bf_stream.is_empty());
              REQUIRE(bf_stream.is_memory_owned());
              uint64_t fp_count_stream = 0;
              for (uint64_t i = 0; i < num_bits; ++i) {
                bool val = bf_stream.query(0.5 + i);
                if (i < n)
                  REQUIRE(val);
                else if (val)
                  ++fp_count_stream;
              }
              REQUIRE(fp_count_stream == fp_count);

              // read-only wrap
              auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
              REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
              REQUIRE(bf.get_seed() == bf_wrap.get_seed());
              REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
              REQUIRE(!bf_wrap.is_empty());
              REQUIRE(!bf_wrap.is_memory_owned());
              uint64_t fp_count_wrap = 0;
              for (uint64_t i = 0; i < num_bits; ++i) {
                bool val = bf_wrap.query(0.5 + i);
                if (i < n)
                  REQUIRE(val);
                else if (val)
                  ++fp_count_wrap;
              }
              REQUIRE(fp_count_wrap == fp_count);
              // every mutating call on the read-only wrap is expected to throw
              REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
              REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
              REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);

              // writable wrap
              auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
              REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
              REQUIRE(bf.get_seed() == bf_writable.get_seed());
              REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
              REQUIRE(!bf_writable.is_empty());
              REQUIRE(!bf_writable.is_memory_owned());
              uint64_t fp_count_writable = 0;
              for (uint64_t i = 0; i < num_bits; ++i) {
                bool val = bf_writable.query(0.5 + i);
                if (i < n)
                  REQUIRE(val);
                else if (val)
                  ++fp_count_writable;
              }
              REQUIRE(fp_count_writable == fp_count);

              // an update through the writable wrap must become visible to queries
              REQUIRE(!bf_writable.query(-1.0));
              bf_writable.update(-1.0);
              REQUIRE(bf_writable.query(-1.0));

              // not good memory management to do this, but because we wrapped the same bytes as both
              // read-only and writable, that update should have changed the read-only version, too
              REQUIRE(bf_wrap.query(-1.0));
            }
         
     | 
| 
      
 405 
     | 
    
         
            +
             
     | 
| 
      
 406 
     | 
    
         
            +
            } // namespace datasketches
         
     | 
| 
         @@ -89,6 +89,7 @@ public: 
     | 
|
| 
       89 
89 
     | 
    
         
             
              using vector_t = std::vector<T, Allocator>;
         
     | 
| 
       90 
90 
     | 
    
         
             
              using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
         
     | 
| 
       91 
91 
     | 
    
         
             
              using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
         
     | 
| 
      
 92 
     | 
    
         
            +
              using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
         
     | 
| 
       92 
93 
     | 
    
         | 
| 
       93 
94 
     | 
    
         
             
              struct centroid_cmp {
         
     | 
| 
       94 
95 
     | 
    
         
             
                centroid_cmp() {}
         
     | 
| 
         @@ -115,7 +116,7 @@ public: 
     | 
|
| 
       115 
116 
     | 
    
         
             
               * Merge the given t-Digest into this one
         
     | 
| 
       116 
117 
     | 
    
         
             
               * @param other t-Digest to merge
         
     | 
| 
       117 
118 
     | 
    
         
             
               */
         
     | 
| 
       118 
     | 
    
         
            -
              void merge(tdigest& other);
         
     | 
| 
      
 119 
     | 
    
         
            +
              void merge(const tdigest& other);
         
     | 
| 
       119 
120 
     | 
    
         | 
| 
       120 
121 
     | 
    
         
             
              /**
         
     | 
| 
       121 
122 
     | 
    
         
             
               * Process buffered values and merge centroids if needed
         
     | 
| 
         @@ -142,8 +143,17 @@ public: 
     | 
|
| 
       142 
143 
     | 
    
         
             
               */
         
     | 
| 
       143 
144 
     | 
    
         
             
              uint64_t get_total_weight() const;
         
     | 
| 
       144 
145 
     | 
    
         | 
| 
      
 146 
     | 
    
         
            +
              /**
         
     | 
| 
      
 147 
     | 
    
         
            +
               * Returns an instance of the allocator for this t-Digest.
         
     | 
| 
      
 148 
     | 
    
         
            +
               * @return allocator
         
     | 
| 
      
 149 
     | 
    
         
            +
               */
         
     | 
| 
      
 150 
     | 
    
         
            +
              Allocator get_allocator() const;
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
       145 
152 
     | 
    
         
             
              /**
         
     | 
| 
       146 
153 
     | 
    
         
             
               * Compute approximate normalized rank of the given value.
         
     | 
| 
      
 154 
     | 
    
         
            +
               *
         
     | 
| 
      
 155 
     | 
    
         
            +
               * <p>If the sketch is empty this throws std::runtime_error.
         
     | 
| 
      
 156 
     | 
    
         
            +
               *
         
     | 
| 
       147 
157 
     | 
    
         
             
               * @param value to be ranked
         
     | 
| 
       148 
158 
     | 
    
         
             
               * @return normalized rank (from 0 to 1 inclusive)
         
     | 
| 
       149 
159 
     | 
    
         
             
               */
         
     | 
| 
         @@ -151,11 +161,49 @@ public: 
     | 
|
| 
       151 
161 
     | 
    
         | 
| 
       152 
162 
     | 
    
         
             
              /**
         
     | 
| 
       153 
163 
     | 
    
         
             
               * Compute approximate quantile value corresponding to the given normalized rank
         
     | 
| 
      
 164 
     | 
    
         
            +
               *
         
     | 
| 
      
 165 
     | 
    
         
            +
               * <p>If the sketch is empty this throws std::runtime_error.
         
     | 
| 
      
 166 
     | 
    
         
            +
               *
         
     | 
| 
       154 
167 
     | 
    
         
             
               * @param rank normalized rank (from 0 to 1 inclusive)
         
     | 
| 
       155 
168 
     | 
    
         
             
               * @return quantile value corresponding to the given rank
         
     | 
| 
       156 
169 
     | 
    
         
             
               */
         
     | 
| 
       157 
170 
     | 
    
         
             
              T get_quantile(double rank) const;
         
     | 
| 
       158 
171 
     | 
    
         | 
| 
      
 172 
     | 
    
         
            +
              /**
         
     | 
| 
      
 173 
     | 
    
         
            +
               * Returns an approximation to the Probability Mass Function (PMF) of the input stream
         
     | 
| 
      
 174 
     | 
    
         
            +
               * given a set of split points.
         
     | 
| 
      
 175 
     | 
    
         
            +
               *
         
     | 
| 
      
 176 
     | 
    
         
            +
               * <p>If the sketch is empty this throws std::runtime_error.
         
     | 
| 
      
 177 
     | 
    
         
            +
               *
         
     | 
| 
      
 178 
     | 
    
         
            +
               * @param split_points an array of <i>m</i> unique, monotonically increasing values
         
     | 
| 
      
 179 
     | 
    
         
            +
               * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
         
     | 
| 
      
 180 
     | 
    
         
            +
               *
         
     | 
| 
      
 181 
     | 
    
         
            +
               * @param size the number of split points in the array
         
     | 
| 
      
 182 
     | 
    
         
            +
               *
         
     | 
| 
      
 183 
     | 
    
         
            +
               * @return an array of m+1 doubles each of which is an approximation
         
     | 
| 
      
 184 
     | 
    
         
            +
               * to the fraction of the input stream values (the mass) that fall into one of those intervals.
         
     | 
| 
      
 185 
     | 
    
         
            +
               */
         
     | 
| 
      
 186 
     | 
    
         
            +
              vector_double get_PMF(const T* split_points, uint32_t size) const;
         
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
      
 188 
     | 
    
         
            +
              /**
         
     | 
| 
      
 189 
     | 
    
         
            +
               * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
         
     | 
| 
      
 190 
     | 
    
         
            +
               * cumulative analog of the PMF, of the input stream given a set of split points.
         
     | 
| 
      
 191 
     | 
    
         
            +
               *
         
     | 
| 
      
 192 
     | 
    
         
            +
               * <p>If the sketch is empty this throws std::runtime_error.
         
     | 
| 
      
 193 
     | 
    
         
            +
               *
         
     | 
| 
      
 194 
     | 
    
         
            +
               * @param split_points an array of <i>m</i> unique, monotonically increasing values
         
     | 
| 
      
 195 
     | 
    
         
            +
               * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
         
     | 
| 
      
 196 
     | 
    
         
            +
               *
         
     | 
| 
      
 197 
     | 
    
         
            +
               * @param size the number of split points in the array
         
     | 
| 
      
 198 
     | 
    
         
            +
               *
         
     | 
| 
      
 199 
     | 
    
         
            +
               * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
         
     | 
| 
      
 200 
     | 
    
         
            +
               * of the input stream given the split_points. The value at array position j of the returned
         
     | 
| 
      
 201 
     | 
    
         
            +
               * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
         
     | 
| 
      
 202 
     | 
    
         
            +
               * array. This can be viewed as an array of ranks of the given split points plus one more value
         
     | 
| 
      
 203 
     | 
    
         
            +
               * that is always 1.
         
     | 
| 
      
 204 
     | 
    
         
            +
               */
         
     | 
| 
      
 205 
     | 
    
         
            +
              vector_double get_CDF(const T* split_points, uint32_t size) const;
         
     | 
| 
      
 206 
     | 
    
         
            +
             
     | 
| 
       159 
207 
     | 
    
         
             
              /**
         
     | 
| 
       160 
208 
     | 
    
         
             
               * @return parameter k (compression) that was used to configure this t-Digest
         
     | 
| 
       161 
209 
     | 
    
         
             
               */
         
     | 
| 
         @@ -245,6 +293,8 @@ private: 
     | 
|
| 
       245 
293 
     | 
    
         
             
              // for compatibility with format of the reference implementation
         
     | 
| 
       246 
294 
     | 
    
         
             
              static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
         
     | 
| 
       247 
295 
     | 
    
         
             
              static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
         
     | 
| 
      
 296 
     | 
    
         
            +
             
     | 
| 
      
 297 
     | 
    
         
            +
              static inline void check_split_points(const T* values, uint32_t size);
         
     | 
| 
       248 
298 
     | 
    
         
             
            };
         
     | 
| 
       249 
299 
     | 
    
         | 
| 
       250 
300 
     | 
    
         
             
            } /* namespace datasketches */
         
     | 
| 
         @@ -20,6 +20,7 @@ 
     | 
|
| 
       20 
20 
     | 
    
         
             
            #ifndef _TDIGEST_IMPL_HPP_
         
     | 
| 
       21 
21 
     | 
    
         
             
            #define _TDIGEST_IMPL_HPP_
         
     | 
| 
       22 
22 
     | 
    
         | 
| 
      
 23 
     | 
    
         
            +
            #include <algorithm>
         
     | 
| 
       23 
24 
     | 
    
         
             
            #include <cmath>
         
     | 
| 
       24 
25 
     | 
    
         
             
            #include <sstream>
         
     | 
| 
       25 
26 
     | 
    
         | 
| 
         @@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) { 
     | 
|
| 
       43 
44 
     | 
    
         
             
            }
         
     | 
| 
       44 
45 
     | 
    
         | 
| 
       45 
46 
     | 
    
         
             
            template<typename T, typename A>
         
     | 
| 
       46 
     | 
    
         
            -
            void tdigest<T, A>::merge(tdigest& other) {
         
     | 
| 
      
 47 
     | 
    
         
            +
            void tdigest<T, A>::merge(const tdigest& other) {
         
     | 
| 
       47 
48 
     | 
    
         
             
              if (other.is_empty()) return;
         
     | 
| 
       48 
49 
     | 
    
         
             
              vector_centroid tmp(buffer_.get_allocator());
         
     | 
| 
       49 
50 
     | 
    
         
             
              tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
         
     | 
| 
         @@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const { 
     | 
|
| 
       84 
85 
     | 
    
         
             
              return centroids_weight_ + buffer_.size();
         
     | 
| 
       85 
86 
     | 
    
         
             
            }
         
     | 
| 
       86 
87 
     | 
    
         | 
| 
      
 88 
     | 
    
         
            +
            template<typename T, typename A>
         
     | 
| 
      
 89 
     | 
    
         
            +
            A tdigest<T, A>::get_allocator() const {
         
     | 
| 
      
 90 
     | 
    
         
            +
              return buffer_.get_allocator();
         
     | 
| 
      
 91 
     | 
    
         
            +
            }
         
     | 
| 
      
 92 
     | 
    
         
            +
             
     | 
| 
       87 
93 
     | 
    
         
             
            template<typename T, typename A>
         
     | 
| 
       88 
94 
     | 
    
         
             
            double tdigest<T, A>::get_rank(T value) const {
         
     | 
| 
       89 
95 
     | 
    
         
             
              if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
         
     | 
| 
         @@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const { 
     | 
|
| 
       190 
196 
     | 
    
         
             
              return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
         
     | 
| 
       191 
197 
     | 
    
         
             
            }
         
     | 
| 
       192 
198 
     | 
    
         | 
| 
      
 199 
     | 
    
         
            +
            template<typename T, typename A>
         
     | 
| 
      
 200 
     | 
    
         
            +
            auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
         
     | 
| 
      
 201 
     | 
    
         
            +
              auto buckets = get_CDF(split_points, size);
         
     | 
| 
      
 202 
     | 
    
         
            +
              for (uint32_t i = size; i > 0; --i) {
         
     | 
| 
      
 203 
     | 
    
         
            +
                buckets[i] -= buckets[i - 1];
         
     | 
| 
      
 204 
     | 
    
         
            +
              }
         
     | 
| 
      
 205 
     | 
    
         
            +
              return buckets;
         
     | 
| 
      
 206 
     | 
    
         
            +
            }
         
     | 
| 
      
 207 
     | 
    
         
            +
             
     | 
| 
      
 208 
     | 
    
         
            +
            template<typename T, typename A>
         
     | 
| 
      
 209 
     | 
    
         
            +
            auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
         
     | 
| 
      
 210 
     | 
    
         
            +
              check_split_points(split_points, size);
         
     | 
| 
      
 211 
     | 
    
         
            +
              vector_double ranks(get_allocator());
         
     | 
| 
      
 212 
     | 
    
         
            +
              ranks.reserve(size + 1);
         
     | 
| 
      
 213 
     | 
    
         
            +
              for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
         
     | 
| 
      
 214 
     | 
    
         
            +
              ranks.push_back(1);
         
     | 
| 
      
 215 
     | 
    
         
            +
              return ranks;
         
     | 
| 
      
 216 
     | 
    
         
            +
            }
         
     | 
| 
      
 217 
     | 
    
         
            +
             
     | 
| 
       193 
218 
     | 
    
         
             
            template<typename T, typename A>
         
     | 
| 
       194 
219 
     | 
    
         
             
            uint16_t tdigest<T, A>::get_k() const {
         
     | 
| 
       195 
220 
     | 
    
         
             
              return k_;
         
     | 
| 
         @@ -590,6 +615,18 @@ buffer_(std::move(buffer)) 
     | 
|
| 
       590 
615 
     | 
    
         
             
              buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
         
     | 
| 
       591 
616 
     | 
    
         
             
            }
         
     | 
| 
       592 
617 
     | 
    
         | 
| 
      
 618 
     | 
    
         
            +
            template<typename T, typename A>
         
     | 
| 
      
 619 
     | 
    
         
            +
            void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
         
     | 
| 
      
 620 
     | 
    
         
            +
              for (uint32_t i = 0; i < size ; i++) {
         
     | 
| 
      
 621 
     | 
    
         
            +
                if (std::isnan(values[i])) {
         
     | 
| 
      
 622 
     | 
    
         
            +
                  throw std::invalid_argument("Values must not be NaN");
         
     | 
| 
      
 623 
     | 
    
         
            +
                }
         
     | 
| 
      
 624 
     | 
    
         
            +
                if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
         
     | 
| 
      
 625 
     | 
    
         
            +
                  throw std::invalid_argument("Values must be unique and monotonically increasing");
         
     | 
| 
      
 626 
     | 
    
         
            +
                }
         
     | 
| 
      
 627 
     | 
    
         
            +
              }
         
     | 
| 
      
 628 
     | 
    
         
            +
            }
         
     | 
| 
      
 629 
     | 
    
         
            +
             
     | 
| 
       593 
630 
     | 
    
         
             
            } /* namespace datasketches */
         
     | 
| 
       594 
631 
     | 
    
         | 
| 
       595 
632 
     | 
    
         
             
            #endif // _TDIGEST_IMPL_HPP_
         
     | 
| 
         @@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") { 
     | 
|
| 
       35 
35 
     | 
    
         
             
              REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
         
     | 
| 
       36 
36 
     | 
    
         
             
              REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
         
     | 
| 
       37 
37 
     | 
    
         
             
              REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
         
     | 
| 
      
 38 
     | 
    
         
            +
              const double split_points[1] {0};
         
     | 
| 
      
 39 
     | 
    
         
            +
              REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
         
     | 
| 
      
 40 
     | 
    
         
            +
              REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
         
     | 
| 
       38 
41 
     | 
    
         
             
            }
         
     | 
| 
       39 
42 
     | 
    
         | 
| 
       40 
43 
     | 
    
         
             
            TEST_CASE("one value", "[tdigest]") {
         
     | 
| 
         @@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") { 
     | 
|
| 
       56 
59 
     | 
    
         
             
              const size_t n = 10000;
         
     | 
| 
       57 
60 
     | 
    
         
             
              tdigest_double td;
         
     | 
| 
       58 
61 
     | 
    
         
             
              for (size_t i = 0; i < n; ++i) td.update(i);
         
     | 
| 
       59 
     | 
    
         
            -
            //  std::cout << td.to_string(true);
         
     | 
| 
       60 
     | 
    
         
            -
            //  td.compress();
         
     | 
| 
       61 
     | 
    
         
            -
            //  std::cout << td.to_string(true);
         
     | 
| 
       62 
62 
     | 
    
         
             
              REQUIRE_FALSE(td.is_empty());
         
     | 
| 
       63 
63 
     | 
    
         
             
              REQUIRE(td.get_total_weight() == n);
         
     | 
| 
       64 
64 
     | 
    
         
             
              REQUIRE(td.get_min_value() == 0);
         
     | 
| 
         @@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") { 
     | 
|
| 
       73 
73 
     | 
    
         
             
              REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
         
     | 
| 
       74 
74 
     | 
    
         
             
              REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
         
     | 
| 
       75 
75 
     | 
    
         
             
              REQUIRE(td.get_quantile(1) == n - 1);
         
     | 
| 
      
 76 
     | 
    
         
            +
              const double split_points[1] {n / 2};
         
     | 
| 
      
 77 
     | 
    
         
            +
              const auto pmf = td.get_PMF(split_points, 1);
         
     | 
| 
      
 78 
     | 
    
         
            +
              REQUIRE(pmf.size() == 2);
         
     | 
| 
      
 79 
     | 
    
         
            +
              REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
         
     | 
| 
      
 80 
     | 
    
         
            +
              REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
         
     | 
| 
      
 81 
     | 
    
         
            +
              const auto cdf = td.get_CDF(split_points, 1);
         
     | 
| 
      
 82 
     | 
    
         
            +
              REQUIRE(cdf.size() == 2);
         
     | 
| 
      
 83 
     | 
    
         
            +
              REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
         
     | 
| 
      
 84 
     | 
    
         
            +
              REQUIRE(cdf[1] == 1);
         
     | 
| 
       76 
85 
     | 
    
         
             
            }
         
     | 
| 
       77 
86 
     | 
    
         | 
| 
       78 
87 
     | 
    
         
             
            TEST_CASE("rank - two values", "[tdigest]") {
         
     | 
| 
         @@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) { 
     | 
|
| 
       329 
329 
     | 
    
         | 
| 
       330 
330 
     | 
    
         
             
              *ptr++ = static_cast<uint8_t>(values[3] >> 4);
         
     | 
| 
       331 
331 
     | 
    
         | 
| 
       332 
     | 
    
         
            -
              *ptr = static_cast<uint8_t>(values[3]  
     | 
| 
      
 332 
     | 
    
         
            +
              *ptr = static_cast<uint8_t>(values[3] << 4);
         
     | 
| 
       333 
333 
     | 
    
         
             
              *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
         
     | 
| 
       334 
334 
     | 
    
         | 
| 
       335 
335 
     | 
    
         
             
              *ptr++ = static_cast<uint8_t>(values[4] >> 1);
         
     | 
| 
         @@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) { 
     | 
|
| 
       4227 
4227 
     | 
    
         
             
              values[6] |= *ptr >> 1;
         
     | 
| 
       4228 
4228 
     | 
    
         | 
| 
       4229 
4229 
     | 
    
         
             
              values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
         
     | 
| 
       4230 
     | 
    
         
            -
              values[7] |= *ptr++ << 24;
         
     | 
| 
      
 4230 
     | 
    
         
            +
              values[7] |= static_cast<uint64_t>(*ptr++) << 24;
         
     | 
| 
       4231 
4231 
     | 
    
         
             
              values[7] |= *ptr++ << 16;
         
     | 
| 
       4232 
4232 
     | 
    
         
             
              values[7] |= *ptr++ << 8;
         
     | 
| 
       4233 
4233 
     | 
    
         
             
              values[7] |= *ptr;
         
     | 
| 
         @@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) { 
     | 
|
| 
       4296 
4296 
     | 
    
         
             
              values[1] |= *ptr++ << 6;
         
     | 
| 
       4297 
4297 
     | 
    
         
             
              values[1] |= *ptr >> 2;
         
     | 
| 
       4298 
4298 
     | 
    
         | 
| 
       4299 
     | 
    
         
            -
              values[2] = static_cast<uint64_t>(*ptr++ &  
     | 
| 
      
 4299 
     | 
    
         
            +
              values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
         
     | 
| 
       4300 
4300 
     | 
    
         
             
              values[2] |= static_cast<uint64_t>(*ptr++) << 25;
         
     | 
| 
       4301 
4301 
     | 
    
         
             
              values[2] |= *ptr++ << 17;
         
     | 
| 
       4302 
4302 
     | 
    
         
             
              values[2] |= *ptr++ << 9;
         
     | 
| 
         @@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_ 
     | 
|
| 
       6201 
6201 
     | 
    
         
             
                case 61: pack_bits_61(values, ptr); break;
         
     | 
| 
       6202 
6202 
     | 
    
         
             
                case 62: pack_bits_62(values, ptr); break;
         
     | 
| 
       6203 
6203 
     | 
    
         
             
                case 63: pack_bits_63(values, ptr); break;
         
     | 
| 
       6204 
     | 
    
         
            -
                default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
         
     | 
| 
      
 6204 
     | 
    
         
            +
                default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
         
     | 
| 
       6205 
6205 
     | 
    
         
             
              }
         
     | 
| 
       6206 
6206 
     | 
    
         
             
            }
         
     | 
| 
       6207 
6207 
     | 
    
         | 
| 
         @@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint 
     | 
|
| 
       6270 
6270 
     | 
    
         
             
                case 61: unpack_bits_61(values, ptr); break;
         
     | 
| 
       6271 
6271 
     | 
    
         
             
                case 62: unpack_bits_62(values, ptr); break;
         
     | 
| 
       6272 
6272 
     | 
    
         
             
                case 63: unpack_bits_63(values, ptr); break;
         
     | 
| 
       6273 
     | 
    
         
            -
                default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
         
     | 
| 
      
 6273 
     | 
    
         
            +
                default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
         
     | 
| 
       6274 
6274 
     | 
    
         
             
              }
         
     | 
| 
       6275 
6275 
     | 
    
         
             
            }
         
     | 
| 
       6276 
6276 
     | 
    
         |