rrudb 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/README.md +26 -0
- data/examples/example.rb +39 -0
- data/ext/rudb/NuDB/include/nudb/CMakeLists.txt +104 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/basic_seconds_clock.hpp +200 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/chrono_util.hpp +58 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/test/fail_file.hpp +343 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/test/temp_dir.hpp +73 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/test/test_store.hpp +451 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/test/xor_shift_engine.hpp +105 -0
- data/ext/rudb/NuDB/include/nudb/_experimental/util.hpp +288 -0
- data/ext/rudb/NuDB/include/nudb/basic_store.hpp +461 -0
- data/ext/rudb/NuDB/include/nudb/concepts.hpp +205 -0
- data/ext/rudb/NuDB/include/nudb/context.hpp +144 -0
- data/ext/rudb/NuDB/include/nudb/create.hpp +117 -0
- data/ext/rudb/NuDB/include/nudb/detail/arena.hpp +296 -0
- data/ext/rudb/NuDB/include/nudb/detail/bucket.hpp +473 -0
- data/ext/rudb/NuDB/include/nudb/detail/buffer.hpp +86 -0
- data/ext/rudb/NuDB/include/nudb/detail/bulkio.hpp +196 -0
- data/ext/rudb/NuDB/include/nudb/detail/cache.hpp +236 -0
- data/ext/rudb/NuDB/include/nudb/detail/endian.hpp +93 -0
- data/ext/rudb/NuDB/include/nudb/detail/field.hpp +265 -0
- data/ext/rudb/NuDB/include/nudb/detail/format.hpp +630 -0
- data/ext/rudb/NuDB/include/nudb/detail/gentex.hpp +259 -0
- data/ext/rudb/NuDB/include/nudb/detail/mutex.hpp +26 -0
- data/ext/rudb/NuDB/include/nudb/detail/pool.hpp +243 -0
- data/ext/rudb/NuDB/include/nudb/detail/store_base.hpp +45 -0
- data/ext/rudb/NuDB/include/nudb/detail/stream.hpp +149 -0
- data/ext/rudb/NuDB/include/nudb/detail/xxhash.hpp +328 -0
- data/ext/rudb/NuDB/include/nudb/error.hpp +257 -0
- data/ext/rudb/NuDB/include/nudb/file.hpp +55 -0
- data/ext/rudb/NuDB/include/nudb/impl/basic_store.ipp +785 -0
- data/ext/rudb/NuDB/include/nudb/impl/context.ipp +241 -0
- data/ext/rudb/NuDB/include/nudb/impl/create.ipp +163 -0
- data/ext/rudb/NuDB/include/nudb/impl/error.ipp +175 -0
- data/ext/rudb/NuDB/include/nudb/impl/posix_file.ipp +248 -0
- data/ext/rudb/NuDB/include/nudb/impl/recover.ipp +209 -0
- data/ext/rudb/NuDB/include/nudb/impl/rekey.ipp +248 -0
- data/ext/rudb/NuDB/include/nudb/impl/verify.ipp +634 -0
- data/ext/rudb/NuDB/include/nudb/impl/visit.ipp +96 -0
- data/ext/rudb/NuDB/include/nudb/impl/win32_file.ipp +264 -0
- data/ext/rudb/NuDB/include/nudb/native_file.hpp +76 -0
- data/ext/rudb/NuDB/include/nudb/nudb.hpp +27 -0
- data/ext/rudb/NuDB/include/nudb/posix_file.hpp +228 -0
- data/ext/rudb/NuDB/include/nudb/progress.hpp +32 -0
- data/ext/rudb/NuDB/include/nudb/recover.hpp +73 -0
- data/ext/rudb/NuDB/include/nudb/rekey.hpp +110 -0
- data/ext/rudb/NuDB/include/nudb/store.hpp +27 -0
- data/ext/rudb/NuDB/include/nudb/type_traits.hpp +63 -0
- data/ext/rudb/NuDB/include/nudb/verify.hpp +200 -0
- data/ext/rudb/NuDB/include/nudb/version.hpp +21 -0
- data/ext/rudb/NuDB/include/nudb/visit.hpp +63 -0
- data/ext/rudb/NuDB/include/nudb/win32_file.hpp +246 -0
- data/ext/rudb/NuDB/include/nudb/xxhasher.hpp +45 -0
- data/ext/rudb/extconf.rb +12 -0
- data/ext/rudb/rudb.cpp +234 -0
- data/lib/rudb/version.rb +3 -0
- data/lib/rudb.rb +1 -0
- metadata +104 -0
@@ -0,0 +1,634 @@
|
|
1
|
+
//
|
2
|
+
// Copyright (c) 2015-2016 Vinnie Falco (vinnie dot falco at gmail dot com)
|
3
|
+
//
|
4
|
+
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
5
|
+
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
6
|
+
//
|
7
|
+
|
8
|
+
#ifndef NUDB_IMPL_VERIFY_IPP
|
9
|
+
#define NUDB_IMPL_VERIFY_IPP
|
10
|
+
|
11
|
+
#include <nudb/concepts.hpp>
|
12
|
+
#include <nudb/native_file.hpp>
|
13
|
+
#include <nudb/type_traits.hpp>
|
14
|
+
#include <nudb/detail/bucket.hpp>
|
15
|
+
#include <nudb/detail/bulkio.hpp>
|
16
|
+
#include <nudb/detail/format.hpp>
|
17
|
+
#include <boost/core/ignore_unused.hpp>
|
18
|
+
#include <algorithm>
|
19
|
+
#include <cstddef>
|
20
|
+
#include <limits>
|
21
|
+
#include <string>
|
22
|
+
|
23
|
+
namespace nudb {
|
24
|
+
|
25
|
+
namespace detail {
|
26
|
+
|
27
|
+
// Normal verify that does not require a buffer
//
// Algorithm 0: makes one pass over the data file, and for every data
// record performs a bucket lookup (plus spill-chain follow) in the key
// file; then makes one pass over the key file re-reading each record's
// header from the data file. Results are accumulated into `info`;
// on failure `ec` is set and `info` is left partially filled.
//
// Parameters:
//   info     - receives statistics (value/key counts, spill totals, etc.)
//   df       - open data file
//   kf       - open key file
//   dh       - data file header (unused here; kept for symmetry)
//   kh       - key file header, supplies geometry (block size, buckets, salt)
//   progress - callable progress(amount, total)
//   ec       - set on error
template<
    class Hasher,
    class File,
    class Progress>
void
verify_normal(
    verify_info& info,
    File& df,
    File& kf,
    dat_file_header& dh,
    key_file_header& kh,
    Progress&& progress,
    error_code& ec)
{
    static_assert(is_File<File>::value,
        "File requirements not met");
    static_assert(is_Hasher<Hasher>::value,
        "Hasher requirements not met");
    static_assert(is_Progress<Progress>::value,
        "Progress requirements not met");
    boost::ignore_unused(dh);
    info.algorithm = 0;
    // Sequential read chunk for the bulk reader
    auto const readSize = 1024 * kh.block_size;

    // This ratio balances the 2 work phases.
    // The number is determined empirically.
    auto const adjust = 1.75;

    // Calculate the work required
    // kh.load_factor is fixed-point scaled by 65536, hence the division.
    auto const keys = static_cast<std::uint64_t>(
        double(kh.load_factor) / 65536.0 * kh.buckets * kh.capacity);
    std::uint64_t const nwork = static_cast<std::uint64_t>(
        info.dat_file_size + keys * kh.block_size +
        adjust * (info.key_file_size + keys * kh.block_size));
    std::uint64_t work = 0;
    progress(0, nwork);

    // Iterate Data File
    // Data Record
    auto const dh_len =
        field<uint48_t>::size + // Size
        kh.key_size; // Key
    std::uint64_t fetches = 0;
    // One block for bucket reads, plus dh_len scratch for record headers
    buffer buf{kh.block_size + dh_len};
    bucket b{kh.block_size, buf.get()};
    // pd points at the scratch area following the bucket block
    std::uint8_t* pd = buf.get() + kh.block_size;
    {
        bulk_reader<File> r{df, dat_file_header::size,
            info.dat_file_size, readSize};
        while(! r.eof())
        {
            auto const offset = r.offset();
            // Data Record or Spill Record
            // A zero size field distinguishes a spill record.
            auto is = r.prepare(
                field<uint48_t>::size, ec); // Size
            if(ec)
                return;
            nsize_t size;
            read_size48(is, size);
            if(size > 0)
            {
                // Data Record
                is = r.prepare(
                    kh.key_size + // Key
                    size, ec); // Data
                if(ec)
                    return;
                std::uint8_t const* const key =
                    is.data(kh.key_size);
                std::uint8_t const* const data =
                    is.data(size);
                (void)data;
                auto const h = hash<Hasher>(
                    key, kh.key_size, kh.salt);
                // Check bucket and spills
                auto const n = bucket_index(
                    h, kh.buckets, kh.modulus);
                // Bucket 0 lives at block 1; block 0 is the header.
                b.read(kf,
                    static_cast<noff_t>(n + 1) * kh.block_size, ec);
                if(ec)
                    return;
                work += kh.block_size;
                ++fetches;
                for(;;)
                {
                    // Scan entries with matching hash for this offset
                    for(auto i = b.lower_bound(h);
                        i < b.size(); ++i)
                    {
                        auto const item = b[i];
                        if(item.hash != h)
                            break;
                        if(item.offset == offset)
                            goto found;
                        ++fetches;
                    }
                    // Not in this bucket; follow the spill chain
                    auto const spill = b.spill();
                    if(! spill)
                    {
                        // Data record not reachable from its bucket
                        ec = error::orphaned_value;
                        return;
                    }
                    b.read(df, spill, ec);
                    if(ec == error::short_read)
                    {
                        ec = error::short_spill;
                        return;
                    }
                    if(ec)
                        return;
                    ++fetches;
                }
            found:
                // Update
                ++info.value_count;
                info.value_bytes += size;
            }
            else
            {
                // Spill Record
                is = r.prepare(
                    field<std::uint16_t>::size, ec);
                if(ec == error::short_read)
                {
                    ec = error::short_spill;
                    return;
                }
                if(ec)
                    return;
                read<std::uint16_t>(is, size); // Size
                if(size != info.bucket_size)
                {
                    ec = error::invalid_spill_size;
                    return;
                }
                // NOTE(review): this ec check is redundant — ec was already
                // checked above and read<> into memory cannot set it.
                if(ec)
                    return;
                b.read(r, ec); // Bucket
                if(ec == error::short_read)
                {
                    ec = error::short_spill;
                    return;
                }
                if(ec)
                    return;
                ++info.spill_count_tot;
                info.spill_bytes_tot +=
                    field<uint48_t>::size + // Zero
                    field<uint16_t>::size + // Size
                    b.actual_size(); // Bucket
            }
            progress(work + offset, nwork);
        }
        work += info.dat_file_size;
    }

    // Iterate Key File
    {
        for(std::size_t n = 0; n < kh.buckets; ++n)
        {
            std::size_t nspill = 0;
            b.read(kf, static_cast<noff_t>(
                n + 1) * kh.block_size, ec);
            if(ec)
                return;
            work += static_cast<std::uint64_t>(
                adjust * kh.block_size);
            bool spill = false;
            for(;;)
            {
                info.key_count += b.size();
                for(nkey_t i = 0; i < b.size(); ++i)
                {
                    auto const e = b[i];
                    // Re-read the record header at the entry's offset
                    df.read(e.offset, pd, dh_len, ec);
                    if(ec == error::short_read)
                    {
                        // Entry points past the end of the data file
                        ec = error::missing_value;
                        return;
                    }
                    if(ec)
                        return;
                    // Only charge work for the primary bucket pass
                    if(! spill)
                        work += static_cast<std::uint64_t>(
                            adjust * kh.block_size);
                    // Data Record
                    istream is{pd, dh_len};
                    std::uint64_t size;
                    // VFALCO This should really be a 32-bit field
                    read<uint48_t>(is, size); // Size
                    void const* key =
                        is.data(kh.key_size); // Key
                    if(size != e.size)
                    {
                        ec = error::size_mismatch;
                        return;
                    }
                    auto const h = hash<Hasher>(key,
                        kh.key_size, kh.salt);
                    if(h != e.hash)
                    {
                        ec = error::hash_mismatch;
                        return;
                    }
                }
                if(! b.spill())
                    break;
                b.read(df, b.spill(), ec);
                if(ec)
                    return;
                spill = true;
                ++nspill;
                ++info.spill_count;
                info.spill_bytes +=
                    field<uint48_t>::size + // Zero
                    field<uint16_t>::size + // Size
                    b.actual_size(); // SpillBucket
            }
            // Clamp the spill count into the last histogram slot
            if(nspill >= info.hist.size())
                nspill = info.hist.size() - 1;
            ++info.hist[nspill];
            progress(work, nwork);
        }
    }
    // NOTE(review): `sum` is computed but never read afterwards —
    // appears to be dead code carried over upstream.
    float sum = 0;
    for(size_t i = 0; i < info.hist.size(); ++i)
        sum += info.hist[i] * (i + 1);
    if(info.value_count)
        info.avg_fetch =
            float(fetches) / info.value_count;
    else
        info.avg_fetch = 0;
    // Waste: spill bytes not reachable from the key file, as a
    // fraction of the data file size.
    info.waste = (info.spill_bytes_tot - info.spill_bytes) /
        float(info.dat_file_size);
    if(info.value_count)
        info.overhead =
            float(info.key_file_size + info.dat_file_size) /
            (
                info.value_bytes +
                info.key_count *
                (info.key_size +
                // Data Record
                 field<uint48_t>::size) // Size
            ) - 1;
    else
        info.overhead = 0;
    info.actual_load = info.key_count / float(
        info.capacity * info.buckets);
}
|
277
|
+
|
278
|
+
// Fast version of verify that uses a buffer
//
// Algorithm 1: buffers a contiguous chunk of key-file buckets in memory
// and makes one full pass over the data file per chunk, so bucket
// lookups cost no key-file I/O. Trades (passes * data file size) reads
// for eliminating per-record key file seeks.
//
// Parameters:
//   info       - receives statistics; on error left partially filled
//   df, kf     - open data and key files
//   dh         - data file header (unused; kept for symmetry)
//   kh         - key file header, supplies geometry
//   bufferSize - bytes available for buffering key-file buckets;
//                must be at least 2 * block_size + sizeof(nkey_t),
//                otherwise std::logic_error is thrown
//   progress   - callable progress(amount, total)
//   ec         - set on error
template<class Hasher, class File, class Progress>
void
verify_fast(
    verify_info& info,
    File& df,
    File& kf,
    dat_file_header& dh,
    key_file_header& kh,
    std::size_t bufferSize,
    Progress&& progress,
    error_code& ec)
{
    boost::ignore_unused(dh);

    info.algorithm = 1;
    auto const readSize = 1024 * kh.block_size;

    // Counts unverified keys per bucket
    if(kh.buckets > std::numeric_limits<nbuck_t>::max())
    {
        ec = error::too_many_buckets;
        return;
    }
    std::unique_ptr<nkey_t[]> nkeys(
        new nkey_t[kh.buckets]);

    // Verify contiguous sequential sections of the
    // key file using multiple passes over the data.
    //
    // Precondition enforced by the caller's algorithm selection;
    // violating it is a programming error, hence the throw.
    if(bufferSize < 2 * kh.block_size + sizeof(nkey_t))
        throw std::logic_error("invalid buffer size");
    // Buckets buffered per pass: one block is reserved for tmp below.
    auto chunkSize = std::min(kh.buckets,
        (bufferSize - kh.block_size) /
        (kh.block_size + sizeof(nkey_t)));
    auto const passes =
        (kh.buckets + chunkSize - 1) / chunkSize;

    // Calculate the work required
    std::uint64_t work = 0;
    std::uint64_t const nwork =
        passes * info.dat_file_size + info.key_file_size;
    progress(0, nwork);

    std::uint64_t fetches = 0;
    // chunkSize blocks of buffered buckets + 1 block for tmp
    buffer buf{(chunkSize + 1) * kh.block_size};
    // tmp occupies the final block and holds spill buckets
    bucket tmp{kh.block_size,
        buf.get() + chunkSize * kh.block_size};
    for(nsize_t b0 = 0; b0 < kh.buckets; b0 += chunkSize)
    {
        // Load key file chunk to buffer
        auto const b1 = std::min(b0 + chunkSize, kh.buckets);
        // Buffered range is [b0, b1)
        auto const bn = b1 - b0;
        // Bucket 0 lives at block 1; block 0 is the header.
        kf.read(
            static_cast<noff_t>(b0 + 1) * kh.block_size,
            buf.get(),
            static_cast<noff_t>(bn * kh.block_size),
            ec);
        if(ec)
            return;
        work += bn * kh.block_size;
        progress(work, nwork);
        // Count keys in buckets, including spills
        // nkeys[i] is decremented as data records are matched; it must
        // reach exactly zero for every bucket in the chunk.
        for(nbuck_t i = 0 ; i < bn; ++i)
        {
            bucket b{kh.block_size,
                buf.get() + i * kh.block_size};
            nkeys[i] = b.size();
            std::size_t nspill = 0;
            auto spill = b.spill();
            while(spill != 0)
            {
                tmp.read(df, spill, ec);
                if(ec == error::short_read)
                {
                    ec = error::short_spill;
                    return;
                }
                if(ec)
                    return;
                nkeys[i] += tmp.size();
                spill = tmp.spill();
                ++nspill;
                ++info.spill_count;
                info.spill_bytes +=
                    field<uint48_t>::size + // Zero
                    field<uint16_t>::size + // Size
                    tmp.actual_size(); // SpillBucket
            }
            // Clamp into the last histogram slot
            if(nspill >= info.hist.size())
                nspill = info.hist.size() - 1;
            ++info.hist[nspill];
            info.key_count += nkeys[i];
        }
        // Iterate Data File
        bulk_reader<File> r(df, dat_file_header::size,
            info.dat_file_size, readSize);
        while(! r.eof())
        {
            auto const offset = r.offset();
            // Data Record or Spill Record
            // A zero size field distinguishes a spill record.
            auto is = r.prepare(
                field<uint48_t>::size, ec); // Size
            if(ec == error::short_read)
            {
                ec = error::short_data_record;
                return;
            }
            if(ec)
                return;
            nsize_t size;
            detail::read_size48(is, size);
            if(size > 0)
            {
                // Data Record
                is = r.prepare(
                    kh.key_size + // Key
                    size, ec); // Data
                if(ec == error::short_read)
                {
                    ec = error::short_value;
                    return;
                }
                if(ec)
                    return;
                std::uint8_t const* const key =
                    is.data(kh.key_size);
                std::uint8_t const* const data =
                    is.data(size);
                (void)data;
                auto const h = hash<Hasher>(
                    key, kh.key_size, kh.salt);
                auto const n = bucket_index(
                    h, kh.buckets, kh.modulus);
                // Skip records whose bucket is outside this chunk;
                // a later (or earlier) pass handles them.
                if(n < b0 || n >= b1)
                    continue;
                // Check bucket and spills
                bucket b{kh.block_size, buf.get() +
                    (n - b0) * kh.block_size};
                ++fetches;
                for(;;)
                {
                    for(auto i = b.lower_bound(h);
                        i < b.size(); ++i)
                    {
                        auto const item = b[i];
                        if(item.hash != h)
                            break;
                        if(item.offset == offset)
                            goto found;
                        ++fetches;
                    }
                    auto const spill = b.spill();
                    if(! spill)
                    {
                        ec = error::orphaned_value;
                        return;
                    }
                    // Re-aim b at tmp's storage before loading the
                    // spill bucket, so the buffered chunk is preserved.
                    b = tmp;
                    b.read(df, spill, ec);
                    if(ec == error::short_read)
                    {
                        ec = error::short_spill;
                        return;
                    }
                    if(ec)
                        return;
                    ++fetches;
                }
            found:
                // Update
                ++info.value_count;
                info.value_bytes += size;
                // More data records matched this bucket than it has keys
                if(nkeys[n - b0]-- == 0)
                {
                    ec = error::orphaned_value;
                    return;
                }
            }
            else
            {
                // Spill Record
                is = r.prepare(
                    field<std::uint16_t>::size, ec);
                if(ec == error::short_read)
                {
                    ec = error::short_spill;
                    return;
                }
                if(ec)
                    return;
                read<std::uint16_t>(is, size); // Size
                // A valid spill size must round-trip through
                // capacity -> size exactly.
                if(bucket_size(
                    bucket_capacity(size)) != size)
                {
                    ec = error::invalid_spill_size;
                    return;
                }
                r.prepare(size, ec); // Bucket
                if(ec == error::short_read)
                {
                    ec = error::short_spill;
                    return;
                }
                if(ec)
                    return;
                // Count file-wide spill totals only on the first pass
                // so repeated data-file scans don't double count.
                if(b0 == 0)
                {
                    ++info.spill_count_tot;
                    info.spill_bytes_tot +=
                        field<uint48_t>::size + // Zero
                        field<uint16_t>::size + // Size
                        tmp.actual_size(); // Bucket
                }
            }
            progress(work + offset, nwork);
        }
        // Make sure every key in every bucket was visited
        for(std::size_t i = 0; i < bn; ++i)
        {
            if(nkeys[i] != 0)
            {
                ec = error::missing_value;
                return;
            }
        }
        work += info.dat_file_size;
    }

    // NOTE(review): `sum` is computed but never read afterwards —
    // appears to be dead code carried over upstream.
    float sum = 0;
    for(std::size_t i = 0; i < info.hist.size(); ++i)
        sum += info.hist[i] * (i + 1);
    if(info.value_count)
        info.avg_fetch =
            float(fetches) / info.value_count;
    else
        info.avg_fetch = 0;
    // Waste: spill bytes not reachable from the key file, as a
    // fraction of the data file size.
    info.waste = (info.spill_bytes_tot - info.spill_bytes) /
        float(info.dat_file_size);
    if(info.value_count)
        info.overhead =
            float(info.key_file_size + info.dat_file_size) /
            (
                info.value_bytes +
                info.key_count *
                (info.key_size +
                // Data Record
                field<uint48_t>::size) // Size
            ) - 1;
    else
        info.overhead = 0;
    info.actual_load = info.key_count / float(
        info.capacity * info.buckets);
}
|
534
|
+
|
535
|
+
} // detail
|
536
|
+
|
537
|
+
template<class Hasher, class Progress>
|
538
|
+
void
|
539
|
+
verify(
|
540
|
+
verify_info& info,
|
541
|
+
path_type const& dat_path,
|
542
|
+
path_type const& key_path,
|
543
|
+
std::size_t bufferSize,
|
544
|
+
Progress&& progress,
|
545
|
+
error_code& ec)
|
546
|
+
{
|
547
|
+
static_assert(is_Hasher<Hasher>::value,
|
548
|
+
"Hasher requirements not met");
|
549
|
+
static_assert(is_Progress<Progress>::value,
|
550
|
+
"Progress requirements not met");
|
551
|
+
info = {};
|
552
|
+
using namespace detail;
|
553
|
+
using File = native_file;
|
554
|
+
File df;
|
555
|
+
df.open(file_mode::scan, dat_path, ec);
|
556
|
+
if(ec)
|
557
|
+
return;
|
558
|
+
File kf;
|
559
|
+
kf.open (file_mode::read, key_path, ec);
|
560
|
+
if(ec)
|
561
|
+
return;
|
562
|
+
dat_file_header dh;
|
563
|
+
read(df, dh, ec);
|
564
|
+
if(ec)
|
565
|
+
return;
|
566
|
+
verify(dh, ec);
|
567
|
+
if(ec)
|
568
|
+
return;
|
569
|
+
key_file_header kh;
|
570
|
+
read(kf, kh, ec);
|
571
|
+
if(ec)
|
572
|
+
return;
|
573
|
+
verify<Hasher>(kh, ec);
|
574
|
+
if(ec)
|
575
|
+
return;
|
576
|
+
verify<Hasher>(dh, kh, ec);
|
577
|
+
if(ec)
|
578
|
+
return;
|
579
|
+
info.dat_path = dat_path;
|
580
|
+
info.key_path = key_path;
|
581
|
+
info.version = dh.version;
|
582
|
+
info.uid = dh.uid;
|
583
|
+
info.appnum = dh.appnum;
|
584
|
+
info.key_size = dh.key_size;
|
585
|
+
info.salt = kh.salt;
|
586
|
+
info.pepper = kh.pepper;
|
587
|
+
info.block_size = kh.block_size;
|
588
|
+
info.load_factor = kh.load_factor / 65536.f;
|
589
|
+
info.capacity = kh.capacity;
|
590
|
+
info.buckets = kh.buckets;
|
591
|
+
info.bucket_size = bucket_size(kh.capacity);
|
592
|
+
info.key_file_size = kf.size(ec);
|
593
|
+
if(ec)
|
594
|
+
return;
|
595
|
+
info.dat_file_size = df.size(ec);
|
596
|
+
if(ec)
|
597
|
+
return;
|
598
|
+
|
599
|
+
// Determine which algorithm requires the least amount
|
600
|
+
// of file I/O given the available buffer size
|
601
|
+
std::size_t chunkSize;
|
602
|
+
if(bufferSize >= 2 * kh.block_size + sizeof(nkey_t))
|
603
|
+
chunkSize = std::min(kh.buckets,
|
604
|
+
(bufferSize - kh.block_size) /
|
605
|
+
(kh.block_size + sizeof(nkey_t)));
|
606
|
+
else
|
607
|
+
chunkSize = 0;
|
608
|
+
std::size_t passes;
|
609
|
+
if(chunkSize > 0)
|
610
|
+
passes = (kh.buckets + chunkSize - 1) / chunkSize;
|
611
|
+
else
|
612
|
+
passes = 0;
|
613
|
+
if(! chunkSize ||
|
614
|
+
((
|
615
|
+
info.dat_file_size +
|
616
|
+
(kh.buckets * kh.load_factor * kh.capacity * kh.block_size) +
|
617
|
+
info.key_file_size
|
618
|
+
) < (
|
619
|
+
passes * info.dat_file_size + info.key_file_size
|
620
|
+
)))
|
621
|
+
{
|
622
|
+
detail::verify_normal<Hasher>(info,
|
623
|
+
df, kf, dh, kh, progress, ec);
|
624
|
+
}
|
625
|
+
else
|
626
|
+
{
|
627
|
+
detail::verify_fast<Hasher>(info,
|
628
|
+
df, kf, dh, kh, bufferSize, progress, ec);
|
629
|
+
}
|
630
|
+
}
|
631
|
+
|
632
|
+
} // nudb
|
633
|
+
|
634
|
+
#endif
|