npm - @nxtedition/rocksdb - Versions diffs - 15.4.0 → 15.5.0 - Mend

@nxtedition/rocksdb 15.4.0 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (402) hide show

package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc ADDED Viewed

@@ -0,0 +1,1919 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "rocksdb/io_dispatcher.h"
+#include <memory>
+#include <mutex>
+#include <thread>
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "test_util/sync_point.h"
+// Enable io_uring support for this test.
+// NOTE(review): presumably this definition overrides a hook that the RocksDB
+// environment consults before using io_uring -- confirm against the env
+// implementation.
+extern "C" bool RocksDbIOUringEnable() { return true; }
+// Check if io_uring is available at compile time: kIOUringPresent is true
+// exactly when the build defines ROCKSDB_IOURING_PRESENT. Tests in this file
+// use it to decide whether to request async IO.
+#ifdef ROCKSDB_IOURING_PRESENT
+static constexpr bool kIOUringPresent = true;
+#else
+static constexpr bool kIOUringPresent = false;
+#endif
+namespace ROCKSDB_NAMESPACE {
+// A single read operation recorded by the tracking file system.
+struct ReadOp {
+  // Which read entry point produced this record.
+  enum Type { kMultiRead, kReadAsync };
+  // Defaulted so that a freshly constructed ReadOp never carries an
+  // indeterminate enum value; recorders overwrite it with the real type.
+  Type type = kMultiRead;
+  // For MultiRead: contains all (offset, len) pairs in the request
+  // For ReadAsync: contains a single (offset, len) pair
+  std::vector<std::pair<uint64_t, size_t>> requests;
+};
+// ReadTrackingFS is defined further down in this file; the wrapper below only
+// needs to hold a pointer to it.
+class ReadTrackingFS;
+// FSRandomAccessFile wrapper that owns the wrapped file and overrides
+// MultiRead and ReadAsync so that read requests can be tracked.
+class ReadTrackingRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  // Takes ownership of `wrapped`. `tracker` is stored as a raw, non-owning
+  // pointer.
+  ReadTrackingRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& wrapped,
+                               ReadTrackingFS* tracker)
+      : FSRandomAccessFileOwnerWrapper(std::move(wrapped)), fs_{tracker} {}
+
+  // Batched read of `num_reqs` requests starting at `reqs`.
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  // Asynchronous read of a single request.
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  // File system that receives the read records; not owned.
+  ReadTrackingFS* fs_;
+};
+// FileSystemWrapper that hands out ReadTrackingRandomAccessFile objects and
+// keeps a mutex-protected log of the read operations reported to it.
+class ReadTrackingFS : public FileSystemWrapper {
+ public:
+  explicit ReadTrackingFS(const std::shared_ptr<FileSystem>& target)
+      : FileSystemWrapper(target) {}
+
+  static const char* kClassName() { return "ReadTrackingFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  // Opens `fname` on the wrapped file system. On success `*result` receives a
+  // tracking wrapper around the opened file; on failure `*result` is left
+  // untouched and the error is returned.
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> inner;
+    IOStatus open_status =
+        target()->NewRandomAccessFile(fname, opts, &inner, dbg);
+    if (!open_status.ok()) {
+      return open_status;
+    }
+    *result =
+        std::make_unique<ReadTrackingRandomAccessFile>(std::move(inner), this);
+    return open_status;
+  }
+
+  // Appends a kMultiRead record holding a copy of `reqs`.
+  void RecordMultiRead(const std::vector<std::pair<uint64_t, size_t>>& reqs) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    read_ops_.emplace_back();
+    ReadOp& entry = read_ops_.back();
+    entry.type = ReadOp::kMultiRead;
+    entry.requests = reqs;
+  }
+
+  // Appends a kReadAsync record holding the single (offset, len) pair.
+  void RecordReadAsync(uint64_t offset, size_t len) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    read_ops_.emplace_back();
+    ReadOp& entry = read_ops_.back();
+    entry.type = ReadOp::kReadAsync;
+    entry.requests.push_back({offset, len});
+  }
+
+  // Returns a copy of the log taken under the lock.
+  std::vector<ReadOp> GetReadOps() const {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return read_ops_;
+  }
+
+  // Empties the log.
+  void ClearReadOps() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    read_ops_.clear();
+  }
+
+  // Number of recorded MultiRead operations.
+  size_t GetMultiReadCount() const { return CountOps(ReadOp::kMultiRead); }
+
+  // Number of recorded ReadAsync operations.
+  size_t GetReadAsyncCount() const { return CountOps(ReadOp::kReadAsync); }
+
+ private:
+  // Counts, under the lock, the recorded operations whose type is `wanted`.
+  size_t CountOps(ReadOp::Type wanted) const {
+    std::lock_guard<std::mutex> guard(mutex_);
+    size_t matches = 0;
+    for (const ReadOp& entry : read_ops_) {
+      if (entry.type == wanted) {
+        ++matches;
+      }
+    }
+    return matches;
+  }
+
+  // Guards read_ops_; mutable so the const accessors can lock it.
+  mutable std::mutex mutex_;
+  std::vector<ReadOp> read_ops_;
+};
+// Logs the (offset, len) of every request in the batch with the tracking file
+// system, then forwards the unchanged batch to the wrapped file and returns
+// its status.
+IOStatus ReadTrackingRandomAccessFile::MultiRead(FSReadRequest* reqs,
+                                                 size_t num_reqs,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  // Snapshot the batch first: recording happens before the read executes.
+  std::vector<std::pair<uint64_t, size_t>> batch;
+  batch.reserve(num_reqs);
+  for (const FSReadRequest* req = reqs; req != reqs + num_reqs; ++req) {
+    batch.push_back({req->offset, req->len});
+  }
+  fs_->RecordMultiRead(batch);
+  return target()->MultiRead(reqs, num_reqs, options, dbg);
+}
+// Logs the request's (offset, len) with the tracking file system, then
+// forwards the request to the wrapped file and returns its status.
+IOStatus ReadTrackingRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  // Record the read operation before executing it
+  fs_->RecordReadAsync(req.offset, req.len);
+  // Delegate to underlying file. `cb` is our own by-value copy and is not
+  // used again here, so hand it over instead of copying the std::function a
+  // second time.
+  return target()->ReadAsync(req, opts, std::move(cb), cb_arg, io_handle,
+                             del_fn, dbg);
+}
+// Test fixture for the IODispatcher tests. It writes block-based SST files
+// through the default file system and reopens them through a ReadTrackingFS
+// so that tests can inspect the reads issued against the file.
+class IODispatcherTest : public DBTestBase {
+ public:
+  IODispatcherTest()
+      : DBTestBase("io_dispatcher_test", /*env_do_fsync=*/false) {}
+  ~IODispatcherTest() override {
+    // Close any open tables here, in the destructor body, i.e. before any
+    // data member (including all_ioptions_ and all_env_options_, which the
+    // tables are documented below to reference) is destroyed.
+    for (auto& table : tables_) {
+      table.reset();
+    }
+    tables_.clear();
+  }
+  // Walks the table's index from its first entry and appends the handle of
+  // every entry whose offset has not been seen yet to `*block_handles_out`,
+  // stopping once `num_keys` handles have been collected. The output vector
+  // is cleared first.
+  //
+  // Returns the index iterator's status, so an error hit while walking the
+  // index is reported instead of being masked as OK.
+  Status CollectBlockHandles(BlockBasedTable* table, size_t num_keys,
+                             std::vector<BlockHandle>* block_handles_out) {
+    block_handles_out->clear();
+    ReadOptions read_options;
+    std::unordered_set<uint64_t> seen_offsets;
+    IndexBlockIter iiter_on_stack;
+    BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum};
+    auto iiter = table->NewIndexIterator(read_options, false, &iiter_on_stack,
+                                         nullptr, &context);
+    // When the table returns an iterator other than the on-stack one, take
+    // ownership of it so it is deleted when this function returns.
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+    // Position the iterator at the first entry
+    iiter->SeekToFirst();
+    while (iiter->Valid()) {
+      auto handle = iiter->value().handle;
+      if (seen_offsets.find(handle.offset()) == seen_offsets.end()) {
+        block_handles_out->push_back(handle);
+        seen_offsets.insert(handle.offset());
+        if (block_handles_out->size() >= num_keys) {
+          break;
+        }
+      }
+      iiter->Next();
+    }
+    return iiter->status();
+  }
+  // Directory that holds the SST files written by this fixture.
+  std::string test_dir_{};
+  // NOTE(review): DBTestBase appears to declare its own env_; confirm that
+  // shadowing it here is intended.
+  Env* env_{};
+  // File system used for writing and for directory management.
+  std::shared_ptr<FileSystem> base_fs_;
+  // Wrapper around base_fs_ used for opening files for reading.
+  std::shared_ptr<ReadTrackingFS> tracking_fs_;
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    // NOTE(review): the directory name is borrowed from
+    // block_based_table_reader_test; confirm the two tests cannot run
+    // concurrently against the same path.
+    test_dir_ = test::PerThreadDBPath("block_based_table_reader_test");
+    env_ = Env::Default();
+    base_fs_ = FileSystem::Default();
+    tracking_fs_ = std::make_shared<ReadTrackingFS>(base_fs_);
+    ASSERT_OK(base_fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+  // Creates `filename` under the test directory and wraps it in a writer.
+  // On failure the ASSERT only leaves this helper: `*writer` stays unset and
+  // the caller must check it.
+  void NewFileWriter(const std::string& filename,
+                     std::unique_ptr<WritableFileWriter>* writer) {
+    std::string path = Path(filename);
+    EnvOptions env_options;
+    FileOptions foptions;
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(base_fs_->NewWritableFile(path, foptions, &file, nullptr));
+    writer->reset(new WritableFileWriter(std::move(file), path, env_options));
+  }
+  // Opens `filename` under the test directory through tracking_fs_ and wraps
+  // it in a reader. On failure the ASSERT only leaves this helper: `*reader`
+  // stays unset and the caller must check it.
+  void NewFileReader(const std::string& filename, const FileOptions& opt,
+                     std::unique_ptr<RandomAccessFileReader>* reader,
+                     Statistics* stats = nullptr) {
+    std::string path = Path(filename);
+    std::unique_ptr<FSRandomAccessFile> f;
+    // Use tracking_fs_ to record read operations
+    ASSERT_OK(tracking_fs_->NewRandomAccessFile(path, opt, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), path,
+                                             env_->GetSystemClock().get(),
+                                             /*io_tracer=*/nullptr,
+                                             /*stats=*/stats));
+  }
+  std::vector<std::shared_ptr<Statistics>> all_stats_;
+  // Tables whose ownership a test hands to the fixture.
+  std::vector<std::unique_ptr<BlockBasedTable>> tables_;
+  // Options must be stored as member variables to avoid use-after-scope
+  // The BlockBasedTable keeps references to these options
+  std::vector<std::unique_ptr<ImmutableOptions>> all_ioptions_;
+  std::vector<std::unique_ptr<EnvOptions>> all_env_options_;
+  // Writes an SST file with 10000 keys, reopens it as a BlockBasedTable and
+  // collects up to `num_blocks` distinct data block handles from its index.
+  //
+  // table:             receives the opened table.
+  // block_handles_out: receives the collected handles.
+  // Returns the first error hit while building, flushing, opening or walking
+  // the index; OK otherwise.
+  //
+  // NOTE(review): every call writes to the same path ("test_table"), so a
+  // second call rewrites the file while tables from earlier calls may still
+  // have it open. cur_file_num_ below looks like it was meant to make the
+  // name unique but is never used -- confirm before relying on several
+  // tables at once.
+  Status CreateAndOpenSST(int num_blocks,
+                          std::unique_ptr<BlockBasedTable>* table,
+                          std::vector<BlockHandle>* block_handles_out) {
+    Options options{};
+    options.statistics = nullptr;
+    BlockBasedTableOptions table_options;
+    // Each table gets its own block cache.
+    table_options.block_cache = NewLRUCache(8 * 1024 * 1024);
+    table_options.block_size = 16 * 1024;
+    table_options.no_block_cache = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    // Heap-allocated so ownership can move into the fixture further down.
+    auto ioptions = std::make_unique<ImmutableOptions>(options);
+    auto moptions = MutableCFOptions{options};
+    InternalKeyComparator internal_comparator(options.comparator);
+    // Create the file in the test directory via base_fs_.
+    auto table_name = "test_table";
+    std::unique_ptr<WritableFileWriter> file_writer;
+    NewFileWriter(table_name, &file_writer);
+    if (file_writer == nullptr) {
+      // The ASSERT inside NewFileWriter only returned from that helper; stop
+      // here instead of dereferencing a null writer.
+      return Status::IOError("Failed to create writable file",
+                             Path(table_name));
+    }
+    // Create table builder
+    std::string column_family_name;
+    const ReadOptions read_options;
+    const WriteOptions write_options;
+    std::vector<std::unique_ptr<InternalTblPropCollFactory>>
+        int_tbl_prop_coll_factories;
+    TableBuilderOptions builder_options(
+        *ioptions, moptions, read_options, write_options, internal_comparator,
+        &int_tbl_prop_coll_factories, kNoCompression, options.compression_opts,
+        0 /* column_family_id */, column_family_name, -1 /* level */,
+        kUnknownNewestKeyTime);
+    std::unique_ptr<TableBuilder> builder(
+        options.table_factory->NewTableBuilder(builder_options,
+                                               file_writer.get()));
+    Status s;
+    auto rnd = Random::GetTLSInstance();
+    // Add keys to the table
+    // 10k values of 2 KiB each (2 << 10 == 2048) = ~20MiB
+    for (int i = 0; i < 10000; i++) {
+      std::string value = rnd->RandomString(2 << 10);
+      InternalKey ikey(Key(i), i, kTypeValue);
+      builder->Add(ikey.Encode(), value);
+    }
+    s = builder->Finish();
+    if (!s.ok()) {
+      return s;
+    }
+    uint64_t file_size = builder->FileSize();
+    IOOptions io_options;
+    s = file_writer->Flush(io_options);
+    if (!s.ok()) {
+      return s;
+    }
+    // Now open the file for reading through the tracking file system
+    std::unique_ptr<RandomAccessFileReader> file;
+    FileOptions foptions;
+    foptions.use_direct_reads = false;
+    NewFileReader(table_name, foptions, &file, nullptr);
+    if (file == nullptr) {
+      // Same reasoning as for the writer above.
+      return Status::IOError("Failed to open file for reading",
+                             Path(table_name));
+    }
+    // Heap-allocated so ownership can move into the fixture further down.
+    auto soptions = std::make_unique<EnvOptions>();
+    BlockCacheTracer block_cache_tracer;
+    std::unique_ptr<TableReader> table_reader;
+    auto ikc = InternalKeyComparator(options.comparator);
+    TableReaderOptions reader_options(*ioptions, moptions.prefix_extractor,
+                                      moptions.compression_manager.get(),
+                                      *soptions, ikc,
+                                      0 /* block_protection_bytes_per_key */);
+    s = options.table_factory->NewTableReader(reader_options, std::move(file),
+                                              file_size, &table_reader);
+    if (!s.ok()) {
+      return s;
+    }
+    // Park the options in the fixture before the table is handed out, so the
+    // table never outlives them -- not even when a later step fails. Moving
+    // the unique_ptrs does not move the objects they point to.
+    all_ioptions_.push_back(std::move(ioptions));
+    all_env_options_.push_back(std::move(soptions));
+    table->reset(static_cast<BlockBasedTable*>(table_reader.release()));
+    // Collect actual block handles from the table's index
+    return CollectBlockHandles(table->get(), num_blocks, block_handles_out);
+  }
+  // NOTE(review): not referenced anywhere in the visible part of this file.
+  static uint64_t cur_file_num_;
+};
+uint64_t IODispatcherTest::cur_file_num_ = 1;
+// Submits one job covering up to 50 blocks of a fresh table, reads every block
+// back through the ReadSet and checks that the ReadSet's counters account for
+// every requested block.
+TEST_F(IODispatcherTest, BasicSSTRead) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+  // Read blocks using the new ReadSet API and verify they are valid
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+    // Verify the block has reasonable content
+    const Block* block_ptr = block.GetValue();
+    ASSERT_GT(block_ptr->size(), 0);
+  }
+  // Verify statistics: the sync, async and cache-hit counters together must
+  // add up to the number of requested blocks. How the blocks split across the
+  // three counters is not checked here.
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  ASSERT_EQ(total_reads, block_handles.size());
+}
+// One dispatcher serves jobs against three tables built one after another;
+// every block of every job must be readable through its ReadSet.
+TEST_F(IODispatcherTest, MultipleSSTFiles) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> all_block_handles;
+  // Phase 1: build a table, submit a job for its blocks, keep the ReadSet.
+  for (int file_idx = 0; file_idx < 3; ++file_idx) {
+    // Requests 30, 40 and 50 block handles respectively.
+    const int wanted_blocks = 30 + file_idx * 10;
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> handles;
+    Status create_status = CreateAndOpenSST(wanted_blocks, &table, &handles);
+    ASSERT_OK(create_status);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = handles;
+    job->table = table.get();
+    // The fixture owns the table from here on; the raw pointer stored in the
+    // job stays valid because the fixture keeps the table in tables_.
+    tables_.push_back(std::move(table));
+    all_block_handles.push_back(handles);
+
+    std::shared_ptr<ReadSet> read_set;
+    Status submit_status = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(submit_status);
+    read_sets.push_back(read_set);
+  }
+  // Phase 2: every ReadSet must deliver each of its blocks.
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t set_idx = 0; set_idx < read_sets.size(); ++set_idx) {
+    const std::shared_ptr<ReadSet>& current_set = read_sets[set_idx];
+    const size_t blocks_in_set = all_block_handles[set_idx].size();
+    for (size_t block_idx = 0; block_idx < blocks_in_set; ++block_idx) {
+      CachableEntry<Block> block;
+      Status read_status = current_set->ReadIndex(block_idx, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+}
+// Submits the same job twice and checks, for each resulting ReadSet, that the
+// sync, async and cache-hit counters together account for every block.
+TEST_F(IODispatcherTest, StatisticsTracking) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+  // Read all blocks - ReadIndex handles polling for async IO completion
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+  // Read the same blocks again through a second ReadSet. Presumably these
+  // are served from the block cache now; the split between the counters is
+  // deliberately not asserted, only their sum.
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job, &read_set2);
+  ASSERT_OK(s);
+  // Guard the dereferences below, as is done for the first ReadSet.
+  ASSERT_NE(read_set2, nullptr);
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+  // After reading all blocks, verify statistics of the first ReadSet
+  uint64_t num_sync = read_set->GetNumSyncReads();
+  uint64_t num_async = read_set->GetNumAsyncReads();
+  uint64_t num_cache = read_set->GetNumCacheHits();
+  // Total reads should equal number of blocks
+  uint64_t total_reads = num_sync + num_async + num_cache;
+  ASSERT_EQ(total_reads, block_handles.size());
+  // The second ReadSet must account for every block as well.
+  uint64_t total_reads2 = read_set2->GetNumSyncReads() +
+                          read_set2->GetNumAsyncReads() +
+                          read_set2->GetNumCacheHits();
+  ASSERT_EQ(total_reads2, block_handles.size());
+}
+TEST_F(IODispatcherTest, AsyncAndSyncRead) {
+  // This test verifies the difference between async_io=true and async_io=false
+  // by checking the statistics after reading all blocks.
+  // Only test async_io=true when io_uring is available.
+  std::vector<bool> async_modes = {false};
+  if (kIOUringPresent) {
+    async_modes.push_back(true);
+  }
+  for (auto async : async_modes) {
+    std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+    // Every iteration builds a new table, and CreateAndOpenSST gives each
+    // table its own freshly created block cache, so no block can be cached
+    // from a previous iteration.
+    // NOTE(review): an earlier version set fill_cache=false on a local
+    // ReadOptions that was never attached to the job, so it had no effect;
+    // that dead code is removed. If disabling cache fill is really wanted,
+    // set job->job_options.read_options.fill_cache instead.
+    Status s = CreateAndOpenSST(40, &table, &block_handles);
+    ASSERT_OK(s);
+    ASSERT_NE(table, nullptr);
+    ASSERT_GT(block_handles.size(), 0);
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = async;
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+    // Read all blocks - ReadIndex handles polling for async IO internally
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+      // Verify the block has reasonable content
+      const Block* block_ptr = block.GetValue();
+      ASSERT_GT(block_ptr->size(), 0);
+    }
+    // Verify statistics
+    uint64_t num_sync = read_set->GetNumSyncReads();
+    uint64_t num_async = read_set->GetNumAsyncReads();
+    uint64_t num_cache = read_set->GetNumCacheHits();
+    // Total reads should equal number of blocks
+    uint64_t total_reads = num_sync + num_async + num_cache;
+    EXPECT_EQ(total_reads, block_handles.size());
+    // When async_io is false, we always expect sync reads
+    if (!async) {
+      EXPECT_GT(num_sync, 0) << "Expected sync reads when async_io=false";
+      EXPECT_EQ(num_async, 0) << "Expected no async reads when async_io=false";
+    }
+    // When async_io is true:
+    // - If io_uring is available, we expect async reads
+    // - If io_uring is NOT available, ReadAsync returns NotSupported and
+    //   we fall back to sync reads. This is valid behavior.
+    // So we only verify that ALL blocks were read (checked above).
+  }
+}
+TEST_F(IODispatcherTest, VerifyBlockContent) {
+  // Test that blocks retrieved through ReadSet contain the correct data
+  // that was written to the SST file
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+  // Lower bound for the size of every stored value: 1 KiB. This must be a
+  // shift: in C++ `2 ^ 10` is a bitwise XOR and evaluates to 8, which would
+  // let almost any value pass.
+  constexpr size_t kMinValueSize = size_t{1} << 10;
+  // We use InternalKeyComparator for data blocks; only its user comparator is
+  // handed to the block iterator.
+  const InternalKeyComparator internal_comparator(BytewiseComparator());
+  // Read each block and verify its content
+  // `t` is the index of the next expected key, counted across all blocks.
+  int t = 0;
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block_entry;
+    Status read_status = read_set->ReadIndex(i, &block_entry);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block_entry.GetValue(), nullptr);
+    Block* block = block_entry.GetValue();
+    ASSERT_GT(block->size(), 0);
+    // Create an iterator to walk through the block's keys
+    std::unique_ptr<DataBlockIter> iter(block->NewDataIterator(
+        internal_comparator.user_comparator(), kDisableGlobalSequenceNumber));
+    // Iterate through all keys in this block
+    size_t num_keys_in_block = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      num_keys_in_block++;
+      // Verify key is not empty
+      ASSERT_GT(iter->key().size(), 0)
+          << "Block " << i << " contains empty key";
+      // Verify the value is larger than 1 KiB
+      ASSERT_GT(iter->value().size(), kMinValueSize)
+          << "Block " << i << " contains empty value";
+      // Parse the internal key
+      ParsedInternalKey parsed_key;
+      Status parse_status =
+          ParseInternalKey(iter->key(), &parsed_key, true /* log_err */);
+      ASSERT_OK(parse_status) << "Failed to parse internal key in block " << i;
+      // Verify the key is the next one in the Key(0), Key(1), ... sequence.
+      std::string user_key = parsed_key.user_key.ToString();
+      auto check = Key(t);
+      t++;
+      ASSERT_TRUE(user_key.find("key") == 0)
+          << "Unexpected key format in block " << i << ": " << user_key;
+      ASSERT_EQ(check, user_key);
+      // Verify value type is correct (should be kTypeValue)
+      ASSERT_EQ(parsed_key.type, kTypeValue)
+          << "Unexpected value type in block " << i;
+    }
+    // Verify iterator status after iteration
+    ASSERT_OK(iter->status()) << "Iterator error in block " << i;
+    // Each block should contain at least one key
+    ASSERT_GT(num_keys_in_block, 0) << "Block " << i << " contains no keys";
+  }
+}
+// We want to test here that even when we DON'T read from the ReadSet, all
+// pinned blocks are unpinned once the ReadSet is destroyed. Pinning is
+// observed through the block cache's pinned usage.
+TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_EQ(block_handles.size(), 30);
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io =
+      false;  // Use sync IO so blocks are pinned immediately
+  auto* rep = table->get_rep();
+  auto cache = rep->table_options.block_cache.get();
+  ASSERT_NE(cache, nullptr);
+  // Baseline: nothing may be pinned before the job is submitted.
+  auto initial_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(initial_pinned_usage, 0);
+  {
+    std::shared_ptr<ReadSet> read_set;
+    Status t = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(t);
+    ASSERT_NE(read_set, nullptr);
+    // With sync IO, blocks are already pinned in read_set->pinned_blocks_
+    // We do NOT call read_set->Read() - blocks should remain in pinned_blocks_
+    // At this point, blocks should be pinned in the ReadSet
+    auto pinned_usage_with_blocks = cache->GetPinnedUsage();
+    ASSERT_GT(pinned_usage_with_blocks, initial_pinned_usage)
+        << "Expected pinned usage to increase after SubmitJob, but "
+        << "initial=" << initial_pinned_usage
+        << " current=" << pinned_usage_with_blocks;
+    // ReadSet goes out of scope here, its destructor should unpin all blocks
+  }
+  // ReadSet destroyed - all blocks should be unpinned
+  auto final_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(final_pinned_usage, initial_pinned_usage)
+      << "Expected pinned usage to return to initial value after ReadSet "
+      << "destruction, but initial=" << initial_pinned_usage
+      << " final=" << final_pinned_usage;
+}
+// Checks the coalescing of read requests as seen by the tracking file system:
+//  1. ten consecutive blocks with a 1MB coalesce threshold must arrive as one
+//     MultiRead call carrying one request;
+//  2. five every-other blocks with a zero threshold must arrive as five
+//     separate requests.
+TEST_F(IODispatcherTest, VerifyCoalescing) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Get many blocks so we can test coalescing behavior
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 20);
+  // Drop whatever was recorded while the table was being opened.
+  tracking_fs_->ClearReadOps();
+
+  // Part 1: sync reads (MultiRead) of adjacent blocks, large threshold.
+  {
+    // The first ten handles, in index order.
+    std::vector<BlockHandle> adjacent_blocks;
+    for (const BlockHandle& handle : block_handles) {
+      if (adjacent_blocks.size() >= 10) {
+        break;
+      }
+      adjacent_blocks.push_back(handle);
+    }
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = adjacent_blocks;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+    // 1MB threshold so all adjacent blocks are combined.
+    job->job_options.io_coalesce_threshold = 1024 * 1024;
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    for (size_t idx = 0; idx < adjacent_blocks.size(); ++idx) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(idx, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+    // Tally the MultiRead calls and the requests they carried.
+    size_t multiread_count = 0;
+    size_t total_requests_in_multireads = 0;
+    for (const ReadOp& op : tracking_fs_->GetReadOps()) {
+      if (op.type != ReadOp::kMultiRead) {
+        continue;
+      }
+      ++multiread_count;
+      total_requests_in_multireads += op.requests.size();
+    }
+    EXPECT_EQ(multiread_count, 1)
+        << "Expected 1 MultiRead call with coalesced blocks";
+    EXPECT_EQ(total_requests_in_multireads, 1)
+        << "Expected all adjacent blocks to be coalesced into 1 request";
+  }
+  tracking_fs_->ClearReadOps();
+
+  // Part 2: non-adjacent blocks, zero threshold -> no coalescing expected.
+  {
+    // Create new table to avoid cache hits
+    std::unique_ptr<BlockBasedTable> table2;
+    std::vector<BlockHandle> block_handles2;
+    s = CreateAndOpenSST(50, &table2, &block_handles2);
+    ASSERT_OK(s);
+    ASSERT_GE(block_handles2.size(), 20);
+    tracking_fs_->ClearReadOps();
+    // Take every other handle (indices 0, 2, 4, ...) until five are
+    // collected, so that an unrequested block sits between any two requested
+    // ones.
+    std::vector<BlockHandle> non_adjacent_blocks;
+    for (size_t idx = 0; idx < block_handles2.size(); idx += 2) {
+      if (non_adjacent_blocks.size() >= 5) {
+        break;
+      }
+      non_adjacent_blocks.push_back(block_handles2[idx]);
+    }
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = non_adjacent_blocks;
+    job->table = table2.get();
+    job->job_options.read_options.async_io = false;
+    // Zero threshold: blocks with gaps should not be coalesced.
+    job->job_options.io_coalesce_threshold = 0;
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    for (size_t idx = 0; idx < non_adjacent_blocks.size(); ++idx) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(idx, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+    // Only the number of requests matters here, not the number of calls.
+    size_t total_requests_in_multireads = 0;
+    for (const ReadOp& op : tracking_fs_->GetReadOps()) {
+      if (op.type == ReadOp::kMultiRead) {
+        total_requests_in_multireads += op.requests.size();
+      }
+    }
+    EXPECT_EQ(total_requests_in_multireads, non_adjacent_blocks.size())
+        << "Expected each non-adjacent block to be a separate request with "
+           "zero coalesce threshold";
+  }
+}
+// Checks that the offset of every requested block handle shows up as the
+// offset of some recorded MultiRead request. Only offsets are compared;
+// request lengths are recorded by the tracking file system but not verified
+// here.
+TEST_F(IODispatcherTest, VerifyReadRequestDetails) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 5);
+  // Drop whatever was recorded while the table was being opened.
+  tracking_fs_->ClearReadOps();
+
+  // Every other handle (indices 0, 2, 4, ...), so requested blocks are not
+  // adjacent and a zero threshold keeps them from being coalesced.
+  std::vector<BlockHandle> test_blocks;
+  for (size_t idx = 0; idx < block_handles.size(); idx += 2) {
+    test_blocks.push_back(block_handles[idx]);
+  }
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = test_blocks;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+  job->job_options.io_coalesce_threshold = 0;
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  for (size_t idx = 0; idx < test_blocks.size(); ++idx) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(idx, &block);
+    ASSERT_OK(read_status);
+  }
+
+  // Offsets we asked for ...
+  std::unordered_set<uint64_t> expected_offsets;
+  for (const BlockHandle& handle : test_blocks) {
+    expected_offsets.insert(handle.offset());
+  }
+  // ... versus offsets the file system saw in MultiRead requests.
+  std::unordered_set<uint64_t> actual_offsets;
+  for (const ReadOp& op : tracking_fs_->GetReadOps()) {
+    if (op.type != ReadOp::kMultiRead) {
+      continue;
+    }
+    for (const std::pair<uint64_t, size_t>& req : op.requests) {
+      actual_offsets.insert(req.first);
+    }
+  }
+  // Verify all expected offsets were read
+  for (const uint64_t expected : expected_offsets) {
+    EXPECT_TRUE(actual_offsets.count(expected) > 0)
+        << "Expected read at offset " << expected << " but it was not found";
+  }
+}
+// Runs a sync-IO job through a dispatcher capped at 1MB of prefetch memory
+// and checks that submission succeeds and every block can still be read
+// through the returned ReadSet.
+TEST_F(IODispatcherTest, MemoryLimitBlocksWhenExceeded) {
+  // Dispatcher with a small memory limit (1MB)
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 1 * 1024 * 1024;  // 1MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(50, &table, &block_handles));
+  ASSERT_GT(block_handles.size(), 0);
+  // Submission is expected to succeed right away (non-blocking).
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // Blocks whose prefetch was deferred may be read synchronously here.
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// Submits two jobs back to back against a dispatcher with a 1KB prefetch
+// budget. Both SubmitJob calls must return OK, and every block of the second
+// job must still be readable.
+TEST_F(IODispatcherTest, SubmitJobNeverBlocks) {
+  // Dispatcher with a tiny memory limit
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 1024;  // 1KB - very small
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> first_table;
+  std::vector<BlockHandle> first_handles;
+  ASSERT_OK(CreateAndOpenSST(50, &first_table, &first_handles));
+  ASSERT_GT(first_handles.size(), 0);
+  // First job - uses up all memory
+  auto first_job = std::make_shared<IOJob>();
+  first_job->table = first_table.get();
+  first_job->block_handles = first_handles;
+  first_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> first_read_set;
+  // Should succeed immediately
+  ASSERT_OK(dispatcher->SubmitJob(first_job, &first_read_set));
+  // Second job - should also succeed immediately (not block)
+  std::unique_ptr<BlockBasedTable> second_table;
+  std::vector<BlockHandle> second_handles;
+  ASSERT_OK(CreateAndOpenSST(30, &second_table, &second_handles));
+  auto second_job = std::make_shared<IOJob>();
+  second_job->table = second_table.get();
+  second_job->block_handles = second_handles;
+  second_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> second_read_set;
+  // Should succeed immediately - prefetch is just deferred
+  ASSERT_OK(dispatcher->SubmitJob(second_job, &second_read_set));
+  // Reads work - blocks are fetched synchronously on demand
+  const size_t num_second = second_handles.size();
+  for (size_t idx = 0; idx < num_second; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(second_read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// With a 100KB prefetch budget: reads every block of a first job, submits a
+// second job, releases the first job's blocks (which is meant to let pending
+// prefetches proceed) and then checks that the second job's blocks are all
+// readable.
+TEST_F(IODispatcherTest, BlockReleaseTriggersWaitingJob) {
+  // Dispatcher with a small memory limit
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> first_table;
+  std::vector<BlockHandle> first_handles;
+  ASSERT_OK(CreateAndOpenSST(30, &first_table, &first_handles));
+  ASSERT_GT(first_handles.size(), 0);
+  // First job
+  auto first_job = std::make_shared<IOJob>();
+  first_job->table = first_table.get();
+  first_job->block_handles = first_handles;
+  first_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> first_read_set;
+  ASSERT_OK(dispatcher->SubmitJob(first_job, &first_read_set));
+  ASSERT_NE(first_read_set, nullptr);
+  // Touch every block of the first job.
+  const size_t num_first = first_handles.size();
+  for (size_t idx = 0; idx < num_first; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(first_read_set->ReadIndex(idx, &entry));
+  }
+  // Second job - prefetch will be deferred due to memory limit
+  std::unique_ptr<BlockBasedTable> second_table;
+  std::vector<BlockHandle> second_handles;
+  ASSERT_OK(CreateAndOpenSST(20, &second_table, &second_handles));
+  auto second_job = std::make_shared<IOJob>();
+  second_job->table = second_table.get();
+  second_job->block_handles = second_handles;
+  second_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> second_read_set;
+  // Should succeed immediately
+  ASSERT_OK(dispatcher->SubmitJob(second_job, &second_read_set));
+  ASSERT_NE(second_read_set, nullptr);
+  // Give back the first job's blocks - this should trigger pending prefetches
+  for (size_t idx = 0; idx < num_first; ++idx) {
+    first_read_set->ReleaseBlock(idx);
+  }
+  // Every block of the second job must now be readable.
+  const size_t num_second = second_handles.size();
+  for (size_t idx = 0; idx < num_second; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(second_read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// Submits three jobs to one dispatcher with a 10MB prefetch budget, reads
+// every block of each ReadSet, releases the first ReadSet's blocks and then
+// checks that one more job can be submitted and fully read.
+TEST_F(IODispatcherTest, MultipleReadSetsShareMemoryBudget) {
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 10 * 1024 * 1024;  // 10MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> handles_per_job;
+  // Build and submit three jobs, each on its own table.
+  for (int round = 0; round < 3; ++round) {
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> handles;
+    ASSERT_OK(CreateAndOpenSST(20 + round * 5, &table, &handles));
+    auto io_job = std::make_shared<IOJob>();
+    io_job->block_handles = handles;
+    io_job->table = table.get();
+    io_job->job_options.read_options.async_io = false;
+    // The job only holds a raw pointer, so ownership of the table is moved
+    // into tables_ (declared outside this test body) to outlive this
+    // iteration.
+    tables_.push_back(std::move(table));
+    handles_per_job.push_back(handles);
+    std::shared_ptr<ReadSet> read_set;
+    ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+    read_sets.push_back(read_set);
+  }
+  // Every ReadSet must be able to serve all of its blocks.
+  for (size_t set_idx = 0; set_idx < read_sets.size(); ++set_idx) {
+    const size_t num_blocks = handles_per_job[set_idx].size();
+    for (size_t blk_idx = 0; blk_idx < num_blocks; ++blk_idx) {
+      CachableEntry<Block> entry;
+      ASSERT_OK(read_sets[set_idx]->ReadIndex(blk_idx, &entry));
+      ASSERT_NE(entry.GetValue(), nullptr);
+    }
+  }
+  // Release all blocks from first ReadSet
+  for (size_t blk_idx = 0; blk_idx < handles_per_job[0].size(); ++blk_idx) {
+    read_sets[0]->ReleaseBlock(blk_idx);
+  }
+  // One more job - should work because first ReadSet released memory
+  std::unique_ptr<BlockBasedTable> extra_table;
+  std::vector<BlockHandle> extra_handles;
+  ASSERT_OK(CreateAndOpenSST(25, &extra_table, &extra_handles));
+  auto extra_job = std::make_shared<IOJob>();
+  extra_job->block_handles = extra_handles;
+  extra_job->table = extra_table.get();
+  extra_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> extra_read_set;
+  ASSERT_OK(dispatcher->SubmitJob(extra_job, &extra_read_set));
+  ASSERT_NE(extra_read_set, nullptr);
+  const size_t num_extra = extra_handles.size();
+  for (size_t blk_idx = 0; blk_idx < num_extra; ++blk_idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(extra_read_set->ReadIndex(blk_idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// Configures max_prefetch_memory_bytes = 0 (treated by this test as "no
+// limit") and checks that a sync-IO job can be submitted and every block
+// read.
+TEST_F(IODispatcherTest, NoMemoryLimitWhenZero) {
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 0;  // No limit
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(50, &table, &block_handles));
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// With a 100KB prefetch budget: submits two jobs inside a nested scope, lets
+// the first job's ReadSet handle go out of scope, and then checks that every
+// block of the second job (whose ReadSet outlives the scope) is readable.
+TEST_F(IODispatcherTest, MemoryReleasedOnReadSetDestruction) {
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  // Both tables live at function scope so they outlive the ReadSets that
+  // refer to them.
+  std::unique_ptr<BlockBasedTable> first_table;
+  std::vector<BlockHandle> first_handles;
+  ASSERT_OK(CreateAndOpenSST(30, &first_table, &first_handles));
+  // The second table is created up front so it is available after the first
+  // ReadSet is destroyed.
+  std::unique_ptr<BlockBasedTable> second_table;
+  std::vector<BlockHandle> second_handles;
+  ASSERT_OK(CreateAndOpenSST(30, &second_table, &second_handles));
+  std::shared_ptr<ReadSet> surviving_read_set;
+  {
+    auto first_job = std::make_shared<IOJob>();
+    first_job->table = first_table.get();
+    first_job->block_handles = first_handles;
+    first_job->job_options.read_options.async_io = false;
+    std::shared_ptr<ReadSet> scoped_read_set;
+    ASSERT_OK(dispatcher->SubmitJob(first_job, &scoped_read_set));
+    ASSERT_NE(scoped_read_set, nullptr);
+    // Second job goes in while the first is still alive - prefetch will be
+    // deferred
+    auto second_job = std::make_shared<IOJob>();
+    second_job->table = second_table.get();
+    second_job->block_handles = second_handles;
+    second_job->job_options.read_options.async_io = false;
+    // Should succeed immediately
+    ASSERT_OK(dispatcher->SubmitJob(second_job, &surviving_read_set));
+    ASSERT_NE(surviving_read_set, nullptr);
+    // The first ReadSet handle leaves scope here; that should release its
+    // memory and trigger pending prefetches for the second ReadSet.
+  }
+  // All blocks of the second job must be readable once the first ReadSet is
+  // gone.
+  const size_t num_second = second_handles.size();
+  for (size_t idx = 0; idx < num_second; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(surviving_read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// Test that partial prefetch dispatches as many blocks as memory allows
+// and queues the rest for later dispatch. The number of blocks dispatched
+// during SubmitJob is observed through the
+// "IODispatcherImpl::DispatchPrefetch:BlockCount" sync point.
+TEST_F(IODispatcherTest, PartialPrefetchDispatchesWhatFits) {
+  // Skip this test if io_uring is not available since partial prefetch
+  // only applies to async IO
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+  // Create dispatcher with memory limit that allows only some blocks
+  // Each block is ~16KB, so 50KB allows roughly 3 blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10 blocks - only ~3 should fit in memory
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;  // Use async IO
+  // Declared outside the sync-point scope because the ReadSet is still used
+  // after the callback has been removed.
+  std::shared_ptr<ReadSet> read_set;
+  {
+    // The callback captures a local by reference and is registered on the
+    // shared SyncPoint instance, so it must be removed on every exit path,
+    // including the early return performed by a failing ASSERT_*.
+    struct SyncPointCleanup {
+      ~SyncPointCleanup() {
+        SyncPoint::GetInstance()->DisableProcessing();
+        SyncPoint::GetInstance()->ClearAllCallBacks();
+      }
+    };
+    // Use sync point to count blocks dispatched during SubmitJob. The
+    // counter is declared before the guard so the guard (destroyed first)
+    // removes the callback while the counter is still alive.
+    size_t blocks_dispatched_on_submit = 0;
+    SyncPointCleanup sync_point_cleanup;
+    SyncPoint::GetInstance()->SetCallBack(
+        "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+          auto* indices = static_cast<std::vector<size_t>*>(arg);
+          blocks_dispatched_on_submit += indices->size();
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+    // With partial prefetch, we expect SOME blocks to have been dispatched
+    // (the ones that fit in memory), but not ALL blocks
+    // This is the key assertion: partial prefetch means > 0 blocks dispatched
+    // even though total memory needed exceeds the limit
+    EXPECT_GT(blocks_dispatched_on_submit, 0)
+        << "Expected some blocks to be dispatched with partial prefetch";
+    EXPECT_LT(blocks_dispatched_on_submit, block_handles.size())
+        << "Expected not all blocks to be dispatched (memory limit should apply)";
+  }
+  // Now read all blocks - remaining blocks will be fetched on demand
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+  // Verify all blocks were ultimately read
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+// With a ~20KB prefetch budget and async IO, checks that the lowest offset
+// seen among the recorded ReadAsync operations (first request of each) is
+// the offset of block 0. If no async read was recorded the offset check is
+// skipped. Afterwards every block is read to completion.
+TEST_F(IODispatcherTest, PartialPrefetchPrioritizesEarlierIndices) {
+  // Nothing to verify when io_uring is not compiled in.
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+  // Memory limit that allows only 1-2 blocks
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes =
+      20 * 1024;  // 20KB - room for ~1 block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(10, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 5);
+  tracking_fs_->ClearReadOps();
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = true;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  // Snapshot of the reads issued so far.
+  const auto recorded_ops = tracking_fs_->GetReadOps();
+  // Smallest offset among the first requests of all async read operations;
+  // stays at UINT64_MAX when no async read was recorded.
+  uint64_t lowest_async_offset = UINT64_MAX;
+  for (const auto& op : recorded_ops) {
+    if (op.type != ReadOp::kReadAsync || op.requests.empty()) {
+      continue;
+    }
+    lowest_async_offset = std::min(lowest_async_offset, op.requests[0].first);
+  }
+  // Earlier indices being prioritized means the first block (lowest offset)
+  // must be among the async reads.
+  if (lowest_async_offset != UINT64_MAX) {
+    EXPECT_EQ(lowest_async_offset, block_handles[0].offset())
+        << "Expected first async read to be for the first block (earliest "
+           "index)";
+  }
+  // Drain every block to complete the test.
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+}
+// Test that blocks larger than the memory budget are excluded from prefetch
+// and fall back to synchronous read. The budget is set to half the size of
+// the first block, and dispatch activity is observed through the
+// "IODispatcherImpl::DispatchPrefetch:BlockCount" sync point.
+TEST_F(IODispatcherTest, OversizedBlocksFallbackToSyncRead) {
+  // Skip this test if io_uring is not available since we need async IO
+  if (!kIOUringPresent) {
+    return;
+  }
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 3);
+  // Calculate the size of a single block
+  size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+  // Create dispatcher with memory limit smaller than a single block
+  // This means ALL blocks are "oversized" and should fall back to sync read
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+  // Declared outside the sync-point scope because the ReadSet is still used
+  // after the callback has been removed.
+  std::shared_ptr<ReadSet> read_set;
+  {
+    // The callback captures a local by reference and is registered on the
+    // shared SyncPoint instance, so it must be removed on every exit path,
+    // including the early return performed by a failing ASSERT_*.
+    struct SyncPointCleanup {
+      ~SyncPointCleanup() {
+        SyncPoint::GetInstance()->DisableProcessing();
+        SyncPoint::GetInstance()->ClearAllCallBacks();
+      }
+    };
+    // Track dispatches - with oversized blocks, nothing should be dispatched.
+    // The counter is declared before the guard so the guard (destroyed
+    // first) removes the callback while the counter is still alive.
+    size_t blocks_dispatched = 0;
+    SyncPointCleanup sync_point_cleanup;
+    SyncPoint::GetInstance()->SetCallBack(
+        "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+          auto* indices = static_cast<std::vector<size_t>*>(arg);
+          blocks_dispatched += indices->size();
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+    // No blocks should have been dispatched since they're all oversized
+    EXPECT_EQ(blocks_dispatched, 0)
+        << "Expected no blocks to be dispatched when all blocks are oversized";
+  }
+  // All blocks should still be readable via sync fallback
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+  // Blocks couldn't be prefetched, so at least one sync read must have been
+  // counted.
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+// With a 50KB prefetch budget and async IO: reads the first five blocks on
+// demand, releases them, then reads all remaining blocks. The point is that
+// the later reads still succeed after some blocks were consumed early; the
+// final counter check only requires that at least the remaining blocks were
+// counted.
+TEST_F(IODispatcherTest, PartialReadsUpdateCoalescedGroups) {
+  // Nothing to verify when io_uring is not compiled in.
+  if (!kIOUringPresent) {
+    return;
+  }
+  // Memory limit that allows only some blocks
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(20, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 10);
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = true;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // Number of blocks consumed up front, and the same value clamped to the
+  // number of available blocks for the first two loops.
+  const size_t kEarlyBlocks = 5;
+  const size_t num_blocks = block_handles.size();
+  const size_t early_end = num_blocks < kEarlyBlocks ? num_blocks : kEarlyBlocks;
+  // Read some blocks directly (simulating on-demand access before prefetch)
+  // This removes them from pending and should update coalesced group accounting
+  for (size_t idx = 0; idx < early_end; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry));
+    ASSERT_NE(entry.GetValue(), nullptr);
+  }
+  // Give the early blocks back - this frees memory
+  for (size_t idx = 0; idx < early_end; ++idx) {
+    read_set->ReleaseBlock(idx);
+  }
+  // The remaining blocks must still be readable even though some blocks were
+  // taken out of pending groups before dispatch.
+  for (size_t idx = kEarlyBlocks; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry))
+        << "Failed to read block " << idx;
+    ASSERT_NE(entry.GetValue(), nullptr) << "Block " << idx << " is null";
+  }
+  // Sum of all read counters reported by the ReadSet.
+  const uint64_t total_reads = read_set->GetNumSyncReads() +
+                               read_set->GetNumAsyncReads() +
+                               read_set->GetNumCacheHits();
+  // Five blocks were read first, then the rest.
+  EXPECT_GE(total_reads, num_blocks - kEarlyBlocks)
+      << "Expected at least the remaining blocks to be counted";
+}
+// Sets the prefetch budget to twice the size of the first block, submits an
+// async job over all blocks and checks that every block is readable and that
+// the ReadSet's sync + async + cache-hit counters add up to the block count.
+TEST_F(IODispatcherTest, MixedOversizedAndNormalBlocks) {
+  // Nothing to verify when io_uring is not compiled in.
+  if (!kIOUringPresent) {
+    return;
+  }
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(10, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 5);
+  // Size of the first block, used as the "typical" block size.
+  const size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+  // Budget for exactly 2 typical blocks; groups of 3+ blocks then become
+  // "oversized" as a group.
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = typical_block_size * 2;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = true;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // Every block must be readable regardless of prefetch status.
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry))
+        << "Failed to read block " << idx;
+    ASSERT_NE(entry.GetValue(), nullptr) << "Block " << idx << " is null";
+  }
+  // The read counters must account for each block exactly once.
+  const uint64_t total_reads = read_set->GetNumSyncReads() +
+                               read_set->GetNumAsyncReads() +
+                               read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, num_blocks);
+}
+// With a 100KB prefetch budget and async IO: reads each block and releases
+// it immediately, then checks that the ReadSet's sync + async + cache-hit
+// counters add up to the block count.
+TEST_F(IODispatcherTest, MemoryAccountingWithPartialGroupConsumption) {
+  // Nothing to verify when io_uring is not compiled in.
+  if (!kIOUringPresent) {
+    return;
+  }
+  // Dispatcher with a specific memory limit
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(30, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 10);
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = true;
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // Consume blocks one by one, handing each back right after it is read.
+  // This tests that RemoveFromPending correctly updates pending state
+  // and that TryDispatchPendingPrefetches filters correctly
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry))
+        << "Failed to read block " << idx;
+    ASSERT_NE(entry.GetValue(), nullptr) << "Block " << idx << " is null";
+    read_set->ReleaseBlock(idx);
+  }
+  // The read counters must account for each block exactly once.
+  const uint64_t total_reads = read_set->GetNumSyncReads() +
+                               read_set->GetNumAsyncReads() +
+                               read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, num_blocks);
+}
+// With a 50KB prefetch budget and async_io disabled: every block must be
+// readable, at least one sync read must be counted and no async read may be
+// counted.
+TEST_F(IODispatcherTest, SyncPrefetchWithMemoryLimit) {
+  // Dispatcher with a small memory limit
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(20, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 10);
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = false;  // Sync IO
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // The memory limit must not prevent any block from being read.
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry))
+        << "Failed to read block " << idx;
+    ASSERT_NE(entry.GetValue(), nullptr) << "Block " << idx << " is null";
+  }
+  // Only the sync path may have been used.
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads with async_io=false";
+  EXPECT_EQ(read_set->GetNumAsyncReads(), 0)
+      << "Expected no async reads with async_io=false";
+}
+// Sets the prefetch budget to half the size of the first block and, with
+// async_io disabled, checks that every block is still readable and that at
+// least one sync read is counted.
+TEST_F(IODispatcherTest, OversizedBlocksWithSyncIO) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(10, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 3);
+  // Size of the first block, used as the reference block size.
+  const size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+  // A budget below one block makes ALL blocks "oversized".
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes =
+      single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  auto io_job = std::make_shared<IOJob>();
+  io_job->table = table.get();
+  io_job->block_handles = block_handles;
+  io_job->job_options.read_options.async_io = false;  // Sync IO
+  std::shared_ptr<ReadSet> read_set;
+  ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set));
+  ASSERT_NE(read_set, nullptr);
+  // Every block must still be served via the sync fallback.
+  const size_t num_blocks = block_handles.size();
+  for (size_t idx = 0; idx < num_blocks; ++idx) {
+    CachableEntry<Block> entry;
+    ASSERT_OK(read_set->ReadIndex(idx, &entry))
+        << "Failed to read block " << idx;
+    ASSERT_NE(entry.GetValue(), nullptr) << "Block " << idx << " is null";
+  }
+  // At least one sync read must have been counted.
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+// Uses a 1-byte prefetch budget and checks, for sync IO and (when io_uring
+// is compiled in) async IO, that a job can be submitted and every block
+// read.
+TEST_F(IODispatcherTest, SingleBlockLargerThanTotalMemory) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  ASSERT_OK(CreateAndOpenSST(5, &table, &block_handles));
+  ASSERT_GE(block_handles.size(), 1);
+  // 1 byte - smaller than any block
+  IODispatcherOptions dispatcher_opts;
+  dispatcher_opts.max_prefetch_memory_bytes = 1;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(dispatcher_opts));
+  // Sync mode first, then async mode.
+  const bool io_modes[] = {false, true};
+  for (const bool use_async : io_modes) {
+    // The async pass only runs when io_uring is available.
+    if (use_async && !kIOUringPresent) {
+      continue;
+    }
+    auto io_job = std::make_shared<IOJob>();
+    io_job->table = table.get();
+    io_job->block_handles = block_handles;
+    io_job->job_options.read_options.async_io = use_async;
+    std::shared_ptr<ReadSet> read_set;
+    ASSERT_OK(dispatcher->SubmitJob(io_job, &read_set))
+        << "SubmitJob failed with async=" << use_async;
+    ASSERT_NE(read_set, nullptr);
+    // Every block must be readable in this mode.
+    const size_t num_blocks = block_handles.size();
+    for (size_t idx = 0; idx < num_blocks; ++idx) {
+      CachableEntry<Block> entry;
+      ASSERT_OK(read_set->ReadIndex(idx, &entry))
+          << "Failed to read block " << idx << " with async=" << use_async;
+      ASSERT_NE(entry.GetValue(), nullptr)
+          << "Block " << idx << " is null with async=" << use_async;
+    }
+  }
+}
+// Test that sync prefetching defers later groups and dispatches them
+// when memory is released. Dispatch calls are observed through the
+// "IODispatcherImpl::DispatchPrefetch:BlockCount" sync point.
+TEST_F(IODispatcherTest, SyncPrefetchDefersAndDispatchesLaterGroups) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10+ blocks so we have enough to test deferred dispatch
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+  // Calculate typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+  // Set memory limit to fit approximately 3 blocks
+  // This should cause groups to be split and some deferred
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 3;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+  // Declared outside the sync-point scope because the ReadSet is still used
+  // after the callback has been removed.
+  std::shared_ptr<ReadSet> read_set;
+  {
+    // The callback captures a local by reference and is registered on the
+    // shared SyncPoint instance, so it must be removed on every exit path,
+    // including the early return performed by a failing ASSERT_*.
+    struct SyncPointCleanup {
+      ~SyncPointCleanup() {
+        SyncPoint::GetInstance()->DisableProcessing();
+        SyncPoint::GetInstance()->ClearAllCallBacks();
+      }
+    };
+    // Track dispatch calls. The vector is declared before the guard so the
+    // guard (destroyed first) removes the callback while the vector is still
+    // alive.
+    std::vector<size_t> dispatch_counts;
+    SyncPointCleanup sync_point_cleanup;
+    SyncPoint::GetInstance()->SetCallBack(
+        "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+          auto* indices = static_cast<std::vector<size_t>*>(arg);
+          dispatch_counts.push_back(indices->size());
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+    // After SubmitJob, some blocks should have been dispatched (first group)
+    // and remaining groups should be queued
+    size_t initial_dispatch_count = dispatch_counts.size();
+    EXPECT_GT(initial_dispatch_count, 0)
+        << "Expected at least one dispatch during SubmitJob";
+    // Read and release first few blocks - this should trigger deferred
+    // dispatch
+    for (size_t i = 0; i < 3 && i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+      // Release to free memory
+      read_set->ReleaseBlock(i);
+    }
+    // After releasing blocks, more dispatches may have occurred as the
+    // pending queue gets processed. NOTE(review): this check is >=, so it
+    // also passes when no additional dispatch happened.
+    size_t dispatch_count_after_release = dispatch_counts.size();
+    EXPECT_GE(dispatch_count_after_release, initial_dispatch_count)
+        << "Expected more dispatches after releasing blocks";
+  }
+  // All remaining blocks should still be readable
+  for (size_t i = 3; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+}
+// Test that coalesced groups are properly split based on memory budget
+TEST_F(IODispatcherTest, CoalescedGroupsSplitByMemoryBudget) {
+  // Checks that every prefetch dispatch observed through the
+  // "IODispatcherImpl::DispatchPrefetch:BlockCount" sync point carries no
+  // more blocks than the configured prefetch memory budget was sized for.
+  //
+  // Number of blocks the budget is sized for. Shared by the budget
+  // computation and every per-dispatch assertion so they cannot drift apart.
+  const size_t kBlocksInBudget = 5;
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(15, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+  // Use the first block (including its trailer) as the typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+  // Set memory limit to fit exactly kBlocksInBudget blocks.
+  // With 10+ blocks, we should get at least 2 groups
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * kBlocksInBudget;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+  // Track how many blocks are in each dispatch call
+  std::vector<size_t> blocks_per_dispatch;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_per_dispatch.push_back(indices->size());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  // read_set is dereferenced below; fail the test instead of crashing if
+  // SubmitJob reported success without producing a ReadSet.
+  ASSERT_NE(read_set, nullptr);
+  // First dispatch should have at most kBlocksInBudget blocks (memory limit)
+  ASSERT_GT(blocks_per_dispatch.size(), 0);
+  EXPECT_LE(blocks_per_dispatch[0], kBlocksInBudget)
+      << "First dispatch should be limited by memory budget";
+  // Read and release all blocks to trigger remaining dispatches
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    read_set->ReleaseBlock(i);
+  }
+  // Stop recording before inspecting the collected counts
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  // Verify each dispatch was limited by memory budget
+  for (size_t i = 0; i < blocks_per_dispatch.size(); ++i) {
+    EXPECT_LE(blocks_per_dispatch[i], kBlocksInBudget)
+        << "Dispatch " << i << " exceeded memory budget";
+  }
+}
+// Regression tests for a bug where ReadIndex moved values out of
+// pinned_blocks_ via std::move, but neither ReleaseBlock() nor the destructor
+// released memory accounting because they checked pinned_blocks_.GetValue()
+// which was null after the move.
+// Tests run with both sync and async IO modes to cover Case 1 and Case 2
+// in ReadIndex().
+TEST_F(IODispatcherTest, MemoryReleasedAfterReadIndexThenReleaseBlock) {
+  // Runs once per IO mode and checks that, after every block has been read
+  // and then released, the released-bytes ticker equals the granted-bytes
+  // ticker.
+  for (const bool use_async : {false, true}) {
+    // The async pass is skipped when io_uring is not compiled in
+    const bool runnable = !use_async || kIOUringPresent;
+    if (!runnable) {
+      continue;
+    }
+    const std::string trace_label = "async_io=" + std::to_string(use_async);
+    SCOPED_TRACE(trace_label);
+    auto ticker_stats = CreateDBStatistics();
+    IODispatcherOptions io_opts;
+    io_opts.statistics = ticker_stats.get();
+    io_opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+    std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(io_opts));
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+    ASSERT_OK(CreateAndOpenSST(20, &table, &block_handles));
+    ASSERT_GT(block_handles.size(), 0);
+    const size_t num_blocks = block_handles.size();
+    auto job = std::make_shared<IOJob>();
+    job->table = table.get();
+    job->block_handles = block_handles;
+    job->job_options.read_options.async_io = use_async;
+    std::shared_ptr<ReadSet> read_set;
+    ASSERT_OK(dispatcher->SubmitJob(job, &read_set));
+    ASSERT_NE(read_set, nullptr);
+    // Submitting the job must already have granted some prefetch memory
+    ASSERT_GT(ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED), 0);
+    // Pull every block through ReadIndex, which hands the value over to the
+    // caller (moving it out of pinned_blocks_). Memory released along the way
+    // lets TryDispatchPendingPrefetches acquire budget for pending groups, so
+    // the granted counter can keep growing while this loop runs.
+    for (size_t idx = 0; idx != num_blocks; ++idx) {
+      CachableEntry<Block> fetched;
+      ASSERT_OK(read_set->ReadIndex(idx, &fetched));
+      ASSERT_NE(fetched.GetValue(), nullptr);
+    }
+    // Now release every block. This is expected to be a no-op for memory
+    // accounting because ReadIndex already released the memory when it moved
+    // the values out.
+    for (size_t idx = 0; idx != num_blocks; ++idx) {
+      read_set->ReleaseBlock(idx);
+    }
+    // Sample both tickers only once everything above has finished, because
+    // additional memory may have been granted for pending groups during the
+    // ReadIndex loop.
+    const uint64_t bytes_granted =
+        ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
+    const uint64_t bytes_released =
+        ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_RELEASED);
+    // The regression shows up as released < granted: ReleaseBlock skipped
+    // ReleaseMemory once the pinned_blocks_ value had been moved out.
+    EXPECT_EQ(bytes_released, bytes_granted);
+  }
+}
+// Test that ReadSet destructor releases memory for blocks that were read
+// via ReadIndex but never explicitly released via ReleaseBlock.
+TEST_F(IODispatcherTest, DestructorReleasesMemoryAfterReadIndex) {
+  // Runs once per IO mode and checks that dropping the ReadSet without any
+  // ReleaseBlock call still leaves the released-bytes ticker equal to the
+  // granted-bytes ticker.
+  for (const bool use_async : {false, true}) {
+    // No io_uring at compile time means there is no async pass to run
+    const bool runnable = !use_async || kIOUringPresent;
+    if (!runnable) {
+      continue;
+    }
+    const std::string trace_label = "async_io=" + std::to_string(use_async);
+    SCOPED_TRACE(trace_label);
+    auto ticker_stats = CreateDBStatistics();
+    IODispatcherOptions io_opts;
+    io_opts.statistics = ticker_stats.get();
+    io_opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+    std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(io_opts));
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+    ASSERT_OK(CreateAndOpenSST(20, &table, &block_handles));
+    ASSERT_GT(block_handles.size(), 0);
+    {
+      // The job and its ReadSet live only inside this scope so that the
+      // ReadSet is destroyed before the tickers are compared.
+      auto job = std::make_shared<IOJob>();
+      job->table = table.get();
+      job->block_handles = block_handles;
+      job->job_options.read_options.async_io = use_async;
+      std::shared_ptr<ReadSet> read_set;
+      ASSERT_OK(dispatcher->SubmitJob(job, &read_set));
+      ASSERT_NE(read_set, nullptr);
+      // Submitting the job must already have granted some prefetch memory
+      const uint64_t granted_at_submit =
+          ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
+      ASSERT_GT(granted_at_submit, 0);
+      // Fetch every block through ReadIndex (which moves the value out of
+      // pinned_blocks_) and deliberately never call ReleaseBlock, leaving
+      // cleanup to the destructor.
+      const size_t num_blocks = block_handles.size();
+      for (size_t idx = 0; idx != num_blocks; ++idx) {
+        CachableEntry<Block> fetched;
+        ASSERT_OK(read_set->ReadIndex(idx, &fetched));
+      }
+      // Leaving this scope drops read_set; its destructor should release all
+      // memory.
+    }
+    const uint64_t bytes_granted =
+        ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
+    const uint64_t bytes_released =
+        ticker_stats->getTickerCount(PREFETCH_MEMORY_BYTES_RELEASED);
+    // Every prefetched block must be accounted for by the destructor,
+    // including those whose values ReadIndex already moved out.
+    EXPECT_EQ(bytes_released, bytes_granted);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  // Install the port layer's stack trace handler before anything else runs
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  // Hand the command line to GoogleTest, then run every registered test and
+  // use its result as the process exit code.
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}