@nxtedition/rocksdb 13.5.9 → 13.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/deps/rocksdb/rocksdb/BUCK +2 -1
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  3. package/deps/rocksdb/rocksdb/Makefile +1 -1
  4. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +4 -5
  5. package/deps/rocksdb/rocksdb/db/c.cc +13 -0
  6. package/deps/rocksdb/rocksdb/db/c_test.c +0 -12
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +8 -8
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +2 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +5 -4
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -1
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -10
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +11 -6
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +10 -16
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +2 -4
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -17
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +164 -0
  17. package/deps/rocksdb/rocksdb/db/corruption_test.cc +74 -3
  18. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +39 -4
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +2 -83
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -4
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +11 -11
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +0 -3
  23. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -9
  24. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +16 -54
  25. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +0 -6
  26. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +186 -0
  27. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +3 -40
  28. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -54
  29. package/deps/rocksdb/rocksdb/db/db_test.cc +0 -292
  30. package/deps/rocksdb/rocksdb/db/db_test2.cc +0 -1235
  31. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -0
  32. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +11 -4
  33. package/deps/rocksdb/rocksdb/db/log_reader.cc +11 -11
  34. package/deps/rocksdb/rocksdb/db/merge_helper.h +1 -1
  35. package/deps/rocksdb/rocksdb/db/multi_scan.cc +70 -0
  36. package/deps/rocksdb/rocksdb/db/version_set.cc +15 -8
  37. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +4 -0
  38. package/deps/rocksdb/rocksdb/env/composite_env.cc +4 -0
  39. package/deps/rocksdb/rocksdb/env/env.cc +4 -0
  40. package/deps/rocksdb/rocksdb/env/env_encryption.cc +38 -3
  41. package/deps/rocksdb/rocksdb/env/env_test.cc +36 -1
  42. package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -4
  43. package/deps/rocksdb/rocksdb/env/io_posix.cc +16 -0
  44. package/deps/rocksdb/rocksdb/env/io_posix.h +3 -0
  45. package/deps/rocksdb/rocksdb/env/mock_env.cc +5 -0
  46. package/deps/rocksdb/rocksdb/file/readahead_raf.cc +4 -0
  47. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -6
  48. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +5 -0
  49. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +25 -1
  50. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +10 -0
  51. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -0
  52. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +12 -0
  53. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +12 -8
  54. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +29 -28
  55. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +26 -6
  56. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +9 -0
  57. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +3 -0
  58. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +142 -0
  59. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +2 -0
  60. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +2 -2
  61. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +2 -0
  62. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  63. package/deps/rocksdb/rocksdb/options/options_helper.h +3 -0
  64. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -0
  65. package/deps/rocksdb/rocksdb/port/win/io_win.cc +20 -0
  66. package/deps/rocksdb/rocksdb/port/win/io_win.h +4 -0
  67. package/deps/rocksdb/rocksdb/src.mk +2 -1
  68. package/deps/rocksdb/rocksdb/table/block_based/block.cc +31 -34
  69. package/deps/rocksdb/rocksdb/table/block_based/block.h +2 -4
  70. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +43 -7
  71. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -0
  72. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +367 -2
  73. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +69 -23
  74. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +54 -6
  75. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +27 -5
  76. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +167 -3
  77. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +6 -2
  78. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +6 -0
  79. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +12 -0
  80. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +1 -0
  81. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -3
  82. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +10 -7
  83. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +244 -0
  84. package/deps/rocksdb/rocksdb/table/external_table.cc +1 -1
  85. package/deps/rocksdb/rocksdb/table/format.cc +51 -33
  86. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  87. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +13 -8
  88. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +1 -3
  89. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +5 -1
  90. package/deps/rocksdb/rocksdb/table/table_test.cc +629 -1
  91. package/deps/rocksdb/rocksdb/test_util/testutil.cc +0 -1
  92. package/deps/rocksdb/rocksdb/test_util/testutil.h +5 -0
  93. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +183 -94
  94. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +71 -0
  95. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +37 -22
  96. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +308 -0
  97. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +189 -0
  98. package/deps/rocksdb/rocksdb/util/cast_util.h +22 -11
  99. package/deps/rocksdb/rocksdb/util/coding.h +4 -3
  100. package/deps/rocksdb/rocksdb/util/compression.cc +2 -0
  101. package/deps/rocksdb/rocksdb/util/compression.h +16 -6
  102. package/deps/rocksdb/rocksdb/util/compression_test.cc +1679 -15
  103. package/deps/rocksdb/rocksdb/util/stop_watch.h +17 -7
  104. package/deps/rocksdb/rocksdb/util/timer_queue_test.cc +17 -3
  105. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +10 -0
  106. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +5 -0
  107. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +2 -0
  108. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +18 -2
  109. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +22 -3
  110. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +5 -0
  111. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +22 -2
  112. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +15 -4
  113. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +61 -0
  114. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +18 -0
  115. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +3 -0
  116. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +3 -0
  117. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +9 -3
  118. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +9 -0
  119. package/deps/rocksdb/rocksdb.gyp +15 -1
  120. package/package.json +1 -1
  121. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  122. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  123. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +0 -131
  124. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +0 -90
@@ -46,6 +46,7 @@
46
46
  #include "rocksdb/table.h"
47
47
  #include "rocksdb/table_properties.h"
48
48
  #include "rocksdb/trace_record.h"
49
+ #include "rocksdb/user_defined_index.h"
49
50
  #include "table/block_based/binary_search_index_reader.h"
50
51
  #include "table/block_based/block.h"
51
52
  #include "table/block_based/block_based_table_factory.h"
@@ -58,6 +59,7 @@
58
59
  #include "table/block_based/hash_index_reader.h"
59
60
  #include "table/block_based/partitioned_filter_block.h"
60
61
  #include "table/block_based/partitioned_index_reader.h"
62
+ #include "table/block_based/user_defined_index_wrapper.h"
61
63
  #include "table/block_fetcher.h"
62
64
  #include "table/format.h"
63
65
  #include "table/get_context.h"
@@ -104,7 +106,11 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
104
106
  bool use_block_cache_for_lookup) const; \
105
107
  template Status BlockBasedTable::LookupAndPinBlocksInCache<T>( \
106
108
  const ReadOptions& ro, const BlockHandle& handle, \
107
- CachableEntry<T>* out_parsed_block) const;
109
+ CachableEntry<T>* out_parsed_block) const; \
110
+ template Status BlockBasedTable::CreateAndPinBlockInCache<T>( \
111
+ const ReadOptions& ro, const BlockHandle& handle, \
112
+ BlockContents* block_contents, CachableEntry<T>* out_parsed_block) \
113
+ const;
108
114
 
109
115
  INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock);
110
116
  INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict);
@@ -113,6 +119,7 @@ INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex);
113
119
  INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex);
114
120
  INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kRangeDeletion);
115
121
  INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kMetaIndex);
122
+ INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kUserDefinedIndex);
116
123
 
117
124
  } // namespace ROCKSDB_NAMESPACE
118
125
 
@@ -1318,6 +1325,34 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
1318
1325
  if (!s.ok()) {
1319
1326
  return s;
1320
1327
  }
1328
+ if (table_options.user_defined_index_factory != nullptr) {
1329
+ std::string udi_name(table_options.user_defined_index_factory->Name());
1330
+ BlockHandle udi_block_handle;
1331
+
1332
+ // Should we use FindOptionalMetaBlock here?
1333
+ s = FindMetaBlock(meta_iter, kUserDefinedIndexPrefix + udi_name,
1334
+ &udi_block_handle);
1335
+ if (!s.ok()) {
1336
+ return s;
1337
+ }
1338
+ // Read the block, and allocate on heap or pin in cache. The UDI block is
1339
+ // not compressed. RetrieveBlock will verify the checksum.
1340
+ s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle,
1341
+ rep_->decompressor.get(), &rep_->udi_block,
1342
+ /*get_context=*/nullptr, lookup_context,
1343
+ /*for_compaction=*/false, use_cache, /*async_read=*/false,
1344
+ /*use_block_cache_for_lookup=*/false);
1345
+ if (!s.ok()) {
1346
+ return s;
1347
+ }
1348
+ assert(!rep_->udi_block.IsEmpty());
1349
+
1350
+ std::unique_ptr<UserDefinedIndexReader> udi_reader =
1351
+ table_options.user_defined_index_factory->NewReader(
1352
+ rep_->udi_block.GetValue()->data);
1353
+ index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
1354
+ udi_name, std::move(index_reader), std::move(udi_reader));
1355
+ }
1321
1356
 
1322
1357
  rep_->index_reader = std::move(index_reader);
1323
1358
 
@@ -1704,6 +1739,17 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
1704
1739
  return s;
1705
1740
  }
1706
1741
 
1742
+ template <typename TBlocklike>
1743
+ Status BlockBasedTable::CreateAndPinBlockInCache(
1744
+ const ReadOptions& ro, const BlockHandle& handle, BlockContents* contents,
1745
+ CachableEntry<TBlocklike>* out_parsed_block) const {
1746
+ return MaybeReadBlockAndLoadToCache(
1747
+ nullptr, ro, handle, rep_->decompressor.get(),
1748
+ /*for_compaction=*/false, out_parsed_block, nullptr, nullptr, contents,
1749
+ /*async_read=*/false,
1750
+ /*use_block_cache_for_lookup=*/true);
1751
+ }
1752
+
1707
1753
  // If contents is nullptr, this function looks up the block caches for the
1708
1754
  // data block referenced by handle, and read the block from disk if necessary.
1709
1755
  // If contents is non-null, it skips the cache lookup and disk read, since
@@ -1765,8 +1811,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
1765
1811
  ro.fill_cache) {
1766
1812
  Statistics* statistics = rep_->ioptions.stats;
1767
1813
  const bool maybe_compressed =
1768
- TBlocklike::kBlockType != BlockType::kFilter &&
1769
- TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
1814
+ BlockTypeMaybeCompressed(TBlocklike::kBlockType) &&
1770
1815
  rep_->decompressor;
1771
1816
  // This flag, if true, tells BlockFetcher to return the uncompressed
1772
1817
  // block when ReadBlockContents() is called.
@@ -1910,6 +1955,7 @@ BlockBasedTable::SaveLookupContextOrTraceRecord(
1910
1955
  trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
1911
1956
  break;
1912
1957
  case BlockType::kIndex:
1958
+ case BlockType::kUserDefinedIndex:
1913
1959
  trace_block_type = TraceType::kBlockTraceIndexBlock;
1914
1960
  break;
1915
1961
  default:
@@ -2002,9 +2048,7 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
2002
2048
  }
2003
2049
 
2004
2050
  const bool maybe_compressed =
2005
- TBlocklike::kBlockType != BlockType::kFilter &&
2006
- TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
2007
- rep_->decompressor;
2051
+ BlockTypeMaybeCompressed(TBlocklike::kBlockType) && rep_->decompressor;
2008
2052
  std::unique_ptr<TBlocklike> block;
2009
2053
 
2010
2054
  {
@@ -2747,6 +2791,10 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
2747
2791
  return BlockType::kIndex;
2748
2792
  }
2749
2793
 
2794
+ if (meta_block_name.starts_with(kUserDefinedIndexPrefix)) {
2795
+ return BlockType::kUserDefinedIndex;
2796
+ }
2797
+
2750
2798
  if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) {
2751
2799
  // Obsolete but possible in old files
2752
2800
  return BlockType::kInvalid;
@@ -228,11 +228,15 @@ class BlockBasedTable : public TableReader {
228
228
 
229
229
  // Create an iterator for index access. If iter is null, then a new object
230
230
  // is created on the heap, and the callee will have the ownership.
231
- // If a non-null iter is passed in, it will be used, and the returned value
232
- // is either the same as iter or a new on-heap object that
233
- // wraps the passed iter. In the latter case the return value points
234
- // to a different object then iter, and the callee has the ownership of the
235
- // returned object.
231
+ // If a non-null iter is passed in, it may be used, and the returned value
232
+ // is either the same as iter or a new on-heap object.
233
+ // In the latter case the return value points to a different object than
234
+ // iter, and the callee has the ownership of the returned object.
235
+ //
236
+ // Under all circumstances, the caller MUST use the returned iterator
237
+ // for further operations. If the returned iterator != iter, then the
238
+ // caller MUST ensure that iter stays in scope until the returned
239
+ // iterator is destroyed.
236
240
  virtual InternalIteratorBase<IndexValue>* NewIterator(
237
241
  const ReadOptions& read_options, bool disable_prefix_seek,
238
242
  IndexBlockIter* iter, GetContext* get_context,
@@ -295,11 +299,21 @@ class BlockBasedTable : public TableReader {
295
299
  Status GetKVPairsFromDataBlocks(const ReadOptions& read_options,
296
300
  std::vector<KVPairBlock>* kv_pair_blocks);
297
301
 
302
+ // Look up the block cache for the specified block.
303
+ // out_parsed_block is set to nullptr if the block is not found in the cache.
298
304
  template <typename TBlocklike>
299
305
  Status LookupAndPinBlocksInCache(
300
306
  const ReadOptions& ro, const BlockHandle& handle,
301
307
  CachableEntry<TBlocklike>* out_parsed_block) const;
302
308
 
309
+ // Create the block given in `block_contents` and insert it into block cache.
310
+ // `out_parsed_block` points to the inserted block if successful.
311
+ template <typename TBlocklike>
312
+ Status CreateAndPinBlockInCache(
313
+ const ReadOptions& ro, const BlockHandle& handle,
314
+ BlockContents* block_contents,
315
+ CachableEntry<TBlocklike>* out_parsed_block) const;
316
+
303
317
  struct Rep;
304
318
 
305
319
  Rep* get_rep() { return rep_; }
@@ -544,6 +558,12 @@ class BlockBasedTable : public TableReader {
544
558
 
545
559
  bool TimestampMayMatch(const ReadOptions& read_options) const;
546
560
 
561
+ bool BlockTypeMaybeCompressed(BlockType type) const {
562
+ return type != BlockType::kFilter &&
563
+ type != BlockType::kCompressionDictionary &&
564
+ type != BlockType::kUserDefinedIndex;
565
+ }
566
+
547
567
  // A cumulative data block file read in MultiGet lower than this size will
548
568
  // use a stack buffer
549
569
  static constexpr size_t kMultiGetReadStackBufSize = 8192;
@@ -689,6 +709,8 @@ struct BlockBasedTable::Rep {
689
709
  std::unique_ptr<CacheReservationManager::CacheReservationHandle>
690
710
  table_reader_cache_res_handle = nullptr;
691
711
 
712
+ CachableEntry<Block_kUserDefinedIndex> udi_block;
713
+
692
714
  SequenceNumber get_global_seqno(BlockType block_type) const {
693
715
  return (block_type == BlockType::kFilterPartitionIndex ||
694
716
  block_type == BlockType::kCompressionDictionary)
@@ -173,7 +173,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
173
173
  0 /* _tail_size */, user_defined_timestamps_persisted);
174
174
 
175
175
  std::unique_ptr<RandomAccessFileReader> file;
176
- NewFileReader(table_name, foptions, &file);
176
+ NewFileReader(table_name, foptions, &file, ioptions.statistics.get());
177
177
 
178
178
  uint64_t file_size = 0;
179
179
  ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
@@ -222,12 +222,15 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
222
222
  }
223
223
 
224
224
  void NewFileReader(const std::string& filename, const FileOptions& opt,
225
- std::unique_ptr<RandomAccessFileReader>* reader) {
225
+ std::unique_ptr<RandomAccessFileReader>* reader,
226
+ Statistics* stats = nullptr) {
226
227
  std::string path = Path(filename);
227
228
  std::unique_ptr<FSRandomAccessFile> f;
228
229
  ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
229
230
  reader->reset(new RandomAccessFileReader(std::move(f), path,
230
- env_->GetSystemClock().get()));
231
+ env_->GetSystemClock().get(),
232
+ /*io_tracer=*/nullptr,
233
+ /*stats=*/stats));
231
234
  }
232
235
  };
233
236
 
@@ -990,6 +993,167 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
990
993
  ASSERT_EQ(s.code(), Status::kCorruption);
991
994
  }
992
995
 
996
+ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
997
+ Options options;
998
+ options.statistics = CreateDBStatistics();
999
+ ReadOptions read_opts;
1000
+ size_t ts_sz = options.comparator->timestamp_size();
1001
+ std::vector<std::pair<std::string, std::string>> kv =
1002
+ BlockBasedTableReaderBaseTest::GenerateKVMap(
1003
+ 100 /* num_block */,
1004
+ true /* mixed_with_human_readable_string_value */, ts_sz);
1005
+
1006
+ std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
1007
+ CompressionTypeToString(compression_type_);
1008
+
1009
+ ImmutableOptions ioptions(options);
1010
+ CreateTable(table_name, ioptions, compression_type_, kv,
1011
+ compression_parallel_threads_, compression_dict_bytes_);
1012
+
1013
+ std::unique_ptr<BlockBasedTable> table;
1014
+ FileOptions foptions;
1015
+ foptions.use_direct_reads = true;
1016
+ InternalKeyComparator comparator(options.comparator);
1017
+ NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
1018
+ true /* bool prefetch_index_and_filter_in_cache */,
1019
+ nullptr /* status */, persist_udt_);
1020
+
1021
+ std::unique_ptr<InternalIterator> iter;
1022
+ iter.reset(table->NewIterator(
1023
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1024
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1025
+
1026
+ // Should coalesce into a single I/O
1027
+ std::vector<ScanOptions> scan_options(
1028
+ {ScanOptions(ExtractUserKey(kv[0].first),
1029
+ ExtractUserKey(kv[kEntriesPerBlock].first)),
1030
+ ScanOptions(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
1031
+ ExtractUserKey(kv[3 * kEntriesPerBlock].first))});
1032
+
1033
+ auto read_count_before =
1034
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1035
+ iter->Prepare(&scan_options);
1036
+ auto read_count_after =
1037
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1038
+ ASSERT_EQ(read_count_before + 1, read_count_after);
1039
+ iter->Seek(kv[0].first);
1040
+ for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
1041
+ ASSERT_TRUE(iter->Valid());
1042
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1043
+ iter->Next();
1044
+ }
1045
+ // Iter may still be valid after scan range. Upper layer (DBIter) handles
1046
+ // exact upper bound checking. So we don't check !iter->Valid() here.
1047
+ ASSERT_OK(iter->status());
1048
+ iter->Seek(kv[2 * kEntriesPerBlock].first);
1049
+ for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
1050
+ ASSERT_TRUE(iter->Valid());
1051
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1052
+ iter->Next();
1053
+ }
1054
+ ASSERT_OK(iter->status());
1055
+
1056
+ iter.reset(table->NewIterator(
1057
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1058
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1059
+ // No IO coalesce, should do MultiRead with 2 read requests.
1060
+ scan_options = {ScanOptions(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
1061
+ ExtractUserKey(kv[75 * kEntriesPerBlock].first)),
1062
+ ScanOptions(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
1063
+ ExtractUserKey(kv[95 * kEntriesPerBlock].first))};
1064
+ read_count_before =
1065
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1066
+ iter->Prepare(&scan_options);
1067
+ read_count_after =
1068
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1069
+ ASSERT_EQ(read_count_before + 2, read_count_after);
1070
+
1071
+ iter->Seek(kv[70 * kEntriesPerBlock].first);
1072
+ for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
1073
+ ASSERT_TRUE(iter->Valid());
1074
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1075
+ iter->Next();
1076
+ }
1077
+ ASSERT_OK(iter->status());
1078
+ iter->Seek(kv[90 * kEntriesPerBlock].first);
1079
+ for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
1080
+ ASSERT_TRUE(iter->Valid());
1081
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1082
+ iter->Next();
1083
+ }
1084
+ ASSERT_OK(iter->status());
1085
+
1086
+ iter.reset(table->NewIterator(
1087
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1088
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1089
+ // Should do three I/Os (see the assertion below): blocks loaded by the
1090
+ // previous scans are already in block cache, so only the gaps are read.
1091
+ scan_options = {ScanOptions(ExtractUserKey(kv[50 * kEntriesPerBlock].first))};
1092
+ read_count_before =
1093
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1094
+ iter->Prepare(&scan_options);
1095
+ read_count_after =
1096
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1097
+ ASSERT_EQ(read_count_before + 3, read_count_after);
1098
+ iter->Seek(kv[50 * kEntriesPerBlock].first);
1099
+ for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
1100
+ ASSERT_TRUE(iter->Valid());
1101
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1102
+ iter->Next();
1103
+ }
1104
+ ASSERT_FALSE(iter->Valid());
1105
+ ASSERT_OK(iter->status());
1106
+
1107
+ // Check cases when Seek key does not match start key in ScanOptions
1108
+ iter.reset(table->NewIterator(
1109
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1110
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1111
+ scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
1112
+ ExtractUserKey(kv[20 * kEntriesPerBlock].first)),
1113
+ ScanOptions(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
1114
+ ExtractUserKey(kv[40 * kEntriesPerBlock].first))};
1115
+ iter->Prepare(&scan_options);
1116
+ // Match start key
1117
+ iter->Seek(kv[10 * kEntriesPerBlock].first);
1118
+ for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
1119
+ ASSERT_TRUE(iter->Valid());
1120
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1121
+ iter->Next();
1122
+ }
1123
+ ASSERT_OK(iter->status());
1124
+ // Does not match start key of the second ScanOptions.
1125
+ iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
1126
+ for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock; ++i) {
1127
+ ASSERT_TRUE(iter->Valid());
1128
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1129
+ iter->Next();
1130
+ }
1131
+ ASSERT_FALSE(iter->Valid());
1132
+ ASSERT_OK(iter->status());
1133
+
1134
+ iter.reset(table->NewIterator(
1135
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1136
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1137
+ scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first)),
1138
+ ScanOptions(ExtractUserKey(kv[11 * kEntriesPerBlock].first))};
1139
+ iter->Prepare(&scan_options);
1140
+ // Does not match the first ScanOptions.
1141
+ iter->SeekToFirst();
1142
+ for (size_t i = 0; i < kEntriesPerBlock; ++i) {
1143
+ ASSERT_TRUE(iter->Valid());
1144
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1145
+ iter->Next();
1146
+ }
1147
+ ASSERT_OK(iter->status());
1148
+ iter->Seek(kv[10 * kEntriesPerBlock].first);
1149
+ for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
1150
+ ASSERT_TRUE(iter->Valid());
1151
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1152
+ iter->Next();
1153
+ }
1154
+ ASSERT_OK(iter->status());
1155
+ }
1156
+
993
1157
  // Param 1: compression type
994
1158
  // Param 2: whether to use direct reads
995
1159
  // Param 3: Block Based Table Index type, partitioned filters are also enabled
@@ -21,15 +21,19 @@
21
21
  // An entry for a particular key-value pair has the form:
22
22
  // shared_bytes: varint32
23
23
  // unshared_bytes: varint32
24
- // value_length: varint32
24
+ // value_length: varint32 (NOTE1)
25
25
  // key_delta: char[unshared_bytes]
26
26
  // value: char[value_length]
27
- // shared_bytes == 0 for restart points.
27
+ // shared_bytes == 0 (explicitly stored) for restart points.
28
28
  //
29
29
  // The trailer of the block has the form:
30
30
  // restarts: uint32[num_restarts]
31
31
  // num_restarts: uint32
32
32
  // restarts[i] contains the offset within the block of the ith restart point.
33
+ //
34
+ // NOTE1: omitted for format_version >= 4 index blocks, because the value is
35
+ // composed of one (shared_bytes > 0) or two (shared_bytes == 0) varints, whose
36
+ // length is self-describing.
33
37
 
34
38
  #include "table/block_based/block_builder.h"
35
39
 
@@ -46,6 +46,12 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
46
46
  protection_bytes_per_key);
47
47
  }
48
48
 
49
+ void BlockCreateContext::Create(
50
+ std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
51
+ BlockContents&& block) {
52
+ parsed_out->reset(new Block_kUserDefinedIndex(std::move(block)));
53
+ }
54
+
49
55
  void BlockCreateContext::Create(
50
56
  std::unique_ptr<ParsedFullFilterBlock>* parsed_out, BlockContents&& block) {
51
57
  parsed_out->reset(new ParsedFullFilterBlock(
@@ -67,6 +67,16 @@ class Block_kMetaIndex : public Block {
67
67
  static constexpr BlockType kBlockType = BlockType::kMetaIndex;
68
68
  };
69
69
 
70
+ class Block_kUserDefinedIndex : public BlockContents {
71
+ public:
72
+ static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock;
73
+ static constexpr BlockType kBlockType = BlockType::kUserDefinedIndex;
74
+
75
+ explicit Block_kUserDefinedIndex(BlockContents&& other)
76
+ : BlockContents(std::move(other)) {}
77
+ const Slice& ContentSlice() const { return data; }
78
+ };
79
+
70
80
  struct BlockCreateContext : public Cache::CreateContext {
71
81
  BlockCreateContext() {}
72
82
  BlockCreateContext(const BlockBasedTableOptions* _table_options,
@@ -126,6 +136,8 @@ struct BlockCreateContext : public Cache::CreateContext {
126
136
  BlockContents&& block);
127
137
  void Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
128
138
  BlockContents&& block);
139
+ void Create(std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
140
+ BlockContents&& block);
129
141
  void Create(std::unique_ptr<ParsedFullFilterBlock>* parsed_out,
130
142
  BlockContents&& block);
131
143
  void Create(std::unique_ptr<DecompressorDict>* parsed_out,
@@ -27,6 +27,7 @@ enum class BlockType : uint8_t {
27
27
  kHashIndexMetadata,
28
28
  kMetaIndex,
29
29
  kIndex,
30
+ kUserDefinedIndex,
30
31
  // Note: keep kInvalid the last value when adding new enum values.
31
32
  kInvalid
32
33
  };
@@ -1012,9 +1012,6 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
1012
1012
  FastLocalBloomBitsBuilder bloom_fallback_;
1013
1013
  };
1014
1014
 
1015
- // for the linker, at least with DEBUG_LEVEL=2
1016
- constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;
1017
-
1018
1015
  class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
1019
1016
  public:
1020
1017
  Standard128RibbonBitsReader(const char* data, size_t len_bytes,
@@ -46,7 +46,7 @@ class IndexBuilder {
46
46
  // primary index.
47
47
  struct IndexBlocks {
48
48
  Slice index_block_contents;
49
- std::unordered_map<std::string, Slice> meta_blocks;
49
+ std::unordered_map<std::string, std::pair<BlockType, Slice>> meta_blocks;
50
50
  };
51
51
  IndexBuilder(const InternalKeyComparator* comparator, size_t ts_sz,
52
52
  bool persist_user_defined_timestamps)
@@ -78,7 +78,8 @@ class IndexBuilder {
78
78
 
79
79
  // This method will be called whenever a key is added. The subclasses may
80
80
  // override OnKeyAdded() if they need to collect additional information.
81
- virtual void OnKeyAdded(const Slice& /*key*/) {}
81
+ virtual void OnKeyAdded(const Slice& /*key*/,
82
+ const std::optional<Slice>& /*value*/) {}
82
83
 
83
84
  // Inform the index builder that all entries have been written. Block builder
84
85
  // may therefore perform any operation required for block finalization.
@@ -180,7 +181,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
180
181
  seperator_is_key_plus_seq_ = (format_version <= 2);
181
182
  }
182
183
 
183
- void OnKeyAdded(const Slice& key) override {
184
+ void OnKeyAdded(const Slice& key,
185
+ const std::optional<Slice>& /*value*/) override {
184
186
  if (include_first_key_ && current_block_first_internal_key_.empty()) {
185
187
  current_block_first_internal_key_.assign(key.data(), key.size());
186
188
  }
@@ -358,7 +360,8 @@ class HashIndexBuilder : public IndexBuilder {
358
360
  separator_scratch);
359
361
  }
360
362
 
361
- void OnKeyAdded(const Slice& key) override {
363
+ void OnKeyAdded(const Slice& key,
364
+ const std::optional<Slice>& /*value*/) override {
362
365
  auto key_prefix = hash_key_extractor_->Transform(key);
363
366
  bool is_first_entry = pending_block_num_ == 0;
364
367
 
@@ -393,9 +396,9 @@ class HashIndexBuilder : public IndexBuilder {
393
396
  Status s = primary_index_builder_.Finish(index_blocks,
394
397
  last_partition_block_handle);
395
398
  index_blocks->meta_blocks.insert(
396
- {kHashIndexPrefixesBlock.c_str(), prefix_block_});
397
- index_blocks->meta_blocks.insert(
398
- {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
399
+ {kHashIndexPrefixesBlock.c_str(), {BlockType::kIndex, prefix_block_}});
400
+ index_blocks->meta_blocks.insert({kHashIndexPrefixesMetadataBlock.c_str(),
401
+ {BlockType::kIndex, prefix_meta_block_}});
399
402
  return s;
400
403
  }
401
404