@nxtedition/rocksdb 14.0.0 → 15.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/binding.cc +52 -179
  2. package/deps/rocksdb/rocksdb/BUCK +7 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +29 -14
  4. package/deps/rocksdb/rocksdb/Directory.Build.props +9 -0
  5. package/deps/rocksdb/rocksdb/Makefile +6 -1
  6. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +4 -4
  7. package/deps/rocksdb/rocksdb/ccache_msvc_compiler.bat +1 -0
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +17 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +8 -3
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +10 -0
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +522 -60
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +69 -10
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +443 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +14 -3
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +5 -5
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +3 -6
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +1 -1
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +28 -5
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +4 -4
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +6 -3
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +455 -98
  23. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +4 -2
  24. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +13 -1
  25. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +146 -0
  26. package/deps/rocksdb/rocksdb/db/db_follower_test.cc +2 -2
  27. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +6 -0
  28. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -2
  29. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +18 -19
  30. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -0
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +665 -14
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +83 -0
  33. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +68 -0
  34. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +101 -0
  35. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +44 -0
  36. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +1 -2
  37. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +22 -5
  38. package/deps/rocksdb/rocksdb/db/log_reader.h +4 -4
  39. package/deps/rocksdb/rocksdb/db/log_writer.h +1 -1
  40. package/deps/rocksdb/rocksdb/db/merge_helper.h +1 -1
  41. package/deps/rocksdb/rocksdb/db/version_edit.cc +477 -139
  42. package/deps/rocksdb/rocksdb/db/version_edit.h +228 -8
  43. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +333 -0
  44. package/deps/rocksdb/rocksdb/db/write_thread.h +1 -1
  45. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  46. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +247 -32
  47. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -0
  48. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.cc +61 -0
  49. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +17 -28
  50. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +16 -0
  51. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +6 -1
  52. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +46 -18
  53. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +18 -1
  54. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +8 -7
  55. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +4 -4
  56. package/deps/rocksdb/rocksdb/env/fs_posix.cc +1 -0
  57. package/deps/rocksdb/rocksdb/file/filename.cc +40 -0
  58. package/deps/rocksdb/rocksdb/file/filename.h +14 -1
  59. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +4 -3
  60. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +2 -1
  61. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +26 -7
  62. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -3
  63. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +59 -0
  64. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  65. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +24 -0
  66. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +2 -1
  67. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +4 -0
  68. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +13 -8
  69. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -0
  70. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -0
  71. package/deps/rocksdb/rocksdb/options/options_test.cc +5 -0
  72. package/deps/rocksdb/rocksdb/src.mk +2 -0
  73. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +73 -16
  74. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +10 -5
  75. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +32 -0
  76. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +18 -27
  77. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +0 -3
  78. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +5 -1
  79. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +708 -217
  80. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +11 -6
  81. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +5 -3
  82. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +27 -19
  83. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +24 -6
  84. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +51 -18
  85. package/deps/rocksdb/rocksdb/table/block_based/index_builder_test.cc +183 -0
  86. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +4 -2
  87. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h +0 -2
  88. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +8 -3
  89. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -1
  90. package/deps/rocksdb/rocksdb/table/table_test.cc +222 -36
  91. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +246 -6
  92. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +86 -0
  93. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +21 -0
  94. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -0
  95. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -1
  96. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +1 -0
  97. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +0 -2
  98. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +12 -12
  99. package/index.js +27 -37
  100. package/package.json +1 -1
  101. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  102. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -51,7 +51,9 @@
51
51
  #include "rocksdb/status.h"
52
52
  #include "rocksdb/table.h"
53
53
  #include "rocksdb/utilities/options_type.h"
54
+ #include "table/format.h"
54
55
  #include "table/merging_iterator.h"
56
+ #include "table/meta_blocks.h"
55
57
  #include "table/table_builder.h"
56
58
  #include "table/unique_id_impl.h"
57
59
  #include "test_util/sync_point.h"
@@ -253,7 +255,9 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
253
255
 
254
256
  void CompactionJob::Prepare(
255
257
  std::optional<std::pair<std::optional<Slice>, std::optional<Slice>>>
256
- known_single_subcompact) {
258
+ known_single_subcompact,
259
+ const CompactionProgress& compaction_progress,
260
+ log::Writer* compaction_progress_writer) {
257
261
  db_mutex_->AssertHeld();
258
262
  AutoThreadOperationStageUpdater stage_updater(
259
263
  ThreadStatus::STAGE_COMPACTION_PREPARE);
@@ -303,6 +307,9 @@ void CompactionJob::Prepare(
303
307
  /*sub_job_id*/ 0);
304
308
  }
305
309
 
310
+ MaybeAssignCompactionProgressAndWriter(compaction_progress,
311
+ compaction_progress_writer);
312
+
306
313
  // collect all seqno->time information from the input files which will be used
307
314
  // to encode seqno->time to the output files.
308
315
  SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber;
@@ -401,6 +408,25 @@ void CompactionJob::Prepare(
401
408
  options_file_number_ = versions_->options_file_number();
402
409
  }
403
410
 
411
+ void CompactionJob::MaybeAssignCompactionProgressAndWriter(
412
+ const CompactionProgress& compaction_progress,
413
+ log::Writer* compaction_progress_writer) {
414
+ // LIMITATION: Only supports resuming single subcompaction for now
415
+ if (compact_->sub_compact_states.size() != 1) {
416
+ return;
417
+ }
418
+
419
+ if (!compaction_progress.empty()) {
420
+ assert(compaction_progress.size() == 1);
421
+ SubcompactionState* sub_compact = &compact_->sub_compact_states[0];
422
+ const SubcompactionProgress& subcompaction_progress =
423
+ compaction_progress[0];
424
+ sub_compact->SetSubcompactionProgress(subcompaction_progress);
425
+ }
426
+
427
+ compaction_progress_writer_ = compaction_progress_writer;
428
+ }
429
+
404
430
  uint64_t CompactionJob::GetSubcompactionsLimit() {
405
431
  return extra_num_subcompaction_threads_reserved_ +
406
432
  std::max(
@@ -924,7 +950,8 @@ void CompactionJob::FinalizeCompactionRun(
924
950
  UpdateCompactionJobInputStatsFromInternalStats(internal_stats_,
925
951
  num_input_range_del);
926
952
  }
927
- UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
953
+ UpdateCompactionJobOutputStatsFromInternalStats(input_status,
954
+ internal_stats_);
928
955
  RecordCompactionIOStats();
929
956
 
930
957
  LogFlush(db_options_.info_log);
@@ -1249,8 +1276,8 @@ Status CompactionJob::SetupAndValidateCompactionFilter(
1249
1276
  return Status::OK();
1250
1277
  }
1251
1278
 
1252
- void CompactionJob::InitializeReadOptions(
1253
- ColumnFamilyData* cfd, ReadOptions& read_options,
1279
+ void CompactionJob::InitializeReadOptionsAndBoundaries(
1280
+ const size_t ts_sz, ReadOptions& read_options,
1254
1281
  SubcompactionKeyBoundaries& boundaries) {
1255
1282
  read_options.verify_checksums = true;
1256
1283
  read_options.fill_cache = false;
@@ -1264,8 +1291,6 @@ void CompactionJob::InitializeReadOptions(
1264
1291
 
1265
1292
  // Remove the timestamps from boundaries because boundaries created in
1266
1293
  // GenSubcompactionBoundaries doesn't strip away the timestamp.
1267
- const size_t ts_sz = cfd->user_comparator()->timestamp_size();
1268
-
1269
1294
  if (boundaries.start.has_value()) {
1270
1295
  read_options.iterate_lower_bound = &(*boundaries.start);
1271
1296
  if (ts_sz > 0) {
@@ -1282,30 +1307,7 @@ void CompactionJob::InitializeReadOptions(
1282
1307
  read_options.iterate_upper_bound = &(*boundaries.end_without_ts);
1283
1308
  }
1284
1309
  }
1285
- }
1286
-
1287
- InternalIterator* CompactionJob::CreateInputIterator(
1288
- SubcompactionState* sub_compact, ColumnFamilyData* cfd,
1289
- SubcompactionInternalIterators& iterators,
1290
- SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
1291
- // This is assigned after creation of SubcompactionState to simplify that
1292
- // creation across both CompactionJob and CompactionServiceCompactionJob
1293
- sub_compact->AssignRangeDelAggregator(
1294
- std::make_unique<CompactionRangeDelAggregator>(
1295
- &cfd->internal_comparator(), job_context_->snapshot_seqs,
1296
- &full_history_ts_low_, &trim_ts_));
1297
-
1298
- InitializeReadOptions(cfd, read_options, boundaries);
1299
-
1300
- // Although the v2 aggregator is what the level iterator(s) know about,
1301
- // the AddTombstones calls will be propagated down to the v1 aggregator.
1302
- iterators.raw_input =
1303
- std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
1304
- read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
1305
- file_options_for_read_, boundaries.start, boundaries.end));
1306
- InternalIterator* input = iterators.raw_input.get();
1307
1310
 
1308
- const size_t ts_sz = cfd->user_comparator()->timestamp_size();
1309
1311
  if (ts_sz > 0) {
1310
1312
  if (ts_sz <= strlen(boundaries.kMaxTs)) {
1311
1313
  boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz);
@@ -1314,7 +1316,6 @@ InternalIterator* CompactionJob::CreateInputIterator(
1314
1316
  boundaries.ts_slice = Slice(boundaries.max_ts);
1315
1317
  }
1316
1318
  }
1317
-
1318
1319
  if (boundaries.start.has_value()) {
1319
1320
  boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber,
1320
1321
  kValueTypeForSeek);
@@ -1335,6 +1336,29 @@ InternalIterator* CompactionJob::CreateInputIterator(
1335
1336
  boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey();
1336
1337
  boundaries.end_user_key = boundaries.end_ikey.GetUserKey();
1337
1338
  }
1339
+ }
1340
+
1341
+ InternalIterator* CompactionJob::CreateInputIterator(
1342
+ SubcompactionState* sub_compact, ColumnFamilyData* cfd,
1343
+ SubcompactionInternalIterators& iterators,
1344
+ SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
1345
+ const size_t ts_sz = cfd->user_comparator()->timestamp_size();
1346
+ InitializeReadOptionsAndBoundaries(ts_sz, read_options, boundaries);
1347
+
1348
+ // This is assigned after creation of SubcompactionState to simplify that
1349
+ // creation across both CompactionJob and CompactionServiceCompactionJob
1350
+ sub_compact->AssignRangeDelAggregator(
1351
+ std::make_unique<CompactionRangeDelAggregator>(
1352
+ &cfd->internal_comparator(), job_context_->snapshot_seqs,
1353
+ &full_history_ts_low_, &trim_ts_));
1354
+
1355
+ // Although the v2 aggregator is what the level iterator(s) know about,
1356
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
1357
+ iterators.raw_input =
1358
+ std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
1359
+ read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
1360
+ file_options_for_read_, boundaries.start, boundaries.end));
1361
+ InternalIterator* input = iterators.raw_input.get();
1338
1362
 
1339
1363
  if (boundaries.start.has_value() || boundaries.end.has_value()) {
1340
1364
  iterators.clip = std::make_unique<ClippingIterator>(
@@ -1404,7 +1428,8 @@ std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
1404
1428
  env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
1405
1429
  blob_resources.blob_file_builder.get(), db_options_.allow_data_in_errors,
1406
1430
  db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
1407
- sub_compact->compaction->DoesInputReferenceBlobFiles(),
1431
+ sub_compact->compaction
1432
+ ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
1408
1433
  sub_compact->compaction, compaction_filter, shutting_down_,
1409
1434
  db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
1410
1435
  }
@@ -1424,11 +1449,13 @@ CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
1424
1449
 
1425
1450
  const CompactionFileCloseFunc close_file_func =
1426
1451
  [this, sub_compact, start_user_key, end_user_key](
1427
- CompactionOutputs& outputs, const Status& status,
1428
- const Slice& next_table_min_key) {
1429
- return this->FinishCompactionOutputFile(status, sub_compact, outputs,
1430
- next_table_min_key,
1431
- start_user_key, end_user_key);
1452
+ const Status& status,
1453
+ const ParsedInternalKey& prev_table_last_internal_key,
1454
+ const Slice& next_table_min_key, const CompactionIterator* c_iter,
1455
+ CompactionOutputs& outputs) {
1456
+ return this->FinishCompactionOutputFile(
1457
+ status, prev_table_last_internal_key, next_table_min_key,
1458
+ start_user_key, end_user_key, c_iter, sub_compact, outputs);
1432
1459
  };
1433
1460
 
1434
1461
  return {open_file_func, close_file_func};
@@ -1442,6 +1469,9 @@ Status CompactionJob::ProcessKeyValue(
1442
1469
  const uint64_t kRecordStatsEvery = 1000;
1443
1470
  [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
1444
1471
 
1472
+ IterKey last_output_key;
1473
+ ParsedInternalKey last_output_ikey;
1474
+
1445
1475
  TEST_SYNC_POINT_CALLBACK(
1446
1476
  "CompactionJob::ProcessKeyValueCompaction()::Processing",
1447
1477
  static_cast<void*>(const_cast<Compaction*>(sub_compact->compaction)));
@@ -1491,8 +1521,9 @@ Status CompactionJob::ProcessKeyValue(
1491
1521
  // and `close_file_func`.
1492
1522
  // TODO: it would be better to have the compaction file open/close moved
1493
1523
  // into `CompactionOutputs` which has the output file information.
1494
- status = sub_compact->AddToOutput(*c_iter, use_proximal_output,
1495
- open_file_func, close_file_func);
1524
+ status =
1525
+ sub_compact->AddToOutput(*c_iter, use_proximal_output, open_file_func,
1526
+ close_file_func, last_output_ikey);
1496
1527
  if (!status.ok()) {
1497
1528
  break;
1498
1529
  }
@@ -1500,6 +1531,10 @@ Status CompactionJob::ProcessKeyValue(
1500
1531
  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:2",
1501
1532
  static_cast<void*>(const_cast<std::atomic<bool>*>(
1502
1533
  &manual_compaction_canceled_)));
1534
+
1535
+ last_output_key.SetInternalKey(c_iter->key(), &last_output_ikey);
1536
+ last_output_ikey.sequence = ikey.sequence;
1537
+ last_output_ikey.type = ikey.type;
1503
1538
  c_iter->Next();
1504
1539
 
1505
1540
  #ifndef NDEBUG
@@ -1684,6 +1719,22 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1684
1719
  ReadOptions read_options;
1685
1720
  const WriteOptions write_options(Env::IOPriority::IO_LOW,
1686
1721
  Env::IOActivity::kCompaction);
1722
+
1723
+ InternalIterator* input_iter = CreateInputIterator(
1724
+ sub_compact, cfd, iterators, boundaries, read_options);
1725
+
1726
+ assert(input_iter);
1727
+
1728
+ Status status =
1729
+ MaybeResumeSubcompactionProgressOnInputIterator(sub_compact, input_iter);
1730
+
1731
+ if (status.IsNotFound()) {
1732
+ input_iter->SeekToFirst();
1733
+ } else if (!status.ok()) {
1734
+ sub_compact->status = status;
1735
+ return;
1736
+ }
1737
+
1687
1738
  MergeHelper merge(
1688
1739
  env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
1689
1740
  compaction_filter, db_options_.info_log.get(),
@@ -1692,11 +1743,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1692
1743
  compact_->compaction->level(), db_options_.stats);
1693
1744
  BlobFileResources blob_resources;
1694
1745
 
1695
- InternalIterator* input_iter = CreateInputIterator(
1696
- sub_compact, cfd, iterators, boundaries, read_options);
1697
- assert(input_iter);
1698
- input_iter->SeekToFirst();
1699
-
1700
1746
  auto c_iter =
1701
1747
  CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
1702
1748
  merge, blob_resources, write_options);
@@ -1711,9 +1757,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1711
1757
  auto [open_file_func, close_file_func] =
1712
1758
  CreateFileHandlers(sub_compact, boundaries);
1713
1759
 
1714
- Status status =
1715
- ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
1716
- close_file_func, prev_cpu_micros);
1760
+ status = ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
1761
+ close_file_func, prev_cpu_micros);
1717
1762
 
1718
1763
  status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
1719
1764
 
@@ -1795,9 +1840,11 @@ void CompactionJob::RecordDroppedKeys(
1795
1840
  }
1796
1841
 
1797
1842
  Status CompactionJob::FinishCompactionOutputFile(
1798
- const Status& input_status, SubcompactionState* sub_compact,
1799
- CompactionOutputs& outputs, const Slice& next_table_min_key,
1800
- const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
1843
+ const Status& input_status,
1844
+ const ParsedInternalKey& prev_table_last_internal_key,
1845
+ const Slice& next_table_min_key, const Slice* comp_start_user_key,
1846
+ const Slice* comp_end_user_key, const CompactionIterator* c_iter,
1847
+ SubcompactionState* sub_compact, CompactionOutputs& outputs) {
1801
1848
  AutoThreadOperationStageUpdater stage_updater(
1802
1849
  ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
1803
1850
  assert(sub_compact != nullptr);
@@ -1971,10 +2018,94 @@ Status CompactionJob::FinishCompactionOutputFile(
1971
2018
  }
1972
2019
  }
1973
2020
 
2021
+ if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact, c_iter,
2022
+ prev_table_last_internal_key,
2023
+ next_table_min_key, meta)) {
2024
+ UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact);
2025
+ s = PersistSubcompactionProgress(sub_compact);
2026
+ }
1974
2027
  outputs.ResetBuilder();
1975
2028
  return s;
1976
2029
  }
1977
2030
 
2031
+ bool CompactionJob::ShouldUpdateSubcompactionProgress(
2032
+ const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
2033
+ const ParsedInternalKey& prev_table_last_internal_key,
2034
+ const Slice& next_table_min_internal_key, const FileMetaData* meta) const {
2035
+ const auto* cfd = sub_compact->compaction->column_family_data();
2036
+ // No need to update when the output will not get persisted
2037
+ if (compaction_progress_writer_ == nullptr) {
2038
+ return false;
2039
+ }
2040
+
2041
+ // No need to update for a new empty output
2042
+ if (meta == nullptr) {
2043
+ return false;
2044
+ }
2045
+
2046
+ // TODO(hx235): save progress even on the last output file
2047
+ if (next_table_min_internal_key.empty()) {
2048
+ return false;
2049
+ }
2050
+
2051
+ // LIMITATION: Persisting compaction progress with timestamp
2052
+ // is not supported since the feature of persisting timestamp of the key in
2053
+ // SST files itself is still experimental
2054
+ size_t ts_sz = cfd->user_comparator()->timestamp_size();
2055
+ if (ts_sz > 0) {
2056
+ return false;
2057
+ }
2058
+
2059
+ // LIMITATION: Compaction progress persistence disabled for file boundaries
2060
+ // containing range deletions. Range deletions can span file boundaries, making
2061
+ // it difficult (but possible) to ensure adjacent output tables have different
2062
+ // user keys. See the boundary user-key check below for why different user keys of adjacent
2063
+ // output tables are needed
2064
+ const ValueType next_table_min_internal_key_type =
2065
+ ExtractValueType(next_table_min_internal_key);
2066
+ const ValueType prev_table_last_internal_key_type =
2067
+ prev_table_last_internal_key.user_key.empty()
2068
+ ? ValueType::kTypeValue
2069
+ : prev_table_last_internal_key.type;
2070
+
2071
+ if (next_table_min_internal_key_type == ValueType::kTypeRangeDeletion ||
2072
+ prev_table_last_internal_key_type == ValueType::kTypeRangeDeletion) {
2073
+ return false;
2074
+ }
2075
+
2076
+ // LIMITATION: Compaction progress persistence disabled when adjacent output
2077
+ // tables share the same user key at boundaries. This ensures a simple Seek()
2078
+ // of the next key when resuming can process all versions of a user key
2079
+ const Slice next_table_min_user_key =
2080
+ ExtractUserKey(next_table_min_internal_key);
2081
+ const Slice prev_table_last_user_key =
2082
+ prev_table_last_internal_key.user_key.empty()
2083
+ ? Slice()
2084
+ : prev_table_last_internal_key.user_key;
2085
+
2086
+ if (cfd->user_comparator()->EqualWithoutTimestamp(next_table_min_user_key,
2087
+ prev_table_last_user_key)) {
2088
+ return false;
2089
+ }
2090
+
2091
+ // LIMITATION: Don't save progress if the current key has already been scanned
2092
+ // (looked ahead) in the input but not yet output. This can happen with merge
2093
+ // operations, single deletes, and deletes at the bottommost level where
2094
+ // CompactionIterator needs to look ahead to process multiple entries for the
2095
+ // same user key before outputting a result. If we saved progress and resumed
2096
+ // at this boundary, the resumed session would see and process the same input
2097
+ // key again through Seek(), leading to incorrect double-counting in
2098
+ // number of processed input entries and input count verification failure
2099
+ //
2100
+ // TODO(hx235): Offset num_processed_input_records to avoid double counting
2101
+ // instead of disabling progress persistence.
2102
+ if (c_iter->IsCurrentKeyAlreadyScanned()) {
2103
+ return false;
2104
+ }
2105
+
2106
+ return true;
2107
+ }
2108
+
1978
2109
  Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
1979
2110
  assert(compact_);
1980
2111
 
@@ -2120,15 +2251,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
2120
2251
 
2121
2252
  // Pass temperature of the last level files to FileSystem.
2122
2253
  FileOptions fo_copy = file_options_;
2123
- Temperature temperature = sub_compact->compaction->output_temperature();
2124
- Temperature last_level_temp =
2125
- sub_compact->compaction->mutable_cf_options().last_level_temperature;
2126
- // Here last_level_temperature supersedes default_write_temperature, when
2127
- // enabled and applicable
2128
- if (last_level_temp != Temperature::kUnknown &&
2129
- sub_compact->compaction->is_last_level() && !outputs.IsProximalLevel()) {
2130
- temperature = last_level_temp;
2131
- }
2254
+ auto temperature =
2255
+ sub_compact->compaction->GetOutputTemperature(outputs.IsProximalLevel());
2132
2256
  fo_copy.temperature = temperature;
2133
2257
  fo_copy.write_hint = write_hint_;
2134
2258
 
@@ -2404,6 +2528,7 @@ void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats(
2404
2528
  }
2405
2529
 
2406
2530
  void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
2531
+ const Status& status,
2407
2532
  const InternalStats::CompactionStatsFull& internal_stats) const {
2408
2533
  assert(job_stats_);
2409
2534
  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
@@ -2434,7 +2559,7 @@ void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
2434
2559
  internal_stats.proximal_level_stats.num_output_files_blob;
2435
2560
  }
2436
2561
 
2437
- if (job_stats_->num_output_files > 0) {
2562
+ if (status.ok() && job_stats_->num_output_files > 0) {
2438
2563
  CopyPrefix(compact_->SmallestUserKey(),
2439
2564
  CompactionJobStats::kMaxPrefixLength,
2440
2565
  &job_stats_->smallest_output_key_prefix);
@@ -2515,6 +2640,344 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() {
2515
2640
  return Env::IO_LOW;
2516
2641
  }
2517
2642
 
2643
+ Status CompactionJob::ReadTablePropertiesDirectly(
2644
+ const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
2645
+ const FileMetaData* file_meta, const ReadOptions& read_options,
2646
+ std::shared_ptr<const TableProperties>* tp) {
2647
+ std::unique_ptr<FSRandomAccessFile> file;
2648
+ std::string file_name = GetTableFileName(file_meta->fd.GetNumber());
2649
+ Status s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
2650
+ nullptr /* dbg */);
2651
+ if (!s.ok()) {
2652
+ return s;
2653
+ }
2654
+
2655
+ std::unique_ptr<RandomAccessFileReader> file_reader(
2656
+ new RandomAccessFileReader(
2657
+ std::move(file), file_name, ioptions.clock, io_tracer_,
2658
+ ioptions.stats, Histograms::SST_READ_MICROS /* hist_type */,
2659
+ nullptr /* file_read_hist */, ioptions.rate_limiter.get(),
2660
+ ioptions.listeners));
2661
+
2662
+ std::unique_ptr<TableProperties> props;
2663
+
2664
+ uint64_t magic_number = kBlockBasedTableMagicNumber;
2665
+
2666
+ const auto* table_factory = moptions.table_factory.get();
2667
+ if (table_factory == nullptr) {
2668
+ return Status::Incomplete("Table factory is not set");
2669
+ } else {
2670
+ const auto& table_factory_name = table_factory->Name();
2671
+ if (table_factory_name == TableFactory::kPlainTableName()) {
2672
+ magic_number = kPlainTableMagicNumber;
2673
+ } else if (table_factory_name == TableFactory::kCuckooTableName()) {
2674
+ magic_number = kCuckooTableMagicNumber;
2675
+ }
2676
+ }
2677
+
2678
+ s = ReadTableProperties(file_reader.get(), file_meta->fd.GetFileSize(),
2679
+ magic_number, ioptions, read_options, &props);
2680
+ if (!s.ok()) {
2681
+ return s;
2682
+ }
2683
+
2684
+ *tp = std::move(props);
2685
+ return s;
2686
+ }
2687
+
2688
+ Status CompactionJob::ReadOutputFilesTableProperties(
2689
+ const autovector<FileMetaData>& output_files,
2690
+ const ReadOptions& read_options,
2691
+ std::vector<std::shared_ptr<const TableProperties>>&
2692
+ output_files_table_properties,
2693
+ bool is_proximal_level) {
2694
+ assert(!output_files.empty());
2695
+
2696
+ static const char* level_type =
2697
+ is_proximal_level ? "proximal output" : "output";
2698
+
2699
+ output_files_table_properties.reserve(output_files.size());
2700
+
2701
+ Status s;
2702
+
2703
+ for (const FileMetaData& metadata : output_files) {
2704
+ std::shared_ptr<const TableProperties> tp;
2705
+ s = ReadTablePropertiesDirectly(compact_->compaction->immutable_options(),
2706
+ compact_->compaction->mutable_cf_options(),
2707
+ &metadata, read_options, &tp);
2708
+ if (!s.ok()) {
2709
+ ROCKS_LOG_ERROR(
2710
+ db_options_.info_log,
2711
+ "Failed to read table properties for %s level output file #%" PRIu64
2712
+ ": %s",
2713
+ level_type, metadata.fd.GetNumber(), s.ToString().c_str());
2714
+ return s;
2715
+ }
2716
+
2717
+ if (tp == nullptr) {
2718
+ ROCKS_LOG_ERROR(db_options_.info_log,
2719
+ "Empty table property for %s level output file #%" PRIu64
2720
+ "",
2721
+ level_type, metadata.fd.GetNumber());
2722
+
2723
+ s = Status::Corruption("Empty table property for " +
2724
+ std::string(level_type) +
2725
+ " level output files during resuming");
2726
+ return s;
2727
+ }
2728
+ output_files_table_properties.push_back(tp);
2729
+ }
2730
+ return s;
2731
+ }
2732
+
2733
+ void CompactionJob::RestoreCompactionOutputs(
2734
+ const ColumnFamilyData* cfd,
2735
+ const std::vector<std::shared_ptr<const TableProperties>>&
2736
+ output_files_table_properties,
2737
+ SubcompactionProgressPerLevel& subcompaction_progress_per_level,
2738
+ CompactionOutputs* outputs_to_restore) {
2739
+ assert(outputs_to_restore->GetOutputs().size() == 0);
2740
+
2741
+ const auto& output_files = subcompaction_progress_per_level.GetOutputFiles();
2742
+
2743
+ for (size_t i = 0; i < output_files.size(); i++) {
2744
+ FileMetaData file_copy = output_files[i];
2745
+
2746
+ outputs_to_restore->AddOutput(std::move(file_copy),
2747
+ cfd->internal_comparator(),
2748
+ paranoid_file_checks_, true /* finished */);
2749
+
2750
+ outputs_to_restore->UpdateTableProperties(
2751
+ *output_files_table_properties[i]);
2752
+ }
2753
+
2754
+ outputs_to_restore->SetNumOutputRecords(
2755
+ subcompaction_progress_per_level.GetNumProcessedOutputRecords());
2756
+ }
2757
+
2758
+ // Attempt to resume compaction from a previously persisted compaction progress.
2759
+ //
2760
+ // RETURNS:
2761
+ // - Status::OK():
2762
+ // * Input iterator positioned at next unprocessed key
2763
+ // * CompactionOutputs objects fully restored for both output and proximal
2764
+ // output levels in SubcompactionState
2765
+ // * Compaction job statistics accurately reflect input and output records
2766
+ // processed for record count verification
2767
+ // * File number generation advanced to prevent conflicts with existing outputs
2768
+ // - Status::NotFound(): No valid progress to resume from
2769
+ // - Status::Corruption(): Resume key is invalid, beyond input range, or output
2770
+ // restoration failed
2771
+ // - Other non-OK status: Iterator errors or file system issues during
2772
+ // restoration
2773
+ //
2774
+ // The caller must check for Status::IsIncomplete() to distinguish between
2775
+ // "no resume needed" (proceed with `InternalIterator::SeekToFirst()`) vs
2776
+ // "resume failed" scenarios.
2777
+ Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator(
2778
+ SubcompactionState* sub_compact, InternalIterator* input_iter) {
2779
+ const ReadOptions read_options(Env::IOActivity::kCompaction);
2780
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
2781
+ SubcompactionProgress& subcompaction_progress =
2782
+ sub_compact->GetSubcompactionProgressRef();
2783
+
2784
+ if (subcompaction_progress.output_level_progress
2785
+ .GetNumProcessedOutputRecords() == 0 &&
2786
+ subcompaction_progress.proximal_output_level_progress
2787
+ .GetNumProcessedOutputRecords() == 0) {
2788
+ return Status::NotFound("No subcompaction progress to resume");
2789
+ }
2790
+
2791
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction : %s",
2792
+ cfd->GetName().c_str(), job_id_,
2793
+ subcompaction_progress.ToString().c_str());
2794
+
2795
+ input_iter->Seek(subcompaction_progress.next_internal_key_to_compact);
2796
+
2797
+ if (!input_iter->Valid()) {
2798
+ ROCKS_LOG_ERROR(db_options_.info_log,
2799
+ "[%s] [JOB %d] Iterator is invalid after "
2800
+ "seeking to the key to resume. This indicates the key is "
2801
+ "incorrectly beyond the input data range.",
2802
+ cfd->GetName().c_str(), job_id_);
2803
+ return Status::Corruption(
2804
+ "The key to resume is beyond the input data range");
2805
+ } else if (!input_iter->status().ok()) {
2806
+ ROCKS_LOG_ERROR(db_options_.info_log,
2807
+ "[%s] [JOB %d] Iterator has error after seeking to "
2808
+ "the key to resume: %s",
2809
+ cfd->GetName().c_str(), job_id_,
2810
+ input_iter->status().ToString().c_str());
2811
+ return Status::Corruption(
2812
+ "Iterator has error status after seeking to the key: " +
2813
+ input_iter->status().ToString());
2814
+ }
2815
+
2816
+ sub_compact->compaction_job_stats.has_accurate_num_input_records =
2817
+ subcompaction_progress.num_processed_input_records != 0;
2818
+
2819
+ sub_compact->compaction_job_stats.num_input_records =
2820
+ subcompaction_progress.num_processed_input_records;
2821
+
2822
+ for (const bool& is_proximal_level : {false, true}) {
2823
+ if (is_proximal_level &&
2824
+ !sub_compact->compaction->SupportsPerKeyPlacement()) {
2825
+ continue;
2826
+ }
2827
+
2828
+ Status s;
2829
+ SubcompactionProgressPerLevel& subcompaction_progress_per_level =
2830
+ is_proximal_level
2831
+ ? subcompaction_progress.proximal_output_level_progress
2832
+ : subcompaction_progress.output_level_progress;
2833
+
2834
+ const auto& output_files =
2835
+ subcompaction_progress_per_level.GetOutputFiles();
2836
+
2837
+ std::vector<std::shared_ptr<const TableProperties>>
2838
+ output_files_table_properties;
2839
+
2840
+ // TODO(hx235): investigate if we can skip reading properties to save read
2841
+ // IO
2842
+ s = ReadOutputFilesTableProperties(output_files, read_options,
2843
+ output_files_table_properties);
2844
+ if (!s.ok()) {
2845
+ ROCKS_LOG_ERROR(
2846
+ db_options_.info_log,
2847
+ "[%s] [JOB %d] Failed to read table properties for %s output level"
2848
+ "files "
2849
+ "during resume: %s.",
2850
+ cfd->GetName().c_str(), job_id_, is_proximal_level ? "proximal" : "",
2851
+ s.ToString().c_str());
2852
+ return Status::Corruption(
2853
+ "Not able to resume due to table property reading error " +
2854
+ s.ToString());
2855
+ }
2856
+
2857
+ RestoreCompactionOutputs(cfd, output_files_table_properties,
2858
+ subcompaction_progress_per_level,
2859
+ sub_compact->Outputs(is_proximal_level));
2860
+
2861
+ // Skip past all the used file numbers to avoid creating new output files
2862
+ // after resumption that conflict with the existing output files
2863
+ for (const auto& file_meta : output_files) {
2864
+ uint64_t file_number = file_meta.fd.GetNumber();
2865
+ while (versions_->NewFileNumber() <= file_number) {
2866
+ versions_->FetchAddFileNumber(1);
2867
+ }
2868
+ }
2869
+ }
2870
+
2871
+ return Status::OK();
2872
+ }
2873
+
2874
+ void CompactionJob::UpdateSubcompactionProgress(
2875
+ const CompactionIterator* c_iter, const Slice next_table_min_key,
2876
+ SubcompactionState* sub_compact) {
2877
+ assert(c_iter);
2878
+ SubcompactionProgress& subcompaction_progress =
2879
+ sub_compact->GetSubcompactionProgressRef();
2880
+
2881
+ IterKey next_ikey_to_compact;
2882
+ next_ikey_to_compact.SetInternalKey(ExtractUserKey(next_table_min_key),
2883
+ kMaxSequenceNumber, kValueTypeForSeek);
2884
+ subcompaction_progress.next_internal_key_to_compact =
2885
+ next_ikey_to_compact.GetInternalKey().ToString();
2886
+
2887
+ // Track total processed input records for progress reporting by combining:
2888
+ // - Resumed count: records already processed before compaction was
2889
+ // interrupted
2890
+ // - Current count: records scanned in the current compaction session
2891
+ // Only update when both tracking mechanisms provide accurate counts to ensure
2892
+ // reliability.
2893
+ subcompaction_progress.num_processed_input_records =
2894
+ c_iter->HasNumInputEntryScanned() &&
2895
+ sub_compact->compaction_job_stats.has_accurate_num_input_records
2896
+ ? c_iter->NumInputEntryScanned() +
2897
+ sub_compact->compaction_job_stats.num_input_records
2898
+ : 0;
2899
+
2900
+ UpdateSubcompactionProgressPerLevel(
2901
+ sub_compact, false /* is_proximal_level */, subcompaction_progress);
2902
+
2903
+ if (sub_compact->compaction->SupportsPerKeyPlacement()) {
2904
+ UpdateSubcompactionProgressPerLevel(
2905
+ sub_compact, true /* is_proximal_level */, subcompaction_progress);
2906
+ }
2907
+ }
2908
+
2909
+ void CompactionJob::UpdateSubcompactionProgressPerLevel(
2910
+ SubcompactionState* sub_compact, bool is_proximal_level,
2911
+ SubcompactionProgress& subcompaction_progress) {
2912
+ SubcompactionProgressPerLevel& subcompaction_progress_per_level =
2913
+ is_proximal_level ? subcompaction_progress.proximal_output_level_progress
2914
+ : subcompaction_progress.output_level_progress;
2915
+
2916
+ subcompaction_progress_per_level.SetNumProcessedOutputRecords(
2917
+ sub_compact->OutputStats(is_proximal_level)->num_output_records);
2918
+
2919
+ const auto& prev_output_files =
2920
+ subcompaction_progress_per_level.GetOutputFiles();
2921
+
2922
+ const auto& current_output_files =
2923
+ sub_compact->Outputs(is_proximal_level)->GetOutputs();
2924
+
2925
+ for (size_t i = prev_output_files.size(); i < current_output_files.size();
2926
+ i++) {
2927
+ subcompaction_progress_per_level.AddToOutputFiles(
2928
+ current_output_files[i].meta);
2929
+ }
2930
+ }
2931
+
2932
+ Status CompactionJob::PersistSubcompactionProgress(
2933
+ SubcompactionState* sub_compact) {
2934
+ SubcompactionProgress& subcompaction_progress =
2935
+ sub_compact->GetSubcompactionProgressRef();
2936
+
2937
+ assert(compaction_progress_writer_);
2938
+
2939
+ VersionEdit edit;
2940
+ edit.SetSubcompactionProgress(subcompaction_progress);
2941
+
2942
+ std::string record;
2943
+ if (!edit.EncodeTo(&record)) {
2944
+ ROCKS_LOG_ERROR(
2945
+ db_options_.info_log,
2946
+ "[%s] [JOB %d] Failed to encode subcompaction "
2947
+ "progress",
2948
+ compact_->compaction->column_family_data()->GetName().c_str(), job_id_);
2949
+ return Status::Corruption("Failed to encode subcompaction progress");
2950
+ }
2951
+
2952
+ WriteOptions write_options(Env::IOActivity::kCompaction);
2953
+ Status s = compaction_progress_writer_->AddRecord(write_options, record);
2954
+ IOOptions opts;
2955
+ if (s.ok()) {
2956
+ s = WritableFileWriter::PrepareIOOptions(write_options, opts);
2957
+ }
2958
+ if (s.ok()) {
2959
+ s = compaction_progress_writer_->file()->Sync(opts, db_options_.use_fsync);
2960
+ }
2961
+
2962
+ if (!s.ok()) {
2963
+ ROCKS_LOG_ERROR(
2964
+ db_options_.info_log,
2965
+ "[%s] [JOB %d] Failed to persist subcompaction "
2966
+ "progress: %s",
2967
+ compact_->compaction->column_family_data()->GetName().c_str(), job_id_,
2968
+ s.ToString().c_str());
2969
+ return s;
2970
+ }
2971
+
2972
+ subcompaction_progress.output_level_progress
2973
+ .UpdateLastPersistedOutputFilesCount();
2974
+
2975
+ subcompaction_progress.proximal_output_level_progress
2976
+ .UpdateLastPersistedOutputFilesCount();
2977
+
2978
+ return Status::OK();
2979
+ }
2980
+
2518
2981
  Status CompactionJob::VerifyInputRecordCount(
2519
2982
  uint64_t num_input_range_del) const {
2520
2983
  size_t ts_sz = compact_->compaction->column_family_data()
@@ -2585,5 +3048,4 @@ Status CompactionJob::VerifyOutputRecordCount() const {
2585
3048
  }
2586
3049
  return Status::OK();
2587
3050
  }
2588
-
2589
3051
  } // namespace ROCKSDB_NAMESPACE