@nxtedition/rocksdb 5.2.36 → 5.2.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/binding.cc +66 -91
  2. package/deps/liburing/liburing/COPYING +502 -0
  3. package/deps/liburing/liburing/COPYING.GPL +339 -0
  4. package/deps/liburing/liburing/LICENSE +7 -0
  5. package/deps/liburing/liburing/Makefile +84 -0
  6. package/deps/liburing/liburing/Makefile.quiet +11 -0
  7. package/deps/liburing/liburing/README +46 -0
  8. package/deps/liburing/liburing/configure +420 -0
  9. package/deps/liburing/liburing/debian/README.Debian +7 -0
  10. package/deps/liburing/liburing/debian/changelog +27 -0
  11. package/deps/liburing/liburing/debian/compat +1 -0
  12. package/deps/liburing/liburing/debian/control +48 -0
  13. package/deps/liburing/liburing/debian/copyright +49 -0
  14. package/deps/liburing/liburing/debian/liburing-dev.install +4 -0
  15. package/deps/liburing/liburing/debian/liburing-dev.manpages +6 -0
  16. package/deps/liburing/liburing/debian/liburing1-udeb.install +1 -0
  17. package/deps/liburing/liburing/debian/liburing1.install +1 -0
  18. package/deps/liburing/liburing/debian/liburing1.symbols +32 -0
  19. package/deps/liburing/liburing/debian/patches/series +1 -0
  20. package/deps/liburing/liburing/debian/rules +81 -0
  21. package/deps/liburing/liburing/debian/source/format +1 -0
  22. package/deps/liburing/liburing/debian/source/local-options +2 -0
  23. package/deps/liburing/liburing/debian/source/options +1 -0
  24. package/deps/liburing/liburing/debian/watch +3 -0
  25. package/deps/liburing/liburing/examples/Makefile +29 -0
  26. package/deps/liburing/liburing/examples/io_uring-cp.c +279 -0
  27. package/deps/liburing/liburing/examples/io_uring-test.c +112 -0
  28. package/deps/liburing/liburing/examples/link-cp.c +193 -0
  29. package/deps/liburing/liburing/examples/ucontext-cp.c +273 -0
  30. package/deps/liburing/liburing/liburing.pc.in +12 -0
  31. package/deps/liburing/liburing/liburing.spec +66 -0
  32. package/deps/liburing/liburing/make-debs.sh +53 -0
  33. package/deps/liburing/liburing/man/io_uring.7 +736 -0
  34. package/deps/liburing/liburing/man/io_uring_enter.2 +1403 -0
  35. package/deps/liburing/liburing/man/io_uring_get_sqe.3 +37 -0
  36. package/deps/liburing/liburing/man/io_uring_queue_exit.3 +27 -0
  37. package/deps/liburing/liburing/man/io_uring_queue_init.3 +44 -0
  38. package/deps/liburing/liburing/man/io_uring_register.2 +605 -0
  39. package/deps/liburing/liburing/man/io_uring_setup.2 +515 -0
  40. package/deps/liburing/liburing/src/Makefile +76 -0
  41. package/deps/liburing/liburing/src/include/liburing/barrier.h +73 -0
  42. package/deps/liburing/liburing/src/include/liburing/io_uring.h +422 -0
  43. package/deps/liburing/liburing/src/include/liburing.h +775 -0
  44. package/deps/liburing/liburing/src/liburing.map +46 -0
  45. package/deps/liburing/liburing/src/queue.c +403 -0
  46. package/deps/liburing/liburing/src/register.c +299 -0
  47. package/deps/liburing/liburing/src/setup.c +356 -0
  48. package/deps/liburing/liburing/src/syscall.c +73 -0
  49. package/deps/liburing/liburing/src/syscall.h +20 -0
  50. package/deps/liburing/liburing/test/232c93d07b74-test.c +305 -0
  51. package/deps/liburing/liburing/test/35fa71a030ca-test.c +329 -0
  52. package/deps/liburing/liburing/test/500f9fbadef8-test.c +89 -0
  53. package/deps/liburing/liburing/test/7ad0e4b2f83c-test.c +93 -0
  54. package/deps/liburing/liburing/test/8a9973408177-test.c +106 -0
  55. package/deps/liburing/liburing/test/917257daa0fe-test.c +53 -0
  56. package/deps/liburing/liburing/test/Makefile +312 -0
  57. package/deps/liburing/liburing/test/a0908ae19763-test.c +58 -0
  58. package/deps/liburing/liburing/test/a4c0b3decb33-test.c +180 -0
  59. package/deps/liburing/liburing/test/accept-link.c +251 -0
  60. package/deps/liburing/liburing/test/accept-reuse.c +164 -0
  61. package/deps/liburing/liburing/test/accept-test.c +79 -0
  62. package/deps/liburing/liburing/test/accept.c +476 -0
  63. package/deps/liburing/liburing/test/across-fork.c +283 -0
  64. package/deps/liburing/liburing/test/b19062a56726-test.c +53 -0
  65. package/deps/liburing/liburing/test/b5837bd5311d-test.c +77 -0
  66. package/deps/liburing/liburing/test/ce593a6c480a-test.c +135 -0
  67. package/deps/liburing/liburing/test/close-opath.c +122 -0
  68. package/deps/liburing/liburing/test/config +10 -0
  69. package/deps/liburing/liburing/test/connect.c +398 -0
  70. package/deps/liburing/liburing/test/cq-full.c +96 -0
  71. package/deps/liburing/liburing/test/cq-overflow.c +294 -0
  72. package/deps/liburing/liburing/test/cq-peek-batch.c +102 -0
  73. package/deps/liburing/liburing/test/cq-ready.c +94 -0
  74. package/deps/liburing/liburing/test/cq-size.c +58 -0
  75. package/deps/liburing/liburing/test/d4ae271dfaae-test.c +96 -0
  76. package/deps/liburing/liburing/test/d77a67ed5f27-test.c +65 -0
  77. package/deps/liburing/liburing/test/defer.c +307 -0
  78. package/deps/liburing/liburing/test/double-poll-crash.c +186 -0
  79. package/deps/liburing/liburing/test/eeed8b54e0df-test.c +114 -0
  80. package/deps/liburing/liburing/test/empty-eownerdead.c +42 -0
  81. package/deps/liburing/liburing/test/eventfd-disable.c +151 -0
  82. package/deps/liburing/liburing/test/eventfd-ring.c +97 -0
  83. package/deps/liburing/liburing/test/eventfd.c +112 -0
  84. package/deps/liburing/liburing/test/fadvise.c +202 -0
  85. package/deps/liburing/liburing/test/fallocate.c +249 -0
  86. package/deps/liburing/liburing/test/fc2a85cb02ef-test.c +138 -0
  87. package/deps/liburing/liburing/test/file-register.c +843 -0
  88. package/deps/liburing/liburing/test/file-update.c +173 -0
  89. package/deps/liburing/liburing/test/files-exit-hang-poll.c +128 -0
  90. package/deps/liburing/liburing/test/files-exit-hang-timeout.c +134 -0
  91. package/deps/liburing/liburing/test/fixed-link.c +90 -0
  92. package/deps/liburing/liburing/test/fsync.c +224 -0
  93. package/deps/liburing/liburing/test/hardlink.c +136 -0
  94. package/deps/liburing/liburing/test/helpers.c +135 -0
  95. package/deps/liburing/liburing/test/helpers.h +67 -0
  96. package/deps/liburing/liburing/test/io-cancel.c +537 -0
  97. package/deps/liburing/liburing/test/io_uring_enter.c +296 -0
  98. package/deps/liburing/liburing/test/io_uring_register.c +664 -0
  99. package/deps/liburing/liburing/test/io_uring_setup.c +192 -0
  100. package/deps/liburing/liburing/test/iopoll.c +366 -0
  101. package/deps/liburing/liburing/test/lfs-openat-write.c +117 -0
  102. package/deps/liburing/liburing/test/lfs-openat.c +273 -0
  103. package/deps/liburing/liburing/test/link-timeout.c +1107 -0
  104. package/deps/liburing/liburing/test/link.c +496 -0
  105. package/deps/liburing/liburing/test/link_drain.c +229 -0
  106. package/deps/liburing/liburing/test/madvise.c +195 -0
  107. package/deps/liburing/liburing/test/mkdir.c +108 -0
  108. package/deps/liburing/liburing/test/multicqes_drain.c +383 -0
  109. package/deps/liburing/liburing/test/nop-all-sizes.c +107 -0
  110. package/deps/liburing/liburing/test/nop.c +115 -0
  111. package/deps/liburing/liburing/test/open-close.c +146 -0
  112. package/deps/liburing/liburing/test/openat2.c +240 -0
  113. package/deps/liburing/liburing/test/personality.c +204 -0
  114. package/deps/liburing/liburing/test/pipe-eof.c +81 -0
  115. package/deps/liburing/liburing/test/pipe-reuse.c +105 -0
  116. package/deps/liburing/liburing/test/poll-cancel-ton.c +139 -0
  117. package/deps/liburing/liburing/test/poll-cancel.c +135 -0
  118. package/deps/liburing/liburing/test/poll-link.c +227 -0
  119. package/deps/liburing/liburing/test/poll-many.c +208 -0
  120. package/deps/liburing/liburing/test/poll-mshot-update.c +273 -0
  121. package/deps/liburing/liburing/test/poll-ring.c +48 -0
  122. package/deps/liburing/liburing/test/poll-v-poll.c +353 -0
  123. package/deps/liburing/liburing/test/poll.c +109 -0
  124. package/deps/liburing/liburing/test/probe.c +137 -0
  125. package/deps/liburing/liburing/test/read-write.c +876 -0
  126. package/deps/liburing/liburing/test/register-restrictions.c +633 -0
  127. package/deps/liburing/liburing/test/rename.c +134 -0
  128. package/deps/liburing/liburing/test/ring-leak.c +173 -0
  129. package/deps/liburing/liburing/test/ring-leak2.c +249 -0
  130. package/deps/liburing/liburing/test/rsrc_tags.c +449 -0
  131. package/deps/liburing/liburing/test/runtests-loop.sh +16 -0
  132. package/deps/liburing/liburing/test/runtests.sh +170 -0
  133. package/deps/liburing/liburing/test/rw_merge_test.c +97 -0
  134. package/deps/liburing/liburing/test/self.c +91 -0
  135. package/deps/liburing/liburing/test/send_recv.c +291 -0
  136. package/deps/liburing/liburing/test/send_recvmsg.c +345 -0
  137. package/deps/liburing/liburing/test/sendmsg_fs_cve.c +198 -0
  138. package/deps/liburing/liburing/test/shared-wq.c +84 -0
  139. package/deps/liburing/liburing/test/short-read.c +75 -0
  140. package/deps/liburing/liburing/test/shutdown.c +163 -0
  141. package/deps/liburing/liburing/test/sigfd-deadlock.c +74 -0
  142. package/deps/liburing/liburing/test/socket-rw-eagain.c +156 -0
  143. package/deps/liburing/liburing/test/socket-rw.c +147 -0
  144. package/deps/liburing/liburing/test/splice.c +511 -0
  145. package/deps/liburing/liburing/test/sq-full-cpp.cc +45 -0
  146. package/deps/liburing/liburing/test/sq-full.c +45 -0
  147. package/deps/liburing/liburing/test/sq-poll-dup.c +200 -0
  148. package/deps/liburing/liburing/test/sq-poll-kthread.c +168 -0
  149. package/deps/liburing/liburing/test/sq-poll-share.c +137 -0
  150. package/deps/liburing/liburing/test/sq-space_left.c +159 -0
  151. package/deps/liburing/liburing/test/sqpoll-cancel-hang.c +159 -0
  152. package/deps/liburing/liburing/test/sqpoll-disable-exit.c +195 -0
  153. package/deps/liburing/liburing/test/sqpoll-exit-hang.c +77 -0
  154. package/deps/liburing/liburing/test/sqpoll-sleep.c +68 -0
  155. package/deps/liburing/liburing/test/statx.c +172 -0
  156. package/deps/liburing/liburing/test/stdout.c +232 -0
  157. package/deps/liburing/liburing/test/submit-link-fail.c +154 -0
  158. package/deps/liburing/liburing/test/submit-reuse.c +239 -0
  159. package/deps/liburing/liburing/test/symlink.c +116 -0
  160. package/deps/liburing/liburing/test/teardowns.c +58 -0
  161. package/deps/liburing/liburing/test/thread-exit.c +131 -0
  162. package/deps/liburing/liburing/test/timeout-new.c +246 -0
  163. package/deps/liburing/liburing/test/timeout-overflow.c +204 -0
  164. package/deps/liburing/liburing/test/timeout.c +1354 -0
  165. package/deps/liburing/liburing/test/unlink.c +111 -0
  166. package/deps/liburing/liburing/test/wakeup-hang.c +162 -0
  167. package/deps/liburing/liburing.gyp +20 -0
  168. package/deps/rocksdb/rocksdb/db/corruption_test.cc +62 -0
  169. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +7 -62
  170. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +25 -11
  171. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +74 -155
  172. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -2
  173. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
  174. package/deps/rocksdb/rocksdb/env/fs_posix.cc +13 -0
  175. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +4 -2
  176. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +22 -4
  177. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +5 -0
  178. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +15 -0
  179. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  180. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  181. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -0
  182. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +3 -7
  183. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +2 -1
  184. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +44 -29
  185. package/deps/rocksdb/rocksdb.gyp +4 -3
  186. package/package.json +1 -1
  187. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  188. package/prebuilds/darwin-x64/node.napi.node +0 -0
  189. package/prebuilds/linux-x64/node.napi.node +0 -0
  190. package/prebuilds/prebuilds/linux-x64/node.napi.node +0 -0
@@ -399,7 +399,7 @@ IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
399
399
  Status DBImpl::Recover(
400
400
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
401
401
  bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
402
- uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
402
+ uint64_t* recovered_seq) {
403
403
  mutex_.AssertHeld();
404
404
 
405
405
  bool is_new_db = false;
@@ -518,10 +518,9 @@ Status DBImpl::Recover(
518
518
  if (!s.ok()) {
519
519
  return s;
520
520
  }
521
-
522
- s = SetDBId(read_only, recovery_ctx);
521
+ s = SetDBId(read_only);
523
522
  if (s.ok() && !read_only) {
524
- s = DeleteUnreferencedSstFiles(recovery_ctx);
523
+ s = DeleteUnreferencedSstFiles();
525
524
  }
526
525
 
527
526
  if (immutable_db_options_.paranoid_checks && s.ok()) {
@@ -536,6 +535,10 @@ Status DBImpl::Recover(
536
535
  }
537
536
  }
538
537
  }
538
+ // DB mutex is already held
539
+ if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
540
+ s = InitPersistStatsColumnFamily();
541
+ }
539
542
 
540
543
  std::vector<std::string> files_in_wal_dir;
541
544
  if (s.ok()) {
@@ -605,10 +608,7 @@ Status DBImpl::Recover(
605
608
  WalNumber max_wal_number =
606
609
  versions_->GetWalSet().GetWals().rbegin()->first;
607
610
  edit.DeleteWalsBefore(max_wal_number + 1);
608
- assert(recovery_ctx != nullptr);
609
- assert(versions_->GetColumnFamilySet() != nullptr);
610
- recovery_ctx->UpdateVersionEdits(
611
- versions_->GetColumnFamilySet()->GetDefault(), edit);
611
+ s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_);
612
612
  }
613
613
  if (!s.ok()) {
614
614
  return s;
@@ -644,8 +644,8 @@ Status DBImpl::Recover(
644
644
  std::sort(wals.begin(), wals.end());
645
645
 
646
646
  bool corrupted_wal_found = false;
647
- s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
648
- recovery_ctx);
647
+ s = RecoverLogFiles(wals, &next_sequence, read_only,
648
+ &corrupted_wal_found);
649
649
  if (corrupted_wal_found && recovered_seq != nullptr) {
650
650
  *recovered_seq = next_sequence;
651
651
  }
@@ -805,30 +805,10 @@ Status DBImpl::InitPersistStatsColumnFamily() {
805
805
  return s;
806
806
  }
807
807
 
808
- Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
809
- mutex_.AssertHeld();
810
- assert(versions_->descriptor_log_ == nullptr);
811
- Status s = versions_->LogAndApply(
812
- recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
813
- recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
814
- if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
815
- mutex_.Unlock();
816
- for (const auto& fname : recovery_ctx.files_to_delete_) {
817
- s = env_->DeleteFile(fname);
818
- if (!s.ok()) {
819
- break;
820
- }
821
- }
822
- mutex_.Lock();
823
- }
824
- return s;
825
- }
826
-
827
808
  // REQUIRES: wal_numbers are sorted in ascending order
828
- Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
809
+ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
829
810
  SequenceNumber* next_sequence, bool read_only,
830
- bool* corrupted_wal_found,
831
- RecoveryContext* recovery_ctx) {
811
+ bool* corrupted_wal_found) {
832
812
  struct LogReporter : public log::Reader::Reporter {
833
813
  Env* env;
834
814
  Logger* info_log;
@@ -853,7 +833,6 @@ Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
853
833
  edit.SetColumnFamily(cfd->GetID());
854
834
  version_edits.insert({cfd->GetID(), edit});
855
835
  }
856
-
857
836
  int job_id = next_job_id_.fetch_add(1);
858
837
  {
859
838
  auto stream = event_logger_.Log();
@@ -1277,7 +1256,6 @@ Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
1277
1256
  edit->SetLogNumber(max_wal_number + 1);
1278
1257
  }
1279
1258
  }
1280
-
1281
1259
  if (status.ok()) {
1282
1260
  // we must mark the next log number as used, even though it's
1283
1261
  // not actually used. that is because VersionSet assumes
@@ -1285,40 +1263,42 @@ Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
1285
1263
  // log number
1286
1264
  versions_->MarkFileNumberUsed(max_wal_number + 1);
1287
1265
 
1288
- if (corrupted_wal_found != nullptr && *corrupted_wal_found == true &&
1289
- immutable_db_options_.wal_recovery_mode ==
1290
- WALRecoveryMode::kPointInTimeRecovery) {
1291
- MoveCorruptedWalFiles(wal_numbers, corrupted_wal_number);
1292
- }
1293
-
1294
- assert(recovery_ctx != nullptr);
1266
+ autovector<ColumnFamilyData*> cfds;
1267
+ autovector<const MutableCFOptions*> cf_opts;
1268
+ autovector<autovector<VersionEdit*>> edit_lists;
1295
1269
  for (auto* cfd : *versions_->GetColumnFamilySet()) {
1270
+ cfds.push_back(cfd);
1271
+ cf_opts.push_back(cfd->GetLatestMutableCFOptions());
1296
1272
  auto iter = version_edits.find(cfd->GetID());
1297
1273
  assert(iter != version_edits.end());
1298
- recovery_ctx->UpdateVersionEdits(cfd, iter->second);
1274
+ edit_lists.push_back({&iter->second});
1299
1275
  }
1300
1276
 
1277
+ std::unique_ptr<VersionEdit> wal_deletion;
1301
1278
  if (flushed) {
1302
- VersionEdit wal_deletion;
1279
+ wal_deletion = std::make_unique<VersionEdit>();
1303
1280
  if (immutable_db_options_.track_and_verify_wals_in_manifest) {
1304
- wal_deletion.DeleteWalsBefore(max_wal_number + 1);
1281
+ wal_deletion->DeleteWalsBefore(max_wal_number + 1);
1305
1282
  }
1306
1283
  if (!allow_2pc()) {
1307
1284
  // In non-2pc mode, flushing the memtables of the column families
1308
1285
  // means we can advance min_log_number_to_keep.
1309
- wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
1286
+ wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1);
1310
1287
  }
1311
- assert(versions_->GetColumnFamilySet() != nullptr);
1312
- recovery_ctx->UpdateVersionEdits(
1313
- versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
1288
+ edit_lists.back().push_back(wal_deletion.get());
1314
1289
  }
1290
+
1291
+ // write MANIFEST with update
1292
+ status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
1293
+ directories_.GetDbDir(),
1294
+ /*new_descriptor_log=*/true);
1315
1295
  }
1316
1296
  }
1317
1297
 
1318
1298
  if (status.ok()) {
1319
1299
  if (data_seen && !flushed) {
1320
1300
  status = RestoreAliveLogFiles(wal_numbers);
1321
- } else if (!wal_numbers.empty()) {
1301
+ } else {
1322
1302
  // If there's no data in the WAL, or we flushed all the data, still
1323
1303
  // truncate the log file. If the process goes into a crash loop before
1324
1304
  // the file is deleted, the preallocated space will never get freed.
@@ -1334,48 +1314,6 @@ Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
1334
1314
  return status;
1335
1315
  }
1336
1316
 
1337
- void DBImpl::MoveCorruptedWalFiles(std::vector<uint64_t>& wal_numbers,
1338
- uint64_t corrupted_wal_number) {
1339
- size_t num_wals = wal_numbers.size();
1340
- // Find the first corrupted wal.
1341
- auto iter = std::lower_bound(wal_numbers.begin(), wal_numbers.end(),
1342
- corrupted_wal_number);
1343
- auto corrupt_start_iter = iter;
1344
-
1345
- // Increment iter to move WAL files from first corrupted_wal_number + 1.
1346
- iter++;
1347
-
1348
- std::string archival_path =
1349
- ArchivalDirectory(immutable_db_options_.GetWalDir());
1350
- Status create_status = env_->CreateDirIfMissing(archival_path);
1351
-
1352
- // create_status is only checked when it needs to move the corrupted WAL files
1353
- // to archive folder.
1354
- create_status.PermitUncheckedError();
1355
-
1356
- // Truncate the last WAL to reclaim the pre allocated space before
1357
- // moving it.
1358
- GetLogSizeAndMaybeTruncate(wal_numbers.back(), /*truncate=*/true, nullptr)
1359
- .PermitUncheckedError();
1360
-
1361
- // Move all the WAL files from corrupted_wal_number + 1 to last WAL
1362
- // (max_wal_number) to avoid column family inconsistency error to archival
1363
- // directory. If its unable to create archive dir, it will delete the
1364
- // corrupted WAL files.
1365
- // We are moving all but first corrupted WAL file to a different folder.
1366
- while (iter != wal_numbers.end()) {
1367
- LogFileNumberSize log(*iter);
1368
- std::string fname = LogFileName(immutable_db_options_.GetWalDir(), *iter);
1369
- #ifndef ROCKSDB_LITE
1370
- if (create_status.ok()) {
1371
- wal_manager_.ArchiveWALFile(fname, *iter);
1372
- }
1373
- #endif
1374
- iter++;
1375
- }
1376
- wal_numbers.erase(corrupt_start_iter + 1, wal_numbers.begin() + num_wals);
1377
- }
1378
-
1379
1317
  Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
1380
1318
  LogFileNumberSize* log_ptr) {
1381
1319
  LogFileNumberSize log(wal_number);
@@ -1438,8 +1376,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
1438
1376
  // log has such preallocated space, so we only truncate for the last log.
1439
1377
  LogFileNumberSize log;
1440
1378
  s = GetLogSizeAndMaybeTruncate(
1441
- wal_number,
1442
- /*truncate=*/(wal_number == wal_numbers.back()), &log);
1379
+ wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
1443
1380
  if (!s.ok()) {
1444
1381
  break;
1445
1382
  }
@@ -1800,13 +1737,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1800
1737
  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
1801
1738
 
1802
1739
  impl->mutex_.Lock();
1803
-
1804
- RecoveryContext recovery_ctx;
1805
-
1806
1740
  // Handles create_if_missing, error_if_exists
1807
1741
  uint64_t recovered_seq(kMaxSequenceNumber);
1808
- s = impl->Recover(column_families, false, false, false, &recovered_seq,
1809
- &recovery_ctx);
1742
+ s = impl->Recover(column_families, false, false, false, &recovered_seq);
1810
1743
  if (s.ok()) {
1811
1744
  uint64_t new_log_number = impl->versions_->NewFileNumber();
1812
1745
  log::Writer* new_log = nullptr;
@@ -1823,6 +1756,40 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1823
1756
  }
1824
1757
 
1825
1758
  if (s.ok()) {
1759
+ // set column family handles
1760
+ for (auto cf : column_families) {
1761
+ auto cfd =
1762
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
1763
+ if (cfd != nullptr) {
1764
+ handles->push_back(
1765
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
1766
+ impl->NewThreadStatusCfInfo(cfd);
1767
+ } else {
1768
+ if (db_options.create_missing_column_families) {
1769
+ // missing column family, create it
1770
+ ColumnFamilyHandle* handle;
1771
+ impl->mutex_.Unlock();
1772
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
1773
+ impl->mutex_.Lock();
1774
+ if (s.ok()) {
1775
+ handles->push_back(handle);
1776
+ } else {
1777
+ break;
1778
+ }
1779
+ } else {
1780
+ s = Status::InvalidArgument("Column family not found", cf.name);
1781
+ break;
1782
+ }
1783
+ }
1784
+ }
1785
+ }
1786
+ if (s.ok()) {
1787
+ SuperVersionContext sv_context(/* create_superversion */ true);
1788
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
1789
+ impl->InstallSuperVersionAndScheduleWork(
1790
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
1791
+ }
1792
+ sv_context.Clean();
1826
1793
  if (impl->two_write_queues_) {
1827
1794
  impl->log_write_mutex_.Lock();
1828
1795
  }
@@ -1835,15 +1802,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1835
1802
  }
1836
1803
  if (s.ok()) {
1837
1804
  // In WritePrepared there could be gap in sequence numbers. This breaks
1838
- // the trick we use in kPointInTimeRecovery which assumes the first seq
1839
- // in the log right after the corrupted log is one larger than the last
1840
- // seq we read from the wals. To let this trick keep working, we add a
1841
- // dummy entry with the expected sequence to the first log right after
1842
- // recovery. In non-WritePrepared case also the new log after recovery
1843
- // could be empty, and thus missing the consecutive seq hint to
1844
- // distinguish middle-log corruption to
1845
- // corrupted-log-remained-after-recovery. This case also will be
1846
- // addressed by a dummy write.
1805
+ // the trick we use in kPointInTimeRecovery which assumes the first seq in
1806
+ // the log right after the corrupted log is one larger than the last seq
1807
+ // we read from the wals. To let this trick keep working, we add a dummy
1808
+ // entry with the expected sequence to the first log right after recovery.
1809
+ // In non-WritePrepared case also the new log after recovery could be
1810
+ // empty, and thus missing the consecutive seq hint to distinguish
1811
+ // middle-log corruption to corrupted-log-remained-after-recovery. This
1812
+ // case also will be addressed by a dummy write.
1847
1813
  if (recovered_seq != kMaxSequenceNumber) {
1848
1814
  WriteBatch empty_batch;
1849
1815
  WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
@@ -1862,52 +1828,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1862
1828
  }
1863
1829
  }
1864
1830
  }
1865
- if (s.ok()) {
1866
- s = impl->LogAndApplyForRecovery(recovery_ctx);
1867
- }
1868
-
1869
- if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
1870
- impl->mutex_.AssertHeld();
1871
- s = impl->InitPersistStatsColumnFamily();
1872
- }
1873
-
1874
- if (s.ok()) {
1875
- // set column family handles
1876
- for (auto cf : column_families) {
1877
- auto cfd =
1878
- impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
1879
- if (cfd != nullptr) {
1880
- handles->push_back(
1881
- new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
1882
- impl->NewThreadStatusCfInfo(cfd);
1883
- } else {
1884
- if (db_options.create_missing_column_families) {
1885
- // missing column family, create it
1886
- ColumnFamilyHandle* handle;
1887
- impl->mutex_.Unlock();
1888
- s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
1889
- impl->mutex_.Lock();
1890
- if (s.ok()) {
1891
- handles->push_back(handle);
1892
- } else {
1893
- break;
1894
- }
1895
- } else {
1896
- s = Status::InvalidArgument("Column family not found", cf.name);
1897
- break;
1898
- }
1899
- }
1900
- }
1901
- }
1902
-
1903
- if (s.ok()) {
1904
- SuperVersionContext sv_context(/* create_superversion */ true);
1905
- for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
1906
- impl->InstallSuperVersionAndScheduleWork(
1907
- cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
1908
- }
1909
- sv_context.Clean();
1910
- }
1911
1831
  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
1912
1832
  // try to read format version
1913
1833
  s = impl->PersistentStatsProcessFormatVersion();
@@ -1933,8 +1853,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1933
1853
  if (cfd->ioptions()->merge_operator != nullptr &&
1934
1854
  !cfd->mem()->IsMergeOperatorSupported()) {
1935
1855
  s = Status::InvalidArgument(
1936
- "The memtable of column family %s does not support merge "
1937
- "operator "
1856
+ "The memtable of column family %s does not support merge operator "
1938
1857
  "its options.merge_operator is non-null",
1939
1858
  cfd->GetName().c_str());
1940
1859
  }
@@ -33,8 +33,7 @@ DBImplSecondary::~DBImplSecondary() {}
33
33
  Status DBImplSecondary::Recover(
34
34
  const std::vector<ColumnFamilyDescriptor>& column_families,
35
35
  bool /*readonly*/, bool /*error_if_wal_file_exists*/,
36
- bool /*error_if_data_exists_in_wals*/, uint64_t*,
37
- RecoveryContext* /*recovery_ctx*/) {
36
+ bool /*error_if_data_exists_in_wals*/, uint64_t*) {
38
37
  mutex_.AssertHeld();
39
38
 
40
39
  JobContext job_context(0);
@@ -81,8 +81,8 @@ class DBImplSecondary : public DBImpl {
81
81
  // and log_readers_ to facilitate future operations.
82
82
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
83
83
  bool read_only, bool error_if_wal_file_exists,
84
- bool error_if_data_exists_in_wals, uint64_t* = nullptr,
85
- RecoveryContext* recovery_ctx = nullptr) override;
84
+ bool error_if_data_exists_in_wals,
85
+ uint64_t* = nullptr) override;
86
86
 
87
87
  // Implementations of the DB interface
88
88
  using DB::Get;
@@ -1118,6 +1118,19 @@ class PosixFileSystem : public FileSystem {
1118
1118
  #endif
1119
1119
  }
1120
1120
 
1121
+ // TODO akanksha: Look into flags and see how to provide support for AbortIO
1122
+ // in posix for IOUring requests. Currently it calls Poll to wait for requests
1123
+ // to complete the request.
1124
+ virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
1125
+ IOStatus s = Poll(io_handles, io_handles.size());
1126
+ // If Poll is not supported then it didn't submit any request and it should
1127
+ // return OK.
1128
+ if (s.IsNotSupported()) {
1129
+ return IOStatus::OK();
1130
+ }
1131
+ return s;
1132
+ }
1133
+
1121
1134
  #if defined(ROCKSDB_IOURING_PRESENT)
1122
1135
  // io_uring instance
1123
1136
  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
@@ -229,6 +229,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
229
229
  // second buffer.
230
230
  std::vector<void*> handles;
231
231
  handles.emplace_back(io_handle_);
232
+ StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
232
233
  fs_->Poll(handles, 1).PermitUncheckedError();
233
234
  }
234
235
 
@@ -281,7 +282,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
281
282
  bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
282
283
  offset += length;
283
284
  length = 0;
284
- prefetch_size -= length;
285
+ prefetch_size = readahead_size;
285
286
  }
286
287
  // Data is overlapping i.e. some of the data is in curr_ buffer and remaining
287
288
  // in second buffer.
@@ -310,7 +311,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
310
311
  // sync prefetching and copy the remaining data to third buffer in the end.
311
312
  // swap the buffers.
312
313
  curr_ = curr_ ^ 1;
313
- prefetch_size -= length;
314
+ // Update prefetch_size as length has been updated in CopyDataToBuffer.
315
+ prefetch_size = length + readahead_size;
314
316
  }
315
317
 
316
318
  // Update second again if swap happened.
@@ -14,6 +14,7 @@
14
14
  #include <string>
15
15
 
16
16
  #include "file/readahead_file_info.h"
17
+ #include "monitoring/statistics.h"
17
18
  #include "port/port.h"
18
19
  #include "rocksdb/env.h"
19
20
  #include "rocksdb/file_system.h"
@@ -64,7 +65,8 @@ class FilePrefetchBuffer {
64
65
  FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
65
66
  bool enable = true, bool track_min_offset = false,
66
67
  bool implicit_auto_readahead = false,
67
- bool async_io = false, FileSystem* fs = nullptr)
68
+ bool async_io = false, FileSystem* fs = nullptr,
69
+ SystemClock* clock = nullptr, Statistics* stats = nullptr)
68
70
  : curr_(0),
69
71
  readahead_size_(readahead_size),
70
72
  initial_auto_readahead_size_(readahead_size),
@@ -80,7 +82,9 @@ class FilePrefetchBuffer {
80
82
  del_fn_(nullptr),
81
83
  async_read_in_progress_(false),
82
84
  async_io_(async_io),
83
- fs_(fs) {
85
+ fs_(fs),
86
+ clock_(clock),
87
+ stats_(stats) {
84
88
  // If async_io_ is enabled, data is asynchronously filled in second buffer
85
89
  // while curr_ is being consumed. If data is overlapping in two buffers,
86
90
  // data is copied to third buffer to return continuous buffer.
@@ -88,12 +92,24 @@ class FilePrefetchBuffer {
88
92
  }
89
93
 
90
94
  ~FilePrefetchBuffer() {
91
- // Wait for any pending async job before destroying the class object.
95
+ // Abort any pending async read request before destroying the class object.
92
96
  if (async_read_in_progress_ && fs_ != nullptr) {
93
97
  std::vector<void*> handles;
94
98
  handles.emplace_back(io_handle_);
95
- fs_->Poll(handles, 1).PermitUncheckedError();
99
+ Status s = fs_->AbortIO(handles);
100
+ assert(s.ok());
96
101
  }
102
+
103
+ // Prefetch buffer bytes discarded.
104
+ uint64_t bytes_discarded = 0;
105
+ if (bufs_[curr_].buffer_.CurrentSize() != 0) {
106
+ bytes_discarded = bufs_[curr_].buffer_.CurrentSize();
107
+ }
108
+ if (bufs_[curr_ ^ 1].buffer_.CurrentSize() != 0) {
109
+ bytes_discarded += bufs_[curr_ ^ 1].buffer_.CurrentSize();
110
+ }
111
+ RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
112
+
97
113
  // Release io_handle_.
98
114
  if (io_handle_ != nullptr && del_fn_ != nullptr) {
99
115
  del_fn_(io_handle_);
@@ -272,5 +288,7 @@ class FilePrefetchBuffer {
272
288
  bool async_read_in_progress_;
273
289
  bool async_io_;
274
290
  FileSystem* fs_;
291
+ SystemClock* clock_;
292
+ Statistics* stats_;
275
293
  };
276
294
  } // namespace ROCKSDB_NAMESPACE
@@ -1288,6 +1288,10 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
1288
1288
  {
1289
1289
  HistogramData async_read_bytes;
1290
1290
  options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1291
+ HistogramData prefetched_bytes_discarded;
1292
+ options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
1293
+ &prefetched_bytes_discarded);
1294
+
1291
1295
  // Not all platforms support iouring. In that case, ReadAsync in posix
1292
1296
  // won't submit async requests.
1293
1297
  if (read_async_called) {
@@ -1295,6 +1299,7 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
1295
1299
  } else {
1296
1300
  ASSERT_EQ(async_read_bytes.count, 0);
1297
1301
  }
1302
+ ASSERT_GT(prefetched_bytes_discarded.count, 0);
1298
1303
  }
1299
1304
  }
1300
1305
 
@@ -668,6 +668,17 @@ class FileSystem : public Customizable {
668
668
  return IOStatus::OK();
669
669
  }
670
670
 
671
+ // EXPERIMENTAL
672
+ // Abort the read IO requests submitted asynchronously. Underlying FS is
673
+ // required to support AbortIO API. AbortIO implementation should ensure that
674
+ // all the read requests related to io_handles should be aborted and
675
+ // it shouldn't call the callback for these io_handles.
676
+ //
677
+ // Default implementation is to return IOStatus::OK.
678
+ virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
679
+ return IOStatus::OK();
680
+ }
681
+
671
682
  // If you're adding methods here, remember to add them to EnvWrapper too.
672
683
 
673
684
  private:
@@ -1500,6 +1511,10 @@ class FileSystemWrapper : public FileSystem {
1500
1511
  return target_->Poll(io_handles, min_completions);
1501
1512
  }
1502
1513
 
1514
+ virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
1515
+ return target_->AbortIO(io_handles);
1516
+ }
1517
+
1503
1518
  protected:
1504
1519
  std::shared_ptr<FileSystem> target_;
1505
1520
  };
@@ -536,7 +536,12 @@ enum Histograms : uint32_t {
536
536
  // Error handler statistics
537
537
  ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
538
538
 
539
+ // Stats related to asynchronous read requests.
539
540
  ASYNC_READ_BYTES,
541
+ POLL_WAIT_MICROS,
542
+
543
+ // Number of prefetched bytes discarded by RocksDB.
544
+ PREFETCHED_BYTES_DISCARDED,
540
545
 
541
546
  HISTOGRAM_ENUM_MAX,
542
547
  };
@@ -13,7 +13,7 @@
13
13
  // minor or major version number planned for release.
14
14
  #define ROCKSDB_MAJOR 7
15
15
  #define ROCKSDB_MINOR 2
16
- #define ROCKSDB_PATCH 0
16
+ #define ROCKSDB_PATCH 2
17
17
 
18
18
  // Do not use these. We made the mistake of declaring macros starting with
19
19
  // double underscore. Now we have to live with our choice. We'll deprecate these
@@ -284,6 +284,9 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
284
284
  {ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
285
285
  "rocksdb.error.handler.autoresume.retry.count"},
286
286
  {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
287
+ {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
288
+ {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
289
+
287
290
  };
288
291
 
289
292
  std::shared_ptr<Statistics> CreateDBStatistics() {
@@ -604,14 +604,10 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) {
604
604
  dbfull()->TEST_WaitForStatsDumpRun(
605
605
  [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
606
606
  // writing to all three cf, flush default cf
607
- // LogNumbers: default: 16, stats: 10, pikachu: 5
608
- // Since in recovery process, cfd_stats column is created after WAL is
609
- // created, synced and MANIFEST is persisted, its log number which depends on
610
- // logfile_number_ will be different. Since "pikachu" is never flushed, thus
611
- // its log_number should be the smallest of the three.
607
+ // LogNumbers: default: 14, stats: 4, pikachu: 4
612
608
  ASSERT_OK(Flush());
613
- ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber());
614
- ASSERT_LT(cfd_test->GetLogNumber(), cfd_default->GetLogNumber());
609
+ ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber());
610
+ ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber());
615
611
 
616
612
  ASSERT_OK(Put("foo1", "v1"));
617
613
  ASSERT_OK(Put("bar1", "v1"));
@@ -666,7 +666,8 @@ struct BlockBasedTable::Rep {
666
666
  fpb->reset(new FilePrefetchBuffer(
667
667
  readahead_size, max_readahead_size,
668
668
  !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
669
- implicit_auto_readahead, async_io, ioptions.fs.get()));
669
+ implicit_auto_readahead, async_io, ioptions.fs.get(), ioptions.clock,
670
+ ioptions.stats));
670
671
  }
671
672
 
672
673
  void CreateFilePrefetchBufferIfNotExists(