@nxtedition/rocksdb 11.0.2 → 11.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/binding.cc +133 -122
  2. package/deps/rocksdb/rocksdb/db/column_family_test.cc +15 -7
  3. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  4. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -4
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +11 -7
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +17 -11
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +15 -0
  8. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +155 -0
  9. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +564 -461
  10. package/deps/rocksdb/rocksdb/db/db_follower_test.cc +8 -4
  11. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +40 -24
  12. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +8 -1
  13. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +7 -4
  14. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  15. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -1
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +19 -1
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +20 -16
  18. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +27 -0
  19. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +10 -2
  20. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +85 -0
  21. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +55 -2
  22. package/deps/rocksdb/rocksdb/db/db_test2.cc +231 -0
  23. package/deps/rocksdb/rocksdb/db/db_test_util.cc +5 -0
  24. package/deps/rocksdb/rocksdb/db/db_test_util.h +10 -1
  25. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +0 -1
  26. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +175 -1
  27. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +64 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +5 -6
  29. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +8 -8
  30. package/deps/rocksdb/rocksdb/db/experimental.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +2 -4
  32. package/deps/rocksdb/rocksdb/db/flush_job.cc +7 -2
  33. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +4 -2
  34. package/deps/rocksdb/rocksdb/db/listener_test.cc +5 -5
  35. package/deps/rocksdb/rocksdb/db/log_writer.cc +12 -3
  36. package/deps/rocksdb/rocksdb/db/memtable.cc +83 -23
  37. package/deps/rocksdb/rocksdb/db/memtable.h +11 -3
  38. package/deps/rocksdb/rocksdb/db/memtable_list.cc +7 -5
  39. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +21 -0
  40. package/deps/rocksdb/rocksdb/db/version_builder.cc +462 -33
  41. package/deps/rocksdb/rocksdb/db/version_builder.h +70 -23
  42. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +95 -207
  43. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +54 -35
  44. package/deps/rocksdb/rocksdb/db/version_set.cc +13 -11
  45. package/deps/rocksdb/rocksdb/db/version_set_test.cc +313 -59
  46. package/deps/rocksdb/rocksdb/db/write_batch.cc +124 -64
  47. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -3
  48. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +1 -1
  49. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +4 -1
  50. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  51. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +4 -32
  52. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -3
  53. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +60 -172
  54. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +57 -2
  55. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +23 -15
  56. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +2 -3
  57. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +1 -1
  58. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +4 -1
  59. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +200 -92
  60. package/deps/rocksdb/rocksdb/env/file_system.cc +3 -3
  61. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +124 -23
  62. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +61 -8
  63. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +141 -2
  64. package/deps/rocksdb/rocksdb/file/file_util.cc +17 -2
  65. package/deps/rocksdb/rocksdb/file/file_util.h +10 -0
  66. package/deps/rocksdb/rocksdb/file/filename.cc +11 -3
  67. package/deps/rocksdb/rocksdb/file/filename.h +2 -1
  68. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +18 -0
  69. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +27 -4
  70. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +8 -1
  71. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +8 -13
  72. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -0
  73. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -2
  75. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +2 -1
  76. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +34 -0
  77. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +27 -9
  80. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +2 -0
  81. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +12 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  83. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  84. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +29 -1
  85. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +102 -33
  86. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +46 -3
  87. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +4 -0
  88. package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -0
  89. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  90. package/deps/rocksdb/rocksdb/options/db_options.cc +15 -1
  91. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  92. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -0
  93. package/deps/rocksdb/rocksdb/options/options_parser.cc +3 -2
  94. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -2
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +75 -35
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -0
  97. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +4 -0
  98. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +8 -1
  99. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +40 -15
  100. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +98 -17
  101. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +14 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +21 -91
  103. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +13 -21
  104. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +14 -5
  105. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +62 -53
  106. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +60 -38
  107. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +175 -78
  108. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +65 -36
  109. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +25 -15
  110. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +13 -1
  111. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +18 -4
  112. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  113. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -0
  114. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +2 -2
  115. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +47 -18
  116. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +1 -2
  117. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +95 -0
  118. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +26 -15
  119. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +62 -19
  120. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +73 -34
  121. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -0
  122. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +10 -3
  123. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +2 -1
  124. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +8 -5
  125. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +7 -4
  126. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +225 -0
  127. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +2 -1
  128. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +17 -0
  129. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +5 -2
  130. package/index.js +5 -17
  131. package/iterator.js +9 -1
  132. package/package.json +1 -1
  133. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  134. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -73,6 +73,8 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
73
73
  deadlock_detect_ = txn_options.deadlock_detect;
74
74
  deadlock_detect_depth_ = txn_options.deadlock_detect_depth;
75
75
  write_batch_.SetMaxBytes(txn_options.max_write_batch_size);
76
+ write_batch_.GetWriteBatch()->SetTrackTimestampSize(
77
+ txn_options.write_batch_track_timestamp_size);
76
78
  skip_concurrency_control_ = txn_options.skip_concurrency_control;
77
79
 
78
80
  lock_timeout_ = txn_options.lock_timeout * 1000;
@@ -189,12 +191,9 @@ inline Status WriteCommittedTxn::GetForUpdateImpl(
189
191
  }
190
192
  }
191
193
 
192
- if (!do_validate && kMaxTxnTimestamp != read_timestamp_) {
193
- return Status::InvalidArgument(
194
- "If do_validate is false then GetForUpdate with read_timestamp is not "
195
- "defined.");
196
- } else if (do_validate && kMaxTxnTimestamp == read_timestamp_) {
197
- return Status::InvalidArgument("read_timestamp must be set for validation");
194
+ Status s = SanityCheckReadTimestamp(do_validate);
195
+ if (!s.ok()) {
196
+ return s;
198
197
  }
199
198
 
200
199
  if (!read_options.timestamp) {
@@ -237,17 +236,9 @@ Status WriteCommittedTxn::GetEntityForUpdate(const ReadOptions& read_options,
237
236
  }
238
237
 
239
238
  assert(ts_sz > 0);
240
-
241
- if (!do_validate) {
242
- if (read_timestamp_ != kMaxTxnTimestamp) {
243
- return Status::InvalidArgument(
244
- "Read timestamp must not be set if validation is disabled");
245
- }
246
- } else {
247
- if (read_timestamp_ == kMaxTxnTimestamp) {
248
- return Status::InvalidArgument(
249
- "Read timestamp must be set for validation");
250
- }
239
+ Status s = SanityCheckReadTimestamp(do_validate);
240
+ if (!s.ok()) {
241
+ return s;
251
242
  }
252
243
 
253
244
  std::string ts_buf;
@@ -271,6 +262,33 @@ Status WriteCommittedTxn::GetEntityForUpdate(const ReadOptions& read_options,
271
262
  read_options, column_family, key, columns, exclusive, do_validate);
272
263
  }
273
264
 
265
+ Status WriteCommittedTxn::SanityCheckReadTimestamp(bool do_validate) {
266
+ bool enable_udt_validation =
267
+ txn_db_impl_->GetTxnDBOptions().enable_udt_validation;
268
+ if (!enable_udt_validation) {
269
+ if (kMaxTxnTimestamp != read_timestamp_) {
270
+ return Status::InvalidArgument(
271
+ "read_timestamp is set but timestamp validation is disabled for the "
272
+ "DB");
273
+ }
274
+ } else {
275
+ if (!do_validate) {
276
+ if (kMaxTxnTimestamp != read_timestamp_) {
277
+ return Status::InvalidArgument(
278
+ "If do_validate is false then GetForUpdate with read_timestamp is "
279
+ "not "
280
+ "defined.");
281
+ }
282
+ } else {
283
+ if (kMaxTxnTimestamp == read_timestamp_) {
284
+ return Status::InvalidArgument(
285
+ "read_timestamp must be set for validation");
286
+ }
287
+ }
288
+ }
289
+ return Status::OK();
290
+ }
291
+
274
292
  Status WriteCommittedTxn::PutEntityImpl(ColumnFamilyHandle* column_family,
275
293
  const Slice& key,
276
294
  const WideColumns& columns,
@@ -496,7 +514,8 @@ Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) {
496
514
  }
497
515
 
498
516
  Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) {
499
- if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
517
+ if (txn_db_impl_->GetTxnDBOptions().enable_udt_validation &&
518
+ read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
500
519
  return Status::InvalidArgument(
501
520
  "Cannot commit at timestamp smaller than or equal to read timestamp");
502
521
  }
@@ -746,8 +765,16 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
746
765
  EncodeFixed64(commit_ts_buf, commit_timestamp_);
747
766
  Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
748
767
 
749
- Status s =
750
- wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
768
+ Status s = wb->UpdateTimestamps(
769
+ commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t {
770
+ // First search through timestamp info kept inside the WriteBatch
771
+ // in case some writes bypassed the Transaction's write APIs.
772
+ auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize();
773
+ auto iter = cf_id_to_ts_sz.find(cf);
774
+ if (iter != cf_id_to_ts_sz.end()) {
775
+ size_t ts_sz = iter->second;
776
+ return ts_sz;
777
+ }
751
778
  auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf);
752
779
  if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) {
753
780
  return sizeof(kMaxTxnTimestamp);
@@ -823,16 +850,24 @@ Status WriteCommittedTxn::CommitInternal() {
823
850
  s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
824
851
  commit_ts);
825
852
  if (s.ok()) {
826
- s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
827
- if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
828
- cfs_with_ts_tracked_when_indexing_disabled_.end()) {
829
- return sizeof(kMaxTxnTimestamp);
830
- }
831
- const Comparator* ucmp =
832
- WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
833
- return ucmp ? ucmp->timestamp_size()
834
- : std::numeric_limits<size_t>::max();
835
- });
853
+ s = wb->UpdateTimestamps(
854
+ commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t {
855
+ // First search through timestamp info kept inside the WriteBatch
856
+ // in case some writes bypassed the Transaction's write APIs.
857
+ auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize();
858
+ auto iter = cf_id_to_ts_sz.find(cf);
859
+ if (iter != cf_id_to_ts_sz.end()) {
860
+ return iter->second;
861
+ }
862
+ if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
863
+ cfs_with_ts_tracked_when_indexing_disabled_.end()) {
864
+ return sizeof(kMaxTxnTimestamp);
865
+ }
866
+ const Comparator* ucmp =
867
+ WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
868
+ return ucmp ? ucmp->timestamp_size()
869
+ : std::numeric_limits<size_t>::max();
870
+ });
836
871
  }
837
872
  }
838
873
 
@@ -1207,7 +1242,10 @@ Status PessimisticTransaction::ValidateSnapshot(
1207
1242
 
1208
1243
  return TransactionUtil::CheckKeyForConflicts(
1209
1244
  db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ? nullptr : &ts_buf,
1210
- false /* cache_only */);
1245
+ false /* cache_only */,
1246
+ /* snap_checker */ nullptr,
1247
+ /* min_uncommitted */ kMaxSequenceNumber,
1248
+ txn_db_impl_->GetTxnDBOptions().enable_udt_validation);
1211
1249
  }
1212
1250
 
1213
1251
  bool PessimisticTransaction::TryStealingLocks() {
@@ -1227,14 +1265,15 @@ Status PessimisticTransaction::SetName(const TransactionName& name) {
1227
1265
  if (txn_state_ == STARTED) {
1228
1266
  if (name_.length()) {
1229
1267
  s = Status::InvalidArgument("Transaction has already been named.");
1230
- } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) {
1231
- s = Status::InvalidArgument("Transaction name must be unique.");
1232
1268
  } else if (name.length() < 1 || name.length() > 512) {
1233
1269
  s = Status::InvalidArgument(
1234
1270
  "Transaction name length must be between 1 and 512 chars.");
1235
1271
  } else {
1236
1272
  name_ = name;
1237
- txn_db_impl_->RegisterTransaction(this);
1273
+ s = txn_db_impl_->RegisterTransaction(this);
1274
+ if (!s.ok()) {
1275
+ name_.clear();
1276
+ }
1238
1277
  }
1239
1278
  } else {
1240
1279
  s = Status::InvalidArgument("Transaction is beyond state for naming.");
@@ -330,6 +330,11 @@ class WriteCommittedTxn : public PessimisticTransaction {
330
330
 
331
331
  Status RollbackInternal() override;
332
332
 
333
+ // Checks if the combination of `do_validate`, the read timestamp set in
334
+ // `read_timestamp_` and the `enable_udt_validation` flag in
335
+ // TransactionDBOptions makes sense together.
336
+ Status SanityCheckReadTimestamp(bool do_validate);
337
+
333
338
  // Column families that enable timestamps and whose data are written when
334
339
  // indexing_enabled_ is false. If a key is written when indexing_enabled_ is
335
340
  // true, then the corresponding column family is not added to cfs_with_ts
@@ -723,6 +723,11 @@ void PessimisticTransactionDB::ReinitializeTransaction(
723
723
  Transaction* PessimisticTransactionDB::GetTransactionByName(
724
724
  const TransactionName& name) {
725
725
  std::lock_guard<std::mutex> lock(name_map_mutex_);
726
+ return GetTransactionByNameLocked(name);
727
+ }
728
+
729
+ Transaction* PessimisticTransactionDB::GetTransactionByNameLocked(
730
+ const TransactionName& name) {
726
731
  auto it = transactions_.find(name);
727
732
  if (it == transactions_.end()) {
728
733
  return nullptr;
@@ -755,13 +760,15 @@ void PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) {
755
760
  lock_manager_->Resize(target_size);
756
761
  }
757
762
 
758
- void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) {
763
+ Status PessimisticTransactionDB::RegisterTransaction(Transaction* txn) {
759
764
  assert(txn);
760
765
  assert(txn->GetName().length() > 0);
761
- assert(GetTransactionByName(txn->GetName()) == nullptr);
762
766
  assert(txn->GetState() == Transaction::STARTED);
763
767
  std::lock_guard<std::mutex> lock(name_map_mutex_);
764
- transactions_[txn->GetName()] = txn;
768
+ if (!transactions_.insert({txn->GetName(), txn}).second) {
769
+ return Status::InvalidArgument("Duplicate txn name " + txn->GetName());
770
+ }
771
+ return Status::OK();
765
772
  }
766
773
 
767
774
  void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) {
@@ -173,7 +173,7 @@ class PessimisticTransactionDB : public TransactionDB {
173
173
 
174
174
  Transaction* GetTransactionByName(const TransactionName& name) override;
175
175
 
176
- void RegisterTransaction(Transaction* txn);
176
+ Status RegisterTransaction(Transaction* txn);
177
177
  void UnregisterTransaction(Transaction* txn);
178
178
 
179
179
  // not thread safe. current use case is during recovery (single thread)
@@ -239,6 +239,7 @@ class PessimisticTransactionDB : public TransactionDB {
239
239
  friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test;
240
240
 
241
241
  Transaction* BeginInternalTransaction(const WriteOptions& options);
242
+ Transaction* GetTransactionByNameLocked(const TransactionName& name);
242
243
 
243
244
  std::shared_ptr<LockManager> lock_manager_;
244
245
 
@@ -21,7 +21,8 @@ namespace ROCKSDB_NAMESPACE {
21
21
  Status TransactionUtil::CheckKeyForConflicts(
22
22
  DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
23
23
  SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
24
- ReadCallback* snap_checker, SequenceNumber min_uncommitted) {
24
+ ReadCallback* snap_checker, SequenceNumber min_uncommitted,
25
+ bool enable_udt_validation) {
25
26
  Status result;
26
27
 
27
28
  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
@@ -37,8 +38,9 @@ Status TransactionUtil::CheckKeyForConflicts(
37
38
  SequenceNumber earliest_seq =
38
39
  db_impl->GetEarliestMemTableSequenceNumber(sv, true);
39
40
 
40
- result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts,
41
- cache_only, snap_checker, min_uncommitted);
41
+ result =
42
+ CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts, cache_only,
43
+ snap_checker, min_uncommitted, enable_udt_validation);
42
44
 
43
45
  db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
44
46
  }
@@ -52,7 +54,8 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
52
54
  const std::string& key,
53
55
  const std::string* const read_ts,
54
56
  bool cache_only, ReadCallback* snap_checker,
55
- SequenceNumber min_uncommitted) {
57
+ SequenceNumber min_uncommitted,
58
+ bool enable_udt_validation) {
56
59
  // When `min_uncommitted` is provided, keys are not always committed
57
60
  // in sequence number order, and `snap_checker` is used to check whether
58
61
  // specific sequence number is in the database is visible to the transaction.
@@ -130,7 +133,7 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
130
133
  ? snap_seq < seq
131
134
  : !snap_checker->IsVisible(seq);
132
135
  // Perform conflict checking based on timestamp if applicable.
133
- if (!write_conflict && read_ts != nullptr) {
136
+ if (enable_udt_validation && !write_conflict && read_ts != nullptr) {
134
137
  ColumnFamilyData* cfd = sv->cfd;
135
138
  assert(cfd);
136
139
  const Comparator* const ucmp = cfd->user_comparator();
@@ -43,7 +43,8 @@ class TransactionUtil {
43
43
  const std::string& key, SequenceNumber snap_seq,
44
44
  const std::string* const ts, bool cache_only,
45
45
  ReadCallback* snap_checker = nullptr,
46
- SequenceNumber min_uncommitted = kMaxSequenceNumber);
46
+ SequenceNumber min_uncommitted = kMaxSequenceNumber,
47
+ bool enable_udt_validation = true);
47
48
 
48
49
  // For each key,SequenceNumber pair tracked by the LockTracker, this function
49
50
  // will verify there have been no writes to the key in the db since that
@@ -70,13 +71,15 @@ class TransactionUtil {
70
71
  // seq > `snap_seq`: applicable to conflict
71
72
  // `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine.
72
73
  //
73
- // If user-defined timestamp is enabled, a write conflict is detected if an
74
- // operation for `key` with timestamp greater than `ts` exists.
74
+ // If user-defined timestamp is enabled and `enable_udt_validation` is set to
75
+ // true, a write conflict is detected if an operation for `key` with timestamp
76
+ // greater than `ts` exists.
75
77
  static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
76
78
  SequenceNumber earliest_seq, SequenceNumber snap_seq,
77
79
  const std::string& key, const std::string* const ts,
78
80
  bool cache_only, ReadCallback* snap_checker = nullptr,
79
- SequenceNumber min_uncommitted = kMaxSequenceNumber);
81
+ SequenceNumber min_uncommitted = kMaxSequenceNumber,
82
+ bool enable_udt_validation = true);
80
83
  };
81
84
 
82
85
  } // namespace ROCKSDB_NAMESPACE
@@ -130,6 +130,128 @@ void CheckKeyValueTsWithIterator(
130
130
  }
131
131
  }
132
132
 
133
+ // This is an incorrect usage of this API; support for it should be removed
134
+ // after MyRocks removes this pattern in a refactor.
135
+ TEST_P(WriteCommittedTxnWithTsTest, WritesBypassTransactionAPIs) {
136
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
137
+ ASSERT_OK(ReOpen());
138
+
139
+ const std::string test_cf_name = "test_cf";
140
+ ColumnFamilyOptions cf_options;
141
+ ColumnFamilyHandle* cfh = nullptr;
142
+ assert(db);
143
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
144
+ delete cfh;
145
+ cfh = nullptr;
146
+
147
+ std::vector<ColumnFamilyDescriptor> cf_descs;
148
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
149
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
150
+ options.avoid_flush_during_shutdown = true;
151
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
152
+
153
+ // Write in each transaction a mixture of column families that enable
154
+ // timestamp and disable timestamps.
155
+
156
+ TransactionOptions txn_opts;
157
+ txn_opts.write_batch_track_timestamp_size = true;
158
+ std::unique_ptr<Transaction> txn0(NewTxn(WriteOptions(), txn_opts));
159
+ assert(txn0);
160
+ ASSERT_OK(txn0->Put(handles_[0], "key1", "key1_val"));
161
+ // Timestamp size info for writes like this can only be correctly tracked if
162
+ // TransactionOptions.write_batch_track_timestamp_size is true.
163
+ ASSERT_OK(txn0->GetWriteBatch()->GetWriteBatch()->Put(handles_[1], "foo",
164
+ "foo_val"));
165
+ ASSERT_OK(txn0->SetName("txn0"));
166
+ ASSERT_OK(txn0->SetCommitTimestamp(2));
167
+ ASSERT_OK(txn0->Prepare());
168
+ ASSERT_OK(txn0->Commit());
169
+ txn0.reset();
170
+
171
+ // For keys written from transactions that disable
172
+ // `write_batch_track_timestamp_size`,
173
+ // the keys have incorrect behavior like:
174
+ // *Cannot be found after commit: because the transaction's UpdateTimestamps
175
+ // lacks the correct timestamp size when a write bypasses its write APIs.
176
+ // *Can be found again after DB restart recovers the write from the WAL:
177
+ // because the recovered transaction's UpdateTimestamps gets the correct
178
+ // timestamp size info directly from VersionSet.
179
+ // If there is a flush that persisted this transaction into sst files after
180
+ // it's committed, the key will be forever corrupted.
181
+ std::unique_ptr<Transaction> txn1(
182
+ NewTxn(WriteOptions(), TransactionOptions()));
183
+ assert(txn1);
184
+ ASSERT_OK(txn1->Put(handles_[0], "key2", "key2_val"));
185
+ // Writing a key with more than 8 bytes so that we can manifest the error as
186
+ // a NotFound error instead of an issue during `WriteBatch::UpdateTimestamp`.
187
+ ASSERT_OK(txn1->GetWriteBatch()->GetWriteBatch()->Put(
188
+ handles_[1], "foobarbaz", "baz_val"));
189
+ ASSERT_OK(txn1->SetName("txn1"));
190
+ ASSERT_OK(txn1->SetCommitTimestamp(2));
191
+ ASSERT_OK(txn1->Prepare());
192
+ ASSERT_OK(txn1->Commit());
193
+ txn1.reset();
194
+
195
+ ASSERT_OK(db->Flush(FlushOptions(), handles_[1]));
196
+
197
+ std::unique_ptr<Transaction> txn2(
198
+ NewTxn(WriteOptions(), TransactionOptions()));
199
+ assert(txn2);
200
+ ASSERT_OK(txn2->Put(handles_[0], "key3", "key3_val"));
201
+ ASSERT_OK(txn2->GetWriteBatch()->GetWriteBatch()->Put(
202
+ handles_[1], "bazbazbaz", "bazbazbaz_val"));
203
+ ASSERT_OK(txn2->SetCommitTimestamp(2));
204
+ ASSERT_OK(txn2->SetName("txn2"));
205
+ ASSERT_OK(txn2->Prepare());
206
+ ASSERT_OK(txn2->Commit());
207
+ txn2.reset();
208
+
209
+ std::unique_ptr<Transaction> txn3(
210
+ NewTxn(WriteOptions(), TransactionOptions()));
211
+ assert(txn3);
212
+ std::string value;
213
+ ReadOptions ropts;
214
+ std::string read_ts;
215
+ Slice timestamp = EncodeU64Ts(2, &read_ts);
216
+ ropts.timestamp = &timestamp;
217
+ ASSERT_OK(txn3->Get(ropts, handles_[0], "key1", &value));
218
+ ASSERT_EQ("key1_val", value);
219
+ ASSERT_OK(txn3->Get(ropts, handles_[0], "key2", &value));
220
+ ASSERT_EQ("key2_val", value);
221
+ ASSERT_OK(txn3->Get(ropts, handles_[0], "key3", &value));
222
+ ASSERT_EQ("key3_val", value);
223
+ txn3.reset();
224
+
225
+ std::unique_ptr<Transaction> txn4(
226
+ NewTxn(WriteOptions(), TransactionOptions()));
227
+ assert(txn4);
228
+ ASSERT_OK(txn4->Get(ReadOptions(), handles_[1], "foo", &value));
229
+ ASSERT_EQ("foo_val", value);
230
+ // Incorrect behavior: committed keys cannot be found
231
+ ASSERT_TRUE(
232
+ txn4->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound());
233
+ ASSERT_TRUE(
234
+ txn4->Get(ReadOptions(), handles_[1], "bazbazbaz", &value).IsNotFound());
235
+ txn4.reset();
236
+
237
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
238
+ std::unique_ptr<Transaction> txn5(
239
+ NewTxn(WriteOptions(), TransactionOptions()));
240
+ assert(txn5);
241
+ ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "foo", &value));
242
+ ASSERT_EQ("foo_val", value);
243
+ // Incorrect behavior:
244
+ // *unflushed key can be found after reopen replays the entries from WAL
245
+ // (this is not suggesting using flushing as a workaround but to show a
246
+ // possible misleading behavior)
247
+ // *flushed key is forever corrupted.
248
+ ASSERT_TRUE(
249
+ txn5->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound());
250
+ ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "bazbazbaz", &value));
251
+ ASSERT_EQ("bazbazbaz_val", value);
252
+ txn5.reset();
253
+ }
254
+
133
255
  TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) {
134
256
  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
135
257
  ASSERT_OK(ReOpenNoDelete());
@@ -554,6 +676,109 @@ TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) {
554
676
  txn5.reset();
555
677
  }
556
678
 
679
+ TEST_P(WriteCommittedTxnWithTsTest, GetForUpdateUdtValidationNotEnabled) {
680
+ ASSERT_OK(ReOpenNoDelete());
681
+
682
+ ColumnFamilyOptions cf_options;
683
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
684
+ const std::string test_cf_name = "test_cf";
685
+ ColumnFamilyHandle* cfh = nullptr;
686
+ assert(db);
687
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
688
+ delete cfh;
689
+ cfh = nullptr;
690
+
691
+ std::vector<ColumnFamilyDescriptor> cf_descs;
692
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
693
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
694
+ options.avoid_flush_during_shutdown = true;
695
+
696
+ txn_db_options.enable_udt_validation = false;
697
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
698
+
699
+ // blind write a key/value for later read via `GetForUpdate`.
700
+ std::unique_ptr<Transaction> txn0(
701
+ NewTxn(WriteOptions(), TransactionOptions()));
702
+ ASSERT_OK(txn0->Put(handles_[1], "key", "value0"));
703
+ ASSERT_OK(txn0->SetCommitTimestamp(20));
704
+ ASSERT_OK(txn0->Commit());
705
+
706
+ // When timestamp validation is disabled across the whole DB
707
+ // `SetReadTimestampForValidation` should not be called.
708
+ std::unique_ptr<Transaction> txn1(
709
+ NewTxn(WriteOptions(), TransactionOptions()));
710
+ std::string value;
711
+ ASSERT_OK(txn1->SetReadTimestampForValidation(21));
712
+ ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), handles_[1], "key", &value,
713
+ /* exclusive= */ true, /*do_validate=*/true)
714
+ .IsInvalidArgument());
715
+ txn1.reset();
716
+
717
+ // do_validate and no snapshot, no conflict checking at all
718
+ std::unique_ptr<Transaction> txn2(
719
+ NewTxn(WriteOptions(), TransactionOptions()));
720
+ ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value,
721
+ /* exclusive= */ true, /*do_validate=*/true));
722
+ ASSERT_OK(txn2->Put(handles_[1], "key", "value1"));
723
+ ASSERT_OK(txn2->SetCommitTimestamp(21));
724
+ ASSERT_OK(txn2->Commit());
725
+ txn2.reset();
726
+
727
+ // do_validate and set snapshot, execute sequence number based conflict
728
+ // checking and skip timestamp based conflict checking.
729
+ std::unique_ptr<Transaction> txn3(
730
+ NewTxn(WriteOptions(), TransactionOptions()));
731
+ txn3->SetSnapshot();
732
+ ASSERT_OK(txn3->GetForUpdate(ReadOptions(), handles_[1], "key", &value,
733
+ /* exclusive= */ true, /*do_validate=*/true));
734
+ ASSERT_OK(txn3->Put(handles_[1], "key", "value2"));
735
+ ASSERT_OK(txn3->SetCommitTimestamp(22));
736
+ ASSERT_OK(txn3->Commit());
737
+ txn3.reset();
738
+
739
+ // Always check `ReadOptions.timestamp` to be consistent with the default
740
+ // `read_timestamp_` if it's explicitly set, even if whole DB disables
741
+ // timestamp validation.
742
+ std::unique_ptr<Transaction> txn4(
743
+ NewTxn(WriteOptions(), TransactionOptions()));
744
+ ReadOptions ropts;
745
+ std::string read_timestamp;
746
+ Slice read_ts = EncodeU64Ts(27, &read_timestamp);
747
+ ropts.timestamp = &read_ts;
748
+ ASSERT_TRUE(txn4->GetForUpdate(ropts, handles_[1], "key", &value,
749
+ /* exclusive= */ true, /*do_validate=*/true)
750
+ .IsInvalidArgument());
751
+ txn4.reset();
752
+
753
+ // Conflict of timestamps not caught when parallel transactions commit with
754
+ // some out of order timestamps.
755
+ std::unique_ptr<Transaction> txn5(
756
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
757
+ assert(txn5);
758
+
759
+ std::unique_ptr<Transaction> txn6(
760
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
761
+ assert(txn6);
762
+ ASSERT_OK(txn6->GetForUpdate(ReadOptions(), handles_[1], "key", &value,
763
+ /* exclusive= */ true, /*do_validate=*/true));
764
+ ASSERT_OK(txn6->Put(handles_[1], "key", "value4"));
765
+ ASSERT_OK(txn6->SetName("txn6"));
766
+ ASSERT_OK(txn6->Prepare());
767
+ ASSERT_OK(txn6->SetCommitTimestamp(24));
768
+ ASSERT_OK(txn6->Commit());
769
+ txn6.reset();
770
+
771
+ txn5->SetSnapshot();
772
+ ASSERT_OK(txn5->GetForUpdate(ReadOptions(), handles_[1], "key", &value,
773
+ /* exclusive= */ true, /*do_validate=*/true));
774
+ ASSERT_OK(txn5->Put(handles_[1], "key", "value3"));
775
+ ASSERT_OK(txn5->SetName("txn5"));
776
+ // txn5 commits after txn6 but writes a smaller timestamp
777
+ ASSERT_OK(txn5->SetCommitTimestamp(23));
778
+ ASSERT_OK(txn5->Commit());
779
+ txn5.reset();
780
+ }
781
+
557
782
  TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) {
558
783
  ASSERT_OK(ReOpenNoDelete());
559
784
 
@@ -529,7 +529,8 @@ Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
529
529
  // TODO(yanqin): support user-defined timestamp
530
530
  return TransactionUtil::CheckKeyForConflicts(
531
531
  db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
532
- false /* cache_only */, &snap_checker, min_uncommitted);
532
+ false /* cache_only */, &snap_checker, min_uncommitted,
533
+ txn_db_impl_->GetTxnDBOptions().enable_udt_validation);
533
534
  }
534
535
 
535
536
  void WritePreparedTxn::SetSnapshot() {
@@ -13,6 +13,7 @@
13
13
  #include <unordered_map>
14
14
  #include <vector>
15
15
 
16
+ #include "db/attribute_group_iterator_impl.h"
16
17
  #include "db/db_iter.h"
17
18
  #include "db/pre_release_callback.h"
18
19
  #include "db/read_callback.h"
@@ -101,6 +102,22 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
101
102
  const std::vector<ColumnFamilyHandle*>& column_families,
102
103
  std::vector<Iterator*>* iterators) override;
103
104
 
105
+ using DB::NewCoalescingIterator;
106
+ std::unique_ptr<Iterator> NewCoalescingIterator(
107
+ const ReadOptions& /*options*/,
108
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
109
+ return std::unique_ptr<Iterator>(
110
+ NewErrorIterator(Status::NotSupported("Not supported yet")));
111
+ }
112
+
113
+ using DB::NewAttributeGroupIterator;
114
+ std::unique_ptr<AttributeGroupIterator> NewAttributeGroupIterator(
115
+ const ReadOptions& /*options*/,
116
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
117
+ return NewAttributeGroupErrorIterator(
118
+ Status::NotSupported("Not supported yet"));
119
+ }
120
+
104
121
  // Check whether the transaction that wrote the value with sequence number seq
105
122
  // is visible to the snapshot with sequence number snapshot_seq.
106
123
  // Returns true if commit_seq <= snapshot_seq
@@ -395,7 +395,9 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
395
395
  // unprep_seqs_ will also contain prepared seqnos since they are treated in
396
396
  // the same way in the prepare/commit callbacks. See the comment on the
397
397
  // definition of unprep_seqs_.
398
- unprep_seqs_[prepare_seq] = prepare_batch_cnt_;
398
+ if (s.ok()) {
399
+ unprep_seqs_[prepare_seq] = prepare_batch_cnt_;
400
+ }
399
401
 
400
402
  // Reset transaction state.
401
403
  if (!prepared) {
@@ -1077,7 +1079,8 @@ Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
1077
1079
  // TODO(yanqin): Support user-defined timestamp.
1078
1080
  return TransactionUtil::CheckKeyForConflicts(
1079
1081
  db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
1080
- false /* cache_only */, &snap_checker, min_uncommitted);
1082
+ false /* cache_only */, &snap_checker, min_uncommitted,
1083
+ txn_db_impl_->GetTxnDBOptions().enable_udt_validation);
1081
1084
  }
1082
1085
 
1083
1086
  const std::map<SequenceNumber, size_t>&
package/index.js CHANGED
@@ -248,29 +248,17 @@ class RocksLevel extends AbstractLevel {
248
248
  }
249
249
 
250
250
  async query (options) {
251
+ return this.querySync(options)
252
+ }
253
+
254
+ querySync (options) {
251
255
  if (this.status !== 'open') {
252
256
  throw new ModuleError('Database is not open', {
253
257
  code: 'LEVEL_DATABASE_NOT_OPEN'
254
258
  })
255
259
  }
256
260
 
257
- // TOOD (perf): Merge into single call...
258
- const context = binding.iterator_init(this[kContext], options ?? kEmpty)
259
- try {
260
- return binding.iterator_nextv(context, options.limit)
261
- } finally {
262
- binding.iterator_close(context)
263
- }
264
- }
265
-
266
- querySync (options) {
267
- // TOOD (perf): Merge into single call...
268
- const context = binding.iterator_init(this[kContext], options ?? kEmpty)
269
- try {
270
- return binding.iterator_nextv(context, options.limit)
271
- } finally {
272
- binding.iterator_close(context)
273
- }
261
+ return binding.db_query(this[kContext], options ?? kEmpty)
274
262
  }
275
263
  }
276
264
 
package/iterator.js CHANGED
@@ -94,8 +94,16 @@ class Iterator extends AbstractIterator {
94
94
  }
95
95
 
96
96
  _nextvSync (size, options) {
97
+ if (this[kFinished]) {
98
+ return []
99
+ }
100
+
101
+ const { rows, finished } = binding.iterator_nextv(this[kContext], size)
102
+
97
103
  this[kFirst] = false
98
- return binding.iterator_nextv(this[kContext], size)
104
+ this[kFinished] = finished
105
+
106
+ return rows
99
107
  }
100
108
 
101
109
  _close (callback) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nxtedition/rocksdb",
3
- "version": "11.0.2",
3
+ "version": "11.0.4",
4
4
  "description": "A low-level Node.js RocksDB binding",
5
5
  "license": "MIT",
6
6
  "main": "index.js",