@nxtedition/rocksdb 7.0.3 → 7.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. package/binding.cc +320 -324
  2. package/chained-batch.js +6 -1
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +8 -3
  4. package/deps/rocksdb/rocksdb/Makefile +10 -4
  5. package/deps/rocksdb/rocksdb/TARGETS +6 -4
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +9 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +14 -0
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +8 -8
  9. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +272 -174
  10. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +201 -57
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +19 -19
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +2 -1
  13. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +170 -0
  14. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +95 -0
  15. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +298 -0
  16. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +172 -0
  17. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -3
  18. package/deps/rocksdb/rocksdb/db/column_family.h +6 -3
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +10 -0
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +6 -6
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +22 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +38 -0
  23. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +17 -5
  24. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +4 -7
  25. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +74 -71
  26. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +70 -1
  27. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +13 -12
  28. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +36 -0
  29. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +11 -4
  30. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +139 -91
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +48 -14
  33. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +90 -55
  34. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +9 -4
  35. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -1
  36. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +12 -7
  37. package/deps/rocksdb/rocksdb/db/db_write_test.cc +35 -0
  38. package/deps/rocksdb/rocksdb/db/dbformat.cc +3 -1
  39. package/deps/rocksdb/rocksdb/db/dbformat.h +5 -3
  40. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +1 -1
  41. package/deps/rocksdb/rocksdb/db/memtable.cc +1 -0
  42. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +4 -2
  43. package/deps/rocksdb/rocksdb/db/repair.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/version_builder.cc +43 -1
  45. package/deps/rocksdb/rocksdb/db/version_edit.cc +13 -5
  46. package/deps/rocksdb/rocksdb/db/version_edit.h +22 -1
  47. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +4 -5
  48. package/deps/rocksdb/rocksdb/db/version_set.cc +109 -41
  49. package/deps/rocksdb/rocksdb/db/version_set.h +36 -3
  50. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -4
  51. package/deps/rocksdb/rocksdb/db/version_set_test.cc +10 -10
  52. package/deps/rocksdb/rocksdb/db/version_util.h +1 -1
  53. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +1 -1
  54. package/deps/rocksdb/rocksdb/db/write_batch.cc +34 -10
  55. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +2 -0
  56. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +4 -0
  57. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +2 -0
  58. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +4 -1
  59. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -1
  60. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +7 -5
  61. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +5 -10
  62. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -7
  63. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +2 -0
  64. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +24 -3
  65. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  66. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +10 -0
  67. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +5 -0
  68. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +4 -4
  69. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +9 -5
  70. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  71. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +1 -0
  72. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +1 -1
  73. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  74. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +0 -3
  75. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +8 -6
  76. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  77. package/deps/rocksdb/rocksdb/options/options_helper.cc +4 -2
  78. package/deps/rocksdb/rocksdb/options/options_test.cc +1 -11
  79. package/deps/rocksdb/rocksdb/port/port_posix.h +7 -0
  80. package/deps/rocksdb/rocksdb/port/win/port_win.h +11 -3
  81. package/deps/rocksdb/rocksdb/src.mk +6 -2
  82. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +4 -33
  83. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +3 -3
  84. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -118
  85. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +6 -8
  86. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +10 -13
  87. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +4 -9
  88. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +0 -1
  89. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +10 -28
  90. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +2 -3
  91. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -91
  92. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +2 -30
  93. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +6 -27
  94. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +11 -13
  95. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +28 -40
  96. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +0 -1
  97. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +22 -43
  98. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +11 -22
  99. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +24 -25
  100. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +0 -1
  101. package/deps/rocksdb/rocksdb/table/get_context.h +0 -1
  102. package/deps/rocksdb/rocksdb/table/table_test.cc +3 -18
  103. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +3 -16
  104. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +3 -3
  105. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -1
  106. package/deps/rocksdb/rocksdb/util/bloom_test.cc +0 -201
  107. package/deps/rocksdb/rocksdb/util/distributed_mutex.h +48 -0
  108. package/deps/rocksdb/rocksdb/util/filter_bench.cc +5 -11
  109. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +3 -0
  110. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -21
  111. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  112. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +45 -0
  113. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +21 -14
  114. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +10 -1
  115. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +3 -1
  116. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +9 -0
  117. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +3 -2
  118. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +3 -1
  119. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +5 -4
  120. package/deps/rocksdb/rocksdb.gyp +1 -1
  121. package/index.js +36 -14
  122. package/package-lock.json +2 -2
  123. package/package.json +1 -1
  124. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  125. package/prebuilds/linux-x64/node.napi.node +0 -0
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +0 -358
  127. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.h +0 -127
  128. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc +0 -219
@@ -15,7 +15,6 @@ enum class WriteBatchOpType {
15
15
  kSingleDelete,
16
16
  kDeleteRange,
17
17
  kMerge,
18
- kBlobIndex,
19
18
  kNum,
20
19
  };
21
20
 
@@ -25,11 +24,28 @@ WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
25
24
  return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
26
25
  }
27
26
 
27
+ enum class WriteMode {
28
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
29
+ kWriteProtectedBatch = 0,
30
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
31
+ // Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
32
+ kWriteUnprotectedBatch,
33
+ // TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
34
+ kNum,
35
+ };
36
+
37
+ // Integer addition is needed for `::testing::Range()` to take the enum type.
38
+ WriteMode operator+(WriteMode lhs, const int rhs) {
39
+ using T = std::underlying_type<WriteMode>::type;
40
+ return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
41
+ }
42
+
28
43
  std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
44
+ size_t protection_bytes_per_key,
29
45
  WriteBatchOpType op_type) {
30
46
  Status s;
31
47
  WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
32
- 8 /* protection_bytes_per_entry */, 0 /* default_cf_ts_sz */);
48
+ protection_bytes_per_key, 0 /* default_cf_ts_sz */);
33
49
  switch (op_type) {
34
50
  case WriteBatchOpType::kPut:
35
51
  s = wb.Put(cf_handle, "key", "val");
@@ -46,36 +62,44 @@ std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
46
62
  case WriteBatchOpType::kMerge:
47
63
  s = wb.Merge(cf_handle, "key", "val");
48
64
  break;
49
- case WriteBatchOpType::kBlobIndex: {
50
- // TODO(ajkr): use public API once available.
51
- uint32_t cf_id;
52
- if (cf_handle == nullptr) {
53
- cf_id = 0;
54
- } else {
55
- cf_id = cf_handle->GetID();
56
- }
57
-
58
- std::string blob_index;
59
- BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
60
- "val");
61
-
62
- s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", blob_index);
63
- break;
64
- }
65
65
  case WriteBatchOpType::kNum:
66
66
  assert(false);
67
67
  }
68
68
  return {std::move(wb), std::move(s)};
69
69
  }
70
70
 
71
- class DbKvChecksumTest
72
- : public DBTestBase,
73
- public ::testing::WithParamInterface<std::tuple<WriteBatchOpType, char>> {
71
+ class DbKvChecksumTest : public DBTestBase,
72
+ public ::testing::WithParamInterface<
73
+ std::tuple<WriteBatchOpType, char, WriteMode>> {
74
74
  public:
75
75
  DbKvChecksumTest()
76
76
  : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
77
77
  op_type_ = std::get<0>(GetParam());
78
78
  corrupt_byte_addend_ = std::get<1>(GetParam());
79
+ write_mode_ = std::get<2>(GetParam());
80
+ }
81
+
82
+ Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
83
+ switch (write_mode_) {
84
+ case WriteMode::kWriteProtectedBatch: {
85
+ auto batch_and_status = GetWriteBatch(
86
+ cf_handle, 8 /* protection_bytes_per_key */, op_type_);
87
+ assert(batch_and_status.second.ok());
88
+ return db_->Write(WriteOptions(), &batch_and_status.first);
89
+ }
90
+ case WriteMode::kWriteUnprotectedBatch: {
91
+ auto batch_and_status = GetWriteBatch(
92
+ cf_handle, 0 /* protection_bytes_per_key */, op_type_);
93
+ assert(batch_and_status.second.ok());
94
+ WriteOptions write_opts;
95
+ write_opts.protection_bytes_per_key = 8;
96
+ return db_->Write(write_opts, &batch_and_status.first);
97
+ }
98
+ case WriteMode::kNum:
99
+ assert(false);
100
+ }
101
+ return Status::NotSupported("WriteMode " +
102
+ std::to_string(static_cast<int>(write_mode_)));
79
103
  }
80
104
 
81
105
  void CorruptNextByteCallBack(void* arg) {
@@ -96,6 +120,7 @@ class DbKvChecksumTest
96
120
  protected:
97
121
  WriteBatchOpType op_type_;
98
122
  char corrupt_byte_addend_;
123
+ WriteMode write_mode_;
99
124
  size_t corrupt_byte_offset_ = 0;
100
125
  size_t entry_len_ = std::numeric_limits<size_t>::max();
101
126
  };
@@ -114,9 +139,6 @@ std::string GetOpTypeString(const WriteBatchOpType& op_type) {
114
139
  case WriteBatchOpType::kMerge:
115
140
  return "Merge";
116
141
  break;
117
- case WriteBatchOpType::kBlobIndex:
118
- return "BlobIndex";
119
- break;
120
142
  case WriteBatchOpType::kNum:
121
143
  assert(false);
122
144
  }
@@ -128,15 +150,31 @@ INSTANTIATE_TEST_CASE_P(
128
150
  DbKvChecksumTest, DbKvChecksumTest,
129
151
  ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
130
152
  WriteBatchOpType::kNum),
131
- ::testing::Values(2, 103, 251)),
132
- [](const testing::TestParamInfo<std::tuple<WriteBatchOpType, char>>& args) {
153
+ ::testing::Values(2, 103, 251),
154
+ ::testing::Range(static_cast<WriteMode>(0),
155
+ WriteMode::kNum)),
156
+ [](const testing::TestParamInfo<
157
+ std::tuple<WriteBatchOpType, char, WriteMode>>& args) {
133
158
  std::ostringstream oss;
134
159
  oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
135
160
  << static_cast<int>(
136
161
  static_cast<unsigned char>(std::get<1>(args.param)));
162
+ switch (std::get<2>(args.param)) {
163
+ case WriteMode::kWriteProtectedBatch:
164
+ oss << "WriteProtectedBatch";
165
+ break;
166
+ case WriteMode::kWriteUnprotectedBatch:
167
+ oss << "WriteUnprotectedBatch";
168
+ break;
169
+ case WriteMode::kNum:
170
+ assert(false);
171
+ }
137
172
  return oss.str();
138
173
  });
139
174
 
175
+ // TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
176
+ // corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
177
+
140
178
  TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
141
179
  // This test repeatedly attempts to write `WriteBatch`es containing a single
142
180
  // entry of type `op_type_`. Each attempt has one byte corrupted in its
@@ -158,10 +196,7 @@ TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
158
196
  Reopen(options);
159
197
 
160
198
  SyncPoint::GetInstance()->EnableProcessing();
161
- auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type_);
162
- ASSERT_OK(batch_and_status.second);
163
- ASSERT_TRUE(
164
- db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
199
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
165
200
  SyncPoint::GetInstance()->DisableProcessing();
166
201
 
167
202
  // In case the above callback is not invoked, this test will run
@@ -194,10 +229,7 @@ TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
194
229
  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
195
230
 
196
231
  SyncPoint::GetInstance()->EnableProcessing();
197
- auto batch_and_status = GetWriteBatch(handles_[1], op_type_);
198
- ASSERT_OK(batch_and_status.second);
199
- ASSERT_TRUE(
200
- db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
232
+ ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
201
233
  SyncPoint::GetInstance()->DisableProcessing();
202
234
 
203
235
  // In case the above callback is not invoked, this test will run
@@ -209,7 +241,8 @@ TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
209
241
 
210
242
  TEST_P(DbKvChecksumTest, NoCorruptionCase) {
211
243
  // If this test fails, we may have found a piece of malfunctioning hardware
212
- auto batch_and_status = GetWriteBatch(nullptr, op_type_);
244
+ auto batch_and_status =
245
+ GetWriteBatch(nullptr, 8 /* protection_bytes_per_key */, op_type_);
213
246
  ASSERT_OK(batch_and_status.second);
214
247
  ASSERT_OK(batch_and_status.first.VerifyChecksum());
215
248
  }
@@ -238,10 +271,7 @@ TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
238
271
  auto log_size_pre_write = dbfull()->TEST_total_log_size();
239
272
 
240
273
  SyncPoint::GetInstance()->EnableProcessing();
241
- auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type_);
242
- ASSERT_OK(batch_and_status.second);
243
- ASSERT_TRUE(
244
- db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
274
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
245
275
  // Confirm that nothing was written to WAL
246
276
  ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
247
277
  ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
@@ -279,10 +309,7 @@ TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
279
309
  auto log_size_pre_write = dbfull()->TEST_total_log_size();
280
310
 
281
311
  SyncPoint::GetInstance()->EnableProcessing();
282
- auto batch_and_status = GetWriteBatch(handles_[1], op_type_);
283
- ASSERT_OK(batch_and_status.second);
284
- ASSERT_TRUE(
285
- db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
312
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
286
313
  // Confirm that nothing was written to WAL
287
314
  ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
288
315
  ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
@@ -322,9 +349,11 @@ void CorruptWriteBatch(Slice* content, size_t offset,
322
349
 
323
350
  TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
324
351
  // Verify write batch checksum after write batch append
325
- auto batch1 = GetWriteBatch(nullptr /* cf_handle */, op_type1_);
352
+ auto batch1 = GetWriteBatch(nullptr /* cf_handle */,
353
+ 8 /* protection_bytes_per_key */, op_type1_);
326
354
  ASSERT_OK(batch1.second);
327
- auto batch2 = GetWriteBatch(nullptr /* cf_handle */, op_type2_);
355
+ auto batch2 = GetWriteBatch(nullptr /* cf_handle */,
356
+ 8 /* protection_bytes_per_key */, op_type2_);
328
357
  ASSERT_OK(batch2.second);
329
358
  ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
330
359
  ASSERT_OK(batch1.first.VerifyChecksum());
@@ -345,11 +374,11 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
345
374
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
346
375
  }
347
376
 
348
- auto leader_batch_and_status =
349
- GetWriteBatch(nullptr /* cf_handle */, op_type1_);
377
+ auto leader_batch_and_status = GetWriteBatch(
378
+ nullptr /* cf_handle */, 8 /* protection_bytes_per_key */, op_type1_);
350
379
  ASSERT_OK(leader_batch_and_status.second);
351
- auto follower_batch_and_status =
352
- GetWriteBatch(nullptr /* cf_handle */, op_type2_);
380
+ auto follower_batch_and_status = GetWriteBatch(
381
+ nullptr /* cf_handle */, 8 /* protection_bytes_per_key */, op_type2_);
353
382
  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
354
383
  size_t total_bytes =
355
384
  leader_batch_size + follower_batch_and_status.first.GetDataSize();
@@ -390,7 +419,8 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
390
419
  // follower
391
420
  follower_thread = port::Thread([&]() {
392
421
  follower_batch_and_status =
393
- GetWriteBatch(nullptr /* cf_handle */, op_type2_);
422
+ GetWriteBatch(nullptr /* cf_handle */,
423
+ 8 /* protection_bytes_per_key */, op_type2_);
394
424
  ASSERT_OK(follower_batch_and_status.second);
395
425
  ASSERT_TRUE(
396
426
  db_->Write(WriteOptions(), &follower_batch_and_status.first)
@@ -413,7 +443,8 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
413
443
  Reopen(options);
414
444
  SyncPoint::GetInstance()->EnableProcessing();
415
445
  auto log_size_pre_write = dbfull()->TEST_total_log_size();
416
- leader_batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type1_);
446
+ leader_batch_and_status = GetWriteBatch(
447
+ nullptr /* cf_handle */, 8 /* protection_bytes_per_key */, op_type1_);
417
448
  ASSERT_OK(leader_batch_and_status.second);
418
449
  ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
419
450
  .IsCorruption());
@@ -452,9 +483,11 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
452
483
  }
453
484
  CreateAndReopenWithCF({"ramen"}, options);
454
485
 
455
- auto leader_batch_and_status = GetWriteBatch(handles_[1], op_type1_);
486
+ auto leader_batch_and_status =
487
+ GetWriteBatch(handles_[1], 8 /* protection_bytes_per_key */, op_type1_);
456
488
  ASSERT_OK(leader_batch_and_status.second);
457
- auto follower_batch_and_status = GetWriteBatch(handles_[1], op_type2_);
489
+ auto follower_batch_and_status =
490
+ GetWriteBatch(handles_[1], 8 /* protection_bytes_per_key */, op_type2_);
458
491
  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
459
492
  size_t total_bytes =
460
493
  leader_batch_size + follower_batch_and_status.first.GetDataSize();
@@ -494,7 +527,8 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
494
527
  // Start the other writer thread which will join the write group as
495
528
  // follower
496
529
  follower_thread = port::Thread([&]() {
497
- follower_batch_and_status = GetWriteBatch(handles_[1], op_type2_);
530
+ follower_batch_and_status = GetWriteBatch(
531
+ handles_[1], 8 /* protection_bytes_per_key */, op_type2_);
498
532
  ASSERT_OK(follower_batch_and_status.second);
499
533
  ASSERT_TRUE(
500
534
  db_->Write(WriteOptions(), &follower_batch_and_status.first)
@@ -518,7 +552,8 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
518
552
  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
519
553
  SyncPoint::GetInstance()->EnableProcessing();
520
554
  auto log_size_pre_write = dbfull()->TEST_total_log_size();
521
- leader_batch_and_status = GetWriteBatch(handles_[1], op_type1_);
555
+ leader_batch_and_status =
556
+ GetWriteBatch(handles_[1], 8 /* protection_bytes_per_key */, op_type1_);
522
557
  ASSERT_OK(leader_batch_and_status.second);
523
558
  ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
524
559
  .IsCorruption());
@@ -139,8 +139,6 @@ TEST_P(DBRateLimiterOnReadTest, Get) {
139
139
  }
140
140
 
141
141
  TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
142
- // The new void-returning `MultiGet()` APIs use `MultiRead()`, which does not
143
- // yet support rate limiting.
144
142
  if (use_direct_io_ && !IsDirectIOSupported()) {
145
143
  return;
146
144
  }
@@ -149,6 +147,7 @@ TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
149
147
  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
150
148
 
151
149
  const int kNumKeys = kNumFiles * kNumKeysPerFile;
150
+ int64_t expected = 0;
152
151
  {
153
152
  std::vector<std::string> key_bufs;
154
153
  key_bufs.reserve(kNumKeys);
@@ -160,13 +159,19 @@ TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
160
159
  }
161
160
  std::vector<Status> statuses(kNumKeys);
162
161
  std::vector<PinnableSlice> values(kNumKeys);
162
+ const int64_t prev_total_rl_req = options_.rate_limiter->GetTotalRequests();
163
163
  db_->MultiGet(GetReadOptions(), dbfull()->DefaultColumnFamily(), kNumKeys,
164
164
  keys.data(), values.data(), statuses.data());
165
+ const int64_t cur_total_rl_req = options_.rate_limiter->GetTotalRequests();
165
166
  for (int i = 0; i < kNumKeys; ++i) {
166
- ASSERT_TRUE(statuses[i].IsNotSupported());
167
+ ASSERT_TRUE(statuses[i].ok());
167
168
  }
169
+ ASSERT_GT(cur_total_rl_req, prev_total_rl_req);
170
+ ASSERT_EQ(cur_total_rl_req - prev_total_rl_req,
171
+ options_.rate_limiter->GetTotalRequests(Env::IO_USER));
168
172
  }
169
- ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
173
+ expected += kNumKeys;
174
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
170
175
  }
171
176
 
172
177
  TEST_P(DBRateLimiterOnReadTest, OldMultiGet) {
@@ -4271,7 +4271,9 @@ TEST_F(DBTest, ConcurrentFlushWAL) {
4271
4271
  threads.emplace_back([&] {
4272
4272
  for (size_t i = cnt; i < 2 * cnt; i++) {
4273
4273
  auto istr = std::to_string(i);
4274
- WriteBatch batch;
4274
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
4275
+ wopt.protection_bytes_per_key,
4276
+ 0 /* default_cf_ts_sz */);
4275
4277
  ASSERT_OK(batch.Put("a" + istr, "b" + istr));
4276
4278
  ASSERT_OK(
4277
4279
  dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
@@ -1261,11 +1261,11 @@ class RecoveryTestHelper {
1261
1261
  std::unique_ptr<WalManager> wal_manager;
1262
1262
  WriteController write_controller;
1263
1263
 
1264
- versions.reset(new VersionSet(test->dbname_, &db_options, file_options,
1265
- table_cache.get(), &write_buffer_manager,
1266
- &write_controller,
1267
- /*block_cache_tracer=*/nullptr,
1268
- /*io_tracer=*/nullptr, /*db_session_id*/ ""));
1264
+ versions.reset(new VersionSet(
1265
+ test->dbname_, &db_options, file_options, table_cache.get(),
1266
+ &write_buffer_manager, &write_controller,
1267
+ /*block_cache_tracer=*/nullptr,
1268
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ ""));
1269
1269
 
1270
1270
  wal_manager.reset(
1271
1271
  new WalManager(db_options, file_options, /*io_tracer=*/nullptr));
@@ -1497,6 +1497,8 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
1497
1497
  // The following make sure there are two bg flush threads.
1498
1498
  options.max_background_jobs = 8;
1499
1499
 
1500
+ DestroyAndReopen(options);
1501
+
1500
1502
  const std::string cf1_name("cf1");
1501
1503
  CreateAndReopenWithCF({cf1_name}, options);
1502
1504
  assert(handles_.size() == 2);
@@ -1512,10 +1514,13 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
1512
1514
  ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
1513
1515
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
1514
1516
 
1515
- ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1]));
1517
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
1518
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[1]));
1516
1519
 
1517
1520
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
1518
- ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0]));
1521
+
1522
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
1523
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
1519
1524
 
1520
1525
  bool called = false;
1521
1526
  SyncPoint::GetInstance()->DisableProcessing();
@@ -334,6 +334,41 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) {
334
334
  ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
335
335
  }
336
336
 
337
+ TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
338
+ // Repro race condition bug where unflushed WAL data extended the synced size
339
+ // recorded to MANIFEST despite being unrecoverable.
340
+ Options options = GetOptions();
341
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
342
+ new FaultInjectionTestEnv(env_));
343
+ options.env = fault_env.get();
344
+ options.manual_wal_flush = true;
345
+ options.track_and_verify_wals_in_manifest = true;
346
+ Reopen(options);
347
+
348
+ ASSERT_OK(Put("key1", "val1"));
349
+
350
+ SyncPoint::GetInstance()->SetCallBack(
351
+ "DBImpl::SyncWAL:Begin",
352
+ [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
353
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
354
+
355
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
356
+
357
+ // Ensure callback ran.
358
+ ASSERT_EQ("val2", Get("key2"));
359
+
360
+ Close();
361
+
362
+ // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
363
+ // DB WAL.
364
+ fault_env->DropUnsyncedFileData();
365
+
366
+ Reopen(options);
367
+
368
+ // Need to close before `fault_env` goes out of scope.
369
+ Close();
370
+ }
371
+
337
372
  TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
338
373
  std::unique_ptr<FaultInjectionTestEnv> mock_env(
339
374
  new FaultInjectionTestEnv(env_));
@@ -26,7 +26,7 @@ namespace ROCKSDB_NAMESPACE {
26
26
  // and the value type is embedded as the low 8 bits in the sequence
27
27
  // number in internal keys, we need to use the highest-numbered
28
28
  // ValueType, not the lowest).
29
- const ValueType kValueTypeForSeek = kTypeDeletionWithTimestamp;
29
+ const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
30
30
  const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
31
31
  const std::string kDisableUserTimestamp("");
32
32
 
@@ -46,6 +46,8 @@ EntryType GetEntryType(ValueType value_type) {
46
46
  return kEntryRangeDeletion;
47
47
  case kTypeBlobIndex:
48
48
  return kEntryBlobIndex;
49
+ case kTypeWideColumnEntity:
50
+ return kEntryWideColumnEntity;
49
51
  default:
50
52
  return kEntryOther;
51
53
  }
@@ -66,7 +66,9 @@ enum ValueType : unsigned char {
66
66
  kTypeBeginUnprepareXID = 0x13, // WAL only.
67
67
  kTypeDeletionWithTimestamp = 0x14,
68
68
  kTypeCommitXIDAndTimestamp = 0x15, // WAL only
69
- kMaxValue = 0x7F // Not used for storing records.
69
+ kTypeWideColumnEntity = 0x16,
70
+ kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only
71
+ kMaxValue = 0x7F // Not used for storing records.
70
72
  };
71
73
 
72
74
  // Defined in dbformat.cc
@@ -76,8 +78,8 @@ extern const ValueType kValueTypeForSeekForPrev;
76
78
  // Checks whether a type is an inline value type
77
79
  // (i.e. a type used in memtable skiplist and sst file datablock).
78
80
  inline bool IsValueType(ValueType t) {
79
- return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex ||
80
- kTypeDeletionWithTimestamp == t;
81
+ return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
82
+ kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t;
81
83
  }
82
84
 
83
85
  // Checks whether a type is from user operation
@@ -128,7 +128,7 @@ class FlushJobTestBase : public testing::Test {
128
128
  new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
129
129
  &write_buffer_manager_, &write_controller_,
130
130
  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
131
- /*db_session_id*/ ""));
131
+ /*db_id*/ "", /*db_session_id*/ ""));
132
132
  EXPECT_OK(versions_->Recover(column_families, false));
133
133
  }
134
134
 
@@ -1159,6 +1159,7 @@ Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
1159
1159
  if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
1160
1160
  // shift the value buffer as well.
1161
1161
  memcpy(p, prev_buffer, new_prev_size);
1162
+ prev_buffer = p;
1162
1163
  }
1163
1164
  }
1164
1165
  RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
@@ -103,7 +103,8 @@ class MemTableListTest : public testing::Test {
103
103
  VersionSet versions(dbname, &immutable_db_options, env_options,
104
104
  table_cache.get(), &write_buffer_manager,
105
105
  &write_controller, /*block_cache_tracer=*/nullptr,
106
- /*io_tracer=*/nullptr, /*db_session_id*/ "");
106
+ /*io_tracer=*/nullptr, /*db_id*/ "",
107
+ /*db_session_id*/ "");
107
108
  std::vector<ColumnFamilyDescriptor> cf_descs;
108
109
  cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
109
110
  cf_descs.emplace_back("one", ColumnFamilyOptions());
@@ -153,7 +154,8 @@ class MemTableListTest : public testing::Test {
153
154
  VersionSet versions(dbname, &immutable_db_options, env_options,
154
155
  table_cache.get(), &write_buffer_manager,
155
156
  &write_controller, /*block_cache_tracer=*/nullptr,
156
- /*io_tracer=*/nullptr, /*db_session_id*/ "");
157
+ /*io_tracer=*/nullptr, /*db_id*/ "",
158
+ /*db_session_id*/ "");
157
159
  std::vector<ColumnFamilyDescriptor> cf_descs;
158
160
  cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
159
161
  cf_descs.emplace_back("one", ColumnFamilyOptions());
@@ -122,7 +122,7 @@ class Repairer {
122
122
  vset_(dbname_, &immutable_db_options_, file_options_,
123
123
  raw_table_cache_.get(), &wb_, &wc_,
124
124
  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
125
- db_session_id_),
125
+ /*db_id=*/"", db_session_id_),
126
126
  next_file_number_(1),
127
127
  db_lock_(nullptr),
128
128
  closed_(false) {
@@ -249,6 +249,8 @@ class VersionBuilder::Rep {
249
249
  bool has_invalid_levels_;
250
250
  // Current levels of table files affected by additions/deletions.
251
251
  std::unordered_map<uint64_t, int> table_file_levels_;
252
+ // Current compact cursors that should be changed after the last compaction
253
+ std::unordered_map<int, InternalKey> updated_compact_cursors_;
252
254
  NewestFirstBySeqNo level_zero_cmp_;
253
255
  BySmallestKey level_nonzero_cmp_;
254
256
 
@@ -809,6 +811,22 @@ class VersionBuilder::Rep {
809
811
  return Status::OK();
810
812
  }
811
813
 
814
+ Status ApplyCompactCursors(int level,
815
+ const InternalKey& smallest_uncompacted_key) {
816
+ if (level < 0) {
817
+ std::ostringstream oss;
818
+ oss << "Cannot add compact cursor (" << level << ","
819
+ << smallest_uncompacted_key.Encode().ToString()
820
+ << " due to invalid level (level = " << level << ")";
821
+ return Status::Corruption("VersionBuilder", oss.str());
822
+ }
823
+ if (level < num_levels_) {
824
+ // Omit levels (>= num_levels_) when re-open with shrinking num_levels_
825
+ updated_compact_cursors_[level] = smallest_uncompacted_key;
826
+ }
827
+ return Status::OK();
828
+ }
829
+
812
830
  // Apply all of the edits in *edit to the current state.
813
831
  Status Apply(const VersionEdit* edit) {
814
832
  {
@@ -860,6 +878,16 @@ class VersionBuilder::Rep {
860
878
  }
861
879
  }
862
880
 
881
+ // Populate compact cursors for round-robin compaction, leave
882
+ // the cursor to be empty to indicate it is invalid
883
+ for (const auto& cursor : edit->GetCompactCursors()) {
884
+ const int level = cursor.first;
885
+ const InternalKey smallest_uncompacted_key = cursor.second;
886
+ const Status s = ApplyCompactCursors(level, smallest_uncompacted_key);
887
+ if (!s.ok()) {
888
+ return s;
889
+ }
890
+ }
863
891
  return Status::OK();
864
892
  }
865
893
 
@@ -1142,12 +1170,24 @@ class VersionBuilder::Rep {
1142
1170
  }
1143
1171
  }
1144
1172
 
1173
+ void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
1174
+ for (auto iter = updated_compact_cursors_.begin();
1175
+ iter != updated_compact_cursors_.end(); iter++) {
1176
+ vstorage->AddCursorForOneLevel(iter->first, iter->second);
1177
+ }
1178
+ }
1179
+
1145
1180
  // Save the current state in *vstorage.
1146
1181
  Status SaveTo(VersionStorageInfo* vstorage) const {
1147
- Status s = CheckConsistency(base_vstorage_);
1182
+ Status s;
1183
+
1184
+ #ifndef NDEBUG
1185
+ // The same check is done within Apply() so we skip it in release mode.
1186
+ s = CheckConsistency(base_vstorage_);
1148
1187
  if (!s.ok()) {
1149
1188
  return s;
1150
1189
  }
1190
+ #endif // NDEBUG
1151
1191
 
1152
1192
  s = CheckConsistency(vstorage);
1153
1193
  if (!s.ok()) {
@@ -1158,6 +1198,8 @@ class VersionBuilder::Rep {
1158
1198
 
1159
1199
  SaveBlobFilesTo(vstorage);
1160
1200
 
1201
+ SaveCompactCursorsTo(vstorage);
1202
+
1161
1203
  s = CheckConsistency(vstorage);
1162
1204
  return s;
1163
1205
  }
@@ -79,6 +79,7 @@ void VersionEdit::Clear() {
79
79
  has_max_column_family_ = false;
80
80
  has_min_log_number_to_keep_ = false;
81
81
  has_last_sequence_ = false;
82
+ compact_cursors_.clear();
82
83
  deleted_files_.clear();
83
84
  new_files_.clear();
84
85
  blob_file_additions_.clear();
@@ -121,6 +122,13 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
121
122
  if (has_last_sequence_) {
122
123
  PutVarint32Varint64(dst, kLastSequence, last_sequence_);
123
124
  }
125
+ for (size_t i = 0; i < compact_cursors_.size(); i++) {
126
+ if (compact_cursors_[i].second.Valid()) {
127
+ PutVarint32(dst, kCompactCursor);
128
+ PutVarint32(dst, compact_cursors_[i].first); // level
129
+ PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode());
130
+ }
131
+ }
124
132
  for (const auto& deleted : deleted_files_) {
125
133
  PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
126
134
  deleted.second /* file number */);
@@ -512,15 +520,15 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
512
520
  }
513
521
  break;
514
522
 
515
- case kCompactPointer:
523
+ case kCompactCursor:
516
524
  if (GetLevel(&input, &level, &msg) &&
517
525
  GetInternalKey(&input, &key)) {
518
- // we don't use compact pointers anymore,
519
- // but we should not fail if they are still
520
- // in manifest
526
+ // Here we re-use the output format of compact pointer in LevelDB
527
+ // to persist compact_cursors_
528
+ compact_cursors_.push_back(std::make_pair(level, key));
521
529
  } else {
522
530
  if (!msg) {
523
- msg = "compaction pointer";
531
+ msg = "compaction cursor";
524
532
  }
525
533
  }
526
534
  break;