@nxtedition/rocksdb 7.0.0-alpha.7 → 7.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/binding.cc +67 -73
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +1 -1
  3. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +3 -1
  4. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +28 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +5 -2
  8. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +48 -60
  9. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +18 -20
  10. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/c.cc +5 -0
  12. package/deps/rocksdb/rocksdb/db/column_family.cc +20 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.h +9 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +44 -26
  15. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +32 -14
  16. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -44
  17. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +3 -1
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +6 -1
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +10 -5
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +47 -35
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -1
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +54 -32
  23. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +426 -61
  24. package/deps/rocksdb/rocksdb/db/db_options_test.cc +1 -0
  25. package/deps/rocksdb/rocksdb/db/db_test.cc +102 -24
  26. package/deps/rocksdb/rocksdb/db/db_test2.cc +159 -30
  27. package/deps/rocksdb/rocksdb/db/db_test_util.cc +1 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +1 -1
  29. package/deps/rocksdb/rocksdb/db/version_builder.cc +39 -10
  30. package/deps/rocksdb/rocksdb/db/version_builder.h +4 -1
  31. package/deps/rocksdb/rocksdb/db/version_edit.h +20 -0
  32. package/deps/rocksdb/rocksdb/db/version_set.cc +2 -1
  33. package/deps/rocksdb/rocksdb/db/version_set.h +17 -2
  34. package/deps/rocksdb/rocksdb/db/version_set_test.cc +119 -0
  35. package/deps/rocksdb/rocksdb/db/write_batch.cc +96 -0
  36. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -0
  37. package/deps/rocksdb/rocksdb/db/write_thread.cc +1 -0
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -0
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  40. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +18 -2
  41. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -0
  42. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +12 -0
  43. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +1 -1
  44. package/deps/rocksdb/rocksdb/env/fs_posix.cc +96 -6
  45. package/deps/rocksdb/rocksdb/env/io_posix.cc +51 -18
  46. package/deps/rocksdb/rocksdb/env/io_posix.h +2 -0
  47. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +12 -5
  48. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +22 -6
  49. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +99 -8
  50. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +9 -1
  51. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +3 -0
  52. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -0
  53. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +4 -0
  54. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
  55. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +7 -0
  56. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +11 -1
  57. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +4 -1
  58. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +14 -1
  59. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +6 -0
  60. package/deps/rocksdb/rocksdb/options/cf_options.cc +12 -1
  61. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  62. package/deps/rocksdb/rocksdb/options/options.cc +8 -1
  63. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  64. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -1
  65. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +7 -2
  66. package/deps/rocksdb/rocksdb/options/options_test.cc +52 -0
  67. package/deps/rocksdb/rocksdb/port/port_posix.h +10 -1
  68. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -1
  69. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +1 -1
  70. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  71. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +5 -5
  72. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -10
  73. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +1 -1
  74. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +1 -1
  75. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  76. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  77. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +39 -12
  78. package/deps/rocksdb/rocksdb/util/comparator.cc +10 -0
  79. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +1 -1
  80. package/deps/rocksdb/rocksdb/util/xxhash.h +2 -1
  81. package/index.js +4 -1
  82. package/package.json +1 -1
  83. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  84. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -25,6 +25,49 @@ WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
25
25
  return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
26
26
  }
27
27
 
28
+ std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
29
+ WriteBatchOpType op_type) {
30
+ Status s;
31
+ WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
32
+ 8 /* protection_bytes_per_entry */, 0 /* default_cf_ts_sz */);
33
+ switch (op_type) {
34
+ case WriteBatchOpType::kPut:
35
+ s = wb.Put(cf_handle, "key", "val");
36
+ break;
37
+ case WriteBatchOpType::kDelete:
38
+ s = wb.Delete(cf_handle, "key");
39
+ break;
40
+ case WriteBatchOpType::kSingleDelete:
41
+ s = wb.SingleDelete(cf_handle, "key");
42
+ break;
43
+ case WriteBatchOpType::kDeleteRange:
44
+ s = wb.DeleteRange(cf_handle, "begin", "end");
45
+ break;
46
+ case WriteBatchOpType::kMerge:
47
+ s = wb.Merge(cf_handle, "key", "val");
48
+ break;
49
+ case WriteBatchOpType::kBlobIndex: {
50
+ // TODO(ajkr): use public API once available.
51
+ uint32_t cf_id;
52
+ if (cf_handle == nullptr) {
53
+ cf_id = 0;
54
+ } else {
55
+ cf_id = cf_handle->GetID();
56
+ }
57
+
58
+ std::string blob_index;
59
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
60
+ "val");
61
+
62
+ s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", blob_index);
63
+ break;
64
+ }
65
+ case WriteBatchOpType::kNum:
66
+ assert(false);
67
+ }
68
+ return {std::move(wb), std::move(s)};
69
+ }
70
+
28
71
  class DbKvChecksumTest
29
72
  : public DBTestBase,
30
73
  public ::testing::WithParamInterface<std::tuple<WriteBatchOpType, char>> {
@@ -35,48 +78,6 @@ class DbKvChecksumTest
35
78
  corrupt_byte_addend_ = std::get<1>(GetParam());
36
79
  }
37
80
 
38
- std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle) {
39
- Status s;
40
- WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
41
- 8 /* protection_bytes_per_entry */, 0 /* default_cf_ts_sz */);
42
- switch (op_type_) {
43
- case WriteBatchOpType::kPut:
44
- s = wb.Put(cf_handle, "key", "val");
45
- break;
46
- case WriteBatchOpType::kDelete:
47
- s = wb.Delete(cf_handle, "key");
48
- break;
49
- case WriteBatchOpType::kSingleDelete:
50
- s = wb.SingleDelete(cf_handle, "key");
51
- break;
52
- case WriteBatchOpType::kDeleteRange:
53
- s = wb.DeleteRange(cf_handle, "begin", "end");
54
- break;
55
- case WriteBatchOpType::kMerge:
56
- s = wb.Merge(cf_handle, "key", "val");
57
- break;
58
- case WriteBatchOpType::kBlobIndex: {
59
- // TODO(ajkr): use public API once available.
60
- uint32_t cf_id;
61
- if (cf_handle == nullptr) {
62
- cf_id = 0;
63
- } else {
64
- cf_id = cf_handle->GetID();
65
- }
66
-
67
- std::string blob_index;
68
- BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
69
- "val");
70
-
71
- s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", blob_index);
72
- break;
73
- }
74
- case WriteBatchOpType::kNum:
75
- assert(false);
76
- }
77
- return {std::move(wb), std::move(s)};
78
- }
79
-
80
81
  void CorruptNextByteCallBack(void* arg) {
81
82
  Slice encoded = *static_cast<Slice*>(arg);
82
83
  if (entry_len_ == std::numeric_limits<size_t>::max()) {
@@ -99,34 +100,28 @@ class DbKvChecksumTest
99
100
  size_t entry_len_ = std::numeric_limits<size_t>::max();
100
101
  };
101
102
 
102
- std::string GetTestNameSuffix(
103
- ::testing::TestParamInfo<std::tuple<WriteBatchOpType, char>> info) {
104
- std::ostringstream oss;
105
- switch (std::get<0>(info.param)) {
103
+ std::string GetOpTypeString(const WriteBatchOpType& op_type) {
104
+ switch (op_type) {
106
105
  case WriteBatchOpType::kPut:
107
- oss << "Put";
108
- break;
106
+ return "Put";
109
107
  case WriteBatchOpType::kDelete:
110
- oss << "Delete";
111
- break;
108
+ return "Delete";
112
109
  case WriteBatchOpType::kSingleDelete:
113
- oss << "SingleDelete";
114
- break;
110
+ return "SingleDelete";
115
111
  case WriteBatchOpType::kDeleteRange:
116
- oss << "DeleteRange";
112
+ return "DeleteRange";
117
113
  break;
118
114
  case WriteBatchOpType::kMerge:
119
- oss << "Merge";
115
+ return "Merge";
120
116
  break;
121
117
  case WriteBatchOpType::kBlobIndex:
122
- oss << "BlobIndex";
118
+ return "BlobIndex";
123
119
  break;
124
120
  case WriteBatchOpType::kNum:
125
121
  assert(false);
126
122
  }
127
- oss << "Add"
128
- << static_cast<int>(static_cast<unsigned char>(std::get<1>(info.param)));
129
- return oss.str();
123
+ assert(false);
124
+ return "";
130
125
  }
131
126
 
132
127
  INSTANTIATE_TEST_CASE_P(
@@ -134,7 +129,13 @@ INSTANTIATE_TEST_CASE_P(
134
129
  ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
135
130
  WriteBatchOpType::kNum),
136
131
  ::testing::Values(2, 103, 251)),
137
- GetTestNameSuffix);
132
+ [](const testing::TestParamInfo<std::tuple<WriteBatchOpType, char>>& args) {
133
+ std::ostringstream oss;
134
+ oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
135
+ << static_cast<int>(
136
+ static_cast<unsigned char>(std::get<1>(args.param)));
137
+ return oss.str();
138
+ });
138
139
 
139
140
  TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
140
141
  // This test repeatedly attempts to write `WriteBatch`es containing a single
@@ -157,11 +158,16 @@ TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
157
158
  Reopen(options);
158
159
 
159
160
  SyncPoint::GetInstance()->EnableProcessing();
160
- auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */);
161
+ auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type_);
161
162
  ASSERT_OK(batch_and_status.second);
162
163
  ASSERT_TRUE(
163
164
  db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
164
165
  SyncPoint::GetInstance()->DisableProcessing();
166
+
167
+ // In case the above callback is not invoked, this test will run
168
+ // numeric_limits<size_t>::max() times until it reports an error (or will
169
+ // exhaust disk space). Added this assert to report error early.
170
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
165
171
  }
166
172
  }
167
173
 
@@ -188,14 +194,373 @@ TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
188
194
  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
189
195
 
190
196
  SyncPoint::GetInstance()->EnableProcessing();
191
- auto batch_and_status = GetWriteBatch(handles_[1]);
197
+ auto batch_and_status = GetWriteBatch(handles_[1], op_type_);
198
+ ASSERT_OK(batch_and_status.second);
199
+ ASSERT_TRUE(
200
+ db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
201
+ SyncPoint::GetInstance()->DisableProcessing();
202
+
203
+ // In case the above callback is not invoked, this test will run
204
+ // numeric_limits<size_t>::max() times until it reports an error (or will
205
+ // exhaust disk space). Added this assert to report error early.
206
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
207
+ }
208
+ }
209
+
210
+ TEST_P(DbKvChecksumTest, NoCorruptionCase) {
211
+ // If this test fails, we may have found a piece of malfunctioned hardware
212
+ auto batch_and_status = GetWriteBatch(nullptr, op_type_);
213
+ ASSERT_OK(batch_and_status.second);
214
+ ASSERT_OK(batch_and_status.first.VerifyChecksum());
215
+ }
216
+
217
+ TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
218
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
219
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
220
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
221
+ // attempt has been made on each byte in the encoded write batch. All attempts
222
+ // are expected to fail with `Status::Corruption`
223
+ Options options = CurrentOptions();
224
+ if (op_type_ == WriteBatchOpType::kMerge) {
225
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
226
+ }
227
+ SyncPoint::GetInstance()->SetCallBack(
228
+ "DBImpl::WriteToWAL:log_entry",
229
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
230
+ std::placeholders::_1));
231
+ // First 8 bytes are for sequence number which is not protected in write batch
232
+ corrupt_byte_offset_ = 8;
233
+
234
+ while (MoreBytesToCorrupt()) {
235
+ // Corrupted write batch leads to read-only mode, so we have to
236
+ // reopen for every attempt.
237
+ Reopen(options);
238
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
239
+
240
+ SyncPoint::GetInstance()->EnableProcessing();
241
+ auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type_);
242
+ ASSERT_OK(batch_and_status.second);
243
+ ASSERT_TRUE(
244
+ db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
245
+ // Confirm that nothing was written to WAL
246
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
247
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
248
+ SyncPoint::GetInstance()->DisableProcessing();
249
+
250
+ // In case the above callback is not invoked, this test will run
251
+ // numeric_limits<size_t>::max() times until it reports an error (or will
252
+ // exhaust disk space). Added this assert to report error early.
253
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
254
+ }
255
+ }
256
+
257
+ TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
258
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
259
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
260
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
261
+ // attempt has been made on each byte in the encoded write batch. All attempts
262
+ // are expected to fail with `Status::Corruption`
263
+ Options options = CurrentOptions();
264
+ if (op_type_ == WriteBatchOpType::kMerge) {
265
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
266
+ }
267
+ CreateAndReopenWithCF({"pikachu"}, options);
268
+ SyncPoint::GetInstance()->SetCallBack(
269
+ "DBImpl::WriteToWAL:log_entry",
270
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
271
+ std::placeholders::_1));
272
+ // First 8 bytes are for sequence number which is not protected in write batch
273
+ corrupt_byte_offset_ = 8;
274
+
275
+ while (MoreBytesToCorrupt()) {
276
+ // Corrupted write batch leads to read-only mode, so we have to
277
+ // reopen for every attempt.
278
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
279
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
280
+
281
+ SyncPoint::GetInstance()->EnableProcessing();
282
+ auto batch_and_status = GetWriteBatch(handles_[1], op_type_);
192
283
  ASSERT_OK(batch_and_status.second);
193
284
  ASSERT_TRUE(
194
285
  db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption());
286
+ // Confirm that nothing was written to WAL
287
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
288
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
195
289
  SyncPoint::GetInstance()->DisableProcessing();
290
+
291
+ // In case the above callback is not invoked, this test will run
292
+ // numeric_limits<size_t>::max() times until it reports an error (or will
293
+ // exhaust disk space). Added this assert to report error early.
294
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
295
+ }
296
+ }
297
+
298
+ class DbKvChecksumTestMergedBatch
299
+ : public DBTestBase,
300
+ public ::testing::WithParamInterface<
301
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
302
+ public:
303
+ DbKvChecksumTestMergedBatch()
304
+ : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
305
+ op_type1_ = std::get<0>(GetParam());
306
+ op_type2_ = std::get<1>(GetParam());
307
+ corrupt_byte_addend_ = std::get<2>(GetParam());
196
308
  }
309
+
310
+ protected:
311
+ WriteBatchOpType op_type1_;
312
+ WriteBatchOpType op_type2_;
313
+ char corrupt_byte_addend_;
314
+ };
315
+
316
+ void CorruptWriteBatch(Slice* content, size_t offset,
317
+ char corrupt_byte_addend) {
318
+ ASSERT_TRUE(offset < content->size());
319
+ char* buf = const_cast<char*>(content->data());
320
+ buf[offset] += corrupt_byte_addend;
321
+ }
322
+
323
+ TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
324
+ // Verify write batch checksum after write batch append
325
+ auto batch1 = GetWriteBatch(nullptr /* cf_handle */, op_type1_);
326
+ ASSERT_OK(batch1.second);
327
+ auto batch2 = GetWriteBatch(nullptr /* cf_handle */, op_type2_);
328
+ ASSERT_OK(batch2.second);
329
+ ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
330
+ ASSERT_OK(batch1.first.VerifyChecksum());
197
331
  }
198
332
 
333
+ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
334
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
335
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
336
+ // leader of the write group writes the batch containing the entry of type
337
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
338
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
339
+ // The test repeats until an attempt has been made on each byte in both
340
+ // pre-merged write batches. All attempts are expected to fail with
341
+ // `Status::Corruption`.
342
+ Options options = CurrentOptions();
343
+ if (op_type1_ == WriteBatchOpType::kMerge ||
344
+ op_type2_ == WriteBatchOpType::kMerge) {
345
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
346
+ }
347
+
348
+ auto leader_batch_and_status =
349
+ GetWriteBatch(nullptr /* cf_handle */, op_type1_);
350
+ ASSERT_OK(leader_batch_and_status.second);
351
+ auto follower_batch_and_status =
352
+ GetWriteBatch(nullptr /* cf_handle */, op_type2_);
353
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
354
+ size_t total_bytes =
355
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
356
+ // First 8 bytes are for sequence number which is not protected in write batch
357
+ size_t corrupt_byte_offset = 8;
358
+
359
+ std::atomic<bool> follower_joined{false};
360
+ std::atomic<int> leader_count{0};
361
+ port::Thread follower_thread;
362
+ // This callback should only be called by the leader thread
363
+ SyncPoint::GetInstance()->SetCallBack(
364
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
365
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
366
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
367
+
368
+ // This callback should only be called by the follower thread
369
+ SyncPoint::GetInstance()->SetCallBack(
370
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
371
+ auto* follower =
372
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
373
+ // The leader thread will wait on this bool and hence wait until
374
+ // this writer joins the write group
375
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
376
+ if (corrupt_byte_offset >= leader_batch_size) {
377
+ Slice batch_content = follower->batch->Data();
378
+ CorruptWriteBatch(&batch_content,
379
+ corrupt_byte_offset - leader_batch_size,
380
+ corrupt_byte_addend_);
381
+ }
382
+ // Leader busy waits on this flag
383
+ follower_joined = true;
384
+ // So the follower does not enter the outer callback at
385
+ // WriteThread::JoinBatchGroup:Wait2
386
+ SyncPoint::GetInstance()->DisableProcessing();
387
+ });
388
+
389
+ // Start the other writer thread which will join the write group as
390
+ // follower
391
+ follower_thread = port::Thread([&]() {
392
+ follower_batch_and_status =
393
+ GetWriteBatch(nullptr /* cf_handle */, op_type2_);
394
+ ASSERT_OK(follower_batch_and_status.second);
395
+ ASSERT_TRUE(
396
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
397
+ .IsCorruption());
398
+ });
399
+
400
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
401
+ if (corrupt_byte_offset < leader_batch_size) {
402
+ Slice batch_content = leader->batch->Data();
403
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
404
+ corrupt_byte_addend_);
405
+ }
406
+ leader_count++;
407
+ while (!follower_joined) {
408
+ // busy waiting
409
+ }
410
+ });
411
+ while (corrupt_byte_offset < total_bytes) {
412
+ // Reopen DB since it failed WAL write which led to read-only mode
413
+ Reopen(options);
414
+ SyncPoint::GetInstance()->EnableProcessing();
415
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
416
+ leader_batch_and_status = GetWriteBatch(nullptr /* cf_handle */, op_type1_);
417
+ ASSERT_OK(leader_batch_and_status.second);
418
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
419
+ .IsCorruption());
420
+ follower_thread.join();
421
+ // Prevent leader thread from entering this callback
422
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
423
+ ASSERT_EQ(1, leader_count);
424
+ // Nothing should have been written to WAL
425
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
426
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
427
+
428
+ corrupt_byte_offset++;
429
+ if (corrupt_byte_offset == leader_batch_size) {
430
+ // skip over the sequence number part of follower's write batch
431
+ corrupt_byte_offset += 8;
432
+ }
433
+ follower_joined = false;
434
+ leader_count = 0;
435
+ }
436
+ SyncPoint::GetInstance()->DisableProcessing();
437
+ }
438
+
439
+ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
440
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
441
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
442
+ // leader of the write group writes the batch containing the entry of type
443
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
444
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
445
+ // The test repeats until an attempt has been made on each byte in both
446
+ // pre-merged write batches. All attempts are expected to fail with
447
+ // `Status::Corruption`.
448
+ Options options = CurrentOptions();
449
+ if (op_type1_ == WriteBatchOpType::kMerge ||
450
+ op_type2_ == WriteBatchOpType::kMerge) {
451
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
452
+ }
453
+ CreateAndReopenWithCF({"ramen"}, options);
454
+
455
+ auto leader_batch_and_status = GetWriteBatch(handles_[1], op_type1_);
456
+ ASSERT_OK(leader_batch_and_status.second);
457
+ auto follower_batch_and_status = GetWriteBatch(handles_[1], op_type2_);
458
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
459
+ size_t total_bytes =
460
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
461
+ // First 8 bytes are for sequence number which is not protected in write batch
462
+ size_t corrupt_byte_offset = 8;
463
+
464
+ std::atomic<bool> follower_joined{false};
465
+ std::atomic<int> leader_count{0};
466
+ port::Thread follower_thread;
467
+ // This callback should only be called by the leader thread
468
+ SyncPoint::GetInstance()->SetCallBack(
469
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
470
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
471
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
472
+
473
+ // This callback should only be called by the follower thread
474
+ SyncPoint::GetInstance()->SetCallBack(
475
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
476
+ auto* follower =
477
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
478
+ // The leader thread will wait on this bool and hence wait until
479
+ // this writer joins the write group
480
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
481
+ if (corrupt_byte_offset >= leader_batch_size) {
482
+ Slice batch_content =
483
+ WriteBatchInternal::Contents(follower->batch);
484
+ CorruptWriteBatch(&batch_content,
485
+ corrupt_byte_offset - leader_batch_size,
486
+ corrupt_byte_addend_);
487
+ }
488
+ follower_joined = true;
489
+ // So the follower does not enter the outer callback at
490
+ // WriteThread::JoinBatchGroup:Wait2
491
+ SyncPoint::GetInstance()->DisableProcessing();
492
+ });
493
+
494
+ // Start the other writer thread which will join the write group as
495
+ // follower
496
+ follower_thread = port::Thread([&]() {
497
+ follower_batch_and_status = GetWriteBatch(handles_[1], op_type2_);
498
+ ASSERT_OK(follower_batch_and_status.second);
499
+ ASSERT_TRUE(
500
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
501
+ .IsCorruption());
502
+ });
503
+
504
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
505
+ if (corrupt_byte_offset < leader_batch_size) {
506
+ Slice batch_content = WriteBatchInternal::Contents(leader->batch);
507
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
508
+ corrupt_byte_addend_);
509
+ }
510
+ leader_count++;
511
+ while (!follower_joined) {
512
+ // busy waiting
513
+ }
514
+ });
515
+ SyncPoint::GetInstance()->EnableProcessing();
516
+ while (corrupt_byte_offset < total_bytes) {
517
+ // Reopen DB since it failed WAL write which led to read-only mode
518
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
519
+ SyncPoint::GetInstance()->EnableProcessing();
520
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
521
+ leader_batch_and_status = GetWriteBatch(handles_[1], op_type1_);
522
+ ASSERT_OK(leader_batch_and_status.second);
523
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
524
+ .IsCorruption());
525
+ follower_thread.join();
526
+ // Prevent leader thread from entering this callback
527
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
528
+
529
+ ASSERT_EQ(1, leader_count);
530
+ // Nothing should have been written to WAL
531
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
532
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
533
+
534
+ corrupt_byte_offset++;
535
+ if (corrupt_byte_offset == leader_batch_size) {
536
+ // skip over the sequence number part of follower's write batch
537
+ corrupt_byte_offset += 8;
538
+ }
539
+ follower_joined = false;
540
+ leader_count = 0;
541
+ }
542
+ SyncPoint::GetInstance()->DisableProcessing();
543
+ }
544
+
545
+ INSTANTIATE_TEST_CASE_P(
546
+ DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
547
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
548
+ WriteBatchOpType::kNum),
549
+ ::testing::Range(static_cast<WriteBatchOpType>(0),
550
+ WriteBatchOpType::kNum),
551
+ ::testing::Values(2, 103, 251)),
552
+ [](const testing::TestParamInfo<
553
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
554
+ std::ostringstream oss;
555
+ oss << GetOpTypeString(std::get<0>(args.param))
556
+ << GetOpTypeString(std::get<1>(args.param)) << "Add"
557
+ << static_cast<int>(
558
+ static_cast<unsigned char>(std::get<2>(args.param)));
559
+ return oss.str();
560
+ });
561
+
562
+ // TODO: add test for transactions
563
+ // TODO: add test for corrupted write batch with WAL disabled
199
564
  } // namespace ROCKSDB_NAMESPACE
200
565
 
201
566
  int main(int argc, char** argv) {
@@ -220,6 +220,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
220
220
 
221
221
  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
222
222
  Options c_opts = dbfull()->GetOptions(cfh);
223
+
223
224
  const auto* c_bbto =
224
225
  c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
225
226
  ASSERT_NE(c_bbto, nullptr);