@nxtedition/rocksdb 13.1.5 → 13.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +37 -12
- package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
- package/deps/rocksdb/rocksdb/Makefile +2 -2
- package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
- package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
- package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
- package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
- package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
- package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
- package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
- package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
- package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
- package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
- package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
- package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
- package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
- package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
- package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
- package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
- package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
- package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
- package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
- package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
- package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
- package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
- package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
- package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
- package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
- package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
- package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
- package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
- package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
- package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
- package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
- package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
- package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
- package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
- package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
- package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
- package/deps/rocksdb/rocksdb/port/port.h +5 -9
- package/deps/rocksdb/rocksdb/src.mk +8 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/format.cc +3 -3
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
- package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
- package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
- package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
- package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
- package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
- package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
- package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
- package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
- package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
- package/deps/rocksdb/rocksdb.gyp +2 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -301,7 +301,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
|
|
|
301
301
|
VersionEdit new_db_edit;
|
|
302
302
|
const WriteOptions write_options(Env::IOActivity::kDBOpen);
|
|
303
303
|
Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
|
|
304
|
-
&new_db_edit);
|
|
304
|
+
/*is_retry=*/false, &new_db_edit);
|
|
305
305
|
if (!s.ok()) {
|
|
306
306
|
return s;
|
|
307
307
|
}
|
|
@@ -575,6 +575,7 @@ Status DBImpl::Recover(
|
|
|
575
575
|
}
|
|
576
576
|
if (s.ok() && !read_only) {
|
|
577
577
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
578
|
+
auto& moptions = *cfd->GetLatestMutableCFOptions();
|
|
578
579
|
// Try to trivially move files down the LSM tree to start from bottommost
|
|
579
580
|
// level when level_compaction_dynamic_level_bytes is enabled. This should
|
|
580
581
|
// only be useful when user is migrating to turning on this option.
|
|
@@ -592,14 +593,14 @@ Status DBImpl::Recover(
|
|
|
592
593
|
if (cfd->ioptions()->compaction_style ==
|
|
593
594
|
CompactionStyle::kCompactionStyleLevel &&
|
|
594
595
|
cfd->ioptions()->level_compaction_dynamic_level_bytes &&
|
|
595
|
-
!
|
|
596
|
+
!moptions.disable_auto_compactions) {
|
|
596
597
|
int to_level = cfd->ioptions()->num_levels - 1;
|
|
597
598
|
// last level is reserved
|
|
598
599
|
// allow_ingest_behind does not support Level Compaction,
|
|
599
600
|
// and per_key_placement can have infinite compaction loop for Level
|
|
600
601
|
// Compaction. Adjust to_level here just to be safe.
|
|
601
602
|
if (cfd->ioptions()->allow_ingest_behind ||
|
|
602
|
-
|
|
603
|
+
moptions.preclude_last_level_data_seconds > 0) {
|
|
603
604
|
to_level -= 1;
|
|
604
605
|
}
|
|
605
606
|
// Whether this column family has a level trivially moved
|
|
@@ -675,11 +676,11 @@ Status DBImpl::Recover(
|
|
|
675
676
|
// Already set up DB ID in NewDB
|
|
676
677
|
} else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
|
|
677
678
|
VersionEdit edit;
|
|
678
|
-
s = SetupDBId(write_options, read_only, is_new_db, &edit);
|
|
679
|
+
s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit);
|
|
679
680
|
recovery_ctx->UpdateVersionEdits(
|
|
680
681
|
versions_->GetColumnFamilySet()->GetDefault(), edit);
|
|
681
682
|
} else {
|
|
682
|
-
s = SetupDBId(write_options, read_only, is_new_db, nullptr);
|
|
683
|
+
s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr);
|
|
683
684
|
}
|
|
684
685
|
assert(!s.ok() || !db_id_.empty());
|
|
685
686
|
ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
|
|
@@ -1274,7 +1275,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1274
1275
|
reader.GetRecordedTimestampSize();
|
|
1275
1276
|
status = HandleWriteBatchTimestampSizeDifference(
|
|
1276
1277
|
&batch, running_ts_sz, record_ts_sz,
|
|
1277
|
-
TimestampSizeConsistencyMode::kReconcileInconsistency,
|
|
1278
|
+
TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
|
|
1279
|
+
batch_per_txn_, &new_batch);
|
|
1278
1280
|
if (!status.ok()) {
|
|
1279
1281
|
return status;
|
|
1280
1282
|
}
|
|
@@ -1371,6 +1373,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1371
1373
|
}
|
|
1372
1374
|
}
|
|
1373
1375
|
}
|
|
1376
|
+
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1377
|
+
"Recovered to log #%" PRIu64 " seq #%" PRIu64, wal_number,
|
|
1378
|
+
*next_sequence);
|
|
1374
1379
|
|
|
1375
1380
|
if (!status.ok() || old_log_record) {
|
|
1376
1381
|
if (status.IsNotSupported()) {
|
|
@@ -1403,10 +1408,6 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1403
1408
|
if (corrupted_wal_found != nullptr) {
|
|
1404
1409
|
*corrupted_wal_found = true;
|
|
1405
1410
|
}
|
|
1406
|
-
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1407
|
-
"Point in time recovered to log #%" PRIu64
|
|
1408
|
-
" seq #%" PRIu64,
|
|
1409
|
-
wal_number, *next_sequence);
|
|
1410
1411
|
} else {
|
|
1411
1412
|
assert(immutable_db_options_.wal_recovery_mode ==
|
|
1412
1413
|
WALRecoveryMode::kTolerateCorruptedTailRecords ||
|
|
@@ -1667,10 +1668,20 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1667
1668
|
Arena arena;
|
|
1668
1669
|
Status s;
|
|
1669
1670
|
TableProperties table_properties;
|
|
1671
|
+
const auto* ucmp = cfd->internal_comparator().user_comparator();
|
|
1672
|
+
assert(ucmp);
|
|
1673
|
+
const size_t ts_sz = ucmp->timestamp_size();
|
|
1674
|
+
const bool logical_strip_timestamp =
|
|
1675
|
+
ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps;
|
|
1670
1676
|
{
|
|
1671
1677
|
ScopedArenaPtr<InternalIterator> iter(
|
|
1672
|
-
|
|
1673
|
-
|
|
1678
|
+
logical_strip_timestamp
|
|
1679
|
+
? mem->NewTimestampStrippingIterator(
|
|
1680
|
+
ro, /*seqno_to_time_mapping=*/nullptr, &arena,
|
|
1681
|
+
/*prefix_extractor=*/nullptr, ts_sz)
|
|
1682
|
+
: mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
|
|
1683
|
+
/*prefix_extractor=*/nullptr,
|
|
1684
|
+
/*for_flush=*/true));
|
|
1674
1685
|
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
|
|
1675
1686
|
"[%s] [WriteLevel0TableForRecovery]"
|
|
1676
1687
|
" Level-0 table #%" PRIu64 ": started",
|
|
@@ -1705,11 +1716,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1705
1716
|
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
|
|
1706
1717
|
range_del_iters;
|
|
1707
1718
|
auto range_del_iter =
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1719
|
+
logical_strip_timestamp
|
|
1720
|
+
? mem->NewTimestampStrippingRangeTombstoneIterator(
|
|
1721
|
+
ro, kMaxSequenceNumber, ts_sz)
|
|
1722
|
+
// This is called during recovery, where a live memtable is
|
|
1723
|
+
// flushed directly. In this case, no fragmented tombstone list is
|
|
1724
|
+
// cached in this memtable yet.
|
|
1725
|
+
: mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
|
|
1726
|
+
false /* immutable_memtable */);
|
|
1713
1727
|
if (range_del_iter != nullptr) {
|
|
1714
1728
|
range_del_iters.emplace_back(range_del_iter);
|
|
1715
1729
|
}
|
|
@@ -1723,10 +1737,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1723
1737
|
cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
|
|
1724
1738
|
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
|
|
1725
1739
|
mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
|
|
1726
|
-
0 /* level */,
|
|
1727
|
-
|
|
1728
|
-
0 /* file_creation_time */, db_id_,
|
|
1729
|
-
0 /* target_file_size */, meta.fd.GetNumber(),
|
|
1740
|
+
0 /* level */, current_time /* newest_key_time */,
|
|
1741
|
+
false /* is_bottommost */, TableFileCreationReason::kRecovery,
|
|
1742
|
+
0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
|
|
1743
|
+
db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
|
|
1744
|
+
kMaxSequenceNumber);
|
|
1730
1745
|
Version* version = cfd->current();
|
|
1731
1746
|
version->Ref();
|
|
1732
1747
|
uint64_t num_input_entries = 0;
|
|
@@ -1756,7 +1771,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1756
1771
|
s = io_s;
|
|
1757
1772
|
}
|
|
1758
1773
|
|
|
1759
|
-
uint64_t total_num_entries = mem->
|
|
1774
|
+
uint64_t total_num_entries = mem->NumEntries();
|
|
1760
1775
|
if (s.ok() && total_num_entries != num_input_entries) {
|
|
1761
1776
|
std::string msg = "Expected " + std::to_string(total_num_entries) +
|
|
1762
1777
|
" entries in memtable, but read " +
|
|
@@ -1795,9 +1810,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1795
1810
|
|
|
1796
1811
|
// For UDT in memtable only feature, move up the cutoff timestamp whenever
|
|
1797
1812
|
// a flush happens.
|
|
1798
|
-
|
|
1799
|
-
size_t ts_sz = ucmp->timestamp_size();
|
|
1800
|
-
if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) {
|
|
1813
|
+
if (logical_strip_timestamp) {
|
|
1801
1814
|
Slice mem_newest_udt = mem->GetNewestUDT();
|
|
1802
1815
|
std::string full_history_ts_low = cfd->GetFullHistoryTsLow();
|
|
1803
1816
|
if (full_history_ts_low.empty() ||
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
#include "db/arena_wrapped_db_iter.h"
|
|
9
9
|
#include "db/db_impl/compacted_db_impl.h"
|
|
10
10
|
#include "db/db_impl/db_impl.h"
|
|
11
|
-
#include "db/
|
|
11
|
+
#include "db/manifest_ops.h"
|
|
12
12
|
#include "db/merge_context.h"
|
|
13
13
|
#include "logging/logging.h"
|
|
14
14
|
#include "monitoring/perf_context_imp.h"
|
|
@@ -265,8 +265,8 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
|
|
|
265
265
|
const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
|
|
266
266
|
std::string manifest_path;
|
|
267
267
|
uint64_t manifest_file_number;
|
|
268
|
-
s =
|
|
269
|
-
|
|
268
|
+
s = GetCurrentManifestPath(dbname, fs.get(), /*is_retry=*/false,
|
|
269
|
+
&manifest_path, &manifest_file_number);
|
|
270
270
|
} else {
|
|
271
271
|
// Historic behavior that doesn't necessarily make sense
|
|
272
272
|
s = db_options.env->CreateDirIfMissing(dbname);
|
|
@@ -233,7 +233,8 @@ Status DBImplSecondary::RecoverLogFiles(
|
|
|
233
233
|
reader->GetRecordedTimestampSize();
|
|
234
234
|
status = HandleWriteBatchTimestampSizeDifference(
|
|
235
235
|
&batch, running_ts_sz, record_ts_sz,
|
|
236
|
-
TimestampSizeConsistencyMode::kVerifyConsistency
|
|
236
|
+
TimestampSizeConsistencyMode::kVerifyConsistency, seq_per_batch_,
|
|
237
|
+
batch_per_txn_);
|
|
237
238
|
if (!status.ok()) {
|
|
238
239
|
break;
|
|
239
240
|
}
|
|
@@ -247,9 +248,7 @@ Status DBImplSecondary::RecoverLogFiles(
|
|
|
247
248
|
if (cfd == nullptr) {
|
|
248
249
|
continue;
|
|
249
250
|
}
|
|
250
|
-
|
|
251
|
-
cfds_changed->insert(cfd);
|
|
252
|
-
}
|
|
251
|
+
cfds_changed->insert(cfd);
|
|
253
252
|
const std::vector<FileMetaData*>& l0_files =
|
|
254
253
|
cfd->current()->storage_info()->LevelFiles(0);
|
|
255
254
|
SequenceNumber seq =
|
|
@@ -957,6 +956,10 @@ Status DB::OpenAndCompact(
|
|
|
957
956
|
config_options.env = override_options.env;
|
|
958
957
|
std::vector<ColumnFamilyDescriptor> all_column_families;
|
|
959
958
|
|
|
959
|
+
TEST_SYNC_POINT_CALLBACK(
|
|
960
|
+
"DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:0",
|
|
961
|
+
&compaction_input.options_file_number);
|
|
962
|
+
TEST_SYNC_POINT("DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:1");
|
|
960
963
|
std::string options_file_name =
|
|
961
964
|
OptionsFileName(name, compaction_input.options_file_number);
|
|
962
965
|
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
#include "db/error_handler.h"
|
|
13
13
|
#include "db/event_helpers.h"
|
|
14
14
|
#include "logging/logging.h"
|
|
15
|
+
#include "memtable/wbwi_memtable.h"
|
|
15
16
|
#include "monitoring/perf_context_imp.h"
|
|
16
17
|
#include "options/options_helper.h"
|
|
17
18
|
#include "test_util/sync_point.h"
|
|
@@ -189,16 +190,137 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
|
|
|
189
190
|
return s;
|
|
190
191
|
}
|
|
191
192
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
193
|
+
Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
194
|
+
const WBWIMemTable::SeqnoRange& assigned_seqno,
|
|
195
|
+
uint64_t prep_log,
|
|
196
|
+
SequenceNumber last_seqno_after_ingest,
|
|
197
|
+
bool memtable_updated, bool ignore_missing_cf) {
|
|
198
|
+
// Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
|
|
199
|
+
assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
|
|
200
|
+
// Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
|
|
201
|
+
assert(assigned_seqno.lower_bound > versions_->LastSequence());
|
|
202
|
+
autovector<ReadOnlyMemTable*> memtables;
|
|
203
|
+
autovector<ColumnFamilyData*> cfds;
|
|
204
|
+
InstrumentedMutexLock lock(&mutex_);
|
|
205
|
+
ColumnFamilySet* cf_set = versions_->GetColumnFamilySet();
|
|
206
|
+
|
|
207
|
+
// Create WBWIMemTables
|
|
208
|
+
for (const auto [cf_id, stat] : wbwi->GetCFStats()) {
|
|
209
|
+
ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf_id);
|
|
210
|
+
if (!cfd) {
|
|
211
|
+
if (ignore_missing_cf) {
|
|
212
|
+
continue;
|
|
213
|
+
}
|
|
214
|
+
for (auto mem : memtables) {
|
|
215
|
+
mem->Unref();
|
|
216
|
+
delete mem;
|
|
217
|
+
}
|
|
218
|
+
for (auto cfd_ptr : cfds) {
|
|
219
|
+
cfd_ptr->UnrefAndTryDelete();
|
|
220
|
+
}
|
|
221
|
+
Status s = Status::InvalidArgument(
|
|
222
|
+
"Invalid column family id from WriteBatchWithIndex: " +
|
|
223
|
+
std::to_string(cf_id));
|
|
224
|
+
if (memtable_updated) {
|
|
225
|
+
s = Status::Corruption(
|
|
226
|
+
"Part of the write batch is applied. Memtable is in a inconsistent "
|
|
227
|
+
"state. " +
|
|
228
|
+
s.ToString());
|
|
229
|
+
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
return s;
|
|
233
|
+
}
|
|
234
|
+
WBWIMemTable* wbwi_memtable =
|
|
235
|
+
new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, cfd->ioptions(),
|
|
236
|
+
cfd->GetLatestMutableCFOptions(), stat);
|
|
237
|
+
wbwi_memtable->Ref();
|
|
238
|
+
wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
|
|
239
|
+
// This is needed to keep the WAL that contains Prepare alive until
|
|
240
|
+
// committed data in this memtable is persisted.
|
|
241
|
+
wbwi_memtable->SetMinPrepLog(prep_log);
|
|
242
|
+
memtables.push_back(wbwi_memtable);
|
|
243
|
+
cfd->Ref();
|
|
244
|
+
cfds.push_back(cfd);
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Stop writes to the DB by entering both write threads
|
|
248
|
+
WriteThread::Writer nonmem_w;
|
|
249
|
+
if (two_write_queues_) {
|
|
250
|
+
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
251
|
+
}
|
|
252
|
+
WaitForPendingWrites();
|
|
253
|
+
|
|
254
|
+
// Switch memtable and add WBWIMemTables
|
|
255
|
+
Status s;
|
|
256
|
+
for (size_t i = 0; i < memtables.size(); ++i) {
|
|
257
|
+
assert(!immutable_db_options_.atomic_flush);
|
|
258
|
+
// NOTE: to support atomic flush, need to call
|
|
259
|
+
// SelectColumnFamiliesForAtomicFlush()
|
|
260
|
+
WriteContext write_context;
|
|
261
|
+
// TODO: not switch on empty memtable, may need to update metadata
|
|
262
|
+
// like NextLogNumber(), earliest_seqno and memtable id.
|
|
263
|
+
s = SwitchMemtable(cfds[i], &write_context, memtables[i],
|
|
264
|
+
last_seqno_after_ingest);
|
|
265
|
+
if (!s.ok()) {
|
|
266
|
+
// SwitchMemtable() can only fail if a new WAL is to be created, this
|
|
267
|
+
// should only happen for the first call to SwitchMemtable(). log will
|
|
268
|
+
// be empty and no new WAL is created for the rest of the calls.
|
|
269
|
+
assert(i == 0);
|
|
270
|
+
if (i != 0 || memtable_updated) {
|
|
271
|
+
// escalate error to non-recoverable
|
|
272
|
+
s = Status::Corruption(
|
|
273
|
+
"Part of the write batch is applied. Memtable is in a inconsistent "
|
|
274
|
+
"state. " +
|
|
275
|
+
s.ToString());
|
|
276
|
+
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
|
|
277
|
+
} else {
|
|
278
|
+
// SwitchMemtable() already sets appropriate bg error
|
|
279
|
+
}
|
|
280
|
+
for (size_t j = i; j < memtables.size(); j++) {
|
|
281
|
+
memtables[j]->Unref();
|
|
282
|
+
delete memtables[j];
|
|
283
|
+
}
|
|
284
|
+
break;
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
for (size_t i = 0; i < cfds.size(); ++i) {
|
|
288
|
+
if (cfds[i]->UnrefAndTryDelete()) {
|
|
289
|
+
cfds[i] = nullptr;
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
// exit the second queue before returning
|
|
294
|
+
if (two_write_queues_) {
|
|
295
|
+
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
296
|
+
}
|
|
297
|
+
if (s.ok()) {
|
|
298
|
+
// Trigger flushes for the new immutable memtables.
|
|
299
|
+
for (const auto cfd : cfds) {
|
|
300
|
+
if (cfd == nullptr) {
|
|
301
|
+
continue;
|
|
302
|
+
}
|
|
303
|
+
cfd->imm()->FlushRequested();
|
|
304
|
+
FlushRequest flush_req;
|
|
305
|
+
// TODO: a new flush reason for ingesting memtable
|
|
306
|
+
GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
|
|
307
|
+
&flush_req);
|
|
308
|
+
EnqueuePendingFlush(flush_req);
|
|
309
|
+
}
|
|
310
|
+
MaybeScheduleFlushOrCompaction();
|
|
311
|
+
}
|
|
312
|
+
return s;
|
|
313
|
+
}
|
|
314
|
+
|
|
195
315
|
Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
196
316
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
197
317
|
UserWriteCallback* user_write_cb, uint64_t* log_used,
|
|
198
318
|
uint64_t log_ref, bool disable_memtable,
|
|
199
319
|
uint64_t* seq_used, size_t batch_cnt,
|
|
200
320
|
PreReleaseCallback* pre_release_callback,
|
|
201
|
-
PostMemTableCallback* post_memtable_callback
|
|
321
|
+
PostMemTableCallback* post_memtable_callback,
|
|
322
|
+
std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
323
|
+
uint64_t prep_log) {
|
|
202
324
|
assert(!seq_per_batch_ || batch_cnt != 0);
|
|
203
325
|
assert(my_batch == nullptr || my_batch->Count() == 0 ||
|
|
204
326
|
write_options.protection_bytes_per_key == 0 ||
|
|
@@ -287,6 +409,23 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
287
409
|
return Status::NotSupported(
|
|
288
410
|
"DeleteRange is not compatible with row cache.");
|
|
289
411
|
}
|
|
412
|
+
if (wbwi) {
|
|
413
|
+
assert(prep_log > 0);
|
|
414
|
+
// Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
|
|
415
|
+
assert(!callback);
|
|
416
|
+
if (immutable_db_options_.unordered_write) {
|
|
417
|
+
return Status::NotSupported(
|
|
418
|
+
"Ingesting WriteBatch does not support unordered_write");
|
|
419
|
+
}
|
|
420
|
+
if (immutable_db_options_.enable_pipelined_write) {
|
|
421
|
+
return Status::NotSupported(
|
|
422
|
+
"Ingesting WriteBatch does not support pipelined_write");
|
|
423
|
+
}
|
|
424
|
+
if (immutable_db_options_.atomic_flush) {
|
|
425
|
+
return Status::NotSupported(
|
|
426
|
+
"Ingesting WriteBatch does not support atomic_flush");
|
|
427
|
+
}
|
|
428
|
+
}
|
|
290
429
|
// Otherwise IsLatestPersistentState optimization does not make sense
|
|
291
430
|
assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
|
|
292
431
|
disable_memtable);
|
|
@@ -344,7 +483,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
344
483
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
345
484
|
WriteThread::Writer w(write_options, my_batch, callback, user_write_cb,
|
|
346
485
|
log_ref, disable_memtable, batch_cnt,
|
|
347
|
-
pre_release_callback, post_memtable_callback
|
|
486
|
+
pre_release_callback, post_memtable_callback,
|
|
487
|
+
/*_ingest_wbwi=*/wbwi != nullptr);
|
|
348
488
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
349
489
|
|
|
350
490
|
write_thread_.JoinBatchGroup(&w);
|
|
@@ -441,6 +581,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
441
581
|
TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
|
|
442
582
|
last_batch_group_size_ =
|
|
443
583
|
write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
|
|
584
|
+
if (wbwi) {
|
|
585
|
+
assert(write_group.size == 1);
|
|
586
|
+
}
|
|
444
587
|
|
|
445
588
|
IOStatus io_s;
|
|
446
589
|
Status pre_release_cb_status;
|
|
@@ -494,10 +637,25 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
494
637
|
// Note about seq_per_batch_: either disableWAL is set for the entire write
|
|
495
638
|
// group or not. In either case we inc seq for each write batch with no
|
|
496
639
|
// failed callback. This means that there could be a batch with
|
|
497
|
-
//
|
|
640
|
+
// disable_memtable in between; although we do not write this batch to
|
|
498
641
|
// memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
|
|
499
642
|
// the seq per valid written key to mem.
|
|
500
643
|
size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
|
|
644
|
+
if (wbwi) {
|
|
645
|
+
// Reserve sequence numbers for the ingested memtable. We need to reserve
|
|
646
|
+
// at least this amount for recovery. During recovery,
|
|
647
|
+
// transactions do not commit by ingesting WBWI. The sequence number
|
|
648
|
+
// associated with the commit entry in WAL is used as the starting
|
|
649
|
+
// sequence number for inserting into memtable. We need to reserve
|
|
650
|
+
// enough sequence numbers here (at least the number of operations
|
|
651
|
+
// in write batch) to assign to memtable entries for this transaction.
|
|
652
|
+
// This prevents updates in different transactions from using out-of-order
|
|
653
|
+
// sequence numbers or the same key+seqno.
|
|
654
|
+
//
|
|
655
|
+
// WBWI ingestion requires not grouping writes, so we don't need to
|
|
656
|
+
// consider incrementing sequence number for WBWI from other writers.
|
|
657
|
+
seq_inc += wbwi->GetWriteBatch()->Count();
|
|
658
|
+
}
|
|
501
659
|
|
|
502
660
|
const bool concurrent_update = two_write_queues_;
|
|
503
661
|
// Update stats while we are an exclusive group leader, so we know
|
|
@@ -674,6 +832,27 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
674
832
|
// handle exit, false means somebody else did
|
|
675
833
|
should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
|
|
676
834
|
}
|
|
835
|
+
if (wbwi) {
|
|
836
|
+
if (status.ok() && w.status.ok()) {
|
|
837
|
+
// w.batch contains (potentially empty) commit time batch updates,
|
|
838
|
+
// only ingest wbwi if w.batch is applied to memtable successfully
|
|
839
|
+
assert(wbwi->GetWriteBatch()->Count() > 0);
|
|
840
|
+
|
|
841
|
+
uint32_t memtable_update_count = w.batch->Count();
|
|
842
|
+
SequenceNumber lb = versions_->LastSequence() + memtable_update_count + 1;
|
|
843
|
+
SequenceNumber ub = versions_->LastSequence() + memtable_update_count +
|
|
844
|
+
wbwi->GetWriteBatch()->Count();
|
|
845
|
+
assert(ub == last_sequence);
|
|
846
|
+
if (two_write_queues_) {
|
|
847
|
+
assert(ub <= versions_->LastAllocatedSequence());
|
|
848
|
+
}
|
|
849
|
+
status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
|
|
850
|
+
prep_log, last_sequence,
|
|
851
|
+
/*memtable_updated=*/memtable_update_count > 0,
|
|
852
|
+
write_options.ignore_missing_column_families);
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
|
|
677
856
|
if (should_exit_batch_group) {
|
|
678
857
|
if (status.ok()) {
|
|
679
858
|
for (auto* tmp_w : write_group) {
|
|
@@ -687,7 +866,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
687
866
|
}
|
|
688
867
|
}
|
|
689
868
|
// Note: if we are to resume after non-OK statuses we need to revisit how
|
|
690
|
-
// we
|
|
869
|
+
// we react to non-OK statuses here.
|
|
691
870
|
versions_->SetLastSequence(last_sequence);
|
|
692
871
|
}
|
|
693
872
|
MemTableInsertStatusCheck(w.status);
|
|
@@ -735,17 +914,6 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
735
914
|
size_t total_byte_size = 0;
|
|
736
915
|
|
|
737
916
|
if (w.status.ok()) {
|
|
738
|
-
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
739
|
-
// grabs but does not seem thread-safe.
|
|
740
|
-
if (tracer_) {
|
|
741
|
-
InstrumentedMutexLock lock(&trace_mutex_);
|
|
742
|
-
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
743
|
-
for (auto* writer : wal_write_group) {
|
|
744
|
-
// TODO: maybe handle the tracing status?
|
|
745
|
-
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
746
|
-
}
|
|
747
|
-
}
|
|
748
|
-
}
|
|
749
917
|
SequenceNumber next_sequence = current_sequence;
|
|
750
918
|
for (auto* writer : wal_write_group) {
|
|
751
919
|
assert(writer);
|
|
@@ -760,6 +928,22 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
760
928
|
}
|
|
761
929
|
}
|
|
762
930
|
}
|
|
931
|
+
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
932
|
+
// grabs but does not seem thread-safe.
|
|
933
|
+
if (tracer_) {
|
|
934
|
+
InstrumentedMutexLock lock(&trace_mutex_);
|
|
935
|
+
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
936
|
+
for (auto* writer : wal_write_group) {
|
|
937
|
+
if (writer->CallbackFailed()) {
|
|
938
|
+
// When optimistic txn conflict checking fails, we should
|
|
939
|
+
// not record to trace.
|
|
940
|
+
continue;
|
|
941
|
+
}
|
|
942
|
+
// TODO: maybe handle the tracing status?
|
|
943
|
+
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
}
|
|
763
947
|
if (w.disable_wal) {
|
|
764
948
|
has_unpersisted_data_.store(true, std::memory_order_relaxed);
|
|
765
949
|
}
|
|
@@ -1005,19 +1189,6 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1005
1189
|
WriteThread::WriteGroup write_group;
|
|
1006
1190
|
uint64_t last_sequence;
|
|
1007
1191
|
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
|
|
1008
|
-
// Note: no need to update last_batch_group_size_ here since the batch writes
|
|
1009
|
-
// to WAL only
|
|
1010
|
-
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
1011
|
-
// grabs but does not seem thread-safe.
|
|
1012
|
-
if (tracer_) {
|
|
1013
|
-
InstrumentedMutexLock lock(&trace_mutex_);
|
|
1014
|
-
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
1015
|
-
for (auto* writer : write_group) {
|
|
1016
|
-
// TODO: maybe handle the tracing status?
|
|
1017
|
-
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
1018
|
-
}
|
|
1019
|
-
}
|
|
1020
|
-
}
|
|
1021
1192
|
|
|
1022
1193
|
size_t pre_release_callback_cnt = 0;
|
|
1023
1194
|
size_t total_byte_size = 0;
|
|
@@ -1032,6 +1203,23 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1032
1203
|
}
|
|
1033
1204
|
}
|
|
1034
1205
|
|
|
1206
|
+
// Note: no need to update last_batch_group_size_ here since the batch writes
|
|
1207
|
+
// to WAL only
|
|
1208
|
+
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
1209
|
+
// grabs but does not seem thread-safe.
|
|
1210
|
+
if (tracer_) {
|
|
1211
|
+
InstrumentedMutexLock lock(&trace_mutex_);
|
|
1212
|
+
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
1213
|
+
for (auto* writer : write_group) {
|
|
1214
|
+
if (writer->CallbackFailed()) {
|
|
1215
|
+
continue;
|
|
1216
|
+
}
|
|
1217
|
+
// TODO: maybe handle the tracing status?
|
|
1218
|
+
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
1219
|
+
}
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1035
1223
|
const bool concurrent_update = true;
|
|
1036
1224
|
// Update stats while we are an exclusive group leader, so we know
|
|
1037
1225
|
// that nobody else can be writing to these particular stats.
|
|
@@ -1201,7 +1389,6 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
|
|
|
1201
1389
|
if (!status.ok()) {
|
|
1202
1390
|
mutex_.Lock();
|
|
1203
1391
|
assert(!error_handler_.IsBGWorkStopped());
|
|
1204
|
-
// Maybe change the return status to void?
|
|
1205
1392
|
error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
|
|
1206
1393
|
mutex_.Unlock();
|
|
1207
1394
|
}
|
|
@@ -1601,6 +1788,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
|
|
|
1601
1788
|
Status DBImpl::WriteRecoverableState() {
|
|
1602
1789
|
mutex_.AssertHeld();
|
|
1603
1790
|
if (!cached_recoverable_state_empty_) {
|
|
1791
|
+
// Only for write-prepared and write-unprepared.
|
|
1792
|
+
assert(seq_per_batch_);
|
|
1604
1793
|
bool dont_care_bool;
|
|
1605
1794
|
SequenceNumber next_seq;
|
|
1606
1795
|
if (two_write_queues_) {
|
|
@@ -2193,16 +2382,13 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
|
|
|
2193
2382
|
mutex_.Lock();
|
|
2194
2383
|
}
|
|
2195
2384
|
|
|
2196
|
-
|
|
2197
|
-
|
|
2198
|
-
|
|
2199
|
-
// two_write_queues_ is true (This is to simplify the reasoning.)
|
|
2200
|
-
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
2385
|
+
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
2386
|
+
ReadOnlyMemTable* new_imm,
|
|
2387
|
+
SequenceNumber last_seqno) {
|
|
2201
2388
|
mutex_.AssertHeld();
|
|
2202
2389
|
assert(lock_wal_count_ == 0);
|
|
2203
2390
|
|
|
2204
2391
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
2205
|
-
const ReadOptions read_options;
|
|
2206
2392
|
const WriteOptions write_options;
|
|
2207
2393
|
|
|
2208
2394
|
log::Writer* new_log = nullptr;
|
|
@@ -2238,12 +2424,13 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2238
2424
|
const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
|
|
2239
2425
|
|
|
2240
2426
|
// Set memtable_info for memtable sealed callback
|
|
2427
|
+
// TODO: memtable_info for `new_imm`
|
|
2241
2428
|
MemTableInfo memtable_info;
|
|
2242
2429
|
memtable_info.cf_name = cfd->GetName();
|
|
2243
2430
|
memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
|
|
2244
2431
|
memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
|
|
2245
|
-
memtable_info.num_entries = cfd->mem()->
|
|
2246
|
-
memtable_info.num_deletes = cfd->mem()->
|
|
2432
|
+
memtable_info.num_entries = cfd->mem()->NumEntries();
|
|
2433
|
+
memtable_info.num_deletes = cfd->mem()->NumDeletion();
|
|
2247
2434
|
if (!cfd->ioptions()->persist_user_defined_timestamps &&
|
|
2248
2435
|
cfd->user_comparator()->timestamp_size() > 0) {
|
|
2249
2436
|
const Slice& newest_udt = cfd->mem()->GetNewestUDT();
|
|
@@ -2265,8 +2452,20 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2265
2452
|
}
|
|
2266
2453
|
}
|
|
2267
2454
|
if (s.ok()) {
|
|
2268
|
-
|
|
2269
|
-
|
|
2455
|
+
// FIXME: from the comment for GetEarliestSequenceNumber(), any key with
|
|
2456
|
+
// seqno >= earliest_seqno should be in this or later memtable. This means
|
|
2457
|
+
// we should use LastSequence() + 1 or last_seqno + 1 here. And it needs to
|
|
2458
|
+
// be incremented with file ingestion and other operations that consume
|
|
2459
|
+
// sequence number.
|
|
2460
|
+
SequenceNumber seq;
|
|
2461
|
+
if (new_imm) {
|
|
2462
|
+
assert(last_seqno > versions_->LastSequence());
|
|
2463
|
+
seq = last_seqno;
|
|
2464
|
+
} else {
|
|
2465
|
+
seq = versions_->LastSequence();
|
|
2466
|
+
}
|
|
2467
|
+
new_mem =
|
|
2468
|
+
cfd->ConstructNewMemtable(mutable_cf_options, /*earliest_seq=*/seq);
|
|
2270
2469
|
context->superversion_context.NewSuperVersion();
|
|
2271
2470
|
|
|
2272
2471
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
@@ -2348,6 +2547,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2348
2547
|
versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
|
|
2349
2548
|
if (min_wal_number_to_keep >
|
|
2350
2549
|
versions_->GetWalSet().GetMinWalNumberToKeep()) {
|
|
2550
|
+
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
2551
|
+
const ReadOptions read_options;
|
|
2351
2552
|
// Get a snapshot of the empty column families.
|
|
2352
2553
|
// LogAndApply may release and reacquire db
|
|
2353
2554
|
// mutex, during that period, column family may become empty (e.g. its
|
|
@@ -2405,6 +2606,18 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2405
2606
|
cfd->mem()->SetNextLogNumber(logfile_number_);
|
|
2406
2607
|
assert(new_mem != nullptr);
|
|
2407
2608
|
cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
|
|
2609
|
+
if (new_imm) {
|
|
2610
|
+
// Need to assign memtable id here before SetMemtable() below assigns id to
|
|
2611
|
+
// the new live memtable
|
|
2612
|
+
cfd->AssignMemtableID(new_imm);
|
|
2613
|
+
// NOTE: new_imm and cfd->mem() reference the same WAL and have the same
|
|
2614
|
+
// NextLogNumber(). They should be flushed together. For non-atomic-flush,
|
|
2615
|
+
// we always try to flush all immutable memtables. For atomic flush, these
|
|
2616
|
+
// two memtables will be marked eligible for flush in the same call to
|
|
2617
|
+
// AssignAtomicFlushSeq().
|
|
2618
|
+
new_imm->SetNextLogNumber(logfile_number_);
|
|
2619
|
+
cfd->imm()->Add(new_imm, &context->memtables_to_free_);
|
|
2620
|
+
}
|
|
2408
2621
|
new_mem->Ref();
|
|
2409
2622
|
cfd->SetMemtable(new_mem);
|
|
2410
2623
|
InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
|
|
@@ -2417,6 +2630,9 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2417
2630
|
// that is okay. If we did, it most likely means that s was already an error.
|
|
2418
2631
|
// In any case, ignore any unchecked error for io_s here.
|
|
2419
2632
|
io_s.PermitUncheckedError();
|
|
2633
|
+
// We guarantee that if a non-ok status is returned, `new_imm` was not added
|
|
2634
|
+
// to the db.
|
|
2635
|
+
assert(s.ok());
|
|
2420
2636
|
return s;
|
|
2421
2637
|
}
|
|
2422
2638
|
|