@nxtedition/rocksdb 7.0.27 → 7.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/binding.cc +170 -30
  2. package/chained-batch.js +1 -1
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -0
  4. package/deps/rocksdb/rocksdb/Makefile +3 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +10 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +17 -7
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  9. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +117 -0
  10. package/deps/rocksdb/rocksdb/cache/charged_cache.h +121 -0
  11. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +270 -180
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.h +412 -124
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +1 -0
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +1 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +2 -2
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +71 -9
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +11 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +21 -14
  21. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +68 -7
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +16 -0
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +519 -12
  24. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +120 -0
  25. package/deps/rocksdb/rocksdb/db/builder.cc +15 -5
  26. package/deps/rocksdb/rocksdb/db/builder.h +3 -0
  27. package/deps/rocksdb/rocksdb/db/c.cc +18 -0
  28. package/deps/rocksdb/rocksdb/db/c_test.c +18 -0
  29. package/deps/rocksdb/rocksdb/db/column_family.h +2 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +3 -2
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +9 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +15 -10
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +36 -34
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +50 -13
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +12 -0
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +13 -17
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +26 -9
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +0 -11
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +93 -0
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +3 -8
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +17 -5
  46. package/deps/rocksdb/rocksdb/db/db_test.cc +0 -3
  47. package/deps/rocksdb/rocksdb/db/db_test2.cc +39 -12
  48. package/deps/rocksdb/rocksdb/db/db_test_util.cc +9 -0
  49. package/deps/rocksdb/rocksdb/db/db_test_util.h +2 -0
  50. package/deps/rocksdb/rocksdb/db/dbformat.cc +0 -38
  51. package/deps/rocksdb/rocksdb/db/dbformat.h +14 -13
  52. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +5 -2
  53. package/deps/rocksdb/rocksdb/db/event_helpers.cc +13 -1
  54. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +0 -10
  55. package/deps/rocksdb/rocksdb/db/flush_job.cc +19 -15
  56. package/deps/rocksdb/rocksdb/db/flush_job.h +7 -0
  57. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +21 -15
  58. package/deps/rocksdb/rocksdb/db/forward_iterator.h +4 -3
  59. package/deps/rocksdb/rocksdb/db/memtable_list.cc +9 -0
  60. package/deps/rocksdb/rocksdb/db/memtable_list.h +5 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +53 -12
  62. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +14 -2
  63. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +10 -10
  64. package/deps/rocksdb/rocksdb/db/repair.cc +8 -6
  65. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +890 -0
  66. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +324 -0
  67. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +186 -0
  68. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -0
  69. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -4
  70. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -2
  71. package/deps/rocksdb/rocksdb/env/env_test.cc +74 -1
  72. package/deps/rocksdb/rocksdb/env/io_posix.cc +11 -8
  73. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +28 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +14 -1
  75. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +4 -4
  76. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +30 -23
  77. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +1 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +3 -13
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  80. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +1 -2
  81. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  83. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  84. package/deps/rocksdb/rocksdb/options/cf_options.cc +14 -1
  85. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  86. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -56
  87. package/deps/rocksdb/rocksdb/options/db_options.cc +4 -5
  88. package/deps/rocksdb/rocksdb/options/options.cc +11 -1
  89. package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -0
  90. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  91. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -0
  92. package/deps/rocksdb/rocksdb/options/options_test.cc +4 -0
  93. package/deps/rocksdb/rocksdb/src.mk +3 -0
  94. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +6 -1
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +4 -0
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +36 -3
  97. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +36 -1
  98. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +14 -3
  99. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  100. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +6 -0
  101. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +5 -0
  102. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +3 -0
  103. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -7
  104. package/deps/rocksdb/rocksdb/table/table_builder.h +7 -3
  105. package/deps/rocksdb/rocksdb/table/table_properties.cc +9 -0
  106. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +3 -2
  107. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +58 -30
  108. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +1 -0
  109. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +20 -0
  110. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +29 -154
  111. package/deps/rocksdb/rocksdb/util/rate_limiter.h +16 -34
  112. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +0 -92
  113. package/deps/rocksdb/rocksdb/util/timer.h +6 -0
  114. package/deps/rocksdb/rocksdb/util/vector_iterator.h +4 -3
  115. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -45
  116. package/deps/rocksdb/rocksdb/utilities/debug.cc +40 -0
  117. package/deps/rocksdb/rocksdb.gyp +2 -0
  118. package/index.js +19 -2
  119. package/package.json +1 -1
  120. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  121. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -1432,6 +1432,126 @@ TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
1432
1432
  SyncPoint::GetInstance()->ClearAllCallBacks();
1433
1433
  }
1434
1434
 
1435
+ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
1436
+ Options options = GetDefaultOptions();
1437
+
1438
+ LRUCacheOptions co;
1439
+ co.capacity = 1 << 25;
1440
+ co.num_shard_bits = 2;
1441
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
1442
+ auto backing_cache = NewLRUCache(co);
1443
+
1444
+ options.blob_cache = backing_cache;
1445
+
1446
+ BlockBasedTableOptions block_based_options;
1447
+ block_based_options.no_block_cache = false;
1448
+ block_based_options.block_cache = backing_cache;
1449
+ block_based_options.cache_index_and_filter_blocks = true;
1450
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
1451
+
1452
+ options.enable_blob_files = true;
1453
+ options.create_if_missing = true;
1454
+ options.disable_auto_compactions = true;
1455
+ options.enable_blob_garbage_collection = true;
1456
+ options.blob_garbage_collection_age_cutoff = 1.0;
1457
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
1458
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
1459
+
1460
+ DestroyAndReopen(options);
1461
+
1462
+ constexpr size_t kNumBlobs = 10;
1463
+ constexpr size_t kValueSize = 100;
1464
+
1465
+ std::string value(kValueSize, 'a');
1466
+
1467
+ for (size_t i = 1; i <= kNumBlobs; i++) {
1468
+ ASSERT_OK(Put(std::to_string(i), value));
1469
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
1470
+ ASSERT_OK(Flush());
1471
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
1472
+ ASSERT_EQ(value, Get(std::to_string(i)));
1473
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
1474
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS));
1475
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT));
1476
+ }
1477
+
1478
+ // Verify compaction not counted
1479
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
1480
+ /*end=*/nullptr));
1481
+ EXPECT_EQ(kNumBlobs * 2,
1482
+ options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
1483
+ }
1484
+
1485
+ #ifndef ROCKSDB_LITE
1486
+ TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
1487
+ Options options = GetDefaultOptions();
1488
+
1489
+ LRUCacheOptions co;
1490
+ co.capacity = 1 << 25;
1491
+ co.num_shard_bits = 2;
1492
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
1493
+ auto backing_cache = NewLRUCache(co);
1494
+
1495
+ options.blob_cache = backing_cache;
1496
+
1497
+ BlockBasedTableOptions block_based_options;
1498
+ block_based_options.no_block_cache = false;
1499
+ block_based_options.block_cache = backing_cache;
1500
+ block_based_options.cache_index_and_filter_blocks = true;
1501
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
1502
+
1503
+ options.enable_blob_files = true;
1504
+ options.create_if_missing = true;
1505
+ options.disable_auto_compactions = true;
1506
+ options.enable_blob_garbage_collection = true;
1507
+ options.blob_garbage_collection_age_cutoff = 1.0;
1508
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
1509
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
1510
+
1511
+ DestroyAndReopen(options);
1512
+
1513
+ constexpr size_t kNumBlobs = 10;
1514
+ constexpr size_t kValueSize = 100;
1515
+
1516
+ std::string value(kValueSize, 'a');
1517
+
1518
+ for (size_t i = 1; i <= 5; i++) {
1519
+ ASSERT_OK(Put(std::to_string(i), value));
1520
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
1521
+ ASSERT_OK(Flush());
1522
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
1523
+
1524
+ ASSERT_EQ(value, Get(std::to_string(i)));
1525
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
1526
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
1527
+ ASSERT_EQ(0,
1528
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
1529
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
1530
+ }
1531
+
1532
+ ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}}));
1533
+
1534
+ for (size_t i = 6; i <= kNumBlobs; i++) {
1535
+ ASSERT_OK(Put(std::to_string(i), value));
1536
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
1537
+ ASSERT_OK(Flush());
1538
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
1539
+
1540
+ ASSERT_EQ(value, Get(std::to_string(i)));
1541
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
1542
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
1543
+ ASSERT_EQ(2,
1544
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
1545
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
1546
+ }
1547
+
1548
+ // Verify compaction not counted
1549
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
1550
+ /*end=*/nullptr));
1551
+ EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
1552
+ }
1553
+ #endif // !ROCKSDB_LITE
1554
+
1435
1555
  } // namespace ROCKSDB_NAMESPACE
1436
1556
 
1437
1557
  int main(int argc, char** argv) {
@@ -66,7 +66,8 @@ Status BuildTable(
66
66
  SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
67
67
  bool paranoid_file_checks, InternalStats* internal_stats,
68
68
  IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
69
- BlobFileCreationReason blob_creation_reason, EventLogger* event_logger,
69
+ BlobFileCreationReason blob_creation_reason,
70
+ const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
70
71
  int job_id, const Env::IOPriority io_priority,
71
72
  TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
72
73
  const std::string* full_history_ts_low,
@@ -187,10 +188,10 @@ Status BuildTable(
187
188
  blob_file_additions)
188
189
  ? new BlobFileBuilder(
189
190
  versions, fs, &ioptions, &mutable_cf_options, &file_options,
190
- job_id, tboptions.column_family_id,
191
- tboptions.column_family_name, io_priority, write_hint,
192
- io_tracer, blob_callback, blob_creation_reason,
193
- &blob_file_paths, blob_file_additions)
191
+ tboptions.db_id, tboptions.db_session_id, job_id,
192
+ tboptions.column_family_id, tboptions.column_family_name,
193
+ io_priority, write_hint, io_tracer, blob_callback,
194
+ blob_creation_reason, &blob_file_paths, blob_file_additions)
194
195
  : nullptr);
195
196
 
196
197
  const std::atomic<bool> kManualCompactionCanceledFalse{false};
@@ -260,6 +261,15 @@ Status BuildTable(
260
261
  if (!s.ok() || empty) {
261
262
  builder->Abandon();
262
263
  } else {
264
+ std::string seqno_time_mapping_str;
265
+ seqno_to_time_mapping.Encode(
266
+ seqno_time_mapping_str, meta->fd.smallest_seqno,
267
+ meta->fd.largest_seqno, meta->file_creation_time);
268
+ builder->SetSeqnoTimeTableProperties(
269
+ seqno_time_mapping_str,
270
+ ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO
271
+ ? meta->file_creation_time
272
+ : meta->oldest_ancester_time);
263
273
  s = builder->Finish();
264
274
  }
265
275
  if (io_status->ok()) {
@@ -9,7 +9,9 @@
9
9
  #include <string>
10
10
  #include <utility>
11
11
  #include <vector>
12
+
12
13
  #include "db/range_tombstone_fragmenter.h"
14
+ #include "db/seqno_to_time_mapping.h"
13
15
  #include "db/table_properties_collector.h"
14
16
  #include "logging/event_logger.h"
15
17
  #include "options/cf_options.h"
@@ -61,6 +63,7 @@ extern Status BuildTable(
61
63
  bool paranoid_file_checks, InternalStats* internal_stats,
62
64
  IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
63
65
  BlobFileCreationReason blob_creation_reason,
66
+ const SeqnoToTimeMapping& seqno_to_time_mapping,
64
67
  EventLogger* event_logger = nullptr, int job_id = 0,
65
68
  const Env::IOPriority io_priority = Env::IO_HIGH,
66
69
  TableProperties* table_properties = nullptr,
@@ -99,6 +99,7 @@ using ROCKSDB_NAMESPACE::Options;
99
99
  using ROCKSDB_NAMESPACE::PerfContext;
100
100
  using ROCKSDB_NAMESPACE::PerfLevel;
101
101
  using ROCKSDB_NAMESPACE::PinnableSlice;
102
+ using ROCKSDB_NAMESPACE::PrepopulateBlobCache;
102
103
  using ROCKSDB_NAMESPACE::RandomAccessFile;
103
104
  using ROCKSDB_NAMESPACE::Range;
104
105
  using ROCKSDB_NAMESPACE::RateLimiter;
@@ -3140,6 +3141,14 @@ void rocksdb_options_set_blob_cache(rocksdb_options_t* opt,
3140
3141
  opt->rep.blob_cache = blob_cache->rep;
3141
3142
  }
3142
3143
 
3144
+ void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) {
3145
+ opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t);
3146
+ }
3147
+
3148
+ int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) {
3149
+ return static_cast<int>(opt->rep.prepopulate_blob_cache);
3150
+ }
3151
+
3143
3152
  void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
3144
3153
  opt->rep.num_levels = n;
3145
3154
  }
@@ -4899,6 +4908,15 @@ void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer,
4899
4908
  SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen)));
4900
4909
  }
4901
4910
 
4911
+ void rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer,
4912
+ const char* begin_key,
4913
+ size_t begin_keylen,
4914
+ const char* end_key, size_t end_keylen,
4915
+ char** errptr) {
4916
+ SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen),
4917
+ Slice(end_key, end_keylen)));
4918
+ }
4919
+
4902
4920
  void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
4903
4921
  char** errptr) {
4904
4922
  SaveError(errptr, writer->rep->Finish(nullptr));
@@ -905,6 +905,21 @@ int main(int argc, char** argv) {
905
905
  CheckGet(db, roptions, "sstk22", "v5");
906
906
  CheckGet(db, roptions, "sstk3", "v6");
907
907
 
908
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
909
+ CheckNoError(err);
910
+ rocksdb_sstfilewriter_put(writer, "abc1", 4, "v7", 2, &err);
911
+ CheckNoError(err);
912
+ rocksdb_sstfilewriter_put(writer, "abc2", 4, "v8", 2, &err);
913
+ CheckNoError(err);
914
+ rocksdb_sstfilewriter_put(writer, "abc3", 4, "v9", 2, &err);
915
+ CheckNoError(err);
916
+ rocksdb_sstfilewriter_put(writer, "abc4", 4, "v10", 3, &err);
917
+ CheckNoError(err);
918
+ rocksdb_sstfilewriter_delete_range(writer, "abc1", 4, "abc4", 4, &err);
919
+ CheckNoError(err);
920
+ rocksdb_sstfilewriter_finish(writer, &err);
921
+ CheckNoError(err);
922
+
908
923
  rocksdb_ingestexternalfileoptions_destroy(ing_opt);
909
924
  rocksdb_sstfilewriter_destroy(writer);
910
925
  rocksdb_options_destroy(io_options);
@@ -2053,6 +2068,9 @@ int main(int argc, char** argv) {
2053
2068
  rocksdb_options_set_blob_file_starting_level(o, 5);
2054
2069
  CheckCondition(5 == rocksdb_options_get_blob_file_starting_level(o));
2055
2070
 
2071
+ rocksdb_options_set_prepopulate_blob_cache(o, 1 /* flush only */);
2072
+ CheckCondition(1 == rocksdb_options_get_prepopulate_blob_cache(o));
2073
+
2056
2074
  // Create a copy that should be equal to the original.
2057
2075
  rocksdb_options_t* copy;
2058
2076
  copy = rocksdb_options_create_copy(o);
@@ -524,6 +524,8 @@ class ColumnFamilyData {
524
524
  return file_metadata_cache_res_mgr_;
525
525
  }
526
526
 
527
+ SequenceNumber GetFirstMemtableSequenceNumber() const;
528
+
527
529
  static const uint32_t kDummyColumnFamilyDataId;
528
530
 
529
531
  // Keep track of whether the mempurge feature was ever used.
@@ -7,6 +7,7 @@
7
7
 
8
8
  #include <cassert>
9
9
 
10
+ #include "rocksdb/comparator.h"
10
11
  #include "table/internal_iterator.h"
11
12
 
12
13
  namespace ROCKSDB_NAMESPACE {
@@ -19,7 +20,7 @@ namespace ROCKSDB_NAMESPACE {
19
20
  class ClippingIterator : public InternalIterator {
20
21
  public:
21
22
  ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
22
- const Comparator* cmp)
23
+ const CompareInterface* cmp)
23
24
  : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
24
25
  assert(iter_);
25
26
  assert(cmp_);
@@ -268,7 +269,7 @@ class ClippingIterator : public InternalIterator {
268
269
  InternalIterator* iter_;
269
270
  const Slice* start_;
270
271
  const Slice* end_;
271
- const Comparator* cmp_;
272
+ const CompareInterface* cmp_;
272
273
  bool valid_;
273
274
  };
274
275
 
@@ -440,6 +440,11 @@ bool Compaction::IsTrivialMove() const {
440
440
  }
441
441
  }
442
442
 
443
+ // PerKeyPlacement compaction should never be trivial move.
444
+ if (SupportsPerKeyPlacement()) {
445
+ return false;
446
+ }
447
+
443
448
  return true;
444
449
  }
445
450
 
@@ -741,10 +746,10 @@ int Compaction::EvaluatePenultimateLevel(
741
746
  return kInvalidLevel;
742
747
  }
743
748
 
744
- // TODO: will add public like `options.preclude_last_level_data_seconds` for
745
- // per_key_placement feature, will check that option here. Currently, only
746
- // set by unittest
747
- bool supports_per_key_placement = false;
749
+ bool supports_per_key_placement =
750
+ immutable_options.preclude_last_level_data_seconds > 0;
751
+
752
+ // it could be overridden by unittest
748
753
  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
749
754
  &supports_per_key_placement);
750
755
  if (!supports_per_key_placement) {
@@ -33,7 +33,8 @@ CompactionIterator::CompactionIterator(
33
33
  const Compaction* compaction, const CompactionFilter* compaction_filter,
34
34
  const std::atomic<bool>* shutting_down,
35
35
  const std::shared_ptr<Logger> info_log,
36
- const std::string* full_history_ts_low)
36
+ const std::string* full_history_ts_low,
37
+ const SequenceNumber penultimate_level_cutoff_seqno)
37
38
  : CompactionIterator(
38
39
  input, cmp, merge_helper, last_sequence, snapshots,
39
40
  earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
@@ -42,7 +43,8 @@ CompactionIterator::CompactionIterator(
42
43
  manual_compaction_canceled,
43
44
  std::unique_ptr<CompactionProxy>(
44
45
  compaction ? new RealCompaction(compaction) : nullptr),
45
- compaction_filter, shutting_down, info_log, full_history_ts_low) {}
46
+ compaction_filter, shutting_down, info_log, full_history_ts_low,
47
+ penultimate_level_cutoff_seqno) {}
46
48
 
47
49
  CompactionIterator::CompactionIterator(
48
50
  InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@@ -58,7 +60,8 @@ CompactionIterator::CompactionIterator(
58
60
  const CompactionFilter* compaction_filter,
59
61
  const std::atomic<bool>* shutting_down,
60
62
  const std::shared_ptr<Logger> info_log,
61
- const std::string* full_history_ts_low)
63
+ const std::string* full_history_ts_low,
64
+ const SequenceNumber penultimate_level_cutoff_seqno)
62
65
  : input_(input, cmp,
63
66
  !compaction || compaction->DoesInputReferenceBlobFiles()),
64
67
  cmp_(cmp),
@@ -92,7 +95,8 @@ CompactionIterator::CompactionIterator(
92
95
  CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
93
96
  current_key_committed_(false),
94
97
  cmp_with_history_ts_low_(0),
95
- level_(compaction_ == nullptr ? 0 : compaction_->level()) {
98
+ level_(compaction_ == nullptr ? 0 : compaction_->level()),
99
+ penultimate_level_cutoff_seqno_(penultimate_level_cutoff_seqno) {
96
100
  assert(snapshots_ != nullptr);
97
101
  bottommost_level_ = compaction_ == nullptr
98
102
  ? false
@@ -1077,8 +1081,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
1077
1081
 
1078
1082
  void CompactionIterator::DecideOutputLevel() {
1079
1083
  #ifndef NDEBUG
1080
- // TODO: will be set by sequence number or key range, for now, it will only be
1081
- // set by unittest
1084
+ // Could be overridden by unittest
1082
1085
  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
1083
1086
  ikey_.sequence);
1084
1087
  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
@@ -1086,9 +1089,10 @@ void CompactionIterator::DecideOutputLevel() {
1086
1089
  output_to_penultimate_level_ = context.output_to_penultimate_level;
1087
1090
  #endif /* !NDEBUG */
1088
1091
 
1089
- // if the key is within the earliest snapshot, it has to output to the
1090
- // penultimate level.
1091
- if (ikey_.sequence > earliest_snapshot_) {
1092
+ // if the key is newer than the cutoff sequence or within the earliest
1093
+ // snapshot, it should output to the penultimate level.
1094
+ if (ikey_.sequence > penultimate_level_cutoff_seqno_ ||
1095
+ ikey_.sequence > earliest_snapshot_) {
1092
1096
  output_to_penultimate_level_ = true;
1093
1097
  }
1094
1098
 
@@ -1148,7 +1152,8 @@ void CompactionIterator::PrepareOutput() {
1148
1152
  !compaction_->allow_ingest_behind() && bottommost_level_ &&
1149
1153
  DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
1150
1154
  ikey_.type != kTypeMerge && current_key_committed_ &&
1151
- !output_to_penultimate_level_) {
1155
+ !output_to_penultimate_level_ &&
1156
+ ikey_.sequence < penultimate_level_cutoff_seqno_) {
1152
1157
  if (ikey_.type == kTypeDeletion ||
1153
1158
  (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
1154
1159
  ROCKS_LOG_FATAL(
@@ -181,42 +181,40 @@ class CompactionIterator {
181
181
  const Compaction* compaction_;
182
182
  };
183
183
 
184
- CompactionIterator(InternalIterator* input, const Comparator* cmp,
185
- MergeHelper* merge_helper, SequenceNumber last_sequence,
186
- std::vector<SequenceNumber>* snapshots,
187
- SequenceNumber earliest_write_conflict_snapshot,
188
- SequenceNumber job_snapshot,
189
- const SnapshotChecker* snapshot_checker, Env* env,
190
- bool report_detailed_time, bool expect_valid_internal_key,
191
- CompactionRangeDelAggregator* range_del_agg,
192
- BlobFileBuilder* blob_file_builder,
193
- bool allow_data_in_errors,
194
- bool enforce_single_del_contracts,
195
- const std::atomic<bool>& manual_compaction_canceled,
196
- const Compaction* compaction = nullptr,
197
- const CompactionFilter* compaction_filter = nullptr,
198
- const std::atomic<bool>* shutting_down = nullptr,
199
- const std::shared_ptr<Logger> info_log = nullptr,
200
- const std::string* full_history_ts_low = nullptr);
184
+ CompactionIterator(
185
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
186
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
187
+ SequenceNumber earliest_write_conflict_snapshot,
188
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
189
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
190
+ CompactionRangeDelAggregator* range_del_agg,
191
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
192
+ bool enforce_single_del_contracts,
193
+ const std::atomic<bool>& manual_compaction_canceled,
194
+ const Compaction* compaction = nullptr,
195
+ const CompactionFilter* compaction_filter = nullptr,
196
+ const std::atomic<bool>* shutting_down = nullptr,
197
+ const std::shared_ptr<Logger> info_log = nullptr,
198
+ const std::string* full_history_ts_low = nullptr,
199
+ const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
201
200
 
202
201
  // Constructor with custom CompactionProxy, used for tests.
203
- CompactionIterator(InternalIterator* input, const Comparator* cmp,
204
- MergeHelper* merge_helper, SequenceNumber last_sequence,
205
- std::vector<SequenceNumber>* snapshots,
206
- SequenceNumber earliest_write_conflict_snapshot,
207
- SequenceNumber job_snapshot,
208
- const SnapshotChecker* snapshot_checker, Env* env,
209
- bool report_detailed_time, bool expect_valid_internal_key,
210
- CompactionRangeDelAggregator* range_del_agg,
211
- BlobFileBuilder* blob_file_builder,
212
- bool allow_data_in_errors,
213
- bool enforce_single_del_contracts,
214
- const std::atomic<bool>& manual_compaction_canceled,
215
- std::unique_ptr<CompactionProxy> compaction,
216
- const CompactionFilter* compaction_filter = nullptr,
217
- const std::atomic<bool>* shutting_down = nullptr,
218
- const std::shared_ptr<Logger> info_log = nullptr,
219
- const std::string* full_history_ts_low = nullptr);
202
+ CompactionIterator(
203
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
204
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
205
+ SequenceNumber earliest_write_conflict_snapshot,
206
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
207
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
208
+ CompactionRangeDelAggregator* range_del_agg,
209
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
210
+ bool enforce_single_del_contracts,
211
+ const std::atomic<bool>& manual_compaction_canceled,
212
+ std::unique_ptr<CompactionProxy> compaction,
213
+ const CompactionFilter* compaction_filter = nullptr,
214
+ const std::atomic<bool>* shutting_down = nullptr,
215
+ const std::shared_ptr<Logger> info_log = nullptr,
216
+ const std::string* full_history_ts_low = nullptr,
217
+ const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
220
218
 
221
219
  ~CompactionIterator();
222
220
 
@@ -446,6 +444,10 @@ class CompactionIterator {
446
444
  // output to.
447
445
  bool output_to_penultimate_level_{false};
448
446
 
447
+ // any key later than this sequence number should have
448
+ // output_to_penultimate_level_ set to true
449
+ const SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
450
+
449
451
  void AdvanceInputIter() { input_.Next(); }
450
452
 
451
453
  void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
@@ -223,12 +223,12 @@ void CompactionJob::Prepare() {
223
223
 
224
224
  // Generate file_levels_ for compaction before making Iterator
225
225
  auto* c = compact_->compaction;
226
- assert(c->column_family_data() != nullptr);
227
- assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
226
+ ColumnFamilyData* cfd = c->column_family_data();
227
+ assert(cfd != nullptr);
228
+ assert(cfd->current()->storage_info()->NumLevelFiles(
228
229
  compact_->compaction->level()) > 0);
229
230
 
230
- write_hint_ =
231
- c->column_family_data()->CalculateSSTWriteHint(c->output_level());
231
+ write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
232
232
  bottommost_level_ = c->bottommost_level();
233
233
 
234
234
  if (c->ShouldFormSubcompactions()) {
@@ -251,6 +251,43 @@ void CompactionJob::Prepare() {
251
251
 
252
252
  compact_->sub_compact_states.emplace_back(c, start, end, /*sub_job_id*/ 0);
253
253
  }
254
+
255
+ if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
256
+ // TODO(zjay): move to a function
257
+ seqno_time_mapping_.SetMaxTimeDuration(
258
+ c->immutable_options()->preclude_last_level_data_seconds);
259
+ // setup seqno_time_mapping_
260
+ for (const auto& each_level : *c->inputs()) {
261
+ for (const auto& fmd : each_level.files) {
262
+ std::shared_ptr<const TableProperties> tp;
263
+ Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
264
+ if (s.ok()) {
265
+ seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
266
+ .PermitUncheckedError();
267
+ seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
268
+ fmd->oldest_ancester_time);
269
+ }
270
+ }
271
+ }
272
+
273
+ auto status = seqno_time_mapping_.Sort();
274
+ if (!status.ok()) {
275
+ ROCKS_LOG_WARN(db_options_.info_log,
276
+ "Invalid sequence number to time mapping: Status: %s",
277
+ status.ToString().c_str());
278
+ }
279
+ int64_t _current_time = 0;
280
+ status = db_options_.clock->GetCurrentTime(&_current_time);
281
+ if (!status.ok()) {
282
+ ROCKS_LOG_WARN(db_options_.info_log,
283
+ "Failed to get current time in compaction: Status: %s",
284
+ status.ToString().c_str());
285
+ penultimate_level_cutoff_seqno_ = 0;
286
+ } else {
287
+ penultimate_level_cutoff_seqno_ =
288
+ seqno_time_mapping_.TruncateOldEntries(_current_time);
289
+ }
290
+ }
254
291
  }
255
292
 
256
293
  struct RangeWithSize {
@@ -962,10 +999,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
962
999
  ? new BlobFileBuilder(
963
1000
  versions_, fs_.get(),
964
1001
  sub_compact->compaction->immutable_options(),
965
- mutable_cf_options, &file_options_, job_id_, cfd->GetID(),
966
- cfd->GetName(), Env::IOPriority::IO_LOW, write_hint_,
967
- io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction,
968
- &blob_file_paths,
1002
+ mutable_cf_options, &file_options_, db_id_, db_session_id_,
1003
+ job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
1004
+ write_hint_, io_tracer_, blob_callback_,
1005
+ BlobFileCreationReason::kCompaction, &blob_file_paths,
969
1006
  sub_compact->Current().GetBlobFileAdditionsPtr())
970
1007
  : nullptr);
971
1008
 
@@ -989,7 +1026,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
989
1026
  blob_file_builder.get(), db_options_.allow_data_in_errors,
990
1027
  db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
991
1028
  sub_compact->compaction, compaction_filter, shutting_down_,
992
- db_options_.info_log, full_history_ts_low);
1029
+ db_options_.info_log, full_history_ts_low,
1030
+ penultimate_level_cutoff_seqno_);
993
1031
  c_iter->SeekToFirst();
994
1032
 
995
1033
  // Assign range delete aggregator to the target output level, which makes sure
@@ -1253,7 +1291,7 @@ Status CompactionJob::FinishCompactionOutputFile(
1253
1291
 
1254
1292
  const uint64_t current_entries = outputs.NumEntries();
1255
1293
 
1256
- s = outputs.Finish(s);
1294
+ s = outputs.Finish(s, seqno_time_mapping_);
1257
1295
 
1258
1296
  if (s.ok()) {
1259
1297
  // With accurate smallest and largest key, we can get a slightly more
@@ -1617,9 +1655,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
1617
1655
  sub_compact->compaction->output_compression_opts(), cfd->GetID(),
1618
1656
  cfd->GetName(), sub_compact->compaction->output_level(),
1619
1657
  bottommost_level_, TableFileCreationReason::kCompaction,
1620
- oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_,
1621
- db_session_id_, sub_compact->compaction->max_output_file_size(),
1622
- file_number);
1658
+ 0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
1659
+ sub_compact->compaction->max_output_file_size(), file_number);
1623
1660
 
1624
1661
  outputs.NewBuilder(tboptions);
1625
1662
 
@@ -27,6 +27,7 @@
27
27
  #include "db/log_writer.h"
28
28
  #include "db/memtable_list.h"
29
29
  #include "db/range_del_aggregator.h"
30
+ #include "db/seqno_to_time_mapping.h"
30
31
  #include "db/version_edit.h"
31
32
  #include "db/write_controller.h"
32
33
  #include "db/write_thread.h"
@@ -299,6 +300,17 @@ class CompactionJob {
299
300
 
300
301
  uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
301
302
 
303
+ // Stores the sequence number to time mapping gathered from all input files
304
+ // it also collects the smallest_seqno -> oldest_ancester_time from the SST.
305
+ SeqnoToTimeMapping seqno_time_mapping_;
306
+
307
+ // cutoff sequence number for penultimate level, only set when
308
+ // per_key_placement feature is enabled.
309
+ // If a key with sequence number larger than penultimate_level_cutoff_seqno_,
310
+ // it will be placed on the penultimate_level and sequence number won't be
311
+ // zeroed out.
312
+ SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
313
+
302
314
  // Get table file name in where it's outputting to, which should also be in
303
315
  // `output_directory_`.
304
316
  virtual std::string GetTableFileName(uint64_t file_number);
@@ -18,12 +18,19 @@ void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
18
18
  builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
19
19
  }
20
20
 
21
- Status CompactionOutputs::Finish(const Status& intput_status) {
21
+ Status CompactionOutputs::Finish(const Status& intput_status,
22
+ const SeqnoToTimeMapping& seqno_time_mapping) {
22
23
  FileMetaData* meta = GetMetaData();
23
24
  assert(meta != nullptr);
24
25
  Status s = intput_status;
25
26
  if (s.ok()) {
27
+ std::string seqno_time_mapping_str;
28
+ seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno,
29
+ meta->fd.largest_seqno, meta->file_creation_time);
30
+ builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str,
31
+ meta->oldest_ancester_time);
26
32
  s = builder_->Finish();
33
+
27
34
  } else {
28
35
  builder_->Abandon();
29
36
  }
@@ -111,7 +111,8 @@ class CompactionOutputs {
111
111
  }
112
112
 
113
113
  // Finish the current output file
114
- Status Finish(const Status& intput_status);
114
+ Status Finish(const Status& intput_status,
115
+ const SeqnoToTimeMapping& seqno_time_mapping);
115
116
 
116
117
  // Update output table properties from table builder
117
118
  void UpdateTableProperties() {