duckdb 0.5.2-dev547.0 → 0.5.2-dev561.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.cpp CHANGED
@@ -1711,20 +1711,22 @@ unique_ptr<Key> Key::CreateKey(const char *value);
1711
1711
 
1712
1712
  namespace duckdb {
1713
1713
  class BlockHandle;
1714
+ class BlockManager;
1714
1715
  class BufferHandle;
1715
1716
  class DatabaseInstance;
1716
1717
 
1717
1718
  //! This struct is responsible for reading meta data from disk
1718
1719
  class MetaBlockReader : public Deserializer {
1719
1720
  public:
1720
- MetaBlockReader(DatabaseInstance &db, block_id_t block);
1721
+ MetaBlockReader(BlockManager &block_manager, block_id_t block, bool free_blocks_on_read = true);
1721
1722
  ~MetaBlockReader() override;
1722
1723
 
1723
- DatabaseInstance &db;
1724
+ BlockManager &block_manager;
1724
1725
  shared_ptr<BlockHandle> block;
1725
1726
  BufferHandle handle;
1726
1727
  idx_t offset;
1727
1728
  block_id_t next_block;
1729
+ bool free_blocks_on_read;
1728
1730
 
1729
1731
  public:
1730
1732
  //! Read content of size read_size into the buffer
@@ -2241,8 +2243,9 @@ enum VerifyExistenceType : uint8_t {
2241
2243
 
2242
2244
  class ART : public Index {
2243
2245
  public:
2244
- ART(const vector<column_t> &column_ids, const vector<unique_ptr<Expression>> &unbound_expressions,
2245
- IndexConstraintType constraint_type, DatabaseInstance &db, idx_t block_id = DConstants::INVALID_INDEX,
2246
+ ART(const vector<column_t> &column_ids, TableIOManager &table_io_manager,
2247
+ const vector<unique_ptr<Expression>> &unbound_expressions, IndexConstraintType constraint_type,
2248
+ DatabaseInstance &db, idx_t block_id = DConstants::INVALID_INDEX,
2246
2249
  idx_t block_offset = DConstants::INVALID_INDEX);
2247
2250
  ~ART() override;
2248
2251
 
@@ -2251,6 +2254,7 @@ public:
2251
2254
 
2252
2255
  DatabaseInstance &db;
2253
2256
 
2257
+ public:
2254
2258
  //! Initialize a scan on the index with the given expression and column ids
2255
2259
  //! to fetch from the base table for a single predicate
2256
2260
  unique_ptr<IndexScanState> InitializeScanSinglePredicate(Transaction &transaction, Value value,
@@ -3487,6 +3491,39 @@ protected:
3487
3491
 
3488
3492
 
3489
3493
 
3494
+ //===----------------------------------------------------------------------===//
3495
+ // DuckDB
3496
+ //
3497
+ // duckdb/storage/table_io_manager.hpp
3498
+ //
3499
+ //
3500
+ //===----------------------------------------------------------------------===//
3501
+
3502
+
3503
+
3504
+
3505
+
3506
+ namespace duckdb {
3507
+ class BlockManager;
3508
+ class DataTable;
3509
+
3510
+ class TableIOManager {
3511
+ public:
3512
+ virtual ~TableIOManager() {
3513
+ }
3514
+
3515
+ //! Obtains a reference to the TableIOManager of a specific table
3516
+ static TableIOManager &Get(DataTable &table);
3517
+
3518
+ //! The block manager used for managing index data
3519
+ virtual BlockManager &GetIndexBlockManager() = 0;
3520
+
3521
+ //! The block manager used for storing row group data
3522
+ virtual BlockManager &GetBlockManagerForRowData() = 0;
3523
+ };
3524
+
3525
+ } // namespace duckdb
3526
+
3490
3527
  //===----------------------------------------------------------------------===//
3491
3528
  // DuckDB
3492
3529
  //
@@ -3586,10 +3623,10 @@ class TransactionManager;
3586
3623
  //! server crashes or is shut down.
3587
3624
  class WriteAheadLog {
3588
3625
  public:
3589
- explicit WriteAheadLog(DatabaseInstance &database);
3626
+ //! Initialize the WAL in the specified directory
3627
+ explicit WriteAheadLog(DatabaseInstance &database, const string &path);
3628
+ virtual ~WriteAheadLog();
3590
3629
 
3591
- //! Whether or not the WAL has been initialized
3592
- bool initialized;
3593
3630
  //! Skip writing to the WAL
3594
3631
  bool skip_writing;
3595
3632
 
@@ -3597,14 +3634,12 @@ public:
3597
3634
  //! Replay the WAL
3598
3635
  static bool Replay(DatabaseInstance &database, string &path);
3599
3636
 
3600
- //! Initialize the WAL in the specified directory
3601
- void Initialize(string &path);
3602
3637
  //! Returns the current size of the WAL in bytes
3603
3638
  int64_t GetWALSize();
3604
3639
  //! Gets the total bytes written to the WAL since startup
3605
3640
  idx_t GetTotalWritten();
3606
3641
 
3607
- void WriteCreateTable(TableCatalogEntry *entry);
3642
+ virtual void WriteCreateTable(TableCatalogEntry *entry);
3608
3643
  void WriteDropTable(TableCatalogEntry *entry);
3609
3644
 
3610
3645
  void WriteCreateSchema(SchemaCatalogEntry *entry);
@@ -3650,7 +3685,7 @@ public:
3650
3685
 
3651
3686
  void WriteCheckpoint(block_id_t meta_block);
3652
3687
 
3653
- private:
3688
+ protected:
3654
3689
  DatabaseInstance &database;
3655
3690
  unique_ptr<BufferedFileWriter> writer;
3656
3691
  string wal_path;
@@ -3662,19 +3697,38 @@ private:
3662
3697
  namespace duckdb {
3663
3698
  class BlockManager;
3664
3699
  class Catalog;
3700
+ class CheckpointWriter;
3665
3701
  class DatabaseInstance;
3666
3702
  class TransactionManager;
3667
3703
  class TableCatalogEntry;
3668
3704
 
3705
+ struct DatabaseSize {
3706
+ idx_t total_blocks = 0;
3707
+ idx_t block_size = 0;
3708
+ idx_t free_blocks = 0;
3709
+ idx_t used_blocks = 0;
3710
+ idx_t bytes = 0;
3711
+ idx_t wal_size = 0;
3712
+ };
3713
+
3714
+ class StorageCommitState {
3715
+ public:
3716
+ // Destruction of this object, without prior call to FlushCommit,
3717
+ // will roll back the committed changes.
3718
+ virtual ~StorageCommitState() {
3719
+ }
3720
+
3721
+ // Make the commit persistent
3722
+ virtual void FlushCommit() = 0;
3723
+ };
3724
+
3669
3725
  //! StorageManager is responsible for managing the physical storage of the
3670
3726
  //! database on disk
3671
3727
  class StorageManager {
3672
3728
  public:
3673
3729
  StorageManager(DatabaseInstance &db, string path, bool read_only);
3674
- ~StorageManager();
3730
+ virtual ~StorageManager();
3675
3731
 
3676
- //! The BlockManager to read/store meta information and data in blocks
3677
- unique_ptr<BlockManager> block_manager;
3678
3732
  //! The BufferManager of the database
3679
3733
  unique_ptr<BufferManager> buffer_manager;
3680
3734
  //! The database this storagemanager belongs to
@@ -3686,35 +3740,62 @@ public:
3686
3740
 
3687
3741
  //! Initialize a database or load an existing database from the given path
3688
3742
  void Initialize();
3689
- //! Get the WAL of the StorageManager, returns nullptr if in-memory
3690
- WriteAheadLog *GetWriteAheadLog() {
3691
- return wal.initialized ? &wal : nullptr;
3692
- }
3693
3743
 
3694
3744
  DatabaseInstance &GetDatabase() {
3695
3745
  return db;
3696
3746
  }
3697
3747
 
3698
- void CreateCheckpoint(bool delete_wal = false, bool force_checkpoint = false);
3748
+ //! Get the WAL of the StorageManager, returns nullptr if in-memory
3749
+ WriteAheadLog *GetWriteAheadLog() {
3750
+ return wal.get();
3751
+ }
3699
3752
 
3700
3753
  string GetDBPath() {
3701
3754
  return path;
3702
3755
  }
3703
3756
  bool InMemory();
3704
3757
 
3705
- private:
3706
- //! Load the database from a directory
3707
- void LoadDatabase();
3758
+ virtual bool AutomaticCheckpoint(idx_t estimated_wal_bytes) = 0;
3759
+ virtual unique_ptr<StorageCommitState> GenStorageCommitState(Transaction &transaction, bool checkpoint) = 0;
3760
+ virtual bool IsCheckpointClean(block_id_t checkpoint_id) = 0;
3761
+ virtual void CreateCheckpoint(bool delete_wal = false, bool force_checkpoint = false) = 0;
3762
+ virtual DatabaseSize GetDatabaseSize() = 0;
3763
+ virtual shared_ptr<TableIOManager> GetTableIOManager(BoundCreateTableInfo *info) = 0;
3764
+
3765
+ protected:
3766
+ virtual void LoadDatabase() = 0;
3767
+ virtual void CreateBufferManager();
3708
3768
 
3709
3769
  //! The path of the database
3710
3770
  string path;
3711
3771
  //! The WriteAheadLog of the storage manager
3712
- WriteAheadLog wal;
3772
+ unique_ptr<WriteAheadLog> wal;
3713
3773
 
3714
3774
  //! Whether or not the database is opened in read-only mode
3715
3775
  bool read_only;
3716
3776
  };
3717
3777
 
3778
+ //! Stores database in a single file.
3779
+ class SingleFileStorageManager : public StorageManager {
3780
+ public:
3781
+ SingleFileStorageManager(DatabaseInstance &db, string path, bool read_only);
3782
+
3783
+ //! The BlockManager to read/store meta information and data in blocks
3784
+ unique_ptr<BlockManager> block_manager;
3785
+ //! TableIoManager
3786
+ unique_ptr<TableIOManager> table_io_manager;
3787
+
3788
+ public:
3789
+ bool AutomaticCheckpoint(idx_t estimated_wal_bytes) override;
3790
+ unique_ptr<StorageCommitState> GenStorageCommitState(Transaction &transaction, bool checkpoint) override;
3791
+ bool IsCheckpointClean(block_id_t checkpoint_id) override;
3792
+ void CreateCheckpoint(bool delete_wal, bool force_checkpoint) override;
3793
+ DatabaseSize GetDatabaseSize() override;
3794
+ shared_ptr<TableIOManager> GetTableIOManager(BoundCreateTableInfo *info) override;
3795
+
3796
+ protected:
3797
+ void LoadDatabase() override;
3798
+ };
3718
3799
  } // namespace duckdb
3719
3800
 
3720
3801
 
@@ -3766,13 +3847,14 @@ void AddDataTableIndex(DataTable *storage, vector<ColumnDefinition> &columns, ve
3766
3847
  bound_expressions.push_back(make_unique<BoundReferenceExpression>(columns[key].Type(), key_nr++));
3767
3848
  column_ids.push_back(column.StorageOid());
3768
3849
  }
3769
- // create an adaptive radix tree around the expressions
3770
3850
  unique_ptr<ART> art;
3851
+ // create an adaptive radix tree around the expressions
3771
3852
  if (index_block) {
3772
- art = make_unique<ART>(column_ids, move(unbound_expressions), constraint_type, storage->db,
3773
- index_block->block_id, index_block->offset);
3853
+ art = make_unique<ART>(column_ids, TableIOManager::Get(*storage), move(unbound_expressions), constraint_type,
3854
+ storage->db, index_block->block_id, index_block->offset);
3774
3855
  } else {
3775
- art = make_unique<ART>(column_ids, move(unbound_expressions), constraint_type, storage->db);
3856
+ art = make_unique<ART>(column_ids, TableIOManager::Get(*storage), move(unbound_expressions), constraint_type,
3857
+ storage->db);
3776
3858
  if (!storage->IsRoot()) {
3777
3859
  throw TransactionException("Transaction conflict: cannot add an index to a table that has been altered!");
3778
3860
  }
@@ -3810,7 +3892,10 @@ TableCatalogEntry::TableCatalogEntry(Catalog *catalog, SchemaCatalogEntry *schem
3810
3892
  }
3811
3893
  storage_columns.push_back(col_def.Copy());
3812
3894
  }
3813
- storage = make_shared<DataTable>(catalog->db, schema->name, name, move(storage_columns), move(info->data));
3895
+ storage =
3896
+ make_shared<DataTable>(catalog->db, StorageManager::GetStorageManager(catalog->db).GetTableIOManager(info),
3897
+ schema->name, name, move(storage_columns), move(info->data));
3898
+
3814
3899
  // create the unique indexes for the UNIQUE and PRIMARY KEY and FOREIGN KEY constraints
3815
3900
  idx_t indexes_idx = 0;
3816
3901
  for (idx_t i = 0; i < bound_constraints.size(); i++) {
@@ -58079,15 +58164,17 @@ ExpressionExecutorState::ExpressionExecutorState(const string &name) : profiler(
58079
58164
 
58080
58165
 
58081
58166
 
58167
+
58082
58168
  #include <algorithm>
58083
58169
  #include <cstring>
58084
58170
  #include <ctgmath>
58085
58171
 
58086
58172
  namespace duckdb {
58087
58173
 
58088
- ART::ART(const vector<column_t> &column_ids, const vector<unique_ptr<Expression>> &unbound_expressions,
58089
- IndexConstraintType constraint_type, DatabaseInstance &db, idx_t block_id, idx_t block_offset)
58090
- : Index(IndexType::ART, column_ids, unbound_expressions, constraint_type), db(db) {
58174
+ ART::ART(const vector<column_t> &column_ids, TableIOManager &table_io_manager,
58175
+ const vector<unique_ptr<Expression>> &unbound_expressions, IndexConstraintType constraint_type,
58176
+ DatabaseInstance &db, idx_t block_id, idx_t block_offset)
58177
+ : Index(IndexType::ART, table_io_manager, column_ids, unbound_expressions, constraint_type), db(db) {
58091
58178
  if (block_id != DConstants::INVALID_INDEX) {
58092
58179
  tree = Node::Deserialize(*this, block_id, block_offset);
58093
58180
  } else {
@@ -58377,7 +58464,8 @@ void ART::ConstructAndMerge(IndexLock &lock, PayloadScanner &scanner, Allocator
58377
58464
  payload_types.emplace_back(LogicalType::ROW_TYPE);
58378
58465
 
58379
58466
  auto skipped_all_nulls = false;
58380
- auto temp_art = make_unique<ART>(this->column_ids, this->unbound_expressions, this->constraint_type, this->db);
58467
+ auto temp_art = make_unique<ART>(this->column_ids, this->table_io_manager, this->unbound_expressions,
58468
+ this->constraint_type, this->db);
58381
58469
  for (;;) {
58382
58470
  DataChunk ordered_chunk;
58383
58471
  ordered_chunk.Initialize(allocator, payload_types);
@@ -58420,7 +58508,8 @@ void ART::ConstructAndMerge(IndexLock &lock, PayloadScanner &scanner, Allocator
58420
58508
  auto row_ids = FlatVector::GetData<row_t>(row_identifiers);
58421
58509
 
58422
58510
  // construct the ART of this chunk
58423
- auto art = make_unique<ART>(this->column_ids, this->unbound_expressions, this->constraint_type, this->db);
58511
+ auto art = make_unique<ART>(this->column_ids, this->table_io_manager, this->unbound_expressions,
58512
+ this->constraint_type, this->db);
58424
58513
  auto key_section = KeySection(start_idx, ordered_chunk.size() - 1, 0, 0);
58425
58514
  auto has_constraint = IsPrimary() || IsUnique();
58426
58515
  Construct(keys, row_ids, art->tree, key_section, has_constraint);
@@ -59404,8 +59493,7 @@ void Leaf::Remove(row_t row_id) {
59404
59493
  }
59405
59494
 
59406
59495
  BlockPointer Leaf::Serialize(duckdb::MetaBlockWriter &writer) {
59407
- auto block_id = writer.block->id;
59408
- uint32_t offset = writer.offset;
59496
+ auto ptr = writer.GetBlockPointer();
59409
59497
  // Write Node Type
59410
59498
  writer.Write(type);
59411
59499
  // Write compression Info
@@ -59417,7 +59505,7 @@ BlockPointer Leaf::Serialize(duckdb::MetaBlockWriter &writer) {
59417
59505
  for (idx_t i = 0; i < count; i++) {
59418
59506
  writer.Write(row_ids[i]);
59419
59507
  }
59420
- return {block_id, offset};
59508
+ return ptr;
59421
59509
  }
59422
59510
 
59423
59511
  Leaf *Leaf::Deserialize(MetaBlockReader &reader) {
@@ -59450,8 +59538,6 @@ void Leaf::Merge(bool &has_constraint, Node *&l_node, Node *&r_node) {
59450
59538
 
59451
59539
 
59452
59540
 
59453
-
59454
-
59455
59541
  //===----------------------------------------------------------------------===//
59456
59542
  // DuckDB
59457
59543
  //
@@ -59469,6 +59555,9 @@ using std::swap;
59469
59555
  }
59470
59556
 
59471
59557
 
59558
+
59559
+
59560
+
59472
59561
  namespace duckdb {
59473
59562
 
59474
59563
  InternalType::InternalType(Node *n) {
@@ -59602,8 +59691,7 @@ BlockPointer Node::SerializeInternal(ART &art, duckdb::MetaBlockWriter &writer,
59602
59691
  for (idx_t i = 0; i < internal_type.children_size; i++) {
59603
59692
  child_offsets.emplace_back(internal_type.children[i].Serialize(art, writer));
59604
59693
  }
59605
- auto block_id = writer.block->id;
59606
- uint32_t offset = writer.offset;
59694
+ auto ptr = writer.GetBlockPointer();
59607
59695
  // Write Node Type
59608
59696
  writer.Write(type);
59609
59697
  // Write count
@@ -59619,7 +59707,7 @@ BlockPointer Node::SerializeInternal(ART &art, duckdb::MetaBlockWriter &writer,
59619
59707
  writer.Write(offsets.block_id);
59620
59708
  writer.Write(offsets.offset);
59621
59709
  }
59622
- return {block_id, offset};
59710
+ return ptr;
59623
59711
  }
59624
59712
 
59625
59713
  BlockPointer Node::Serialize(ART &art, duckdb::MetaBlockWriter &writer) {
@@ -59655,7 +59743,7 @@ void Node::DeserializeInternal(duckdb::MetaBlockReader &reader) {
59655
59743
  }
59656
59744
 
59657
59745
  Node *Node::Deserialize(ART &art, idx_t block_id, idx_t offset) {
59658
- MetaBlockReader reader(art.db, block_id);
59746
+ MetaBlockReader reader(art.table_io_manager.GetIndexBlockManager(), block_id);
59659
59747
  reader.offset = offset;
59660
59748
  auto n = reader.Read<uint8_t>();
59661
59749
  NodeType node_type(static_cast<NodeType>(n));
@@ -79673,6 +79761,7 @@ public:
79673
79761
 
79674
79762
 
79675
79763
 
79764
+
79676
79765
  namespace duckdb {
79677
79766
 
79678
79767
  //===--------------------------------------------------------------------===//
@@ -79710,7 +79799,8 @@ unique_ptr<GlobalSinkState> PhysicalCreateIndex::GetGlobalSinkState(ClientContex
79710
79799
  // create the global index
79711
79800
  switch (info->index_type) {
79712
79801
  case IndexType::ART: {
79713
- state->global_index = make_unique<ART>(storage_ids, unbound_expressions, info->constraint_type, *context.db);
79802
+ state->global_index = make_unique<ART>(storage_ids, TableIOManager::Get(*table.storage), unbound_expressions,
79803
+ info->constraint_type, *context.db);
79714
79804
  break;
79715
79805
  }
79716
79806
  default:
@@ -79729,8 +79819,8 @@ unique_ptr<LocalSinkState> PhysicalCreateIndex::GetLocalSinkState(ExecutionConte
79729
79819
  // create the local index
79730
79820
  switch (info->index_type) {
79731
79821
  case IndexType::ART: {
79732
- state->local_index =
79733
- make_unique<ART>(storage_ids, unbound_expressions, info->constraint_type, *context.client.db);
79822
+ state->local_index = make_unique<ART>(storage_ids, TableIOManager::Get(*table.storage), unbound_expressions,
79823
+ info->constraint_type, *context.client.db);
79734
79824
  break;
79735
79825
  }
79736
79826
  default:
@@ -122487,32 +122577,18 @@ void PragmaDatabaseSizeFunction(ClientContext &context, TableFunctionInput &data
122487
122577
  return;
122488
122578
  }
122489
122579
  auto &storage = StorageManager::GetStorageManager(context);
122490
- auto &block_manager = BlockManager::GetBlockManager(context);
122491
122580
  auto &buffer_manager = BufferManager::GetBufferManager(context);
122492
122581
 
122582
+ auto ds = storage.GetDatabaseSize();
122583
+
122493
122584
  output.SetCardinality(1);
122494
- if (!storage.InMemory()) {
122495
- auto total_blocks = block_manager.TotalBlocks();
122496
- auto block_size = Storage::BLOCK_ALLOC_SIZE;
122497
- auto free_blocks = block_manager.FreeBlocks();
122498
- auto used_blocks = total_blocks - free_blocks;
122499
- auto bytes = (total_blocks * block_size);
122500
- auto wal = storage.GetWriteAheadLog();
122501
- auto wal_size = wal ? wal->GetWALSize() : 0;
122502
- output.data[0].SetValue(0, Value(StringUtil::BytesToHumanReadableString(bytes)));
122503
- output.data[1].SetValue(0, Value::BIGINT(block_size));
122504
- output.data[2].SetValue(0, Value::BIGINT(total_blocks));
122505
- output.data[3].SetValue(0, Value::BIGINT(used_blocks));
122506
- output.data[4].SetValue(0, Value::BIGINT(free_blocks));
122507
- output.data[5].SetValue(0, Value(StringUtil::BytesToHumanReadableString(wal_size)));
122508
- } else {
122509
- output.data[0].SetValue(0, Value());
122510
- output.data[1].SetValue(0, Value());
122511
- output.data[2].SetValue(0, Value());
122512
- output.data[3].SetValue(0, Value());
122513
- output.data[4].SetValue(0, Value());
122514
- output.data[5].SetValue(0, Value());
122515
- }
122585
+ output.data[0].SetValue(0, Value(StringUtil::BytesToHumanReadableString(ds.bytes)));
122586
+ output.data[1].SetValue(0, Value::BIGINT(ds.block_size));
122587
+ output.data[2].SetValue(0, Value::BIGINT(ds.total_blocks));
122588
+ output.data[3].SetValue(0, Value::BIGINT(ds.used_blocks));
122589
+ output.data[4].SetValue(0, Value::BIGINT(ds.free_blocks));
122590
+ output.data[5].SetValue(0, Value(StringUtil::BytesToHumanReadableString(ds.wal_size)));
122591
+
122516
122592
  output.data[6].SetValue(0, Value(StringUtil::BytesToHumanReadableString(buffer_manager.GetUsedMemory())));
122517
122593
  auto max_memory = buffer_manager.GetMaxMemory();
122518
122594
  output.data[7].SetValue(0, max_memory == (idx_t)-1 ? Value("Unlimited")
@@ -130510,14 +130586,6 @@ BufferManager &BufferManager::GetBufferManager(DatabaseInstance &db) {
130510
130586
  return *db.GetStorageManager().buffer_manager;
130511
130587
  }
130512
130588
 
130513
- BlockManager &BlockManager::GetBlockManager(DatabaseInstance &db) {
130514
- return *db.GetStorageManager().block_manager;
130515
- }
130516
-
130517
- BlockManager &BlockManager::GetBlockManager(ClientContext &context) {
130518
- return BlockManager::GetBlockManager(DatabaseInstance::GetDatabase(context));
130519
- }
130520
-
130521
130589
  DatabaseInstance &DatabaseInstance::GetDatabase(ClientContext &context) {
130522
130590
  return *context.db;
130523
130591
  }
@@ -130604,10 +130672,11 @@ void DatabaseInstance::Initialize(const char *database_path, DBConfig *user_conf
130604
130672
  config.options.temporary_directory = string();
130605
130673
  }
130606
130674
 
130607
- // config.create_storage_manager(config.options.database_path,
130608
- // config.options.access_mode == AccessMode::READ_ONLY);
130609
- storage = make_unique<StorageManager>(*this, config.options.database_path,
130610
- config.options.access_mode == AccessMode::READ_ONLY);
130675
+ // TODO: Support an extension here, to generate different storage managers
130676
+ // depending on the DB path structure/prefix.
130677
+ const string dbPath = config.options.database_path;
130678
+ storage = make_unique<SingleFileStorageManager>(*this, dbPath, config.options.access_mode == AccessMode::READ_ONLY);
130679
+
130611
130680
  catalog = make_unique<Catalog>(*this);
130612
130681
  transaction_manager = make_unique<TransactionManager>(*this);
130613
130682
  scheduler = make_unique<TaskScheduler>(*this);
@@ -192836,10 +192905,7 @@ public:
192836
192905
  using BlockManager::BlockManager;
192837
192906
 
192838
192907
  // LCOV_EXCL_START
192839
- void StartCheckpoint() override {
192840
- throw InternalException("Cannot perform IO in in-memory database!");
192841
- }
192842
- unique_ptr<Block> CreateBlock(block_id_t block_id) override {
192908
+ unique_ptr<Block> CreateBlock(block_id_t block_id, FileBuffer *source_buffer) override {
192843
192909
  throw InternalException("Cannot perform IO in in-memory database!");
192844
192910
  }
192845
192911
  block_id_t GetFreeBlockId() override {
@@ -193756,7 +193822,17 @@ Allocator &BufferManager::GetBufferAllocator() {
193756
193822
  //===----------------------------------------------------------------------===//
193757
193823
  // DuckDB
193758
193824
  //
193759
- // duckdb/storage/checkpoint/table_data_reader.hpp
193825
+ // duckdb/storage/checkpoint/table_data_writer.hpp
193826
+ //
193827
+ //
193828
+ //===----------------------------------------------------------------------===//
193829
+
193830
+
193831
+
193832
+ //===----------------------------------------------------------------------===//
193833
+ // DuckDB
193834
+ //
193835
+ // duckdb/storage/checkpoint/row_group_writer.hpp
193760
193836
  //
193761
193837
  //
193762
193838
  //===----------------------------------------------------------------------===//
@@ -193773,6 +193849,14 @@ Allocator &BufferManager::GetBufferAllocator() {
193773
193849
 
193774
193850
 
193775
193851
 
193852
+ //===----------------------------------------------------------------------===//
193853
+ // DuckDB
193854
+ //
193855
+ // duckdb/storage/partial_block_manager.hpp
193856
+ //
193857
+ //
193858
+ //===----------------------------------------------------------------------===//
193859
+
193776
193860
 
193777
193861
 
193778
193862
 
@@ -193792,83 +193876,390 @@ class TableCatalogEntry;
193792
193876
  class ViewCatalogEntry;
193793
193877
  class TypeCatalogEntry;
193794
193878
 
193795
- struct PartialColumnSegment {
193796
- ColumnSegment *segment;
193879
+ struct PartialBlockState {
193880
+ block_id_t block_id;
193881
+ //! How big is the block we're writing to. (Total bytes to assign).
193882
+ uint32_t block_size;
193883
+ //! How many bytes of the allocation are used. (offset_in_block of next allocation)
193797
193884
  uint32_t offset_in_block;
193885
+ //! How many times has the block been used?
193886
+ uint32_t block_use_count;
193798
193887
  };
193799
193888
 
193800
193889
  struct PartialBlock {
193801
- block_id_t block_id;
193802
- //! The block handle that stores this block
193803
- shared_ptr<BlockHandle> block;
193804
- //! The segments that are involved in the partial block
193805
- vector<PartialColumnSegment> segments;
193890
+ explicit PartialBlock(PartialBlockState state) : state(move(state)) {
193891
+ }
193892
+ virtual ~PartialBlock() {
193893
+ }
193894
+
193895
+ PartialBlockState state;
193806
193896
 
193807
- void FlushToDisk(DatabaseInstance &db);
193897
+ public:
193898
+ virtual void Flush() = 0;
193899
+ };
193900
+
193901
+ struct PartialBlockAllocation {
193902
+ // BlockManager owning the block_id
193903
+ BlockManager *block_manager {nullptr};
193904
+ //! How many bytes assigned to the caller?
193905
+ uint32_t allocation_size;
193906
+ //! State of assigned block.
193907
+ PartialBlockState state;
193908
+ //! Arbitrary state related to partial block storage.
193909
+ unique_ptr<PartialBlock> partial_block;
193808
193910
  };
193809
193911
 
193810
- //! CheckpointManager is responsible for checkpointing the database
193811
- class CheckpointManager {
193912
+ //! Enables sharing blocks across some scope. Scope is whatever we want to share
193913
+ //! blocks across. It may be an entire checkpoint or just a single row group.
193914
+ //! In any case, they must share a block manager.
193915
+ class PartialBlockManager {
193812
193916
  public:
193813
- static constexpr const idx_t PARTIAL_BLOCK_THRESHOLD = Storage::BLOCK_SIZE / 5 * 4;
193917
+ // 20% free / 80% utilization
193918
+ static constexpr const idx_t DEFAULT_MAX_PARTIAL_BLOCK_SIZE = Storage::BLOCK_SIZE / 5 * 4;
193919
+ // Max number of shared references to a block. No effective limit by default.
193920
+ static constexpr const idx_t DEFAULT_MAX_USE_COUNT = 1 << 20;
193921
+ // No point letting map size grow unbounded. We'll drop blocks with the
193922
+ // least free space first.
193923
+ static constexpr const idx_t MAX_BLOCK_MAP_SIZE = 1 << 31;
193814
193924
 
193815
193925
  public:
193816
- explicit CheckpointManager(DatabaseInstance &db);
193926
+ PartialBlockManager(BlockManager &block_manager, uint32_t max_partial_block_size = DEFAULT_MAX_PARTIAL_BLOCK_SIZE,
193927
+ uint32_t max_use_count = DEFAULT_MAX_USE_COUNT)
193928
+ : block_manager(block_manager), max_partial_block_size(max_partial_block_size), max_use_count(max_use_count) {
193929
+ }
193930
+
193931
+ public:
193932
+ //! Flush any remaining partial blocks to disk
193933
+ void FlushPartialBlocks();
193934
+
193935
+ PartialBlockAllocation GetBlockAllocation(uint32_t segment_size);
193936
+
193937
+ virtual void AllocateBlock(PartialBlockState &state, uint32_t segment_size);
193938
+
193939
+ //! Register a partially filled block that is filled with "segment_size" entries
193940
+ void RegisterPartialBlock(PartialBlockAllocation &&allocation);
193941
+
193942
+ protected:
193943
+ BlockManager &block_manager;
193944
+ //! A map of (available space -> PartialBlock) for partially filled blocks
193945
+ //! This is a multimap because there might be outstanding partial blocks with
193946
+ //! the same amount of left-over space
193947
+ multimap<idx_t, unique_ptr<PartialBlock>> partially_filled_blocks;
193948
+
193949
+ //! The maximum size (in bytes) at which a partial block will be considered a partial block
193950
+ uint32_t max_partial_block_size;
193951
+ uint32_t max_use_count;
193952
+
193953
+ protected:
193954
+ //! Try to obtain a partially filled block that can fit "segment_size" bytes
193955
+ //! If successful, returns true and returns the block_id and offset_in_block to write to
193956
+ //! Otherwise, returns false
193957
+ bool GetPartialBlock(idx_t segment_size, unique_ptr<PartialBlock> &state);
193958
+ };
193959
+
193960
+ } // namespace duckdb
193961
+
193962
+
193963
+
193964
+ namespace duckdb {
193965
+ class DatabaseInstance;
193966
+ class ClientContext;
193967
+ class ColumnSegment;
193968
+ class MetaBlockReader;
193969
+ class SchemaCatalogEntry;
193970
+ class SequenceCatalogEntry;
193971
+ class TableCatalogEntry;
193972
+ class ViewCatalogEntry;
193973
+ class TypeCatalogEntry;
193974
+
193975
+ class CheckpointWriter {
193976
+ public:
193977
+ explicit CheckpointWriter(DatabaseInstance &db) : db(db) {
193978
+ }
193979
+ virtual ~CheckpointWriter() {
193980
+ }
193817
193981
 
193818
193982
  //! The database
193819
193983
  DatabaseInstance &db;
193984
+
193985
+ virtual MetaBlockWriter &GetMetaBlockWriter() = 0;
193986
+ virtual unique_ptr<TableDataWriter> GetTableDataWriter(TableCatalogEntry &table) = 0;
193987
+ virtual BlockPointer WriteIndexData(IndexCatalogEntry &index_catalog) = 0;
193988
+
193989
+ protected:
193990
+ virtual void WriteSchema(SchemaCatalogEntry &schema);
193991
+ virtual void WriteTable(TableCatalogEntry &table);
193992
+ virtual void WriteView(ViewCatalogEntry &table);
193993
+ virtual void WriteSequence(SequenceCatalogEntry &table);
193994
+ virtual void WriteMacro(ScalarMacroCatalogEntry &table);
193995
+ virtual void WriteTableMacro(TableMacroCatalogEntry &table);
193996
+ virtual void WriteIndex(IndexCatalogEntry &index_catalog);
193997
+ virtual void WriteType(TypeCatalogEntry &table);
193998
+ };
193999
+
194000
+ class CheckpointReader {
194001
+ public:
194002
+ virtual ~CheckpointReader() {
194003
+ }
194004
+
194005
+ protected:
194006
+ virtual void LoadCheckpoint(ClientContext &context, MetaBlockReader &reader);
194007
+ virtual void ReadSchema(ClientContext &context, MetaBlockReader &reader);
194008
+ virtual void ReadTable(ClientContext &context, MetaBlockReader &reader);
194009
+ virtual void ReadView(ClientContext &context, MetaBlockReader &reader);
194010
+ virtual void ReadSequence(ClientContext &context, MetaBlockReader &reader);
194011
+ virtual void ReadMacro(ClientContext &context, MetaBlockReader &reader);
194012
+ virtual void ReadTableMacro(ClientContext &context, MetaBlockReader &reader);
194013
+ virtual void ReadIndex(ClientContext &context, MetaBlockReader &reader);
194014
+ virtual void ReadType(ClientContext &context, MetaBlockReader &reader);
194015
+
194016
+ virtual void ReadTableData(ClientContext &context, MetaBlockReader &reader, BoundCreateTableInfo &bound_info);
194017
+ };
194018
+
194019
+ class SingleFileCheckpointReader final : public CheckpointReader {
194020
+ public:
194021
+ explicit SingleFileCheckpointReader(SingleFileStorageManager &storage) : storage(storage) {
194022
+ }
194023
+
194024
+ void LoadFromStorage();
194025
+
194026
+ //! The database
194027
+ SingleFileStorageManager &storage;
194028
+ };
194029
+
194030
+ //! CheckpointWriter is responsible for checkpointing the database
194031
+ class SingleFileRowGroupWriter;
194032
+ class SingleFileTableDataWriter;
194033
+
194034
+ class SingleFileCheckpointWriter final : public CheckpointWriter {
194035
+ friend class SingleFileRowGroupWriter;
194036
+ friend class SingleFileTableDataWriter; // GetRowGroupWriter reads the private partial_block_manager
194037
+
194038
+ public:
194039
+ explicit SingleFileCheckpointWriter(DatabaseInstance &db, BlockManager &block_manager)
194040
+ : CheckpointWriter(db), partial_block_manager(block_manager) { // NOTE(review): block_manager is only used for partial_block_manager; GetBlockManager() looks the manager up through db instead - confirm both refer to the same one
194041
+ }
194042
+
194043
+ //! Checkpoint the current state of the WAL and flush it to the main storage. This should be called BEFORE any
194044
+ //! connection is available because right now the checkpointing cannot be done online. (TODO)
194045
+ void CreateCheckpoint();
194046
+
194047
+ virtual MetaBlockWriter &GetMetaBlockWriter() override; // returns *metadata_writer
194048
+ virtual unique_ptr<TableDataWriter> GetTableDataWriter(TableCatalogEntry &table) override; // creates a SingleFileTableDataWriter bound to both writers
194049
+ virtual BlockPointer WriteIndexData(IndexCatalogEntry &index_catalog) override; // serializes the index into table_metadata_writer
194050
+
194051
+ BlockManager &GetBlockManager(); // block manager of db's storage manager (cast to SingleFileStorageManager)
194052
+
194053
+ private:
193820
194054
  //! The metadata writer is responsible for writing schema information
193821
194055
  unique_ptr<MetaBlockWriter> metadata_writer;
193822
194056
  //! The table data writer is responsible for writing the DataPointers used by the table chunks
193823
- unique_ptr<MetaBlockWriter> tabledata_writer;
194057
+ unique_ptr<MetaBlockWriter> table_metadata_writer; // passed to each SingleFileTableDataWriter as its table data writer; also receives index data (WriteIndexData)
194058
+ //! Because this is single-file storage, we can share partial blocks across
194059
+ //! an entire checkpoint.
194060
+ PartialBlockManager partial_block_manager;
194061
+ };
193824
194062
 
194063
+ } // namespace duckdb
194064
+
194065
+
194066
+ namespace duckdb {
194067
+ struct ColumnCheckpointState;
194068
+ class CheckpointWriter;
194069
+ class ColumnData;
194070
+ class ColumnSegment;
194071
+ class RowGroup;
194072
+ class BaseStatistics;
194073
+ class SegmentStatistics;
194074
+
194075
+ // Abstract writer for the data of one row group; subclasses decide where column data pointers and payload are written.
194076
+ class RowGroupWriter {
193825
194077
  public:
193826
- //! Checkpoint the current state of the WAL and flush it to the main storage. This should be called BEFORE any
193827
- //! connction is available because right now the checkpointing cannot be done online. (TODO)
193828
- void CreateCheckpoint();
193829
- //! Load from a stored checkpoint
193830
- void LoadFromStorage();
194078
+ RowGroupWriter(TableCatalogEntry &table, PartialBlockManager &partial_block_manager)
194079
+ : table(table), partial_block_manager(partial_block_manager) { // stores both references; nothing is owned
194080
+ }
194081
+ virtual ~RowGroupWriter() {
194082
+ }
193831
194083
 
193832
- //! Try to obtain a partially filled block that can fit "segment_size" bytes
193833
- //! If successful, returns true and returns the block_id and offset_in_block to write to
193834
- //! Otherwise, returns false
193835
- bool GetPartialBlock(ColumnSegment *segment, idx_t segment_size, block_id_t &block_id, uint32_t &offset_in_block,
193836
- PartialBlock *&partial_block_ptr, unique_ptr<PartialBlock> &owned_partial_block);
194084
+ CompressionType GetColumnCompressionType(idx_t i); // returns table.columns[i].CompressionType()
193837
194085
 
193838
- //! Register a partially filled block that is filled with "segment_size" entries
193839
- void RegisterPartialBlock(ColumnSegment *segment, idx_t segment_size, block_id_t block_id);
194086
+ virtual void WriteColumnDataPointers(ColumnCheckpointState &column_checkpoint_state) = 0; // writes the state's data_pointers
193840
194087
 
193841
- //! Flush any remaining partial segments to disk
193842
- void FlushPartialSegments();
194088
+ virtual MetaBlockWriter &GetPayloadWriter() = 0; // writer that payload data for this row group goes to
193843
194089
 
193844
- private:
193845
- void WriteSchema(SchemaCatalogEntry &schema);
193846
- void WriteTable(TableCatalogEntry &table);
193847
- void WriteView(ViewCatalogEntry &table);
193848
- void WriteSequence(SequenceCatalogEntry &table);
193849
- void WriteMacro(ScalarMacroCatalogEntry &table);
193850
- void WriteTableMacro(TableMacroCatalogEntry &table);
193851
- void WriteIndex(IndexCatalogEntry &index_catalog);
193852
- void WriteType(TypeCatalogEntry &table);
193853
-
193854
- void ReadSchema(ClientContext &context, MetaBlockReader &reader);
193855
- void ReadTable(ClientContext &context, MetaBlockReader &reader);
193856
- void ReadView(ClientContext &context, MetaBlockReader &reader);
193857
- void ReadSequence(ClientContext &context, MetaBlockReader &reader);
193858
- void ReadMacro(ClientContext &context, MetaBlockReader &reader);
193859
- void ReadTableMacro(ClientContext &context, MetaBlockReader &reader);
193860
- void ReadIndex(ClientContext &context, MetaBlockReader &reader);
193861
- void ReadType(ClientContext &context, MetaBlockReader &reader);
194090
+ void RegisterPartialBlock(PartialBlockAllocation &&allocation); // forwards to partial_block_manager
194091
+ PartialBlockAllocation GetBlockAllocation(uint32_t segment_size); // forwards to partial_block_manager
194092
+
194093
+ protected:
194094
+ TableCatalogEntry &table;
194095
+ PartialBlockManager &partial_block_manager;
194096
+ };
194097
+
194098
+ // RowGroupWriter that writes column data pointers and payload through a single MetaBlockWriter.
194099
+ class SingleFileRowGroupWriter : public RowGroupWriter {
194100
+ public:
194101
+ SingleFileRowGroupWriter(TableCatalogEntry &table, PartialBlockManager &partial_block_manager,
194102
+ MetaBlockWriter &table_data_writer)
194103
+ : RowGroupWriter(table, partial_block_manager), table_data_writer(table_data_writer) {
194104
+ }
194105
+
194106
+ //! MetaBlockWriter is a cursor on a given BlockManager. This is the cursor that
194107
+ //! WriteColumnDataPointers writes to and that GetPayloadWriter() returns.
194108
+ MetaBlockWriter &table_data_writer;
194109
+
194110
+ public:
194111
+ virtual void WriteColumnDataPointers(ColumnCheckpointState &column_checkpoint_state) override; // count, then one record per data pointer
194112
+
194113
+ virtual MetaBlockWriter &GetPayloadWriter() override; // returns table_data_writer
194114
+ };
194115
+
194116
+ } // namespace duckdb
194117
+
194118
+
194119
+ namespace duckdb {
194120
+
194121
+ //! The table data writer is responsible for writing the data of a table to
194122
+ //! storage.
194123
+ //!
194124
+ //! This is meant to encapsulate and abstract:
194125
+ //! - Storage/encoding of table metadata (block pointers)
194126
+ //! - Mapping management of data block locations
194127
+ //! Abstraction will support, for example: tiering, versioning, or splitting into multiple block managers.
194128
+ class TableDataWriter {
194129
+ public:
194130
+ explicit TableDataWriter(TableCatalogEntry &table);
194131
+ virtual ~TableDataWriter();
194132
+
194133
+ public:
194134
+ void WriteTableData();
194135
+
194136
+ CompressionType GetColumnCompressionType(idx_t i); // returns table.columns[i].CompressionType()
194137
+
194138
+ virtual void FinalizeTable(vector<unique_ptr<BaseStatistics>> &&global_stats, DataTableInfo *info) = 0; // writes out statistics, row group pointers and index locations
194139
+ virtual unique_ptr<RowGroupWriter> GetRowGroupWriter(RowGroup &row_group) = 0; // factory for the writer of one row group
194140
+
194141
+ virtual void AddRowGroup(RowGroupPointer &&row_group_pointer, unique_ptr<RowGroupWriter> &&writer); // base version: records the pointer and destroys the writer
194142
+
194143
+ protected:
194144
+ TableCatalogEntry &table;
194145
+ // Row group pointers collected by AddRowGroup, in the order they were added.
194146
+ vector<RowGroupPointer> row_group_pointers;
194147
+ };
194148
+
194149
+ class SingleFileTableDataWriter : public TableDataWriter { // TableDataWriter that writes through two MetaBlockWriter references
194150
+ public:
194151
+ SingleFileTableDataWriter(SingleFileCheckpointWriter &checkpoint_manager, TableCatalogEntry &table,
194152
+ MetaBlockWriter &table_data_writer, MetaBlockWriter &meta_data_writer);
194153
+
194154
+ public:
194155
+ virtual void FinalizeTable(vector<unique_ptr<BaseStatistics>> &&global_stats, DataTableInfo *info) override; // info is dereferenced without a null check
194156
+ virtual unique_ptr<RowGroupWriter> GetRowGroupWriter(RowGroup &row_group) override; // creates a SingleFileRowGroupWriter
193862
194157
 
193863
194158
  private:
193864
- //! A map of (available space -> PartialBlock) for partially filled blocks
193865
- //! This is a multimap because there might be outstanding partial blocks with the same amount of left-over space
193866
- multimap<idx_t, unique_ptr<PartialBlock>> partially_filled_blocks;
194159
+ SingleFileCheckpointWriter &checkpoint_manager; // its partial_block_manager is handed to every row group writer
194160
+ // Receives global statistics, row group pointers and serialized indexes (see FinalizeTable)
194161
+ MetaBlockWriter &table_data_writer;
194162
+ // Receives the block pointer of the table data and the index block pointers (see FinalizeTable)
194163
+ MetaBlockWriter &meta_data_writer;
194164
+ };
194165
+
194166
+ } // namespace duckdb
194167
+
194168
+
194169
+
194170
+
194171
+
194172
+ //===----------------------------------------------------------------------===//
194173
+ // DuckDB
194174
+ //
194175
+ // duckdb/storage/table/column_checkpoint_state.hpp
194176
+ //
194177
+ //
194178
+ //===----------------------------------------------------------------------===//
194179
+
194180
+
194181
+
194182
+
194183
+
194184
+
194185
+
194186
+
194187
+
194188
+ namespace duckdb {
194189
+ class ColumnData;
194190
+ class DatabaseInstance;
194191
+ class RowGroup;
194192
+ class TableDataWriter;
194193
+
194194
+ struct ColumnCheckpointState { // state kept for one column of one row group during a checkpoint
194195
+ ColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, RowGroupWriter &writer);
194196
+ virtual ~ColumnCheckpointState();
194197
+
194198
+ RowGroup &row_group;
194199
+ ColumnData &column_data;
194200
+ RowGroupWriter &writer;
194201
+ SegmentTree new_tree;
194202
+ vector<DataPointer> data_pointers; // serialized by SingleFileRowGroupWriter::WriteColumnDataPointers
194203
+ unique_ptr<BaseStatistics> global_stats;
194204
+
194205
+ public:
194206
+ virtual unique_ptr<BaseStatistics> GetStatistics();
194207
+
194208
+ virtual void FlushSegment(unique_ptr<ColumnSegment> segment, idx_t segment_size); // takes ownership of segment
194209
+ virtual void WriteDataPointers(); // presumably delegates to writer.WriteColumnDataPointers - implementation not shown here, verify
193867
194210
  };
193868
194211
 
193869
194212
  } // namespace duckdb
193870
194213
 
193871
194214
 
194215
+ namespace duckdb {
194216
+
194217
+ CompressionType RowGroupWriter::GetColumnCompressionType(idx_t i) { // same lookup as TableDataWriter::GetColumnCompressionType
194218
+ return table.columns[i].CompressionType(); // i is not range-checked here
194219
+ }
194220
+
194221
+ void RowGroupWriter::RegisterPartialBlock(PartialBlockAllocation &&allocation) { // thin forwarder
194222
+ partial_block_manager.RegisterPartialBlock(move(allocation)); // the allocation is moved into the partial block manager
194223
+ }
194224
+
194225
+ PartialBlockAllocation RowGroupWriter::GetBlockAllocation(uint32_t segment_size) { // thin forwarder
194226
+ return partial_block_manager.GetBlockAllocation(segment_size); // allocation is chosen by the partial block manager
194227
+ }
194228
+
194229
+ void SingleFileRowGroupWriter::WriteColumnDataPointers(ColumnCheckpointState &column_checkpoint_state) { // serializes the state's data pointers to table_data_writer
194230
+ auto &meta_writer = table_data_writer;
194231
+ const auto &data_pointers = column_checkpoint_state.data_pointers;
194232
+
194233
+ meta_writer.Write<idx_t>(data_pointers.size()); // first the number of data pointers
194234
+ // then write the data pointers themselves: row range, block location, compression type, statistics
194235
+ for (idx_t k = 0; k < data_pointers.size(); k++) {
194236
+ auto &data_pointer = data_pointers[k];
194237
+ meta_writer.Write<idx_t>(data_pointer.row_start);
194238
+ meta_writer.Write<idx_t>(data_pointer.tuple_count);
194239
+ meta_writer.Write<block_id_t>(data_pointer.block_pointer.block_id);
194240
+ meta_writer.Write<uint32_t>(data_pointer.block_pointer.offset);
194241
+ meta_writer.Write<CompressionType>(data_pointer.compression_type);
194242
+ data_pointer.statistics->Serialize(meta_writer); // statistics is dereferenced without a null check
194243
+ }
194244
+ }
194245
+
194246
+ MetaBlockWriter &SingleFileRowGroupWriter::GetPayloadWriter() {
194247
+ return table_data_writer; // the same writer WriteColumnDataPointers writes to
194248
+ }
194249
+
194250
+ } // namespace duckdb
194251
+ //===----------------------------------------------------------------------===//
194252
+ // DuckDB
194253
+ //
194254
+ // duckdb/storage/checkpoint/table_data_reader.hpp
194255
+ //
194256
+ //
194257
+ //===----------------------------------------------------------------------===//
194258
+
194259
+
194260
+
194261
+
194262
+
193872
194263
  namespace duckdb {
193873
194264
  struct BoundCreateTableInfo;
193874
194265
 
@@ -193931,61 +194322,6 @@ void TableDataReader::ReadTableData() {
193931
194322
  }
193932
194323
 
193933
194324
  } // namespace duckdb
193934
- //===----------------------------------------------------------------------===//
193935
- // DuckDB
193936
- //
193937
- // duckdb/storage/checkpoint/table_data_writer.hpp
193938
- //
193939
- //
193940
- //===----------------------------------------------------------------------===//
193941
-
193942
-
193943
-
193944
-
193945
-
193946
- namespace duckdb {
193947
- class CheckpointManager;
193948
- class ColumnData;
193949
- class ColumnSegment;
193950
- class RowGroup;
193951
- class BaseStatistics;
193952
- class SegmentStatistics;
193953
-
193954
- //! The table data writer is responsible for writing the data of a table to the block manager
193955
- class TableDataWriter {
193956
- friend class ColumnData;
193957
-
193958
- public:
193959
- TableDataWriter(DatabaseInstance &db, CheckpointManager &checkpoint_manager, TableCatalogEntry &table,
193960
- MetaBlockWriter &table_data_writer, MetaBlockWriter &meta_data_writer);
193961
- ~TableDataWriter();
193962
-
193963
- void WriteTableData();
193964
-
193965
- MetaBlockWriter &GetTableWriter() {
193966
- return table_data_writer;
193967
- }
193968
- MetaBlockWriter &GetMetaWriter() {
193969
- return meta_data_writer;
193970
- }
193971
-
193972
- CheckpointManager &GetCheckpointManager() {
193973
- return checkpoint_manager;
193974
- }
193975
-
193976
- CompressionType GetColumnCompressionType(idx_t i);
193977
-
193978
- private:
193979
- CheckpointManager &checkpoint_manager;
193980
- TableCatalogEntry &table;
193981
- // Writes the actual table data
193982
- MetaBlockWriter &table_data_writer;
193983
- // Writes the metadata of the table
193984
- MetaBlockWriter &meta_data_writer;
193985
- };
193986
-
193987
- } // namespace duckdb
193988
-
193989
194325
 
193990
194326
 
193991
194327
 
@@ -193995,10 +194331,7 @@ private:
193995
194331
 
193996
194332
  namespace duckdb {
193997
194333
 
193998
- TableDataWriter::TableDataWriter(DatabaseInstance &, CheckpointManager &checkpoint_manager, TableCatalogEntry &table,
193999
- MetaBlockWriter &table_data_writer, MetaBlockWriter &meta_data_writer)
194000
- : checkpoint_manager(checkpoint_manager), table(table), table_data_writer(table_data_writer),
194001
- meta_data_writer(meta_data_writer) {
194334
+ TableDataWriter::TableDataWriter(TableCatalogEntry &table) : table(table) { // only stores the table reference
194002
194335
  }
194003
194336
 
194004
194337
  TableDataWriter::~TableDataWriter() {
@@ -194013,6 +194346,51 @@ CompressionType TableDataWriter::GetColumnCompressionType(idx_t i) {
194013
194346
  return table.columns[i].CompressionType();
194014
194347
  }
194015
194348
 
194349
+ void TableDataWriter::AddRowGroup(RowGroupPointer &&row_group_pointer, unique_ptr<RowGroupWriter> &&writer) {
194350
+ row_group_pointers.push_back(move(row_group_pointer)); // kept for SingleFileTableDataWriter::FinalizeTable, which serializes them
194351
+ writer.reset(); // destroys the row group writer
194352
+ }
194353
+
194354
+ SingleFileTableDataWriter::SingleFileTableDataWriter(SingleFileCheckpointWriter &checkpoint_manager,
194355
+ TableCatalogEntry &table, MetaBlockWriter &table_data_writer,
194356
+ MetaBlockWriter &meta_data_writer)
194357
+ : TableDataWriter(table), checkpoint_manager(checkpoint_manager), table_data_writer(table_data_writer),
194358
+ meta_data_writer(meta_data_writer) { // stores references only; nothing is written here
194359
+ }
194360
+
194361
+ unique_ptr<RowGroupWriter> SingleFileTableDataWriter::GetRowGroupWriter(RowGroup &row_group) { // row_group is not used by this implementation
194362
+ return make_unique<SingleFileRowGroupWriter>(table, checkpoint_manager.partial_block_manager, table_data_writer); // every row group writer gets the checkpoint writer's partial block manager
194363
+ }
194364
+
194365
+ void SingleFileTableDataWriter::FinalizeTable(vector<unique_ptr<BaseStatistics>> &&global_stats, DataTableInfo *info) {
194366
+ // store the current position in the table data writer
194367
+ // this is where the data of this table (statistics, then row group pointers) starts
194368
+ auto pointer = table_data_writer.GetBlockPointer();
194369
+
194370
+ for (auto &stats : global_stats) {
194371
+ stats->Serialize(table_data_writer);
194372
+ }
194373
+ // now write the row group count followed by the row group pointers
194374
+ table_data_writer.Write<uint64_t>(row_group_pointers.size());
194375
+ for (auto &row_group_pointer : row_group_pointers) {
194376
+ RowGroup::Serialize(row_group_pointer, table_data_writer);
194377
+ }
194378
+
194379
+ // Pointer to the table itself goes to the metadata stream.
194380
+ meta_data_writer.Write<block_id_t>(pointer.block_id);
194381
+ meta_data_writer.Write<uint64_t>(pointer.offset); // CheckpointReader::ReadTableData reads this back as uint64_t
194382
+
194383
+ // Now we serialize the indexes through table_data_writer; info is dereferenced without a null check
194384
+ std::vector<BlockPointer> index_pointers = info->indexes.SerializeIndexes(table_data_writer);
194385
+
194386
+ // Write the number of indexes, then the block id and offset of each index, to the metadata writer
194387
+ meta_data_writer.Write<idx_t>(index_pointers.size());
194388
+ for (auto &block_info : index_pointers) {
194389
+ meta_data_writer.Write<idx_t>(block_info.block_id);
194390
+ meta_data_writer.Write<idx_t>(block_info.offset);
194391
+ }
194392
+ }
194393
+
194016
194394
  } // namespace duckdb
194017
194395
  //===----------------------------------------------------------------------===//
194018
194396
  // DuckDB
@@ -194030,11 +194408,11 @@ namespace duckdb {
194030
194408
 
194031
194409
  class WriteOverflowStringsToDisk : public OverflowStringWriter {
194032
194410
  public:
194033
- explicit WriteOverflowStringsToDisk(DatabaseInstance &db);
194411
+ explicit WriteOverflowStringsToDisk(BlockManager &block_manager);
194034
194412
  ~WriteOverflowStringsToDisk() override;
194035
194413
 
194036
- //! The checkpoint manager
194037
- DatabaseInstance &db;
194414
+ //! The block manager
194415
+ BlockManager &block_manager;
194038
194416
 
194039
194417
  //! Temporary buffer
194040
194418
  BufferHandle handle;
@@ -194060,20 +194438,18 @@ private:
194060
194438
 
194061
194439
  namespace duckdb {
194062
194440
 
194063
- WriteOverflowStringsToDisk::WriteOverflowStringsToDisk(DatabaseInstance &db)
194064
- : db(db), block_id(INVALID_BLOCK), offset(0) {
194441
+ WriteOverflowStringsToDisk::WriteOverflowStringsToDisk(BlockManager &block_manager)
194442
+ : block_manager(block_manager), block_id(INVALID_BLOCK), offset(0) { // starts without a current block and at offset 0
194065
194443
  }
194066
194444
 
194067
194445
  WriteOverflowStringsToDisk::~WriteOverflowStringsToDisk() {
194068
- auto &block_manager = BlockManager::GetBlockManager(db);
194069
194446
  if (offset > 0) { // presumably offset > 0 means the buffer holds data not yet written - verify
194070
194447
  block_manager.Write(handle.GetFileBuffer(), block_id);
194071
194448
  }
194072
194449
  }
194073
194450
 
194074
194451
  void WriteOverflowStringsToDisk::WriteString(string_t string, block_id_t &result_block, int32_t &result_offset) {
194075
- auto &buffer_manager = BufferManager::GetBufferManager(db);
194076
- auto &block_manager = BlockManager::GetBlockManager(db);
194452
+ auto &buffer_manager = block_manager.buffer_manager;
194077
194453
  if (!handle.IsValid()) {
194078
194454
  handle = buffer_manager.Allocate(Storage::BLOCK_SIZE);
194079
194455
  }
@@ -194123,7 +194499,6 @@ void WriteOverflowStringsToDisk::WriteString(string_t string, block_id_t &result
194123
194499
  }
194124
194500
 
194125
194501
  void WriteOverflowStringsToDisk::AllocateNewBlock(block_id_t new_block_id) {
194126
- auto &block_manager = BlockManager::GetBlockManager(db);
194127
194502
  if (block_id != INVALID_BLOCK) {
194128
194503
  // there is an old block, write it first
194129
194504
  block_manager.Write(handle.GetFileBuffer(), block_id);
@@ -194163,33 +194538,47 @@ void WriteOverflowStringsToDisk::AllocateNewBlock(block_id_t new_block_id) {
194163
194538
 
194164
194539
 
194165
194540
 
194541
+
194166
194542
 
194167
194543
 
194168
194544
  namespace duckdb {
194169
194545
 
194170
194546
  void ReorderTableEntries(vector<TableCatalogEntry *> &tables);
194171
194547
 
194172
- CheckpointManager::CheckpointManager(DatabaseInstance &db) : db(db) {
194548
+ BlockManager &SingleFileCheckpointWriter::GetBlockManager() {
194549
+ auto &storage_manager = (SingleFileStorageManager &)db.GetStorageManager(); // unchecked C-style downcast: assumes db's storage manager is a SingleFileStorageManager
194550
+ return *storage_manager.block_manager; // block_manager is dereferenced without a null check
194551
+ }
194552
+
194553
+ BlockPointer SingleFileCheckpointWriter::WriteIndexData(IndexCatalogEntry &index_catalog) {
194554
+ return index_catalog.index->Serialize(*table_metadata_writer); // returns the BlockPointer produced by Serialize; CheckpointWriter::WriteIndex stores it in the metadata
194173
194555
  }
194174
194556
 
194175
- void CheckpointManager::CreateCheckpoint() {
194557
+ MetaBlockWriter &SingleFileCheckpointWriter::GetMetaBlockWriter() {
194558
+ return *metadata_writer; // metadata_writer is created in CreateCheckpoint; calling this earlier dereferences an empty unique_ptr
194559
+ }
194560
+
194561
+ unique_ptr<TableDataWriter> SingleFileCheckpointWriter::GetTableDataWriter(TableCatalogEntry &table) {
194562
+ return make_unique<SingleFileTableDataWriter>(*this, table, *table_metadata_writer, GetMetaBlockWriter()); // table data goes to table_metadata_writer, table metadata to metadata_writer
194563
+ }
194564
+
194565
+ void SingleFileCheckpointWriter::CreateCheckpoint() {
194176
194566
  auto &config = DBConfig::GetConfig(db);
194177
- auto &storage_manager = StorageManager::GetStorageManager(db);
194567
+ auto &storage_manager = (SingleFileStorageManager &)db.GetStorageManager();
194178
194568
  if (storage_manager.InMemory()) {
194179
194569
  return;
194180
194570
  }
194181
194571
  // assert that the checkpoint manager hasn't been used before
194182
194572
  D_ASSERT(!metadata_writer);
194183
194573
 
194184
- auto &block_manager = BlockManager::GetBlockManager(db);
194185
- block_manager.StartCheckpoint();
194574
+ auto &block_manager = GetBlockManager();
194186
194575
 
194187
194576
  //! Set up the writers for the checkpoints
194188
- metadata_writer = make_unique<MetaBlockWriter>(db);
194189
- tabledata_writer = make_unique<MetaBlockWriter>(db);
194577
+ metadata_writer = make_unique<MetaBlockWriter>(block_manager);
194578
+ table_metadata_writer = make_unique<MetaBlockWriter>(block_manager);
194190
194579
 
194191
194580
  // get the id of the first meta block
194192
- block_id_t meta_block = metadata_writer->block->id;
194581
+ block_id_t meta_block = metadata_writer->GetBlockPointer().block_id;
194193
194582
 
194194
194583
  vector<SchemaCatalogEntry *> schemas;
194195
194584
  // we scan the set of committed schemas
@@ -194201,10 +194590,10 @@ void CheckpointManager::CreateCheckpoint() {
194201
194590
  for (auto &schema : schemas) {
194202
194591
  WriteSchema(*schema);
194203
194592
  }
194204
- FlushPartialSegments();
194593
+ partial_block_manager.FlushPartialBlocks();
194205
194594
  // flush the meta data to disk
194206
194595
  metadata_writer->Flush();
194207
- tabledata_writer->Flush();
194596
+ table_metadata_writer->Flush();
194208
194597
 
194209
194598
  // write a checkpoint flag to the WAL
194210
194599
  // this protects against the rare event that the database crashes AFTER writing the file, but BEFORE truncating the
@@ -194232,39 +194621,39 @@ void CheckpointManager::CreateCheckpoint() {
194232
194621
  wal->Truncate(0);
194233
194622
 
194234
194623
  // mark all blocks written as part of the metadata as modified
194235
- for (auto &block_id : metadata_writer->written_blocks) {
194236
- block_manager.MarkBlockAsModified(block_id);
194237
- }
194238
- for (auto &block_id : tabledata_writer->written_blocks) {
194239
- block_manager.MarkBlockAsModified(block_id);
194240
- }
194624
+ metadata_writer->MarkWrittenBlocks();
194625
+ table_metadata_writer->MarkWrittenBlocks();
194241
194626
  }
194242
194627
 
194243
- void CheckpointManager::LoadFromStorage() {
194244
- auto &block_manager = BlockManager::GetBlockManager(db);
194628
+ void SingleFileCheckpointReader::LoadFromStorage() {
194629
+ auto &block_manager = *storage.block_manager; // block_manager is dereferenced without a null check
194245
194630
  block_id_t meta_block = block_manager.GetMetaBlock();
194246
194631
  if (meta_block < 0) {
194247
194632
  // storage is empty
194248
194633
  return;
194249
194634
  }
194250
194635
 
194251
- Connection con(db);
194636
+ Connection con(storage.db); // the load runs inside its own connection and transaction
194252
194637
  con.BeginTransaction();
194253
194638
  // create the MetaBlockReader to read from the storage
194254
- MetaBlockReader reader(db, meta_block);
194639
+ MetaBlockReader reader(block_manager, meta_block); // free_blocks_on_read keeps its default (true)
194640
+ LoadCheckpoint(*con.context, reader); // reads every schema through the base-class reader
194641
+ con.Commit();
194642
+ }
194643
+
194644
+ void CheckpointReader::LoadCheckpoint(ClientContext &context, MetaBlockReader &reader) { // reads a uint32_t schema count, then that many schemas
194255
194645
  uint32_t schema_count = reader.Read<uint32_t>();
194256
194646
  for (uint32_t i = 0; i < schema_count; i++) {
194257
- ReadSchema(*con.context, reader);
194647
+ ReadSchema(context, reader); // transaction handling is left to the caller (see SingleFileCheckpointReader::LoadFromStorage)
194258
194648
  }
194259
- con.Commit();
194260
194649
  }
194261
194650
 
194262
194651
  //===--------------------------------------------------------------------===//
194263
194652
  // Schema
194264
194653
  //===--------------------------------------------------------------------===//
194265
- void CheckpointManager::WriteSchema(SchemaCatalogEntry &schema) {
194654
+ void CheckpointWriter::WriteSchema(SchemaCatalogEntry &schema) {
194266
194655
  // write the schema data
194267
- schema.Serialize(*metadata_writer);
194656
+ schema.Serialize(GetMetaBlockWriter());
194268
194657
  // then, we fetch the tables/views/sequences information
194269
194658
  vector<TableCatalogEntry *> tables;
194270
194659
  vector<ViewCatalogEntry *> views;
@@ -194322,7 +194711,7 @@ void CheckpointManager::WriteSchema(SchemaCatalogEntry &schema) {
194322
194711
  indexes.push_back((IndexCatalogEntry *)entry);
194323
194712
  });
194324
194713
 
194325
- FieldWriter writer(*metadata_writer);
194714
+ FieldWriter writer(GetMetaBlockWriter());
194326
194715
  writer.WriteField<uint32_t>(custom_types.size());
194327
194716
  writer.WriteField<uint32_t>(sequences.size());
194328
194717
  writer.WriteField<uint32_t>(tables.size());
@@ -194367,8 +194756,8 @@ void CheckpointManager::WriteSchema(SchemaCatalogEntry &schema) {
194367
194756
  }
194368
194757
  }
194369
194758
 
194370
- void CheckpointManager::ReadSchema(ClientContext &context, MetaBlockReader &reader) {
194371
- auto &catalog = Catalog::GetCatalog(db);
194759
+ void CheckpointReader::ReadSchema(ClientContext &context, MetaBlockReader &reader) {
194760
+ auto &catalog = Catalog::GetCatalog(context);
194372
194761
 
194373
194762
  // read the schema and create it in the catalog
194374
194763
  auto info = SchemaCatalogEntry::Deserialize(reader);
@@ -194421,51 +194810,52 @@ void CheckpointManager::ReadSchema(ClientContext &context, MetaBlockReader &read
194421
194810
  //===--------------------------------------------------------------------===//
194422
194811
  // Views
194423
194812
  //===--------------------------------------------------------------------===//
194424
- void CheckpointManager::WriteView(ViewCatalogEntry &view) {
194425
- view.Serialize(*metadata_writer);
194813
+ void CheckpointWriter::WriteView(ViewCatalogEntry &view) {
194814
+ view.Serialize(GetMetaBlockWriter()); // the view is written to the writer returned by the virtual GetMetaBlockWriter()
194426
194815
  }
194427
194816
 
194428
- void CheckpointManager::ReadView(ClientContext &context, MetaBlockReader &reader) {
194817
+ void CheckpointReader::ReadView(ClientContext &context, MetaBlockReader &reader) { // deserializes one view and creates it in the catalog
194429
194818
  auto info = ViewCatalogEntry::Deserialize(reader, context);
194430
194819
 
194431
- auto &catalog = Catalog::GetCatalog(db);
194820
+ auto &catalog = Catalog::GetCatalog(context); // catalog is looked up through the client context
194432
194821
  catalog.CreateView(context, info.get());
194433
194822
  }
194434
194823
 
194435
194824
  //===--------------------------------------------------------------------===//
194436
194825
  // Sequences
194437
194826
  //===--------------------------------------------------------------------===//
194438
- void CheckpointManager::WriteSequence(SequenceCatalogEntry &seq) {
194439
- seq.Serialize(*metadata_writer);
194827
+ void CheckpointWriter::WriteSequence(SequenceCatalogEntry &seq) {
194828
+ seq.Serialize(GetMetaBlockWriter()); // the sequence is written to the writer returned by the virtual GetMetaBlockWriter()
194440
194829
  }
194441
194830
 
194442
- void CheckpointManager::ReadSequence(ClientContext &context, MetaBlockReader &reader) {
194831
+ void CheckpointReader::ReadSequence(ClientContext &context, MetaBlockReader &reader) { // deserializes one sequence and creates it in the catalog
194443
194832
  auto info = SequenceCatalogEntry::Deserialize(reader);
194444
194833
 
194445
- auto &catalog = Catalog::GetCatalog(db);
194834
+ auto &catalog = Catalog::GetCatalog(context); // catalog is looked up through the client context
194446
194835
  catalog.CreateSequence(context, info.get());
194447
194836
  }
194448
194837
 
194449
194838
  //===--------------------------------------------------------------------===//
194450
194839
  // Indexes
194451
194840
  //===--------------------------------------------------------------------===//
194452
- void CheckpointManager::WriteIndex(IndexCatalogEntry &index_catalog) {
194841
+ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog) {
194453
194842
  // Write the index data and metadata
194454
194843
  // Serialize the necessary meta data for index catalog construction.
194455
- auto root_offset = index_catalog.index->Serialize(*tabledata_writer);
194456
- index_catalog.Serialize(*metadata_writer);
194844
+ auto root_offset = WriteIndexData(index_catalog); // virtual: the subclass decides where the index data goes
194845
+ auto &metadata_writer = GetMetaBlockWriter();
194846
+ index_catalog.Serialize(metadata_writer); // catalog entry first, then the pointer to the index data
194457
194847
  // Serialize the Block id and offset of root node
194458
- metadata_writer->Write(root_offset.block_id);
194459
- metadata_writer->Write(root_offset.offset);
194848
+ metadata_writer.Write(root_offset.block_id);
194849
+ metadata_writer.Write(root_offset.offset);
194460
194850
  }
194461
194851
 
194462
- void CheckpointManager::ReadIndex(ClientContext &context, MetaBlockReader &reader) {
194852
+ void CheckpointReader::ReadIndex(ClientContext &context, MetaBlockReader &reader) {
194463
194853
 
194464
194854
  // Deserialize the index meta data
194465
194855
  auto info = IndexCatalogEntry::Deserialize(reader, context);
194466
194856
 
194467
194857
  // Create index in the catalog
194468
- auto &catalog = Catalog::GetCatalog(db);
194858
+ auto &catalog = Catalog::GetCatalog(context);
194469
194859
  auto schema_catalog = catalog.GetSchema(context, info->schema);
194470
194860
  auto table_catalog =
194471
194861
  (TableCatalogEntry *)catalog.GetEntry(context, CatalogType::TABLE_ENTRY, info->schema, info->table->table_name);
@@ -194506,8 +194896,9 @@ void CheckpointManager::ReadIndex(ClientContext &context, MetaBlockReader &reade
194506
194896
 
194507
194897
  switch (info->index_type) {
194508
194898
  case IndexType::ART: {
194509
- auto art = make_unique<ART>(info->column_ids, move(unbound_expressions), info->constraint_type, db,
194510
- root_block_id, root_offset);
194899
+ auto art =
194900
+ make_unique<ART>(info->column_ids, TableIOManager::Get(*table_catalog->storage), move(unbound_expressions),
194901
+ info->constraint_type, *context.db, root_block_id, root_offset);
194511
194902
  index_catalog->index = art.get();
194512
194903
  table_catalog->storage->info->indexes.AddIndex(move(art));
194513
194904
  break;
@@ -194520,52 +194911,53 @@ void CheckpointManager::ReadIndex(ClientContext &context, MetaBlockReader &reade
194520
194911
  //===--------------------------------------------------------------------===//
194521
194912
  // Custom Types
194522
194913
  //===--------------------------------------------------------------------===//
194523
- void CheckpointManager::WriteType(TypeCatalogEntry &table) {
194524
- table.Serialize(*metadata_writer);
194914
+ void CheckpointWriter::WriteType(TypeCatalogEntry &table) { // note: the parameter named "table" is a type catalog entry
194915
+ table.Serialize(GetMetaBlockWriter()); // the type is written to the writer returned by the virtual GetMetaBlockWriter()
194525
194916
  }
194526
194917
 
194527
- void CheckpointManager::ReadType(ClientContext &context, MetaBlockReader &reader) {
194918
+ void CheckpointReader::ReadType(ClientContext &context, MetaBlockReader &reader) { // deserializes one custom type and creates it in the catalog
194528
194919
  auto info = TypeCatalogEntry::Deserialize(reader);
194529
194920
 
194530
- auto &catalog = Catalog::GetCatalog(db);
194921
+ auto &catalog = Catalog::GetCatalog(context); // catalog is looked up through the client context
194531
194922
  catalog.CreateType(context, info.get());
194532
194923
  }
194533
194924
 
194534
194925
  //===--------------------------------------------------------------------===//
194535
194926
  // Macro's
194536
194927
  //===--------------------------------------------------------------------===//
194537
- void CheckpointManager::WriteMacro(ScalarMacroCatalogEntry &macro) {
194538
- macro.Serialize(*metadata_writer);
194928
+ void CheckpointWriter::WriteMacro(ScalarMacroCatalogEntry &macro) {
194929
+ macro.Serialize(GetMetaBlockWriter()); // the scalar macro is written to the writer returned by the virtual GetMetaBlockWriter()
194539
194930
  }
194540
194931
 
194541
- void CheckpointManager::ReadMacro(ClientContext &context, MetaBlockReader &reader) {
194932
+ void CheckpointReader::ReadMacro(ClientContext &context, MetaBlockReader &reader) { // deserializes one scalar macro and registers it via CreateFunction
194542
194933
  auto info = ScalarMacroCatalogEntry::Deserialize(reader, context);
194543
- auto &catalog = Catalog::GetCatalog(db);
194934
+ auto &catalog = Catalog::GetCatalog(context); // catalog is looked up through the client context
194544
194935
  catalog.CreateFunction(context, info.get());
194545
194936
  }
194546
194937
 
194547
- void CheckpointManager::WriteTableMacro(TableMacroCatalogEntry &macro) {
194548
- macro.Serialize(*metadata_writer);
194938
+ void CheckpointWriter::WriteTableMacro(TableMacroCatalogEntry &macro) {
194939
+ macro.Serialize(GetMetaBlockWriter()); // the table macro is written to the writer returned by the virtual GetMetaBlockWriter()
194549
194940
  }
194550
194941
 
194551
- void CheckpointManager::ReadTableMacro(ClientContext &context, MetaBlockReader &reader) {
194942
+ void CheckpointReader::ReadTableMacro(ClientContext &context, MetaBlockReader &reader) { // deserializes one table macro and registers it via CreateFunction
194552
194943
  auto info = TableMacroCatalogEntry::Deserialize(reader, context);
194553
- auto &catalog = Catalog::GetCatalog(db);
194944
+ auto &catalog = Catalog::GetCatalog(context); // catalog is looked up through the client context
194554
194945
  catalog.CreateFunction(context, info.get());
194555
194946
  }
194556
194947
 
194557
194948
  //===--------------------------------------------------------------------===//
194558
194949
  // Table Metadata
194559
194950
  //===--------------------------------------------------------------------===//
194560
- void CheckpointManager::WriteTable(TableCatalogEntry &table) {
194951
+ void CheckpointWriter::WriteTable(TableCatalogEntry &table) {
194561
194952
  // write the table meta data
194562
- table.Serialize(*metadata_writer);
194563
- // now we need to write the table data
194564
- TableDataWriter writer(db, *this, table, *tabledata_writer, *metadata_writer);
194565
- writer.WriteTableData();
194953
+ table.Serialize(GetMetaBlockWriter());
194954
+ // now we need to write the table data; if GetTableDataWriter returns null, no table data is written
194955
+ if (auto writer = GetTableDataWriter(table)) {
194956
+ writer->WriteTableData();
194957
+ }
194566
194958
  }
194567
194959
 
194568
- void CheckpointManager::ReadTable(ClientContext &context, MetaBlockReader &reader) {
194960
+ void CheckpointReader::ReadTable(ClientContext &context, MetaBlockReader &reader) {
194569
194961
  // deserialize the table meta data
194570
194962
  auto info = TableCatalogEntry::Deserialize(reader, context);
194571
194963
  // bind the info
@@ -194573,11 +194965,22 @@ void CheckpointManager::ReadTable(ClientContext &context, MetaBlockReader &reade
194573
194965
  auto bound_info = binder->BindCreateTableInfo(move(info));
194574
194966
 
194575
194967
  // now read the actual table data and place it into the create table info
194968
+ ReadTableData(context, reader, *bound_info);
194969
+
194970
+ // finally create the table in the catalog
194971
+ auto &catalog = Catalog::GetCatalog(context);
194972
+ catalog.CreateTable(context, bound_info.get());
194973
+ }
194974
+
194975
+ void CheckpointReader::ReadTableData(ClientContext &context, MetaBlockReader &reader,
194976
+ BoundCreateTableInfo &bound_info) {
194576
194977
  auto block_id = reader.Read<block_id_t>();
194577
194978
  auto offset = reader.Read<uint64_t>();
194578
- MetaBlockReader table_data_reader(db, block_id);
194979
+
194980
+ MetaBlockReader table_data_reader(reader.block_manager, block_id);
194579
194981
  table_data_reader.offset = offset;
194580
- TableDataReader data_reader(table_data_reader, *bound_info);
194982
+ TableDataReader data_reader(table_data_reader, bound_info);
194983
+
194581
194984
  data_reader.ReadTableData();
194582
194985
 
194583
194986
  // Get any indexes block info
@@ -194585,82 +194988,7 @@ void CheckpointManager::ReadTable(ClientContext &context, MetaBlockReader &reade
194585
194988
  for (idx_t i = 0; i < num_indexes; i++) {
194586
194989
  auto idx_block_id = reader.Read<idx_t>();
194587
194990
  auto idx_offset = reader.Read<idx_t>();
194588
- bound_info->indexes.emplace_back(idx_block_id, idx_offset);
194589
- }
194590
-
194591
- // finally create the table in the catalog
194592
- auto &catalog = Catalog::GetCatalog(db);
194593
- catalog.CreateTable(context, bound_info.get());
194594
- }
194595
-
194596
- //===--------------------------------------------------------------------===//
194597
- // Partial Blocks
194598
- //===--------------------------------------------------------------------===//
194599
- bool CheckpointManager::GetPartialBlock(ColumnSegment *segment, idx_t segment_size, block_id_t &block_id,
194600
- uint32_t &offset_in_block, PartialBlock *&partial_block_ptr,
194601
- unique_ptr<PartialBlock> &owned_partial_block) {
194602
- auto entry = partially_filled_blocks.lower_bound(segment_size);
194603
- if (entry == partially_filled_blocks.end()) {
194604
- return false;
194605
- }
194606
- // found a partially filled block! fill in the info
194607
- auto partial_block = move(entry->second);
194608
- partial_block_ptr = partial_block.get();
194609
- block_id = partial_block->block_id;
194610
- offset_in_block = Storage::BLOCK_SIZE - entry->first;
194611
- partially_filled_blocks.erase(entry);
194612
- PartialColumnSegment partial_segment;
194613
- partial_segment.segment = segment;
194614
- partial_segment.offset_in_block = offset_in_block;
194615
- partial_block->segments.push_back(partial_segment);
194616
-
194617
- D_ASSERT(offset_in_block > 0);
194618
- D_ASSERT(ValueIsAligned(offset_in_block));
194619
-
194620
- // check if the block is STILL partially filled after adding the segment_size
194621
- auto new_size = AlignValue(offset_in_block + segment_size);
194622
- if (new_size <= CheckpointManager::PARTIAL_BLOCK_THRESHOLD) {
194623
- // the block is still partially filled: add it to the partially_filled_blocks list
194624
- auto new_space_left = Storage::BLOCK_SIZE - new_size;
194625
- partially_filled_blocks.insert(make_pair(new_space_left, move(partial_block)));
194626
- // should not write the block yet: perhaps more columns will be added
194627
- } else {
194628
- // we are done with this block after the current write: write it to disk
194629
- owned_partial_block = move(partial_block);
194630
- }
194631
- return true;
194632
- }
194633
-
194634
- void CheckpointManager::RegisterPartialBlock(ColumnSegment *segment, idx_t segment_size, block_id_t block_id) {
194635
- D_ASSERT(segment_size <= CheckpointManager::PARTIAL_BLOCK_THRESHOLD);
194636
- auto partial_block = make_unique<PartialBlock>();
194637
- partial_block->block_id = block_id;
194638
- partial_block->block = segment->block;
194639
-
194640
- PartialColumnSegment partial_segment;
194641
- partial_segment.segment = segment;
194642
- partial_segment.offset_in_block = 0;
194643
- partial_block->segments.push_back(partial_segment);
194644
- auto space_left = Storage::BLOCK_SIZE - AlignValue(segment_size);
194645
- partially_filled_blocks.insert(make_pair(space_left, move(partial_block)));
194646
- }
194647
-
194648
- void CheckpointManager::FlushPartialSegments() {
194649
- for (auto &entry : partially_filled_blocks) {
194650
- entry.second->FlushToDisk(db);
194651
- }
194652
- }
194653
-
194654
- void PartialBlock::FlushToDisk(DatabaseInstance &db) {
194655
- auto &block_manager = BlockManager::GetBlockManager(db);
194656
-
194657
- // the data for the block might already exists in-memory of our block
194658
- // instead of copying the data we alter some metadata so the buffer points to an on-disk block
194659
- block = block_manager.ConvertToPersistent(block_id, move(block));
194660
-
194661
- // now set this block as the block for all segments
194662
- for (auto &seg : segments) {
194663
- seg.segment->ConvertToPersistent(block, block_id, seg.offset_in_block);
194991
+ bound_info.indexes.emplace_back(idx_block_id, idx_offset);
194664
194992
  }
194665
194993
  }
194666
194994
 
@@ -196111,47 +196439,6 @@ private:
196111
196439
 
196112
196440
 
196113
196441
 
196114
- //===----------------------------------------------------------------------===//
196115
- // DuckDB
196116
- //
196117
- // duckdb/storage/table/column_checkpoint_state.hpp
196118
- //
196119
- //
196120
- //===----------------------------------------------------------------------===//
196121
-
196122
-
196123
-
196124
-
196125
-
196126
-
196127
-
196128
-
196129
-
196130
- namespace duckdb {
196131
- class ColumnData;
196132
- class DatabaseInstance;
196133
- class RowGroup;
196134
- class TableDataWriter;
196135
-
196136
- struct ColumnCheckpointState {
196137
- ColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, TableDataWriter &writer);
196138
- virtual ~ColumnCheckpointState();
196139
-
196140
- RowGroup &row_group;
196141
- ColumnData &column_data;
196142
- TableDataWriter &writer;
196143
- SegmentTree new_tree;
196144
- vector<DataPointer> data_pointers;
196145
- unique_ptr<BaseStatistics> global_stats;
196146
-
196147
- public:
196148
- virtual unique_ptr<BaseStatistics> GetStatistics();
196149
-
196150
- virtual void FlushSegment(unique_ptr<ColumnSegment> segment, idx_t segment_size);
196151
- virtual void FlushToDisk();
196152
- };
196153
-
196154
- } // namespace duckdb
196155
196442
 
196156
196443
 
196157
196444
 
@@ -196160,6 +196447,7 @@ class ColumnData;
196160
196447
  class ColumnSegment;
196161
196448
  class DatabaseInstance;
196162
196449
  class RowGroup;
196450
+ class RowGroupWriter;
196163
196451
  class TableDataWriter;
196164
196452
  struct TransactionData;
196165
196453
 
@@ -196174,11 +196462,13 @@ class ColumnData {
196174
196462
  friend class ColumnDataCheckpointer;
196175
196463
 
196176
196464
  public:
196177
- ColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type, ColumnData *parent);
196465
+ ColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
196466
+ ColumnData *parent);
196178
196467
  ColumnData(ColumnData &other, idx_t start, ColumnData *parent);
196179
-
196180
196468
  virtual ~ColumnData();
196181
196469
 
196470
+ //! The block manager
196471
+ BlockManager &block_manager;
196182
196472
  //! Table info for the column
196183
196473
  DataTableInfo &info;
196184
196474
  //! The column index of the column, either within the parent table or within the parent
@@ -196242,25 +196532,27 @@ public:
196242
196532
 
196243
196533
  virtual void CommitDropColumn();
196244
196534
 
196245
- virtual unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer);
196246
- virtual unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
196535
+ virtual unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer);
196536
+ virtual unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
196247
196537
  ColumnCheckpointInfo &checkpoint_info);
196248
196538
 
196249
196539
  virtual void CheckpointScan(ColumnSegment *segment, ColumnScanState &state, idx_t row_group_start, idx_t count,
196250
196540
  Vector &scan_vector);
196251
196541
 
196252
196542
  virtual void DeserializeColumn(Deserializer &source);
196253
- static shared_ptr<ColumnData> Deserialize(DataTableInfo &info, idx_t column_index, idx_t start_row,
196254
- Deserializer &source, const LogicalType &type, ColumnData *parent);
196543
+ static shared_ptr<ColumnData> Deserialize(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
196544
+ idx_t start_row, Deserializer &source, const LogicalType &type,
196545
+ ColumnData *parent);
196255
196546
 
196256
196547
  virtual void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, vector<vector<Value>> &result);
196257
196548
  virtual void Verify(RowGroup &parent);
196258
196549
 
196259
- static shared_ptr<ColumnData> CreateColumn(DataTableInfo &info, idx_t column_index, idx_t start_row,
196260
- const LogicalType &type, ColumnData *parent = nullptr);
196550
+ static shared_ptr<ColumnData> CreateColumn(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
196551
+ idx_t start_row, const LogicalType &type, ColumnData *parent = nullptr);
196261
196552
  static shared_ptr<ColumnData> CreateColumn(ColumnData &other, idx_t start_row, ColumnData *parent = nullptr);
196262
- static unique_ptr<ColumnData> CreateColumnUnique(DataTableInfo &info, idx_t column_index, idx_t start_row,
196263
- const LogicalType &type, ColumnData *parent = nullptr);
196553
+ static unique_ptr<ColumnData> CreateColumnUnique(BlockManager &block_manager, DataTableInfo &info,
196554
+ idx_t column_index, idx_t start_row, const LogicalType &type,
196555
+ ColumnData *parent = nullptr);
196264
196556
  static unique_ptr<ColumnData> CreateColumnUnique(ColumnData &other, idx_t start_row, ColumnData *parent = nullptr);
196265
196557
 
196266
196558
  protected:
@@ -197032,7 +197324,7 @@ public:
197032
197324
  next_width = 0;
197033
197325
 
197034
197326
  // Reset the pointers into the current segment
197035
- auto &buffer_manager = BufferManager::GetBufferManager(current_segment->db);
197327
+ auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase());
197036
197328
  current_handle = buffer_manager.Pin(current_segment->block);
197037
197329
  current_dictionary = DictionaryCompressionStorage::GetDictionary(*current_segment, current_handle);
197038
197330
  current_end_ptr = current_handle.Ptr() + current_dictionary.end;
@@ -197116,7 +197408,7 @@ public:
197116
197408
  }
197117
197409
 
197118
197410
  idx_t Finalize() {
197119
- auto &buffer_manager = BufferManager::GetBufferManager(current_segment->db);
197411
+ auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase());
197120
197412
  auto handle = buffer_manager.Pin(current_segment->block);
197121
197413
  D_ASSERT(current_dictionary.end == Storage::BLOCK_SIZE);
197122
197414
 
@@ -197559,7 +197851,7 @@ void UncompressedCompressState::CreateEmptySegment(idx_t row_start) {
197559
197851
  auto compressed_segment = ColumnSegment::CreateTransientSegment(db, type, row_start);
197560
197852
  if (type.InternalType() == PhysicalType::VARCHAR) {
197561
197853
  auto &state = (UncompressedStringSegmentState &)*compressed_segment->GetSegmentState();
197562
- state.overflow_writer = make_unique<WriteOverflowStringsToDisk>(db);
197854
+ state.overflow_writer = make_unique<WriteOverflowStringsToDisk>(checkpointer.GetColumnData().block_manager);
197563
197855
  }
197564
197856
  current_segment = move(compressed_segment);
197565
197857
  }
@@ -199357,8 +199649,8 @@ string_t UncompressedStringStorage::ReadOverflowString(ColumnSegment &segment, V
199357
199649
  D_ASSERT(block != INVALID_BLOCK);
199358
199650
  D_ASSERT(offset < Storage::BLOCK_SIZE);
199359
199651
 
199360
- auto &block_manager = BlockManager::GetBlockManager(segment.db);
199361
- auto &buffer_manager = BufferManager::GetBufferManager(segment.db);
199652
+ auto &block_manager = segment.GetBlockManager();
199653
+ auto &buffer_manager = block_manager.buffer_manager;
199362
199654
  auto &state = (UncompressedStringSegmentState &)*segment.GetSegmentState();
199363
199655
  if (block < MAXIMUM_BLOCK) {
199364
199656
  // read the overflow string from disk
@@ -200042,7 +200334,8 @@ namespace duckdb {
200042
200334
  //! Validity column data represents the validity data (i.e. which values are null)
200043
200335
  class ValidityColumnData : public ColumnData {
200044
200336
  public:
200045
- ValidityColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, ColumnData *parent);
200337
+ ValidityColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
200338
+ ColumnData *parent);
200046
200339
  ValidityColumnData(ColumnData &original, idx_t start_row, ColumnData *parent = nullptr);
200047
200340
 
200048
200341
  public:
@@ -200057,8 +200350,8 @@ namespace duckdb {
200057
200350
  //! Standard column data represents a regular flat column (e.g. a column of type INTEGER or STRING)
200058
200351
  class StandardColumnData : public ColumnData {
200059
200352
  public:
200060
- StandardColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
200061
- ColumnData *parent = nullptr);
200353
+ StandardColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
200354
+ LogicalType type, ColumnData *parent = nullptr);
200062
200355
  StandardColumnData(ColumnData &original, idx_t start_row, ColumnData *parent = nullptr);
200063
200356
 
200064
200357
  //! The validity column data
@@ -200088,8 +200381,8 @@ public:
200088
200381
 
200089
200382
  void CommitDropColumn() override;
200090
200383
 
200091
- unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
200092
- unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
200384
+ unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) override;
200385
+ unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
200093
200386
  ColumnCheckpointInfo &checkpoint_info) override;
200094
200387
  void CheckpointScan(ColumnSegment *segment, ColumnScanState &state, idx_t row_group_start, idx_t count,
200095
200388
  Vector &scan_vector) override;
@@ -200112,13 +200405,15 @@ private:
200112
200405
 
200113
200406
  namespace duckdb {
200114
200407
 
200115
- DataTable::DataTable(DatabaseInstance &db, const string &schema, const string &table,
200116
- vector<ColumnDefinition> column_definitions_p, unique_ptr<PersistentTableData> data)
200117
- : info(make_shared<DataTableInfo>(db, schema, table)), column_definitions(move(column_definitions_p)), db(db),
200118
- is_root(true) {
200408
+ DataTable::DataTable(DatabaseInstance &db, shared_ptr<TableIOManager> table_io_manager_p, const string &schema,
200409
+ const string &table, vector<ColumnDefinition> column_definitions_p,
200410
+ unique_ptr<PersistentTableData> data)
200411
+ : info(make_shared<DataTableInfo>(db, move(table_io_manager_p), schema, table)),
200412
+ column_definitions(move(column_definitions_p)), db(db), is_root(true) {
200119
200413
  // initialize the table with the existing data from disk, if any
200120
200414
  auto types = GetTypes();
200121
- this->row_groups = make_shared<RowGroupCollection>(info, types, 0);
200415
+ this->row_groups =
200416
+ make_shared<RowGroupCollection>(info, TableIOManager::Get(*this).GetBlockManagerForRowData(), types, 0);
200122
200417
  if (data && !data->row_groups.empty()) {
200123
200418
  this->row_groups->Initialize(*data);
200124
200419
  stats.Initialize(types, *data);
@@ -200268,6 +200563,10 @@ vector<LogicalType> DataTable::GetTypes() {
200268
200563
  return types;
200269
200564
  }
200270
200565
 
200566
+ TableIOManager &TableIOManager::Get(DataTable &table) {
200567
+ return *table.info->table_io_manager;
200568
+ }
200569
+
200271
200570
  //===--------------------------------------------------------------------===//
200272
200571
  // Scan
200273
200572
  //===--------------------------------------------------------------------===//
@@ -201006,38 +201305,14 @@ void DataTable::Checkpoint(TableDataWriter &writer) {
201006
201305
  global_stats.push_back(stats.CopyStats(i));
201007
201306
  }
201008
201307
 
201009
- vector<RowGroupPointer> row_group_pointers;
201010
- row_groups->Checkpoint(writer, row_group_pointers, global_stats);
201011
-
201012
- // store the current position in the metadata writer
201013
- // this is where the row groups for this table start
201014
- auto &data_writer = writer.GetTableWriter();
201015
- auto pointer = data_writer.GetBlockPointer();
201016
-
201017
- for (auto &stats : global_stats) {
201018
- stats->Serialize(data_writer);
201019
- }
201020
- // now start writing the row group pointers to disk
201021
- data_writer.Write<uint64_t>(row_group_pointers.size());
201022
- for (auto &row_group_pointer : row_group_pointers) {
201023
- RowGroup::Serialize(row_group_pointer, data_writer);
201024
- }
201025
- // Now we serialize indexes in the tabledata_writer
201026
- auto blocks_info = info->indexes.SerializeIndexes(data_writer);
201027
-
201028
- // metadata writing time
201029
- auto &metadata_writer = writer.GetMetaWriter();
201308
+ row_groups->Checkpoint(writer, global_stats);
201030
201309
 
201031
- // write the block pointer for the table info
201032
- metadata_writer.Write<block_id_t>(pointer.block_id);
201033
- metadata_writer.Write<uint64_t>(pointer.offset);
201034
-
201035
- // Write-off block ids and offsets of indexes
201036
- metadata_writer.Write<idx_t>(blocks_info.size());
201037
- for (auto &block_info : blocks_info) {
201038
- metadata_writer.Write<idx_t>(block_info.block_id);
201039
- metadata_writer.Write<idx_t>(block_info.offset);
201040
- }
201310
+ // The rowgroup payload data has been written. Now write:
201311
+ // column stats
201312
+ // row-group pointers
201313
+ // table pointer
201314
+ // index data
201315
+ writer.FinalizeTable(move(global_stats), info.get());
201041
201316
  }
201042
201317
 
201043
201318
  void DataTable::CommitDropColumn(idx_t index) {
@@ -201071,9 +201346,9 @@ vector<vector<Value>> DataTable::GetStorageInfo() {
201071
201346
 
201072
201347
  namespace duckdb {
201073
201348
 
201074
- Index::Index(IndexType type, const vector<column_t> &column_ids_p,
201349
+ Index::Index(IndexType type, TableIOManager &table_io_manager, const vector<column_t> &column_ids_p,
201075
201350
  const vector<unique_ptr<Expression>> &unbound_expressions, IndexConstraintType constraint_type_p)
201076
- : type(type), column_ids(column_ids_p), constraint_type(constraint_type_p),
201351
+ : type(type), table_io_manager(table_io_manager), column_ids(column_ids_p), constraint_type(constraint_type_p),
201077
201352
  executor(Allocator::DefaultAllocator()) {
201078
201353
  for (auto &expr : unbound_expressions) {
201079
201354
  types.push_back(expr->return_type.InternalType());
@@ -201160,12 +201435,15 @@ BlockPointer Index::Serialize(duckdb::MetaBlockWriter &writer) {
201160
201435
 
201161
201436
 
201162
201437
 
201438
+
201163
201439
  namespace duckdb {
201164
201440
 
201165
201441
  LocalTableStorage::LocalTableStorage(DataTable &table)
201166
201442
  : table(table), allocator(Allocator::Get(table.db)), deleted_rows(0) {
201167
201443
  auto types = table.GetTypes();
201168
- row_groups = make_shared<RowGroupCollection>(table.info, types, MAX_ROW_ID, 0);
201444
+ row_groups = make_shared<RowGroupCollection>(table.info, TableIOManager::Get(table).GetBlockManagerForRowData(),
201445
+ types, MAX_ROW_ID, 0);
201446
+
201169
201447
  stats.InitializeEmpty(types);
201170
201448
  table.info->indexes.Scan([&](Index &index) {
201171
201449
  D_ASSERT(index.type == IndexType::ART);
@@ -201176,7 +201454,8 @@ LocalTableStorage::LocalTableStorage(DataTable &table)
201176
201454
  for (auto &expr : art.unbound_expressions) {
201177
201455
  unbound_expressions.push_back(expr->Copy());
201178
201456
  }
201179
- indexes.AddIndex(make_unique<ART>(art.column_ids, move(unbound_expressions), art.constraint_type, art.db));
201457
+ indexes.AddIndex(make_unique<ART>(art.column_ids, art.table_io_manager, move(unbound_expressions),
201458
+ art.constraint_type, art.db));
201180
201459
  }
201181
201460
  return false;
201182
201461
  });
@@ -201518,7 +201797,8 @@ void LocalStorage::VerifyNewConstraint(DataTable &parent, const BoundConstraint
201518
201797
 
201519
201798
  namespace duckdb {
201520
201799
 
201521
- MetaBlockReader::MetaBlockReader(DatabaseInstance &db, block_id_t block_id) : db(db), offset(0), next_block(-1) {
201800
+ MetaBlockReader::MetaBlockReader(BlockManager &block_manager, block_id_t block_id, bool free_blocks_on_read)
201801
+ : block_manager(block_manager), offset(0), next_block(-1), free_blocks_on_read(free_blocks_on_read) {
201522
201802
  ReadNewBlock(block_id);
201523
201803
  }
201524
201804
 
@@ -201536,6 +201816,9 @@ void MetaBlockReader::ReadData(data_ptr_t buffer, idx_t read_size) {
201536
201816
  buffer += to_read;
201537
201817
  }
201538
201818
  // then move to the next block
201819
+ if (next_block == INVALID_BLOCK) {
201820
+ throw IOException("Cannot read from INVALID_BLOCK.");
201821
+ }
201539
201822
  ReadNewBlock(next_block);
201540
201823
  }
201541
201824
  // we have enough left in this block to read from the buffer
@@ -201544,10 +201827,15 @@ void MetaBlockReader::ReadData(data_ptr_t buffer, idx_t read_size) {
201544
201827
  }
201545
201828
 
201546
201829
  void MetaBlockReader::ReadNewBlock(block_id_t id) {
201547
- auto &block_manager = BlockManager::GetBlockManager(db);
201548
- auto &buffer_manager = BufferManager::GetBufferManager(db);
201830
+ auto &buffer_manager = block_manager.buffer_manager;
201549
201831
 
201550
- block_manager.MarkBlockAsModified(id);
201832
+ // Marking these blocks as modified will cause them to be moved to the free
201833
+ // list upon the next successful checkpoint. Marking them modified here
201834
+ // assumes MetaBlockReader is exclusively used for reading checkpoint data,
201835
+ // and thus any blocks we're reading will be obviated by the next checkpoint.
201836
+ if (free_blocks_on_read) {
201837
+ block_manager.MarkBlockAsModified(id);
201838
+ }
201551
201839
  block = block_manager.RegisterBlock(id);
201552
201840
  handle = buffer_manager.Pin(block);
201553
201841
 
@@ -201563,28 +201851,26 @@ void MetaBlockReader::ReadNewBlock(block_id_t id) {
201563
201851
 
201564
201852
  namespace duckdb {
201565
201853
 
201566
- MetaBlockWriter::MetaBlockWriter(DatabaseInstance &db, block_id_t initial_block_id) : db(db) {
201854
+ MetaBlockWriter::MetaBlockWriter(BlockManager &block_manager, block_id_t initial_block_id)
201855
+ : block_manager(block_manager) {
201567
201856
  if (initial_block_id == INVALID_BLOCK) {
201568
201857
  initial_block_id = GetNextBlockId();
201569
201858
  }
201570
- auto &block_manager = BlockManager::GetBlockManager(db);
201571
- block = block_manager.CreateBlock(initial_block_id);
201859
+ block = block_manager.CreateBlock(initial_block_id, nullptr);
201572
201860
  Store<block_id_t>(-1, block->buffer);
201573
201861
  offset = sizeof(block_id_t);
201574
201862
  }
201575
201863
 
201576
201864
  MetaBlockWriter::~MetaBlockWriter() {
201577
- if (Exception::UncaughtException()) {
201578
- return;
201579
- }
201580
- try {
201581
- Flush();
201582
- } catch (...) {
201583
- }
201865
+ // If there's an exception during checkpoint, this can get destroyed without
201866
+ // flushing the data...which is fine, because none of the unwritten data
201867
+ // will be referenced.
201868
+ //
201869
+ // Otherwise, we should have explicitly flushed (and thereby nulled the block).
201870
+ D_ASSERT(!block || Exception::UncaughtException());
201584
201871
  }
201585
201872
 
201586
201873
  block_id_t MetaBlockWriter::GetNextBlockId() {
201587
- auto &block_manager = BlockManager::GetBlockManager(db);
201588
201874
  return block_manager.GetFreeBlockId();
201589
201875
  }
201590
201876
 
@@ -201596,9 +201882,13 @@ BlockPointer MetaBlockWriter::GetBlockPointer() {
201596
201882
  }
201597
201883
 
201598
201884
  void MetaBlockWriter::Flush() {
201885
+ AdvanceBlock();
201886
+ block = nullptr;
201887
+ }
201888
+
201889
+ void MetaBlockWriter::AdvanceBlock() {
201599
201890
  written_blocks.insert(block->id);
201600
201891
  if (offset > sizeof(block_id_t)) {
201601
- auto &block_manager = BlockManager::GetBlockManager(db);
201602
201892
  block_manager.Write(*block);
201603
201893
  offset = sizeof(block_id_t);
201604
201894
  }
@@ -201621,8 +201911,8 @@ void MetaBlockWriter::WriteData(const_data_ptr_t buffer, idx_t write_size) {
201621
201911
  // write the block id of the new block to the start of the current block
201622
201912
  Store<block_id_t>(new_block_id, block->buffer);
201623
201913
  // first flush the old block
201624
- Flush();
201625
- // now update the block id of the lbock
201914
+ AdvanceBlock();
201915
+ // now update the block id of the block
201626
201916
  block->id = new_block_id;
201627
201917
  Store<block_id_t>(-1, block->buffer);
201628
201918
  }
@@ -201630,6 +201920,87 @@ void MetaBlockWriter::WriteData(const_data_ptr_t buffer, idx_t write_size) {
201630
201920
  offset += write_size;
201631
201921
  }
201632
201922
 
201923
+ } // namespace duckdb
201924
+
201925
+
201926
+ namespace duckdb {
201927
+
201928
+ //===--------------------------------------------------------------------===//
201929
+ // Partial Blocks
201930
+ //===--------------------------------------------------------------------===//
201931
+ PartialBlockAllocation PartialBlockManager::GetBlockAllocation(uint32_t segment_size) {
201932
+ PartialBlockAllocation allocation;
201933
+ allocation.block_manager = &block_manager;
201934
+ allocation.allocation_size = segment_size;
201935
+
201936
+ // if the block is less than 80% full, we consider it a "partial block"
201937
+ // which means we will try to fit it with other blocks
201938
+ // check if there is a partial block available we can write to
201939
+ if (segment_size <= max_partial_block_size && GetPartialBlock(segment_size, allocation.partial_block)) {
201940
+ //! there is! increase the reference count of this block
201941
+ allocation.partial_block->state.block_use_count += 1;
201942
+ allocation.state = allocation.partial_block->state;
201943
+ block_manager.IncreaseBlockReferenceCount(allocation.state.block_id);
201944
+ } else {
201945
+ // full block: get a free block to write to
201946
+ AllocateBlock(allocation.state, segment_size);
201947
+ }
201948
+ return allocation;
201949
+ }
201950
+
201951
+ void PartialBlockManager::AllocateBlock(PartialBlockState &state, uint32_t segment_size) {
201952
+ D_ASSERT(segment_size <= Storage::BLOCK_SIZE);
201953
+ state.block_id = block_manager.GetFreeBlockId();
201954
+ state.block_size = Storage::BLOCK_SIZE;
201955
+ state.offset_in_block = 0;
201956
+ state.block_use_count = 1;
201957
+ }
201958
+
201959
+ bool PartialBlockManager::GetPartialBlock(idx_t segment_size, unique_ptr<PartialBlock> &partial_block) {
201960
+ auto entry = partially_filled_blocks.lower_bound(segment_size);
201961
+ if (entry == partially_filled_blocks.end()) {
201962
+ return false;
201963
+ }
201964
+ // found a partially filled block! fill in the info
201965
+ partial_block = move(entry->second);
201966
+ partially_filled_blocks.erase(entry);
201967
+
201968
+ D_ASSERT(partial_block->state.offset_in_block > 0);
201969
+ D_ASSERT(ValueIsAligned(partial_block->state.offset_in_block));
201970
+ return true;
201971
+ }
201972
+
201973
+ void PartialBlockManager::RegisterPartialBlock(PartialBlockAllocation &&allocation) {
201974
+ auto &state(allocation.partial_block->state);
201975
+ if (state.block_use_count < max_use_count) {
201976
+ auto new_size = AlignValue(allocation.allocation_size + state.offset_in_block);
201977
+ state.offset_in_block = new_size;
201978
+ auto new_space_left = state.block_size - new_size;
201979
+ // check if the block is STILL partially filled after adding the segment_size
201980
+ if (new_space_left >= Storage::BLOCK_SIZE - max_partial_block_size) {
201981
+ // the block is still partially filled: add it to the partially_filled_blocks list
201982
+ partially_filled_blocks.insert(make_pair(new_space_left, move(allocation.partial_block)));
201983
+ }
201984
+ }
201985
+ auto block_to_free = move(allocation.partial_block);
201986
+ if (!block_to_free && partially_filled_blocks.size() > MAX_BLOCK_MAP_SIZE) {
201987
+ // Free the page with the least space free.
201988
+ auto itr = partially_filled_blocks.begin();
201989
+ block_to_free = move(itr->second);
201990
+ partially_filled_blocks.erase(itr);
201991
+ }
201992
+ // Flush any block that we're not going to reuse.
201993
+ if (block_to_free) {
201994
+ block_to_free->Flush();
201995
+ }
201996
+ }
201997
+
201998
+ void PartialBlockManager::FlushPartialBlocks() {
201999
+ for (auto &e : partially_filled_blocks) {
202000
+ e.second->Flush();
202001
+ }
202002
+ }
202003
+
201633
202004
  } // namespace duckdb
201634
202005
  //===----------------------------------------------------------------------===//
201635
202006
  // DuckDB
@@ -201660,9 +202031,8 @@ class SingleFileBlockManager : public BlockManager {
201660
202031
  public:
201661
202032
  SingleFileBlockManager(DatabaseInstance &db, string path, bool read_only, bool create_new, bool use_direct_io);
201662
202033
 
201663
- void StartCheckpoint() override;
201664
202034
  //! Creates a new Block using the specified block_id and returns a pointer
201665
- unique_ptr<Block> CreateBlock(block_id_t block_id) override;
202035
+ unique_ptr<Block> CreateBlock(block_id_t block_id, FileBuffer *source_buffer) override;
201666
202036
  //! Return the next free block id
201667
202037
  block_id_t GetFreeBlockId() override;
201668
202038
  //! Returns whether or not a specified block is the root block
@@ -201688,10 +202058,11 @@ public:
201688
202058
  idx_t FreeBlocks() override {
201689
202059
  return free_list.size();
201690
202060
  }
202061
+
202062
+ private:
201691
202063
  //! Load the free list from the file
201692
202064
  void LoadFreeList();
201693
202065
 
201694
- private:
201695
202066
  void Initialize(DatabaseHeader &header);
201696
202067
 
201697
202068
  //! Return the blocks to which we will write the free list and modified blocks
@@ -201911,6 +202282,7 @@ SingleFileBlockManager::SingleFileBlockManager(DatabaseInstance &db, string path
201911
202282
  active_header = 1;
201912
202283
  Initialize(h2);
201913
202284
  }
202285
+ LoadFreeList();
201914
202286
  }
201915
202287
  }
201916
202288
 
@@ -201930,7 +202302,7 @@ void SingleFileBlockManager::LoadFreeList() {
201930
202302
  // no free list
201931
202303
  return;
201932
202304
  }
201933
- MetaBlockReader reader(db, free_list_id);
202305
+ MetaBlockReader reader(*this, free_list_id);
201934
202306
  auto free_list_count = reader.Read<uint64_t>();
201935
202307
  free_list.clear();
201936
202308
  for (idx_t i = 0; i < free_list_count; i++) {
@@ -201945,9 +202317,6 @@ void SingleFileBlockManager::LoadFreeList() {
201945
202317
  }
201946
202318
  }
201947
202319
 
201948
- void SingleFileBlockManager::StartCheckpoint() {
201949
- }
201950
-
201951
202320
  bool SingleFileBlockManager::IsRootBlock(block_id_t root) {
201952
202321
  return root == meta_block;
201953
202322
  }
@@ -201981,6 +202350,10 @@ void SingleFileBlockManager::MarkBlockAsModified(block_id_t block_id) {
201981
202350
  }
201982
202351
  return;
201983
202352
  }
202353
+ // Check for multi-free
202354
+ // TODO: Fix the bug that causes this assert to fire, then uncomment it.
202355
+ // D_ASSERT(modified_blocks.find(block_id) == modified_blocks.end());
202356
+ D_ASSERT(free_list.find(block_id) == free_list.end());
201984
202357
  modified_blocks.insert(block_id);
201985
202358
  }
201986
202359
 
@@ -201998,8 +202371,13 @@ block_id_t SingleFileBlockManager::GetMetaBlock() {
201998
202371
  return meta_block;
201999
202372
  }
202000
202373
 
202001
- unique_ptr<Block> SingleFileBlockManager::CreateBlock(block_id_t block_id) {
202002
- return make_unique<Block>(Allocator::Get(db), block_id);
202374
+ unique_ptr<Block> SingleFileBlockManager::CreateBlock(block_id_t block_id, FileBuffer *source_buffer) {
202375
+ if (source_buffer) {
202376
+ D_ASSERT(source_buffer->AllocSize() == Storage::BLOCK_ALLOC_SIZE);
202377
+ return make_unique<Block>(*source_buffer, block_id);
202378
+ } else {
202379
+ return make_unique<Block>(Allocator::Get(db), block_id);
202380
+ }
202003
202381
  }
202004
202382
 
202005
202383
  void SingleFileBlockManager::Read(Block &block) {
@@ -202047,8 +202425,8 @@ vector<block_id_t> SingleFileBlockManager::GetFreeListBlocks() {
202047
202425
 
202048
202426
  class FreeListBlockWriter : public MetaBlockWriter {
202049
202427
  public:
202050
- FreeListBlockWriter(DatabaseInstance &db_p, vector<block_id_t> &free_list_blocks_p)
202051
- : MetaBlockWriter(db_p, free_list_blocks_p[0]), free_list_blocks(free_list_blocks_p), index(1) {
202428
+ FreeListBlockWriter(BlockManager &block_manager, vector<block_id_t> &free_list_blocks_p)
202429
+ : MetaBlockWriter(block_manager, free_list_blocks_p[0]), free_list_blocks(free_list_blocks_p), index(1) {
202052
202430
  }
202053
202431
 
202054
202432
  vector<block_id_t> &free_list_blocks;
@@ -202083,10 +202461,11 @@ void SingleFileBlockManager::WriteHeader(DatabaseHeader header) {
202083
202461
  // a normal MetaBlockWriter will fetch blocks to use from the free_list
202084
202462
  // but since we are WRITING the free_list, this behavior is sub-optimal
202085
202463
 
202086
- FreeListBlockWriter writer(db, free_list_blocks);
202464
+ FreeListBlockWriter writer(*this, free_list_blocks);
202087
202465
 
202088
- D_ASSERT(writer.block->id == free_list_blocks[0]);
202089
- header.free_list = writer.block->id;
202466
+ auto ptr = writer.GetBlockPointer();
202467
+ D_ASSERT(ptr.block_id == free_list_blocks[0]);
202468
+ header.free_list = ptr.block_id;
202090
202469
  for (auto &block_id : free_list_blocks) {
202091
202470
  modified_blocks.insert(block_id);
202092
202471
  }
@@ -203268,7 +203647,7 @@ void StorageLock::ReleaseSharedLock() {
203268
203647
  namespace duckdb {
203269
203648
 
203270
203649
  StorageManager::StorageManager(DatabaseInstance &db, string path, bool read_only)
203271
- : db(db), path(move(path)), wal(db), read_only(read_only) {
203650
+ : db(db), path(move(path)), read_only(read_only) {
203272
203651
  }
203273
203652
 
203274
203653
  StorageManager::~StorageManager() {
@@ -203294,14 +203673,20 @@ bool StorageManager::InMemory() {
203294
203673
  return path.empty() || path == ":memory:";
203295
203674
  }
203296
203675
 
203676
+ void StorageManager::CreateBufferManager() {
203677
+ auto &config = DBConfig::GetConfig(db);
203678
+ buffer_manager = make_unique<BufferManager>(db, config.options.temporary_directory, config.options.maximum_memory);
203679
+ }
203680
+
203297
203681
  void StorageManager::Initialize() {
203298
203682
  bool in_memory = InMemory();
203299
203683
  if (in_memory && read_only) {
203300
203684
  throw CatalogException("Cannot launch in-memory database in read-only mode!");
203301
203685
  }
203686
+ CreateBufferManager();
203687
+
203302
203688
  auto &config = DBConfig::GetConfig(db);
203303
203689
  auto &catalog = Catalog::GetCatalog(db);
203304
- buffer_manager = make_unique<BufferManager>(db, config.options.temporary_directory, config.options.maximum_memory);
203305
203690
 
203306
203691
  // first initialize the base system catalogs
203307
203692
  // these are never written to the WAL
@@ -203323,15 +203708,38 @@ void StorageManager::Initialize() {
203323
203708
  // commit transactions
203324
203709
  con.Commit();
203325
203710
 
203326
- if (!in_memory) {
203327
- // create or load the database from disk, if not in-memory mode
203328
- LoadDatabase();
203329
- } else {
203330
- block_manager = make_unique<InMemoryBlockManager>(*buffer_manager);
203711
+ // create or load the database from disk, if not in-memory mode
203712
+ LoadDatabase();
203713
+ }
203714
+
203715
+ ///////////////////////////////////////////////////////////////////////////
203716
+ class SingleFileTableIOManager : public TableIOManager {
203717
+ public:
203718
+ explicit SingleFileTableIOManager(BlockManager &block_manager) : block_manager(block_manager) {
203331
203719
  }
203720
+
203721
+ BlockManager &block_manager;
203722
+
203723
+ public:
203724
+ BlockManager &GetIndexBlockManager() override {
203725
+ return block_manager;
203726
+ }
203727
+ BlockManager &GetBlockManagerForRowData() override {
203728
+ return block_manager;
203729
+ }
203730
+ };
203731
+
203732
+ SingleFileStorageManager::SingleFileStorageManager(DatabaseInstance &db, string path, bool read_only)
203733
+ : StorageManager(db, move(path), read_only) {
203332
203734
  }
203333
203735
 
203334
- void StorageManager::LoadDatabase() {
203736
+ void SingleFileStorageManager::LoadDatabase() {
203737
+ if (InMemory()) {
203738
+ block_manager = make_unique<InMemoryBlockManager>(*buffer_manager);
203739
+ table_io_manager = make_unique<SingleFileTableIOManager>(*block_manager);
203740
+ return;
203741
+ }
203742
+
203335
203743
  string wal_path = path + ".wal";
203336
203744
  auto &fs = db.GetFileSystem();
203337
203745
  auto &config = db.config;
@@ -203349,15 +203757,14 @@ void StorageManager::LoadDatabase() {
203349
203757
  }
203350
203758
  // initialize the block manager while creating a new db file
203351
203759
  block_manager = make_unique<SingleFileBlockManager>(db, path, read_only, true, config.options.use_direct_io);
203760
+ table_io_manager = make_unique<SingleFileTableIOManager>(*block_manager);
203352
203761
  } else {
203353
203762
  // initialize the block manager while loading the current db file
203354
- auto sf_bm = make_unique<SingleFileBlockManager>(db, path, read_only, false, config.options.use_direct_io);
203355
- auto sf = sf_bm.get();
203356
- block_manager = move(sf_bm);
203357
- sf->LoadFreeList();
203763
+ block_manager = make_unique<SingleFileBlockManager>(db, path, read_only, false, config.options.use_direct_io);
203764
+ table_io_manager = make_unique<SingleFileTableIOManager>(*block_manager);
203358
203765
 
203359
203766
  //! Load from storage
203360
- CheckpointManager checkpointer(db);
203767
+ auto checkpointer = SingleFileCheckpointReader(*this);
203361
203768
  checkpointer.LoadFromStorage();
203362
203769
  // check if the WAL file exists
203363
203770
  if (fs.FileExists(wal_path)) {
@@ -203367,27 +203774,131 @@ void StorageManager::LoadDatabase() {
203367
203774
  }
203368
203775
  // initialize the WAL file
203369
203776
  if (!read_only) {
203370
- wal.Initialize(wal_path);
203777
+ wal = make_unique<WriteAheadLog>(db, wal_path);
203371
203778
  if (truncate_wal) {
203372
- wal.Truncate(0);
203779
+ wal->Truncate(0);
203373
203780
  }
203374
203781
  }
203375
203782
  }
203376
203783
 
203377
- void StorageManager::CreateCheckpoint(bool delete_wal, bool force_checkpoint) {
203378
- if (InMemory() || read_only || !wal.initialized) {
203784
+ ///////////////////////////////////////////////////////////////////////////////
203785
+
203786
+ class SingleFileStorageCommitState : public StorageCommitState {
203787
+ idx_t initial_wal_size = 0;
203788
+ idx_t initial_written = 0;
203789
+ WriteAheadLog *log;
203790
+ bool checkpoint;
203791
+
203792
+ public:
203793
+ SingleFileStorageCommitState(StorageManager &storage_manager, bool checkpoint);
203794
+ ~SingleFileStorageCommitState() override;
203795
+
203796
+ // Make the commit persistent
203797
+ void FlushCommit() override;
203798
+ };
203799
+
203800
+ SingleFileStorageCommitState::SingleFileStorageCommitState(StorageManager &storage_manager, bool checkpoint)
203801
+ : checkpoint(checkpoint) {
203802
+ log = storage_manager.GetWriteAheadLog();
203803
+ if (log) {
203804
+ auto initial_size = log->GetWALSize();
203805
+ initial_written = log->GetTotalWritten();
203806
+ initial_wal_size = initial_size < 0 ? 0 : idx_t(initial_size);
203807
+
203808
+ if (checkpoint) {
203809
+ // check if we are checkpointing after this commit
203810
+ // if we are checkpointing, we don't need to write anything to the WAL
203811
+ // this saves us a lot of unnecessary writes to disk in the case of large commits
203812
+ log->skip_writing = true;
203813
+ }
203814
+ } else {
203815
+ D_ASSERT(!checkpoint);
203816
+ }
203817
+ }
203818
+
203819
+ // Make the commit persistent
203820
+ void SingleFileStorageCommitState::FlushCommit() {
203821
+ if (log) {
203822
+ // flush the WAL if any changes were made
203823
+ if (log->GetTotalWritten() > initial_written) {
203824
+ D_ASSERT(!checkpoint);
203825
+ D_ASSERT(!log->skip_writing);
203826
+ log->Flush();
203827
+ }
203828
+ log->skip_writing = false;
203829
+ }
203830
+ // Null so that the destructor will not truncate the log.
203831
+ log = nullptr;
203832
+ }
203833
+
203834
+ SingleFileStorageCommitState::~SingleFileStorageCommitState() {
203835
+ // If log is non-null, then commit threw an exception before flushing.
203836
+ if (log) {
203837
+ log->skip_writing = false;
203838
+ if (log->GetTotalWritten() > initial_written) {
203839
+ // remove any entries written into the WAL by truncating it
203840
+ log->Truncate(initial_wal_size);
203841
+ }
203842
+ }
203843
+ }
203844
+
203845
+ unique_ptr<StorageCommitState> SingleFileStorageManager::GenStorageCommitState(Transaction &transaction,
203846
+ bool checkpoint) {
203847
+ return make_unique<SingleFileStorageCommitState>(*this, checkpoint);
203848
+ }
203849
+
203850
+ bool SingleFileStorageManager::IsCheckpointClean(block_id_t checkpoint_id) {
203851
+ return block_manager->IsRootBlock(checkpoint_id);
203852
+ }
203853
+
203854
+ void SingleFileStorageManager::CreateCheckpoint(bool delete_wal, bool force_checkpoint) {
203855
+ if (InMemory() || read_only || !wal) {
203379
203856
  return;
203380
203857
  }
203381
- if (wal.GetWALSize() > 0 || db.config.options.force_checkpoint || force_checkpoint) {
203858
+ if (wal->GetWALSize() > 0 || db.config.options.force_checkpoint || force_checkpoint) {
203382
203859
  // we only need to checkpoint if there is anything in the WAL
203383
- CheckpointManager checkpointer(db);
203860
+ SingleFileCheckpointWriter checkpointer(db, *block_manager);
203384
203861
  checkpointer.CreateCheckpoint();
203385
203862
  }
203386
203863
  if (delete_wal) {
203387
- wal.Delete();
203864
+ wal->Delete();
203865
+ wal.reset();
203388
203866
  }
203389
203867
  }
203390
203868
 
203869
+ DatabaseSize SingleFileStorageManager::GetDatabaseSize() {
203870
+ // All members default to zero
203871
+ DatabaseSize ds;
203872
+ if (!InMemory()) {
203873
+ ds.total_blocks = block_manager->TotalBlocks();
203874
+ ds.block_size = Storage::BLOCK_ALLOC_SIZE;
203875
+ ds.free_blocks = block_manager->FreeBlocks();
203876
+ ds.used_blocks = ds.total_blocks - ds.free_blocks;
203877
+ ds.bytes = (ds.total_blocks * ds.block_size);
203878
+ if (auto wal = GetWriteAheadLog()) {
203879
+ ds.wal_size = wal->GetWALSize();
203880
+ }
203881
+ }
203882
+ return ds;
203883
+ }
203884
+
203885
+ bool SingleFileStorageManager::AutomaticCheckpoint(idx_t estimated_wal_bytes) {
203886
+ auto log = GetWriteAheadLog();
203887
+ if (!log) {
203888
+ return false;
203889
+ }
203890
+
203891
+ auto initial_size = log->GetWALSize();
203892
+ idx_t expected_wal_size = initial_size + estimated_wal_bytes;
203893
+ return expected_wal_size > db.config.options.checkpoint_wal_size;
203894
+ }
203895
+
203896
+ shared_ptr<TableIOManager> SingleFileStorageManager::GetTableIOManager(BoundCreateTableInfo *info /*info*/) {
203897
+ // This is an unmanaged reference. No ref/deref overhead. Lifetime of the
203898
+ // TableIOManager follows lifetime of the StorageManager (this).
203899
+ return shared_ptr<TableIOManager>(shared_ptr<char>(nullptr), table_io_manager.get());
203900
+ }
203901
+
203391
203902
  } // namespace duckdb
203392
203903
 
203393
203904
 
@@ -203679,8 +204190,8 @@ namespace duckdb {
203679
204190
  //! List column data represents a list
203680
204191
  class ListColumnData : public ColumnData {
203681
204192
  public:
203682
- ListColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
203683
- ColumnData *parent = nullptr);
204193
+ ListColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
204194
+ LogicalType type, ColumnData *parent = nullptr);
203684
204195
  ListColumnData(ColumnData &original, idx_t start_row, ColumnData *parent = nullptr);
203685
204196
 
203686
204197
  //! The child-column of the list
@@ -203714,8 +204225,8 @@ public:
203714
204225
 
203715
204226
  void CommitDropColumn() override;
203716
204227
 
203717
- unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
203718
- unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
204228
+ unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) override;
204229
+ unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
203719
204230
  ColumnCheckpointInfo &checkpoint_info) override;
203720
204231
 
203721
204232
  void DeserializeColumn(Deserializer &source) override;
@@ -203736,7 +204247,7 @@ private:
203736
204247
 
203737
204248
  namespace duckdb {
203738
204249
 
203739
- ColumnCheckpointState::ColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, TableDataWriter &writer)
204250
+ ColumnCheckpointState::ColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, RowGroupWriter &writer)
203740
204251
  : row_group(row_group), column_data(column_data), writer(writer) {
203741
204252
  }
203742
204253
 
@@ -203748,6 +204259,56 @@ unique_ptr<BaseStatistics> ColumnCheckpointState::GetStatistics() {
203748
204259
  return move(global_stats);
203749
204260
  }
203750
204261
 
204262
+ struct PartialBlockForCheckpoint : PartialBlock {
204263
+ struct PartialColumnSegment {
204264
+ ColumnSegment *segment;
204265
+ uint32_t offset_in_block;
204266
+ };
204267
+
204268
+ public:
204269
+ PartialBlockForCheckpoint(ColumnSegment *first_segment, BlockManager &block_manager, PartialBlockState state)
204270
+ : PartialBlock(state), first_segment(first_segment), block_manager(block_manager) {
204271
+ }
204272
+
204273
+ ~PartialBlockForCheckpoint() override {
204274
+ D_ASSERT(IsFlushed() || Exception::UncaughtException());
204275
+ }
204276
+
204277
+ // We will copy all subsequent segment data into the memory corresponding
204278
+ // to the first segment. Once the block is full (or checkpoint is complete)
204279
+ // we'll invoke Flush(), which will cause
204280
+ // the block to get written to storage (via BlockManager::ConvertToPersistent),
204281
+ // and all segments to have their references updated
204282
+ // (via ColumnSegment::ConvertToPersistent)
204283
+ ColumnSegment *first_segment;
204284
+ BlockManager &block_manager;
204285
+ vector<PartialColumnSegment> tail_segments;
204286
+
204287
+ public:
204288
+ bool IsFlushed() {
204289
+ // first_segment is zeroed on Flush
204290
+ return !first_segment;
204291
+ }
204292
+
204293
+ void Flush() override {
204294
+ // At this point, we've already copied all data from tail_segments
204295
+ // into the page owned by first_segment. We flush all segment data to
204296
+ // disk with the following call.
204297
+ first_segment->ConvertToPersistent(&block_manager, state.block_id);
204298
+ // Now that the page is persistent, update tail_segments to point to the
204299
+ // newly persistent block.
204300
+ for (auto e : tail_segments) {
204301
+ e.segment->MarkAsPersistent(first_segment->block, e.offset_in_block);
204302
+ }
204303
+ first_segment = nullptr;
204304
+ tail_segments.clear();
204305
+ }
204306
+
204307
+ void AddSegmentToTail(ColumnSegment *segment, uint32_t offset_in_block) {
204308
+ tail_segments.push_back({segment, offset_in_block});
204309
+ }
204310
+ };
204311
+
203751
204312
  void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_t segment_size) {
203752
204313
  D_ASSERT(segment_size <= Storage::BLOCK_SIZE);
203753
204314
  auto tuple_count = segment->count.load();
@@ -203761,46 +204322,41 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
203761
204322
  // get the buffer of the segment and pin it
203762
204323
  auto &db = column_data.GetDatabase();
203763
204324
  auto &buffer_manager = BufferManager::GetBufferManager(db);
203764
- auto &block_manager = BlockManager::GetBlockManager(db);
203765
- auto &checkpoint_manager = writer.GetCheckpointManager();
203766
-
203767
- bool block_is_constant = segment->stats.statistics->IsConstant();
203768
-
203769
204325
  block_id_t block_id = INVALID_BLOCK;
203770
204326
  uint32_t offset_in_block = 0;
203771
- bool need_to_write = true;
203772
- PartialBlock *partial_block = nullptr;
203773
- unique_ptr<PartialBlock> owned_partial_block;
203774
- if (!block_is_constant) {
204327
+
204328
+ if (!segment->stats.statistics->IsConstant()) {
203775
204329
  // non-constant block
203776
- // if the block is less than 80% full, we consider it a "partial block"
203777
- // which means we will try to fit it with other blocks
203778
- if (segment_size <= CheckpointManager::PARTIAL_BLOCK_THRESHOLD) {
203779
- // the block is a partial block
203780
- // check if there is a partial block available we can write to
203781
- if (checkpoint_manager.GetPartialBlock(segment.get(), segment_size, block_id, offset_in_block,
203782
- partial_block, owned_partial_block)) {
203783
- //! there is! increase the reference count of this block
203784
- block_manager.IncreaseBlockReferenceCount(block_id);
203785
- } else {
203786
- // there isn't: generate a new block for this segment
203787
- block_id = block_manager.GetFreeBlockId();
203788
- offset_in_block = 0;
203789
- need_to_write = false;
203790
- // now register this block as a partial block
203791
- checkpoint_manager.RegisterPartialBlock(segment.get(), segment_size, block_id);
203792
- }
204330
+ PartialBlockAllocation allocation = writer.GetBlockAllocation(segment_size);
204331
+ block_id = allocation.state.block_id;
204332
+ offset_in_block = allocation.state.offset_in_block;
204333
+
204334
+ if (allocation.partial_block) {
204335
+ // Use an existing block.
204336
+ D_ASSERT(offset_in_block > 0);
204337
+ auto pstate = (PartialBlockForCheckpoint *)allocation.partial_block.get();
204338
+ // pin the source block
204339
+ auto old_handle = buffer_manager.Pin(segment->block);
204340
+ // pin the target block
204341
+ auto new_handle = buffer_manager.Pin(pstate->first_segment->block);
204342
+ // memcpy the contents of the old block to the new block
204343
+ memcpy(new_handle.Ptr() + offset_in_block, old_handle.Ptr(), segment_size);
204344
+ pstate->AddSegmentToTail(segment.get(), offset_in_block);
203793
204345
  } else {
203794
- // full block: get a free block to write to
203795
- block_id = block_manager.GetFreeBlockId();
203796
- offset_in_block = 0;
204346
+ // Create a new block for future reuse.
204347
+ D_ASSERT(offset_in_block == 0);
204348
+ allocation.partial_block =
204349
+ make_unique<PartialBlockForCheckpoint>(segment.get(), *allocation.block_manager, allocation.state);
203797
204350
  }
204351
+ // Writer will decide whether to reuse this block.
204352
+ writer.RegisterPartialBlock(move(allocation));
203798
204353
  } else {
203799
204354
  // constant block: no need to write anything to disk besides the stats
203800
204355
  // set up the compression function to constant
203801
204356
  auto &config = DBConfig::GetConfig(db);
203802
204357
  segment->function =
203803
204358
  config.GetCompressionFunction(CompressionType::COMPRESSION_CONSTANT, segment->type.InternalType());
204359
+ segment->ConvertToPersistent(nullptr, INVALID_BLOCK);
203804
204360
  }
203805
204361
 
203806
204362
  // construct the data pointer
@@ -203816,43 +204372,13 @@ void ColumnCheckpointState::FlushSegment(unique_ptr<ColumnSegment> segment, idx_
203816
204372
  data_pointer.compression_type = segment->function->type;
203817
204373
  data_pointer.statistics = segment->stats.statistics->Copy();
203818
204374
 
203819
- if (need_to_write) {
203820
- if (partial_block) {
203821
- // pin the current block
203822
- auto old_handle = buffer_manager.Pin(segment->block);
203823
- // pin the new block
203824
- auto new_handle = buffer_manager.Pin(partial_block->block);
203825
- // memcpy the contents of the old block to the new block
203826
- memcpy(new_handle.Ptr() + offset_in_block, old_handle.Ptr(), segment_size);
203827
- } else {
203828
- // convert the segment into a persistent segment that points to this block
203829
- segment->ConvertToPersistent(block_id);
203830
- }
203831
- }
203832
- if (owned_partial_block) {
203833
- // the partial block has become full: write it to disk
203834
- owned_partial_block->FlushToDisk(db);
203835
- }
203836
-
203837
204375
  // append the segment to the new segment tree
203838
204376
  new_tree.AppendSegment(move(segment));
203839
204377
  data_pointers.push_back(move(data_pointer));
203840
204378
  }
203841
204379
 
203842
- void ColumnCheckpointState::FlushToDisk() {
203843
- auto &meta_writer = writer.GetTableWriter();
203844
-
203845
- meta_writer.Write<idx_t>(data_pointers.size());
203846
- // then write the data pointers themselves
203847
- for (idx_t k = 0; k < data_pointers.size(); k++) {
203848
- auto &data_pointer = data_pointers[k];
203849
- meta_writer.Write<idx_t>(data_pointer.row_start);
203850
- meta_writer.Write<idx_t>(data_pointer.tuple_count);
203851
- meta_writer.Write<block_id_t>(data_pointer.block_pointer.block_id);
203852
- meta_writer.Write<uint32_t>(data_pointer.block_pointer.offset);
203853
- meta_writer.Write<CompressionType>(data_pointer.compression_type);
203854
- data_pointer.statistics->Serialize(meta_writer);
203855
- }
204380
+ void ColumnCheckpointState::WriteDataPointers() {
204381
+ writer.WriteColumnDataPointers(*this);
203856
204382
  }
203857
204383
 
203858
204384
  } // namespace duckdb
@@ -203888,8 +204414,8 @@ namespace duckdb {
203888
204414
  //! Struct column data represents a struct
203889
204415
  class StructColumnData : public ColumnData {
203890
204416
  public:
203891
- StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
203892
- ColumnData *parent = nullptr);
204417
+ StructColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
204418
+ LogicalType type, ColumnData *parent = nullptr);
203893
204419
  StructColumnData(ColumnData &original, idx_t start_row, ColumnData *parent = nullptr);
203894
204420
 
203895
204421
  //! The sub-columns of the struct
@@ -203924,8 +204450,8 @@ public:
203924
204450
 
203925
204451
  void CommitDropColumn() override;
203926
204452
 
203927
- unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
203928
- unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
204453
+ unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) override;
204454
+ unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
203929
204455
  ColumnCheckpointInfo &checkpoint_info) override;
203930
204456
 
203931
204457
  void DeserializeColumn(Deserializer &source) override;
@@ -204049,13 +204575,15 @@ struct UpdateNode {
204049
204575
 
204050
204576
  namespace duckdb {
204051
204577
 
204052
- ColumnData::ColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type, ColumnData *parent)
204053
- : info(info), column_index(column_index), start(start_row), type(move(type)), parent(parent) {
204578
+ ColumnData::ColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
204579
+ LogicalType type, ColumnData *parent)
204580
+ : block_manager(block_manager), info(info), column_index(column_index), start(start_row), type(move(type)),
204581
+ parent(parent) {
204054
204582
  }
204055
204583
 
204056
204584
  ColumnData::ColumnData(ColumnData &other, idx_t start, ColumnData *parent)
204057
- : info(other.info), column_index(other.column_index), start(start), type(move(other.type)), parent(parent),
204058
- updates(move(other.updates)) {
204585
+ : block_manager(other.block_manager), info(other.info), column_index(other.column_index), start(start),
204586
+ type(move(other.type)), parent(parent), updates(move(other.updates)) {
204059
204587
  idx_t offset = 0;
204060
204588
  for (auto segment = other.data.GetRootSegment(); segment; segment = segment->next.get()) {
204061
204589
  auto &other = (ColumnSegment &)*segment;
@@ -204251,6 +204779,7 @@ void ColumnData::InitializeAppend(ColumnAppendState &state) {
204251
204779
 
204252
204780
  D_ASSERT(state.current->segment_type == ColumnSegmentType::TRANSIENT);
204253
204781
  state.current->InitializeAppend(state);
204782
+ D_ASSERT(state.current->function->append);
204254
204783
  }
204255
204784
 
204256
204785
  void ColumnData::AppendData(BaseStatistics &stats, ColumnAppendState &state, UnifiedVectorFormat &vdata, idx_t count) {
@@ -204353,7 +204882,6 @@ void ColumnData::AppendTransientSegment(idx_t start_row) {
204353
204882
  }
204354
204883
 
204355
204884
  void ColumnData::CommitDropColumn() {
204356
- auto &block_manager = BlockManager::GetBlockManager(GetDatabase());
204357
204885
  auto segment = (ColumnSegment *)data.GetRootSegment();
204358
204886
  while (segment) {
204359
204887
  if (segment->segment_type == ColumnSegmentType::PERSISTENT) {
@@ -204366,7 +204894,7 @@ void ColumnData::CommitDropColumn() {
204366
204894
  }
204367
204895
  }
204368
204896
 
204369
- unique_ptr<ColumnCheckpointState> ColumnData::CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) {
204897
+ unique_ptr<ColumnCheckpointState> ColumnData::CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) {
204370
204898
  return make_unique<ColumnCheckpointState>(row_group, *this, writer);
204371
204899
  }
204372
204900
 
@@ -204379,7 +204907,7 @@ void ColumnData::CheckpointScan(ColumnSegment *segment, ColumnScanState &state,
204379
204907
  }
204380
204908
  }
204381
204909
 
204382
- unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group, TableDataWriter &writer,
204910
+ unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
204383
204911
  ColumnCheckpointInfo &checkpoint_info) {
204384
204912
  // scan the segments of the column data
204385
204913
  // set up the checkpoint state
@@ -204416,16 +204944,17 @@ void ColumnData::DeserializeColumn(Deserializer &source) {
204416
204944
 
204417
204945
  // create a persistent segment
204418
204946
  auto segment = ColumnSegment::CreatePersistentSegment(
204419
- GetDatabase(), data_pointer.block_pointer.block_id, data_pointer.block_pointer.offset, type,
204947
+ GetDatabase(), block_manager, data_pointer.block_pointer.block_id, data_pointer.block_pointer.offset, type,
204420
204948
  data_pointer.row_start, data_pointer.tuple_count, data_pointer.compression_type,
204421
204949
  move(data_pointer.statistics));
204422
204950
  data.AppendSegment(move(segment));
204423
204951
  }
204424
204952
  }
204425
204953
 
204426
- shared_ptr<ColumnData> ColumnData::Deserialize(DataTableInfo &info, idx_t column_index, idx_t start_row,
204427
- Deserializer &source, const LogicalType &type, ColumnData *parent) {
204428
- auto entry = ColumnData::CreateColumn(info, column_index, start_row, type, parent);
204954
+ shared_ptr<ColumnData> ColumnData::Deserialize(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
204955
+ idx_t start_row, Deserializer &source, const LogicalType &type,
204956
+ ColumnData *parent) {
204957
+ auto entry = ColumnData::CreateColumn(block_manager, info, column_index, start_row, type, parent);
204429
204958
  entry->DeserializeColumn(source);
204430
204959
  return entry;
204431
204960
  }
@@ -204510,16 +205039,16 @@ void ColumnData::Verify(RowGroup &parent) {
204510
205039
  }
204511
205040
 
204512
205041
  template <class RET, class OP>
204513
- static RET CreateColumnInternal(DataTableInfo &info, idx_t column_index, idx_t start_row, const LogicalType &type,
204514
- ColumnData *parent) {
205042
+ static RET CreateColumnInternal(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
205043
+ const LogicalType &type, ColumnData *parent) {
204515
205044
  if (type.InternalType() == PhysicalType::STRUCT) {
204516
- return OP::template Create<StructColumnData>(info, column_index, start_row, type, parent);
205045
+ return OP::template Create<StructColumnData>(block_manager, info, column_index, start_row, type, parent);
204517
205046
  } else if (type.InternalType() == PhysicalType::LIST) {
204518
- return OP::template Create<ListColumnData>(info, column_index, start_row, type, parent);
205047
+ return OP::template Create<ListColumnData>(block_manager, info, column_index, start_row, type, parent);
204519
205048
  } else if (type.id() == LogicalTypeId::VALIDITY) {
204520
- return OP::template Create<ValidityColumnData>(info, column_index, start_row, parent);
205049
+ return OP::template Create<ValidityColumnData>(block_manager, info, column_index, start_row, parent);
204521
205050
  }
204522
- return OP::template Create<StandardColumnData>(info, column_index, start_row, type, parent);
205051
+ return OP::template Create<StandardColumnData>(block_manager, info, column_index, start_row, type, parent);
204523
205052
  }
204524
205053
 
204525
205054
  template <class RET, class OP>
@@ -204534,18 +205063,21 @@ static RET CreateColumnInternal(ColumnData &other, idx_t start_row, ColumnData *
204534
205063
  return OP::template Create<StandardColumnData>(other, start_row, parent);
204535
205064
  }
204536
205065
 
204537
- shared_ptr<ColumnData> ColumnData::CreateColumn(DataTableInfo &info, idx_t column_index, idx_t start_row,
204538
- const LogicalType &type, ColumnData *parent) {
204539
- return CreateColumnInternal<shared_ptr<ColumnData>, SharedConstructor>(info, column_index, start_row, type, parent);
205066
+ shared_ptr<ColumnData> ColumnData::CreateColumn(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
205067
+ idx_t start_row, const LogicalType &type, ColumnData *parent) {
205068
+ return CreateColumnInternal<shared_ptr<ColumnData>, SharedConstructor>(block_manager, info, column_index, start_row,
205069
+ type, parent);
204540
205070
  }
204541
205071
 
204542
205072
  shared_ptr<ColumnData> ColumnData::CreateColumn(ColumnData &other, idx_t start_row, ColumnData *parent) {
204543
205073
  return CreateColumnInternal<shared_ptr<ColumnData>, SharedConstructor>(other, start_row, parent);
204544
205074
  }
204545
205075
 
204546
- unique_ptr<ColumnData> ColumnData::CreateColumnUnique(DataTableInfo &info, idx_t column_index, idx_t start_row,
204547
- const LogicalType &type, ColumnData *parent) {
204548
- return CreateColumnInternal<unique_ptr<ColumnData>, UniqueConstructor>(info, column_index, start_row, type, parent);
205076
+ unique_ptr<ColumnData> ColumnData::CreateColumnUnique(BlockManager &block_manager, DataTableInfo &info,
205077
+ idx_t column_index, idx_t start_row, const LogicalType &type,
205078
+ ColumnData *parent) {
205079
+ return CreateColumnInternal<unique_ptr<ColumnData>, UniqueConstructor>(block_manager, info, column_index, start_row,
205080
+ type, parent);
204549
205081
  }
204550
205082
 
204551
205083
  unique_ptr<ColumnData> ColumnData::CreateColumnUnique(ColumnData &other, idx_t start_row, ColumnData *parent) {
@@ -204715,7 +205247,7 @@ void ColumnDataCheckpointer::WriteToDisk() {
204715
205247
  // first we check the current segments
204716
205248
  // if there are any persistent segments, we will mark their old block ids as modified
204717
205249
  // since the segments will be rewritten their old on disk data is no longer required
204718
- auto &block_manager = BlockManager::GetBlockManager(GetDatabase());
205250
+ auto &block_manager = col_data.block_manager;
204719
205251
  for (auto segment = (ColumnSegment *)owned_segment.get(); segment; segment = (ColumnSegment *)segment->next.get()) {
204720
205252
  if (segment->segment_type == ColumnSegmentType::PERSISTENT) {
204721
205253
  // persistent segment has updates: mark it as modified and rewrite the block with the merged updates
@@ -204825,26 +205357,32 @@ void ColumnDataCheckpointer::Checkpoint(unique_ptr<SegmentBase> segment) {
204825
205357
 
204826
205358
  namespace duckdb {
204827
205359
 
204828
- unique_ptr<ColumnSegment> ColumnSegment::CreatePersistentSegment(DatabaseInstance &db, block_id_t block_id,
204829
- idx_t offset, const LogicalType &type, idx_t start,
204830
- idx_t count, CompressionType compression_type,
205360
+ unique_ptr<ColumnSegment> ColumnSegment::CreatePersistentSegment(DatabaseInstance &db, BlockManager &block_manager,
205361
+ block_id_t block_id, idx_t offset,
205362
+ const LogicalType &type, idx_t start, idx_t count,
205363
+ CompressionType compression_type,
204831
205364
  unique_ptr<BaseStatistics> statistics) {
204832
205365
  auto &config = DBConfig::GetConfig(db);
204833
205366
  CompressionFunction *function;
205367
+ shared_ptr<BlockHandle> block;
204834
205368
  if (block_id == INVALID_BLOCK) {
205369
+ // constant segment, no need to allocate an actual block
204835
205370
  function = config.GetCompressionFunction(CompressionType::COMPRESSION_CONSTANT, type.InternalType());
204836
205371
  } else {
204837
205372
  function = config.GetCompressionFunction(compression_type, type.InternalType());
205373
+ block = block_manager.RegisterBlock(block_id);
204838
205374
  }
204839
- return make_unique<ColumnSegment>(db, type, ColumnSegmentType::PERSISTENT, start, count, function, move(statistics),
204840
- block_id, offset);
205375
+ return make_unique<ColumnSegment>(db, block, type, ColumnSegmentType::PERSISTENT, start, count, function,
205376
+ move(statistics), block_id, offset);
204841
205377
  }
204842
205378
 
204843
205379
  unique_ptr<ColumnSegment> ColumnSegment::CreateTransientSegment(DatabaseInstance &db, const LogicalType &type,
204844
205380
  idx_t start) {
204845
205381
  auto &config = DBConfig::GetConfig(db);
204846
205382
  auto function = config.GetCompressionFunction(CompressionType::COMPRESSION_UNCOMPRESSED, type.InternalType());
204847
- return make_unique<ColumnSegment>(db, type, ColumnSegmentType::TRANSIENT, start, 0, function, nullptr,
205383
+ // transient: allocate a buffer for the uncompressed segment
205384
+ auto block = BufferManager::GetBufferManager(db).RegisterMemory(Storage::BLOCK_SIZE, false);
205385
+ return make_unique<ColumnSegment>(db, block, type, ColumnSegmentType::TRANSIENT, start, 0, function, nullptr,
204848
205386
  INVALID_BLOCK, 0);
204849
205387
  }
204850
205388
 
@@ -204852,27 +205390,13 @@ unique_ptr<ColumnSegment> ColumnSegment::CreateSegment(ColumnSegment &other, idx
204852
205390
  return make_unique<ColumnSegment>(other, start);
204853
205391
  }
204854
205392
 
204855
- ColumnSegment::ColumnSegment(DatabaseInstance &db, LogicalType type_p, ColumnSegmentType segment_type, idx_t start,
204856
- idx_t count, CompressionFunction *function_p, unique_ptr<BaseStatistics> statistics,
204857
- block_id_t block_id_p, idx_t offset_p)
205393
+ ColumnSegment::ColumnSegment(DatabaseInstance &db, shared_ptr<BlockHandle> block, LogicalType type_p,
205394
+ ColumnSegmentType segment_type, idx_t start, idx_t count, CompressionFunction *function_p,
205395
+ unique_ptr<BaseStatistics> statistics, block_id_t block_id_p, idx_t offset_p)
204858
205396
  : SegmentBase(start, count), db(db), type(move(type_p)), type_size(GetTypeIdSize(type.InternalType())),
204859
- segment_type(segment_type), function(function_p), stats(type, move(statistics)), block_id(block_id_p),
204860
- offset(offset_p) {
205397
+ segment_type(segment_type), function(function_p), stats(type, move(statistics)), block(move(block)),
205398
+ block_id(block_id_p), offset(offset_p) {
204861
205399
  D_ASSERT(function);
204862
- auto &block_manager = BlockManager::GetBlockManager(db);
204863
- auto &buffer_manager = BufferManager::GetBufferManager(db);
204864
- if (block_id == INVALID_BLOCK) {
204865
- // no block id specified
204866
- // there are two cases here:
204867
- // transient: allocate a buffer for the uncompressed segment
204868
- // persistent: constant segment, no need to allocate anything
204869
- if (segment_type == ColumnSegmentType::TRANSIENT) {
204870
- this->block = buffer_manager.RegisterMemory(Storage::BLOCK_SIZE, false);
204871
- }
204872
- } else {
204873
- D_ASSERT(segment_type == ColumnSegmentType::PERSISTENT);
204874
- this->block = block_manager.RegisterBlock(block_id);
204875
- }
204876
205400
  if (function->init_segment) {
204877
205401
  segment_state = function->init_segment(*this, block_id);
204878
205402
  }
@@ -204960,22 +205484,23 @@ void ColumnSegment::RevertAppend(idx_t start_row) {
204960
205484
  //===--------------------------------------------------------------------===//
204961
205485
  // Convert To Persistent
204962
205486
  //===--------------------------------------------------------------------===//
204963
- void ColumnSegment::ConvertToPersistent(block_id_t block_id_p) {
205487
+ void ColumnSegment::ConvertToPersistent(BlockManager *block_manager, block_id_t block_id_p) {
204964
205488
  D_ASSERT(segment_type == ColumnSegmentType::TRANSIENT);
204965
205489
  segment_type = ColumnSegmentType::PERSISTENT;
205490
+
204966
205491
  block_id = block_id_p;
204967
205492
  offset = 0;
204968
205493
 
204969
205494
  if (block_id == INVALID_BLOCK) {
204970
205495
  // constant block: reset the block buffer
205496
+ D_ASSERT(stats.statistics->IsConstant());
204971
205497
  block.reset();
204972
205498
  } else {
205499
+ D_ASSERT(!stats.statistics->IsConstant());
204973
205500
  // non-constant block: write the block to disk
204974
- auto &block_manager = BlockManager::GetBlockManager(db);
204975
-
204976
205501
  // the data for the block already exists in-memory of our block
204977
205502
  // instead of copying the data we alter some metadata so the buffer points to an on-disk block
204978
- block = block_manager.ConvertToPersistent(block_id, move(block));
205503
+ block = block_manager->ConvertToPersistent(block_id, move(block));
204979
205504
  }
204980
205505
 
204981
205506
  segment_state.reset();
@@ -204984,10 +205509,11 @@ void ColumnSegment::ConvertToPersistent(block_id_t block_id_p) {
204984
205509
  }
204985
205510
  }
204986
205511
 
204987
- void ColumnSegment::ConvertToPersistent(shared_ptr<BlockHandle> block_p, block_id_t block_id_p, uint32_t offset_p) {
205512
+ void ColumnSegment::MarkAsPersistent(shared_ptr<BlockHandle> block_p, uint32_t offset_p) {
204988
205513
  D_ASSERT(segment_type == ColumnSegmentType::TRANSIENT);
204989
205514
  segment_type = ColumnSegmentType::PERSISTENT;
204990
- block_id = block_id_p;
205515
+
205516
+ block_id = block_p->BlockId();
204991
205517
  offset = offset_p;
204992
205518
  block = move(block_p);
204993
205519
 
@@ -205278,13 +205804,14 @@ idx_t ColumnSegment::FilterSelection(SelectionVector &sel, Vector &result, const
205278
205804
 
205279
205805
  namespace duckdb {
205280
205806
 
205281
- ListColumnData::ListColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type_p,
205282
- ColumnData *parent)
205283
- : ColumnData(info, column_index, start_row, move(type_p), parent), validity(info, 0, start_row, this) {
205807
+ ListColumnData::ListColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index, idx_t start_row,
205808
+ LogicalType type_p, ColumnData *parent)
205809
+ : ColumnData(block_manager, info, column_index, start_row, move(type_p), parent),
205810
+ validity(block_manager, info, 0, start_row, this) {
205284
205811
  D_ASSERT(type.InternalType() == PhysicalType::LIST);
205285
205812
  auto &child_type = ListType::GetChildType(type);
205286
205813
  // the child column, with column index 1 (0 is the validity mask)
205287
- child_column = ColumnData::CreateColumnUnique(info, 1, start_row, child_type, this);
205814
+ child_column = ColumnData::CreateColumnUnique(block_manager, info, 1, start_row, child_type, this);
205288
205815
  }
205289
205816
 
205290
205817
  ListColumnData::ListColumnData(ColumnData &original, idx_t start_row, ColumnData *parent)
@@ -205595,7 +206122,7 @@ void ListColumnData::CommitDropColumn() {
205595
206122
  }
205596
206123
 
205597
206124
  struct ListColumnCheckpointState : public ColumnCheckpointState {
205598
- ListColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, TableDataWriter &writer)
206125
+ ListColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, RowGroupWriter &writer)
205599
206126
  : ColumnCheckpointState(row_group, column_data, writer) {
205600
206127
  global_stats = make_unique<ListStatistics>(column_data.type);
205601
206128
  }
@@ -205612,18 +206139,18 @@ public:
205612
206139
  return stats;
205613
206140
  }
205614
206141
 
205615
- void FlushToDisk() override {
205616
- ColumnCheckpointState::FlushToDisk();
205617
- validity_state->FlushToDisk();
205618
- child_state->FlushToDisk();
206142
+ void WriteDataPointers() override {
206143
+ ColumnCheckpointState::WriteDataPointers();
206144
+ validity_state->WriteDataPointers();
206145
+ child_state->WriteDataPointers();
205619
206146
  }
205620
206147
  };
205621
206148
 
205622
- unique_ptr<ColumnCheckpointState> ListColumnData::CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) {
206149
+ unique_ptr<ColumnCheckpointState> ListColumnData::CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) {
205623
206150
  return make_unique<ListColumnCheckpointState>(row_group, *this, writer);
205624
206151
  }
205625
206152
 
205626
- unique_ptr<ColumnCheckpointState> ListColumnData::Checkpoint(RowGroup &row_group, TableDataWriter &writer,
206153
+ unique_ptr<ColumnCheckpointState> ListColumnData::Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
205627
206154
  ColumnCheckpointInfo &checkpoint_info) {
205628
206155
  auto validity_state = validity.Checkpoint(row_group, writer, checkpoint_info);
205629
206156
  auto base_state = ColumnData::Checkpoint(row_group, writer, checkpoint_info);
@@ -205682,24 +206209,27 @@ namespace duckdb {
205682
206209
  constexpr const idx_t RowGroup::ROW_GROUP_VECTOR_COUNT;
205683
206210
  constexpr const idx_t RowGroup::ROW_GROUP_SIZE;
205684
206211
 
205685
- RowGroup::RowGroup(DatabaseInstance &db, DataTableInfo &table_info, idx_t start, idx_t count)
205686
- : SegmentBase(start, count), db(db), table_info(table_info) {
206212
+ RowGroup::RowGroup(DatabaseInstance &db, BlockManager &block_manager, DataTableInfo &table_info, idx_t start,
206213
+ idx_t count)
206214
+ : SegmentBase(start, count), db(db), block_manager(block_manager), table_info(table_info) {
205687
206215
 
205688
206216
  Verify();
205689
206217
  }
205690
206218
 
205691
- RowGroup::RowGroup(DatabaseInstance &db, DataTableInfo &table_info, const vector<LogicalType> &types,
205692
- RowGroupPointer &pointer)
205693
- : SegmentBase(pointer.row_start, pointer.tuple_count), db(db), table_info(table_info) {
206219
+ RowGroup::RowGroup(DatabaseInstance &db, BlockManager &block_manager, DataTableInfo &table_info,
206220
+ const vector<LogicalType> &types, RowGroupPointer &&pointer)
206221
+ : SegmentBase(pointer.row_start, pointer.tuple_count), db(db), block_manager(block_manager),
206222
+ table_info(table_info) {
205694
206223
  // deserialize the columns
205695
206224
  if (pointer.data_pointers.size() != types.size()) {
205696
206225
  throw IOException("Row group column count is unaligned with table column count. Corrupt file?");
205697
206226
  }
205698
206227
  for (idx_t i = 0; i < pointer.data_pointers.size(); i++) {
205699
206228
  auto &block_pointer = pointer.data_pointers[i];
205700
- MetaBlockReader column_data_reader(db, block_pointer.block_id);
206229
+ MetaBlockReader column_data_reader(block_manager, block_pointer.block_id);
205701
206230
  column_data_reader.offset = block_pointer.offset;
205702
- this->columns.push_back(ColumnData::Deserialize(table_info, i, start, column_data_reader, types[i], nullptr));
206231
+ this->columns.push_back(
206232
+ ColumnData::Deserialize(block_manager, table_info, i, start, column_data_reader, types[i], nullptr));
205703
206233
  }
205704
206234
 
205705
206235
  // set up the statistics
@@ -205713,8 +206243,8 @@ RowGroup::RowGroup(DatabaseInstance &db, DataTableInfo &table_info, const vector
205713
206243
  }
205714
206244
 
205715
206245
  RowGroup::RowGroup(RowGroup &row_group, idx_t start)
205716
- : SegmentBase(start, row_group.count), db(row_group.db), table_info(row_group.table_info),
205717
- version_info(move(row_group.version_info)), stats(move(row_group.stats)) {
206246
+ : SegmentBase(start, row_group.count), db(row_group.db), block_manager(row_group.block_manager),
206247
+ table_info(row_group.table_info), version_info(move(row_group.version_info)), stats(move(row_group.stats)) {
205718
206248
  for (auto &column : row_group.columns) {
205719
206249
  this->columns.push_back(ColumnData::CreateColumn(*column, start));
205720
206250
  }
@@ -205740,7 +206270,7 @@ RowGroup::~RowGroup() {
205740
206270
  void RowGroup::InitializeEmpty(const vector<LogicalType> &types) {
205741
206271
  // set up the segment trees for the column segments
205742
206272
  for (idx_t i = 0; i < types.size(); i++) {
205743
- auto column_data = ColumnData::CreateColumn(GetTableInfo(), i, start, types[i]);
206273
+ auto column_data = ColumnData::CreateColumn(block_manager, GetTableInfo(), i, start, types[i]);
205744
206274
  stats.push_back(make_shared<SegmentStatistics>(types[i]));
205745
206275
  columns.push_back(move(column_data));
205746
206276
  }
@@ -205802,7 +206332,7 @@ unique_ptr<RowGroup> RowGroup::AlterType(const LogicalType &target_type, idx_t c
205802
206332
  Verify();
205803
206333
 
205804
206334
  // construct a new column data for this type
205805
- auto column_data = ColumnData::CreateColumn(GetTableInfo(), changed_idx, start, target_type);
206335
+ auto column_data = ColumnData::CreateColumn(block_manager, GetTableInfo(), changed_idx, start, target_type);
205806
206336
 
205807
206337
  ColumnAppendState append_state;
205808
206338
  column_data->InitializeAppend(append_state);
@@ -205825,7 +206355,7 @@ unique_ptr<RowGroup> RowGroup::AlterType(const LogicalType &target_type, idx_t c
205825
206355
  }
205826
206356
 
205827
206357
  // set up the row_group based on this row_group
205828
- auto row_group = make_unique<RowGroup>(db, table_info, this->start, this->count);
206358
+ auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
205829
206359
  row_group->version_info = version_info;
205830
206360
  for (idx_t i = 0; i < columns.size(); i++) {
205831
206361
  if (i == changed_idx) {
@@ -205847,7 +206377,8 @@ unique_ptr<RowGroup> RowGroup::AddColumn(ColumnDefinition &new_column, Expressio
205847
206377
  Verify();
205848
206378
 
205849
206379
  // construct a new column data for the new column
205850
- auto added_column = ColumnData::CreateColumn(GetTableInfo(), columns.size(), start, new_column.Type());
206380
+ auto added_column =
206381
+ ColumnData::CreateColumn(block_manager, GetTableInfo(), columns.size(), start, new_column.Type());
205851
206382
  auto added_col_stats = make_shared<SegmentStatistics>(
205852
206383
  new_column.Type(), BaseStatistics::CreateEmpty(new_column.Type(), StatisticsType::LOCAL_STATS));
205853
206384
 
@@ -205868,7 +206399,7 @@ unique_ptr<RowGroup> RowGroup::AddColumn(ColumnDefinition &new_column, Expressio
205868
206399
  }
205869
206400
 
205870
206401
  // set up the row_group based on this row_group
205871
- auto row_group = make_unique<RowGroup>(db, table_info, this->start, this->count);
206402
+ auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
205872
206403
  row_group->version_info = version_info;
205873
206404
  row_group->columns = columns;
205874
206405
  row_group->stats = stats;
@@ -205885,7 +206416,7 @@ unique_ptr<RowGroup> RowGroup::RemoveColumn(idx_t removed_column) {
205885
206416
 
205886
206417
  D_ASSERT(removed_column < columns.size());
205887
206418
 
205888
- auto row_group = make_unique<RowGroup>(db, table_info, this->start, this->count);
206419
+ auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
205889
206420
  row_group->version_info = version_info;
205890
206421
  row_group->columns = columns;
205891
206422
  row_group->stats = stats;
@@ -206342,12 +206873,19 @@ void RowGroup::MergeIntoStatistics(idx_t column_idx, BaseStatistics &other) {
206342
206873
  other.Merge(*stats[column_idx]->statistics);
206343
206874
  }
206344
206875
 
206345
- RowGroupPointer RowGroup::Checkpoint(TableDataWriter &writer, vector<unique_ptr<BaseStatistics>> &global_stats) {
206876
+ RowGroupPointer RowGroup::Checkpoint(RowGroupWriter &writer, vector<unique_ptr<BaseStatistics>> &global_stats) {
206346
206877
  RowGroupPointer row_group_pointer;
206347
206878
  vector<unique_ptr<ColumnCheckpointState>> states;
206348
206879
  states.reserve(columns.size());
206349
206880
 
206350
- // checkpoint the individual columns of the row group
206881
+ // Checkpoint the individual columns of the row group
206882
+ // Here we're iterating over columns. Each column can have multiple segments.
206883
+ // (Some columns will be wider than others, and require different numbers
206884
+ // of blocks to encode.) Segments cannot span blocks.
206885
+ //
206886
+ // Some of these columns are composite (list, struct). The data is written
206887
+ // first sequentially, and the pointers are written later, so that the
206888
+ // pointers all end up densely packed, and thus more cache-friendly.
206351
206889
  for (idx_t column_idx = 0; column_idx < columns.size(); column_idx++) {
206352
206890
  auto &column = columns[column_idx];
206353
206891
  ColumnCheckpointInfo checkpoint_info {writer.GetColumnCompressionType(column_idx)};
@@ -206367,15 +206905,18 @@ RowGroupPointer RowGroup::Checkpoint(TableDataWriter &writer, vector<unique_ptr<
206367
206905
  row_group_pointer.row_start = start;
206368
206906
  row_group_pointer.tuple_count = count;
206369
206907
  for (auto &state : states) {
206370
- // get the current position of the meta data writer
206371
- auto &meta_writer = writer.GetTableWriter();
206372
- auto pointer = meta_writer.GetBlockPointer();
206908
+ // get the current position of the table data writer
206909
+ auto &data_writer = writer.GetPayloadWriter();
206910
+ auto pointer = data_writer.GetBlockPointer();
206373
206911
 
206374
206912
  // store the stats and the data pointers in the row group pointers
206375
206913
  row_group_pointer.data_pointers.push_back(pointer);
206376
206914
 
206377
- // now flush the actual column data to disk
206378
- state->FlushToDisk();
206915
+ // Write pointers to the column segments.
206916
+ //
206917
+ // Just as above, the state can refer to many other states, so this
206918
+ // can cascade recursively into more pointer writes.
206919
+ state->WriteDataPointers();
206379
206920
  }
206380
206921
  row_group_pointer.versions = version_info;
206381
206922
  Verify();
@@ -206592,11 +207133,13 @@ void VersionDeleteState::Flush() {
206592
207133
 
206593
207134
 
206594
207135
 
207136
+
206595
207137
  namespace duckdb {
206596
207138
 
206597
- RowGroupCollection::RowGroupCollection(shared_ptr<DataTableInfo> info_p, vector<LogicalType> types_p, idx_t row_start_p,
206598
- idx_t total_rows_p)
206599
- : total_rows(total_rows_p), info(move(info_p)), types(move(types_p)), row_start(row_start_p) {
207139
+ RowGroupCollection::RowGroupCollection(shared_ptr<DataTableInfo> info_p, BlockManager &block_manager,
207140
+ vector<LogicalType> types_p, idx_t row_start_p, idx_t total_rows_p)
207141
+ : block_manager(block_manager), total_rows(total_rows_p), info(move(info_p)), types(move(types_p)),
207142
+ row_start(row_start_p) {
206600
207143
  row_groups = make_shared<SegmentTree>();
206601
207144
  }
206602
207145
 
@@ -206618,7 +207161,7 @@ Allocator &RowGroupCollection::GetAllocator() const {
206618
207161
  void RowGroupCollection::Initialize(PersistentTableData &data) {
206619
207162
  D_ASSERT(this->row_start == 0);
206620
207163
  for (auto &row_group_pointer : data.row_groups) {
206621
- auto new_row_group = make_unique<RowGroup>(info->db, *info, types, row_group_pointer);
207164
+ auto new_row_group = make_unique<RowGroup>(info->db, block_manager, *info, types, move(row_group_pointer));
206622
207165
  auto row_group_count = new_row_group->start + new_row_group->count;
206623
207166
  if (row_group_count > this->total_rows) {
206624
207167
  this->total_rows = row_group_count;
@@ -206629,7 +207172,7 @@ void RowGroupCollection::Initialize(PersistentTableData &data) {
206629
207172
 
206630
207173
  void RowGroupCollection::AppendRowGroup(idx_t start_row) {
206631
207174
  D_ASSERT(start_row >= row_start);
206632
- auto new_row_group = make_unique<RowGroup>(info->db, *info, start_row, 0);
207175
+ auto new_row_group = make_unique<RowGroup>(info->db, block_manager, *info, start_row, 0);
206633
207176
  new_row_group->InitializeEmpty(types);
206634
207177
  row_groups->AppendSegment(move(new_row_group));
206635
207178
  }
@@ -206997,13 +207540,12 @@ void RowGroupCollection::UpdateColumn(TransactionData transaction, Vector &row_i
206997
207540
  //===--------------------------------------------------------------------===//
206998
207541
  // Checkpoint
206999
207542
  //===--------------------------------------------------------------------===//
207000
- void RowGroupCollection::Checkpoint(TableDataWriter &writer, vector<RowGroupPointer> &row_group_pointers,
207001
- vector<unique_ptr<BaseStatistics>> &global_stats) {
207002
- auto row_group = (RowGroup *)row_groups->GetRootSegment();
207003
- while (row_group) {
207004
- auto pointer = row_group->Checkpoint(writer, global_stats);
207005
- row_group_pointers.push_back(move(pointer));
207006
- row_group = (RowGroup *)row_group->next.get();
207543
+ void RowGroupCollection::Checkpoint(TableDataWriter &writer, vector<unique_ptr<BaseStatistics>> &global_stats) {
207544
+ for (auto row_group = (RowGroup *)row_groups->GetRootSegment(); row_group;
207545
+ row_group = (RowGroup *)row_group->next.get()) {
207546
+ auto rowg_writer = writer.GetRowGroupWriter(*row_group);
207547
+ auto pointer = row_group->Checkpoint(*rowg_writer, global_stats);
207548
+ writer.AddRowGroup(move(pointer), move(rowg_writer));
207007
207549
  }
207008
207550
  }
207009
207551
 
@@ -207052,7 +207594,7 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AddColumn(ColumnDefinition &n
207052
207594
  idx_t new_column_idx = types.size();
207053
207595
  auto new_types = types;
207054
207596
  new_types.push_back(new_column.GetType());
207055
- auto result = make_shared<RowGroupCollection>(info, move(new_types), row_start, total_rows.load());
207597
+ auto result = make_shared<RowGroupCollection>(info, block_manager, move(new_types), row_start, total_rows.load());
207056
207598
 
207057
207599
  ExpressionExecutor executor(GetAllocator());
207058
207600
  DataChunk dummy_chunk;
@@ -207082,7 +207624,7 @@ shared_ptr<RowGroupCollection> RowGroupCollection::RemoveColumn(idx_t col_idx) {
207082
207624
  auto new_types = types;
207083
207625
  new_types.erase(new_types.begin() + col_idx);
207084
207626
 
207085
- auto result = make_shared<RowGroupCollection>(info, move(new_types), row_start, total_rows.load());
207627
+ auto result = make_shared<RowGroupCollection>(info, block_manager, move(new_types), row_start, total_rows.load());
207086
207628
 
207087
207629
  auto current_row_group = (RowGroup *)row_groups->GetRootSegment();
207088
207630
  while (current_row_group) {
@@ -207100,7 +207642,7 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AlterType(idx_t changed_idx,
207100
207642
  auto new_types = types;
207101
207643
  new_types[changed_idx] = target_type;
207102
207644
 
207103
- auto result = make_shared<RowGroupCollection>(info, move(new_types), row_start, total_rows.load());
207645
+ auto result = make_shared<RowGroupCollection>(info, block_manager, move(new_types), row_start, total_rows.load());
207104
207646
 
207105
207647
  vector<LogicalType> scan_types;
207106
207648
  for (idx_t i = 0; i < bound_columns.size(); i++) {
@@ -207366,9 +207908,10 @@ void SegmentTree::Replace(SegmentTree &other) {
207366
207908
 
207367
207909
  namespace duckdb {
207368
207910
 
207369
- StandardColumnData::StandardColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
207370
- ColumnData *parent)
207371
- : ColumnData(info, column_index, start_row, move(type), parent), validity(info, 0, start_row, this) {
207911
+ StandardColumnData::StandardColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
207912
+ idx_t start_row, LogicalType type, ColumnData *parent)
207913
+ : ColumnData(block_manager, info, column_index, start_row, move(type), parent),
207914
+ validity(block_manager, info, 0, start_row, this) {
207372
207915
  }
207373
207916
 
207374
207917
  StandardColumnData::StandardColumnData(ColumnData &original, idx_t start_row, ColumnData *parent)
@@ -207516,7 +208059,7 @@ void StandardColumnData::CommitDropColumn() {
207516
208059
  }
207517
208060
 
207518
208061
  struct StandardColumnCheckpointState : public ColumnCheckpointState {
207519
- StandardColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, TableDataWriter &writer)
208062
+ StandardColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, RowGroupWriter &writer)
207520
208063
  : ColumnCheckpointState(row_group, column_data, writer) {
207521
208064
  }
207522
208065
 
@@ -207529,18 +208072,18 @@ public:
207529
208072
  return move(global_stats);
207530
208073
  }
207531
208074
 
207532
- void FlushToDisk() override {
207533
- ColumnCheckpointState::FlushToDisk();
207534
- validity_state->FlushToDisk();
208075
+ void WriteDataPointers() override {
208076
+ ColumnCheckpointState::WriteDataPointers();
208077
+ validity_state->WriteDataPointers();
207535
208078
  }
207536
208079
  };
207537
208080
 
207538
208081
  unique_ptr<ColumnCheckpointState> StandardColumnData::CreateCheckpointState(RowGroup &row_group,
207539
- TableDataWriter &writer) {
208082
+ RowGroupWriter &writer) {
207540
208083
  return make_unique<StandardColumnCheckpointState>(row_group, *this, writer);
207541
208084
  }
207542
208085
 
207543
- unique_ptr<ColumnCheckpointState> StandardColumnData::Checkpoint(RowGroup &row_group, TableDataWriter &writer,
208086
+ unique_ptr<ColumnCheckpointState> StandardColumnData::Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
207544
208087
  ColumnCheckpointInfo &checkpoint_info) {
207545
208088
  auto validity_state = validity.Checkpoint(row_group, writer, checkpoint_info);
207546
208089
  auto base_state = ColumnData::Checkpoint(row_group, writer, checkpoint_info);
@@ -207582,9 +208125,10 @@ void StandardColumnData::Verify(RowGroup &parent) {
207582
208125
 
207583
208126
  namespace duckdb {
207584
208127
 
207585
- StructColumnData::StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type_p,
207586
- ColumnData *parent)
207587
- : ColumnData(info, column_index, start_row, move(type_p), parent), validity(info, 0, start_row, this) {
208128
+ StructColumnData::StructColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
208129
+ idx_t start_row, LogicalType type_p, ColumnData *parent)
208130
+ : ColumnData(block_manager, info, column_index, start_row, move(type_p), parent),
208131
+ validity(block_manager, info, 0, start_row, this) {
207588
208132
  D_ASSERT(type.InternalType() == PhysicalType::STRUCT);
207589
208133
  auto &child_types = StructType::GetChildTypes(type);
207590
208134
  D_ASSERT(child_types.size() > 0);
@@ -207592,7 +208136,7 @@ StructColumnData::StructColumnData(DataTableInfo &info, idx_t column_index, idx_
207592
208136
  idx_t sub_column_index = 1;
207593
208137
  for (auto &child_type : child_types) {
207594
208138
  sub_columns.push_back(
207595
- ColumnData::CreateColumnUnique(info, sub_column_index, start_row, child_type.second, this));
208139
+ ColumnData::CreateColumnUnique(block_manager, info, sub_column_index, start_row, child_type.second, this));
207596
208140
  sub_column_index++;
207597
208141
  }
207598
208142
  }
@@ -207814,7 +208358,7 @@ void StructColumnData::CommitDropColumn() {
207814
208358
  }
207815
208359
 
207816
208360
  struct StructColumnCheckpointState : public ColumnCheckpointState {
207817
- StructColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, TableDataWriter &writer)
208361
+ StructColumnCheckpointState(RowGroup &row_group, ColumnData &column_data, RowGroupWriter &writer)
207818
208362
  : ColumnCheckpointState(row_group, column_data, writer) {
207819
208363
  global_stats = make_unique<StructStatistics>(column_data.type);
207820
208364
  }
@@ -207834,20 +208378,19 @@ public:
207834
208378
  return move(stats);
207835
208379
  }
207836
208380
 
// Override that is renamed from FlushToDisk() (removed lines) to WriteDataPointers() (added lines).
// In both versions the call is forwarded to validity_state first and then to every entry of
// child_states, in iteration order; only the name of the forwarded method changes.
207837
- void FlushToDisk() override {
207838
- validity_state->FlushToDisk();
208381
+ void WriteDataPointers() override {
208382
+ validity_state->WriteDataPointers();
207839
208383
  for (auto &state : child_states) {
207840
- state->FlushToDisk();
208384
+ state->WriteDataPointers();
207841
208385
  }
207842
208386
  }
207843
208387
  };
207844
208388
 
// Returns a newly allocated StructColumnCheckpointState constructed from row_group, this column
// (*this) and writer. The body is unchanged context; the diff only replaces the `writer`
// parameter type TableDataWriter & with RowGroupWriter & and joins the signature onto one line.
207845
- unique_ptr<ColumnCheckpointState> StructColumnData::CreateCheckpointState(RowGroup &row_group,
207846
- TableDataWriter &writer) {
208389
+ unique_ptr<ColumnCheckpointState> StructColumnData::CreateCheckpointState(RowGroup &row_group, RowGroupWriter &writer) {
207847
208390
  return make_unique<StructColumnCheckpointState>(row_group, *this, writer);
207848
208391
  }
207849
208392
 
207850
- unique_ptr<ColumnCheckpointState> StructColumnData::Checkpoint(RowGroup &row_group, TableDataWriter &writer,
208393
+ unique_ptr<ColumnCheckpointState> StructColumnData::Checkpoint(RowGroup &row_group, RowGroupWriter &writer,
207851
208394
  ColumnCheckpointInfo &checkpoint_info) {
207852
208395
  auto checkpoint_state = make_unique<StructColumnCheckpointState>(row_group, *this, writer);
207853
208396
  checkpoint_state->validity_state = validity.Checkpoint(row_group, writer, checkpoint_info);
@@ -209279,8 +209822,9 @@ bool UpdateSegment::HasUpdates(idx_t start_row_index, idx_t end_row_index) {
209279
209822
 
209280
209823
  namespace duckdb {
209281
209824
 
// Constructor with an empty body: all arguments are forwarded to the ColumnData base, with the
// logical type fixed to LogicalType(LogicalTypeId::VALIDITY). The added version takes a
// BlockManager & as a new first parameter and passes it through as the first base argument.
209282
- ValidityColumnData::ValidityColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, ColumnData *parent)
209283
- : ColumnData(info, column_index, start_row, LogicalType(LogicalTypeId::VALIDITY), parent) {
209825
+ ValidityColumnData::ValidityColumnData(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
209826
+ idx_t start_row, ColumnData *parent)
209827
+ : ColumnData(block_manager, info, column_index, start_row, LogicalType(LogicalTypeId::VALIDITY), parent) {
209284
209828
  }
209285
209829
 
209286
209830
  ValidityColumnData::ValidityColumnData(ColumnData &original, idx_t start_row, ColumnData *parent)
@@ -209384,6 +209928,7 @@ vector<BlockPointer> TableIndexList::SerializeIndexes(duckdb::MetaBlockWriter &w
209384
209928
 
209385
209929
 
209386
209930
 
209931
+
209387
209932
  namespace duckdb {
209388
209933
 
209389
209934
  class ReplayState {
@@ -209472,8 +210017,8 @@ bool WriteAheadLog::Replay(DatabaseInstance &database, string &path) {
209472
210017
  initial_reader.reset();
209473
210018
  if (checkpoint_state.checkpoint_id != INVALID_BLOCK) {
209474
210019
  // there is a checkpoint flag: check if we need to deserialize the WAL
209475
- auto &manager = BlockManager::GetBlockManager(database);
209476
- if (manager.IsRootBlock(checkpoint_state.checkpoint_id)) {
210020
+ auto &manager = StorageManager::GetStorageManager(database);
210021
+ if (manager.IsCheckpointClean(checkpoint_state.checkpoint_id)) {
209477
210022
  // the contents of the WAL have already been checkpointed
209478
210023
  // we can safely truncate the WAL and ignore its contents
209479
210024
  return true;
@@ -209904,15 +210449,14 @@ void ReplayState::ReplayCheckpoint() {
209904
210449
 
209905
210450
  namespace duckdb {
209906
210451
 
// Removed version: a constructor with an empty body (initialized = false, skip_writing = false)
// plus a separate Initialize(string &path) that stored wal_path, created the BufferedFileWriter
// and set initialized = true.
// Added version: the constructor takes the path itself and performs the same wal_path assignment
// and writer creation (flags FILE_FLAGS_WRITE | FILE_FLAGS_FILE_CREATE | FILE_FLAGS_APPEND);
// the `initialized` member is no longer set, and an empty destructor definition is added.
209907
- WriteAheadLog::WriteAheadLog(DatabaseInstance &database) : initialized(false), skip_writing(false), database(database) {
209908
- }
209909
-
209910
- void WriteAheadLog::Initialize(string &path) {
210452
+ WriteAheadLog::WriteAheadLog(DatabaseInstance &database, const string &path) : skip_writing(false), database(database) {
209911
210453
  wal_path = path;
209912
210454
  writer = make_unique<BufferedFileWriter>(database.GetFileSystem(), path.c_str(),
209913
210455
  FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE |
209914
210456
  FileFlags::FILE_FLAGS_APPEND);
209915
- initialized = true;
210457
+ }
210458
+
210459
+ WriteAheadLog::~WriteAheadLog() {
209916
210460
  }
209917
210461
 
209918
210462
  int64_t WriteAheadLog::GetWALSize() {
@@ -209930,10 +210474,9 @@ void WriteAheadLog::Truncate(int64_t size) {
209930
210474
  }
209931
210475
 
209932
210476
  void WriteAheadLog::Delete() {
209933
- if (!initialized) {
210477
+ if (!writer) {
209934
210478
  return;
209935
210479
  }
209936
- initialized = false;
209937
210480
  writer.reset();
209938
210481
 
209939
210482
  auto &fs = FileSystem::GetFileSystem(database);
@@ -210869,41 +211412,23 @@ bool Transaction::ChangesMade() {
210869
211412
  }
210870
211413
 
// Removed version: returned false when the storage manager had no write-ahead log; otherwise it
// added log->GetWALSize() to storage.EstimatedSize() + undo_buffer.EstimatedSize() and returned
// whether that sum exceeded config.options.checkpoint_wal_size.
// Added version: passes storage.EstimatedSize() + undo_buffer.EstimatedSize() to
// StorageManager::AutomaticCheckpoint and returns its result; the WAL lookup and the config
// comparison no longer appear in this function.
210871
211414
  bool Transaction::AutomaticCheckpoint(DatabaseInstance &db) {
210872
- auto &config = DBConfig::GetConfig(db);
210873
211415
  auto &storage_manager = StorageManager::GetStorageManager(db);
210874
- auto log = storage_manager.GetWriteAheadLog();
210875
- if (!log) {
210876
- return false;
210877
- }
210878
-
210879
- auto initial_size = log->GetWALSize();
210880
- idx_t expected_wal_size = initial_size + storage.EstimatedSize() + undo_buffer.EstimatedSize();
210881
- return expected_wal_size > config.options.checkpoint_wal_size;
211416
+ return storage_manager.AutomaticCheckpoint(storage.EstimatedSize() + undo_buffer.EstimatedSize());
210882
211417
  }
210883
211418
 
210884
211419
  string Transaction::Commit(DatabaseInstance &db, transaction_t commit_id, bool checkpoint) noexcept {
211420
+ // "checkpoint" parameter indicates if the caller will checkpoint. If checkpoint ==
211421
+ // true: Then this function will NOT write to the WAL or flush/persist.
211422
+ // This method only makes commit in memory, expecting caller to checkpoint/flush.
211423
+ // false: Then this function WILL write to the WAL and Flush/Persist it.
210885
211424
  this->commit_id = commit_id;
210886
211425
  auto &storage_manager = StorageManager::GetStorageManager(db);
210887
211426
  auto log = storage_manager.GetWriteAheadLog();
210888
211427
 
210889
211428
  UndoBuffer::IteratorState iterator_state;
210890
211429
  LocalStorage::CommitState commit_state;
210891
- idx_t initial_wal_size = 0;
210892
- idx_t initial_written = 0;
210893
- if (log) {
210894
- auto initial_size = log->GetWALSize();
210895
- initial_written = log->GetTotalWritten();
210896
- initial_wal_size = initial_size < 0 ? 0 : idx_t(initial_size);
210897
- } else {
210898
- D_ASSERT(!checkpoint);
210899
- }
211430
+ auto storage_commit_state = storage_manager.GenStorageCommitState(*this, checkpoint);
210900
211431
  try {
210901
- if (checkpoint) {
210902
- // check if we are checkpointing after this commit
210903
- // if we are checkpointing, we don't need to write anything to the WAL
210904
- // this saves us a lot of unnecessary writes to disk in the case of large commits
210905
- log->skip_writing = true;
210906
- }
210907
211432
  storage.Commit(commit_state, *this, log, commit_id);
210908
211433
  undo_buffer.Commit(iterator_state, log, commit_id);
210909
211434
  if (log) {
@@ -210911,25 +211436,11 @@ string Transaction::Commit(DatabaseInstance &db, transaction_t commit_id, bool c
210911
211436
  for (auto &entry : sequence_usage) {
210912
211437
  log->WriteSequenceValue(entry.first, entry.second);
210913
211438
  }
210914
- // flush the WAL if any changes were made
210915
- if (log->GetTotalWritten() > initial_written) {
210916
- D_ASSERT(!checkpoint);
210917
- D_ASSERT(!log->skip_writing);
210918
- log->Flush();
210919
- }
210920
- log->skip_writing = false;
210921
211439
  }
211440
+ storage_commit_state->FlushCommit();
210922
211441
  return string();
210923
211442
  } catch (std::exception &ex) {
210924
211443
  undo_buffer.RevertCommit(iterator_state, transaction_id);
210925
- if (log) {
210926
- log->skip_writing = false;
210927
- if (log->GetTotalWritten() > initial_written) {
210928
- // remove any entries written into the WAL by truncating it
210929
- log->Truncate(initial_wal_size);
210930
- }
210931
- }
210932
- D_ASSERT(!log || !log->skip_writing);
210933
211444
  return ex.what();
210934
211445
  }
210935
211446
  }