duckdb 0.5.2-dev1040.0 → 0.5.2-dev1080.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.cpp CHANGED
@@ -6442,7 +6442,6 @@ void DependencyManager::AddOwnership(ClientContext &context, CatalogEntry *owner
6442
6442
 
6443
6443
 
6444
6444
 
6445
- #include <execinfo.h>
6446
6445
  #endif
6447
6446
 
6448
6447
  namespace duckdb {
@@ -6487,8 +6486,6 @@ struct AllocatorDebugInfo {
6487
6486
  AllocatorDebugInfo();
6488
6487
  ~AllocatorDebugInfo();
6489
6488
 
6490
- static string GetStackTrace(int max_depth = 128);
6491
-
6492
6489
  void AllocateData(data_ptr_t pointer, idx_t size);
6493
6490
  void FreeData(data_ptr_t pointer, idx_t size);
6494
6491
  void ReallocateData(data_ptr_t pointer, data_ptr_t new_pointer, idx_t old_size, idx_t new_size);
@@ -6585,7 +6582,7 @@ AllocatorDebugInfo::~AllocatorDebugInfo() {
6585
6582
  if (allocation_count != 0) {
6586
6583
  printf("Outstanding allocations found for Allocator\n");
6587
6584
  for (auto &entry : pointers) {
6588
- printf("Allocation of size %ld at address %p\n", entry.second.first, (void *)entry.first);
6585
+ printf("Allocation of size %llu at address %p\n", entry.second.first, (void *)entry.first);
6589
6586
  printf("Stack trace:\n%s\n", entry.second.second.c_str());
6590
6587
  printf("\n");
6591
6588
  }
@@ -6597,28 +6594,11 @@ AllocatorDebugInfo::~AllocatorDebugInfo() {
6597
6594
  D_ASSERT(allocation_count == 0);
6598
6595
  }
6599
6596
 
6600
- string AllocatorDebugInfo::GetStackTrace(int max_depth) {
6601
- #ifdef DUCKDB_DEBUG_ALLOCATION
6602
- string result;
6603
- auto callstack = unique_ptr<void *[]>(new void *[max_depth]);
6604
- int frames = backtrace(callstack.get(), max_depth);
6605
- char **strs = backtrace_symbols(callstack.get(), frames);
6606
- for (int i = 0; i < frames; i++) {
6607
- result += strs[i];
6608
- result += "\n";
6609
- }
6610
- free(strs);
6611
- return result;
6612
- #else
6613
- throw InternalException("GetStackTrace not supported without DUCKDB_DEBUG_ALLOCATION");
6614
- #endif
6615
- }
6616
-
6617
6597
  void AllocatorDebugInfo::AllocateData(data_ptr_t pointer, idx_t size) {
6618
6598
  allocation_count += size;
6619
6599
  #ifdef DUCKDB_DEBUG_ALLOCATION
6620
6600
  lock_guard<mutex> l(pointer_lock);
6621
- pointers[pointer] = make_pair(size, GetStackTrace());
6601
+ pointers[pointer] = make_pair(size, Exception::GetStackTrace());
6622
6602
  #endif
6623
6603
  }
6624
6604
 
@@ -8122,7 +8102,8 @@ void DuckDBAssertInternal(bool condition, const char *condition_name, const char
8122
8102
  if (condition) {
8123
8103
  return;
8124
8104
  }
8125
- throw InternalException("Assertion triggered in file \"%s\" on line %d: %s", file, linenr, condition_name);
8105
+ throw InternalException("Assertion triggered in file \"%s\" on line %d: %s%s", file, linenr, condition_name,
8106
+ Exception::GetStackTrace());
8126
8107
  }
8127
8108
 
8128
8109
  } // namespace duckdb
@@ -9671,6 +9652,9 @@ string StatementReturnTypeToString(StatementReturnType type) {
9671
9652
  #include <stdio.h>
9672
9653
  #include <stdlib.h>
9673
9654
  #endif
9655
+ #ifdef DUCKDB_DEBUG_STACKTRACE
9656
+ #include <execinfo.h>
9657
+ #endif
9674
9658
 
9675
9659
  namespace duckdb {
9676
9660
 
@@ -9699,6 +9683,24 @@ bool Exception::UncaughtException() {
9699
9683
  #endif
9700
9684
  }
9701
9685
 
9686
+ string Exception::GetStackTrace(int max_depth) {
9687
+ #ifdef DUCKDB_DEBUG_STACKTRACE
9688
+ string result;
9689
+ auto callstack = unique_ptr<void *[]>(new void *[max_depth]);
9690
+ int frames = backtrace(callstack.get(), max_depth);
9691
+ char **strs = backtrace_symbols(callstack.get(), frames);
9692
+ for (int i = 0; i < frames; i++) {
9693
+ result += strs[i];
9694
+ result += "\n";
9695
+ }
9696
+ free(strs);
9697
+ return "\n" + result;
9698
+ #else
9699
+ // Stack trace not available. Toggle DUCKDB_DEBUG_STACKTRACE in exception.cpp to enable stack traces.
9700
+ return "";
9701
+ #endif
9702
+ }
9703
+
9702
9704
  string Exception::ConstructMessageRecursive(const string &msg, vector<ExceptionFormatValue> &values) {
9703
9705
  return ExceptionFormatValue::Format(msg, values);
9704
9706
  }
@@ -39187,7 +39189,9 @@ public:
39187
39189
  bool ParallelSink() const override {
39188
39190
  return true;
39189
39191
  }
39190
-
39192
+ bool IsOrderPreserving() const override {
39193
+ return false;
39194
+ }
39191
39195
  string ParamsToString() const override;
39192
39196
 
39193
39197
  public:
@@ -39352,6 +39356,11 @@ struct aggr_ht_entry_32 {
39352
39356
 
39353
39357
  enum HtEntryType { HT_WIDTH_32, HT_WIDTH_64 };
39354
39358
 
39359
+ struct AggregateHTScanState {
39360
+ mutex lock;
39361
+ idx_t scan_position = 0;
39362
+ };
39363
+
39355
39364
  class GroupedAggregateHashTable : public BaseAggregateHashTable {
39356
39365
  public:
39357
39366
  //! The hash table load factor, when a resize is triggered
@@ -39381,7 +39390,7 @@ public:
39381
39390
  //! Scan the HT starting from the scan_position until the result and group
39382
39391
  //! chunks are filled. scan_position will be updated by this function.
39383
39392
  //! Returns the amount of elements found.
39384
- idx_t Scan(idx_t &scan_position, DataChunk &result);
39393
+ idx_t Scan(AggregateHTScanState &scan_state, DataChunk &result);
39385
39394
 
39386
39395
  //! Fetch the aggregates for specific groups from the HT and place them in the result
39387
39396
  void FetchAggregates(DataChunk &groups, DataChunk &result);
@@ -56453,28 +56462,32 @@ void GroupedAggregateHashTable::Partition(vector<GroupedAggregateHashTable *> &p
56453
56462
  D_ASSERT(total_count == entries);
56454
56463
  }
56455
56464
 
56456
- idx_t GroupedAggregateHashTable::Scan(idx_t &scan_position, DataChunk &result) {
56457
- if (scan_position >= entries) {
56458
- return 0;
56459
- }
56460
- auto remaining = entries - scan_position;
56461
- auto this_n = MinValue((idx_t)STANDARD_VECTOR_SIZE, remaining);
56462
-
56465
+ idx_t GroupedAggregateHashTable::Scan(AggregateHTScanState &scan_state, DataChunk &result) {
56466
+ idx_t this_n;
56463
56467
  Vector addresses(LogicalType::POINTER);
56464
56468
  auto data_pointers = FlatVector::GetData<data_ptr_t>(addresses);
56469
+ {
56470
+ lock_guard<mutex> l(scan_state.lock);
56471
+ if (scan_state.scan_position >= entries) {
56472
+ return 0;
56473
+ }
56474
+ auto remaining = entries - scan_state.scan_position;
56475
+ this_n = MinValue((idx_t)STANDARD_VECTOR_SIZE, remaining);
56465
56476
 
56466
- auto chunk_idx = scan_position / tuples_per_block;
56467
- auto chunk_offset = (scan_position % tuples_per_block) * tuple_size;
56468
- D_ASSERT(chunk_offset + tuple_size <= Storage::BLOCK_SIZE);
56477
+ auto chunk_idx = scan_state.scan_position / tuples_per_block;
56478
+ auto chunk_offset = (scan_state.scan_position % tuples_per_block) * tuple_size;
56479
+ D_ASSERT(chunk_offset + tuple_size <= Storage::BLOCK_SIZE);
56469
56480
 
56470
- auto read_ptr = payload_hds_ptrs[chunk_idx++];
56471
- for (idx_t i = 0; i < this_n; i++) {
56472
- data_pointers[i] = read_ptr + chunk_offset;
56473
- chunk_offset += tuple_size;
56474
- if (chunk_offset >= tuples_per_block * tuple_size) {
56475
- read_ptr = payload_hds_ptrs[chunk_idx++];
56476
- chunk_offset = 0;
56481
+ auto read_ptr = payload_hds_ptrs[chunk_idx++];
56482
+ for (idx_t i = 0; i < this_n; i++) {
56483
+ data_pointers[i] = read_ptr + chunk_offset;
56484
+ chunk_offset += tuple_size;
56485
+ if (chunk_offset >= tuples_per_block * tuple_size) {
56486
+ read_ptr = payload_hds_ptrs[chunk_idx++];
56487
+ chunk_offset = 0;
56488
+ }
56477
56489
  }
56490
+ scan_state.scan_position += this_n;
56478
56491
  }
56479
56492
 
56480
56493
  result.SetCardinality(this_n);
@@ -56487,8 +56500,6 @@ idx_t GroupedAggregateHashTable::Scan(idx_t &scan_position, DataChunk &result) {
56487
56500
  }
56488
56501
 
56489
56502
  RowOperations::FinalizeStates(layout, addresses, result, group_cols);
56490
-
56491
- scan_position += this_n;
56492
56503
  return this_n;
56493
56504
  }
56494
56505
 
@@ -63356,12 +63367,12 @@ public:
63356
63367
  }
63357
63368
 
63358
63369
  const PhysicalHashAggregate &op;
63359
- std::atomic<size_t> state_index;
63370
+ mutex lock;
63371
+ atomic<idx_t> state_index;
63360
63372
 
63361
63373
  vector<unique_ptr<GlobalSourceState>> radix_states;
63362
63374
 
63363
63375
  public:
63364
- #if 0
63365
63376
  idx_t MaxThreads() override {
63366
63377
  // If there are no tables, we only need one thread.
63367
63378
  if (op.radix_tables.empty()) {
@@ -63373,10 +63384,8 @@ public:
63373
63384
  for (size_t sidx = 0; sidx < op.radix_tables.size(); ++sidx) {
63374
63385
  count += op.radix_tables[sidx].Size(*ht_state.radix_states[sidx]);
63375
63386
  }
63376
-
63377
- return (count + STANDARD_VECTOR_SIZE - 1 ) / STANDARD_VECTOR_SIZE;
63387
+ return MaxValue<idx_t>(1, count / RowGroup::ROW_GROUP_SIZE);
63378
63388
  }
63379
- #endif
63380
63389
  };
63381
63390
 
63382
63391
  unique_ptr<GlobalSourceState> PhysicalHashAggregate::GetGlobalSourceState(ClientContext &context) const {
@@ -63404,12 +63413,24 @@ void PhysicalHashAggregate::GetData(ExecutionContext &context, DataChunk &chunk,
63404
63413
  auto &ht_state = (HashAggregateGlobalState &)*sink_state;
63405
63414
  auto &gstate = (PhysicalHashAggregateGlobalSourceState &)gstate_p;
63406
63415
  auto &lstate = (PhysicalHashAggregateLocalSourceState &)lstate_p;
63407
- for (size_t sidx = gstate.state_index; sidx < radix_tables.size(); sidx = ++gstate.state_index) {
63408
- radix_tables[sidx].GetData(context, chunk, *ht_state.radix_states[sidx], *gstate.radix_states[sidx],
63409
- *lstate.radix_states[sidx]);
63416
+ while (true) {
63417
+ idx_t radix_idx = gstate.state_index;
63418
+ if (radix_idx >= radix_tables.size()) {
63419
+ break;
63420
+ }
63421
+ radix_tables[radix_idx].GetData(context, chunk, *ht_state.radix_states[radix_idx],
63422
+ *gstate.radix_states[radix_idx], *lstate.radix_states[radix_idx]);
63410
63423
  if (chunk.size() != 0) {
63411
63424
  return;
63412
63425
  }
63426
+ // move to the next table
63427
+ lock_guard<mutex> l(gstate.lock);
63428
+ radix_idx++;
63429
+ if (radix_idx > gstate.state_index) {
63430
+ // we have not yet worked on the table
63431
+ // move the global index forwards
63432
+ gstate.state_index = radix_idx;
63433
+ }
63413
63434
  }
63414
63435
  }
63415
63436
 
@@ -63495,6 +63516,10 @@ public:
63495
63516
  return true;
63496
63517
  }
63497
63518
 
63519
+ bool IsOrderPreserving() const override {
63520
+ return false;
63521
+ }
63522
+
63498
63523
  public:
63499
63524
  //! The group types
63500
63525
  vector<LogicalType> group_types;
@@ -68921,6 +68946,10 @@ public:
68921
68946
  PhysicalOperator &op);
68922
68947
  void BuildPipelines(Executor &executor, Pipeline &current, PipelineBuildState &state) override;
68923
68948
  vector<const PhysicalOperator *> GetSources() const override;
68949
+
68950
+ bool IsOrderPreserving() const override {
68951
+ return false;
68952
+ }
68924
68953
  };
68925
68954
 
68926
68955
  } // namespace duckdb
@@ -68946,10 +68975,6 @@ public:
68946
68975
  //! Construct the remainder of a Full Outer Join based on which tuples in the RHS found no match
68947
68976
  static void ConstructFullOuterJoinResult(bool *found_match, ColumnDataCollection &input, DataChunk &result,
68948
68977
  ColumnDataScanState &scan_state);
68949
-
68950
- bool IsOrderPreserving() const override {
68951
- return false;
68952
- }
68953
68978
  };
68954
68979
 
68955
68980
  } // namespace duckdb
@@ -69680,6 +69705,10 @@ public:
69680
69705
  return true;
69681
69706
  }
69682
69707
 
69708
+ bool IsOrderPreserving() const override {
69709
+ return false;
69710
+ }
69711
+
69683
69712
  public:
69684
69713
  // Sink Interface
69685
69714
  unique_ptr<GlobalSinkState> GetGlobalSinkState(ClientContext &context) const override;
@@ -72797,6 +72826,10 @@ public:
72797
72826
  return true;
72798
72827
  }
72799
72828
 
72829
+ bool IsOrderPreserving() const override {
72830
+ return false;
72831
+ }
72832
+
72800
72833
  public:
72801
72834
  void BuildPipelines(Executor &executor, Pipeline &current, PipelineBuildState &state) override;
72802
72835
  vector<const PhysicalOperator *> GetSources() const override;
@@ -78505,22 +78538,37 @@ vector<const PhysicalOperator *> PhysicalExport::GetSources() const {
78505
78538
 
78506
78539
 
78507
78540
 
78541
+
78508
78542
  namespace duckdb {
78509
78543
 
78510
78544
  //! Physically insert a set of data into a table
78511
78545
  class PhysicalInsert : public PhysicalOperator {
78512
78546
  public:
78547
+ //! INSERT INTO
78513
78548
  PhysicalInsert(vector<LogicalType> types, TableCatalogEntry *table, vector<idx_t> column_index_map,
78514
- vector<unique_ptr<Expression>> bound_defaults, idx_t estimated_cardinality, bool return_chunk);
78549
+ vector<unique_ptr<Expression>> bound_defaults, idx_t estimated_cardinality, bool return_chunk,
78550
+ bool parallel);
78551
+ //! CREATE TABLE AS
78552
+ PhysicalInsert(LogicalOperator &op, SchemaCatalogEntry *schema, unique_ptr<BoundCreateTableInfo> info,
78553
+ idx_t estimated_cardinality, bool parallel);
78515
78554
 
78516
78555
  //! The map from insert column index to table column index
78517
78556
  vector<idx_t> column_index_map;
78518
78557
  //! The table to insert into
78519
- TableCatalogEntry *table;
78558
+ TableCatalogEntry *insert_table;
78559
+ //! The insert types
78560
+ vector<LogicalType> insert_types;
78520
78561
  //! The default expressions of the columns for which no value is provided
78521
78562
  vector<unique_ptr<Expression>> bound_defaults;
78522
78563
  //! If the returning statement is present, return the whole chunk
78523
78564
  bool return_chunk;
78565
+ //! Table schema, in case of CREATE TABLE AS
78566
+ SchemaCatalogEntry *schema;
78567
+ //! Create table info, in case of CREATE TABLE AS
78568
+ unique_ptr<BoundCreateTableInfo> info;
78569
+ //! Whether or not the INSERT can be executed in parallel
78570
+ //! This insert is not order preserving if executed in parallel
78571
+ bool parallel;
78524
78572
 
78525
78573
  public:
78526
78574
  // Source interface
@@ -78543,7 +78591,7 @@ public:
78543
78591
  }
78544
78592
 
78545
78593
  bool ParallelSink() const override {
78546
- return false;
78594
+ return parallel;
78547
78595
  }
78548
78596
  };
78549
78597
 
@@ -78557,14 +78605,32 @@ public:
78557
78605
 
78558
78606
 
78559
78607
 
78608
+
78609
+
78610
+
78611
+
78560
78612
  namespace duckdb {
78561
78613
 
78562
78614
  PhysicalInsert::PhysicalInsert(vector<LogicalType> types, TableCatalogEntry *table, vector<idx_t> column_index_map,
78563
78615
  vector<unique_ptr<Expression>> bound_defaults, idx_t estimated_cardinality,
78564
- bool return_chunk)
78616
+ bool return_chunk, bool parallel)
78565
78617
  : PhysicalOperator(PhysicalOperatorType::INSERT, move(types), estimated_cardinality),
78566
- column_index_map(std::move(column_index_map)), table(table), bound_defaults(move(bound_defaults)),
78567
- return_chunk(return_chunk) {
78618
+ column_index_map(std::move(column_index_map)), insert_table(table), insert_types(table->GetTypes()),
78619
+ bound_defaults(move(bound_defaults)), return_chunk(return_chunk), parallel(parallel) {
78620
+ }
78621
+
78622
+ PhysicalInsert::PhysicalInsert(LogicalOperator &op, SchemaCatalogEntry *schema, unique_ptr<BoundCreateTableInfo> info_p,
78623
+ idx_t estimated_cardinality, bool parallel)
78624
+ : PhysicalOperator(PhysicalOperatorType::INSERT, op.types, estimated_cardinality), insert_table(nullptr),
78625
+ return_chunk(false), schema(schema), info(move(info_p)), parallel(parallel) {
78626
+ auto &create_info = (CreateTableInfo &)*info->base;
78627
+ for (auto &col : create_info.columns) {
78628
+ if (col.Generated()) {
78629
+ continue;
78630
+ }
78631
+ insert_types.push_back(col.GetType());
78632
+ bound_defaults.push_back(make_unique<BoundConstantExpression>(Value(col.GetType())));
78633
+ }
78568
78634
  }
78569
78635
 
78570
78636
  //===--------------------------------------------------------------------===//
@@ -78577,43 +78643,58 @@ public:
78577
78643
  }
78578
78644
 
78579
78645
  mutex lock;
78646
+ TableCatalogEntry *table;
78580
78647
  idx_t insert_count;
78581
- LocalAppendState append_state;
78582
78648
  bool initialized;
78649
+ LocalAppendState append_state;
78583
78650
  ColumnDataCollection return_collection;
78584
78651
  };
78585
78652
 
78586
78653
  class InsertLocalState : public LocalSinkState {
78587
78654
  public:
78588
- InsertLocalState(Allocator &allocator, const vector<LogicalType> &types,
78655
+ InsertLocalState(ClientContext &context, const vector<LogicalType> &types,
78589
78656
  const vector<unique_ptr<Expression>> &bound_defaults)
78590
- : default_executor(allocator, bound_defaults) {
78591
- insert_chunk.Initialize(allocator, types);
78657
+ : default_executor(Allocator::Get(context), bound_defaults) {
78658
+ insert_chunk.Initialize(Allocator::Get(context), types);
78592
78659
  }
78593
78660
 
78594
78661
  DataChunk insert_chunk;
78595
78662
  ExpressionExecutor default_executor;
78663
+ TableAppendState local_append_state;
78664
+ unique_ptr<RowGroupCollection> local_collection;
78665
+ unique_ptr<OptimisticDataWriter> writer;
78596
78666
  };
78597
78667
 
78598
78668
  unique_ptr<GlobalSinkState> PhysicalInsert::GetGlobalSinkState(ClientContext &context) const {
78599
- return make_unique<InsertGlobalState>(context, GetTypes());
78669
+ auto result = make_unique<InsertGlobalState>(context, GetTypes());
78670
+ if (info) {
78671
+ // CREATE TABLE AS
78672
+ D_ASSERT(!insert_table);
78673
+ auto &catalog = Catalog::GetCatalog(context);
78674
+ result->table = (TableCatalogEntry *)catalog.CreateTable(context, schema, info.get());
78675
+ } else {
78676
+ D_ASSERT(insert_table);
78677
+ result->table = insert_table;
78678
+ }
78679
+ return move(result);
78600
78680
  }
78601
78681
 
78602
78682
  unique_ptr<LocalSinkState> PhysicalInsert::GetLocalSinkState(ExecutionContext &context) const {
78603
- return make_unique<InsertLocalState>(Allocator::Get(context.client), table->GetTypes(), bound_defaults);
78683
+ return make_unique<InsertLocalState>(context.client, insert_types, bound_defaults);
78604
78684
  }
78605
78685
 
78606
- SinkResultType PhysicalInsert::Sink(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate,
78686
+ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p,
78607
78687
  DataChunk &chunk) const {
78608
78688
  auto &gstate = (InsertGlobalState &)state;
78609
- auto &istate = (InsertLocalState &)lstate;
78689
+ auto &lstate = (InsertLocalState &)lstate_p;
78610
78690
 
78611
78691
  chunk.Flatten();
78612
- istate.default_executor.SetChunk(chunk);
78692
+ lstate.default_executor.SetChunk(chunk);
78613
78693
 
78614
- istate.insert_chunk.Reset();
78615
- istate.insert_chunk.SetCardinality(chunk);
78694
+ lstate.insert_chunk.Reset();
78695
+ lstate.insert_chunk.SetCardinality(chunk);
78616
78696
 
78697
+ auto table = gstate.table;
78617
78698
  if (!column_index_map.empty()) {
78618
78699
  // columns specified by the user, use column_index_map
78619
78700
  for (idx_t i = 0; i < table->columns.size(); i++) {
@@ -78624,48 +78705,101 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, GlobalSinkState &
78624
78705
  auto storage_idx = col.StorageOid();
78625
78706
  if (column_index_map[i] == DConstants::INVALID_INDEX) {
78626
78707
  // insert default value
78627
- istate.default_executor.ExecuteExpression(i, istate.insert_chunk.data[storage_idx]);
78708
+ lstate.default_executor.ExecuteExpression(i, lstate.insert_chunk.data[storage_idx]);
78628
78709
  } else {
78629
78710
  // get value from child chunk
78630
78711
  D_ASSERT((idx_t)column_index_map[i] < chunk.ColumnCount());
78631
- D_ASSERT(istate.insert_chunk.data[storage_idx].GetType() == chunk.data[column_index_map[i]].GetType());
78632
- istate.insert_chunk.data[storage_idx].Reference(chunk.data[column_index_map[i]]);
78712
+ D_ASSERT(lstate.insert_chunk.data[storage_idx].GetType() == chunk.data[column_index_map[i]].GetType());
78713
+ lstate.insert_chunk.data[storage_idx].Reference(chunk.data[column_index_map[i]]);
78633
78714
  }
78634
78715
  }
78635
78716
  } else {
78636
78717
  // no columns specified, just append directly
78637
- for (idx_t i = 0; i < istate.insert_chunk.ColumnCount(); i++) {
78638
- D_ASSERT(istate.insert_chunk.data[i].GetType() == chunk.data[i].GetType());
78639
- istate.insert_chunk.data[i].Reference(chunk.data[i]);
78718
+ for (idx_t i = 0; i < lstate.insert_chunk.ColumnCount(); i++) {
78719
+ D_ASSERT(lstate.insert_chunk.data[i].GetType() == chunk.data[i].GetType());
78720
+ lstate.insert_chunk.data[i].Reference(chunk.data[i]);
78640
78721
  }
78641
78722
  }
78642
78723
 
78643
- lock_guard<mutex> glock(gstate.lock);
78644
- if (!gstate.initialized) {
78645
- table->storage->InitializeLocalAppend(gstate.append_state, context.client);
78646
- gstate.initialized = true;
78647
- }
78648
- table->storage->LocalAppend(gstate.append_state, *table, context.client, istate.insert_chunk);
78724
+ if (!parallel) {
78725
+ if (!gstate.initialized) {
78726
+ table->storage->InitializeLocalAppend(gstate.append_state, context.client);
78727
+ gstate.initialized = true;
78728
+ }
78729
+ table->storage->LocalAppend(gstate.append_state, *table, context.client, lstate.insert_chunk);
78649
78730
 
78650
- if (return_chunk) {
78651
- gstate.return_collection.Append(istate.insert_chunk);
78731
+ if (return_chunk) {
78732
+ gstate.return_collection.Append(lstate.insert_chunk);
78733
+ }
78734
+ gstate.insert_count += chunk.size();
78735
+ } else {
78736
+ D_ASSERT(!return_chunk);
78737
+ // parallel append
78738
+ if (!lstate.local_collection) {
78739
+ auto &table_info = table->storage->info;
78740
+ auto &block_manager = TableIOManager::Get(*table->storage).GetBlockManagerForRowData();
78741
+ lstate.local_collection = make_unique<RowGroupCollection>(table_info, block_manager, insert_types, 0);
78742
+ lstate.local_collection->InitializeEmpty();
78743
+ lstate.local_collection->InitializeAppend(lstate.local_append_state);
78744
+ lstate.writer = make_unique<OptimisticDataWriter>(gstate.table->storage.get());
78745
+ }
78746
+ auto new_row_group = lstate.local_collection->Append(lstate.insert_chunk, lstate.local_append_state);
78747
+ if (new_row_group) {
78748
+ lstate.writer->CheckFlushToDisk(*lstate.local_collection);
78749
+ }
78652
78750
  }
78653
78751
 
78654
- gstate.insert_count += chunk.size();
78655
78752
  return SinkResultType::NEED_MORE_INPUT;
78656
78753
  }
78657
78754
 
78658
- void PhysicalInsert::Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const {
78659
- auto &state = (InsertLocalState &)lstate;
78755
+ void PhysicalInsert::Combine(ExecutionContext &context, GlobalSinkState &gstate_p, LocalSinkState &lstate_p) const {
78756
+ auto &gstate = (InsertGlobalState &)gstate_p;
78757
+ auto &lstate = (InsertLocalState &)lstate_p;
78660
78758
  auto &client_profiler = QueryProfiler::Get(context.client);
78661
- context.thread.profiler.Flush(this, &state.default_executor, "default_executor", 1);
78759
+ context.thread.profiler.Flush(this, &lstate.default_executor, "default_executor", 1);
78662
78760
  client_profiler.Flush(context.thread.profiler);
78761
+
78762
+ if (!parallel) {
78763
+ return;
78764
+ }
78765
+ if (!lstate.local_collection) {
78766
+ return;
78767
+ }
78768
+ // parallel append: finalize the append
78769
+ TransactionData tdata(0, 0);
78770
+ lstate.local_collection->FinalizeAppend(tdata, lstate.local_append_state);
78771
+
78772
+ auto append_count = lstate.local_collection->GetTotalRows();
78773
+
78774
+ if (append_count < LocalStorage::MERGE_THRESHOLD) {
78775
+ // we have few rows - append to the local storage directly
78776
+ lock_guard<mutex> lock(gstate.lock);
78777
+ gstate.insert_count += append_count;
78778
+ auto table = gstate.table;
78779
+ table->storage->InitializeLocalAppend(gstate.append_state, context.client);
78780
+ auto &transaction = Transaction::GetTransaction(context.client);
78781
+ lstate.local_collection->Scan(transaction, [&](DataChunk &insert_chunk) {
78782
+ table->storage->LocalAppend(gstate.append_state, *table, context.client, insert_chunk);
78783
+ return true;
78784
+ });
78785
+ table->storage->FinalizeLocalAppend(gstate.append_state);
78786
+ } else {
78787
+ // we have many rows - flush the row group collection to disk (if required) and merge into the transaction-local
78788
+ // state
78789
+ lstate.writer->FlushToDisk(*lstate.local_collection);
78790
+ lstate.writer->FinalFlush();
78791
+
78792
+ lock_guard<mutex> lock(gstate.lock);
78793
+ gstate.insert_count += append_count;
78794
+ gstate.table->storage->LocalMerge(context.client, *lstate.local_collection);
78795
+ }
78663
78796
  }
78664
78797
 
78665
78798
  SinkFinalizeType PhysicalInsert::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
78666
78799
  GlobalSinkState &state) const {
78667
78800
  auto &gstate = (InsertGlobalState &)state;
78668
- if (gstate.initialized) {
78801
+ if (!parallel && gstate.initialized) {
78802
+ auto table = gstate.table;
78669
78803
  table->storage->FinalizeLocalAppend(gstate.append_state);
78670
78804
  }
78671
78805
  return SinkFinalizeType::READY;
@@ -80485,142 +80619,6 @@ void PhysicalCreateTable::GetData(ExecutionContext &context, DataChunk &chunk, G
80485
80619
  state.finished = true;
80486
80620
  }
80487
80621
 
80488
- } // namespace duckdb
80489
- //===----------------------------------------------------------------------===//
80490
- // DuckDB
80491
- //
80492
- // duckdb/execution/operator/schema/physical_create_table_as.hpp
80493
- //
80494
- //
80495
- //===----------------------------------------------------------------------===//
80496
-
80497
-
80498
-
80499
-
80500
-
80501
-
80502
- namespace duckdb {
80503
-
80504
- //! Physically CREATE TABLE AS statement
80505
- class PhysicalCreateTableAs : public PhysicalOperator {
80506
- public:
80507
- PhysicalCreateTableAs(LogicalOperator &op, SchemaCatalogEntry *schema, unique_ptr<BoundCreateTableInfo> info,
80508
- idx_t estimated_cardinality);
80509
-
80510
- //! Schema to insert to
80511
- SchemaCatalogEntry *schema;
80512
- //! Table name to create
80513
- unique_ptr<BoundCreateTableInfo> info;
80514
-
80515
- public:
80516
- // Source interface
80517
- unique_ptr<GlobalSourceState> GetGlobalSourceState(ClientContext &context) const override;
80518
- void GetData(ExecutionContext &context, DataChunk &chunk, GlobalSourceState &gstate,
80519
- LocalSourceState &lstate) const override;
80520
-
80521
- public:
80522
- unique_ptr<GlobalSinkState> GetGlobalSinkState(ClientContext &context) const override;
80523
- SinkResultType Sink(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate,
80524
- DataChunk &input) const override;
80525
- SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
80526
- GlobalSinkState &gstate) const override;
80527
-
80528
- bool IsSink() const override {
80529
- return true;
80530
- }
80531
- bool ParallelSink() const override {
80532
- return true;
80533
- }
80534
- };
80535
- } // namespace duckdb
80536
-
80537
-
80538
-
80539
-
80540
-
80541
-
80542
- namespace duckdb {
80543
-
80544
- PhysicalCreateTableAs::PhysicalCreateTableAs(LogicalOperator &op, SchemaCatalogEntry *schema,
80545
- unique_ptr<BoundCreateTableInfo> info, idx_t estimated_cardinality)
80546
- : PhysicalOperator(PhysicalOperatorType::CREATE_TABLE_AS, op.types, estimated_cardinality), schema(schema),
80547
- info(move(info)) {
80548
- }
80549
-
80550
- //===--------------------------------------------------------------------===//
80551
- // Sink
80552
- //===--------------------------------------------------------------------===//
80553
- class CreateTableAsGlobalState : public GlobalSinkState {
80554
- public:
80555
- CreateTableAsGlobalState() : inserted_count(0), initialized(false) {
80556
- }
80557
-
80558
- mutex append_lock;
80559
- TableCatalogEntry *table;
80560
- int64_t inserted_count;
80561
- LocalAppendState append_state;
80562
- bool initialized;
80563
- };
80564
-
80565
- unique_ptr<GlobalSinkState> PhysicalCreateTableAs::GetGlobalSinkState(ClientContext &context) const {
80566
- auto sink = make_unique<CreateTableAsGlobalState>();
80567
- auto &catalog = Catalog::GetCatalog(context);
80568
- sink->table = (TableCatalogEntry *)catalog.CreateTable(context, schema, info.get());
80569
- return move(sink);
80570
- }
80571
-
80572
- SinkResultType PhysicalCreateTableAs::Sink(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p,
80573
- DataChunk &input) const {
80574
- auto &sink = (CreateTableAsGlobalState &)state;
80575
- D_ASSERT(sink.table);
80576
- lock_guard<mutex> client_guard(sink.append_lock);
80577
- if (!sink.initialized) {
80578
- sink.table->storage->InitializeLocalAppend(sink.append_state, context.client);
80579
- sink.initialized = true;
80580
- }
80581
- sink.table->storage->LocalAppend(sink.append_state, *sink.table, context.client, input);
80582
- sink.inserted_count += input.size();
80583
- return SinkResultType::NEED_MORE_INPUT;
80584
- }
80585
-
80586
- SinkFinalizeType PhysicalCreateTableAs::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
80587
- GlobalSinkState &state) const {
80588
- auto &gstate = (CreateTableAsGlobalState &)state;
80589
- if (gstate.initialized) {
80590
- gstate.table->storage->FinalizeLocalAppend(gstate.append_state);
80591
- }
80592
- return SinkFinalizeType::READY;
80593
- }
80594
-
80595
- //===--------------------------------------------------------------------===//
80596
- // Source
80597
- //===--------------------------------------------------------------------===//
80598
- class CreateTableAsSourceState : public GlobalSourceState {
80599
- public:
80600
- CreateTableAsSourceState() : finished(false) {
80601
- }
80602
-
80603
- bool finished;
80604
- };
80605
-
80606
- unique_ptr<GlobalSourceState> PhysicalCreateTableAs::GetGlobalSourceState(ClientContext &context) const {
80607
- return make_unique<CreateTableAsSourceState>();
80608
- }
80609
-
80610
- void PhysicalCreateTableAs::GetData(ExecutionContext &context, DataChunk &chunk, GlobalSourceState &gstate,
80611
- LocalSourceState &lstate) const {
80612
- auto &state = (CreateTableAsSourceState &)gstate;
80613
- auto &sink = (CreateTableAsGlobalState &)*sink_state;
80614
- if (state.finished) {
80615
- return;
80616
- }
80617
- if (sink.table) {
80618
- chunk.SetCardinality(1);
80619
- chunk.SetValue(0, 0, Value::BIGINT(sink.inserted_count));
80620
- }
80621
- state.finished = true;
80622
- }
80623
-
80624
80622
  } // namespace duckdb
80625
80623
  //===----------------------------------------------------------------------===//
80626
80624
  // DuckDB
@@ -82779,6 +82777,7 @@ protected:
82779
82777
  } // namespace duckdb
82780
82778
 
82781
82779
 
82780
+
82782
82781
  namespace duckdb {
82783
82782
 
82784
82783
  static void ExtractDependencies(Expression &expr, unordered_set<CatalogEntry *> &dependencies) {
@@ -82804,9 +82803,15 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCreateTabl
82804
82803
  catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info.schema, create_info.table, true);
82805
82804
  bool replace = op.info->Base().on_conflict == OnCreateConflict::REPLACE_ON_CONFLICT;
82806
82805
  if ((!existing_entry || replace) && !op.children.empty()) {
82807
- D_ASSERT(op.children.size() == 1);
82808
- auto create = make_unique<PhysicalCreateTableAs>(op, op.schema, move(op.info), op.estimated_cardinality);
82809
82806
  auto plan = CreatePlan(*op.children[0]);
82807
+
82808
+ auto &config = DBConfig::GetConfig(context);
82809
+ bool plan_preserves_order = plan->AllOperatorsPreserveOrder();
82810
+ bool parallel_streaming_insert = !config.options.preserve_insertion_order || !plan_preserves_order;
82811
+
82812
+ D_ASSERT(op.children.size() == 1);
82813
+ auto create = make_unique<PhysicalInsert>(op, op.schema, move(op.info), op.estimated_cardinality,
82814
+ parallel_streaming_insert);
82810
82815
  create->children.push_back(move(plan));
82811
82816
  return move(create);
82812
82817
  } else {
@@ -84296,6 +84301,8 @@ protected:
84296
84301
  } // namespace duckdb
84297
84302
 
84298
84303
 
84304
+
84305
+
84299
84306
  namespace duckdb {
84300
84307
 
84301
84308
  unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalInsert &op) {
@@ -84305,9 +84312,21 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalInsert &op
84305
84312
  plan = CreatePlan(*op.children[0]);
84306
84313
  }
84307
84314
 
84315
+ auto &config = DBConfig::GetConfig(context);
84316
+ bool plan_preserves_order = plan->AllOperatorsPreserveOrder();
84317
+ bool parallel_streaming_insert = !config.options.preserve_insertion_order || !plan_preserves_order;
84318
+ if (!op.table->storage->info->indexes.Empty()) {
84319
+ // not for tables with indexes currently
84320
+ parallel_streaming_insert = false;
84321
+ }
84322
+ if (op.return_chunk) {
84323
+ // not supported for RETURNING
84324
+ parallel_streaming_insert = false;
84325
+ }
84326
+
84308
84327
  dependencies.insert(op.table);
84309
84328
  auto insert = make_unique<PhysicalInsert>(op.types, op.table, op.column_index_map, move(op.bound_defaults),
84310
- op.estimated_cardinality, op.return_chunk);
84329
+ op.estimated_cardinality, op.return_chunk, parallel_streaming_insert);
84311
84330
  if (plan) {
84312
84331
  insert->children.push_back(move(plan));
84313
84332
  }
@@ -86075,14 +86094,16 @@ bool RadixPartitionedHashTable::ForceSingleHT(GlobalSinkState &state) const {
86075
86094
  class RadixHTGlobalSourceState : public GlobalSourceState {
86076
86095
  public:
86077
86096
  explicit RadixHTGlobalSourceState(Allocator &allocator, const RadixPartitionedHashTable &ht)
86078
- : ht_index(0), ht_scan_position(0), finished(false) {
86097
+ : ht_index(0), initialized(false), finished(false) {
86079
86098
  }
86080
86099
 
86081
86100
  //! Heavy handed for now.
86082
86101
  mutex lock;
86083
86102
  //! The current position to scan the HT for output tuples
86084
- idx_t ht_index;
86085
- idx_t ht_scan_position;
86103
+ atomic<idx_t> ht_index;
86104
+ //! The set of aggregate scan states
86105
+ unique_ptr<AggregateHTScanState[]> ht_scan_states;
86106
+ atomic<bool> initialized;
86086
86107
  atomic<bool> finished;
86087
86108
  };
86088
86109
 
@@ -86168,23 +86189,39 @@ void RadixPartitionedHashTable::GetData(ExecutionContext &context, DataChunk &ch
86168
86189
  idx_t elements_found = 0;
86169
86190
 
86170
86191
  lstate.scan_chunk.Reset();
86192
+ if (!state.initialized) {
86193
+ lock_guard<mutex> l(state.lock);
86194
+ if (!state.ht_scan_states) {
86195
+ state.ht_scan_states =
86196
+ unique_ptr<AggregateHTScanState[]>(new AggregateHTScanState[gstate.finalized_hts.size()]);
86197
+ } else {
86198
+ D_ASSERT(state.initialized);
86199
+ }
86200
+ state.initialized = true;
86201
+ }
86171
86202
  while (true) {
86172
- lock_guard<mutex> glock(state.lock);
86173
- if (state.ht_index == gstate.finalized_hts.size()) {
86203
+ idx_t ht_index = state.ht_index;
86204
+ if (ht_index >= gstate.finalized_hts.size()) {
86174
86205
  state.finished = true;
86175
86206
  return;
86176
86207
  }
86177
- D_ASSERT(gstate.finalized_hts[state.ht_index]);
86178
- elements_found = gstate.finalized_hts[state.ht_index]->Scan(state.ht_scan_position, lstate.scan_chunk);
86179
-
86208
+ D_ASSERT(state.ht_index < gstate.finalized_hts.size());
86209
+ D_ASSERT(state.ht_scan_states);
86210
+ auto &ht = gstate.finalized_hts[ht_index];
86211
+ auto &scan_state = state.ht_scan_states[ht_index];
86212
+ D_ASSERT(ht);
86213
+ elements_found = ht->Scan(scan_state, lstate.scan_chunk);
86180
86214
  if (elements_found > 0) {
86181
86215
  break;
86182
86216
  }
86183
- if (!gstate.multi_scan) {
86184
- gstate.finalized_hts[state.ht_index].reset();
86217
+ // move to the next hash table
86218
+ lock_guard<mutex> l(state.lock);
86219
+ ht_index++;
86220
+ if (ht_index > state.ht_index) {
86221
+ // we have not yet worked on the table
86222
+ // move the global index forwards
86223
+ state.ht_index = ht_index;
86185
86224
  }
86186
- state.ht_index++;
86187
- state.ht_scan_position = 0;
86188
86225
  }
86189
86226
 
86190
86227
  // compute the final projection list
@@ -141762,10 +141799,9 @@ static void PrintRow(std::ostream &ss, const string &annotation, int id, const s
141762
141799
 
141763
141800
  static void ExtractFunctions(std::ostream &ss, ExpressionInfo &info, int &fun_id, int depth) {
141764
141801
  if (info.hasfunction) {
141765
- D_ASSERT(info.sample_tuples_count != 0);
141766
- PrintRow(ss, "Function", fun_id++, info.function_name,
141767
- int(info.function_time) / double(info.sample_tuples_count), info.sample_tuples_count,
141768
- info.tuples_count, "", depth);
141802
+ double time = info.sample_tuples_count == 0 ? 0 : int(info.function_time) / double(info.sample_tuples_count);
141803
+ PrintRow(ss, "Function", fun_id++, info.function_name, time, info.sample_tuples_count, info.tuples_count, "",
141804
+ depth);
141769
141805
  }
141770
141806
  if (info.children.empty()) {
141771
141807
  return;
@@ -141792,10 +141828,11 @@ static void ToJSONRecursive(QueryProfiler::TreeNode &node, std::ostream &ss, int
141792
141828
  continue;
141793
141829
  }
141794
141830
  for (auto &expr_timer : expr_executor->roots) {
141795
- D_ASSERT(expr_timer->sample_tuples_count != 0);
141796
- PrintRow(ss, "ExpressionRoot", expression_counter++, expr_timer->name,
141797
- int(expr_timer->time) / double(expr_timer->sample_tuples_count), expr_timer->sample_tuples_count,
141798
- expr_timer->tuples_count, expr_timer->extra_info, depth + 1);
141831
+ double time = expr_timer->sample_tuples_count == 0
141832
+ ? 0
141833
+ : double(expr_timer->time) / double(expr_timer->sample_tuples_count);
141834
+ PrintRow(ss, "ExpressionRoot", expression_counter++, expr_timer->name, time,
141835
+ expr_timer->sample_tuples_count, expr_timer->tuples_count, expr_timer->extra_info, depth + 1);
141799
141836
  // Extract all functions inside the tree
141800
141837
  ExtractFunctions(ss, *expr_timer->root, function_counter, depth + 1);
141801
141838
  }
@@ -197829,7 +197866,7 @@ public:
197829
197866
 
197830
197867
  protected:
197831
197868
  //! Append a transient segment
197832
- void AppendTransientSegment(idx_t start_row);
197869
+ void AppendTransientSegment(SegmentLock &l, idx_t start_row);
197833
197870
 
197834
197871
  //! Scans a base vector from the column
197835
197872
  idx_t ScanVector(ColumnScanState &state, Vector &result, idx_t remaining);
@@ -197867,7 +197904,7 @@ public:
197867
197904
  RowGroup &GetRowGroup();
197868
197905
  ColumnCheckpointState &GetCheckpointState();
197869
197906
 
197870
- void Checkpoint(unique_ptr<SegmentBase> segment);
197907
+ void Checkpoint(vector<SegmentNode> nodes);
197871
197908
 
197872
197909
  private:
197873
197910
  void ScanSegments(const std::function<void(Vector &, idx_t)> &callback);
@@ -197882,7 +197919,7 @@ private:
197882
197919
  ColumnCheckpointState &state;
197883
197920
  bool is_validity;
197884
197921
  Vector intermediate;
197885
- unique_ptr<SegmentBase> owned_segment;
197922
+ vector<SegmentNode> nodes;
197886
197923
  vector<CompressionFunction *> compression_functions;
197887
197924
  ColumnCheckpointInfo &checkpoint_info;
197888
197925
  };
@@ -202468,16 +202505,16 @@ public:
202468
202505
  template <class OP = EmptyRLEWriter>
202469
202506
  void Update(T *data, ValidityMask &validity, idx_t idx) {
202470
202507
  if (validity.RowIsValid(idx)) {
202471
- all_null = false;
202472
- if (seen_count == 0) {
202508
+ if (all_null) {
202473
202509
  // no value seen yet
202474
- // assign the current value, and set the seen_count to 1
202510
+ // assign the current value, and increment the seen_count
202475
202511
  // note that we increment last_seen_count rather than setting it to 1
202476
202512
  // this is intentional: this is the first VALID value we see
202477
202513
  // but it might not be the first value in case of nulls!
202478
202514
  last_value = data[idx];
202479
- seen_count = 1;
202515
+ seen_count++;
202480
202516
  last_seen_count++;
202517
+ all_null = false;
202481
202518
  } else if (last_value == data[idx]) {
202482
202519
  // the last value is identical to this value: increment the last_seen_count
202483
202520
  last_seen_count++;
@@ -203881,33 +203918,24 @@ DataTable::DataTable(DatabaseInstance &db, shared_ptr<TableIOManager> table_io_m
203881
203918
  make_shared<RowGroupCollection>(info, TableIOManager::Get(*this).GetBlockManagerForRowData(), types, 0);
203882
203919
  if (data && !data->row_groups.empty()) {
203883
203920
  this->row_groups->Initialize(*data);
203884
- stats.Initialize(types, *data);
203885
- }
203886
- if (stats.Empty()) {
203921
+ } else {
203922
+ this->row_groups->InitializeEmpty();
203887
203923
  D_ASSERT(row_groups->GetTotalRows() == 0);
203888
-
203889
- stats.InitializeEmpty(types);
203890
203924
  }
203891
203925
  row_groups->Verify();
203892
203926
  }
203893
203927
 
203894
203928
  DataTable::DataTable(ClientContext &context, DataTable &parent, ColumnDefinition &new_column, Expression *default_value)
203895
203929
  : info(parent.info), db(parent.db), is_root(true) {
203930
+ // add the column definitions from this DataTable
203896
203931
  for (auto &column_def : parent.column_definitions) {
203897
203932
  column_definitions.emplace_back(column_def.Copy());
203898
203933
  }
203934
+ column_definitions.emplace_back(new_column.Copy());
203899
203935
  // prevent any new tuples from being added to the parent
203900
203936
  lock_guard<mutex> parent_lock(parent.append_lock);
203901
- // add the new column to this DataTable
203902
- auto new_column_type = new_column.Type();
203903
- auto new_column_idx = parent.column_definitions.size();
203904
-
203905
- stats.InitializeAddColumn(parent.stats, new_column_type);
203906
-
203907
- // add the column definitions from this DataTable
203908
- column_definitions.emplace_back(new_column.Copy());
203909
203937
 
203910
- this->row_groups = parent.row_groups->AddColumn(new_column, default_value, stats.GetStats(new_column_idx));
203938
+ this->row_groups = parent.row_groups->AddColumn(new_column, default_value);
203911
203939
 
203912
203940
  // also add this column to client local storage
203913
203941
  auto &local_storage = LocalStorage::Get(context);
@@ -203937,9 +203965,6 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t removed_co
203937
203965
  return false;
203938
203966
  });
203939
203967
 
203940
- // erase the stats from this DataTable
203941
- stats.InitializeRemoveColumn(parent.stats, removed_column);
203942
-
203943
203968
  // erase the column definitions from this DataTable
203944
203969
  D_ASSERT(removed_column < column_definitions.size());
203945
203970
  column_definitions.erase(column_definitions.begin() + removed_column);
@@ -203973,7 +203998,6 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, unique_ptr<Bound
203973
203998
  for (auto &column_def : parent.column_definitions) {
203974
203999
  column_definitions.emplace_back(column_def.Copy());
203975
204000
  }
203976
- stats.InitializeAddConstraint(parent.stats);
203977
204001
 
203978
204002
  // Verify the new constraint against current persistent/local data
203979
204003
  VerifyNewConstraint(context, parent, constraint.get());
@@ -204008,9 +204032,7 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t changed_id
204008
204032
 
204009
204033
  // set up the statistics for the table
204010
204034
  // the column that had its type changed will have the new statistics computed during conversion
204011
- stats.InitializeAlterType(parent.stats, changed_idx, target_type);
204012
- this->row_groups =
204013
- parent.row_groups->AlterType(changed_idx, target_type, bound_columns, cast_expr, stats.GetStats(changed_idx));
204035
+ this->row_groups = parent.row_groups->AlterType(changed_idx, target_type, bound_columns, cast_expr);
204014
204036
 
204015
204037
  // scan the original table, and fill the new column with the transformed value
204016
204038
  auto &local_storage = LocalStorage::Get(context);
@@ -204357,6 +204379,11 @@ void DataTable::FinalizeLocalAppend(LocalAppendState &state) {
204357
204379
  LocalStorage::FinalizeAppend(state);
204358
204380
  }
204359
204381
 
204382
+ void DataTable::LocalMerge(ClientContext &context, RowGroupCollection &collection) {
204383
+ auto &local_storage = LocalStorage::Get(context);
204384
+ local_storage.LocalMerge(this, collection);
204385
+ }
204386
+
204360
204387
  void DataTable::LocalAppend(TableCatalogEntry &table, ClientContext &context, DataChunk &chunk) {
204361
204388
  LocalAppendState append_state;
204362
204389
  table.storage->InitializeLocalAppend(append_state, context);
@@ -204392,7 +204419,7 @@ void DataTable::InitializeAppend(Transaction &transaction, TableAppendState &sta
204392
204419
 
204393
204420
  void DataTable::Append(DataChunk &chunk, TableAppendState &state) {
204394
204421
  D_ASSERT(is_root);
204395
- row_groups->Append(chunk, state, stats);
204422
+ row_groups->Append(chunk, state);
204396
204423
  }
204397
204424
 
204398
204425
  void DataTable::ScanTableSegment(idx_t row_start, idx_t count, const std::function<void(DataChunk &chunk)> &function) {
@@ -204439,9 +204466,8 @@ void DataTable::ScanTableSegment(idx_t row_start, idx_t count, const std::functi
204439
204466
  }
204440
204467
  }
204441
204468
 
204442
- void DataTable::MergeStorage(RowGroupCollection &data, TableIndexList &indexes, TableStatistics &other_stats) {
204469
+ void DataTable::MergeStorage(RowGroupCollection &data, TableIndexList &indexes) {
204443
204470
  row_groups->MergeStorage(data);
204444
- stats.MergeStats(other_stats);
204445
204471
  row_groups->Verify();
204446
204472
  }
204447
204473
 
@@ -204740,7 +204766,7 @@ void DataTable::Update(TableCatalogEntry &table, ClientContext &context, Vector
204740
204766
  // we need to figure out for each id to which row group it belongs
204741
204767
  // usually all (or many) ids belong to the same row group
204742
204768
  // we iterate over the ids and check for every id if it belongs to the same row group as their predecessor
204743
- row_groups->Update(transaction, ids, column_ids, updates, stats);
204769
+ row_groups->Update(transaction, ids, column_ids, updates);
204744
204770
  }
204745
204771
 
204746
204772
  void DataTable::UpdateColumn(TableCatalogEntry &table, ClientContext &context, Vector &row_ids,
@@ -204761,7 +204787,7 @@ void DataTable::UpdateColumn(TableCatalogEntry &table, ClientContext &context, V
204761
204787
 
204762
204788
  updates.Flatten();
204763
204789
  row_ids.Flatten(updates.size());
204764
- row_groups->UpdateColumn(transaction, row_ids, column_path, updates, stats);
204790
+ row_groups->UpdateColumn(transaction, row_ids, column_path, updates);
204765
204791
  }
204766
204792
 
204767
204793
  //===--------------------------------------------------------------------===//
@@ -204778,13 +204804,12 @@ unique_ptr<BaseStatistics> DataTable::GetStatistics(ClientContext &context, colu
204778
204804
  if (column_id == COLUMN_IDENTIFIER_ROW_ID) {
204779
204805
  return nullptr;
204780
204806
  }
204781
- return stats.CopyStats(column_id);
204807
+ return row_groups->CopyStats(column_id);
204782
204808
  }
204783
204809
 
204784
204810
  void DataTable::SetStatistics(column_t column_id, const std::function<void(BaseStatistics &)> &set_fun) {
204785
204811
  D_ASSERT(column_id != COLUMN_IDENTIFIER_ROW_ID);
204786
- auto stats_guard = stats.GetLock();
204787
- set_fun(*stats.GetStats(column_id).stats);
204812
+ row_groups->SetStatistics(column_id, set_fun);
204788
204813
  }
204789
204814
 
204790
204815
  //===--------------------------------------------------------------------===//
@@ -204795,7 +204820,7 @@ void DataTable::Checkpoint(TableDataWriter &writer) {
204795
204820
  // FIXME: we might want to combine adjacent row groups in case they have had deletions...
204796
204821
  vector<unique_ptr<BaseStatistics>> global_stats;
204797
204822
  for (idx_t i = 0; i < column_definitions.size(); i++) {
204798
- global_stats.push_back(stats.CopyStats(i));
204823
+ global_stats.push_back(row_groups->CopyStats(i));
204799
204824
  }
204800
204825
 
204801
204826
  row_groups->Checkpoint(writer, global_stats);
@@ -204933,16 +204958,96 @@ BlockPointer Index::Serialize(duckdb::MetaBlockWriter &writer) {
204933
204958
 
204934
204959
  namespace duckdb {
204935
204960
 
204961
+ //===--------------------------------------------------------------------===//
204962
+ // OptimisticDataWriter
204963
+ //===--------------------------------------------------------------------===//
204964
+ OptimisticDataWriter::OptimisticDataWriter(DataTable *table) : table(table) {
204965
+ }
204966
+
204967
+ OptimisticDataWriter::OptimisticDataWriter(DataTable *table, OptimisticDataWriter &parent)
204968
+ : table(table), partial_manager(move(parent.partial_manager)), written_blocks(move(parent.written_blocks)) {
204969
+ if (partial_manager) {
204970
+ partial_manager->FlushPartialBlocks();
204971
+ }
204972
+ }
204973
+
204974
+ OptimisticDataWriter::~OptimisticDataWriter() {
204975
+ }
204976
+
204977
+ void OptimisticDataWriter::CheckFlushToDisk(RowGroupCollection &row_groups) {
204978
+ // we finished writing a complete row group
204979
+ // check if we should pre-emptively write it to disk
204980
+ if (table->info->IsTemporary() || StorageManager::GetStorageManager(table->db).InMemory()) {
204981
+ return;
204982
+ }
204983
+ // we should! write the second-to-last row group to disk
204984
+ // allocate the partial block-manager if none is allocated yet
204985
+ if (!partial_manager) {
204986
+ auto &block_manager = table->info->table_io_manager->GetBlockManagerForRowData();
204987
+ partial_manager = make_unique<PartialBlockManager>(block_manager);
204988
+ }
204989
+ // flush second-to-last row group
204990
+ auto row_group = row_groups.GetRowGroup(-2);
204991
+ FlushToDisk(row_group);
204992
+ }
204993
+
204994
+ void OptimisticDataWriter::FlushToDisk(RowGroup *row_group) {
204995
+ // flush the specified row group
204996
+ D_ASSERT(row_group);
204997
+ //! The set of column compression types (if any)
204998
+ vector<CompressionType> compression_types;
204999
+ D_ASSERT(compression_types.empty());
205000
+ for (auto &column : table->column_definitions) {
205001
+ compression_types.push_back(column.CompressionType());
205002
+ }
205003
+ auto row_group_pointer = row_group->WriteToDisk(*partial_manager, compression_types);
205004
+
205005
+ // update the set of written blocks
205006
+ for (idx_t col_idx = 0; col_idx < row_group_pointer.statistics.size(); col_idx++) {
205007
+ row_group_pointer.states[col_idx]->GetBlockIds(written_blocks);
205008
+ }
205009
+ }
205010
+
205011
+ void OptimisticDataWriter::FlushToDisk(RowGroupCollection &row_groups) {
205012
+ if (!partial_manager) {
205013
+ // no partial manager - nothing to flush
205014
+ return;
205015
+ }
205016
+ // flush the last row group
205017
+ FlushToDisk(row_groups.GetRowGroup(-1));
205018
+ }
205019
+
205020
+ void OptimisticDataWriter::FinalFlush() {
205021
+ if (!partial_manager) {
205022
+ return;
205023
+ }
205024
+ // then flush the partial manager
205025
+ partial_manager->FlushPartialBlocks();
205026
+ partial_manager.reset();
205027
+ }
205028
+
205029
+ void OptimisticDataWriter::Rollback() {
205030
+ if (partial_manager) {
205031
+ partial_manager->Clear();
205032
+ partial_manager.reset();
205033
+ }
205034
+ if (!written_blocks.empty()) {
205035
+ auto &block_manager = table->info->table_io_manager->GetBlockManagerForRowData();
205036
+ for (auto block_id : written_blocks) {
205037
+ block_manager.MarkBlockAsModified(block_id);
205038
+ }
205039
+ }
205040
+ }
205041
+
204936
205042
  //===--------------------------------------------------------------------===//
204937
205043
  // Local Table Storage
204938
205044
  //===--------------------------------------------------------------------===//
204939
205045
  LocalTableStorage::LocalTableStorage(DataTable &table)
204940
- : table(&table), allocator(Allocator::Get(table.db)), deleted_rows(0) {
205046
+ : table(&table), allocator(Allocator::Get(table.db)), deleted_rows(0), optimistic_writer(&table) {
204941
205047
  auto types = table.GetTypes();
204942
205048
  row_groups = make_shared<RowGroupCollection>(table.info, TableIOManager::Get(table).GetBlockManagerForRowData(),
204943
205049
  types, MAX_ROW_ID, 0);
204944
-
204945
- stats.InitializeEmpty(types);
205050
+ row_groups->InitializeEmpty();
204946
205051
  table.info->indexes.Scan([&](Index &index) {
204947
205052
  D_ASSERT(index.type == IndexType::ART);
204948
205053
  auto &art = (ART &)index;
@@ -204963,24 +205068,15 @@ LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &paren
204963
205068
  const LogicalType &target_type, const vector<column_t> &bound_columns,
204964
205069
  Expression &cast_expr)
204965
205070
  : table(&new_dt), allocator(Allocator::Get(table->db)), deleted_rows(parent.deleted_rows),
204966
- partial_manager(move(parent.partial_manager)), written_blocks(move(parent.written_blocks)) {
204967
- if (partial_manager) {
204968
- partial_manager->FlushPartialBlocks();
204969
- }
204970
- stats.InitializeAlterType(parent.stats, changed_idx, target_type);
204971
- row_groups =
204972
- parent.row_groups->AlterType(changed_idx, target_type, bound_columns, cast_expr, stats.GetStats(changed_idx));
205071
+ optimistic_writer(table, parent.optimistic_writer) {
205072
+ row_groups = parent.row_groups->AlterType(changed_idx, target_type, bound_columns, cast_expr);
204973
205073
  parent.row_groups.reset();
204974
205074
  indexes.Move(parent.indexes);
204975
205075
  }
204976
205076
 
204977
205077
  LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &parent, idx_t drop_idx)
204978
205078
  : table(&new_dt), allocator(Allocator::Get(table->db)), deleted_rows(parent.deleted_rows),
204979
- partial_manager(move(parent.partial_manager)), written_blocks(move(parent.written_blocks)) {
204980
- if (partial_manager) {
204981
- partial_manager->FlushPartialBlocks();
204982
- }
204983
- stats.InitializeRemoveColumn(parent.stats, drop_idx);
205079
+ optimistic_writer(table, parent.optimistic_writer) {
204984
205080
  row_groups = parent.row_groups->RemoveColumn(drop_idx);
204985
205081
  parent.row_groups.reset();
204986
205082
  indexes.Move(parent.indexes);
@@ -204989,10 +205085,8 @@ LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &paren
204989
205085
  LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &parent, ColumnDefinition &new_column,
204990
205086
  Expression *default_value)
204991
205087
  : table(&new_dt), allocator(Allocator::Get(table->db)), deleted_rows(parent.deleted_rows),
204992
- partial_manager(move(parent.partial_manager)), written_blocks(move(parent.written_blocks)) {
204993
- idx_t new_column_idx = parent.table->column_definitions.size();
204994
- stats.InitializeAddColumn(parent.stats, new_column.GetType());
204995
- row_groups = parent.row_groups->AddColumn(new_column, default_value, stats.GetStats(new_column_idx));
205088
+ optimistic_writer(table, parent.optimistic_writer) {
205089
+ row_groups = parent.row_groups->AddColumn(new_column, default_value);
204996
205090
  parent.row_groups.reset();
204997
205091
  indexes.Move(parent.indexes);
204998
205092
  }
@@ -205021,6 +205115,143 @@ idx_t LocalTableStorage::EstimatedSize() {
205021
205115
  return appended_rows * row_size;
205022
205116
  }
205023
205117
 
205118
+ void LocalTableStorage::CheckFlushToDisk() {
205119
+ if (deleted_rows != 0) {
205120
+ // we have deletes - we cannot merge row groups
205121
+ return;
205122
+ }
205123
+ optimistic_writer.CheckFlushToDisk(*row_groups);
205124
+ }
205125
+
205126
+ void LocalTableStorage::FlushToDisk() {
205127
+ optimistic_writer.FlushToDisk(*row_groups);
205128
+ optimistic_writer.FinalFlush();
205129
+ }
205130
+
205131
+ void LocalTableStorage::AppendToIndexes(Transaction &transaction, TableAppendState &append_state, idx_t append_count,
205132
+ bool append_to_table) {
205133
+ bool constraint_violated = false;
205134
+ if (append_to_table) {
205135
+ table->InitializeAppend(transaction, append_state, append_count);
205136
+ }
205137
+ if (append_to_table) {
205138
+ // appending: need to scan entire
205139
+ row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
205140
+ // append this chunk to the indexes of the table
205141
+ if (!table->AppendToIndexes(chunk, append_state.current_row)) {
205142
+ constraint_violated = true;
205143
+ return false;
205144
+ }
205145
+ // append to base table
205146
+ table->Append(chunk, append_state);
205147
+ return true;
205148
+ });
205149
+ } else {
205150
+ // only need to scan for index append
205151
+ // figure out which columns we need to scan for the set of indexes
205152
+ auto columns = table->info->indexes.GetRequiredColumns();
205153
+ // create an empty mock chunk that contains all the correct types for the table
205154
+ DataChunk mock_chunk;
205155
+ mock_chunk.InitializeEmpty(table->GetTypes());
205156
+ row_groups->Scan(transaction, columns, [&](DataChunk &chunk) -> bool {
205157
+ // construct the mock chunk by referencing the required columns
205158
+ for (idx_t i = 0; i < columns.size(); i++) {
205159
+ mock_chunk.data[columns[i]].Reference(chunk.data[i]);
205160
+ }
205161
+ mock_chunk.SetCardinality(chunk);
205162
+ // append this chunk to the indexes of the table
205163
+ if (!table->AppendToIndexes(mock_chunk, append_state.current_row)) {
205164
+ constraint_violated = true;
205165
+ return false;
205166
+ }
205167
+ append_state.current_row += chunk.size();
205168
+ return true;
205169
+ });
205170
+ }
205171
+ if (constraint_violated) {
205172
+ // need to revert the append
205173
+ row_t current_row = append_state.row_start;
205174
+ // remove the data from the indexes, if there are any indexes
205175
+ row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
205176
+ // append this chunk to the indexes of the table
205177
+ table->RemoveFromIndexes(append_state, chunk, current_row);
205178
+
205179
+ current_row += chunk.size();
205180
+ if (current_row >= append_state.current_row) {
205181
+ // finished deleting all rows from the index: abort now
205182
+ return false;
205183
+ }
205184
+ return true;
205185
+ });
205186
+ if (append_to_table) {
205187
+ table->RevertAppendInternal(append_state.row_start, append_count);
205188
+ }
205189
+ throw ConstraintException("PRIMARY KEY or UNIQUE constraint violated: duplicated key");
205190
+ }
205191
+ }
205192
+
205193
+ void LocalTableStorage::Rollback() {
205194
+ optimistic_writer.Rollback();
205195
+ }
205196
+
205197
+ //===--------------------------------------------------------------------===//
205198
+ // LocalTableManager
205199
+ //===--------------------------------------------------------------------===//
205200
+ LocalTableStorage *LocalTableManager::GetStorage(DataTable *table) {
205201
+ lock_guard<mutex> l(table_storage_lock);
205202
+ auto entry = table_storage.find(table);
205203
+ return entry == table_storage.end() ? nullptr : entry->second.get();
205204
+ }
205205
+
205206
+ LocalTableStorage *LocalTableManager::GetOrCreateStorage(DataTable *table) {
205207
+ lock_guard<mutex> l(table_storage_lock);
205208
+ auto entry = table_storage.find(table);
205209
+ if (entry == table_storage.end()) {
205210
+ auto new_storage = make_shared<LocalTableStorage>(*table);
205211
+ auto storage = new_storage.get();
205212
+ table_storage.insert(make_pair(table, move(new_storage)));
205213
+ return storage;
205214
+ } else {
205215
+ return entry->second.get();
205216
+ }
205217
+ }
205218
+
205219
+ bool LocalTableManager::IsEmpty() {
205220
+ lock_guard<mutex> l(table_storage_lock);
205221
+ return table_storage.empty();
205222
+ }
205223
+
205224
+ shared_ptr<LocalTableStorage> LocalTableManager::MoveEntry(DataTable *table) {
205225
+ lock_guard<mutex> l(table_storage_lock);
205226
+ auto entry = table_storage.find(table);
205227
+ if (entry == table_storage.end()) {
205228
+ return nullptr;
205229
+ }
205230
+ auto storage_entry = move(entry->second);
205231
+ table_storage.erase(table);
205232
+ return storage_entry;
205233
+ }
205234
+
205235
+ unordered_map<DataTable *, shared_ptr<LocalTableStorage>> LocalTableManager::MoveEntries() {
205236
+ lock_guard<mutex> l(table_storage_lock);
205237
+ return move(table_storage);
205238
+ }
205239
+
205240
+ idx_t LocalTableManager::EstimatedSize() {
205241
+ lock_guard<mutex> l(table_storage_lock);
205242
+ idx_t estimated_size = 0;
205243
+ for (auto &storage : table_storage) {
205244
+ estimated_size += storage.second->EstimatedSize();
205245
+ }
205246
+ return estimated_size;
205247
+ }
205248
+
205249
+ void LocalTableManager::InsertEntry(DataTable *table, shared_ptr<LocalTableStorage> entry) {
205250
+ lock_guard<mutex> l(table_storage_lock);
205251
+ D_ASSERT(table_storage.find(table) == table_storage.end());
205252
+ table_storage[table] = move(entry);
205253
+ }
205254
+
205024
205255
  //===--------------------------------------------------------------------===//
205025
205256
  // LocalStorage
205026
205257
  //===--------------------------------------------------------------------===//
@@ -205036,11 +205267,10 @@ LocalStorage &LocalStorage::Get(ClientContext &context) {
205036
205267
  }
205037
205268
 
205038
205269
  void LocalStorage::InitializeScan(DataTable *table, CollectionScanState &state, TableFilterSet *table_filters) {
205039
- auto entry = table_storage.find(table);
205040
- if (entry == table_storage.end()) {
205270
+ auto storage = table_manager.GetStorage(table);
205271
+ if (storage == nullptr) {
205041
205272
  return;
205042
205273
  }
205043
- auto storage = entry->second.get();
205044
205274
  storage->InitializeScan(state, table_filters);
205045
205275
  }
205046
205276
 
@@ -205049,7 +205279,7 @@ void LocalStorage::Scan(CollectionScanState &state, const vector<column_t> &colu
205049
205279
  }
205050
205280
 
205051
205281
  void LocalStorage::InitializeParallelScan(DataTable *table, ParallelCollectionScanState &state) {
205052
- auto storage = GetStorage(table);
205282
+ auto storage = table_manager.GetStorage(table);
205053
205283
  if (!storage) {
205054
205284
  state.max_row = 0;
205055
205285
  state.vector_index = 0;
@@ -205061,7 +205291,7 @@ void LocalStorage::InitializeParallelScan(DataTable *table, ParallelCollectionSc
205061
205291
 
205062
205292
  bool LocalStorage::NextParallelScan(ClientContext &context, DataTable *table, ParallelCollectionScanState &state,
205063
205293
  CollectionScanState &scan_state) {
205064
- auto storage = GetStorage(table);
205294
+ auto storage = table_manager.GetStorage(table);
205065
205295
  if (!storage) {
205066
205296
  return false;
205067
205297
  }
@@ -205069,15 +205299,8 @@ bool LocalStorage::NextParallelScan(ClientContext &context, DataTable *table, Pa
205069
205299
  }
205070
205300
 
205071
205301
  void LocalStorage::InitializeAppend(LocalAppendState &state, DataTable *table) {
205072
- auto entry = table_storage.find(table);
205073
- if (entry == table_storage.end()) {
205074
- auto new_storage = make_shared<LocalTableStorage>(*table);
205075
- state.storage = new_storage.get();
205076
- table_storage.insert(make_pair(table, move(new_storage)));
205077
- } else {
205078
- state.storage = entry->second.get();
205079
- }
205080
- state.storage->row_groups->InitializeAppend(state.append_state);
205302
+ state.storage = table_manager.GetOrCreateStorage(table);
205303
+ state.storage->row_groups->InitializeAppend(TransactionData(transaction), state.append_state, 0);
205081
205304
  }
205082
205305
 
205083
205306
  void LocalStorage::Append(LocalAppendState &state, DataChunk &chunk) {
@@ -205089,7 +205312,7 @@ void LocalStorage::Append(LocalAppendState &state, DataChunk &chunk) {
205089
205312
  }
205090
205313
 
205091
205314
  //! Append the chunk to the local storage
205092
- auto new_row_group = storage->row_groups->Append(chunk, state.append_state, storage->stats);
205315
+ auto new_row_group = storage->row_groups->Append(chunk, state.append_state);
205093
205316
 
205094
205317
  //! Check if we should pre-emptively flush blocks to disk
205095
205318
  if (new_row_group) {
@@ -205097,80 +205320,29 @@ void LocalStorage::Append(LocalAppendState &state, DataChunk &chunk) {
205097
205320
  }
205098
205321
  }
205099
205322
 
205100
- void LocalTableStorage::CheckFlushToDisk() {
205101
- // we finished writing a complete row group
205102
- // check if we should pre-emptively write it to disk
205103
- if (table->info->IsTemporary() || StorageManager::GetStorageManager(table->db).InMemory()) {
205104
- return;
205105
- }
205106
- if (deleted_rows != 0) {
205107
- // we have deletes - we cannot merge
205108
- return;
205109
- }
205110
- // we should! write the second-to-last row group to disk
205111
- // allocate the partial block-manager if none is allocated yet
205112
- if (!partial_manager) {
205113
- auto &block_manager = table->info->table_io_manager->GetBlockManagerForRowData();
205114
- partial_manager = make_unique<PartialBlockManager>(block_manager);
205115
- }
205116
- // flush second-to-last row group
205117
- auto row_group = row_groups->GetRowGroup(-2);
205118
- FlushToDisk(row_group);
205323
+ void LocalStorage::FinalizeAppend(LocalAppendState &state) {
205324
+ state.storage->row_groups->FinalizeAppend(state.append_state.transaction, state.append_state);
205119
205325
  }
205120
205326
 
205121
- void LocalTableStorage::FlushToDisk(RowGroup *row_group) {
205122
- // flush the specified row group
205123
- D_ASSERT(row_group);
205124
- D_ASSERT(deleted_rows == 0);
205125
- D_ASSERT(partial_manager);
205126
- //! The set of column compression types (if any)
205127
- vector<CompressionType> compression_types;
205128
- D_ASSERT(compression_types.empty());
205129
- for (auto &column : table->column_definitions) {
205130
- compression_types.push_back(column.CompressionType());
205131
- }
205132
- auto row_group_pointer = row_group->WriteToDisk(*partial_manager, compression_types);
205133
- for (idx_t col_idx = 0; col_idx < row_group_pointer.statistics.size(); col_idx++) {
205134
- row_group_pointer.states[col_idx]->GetBlockIds(written_blocks);
205135
- stats.MergeStats(col_idx, *row_group_pointer.statistics[col_idx]);
205136
- }
205137
- }
205138
- void LocalTableStorage::FlushToDisk() {
205139
- // no partial manager - nothing to flush
205140
- if (!partial_manager) {
205141
- return;
205142
- }
205143
- // flush the last row group
205144
- FlushToDisk(row_groups->GetRowGroup(-1));
205145
- // then flush the partial manager
205146
- partial_manager->FlushPartialBlocks();
205147
- partial_manager.reset();
205327
+ void LocalStorage::LocalMerge(DataTable *table, RowGroupCollection &collection) {
205328
+ auto storage = table_manager.GetOrCreateStorage(table);
205329
+ storage->row_groups->MergeStorage(collection);
205148
205330
  }
205149
205331
 
205150
- void LocalStorage::FinalizeAppend(LocalAppendState &state) {
205151
- TransactionData transaction_data(0, 0);
205152
- state.storage->row_groups->FinalizeAppend(transaction_data, state.append_state);
205332
+ bool LocalStorage::ChangesMade() noexcept {
205333
+ return !table_manager.IsEmpty();
205153
205334
  }
205154
205335
 
205155
- LocalTableStorage *LocalStorage::GetStorage(DataTable *table) {
205156
- auto entry = table_storage.find(table);
205157
- return entry == table_storage.end() ? nullptr : entry->second.get();
205336
+ bool LocalStorage::Find(DataTable *table) {
205337
+ return table_manager.GetStorage(table) != nullptr;
205158
205338
  }
205159
205339
 
205160
205340
  idx_t LocalStorage::EstimatedSize() {
205161
- idx_t estimated_size = 0;
205162
- for (auto &storage : table_storage) {
205163
- estimated_size += storage.second->EstimatedSize();
205164
- }
205165
- return estimated_size;
205166
- }
205167
-
205168
- bool LocalTableStorage::HasWrittenBlocks() {
205169
- return partial_manager || !written_blocks.empty();
205341
+ return table_manager.EstimatedSize();
205170
205342
  }
205171
205343
 
205172
205344
  idx_t LocalStorage::Delete(DataTable *table, Vector &row_ids, idx_t count) {
205173
- auto storage = GetStorage(table);
205345
+ auto storage = table_manager.GetStorage(table);
205174
205346
  D_ASSERT(storage);
205175
205347
 
205176
205348
  // delete from unique indices (if any)
@@ -205185,119 +205357,14 @@ idx_t LocalStorage::Delete(DataTable *table, Vector &row_ids, idx_t count) {
205185
205357
  }
205186
205358
 
205187
205359
  void LocalStorage::Update(DataTable *table, Vector &row_ids, const vector<column_t> &column_ids, DataChunk &updates) {
205188
- auto storage = GetStorage(table);
205360
+ auto storage = table_manager.GetStorage(table);
205189
205361
  D_ASSERT(storage);
205190
205362
 
205191
205363
  auto ids = FlatVector::GetData<row_t>(row_ids);
205192
- storage->row_groups->Update(TransactionData(0, 0), ids, column_ids, updates, storage->stats);
205193
- }
205194
-
205195
- template <class T>
205196
- bool LocalTableStorage::ScanTableStorage(Transaction &transaction, const vector<column_t> &column_ids, T &&fun) {
205197
- auto all_types = table->GetTypes();
205198
- vector<LogicalType> scan_types;
205199
- for (idx_t i = 0; i < column_ids.size(); i++) {
205200
- scan_types.push_back(all_types[column_ids[i]]);
205201
- }
205202
- DataChunk chunk;
205203
- chunk.Initialize(allocator, scan_types);
205204
-
205205
- // initialize the scan
205206
- TableScanState state;
205207
- state.Initialize(column_ids, nullptr);
205208
- InitializeScan(state.local_state, nullptr);
205209
-
205210
- while (true) {
205211
- chunk.Reset();
205212
- state.local_state.Scan(transaction, chunk);
205213
- if (chunk.size() == 0) {
205214
- return true;
205215
- }
205216
- if (!fun(chunk)) {
205217
- return false;
205218
- }
205219
- }
205220
- }
205221
-
205222
- template <class T>
205223
- bool LocalTableStorage::ScanTableStorage(Transaction &transaction, T &&fun) {
205224
- vector<column_t> column_ids;
205225
- column_ids.reserve(table->column_definitions.size());
205226
- for (idx_t i = 0; i < table->column_definitions.size(); i++) {
205227
- column_ids.push_back(i);
205228
- }
205229
- return ScanTableStorage(transaction, column_ids, fun);
205230
- }
205231
-
205232
- void LocalTableStorage::AppendToIndexes(Transaction &transaction, TableAppendState &append_state, idx_t append_count,
205233
- bool append_to_table) {
205234
- bool constraint_violated = false;
205235
- if (append_to_table) {
205236
- table->InitializeAppend(transaction, append_state, append_count);
205237
- }
205238
- if (append_to_table) {
205239
- // appending: need to scan entire
205240
- ScanTableStorage(transaction, [&](DataChunk &chunk) -> bool {
205241
- // append this chunk to the indexes of the table
205242
- if (!table->AppendToIndexes(chunk, append_state.current_row)) {
205243
- constraint_violated = true;
205244
- return false;
205245
- }
205246
- // append to base table
205247
- table->Append(chunk, append_state);
205248
- return true;
205249
- });
205250
- } else {
205251
- // only need to scan for index append
205252
- // figure out which columns we need to scan for the set of indexes
205253
- auto columns = table->info->indexes.GetRequiredColumns();
205254
- // create an empty mock chunk that contains all the correct types for the table
205255
- DataChunk mock_chunk;
205256
- mock_chunk.InitializeEmpty(table->GetTypes());
205257
- ScanTableStorage(transaction, columns, [&](DataChunk &chunk) -> bool {
205258
- // construct the mock chunk by referencing the required columns
205259
- for (idx_t i = 0; i < columns.size(); i++) {
205260
- mock_chunk.data[columns[i]].Reference(chunk.data[i]);
205261
- }
205262
- mock_chunk.SetCardinality(chunk);
205263
- // append this chunk to the indexes of the table
205264
- if (!table->AppendToIndexes(mock_chunk, append_state.current_row)) {
205265
- constraint_violated = true;
205266
- return false;
205267
- }
205268
- append_state.current_row += chunk.size();
205269
- return true;
205270
- });
205271
- }
205272
- if (constraint_violated) {
205273
- // need to revert the append
205274
- row_t current_row = append_state.row_start;
205275
- // remove the data from the indexes, if there are any indexes
205276
- ScanTableStorage(transaction, [&](DataChunk &chunk) -> bool {
205277
- // append this chunk to the indexes of the table
205278
- table->RemoveFromIndexes(append_state, chunk, current_row);
205279
-
205280
- current_row += chunk.size();
205281
- if (current_row >= append_state.current_row) {
205282
- // finished deleting all rows from the index: abort now
205283
- return false;
205284
- }
205285
- return true;
205286
- });
205287
- if (append_to_table) {
205288
- table->RevertAppendInternal(append_state.row_start, append_count);
205289
- }
205290
- throw ConstraintException("PRIMARY KEY or UNIQUE constraint violated: duplicated key");
205291
- }
205364
+ storage->row_groups->Update(TransactionData(0, 0), ids, column_ids, updates);
205292
205365
  }
205293
205366
 
205294
205367
  void LocalStorage::Flush(DataTable &table, LocalTableStorage &storage) {
205295
- // bulk append threshold: a full row group
205296
- static constexpr const idx_t MERGE_THRESHOLD = RowGroup::ROW_GROUP_SIZE;
205297
-
205298
- auto storage_entry = move(table_storage[&table]);
205299
- table_storage[&table].reset();
205300
-
205301
205368
  if (storage.row_groups->GetTotalRows() <= storage.deleted_rows) {
205302
205369
  return;
205303
205370
  }
@@ -205317,13 +205384,12 @@ void LocalStorage::Flush(DataTable &table, LocalTableStorage &storage) {
205317
205384
  storage.AppendToIndexes(transaction, append_state, append_count, false);
205318
205385
  }
205319
205386
  // finally move over the row groups
205320
- table.MergeStorage(*storage.row_groups, storage.indexes, storage.stats);
205387
+ table.MergeStorage(*storage.row_groups, storage.indexes);
205321
205388
  } else {
205322
- if (storage.partial_manager || !storage.written_blocks.empty()) {
205323
- // we have written data but cannot merge to disk after all
205324
- // revert the data we have already written
205325
- storage.Rollback();
205326
- }
205389
+ // check if we have written data
205390
+ // if we have, we cannot merge to disk after all
205391
+ // so we need to revert the data we have already written
205392
+ storage.Rollback();
205327
205393
  // append to the indexes and append to the base table
205328
205394
  storage.AppendToIndexes(transaction, append_state, append_count, true);
205329
205395
  }
@@ -205331,102 +205397,91 @@ void LocalStorage::Flush(DataTable &table, LocalTableStorage &storage) {
205331
205397
  }
205332
205398
 
205333
205399
  void LocalStorage::Commit(LocalStorage::CommitState &commit_state, Transaction &transaction) {
205334
- // commit local storage, iterate over all entries in the table storage map
205400
+ // commit local storage
205401
+ // iterate over all entries in the table storage map and commit them
205402
+ // after this, the local storage is no longer required and can be cleared
205403
+ auto table_storage = table_manager.MoveEntries();
205335
205404
  for (auto &entry : table_storage) {
205336
205405
  auto table = entry.first;
205337
205406
  auto storage = entry.second.get();
205338
205407
  Flush(*table, *storage);
205408
+
205409
+ entry.second.reset();
205339
205410
  }
205340
- // finished commit: clear local storage
205341
- table_storage.clear();
205342
205411
  }
205343
205412
 
205344
205413
  void LocalStorage::Rollback() {
205414
+ // rollback local storage
205415
+ // after this, the local storage is no longer required and can be cleared
205416
+ auto table_storage = table_manager.MoveEntries();
205345
205417
  for (auto &entry : table_storage) {
205346
205418
  auto storage = entry.second.get();
205347
205419
  if (!storage) {
205348
205420
  continue;
205349
205421
  }
205350
205422
  storage->Rollback();
205351
- }
205352
- }
205353
205423
 
205354
- void LocalTableStorage::Rollback() {
205355
- if (partial_manager) {
205356
- partial_manager->Clear();
205357
- partial_manager.reset();
205358
- }
205359
- auto &block_manager = table->info->table_io_manager->GetBlockManagerForRowData();
205360
- for (auto block_id : written_blocks) {
205361
- block_manager.MarkBlockAsModified(block_id);
205424
+ entry.second.reset();
205362
205425
  }
205363
205426
  }
205364
205427
 
205365
205428
  idx_t LocalStorage::AddedRows(DataTable *table) {
205366
- auto entry = table_storage.find(table);
205367
- if (entry == table_storage.end()) {
205429
+ auto storage = table_manager.GetStorage(table);
205430
+ if (!storage) {
205368
205431
  return 0;
205369
205432
  }
205370
- return entry->second->row_groups->GetTotalRows() - entry->second->deleted_rows;
205433
+ return storage->row_groups->GetTotalRows() - storage->deleted_rows;
205371
205434
  }
205372
205435
 
205373
205436
  void LocalStorage::MoveStorage(DataTable *old_dt, DataTable *new_dt) {
205374
205437
  // check if there are any pending appends for the old version of the table
205375
- auto entry = table_storage.find(old_dt);
205376
- if (entry == table_storage.end()) {
205438
+ auto new_storage = table_manager.MoveEntry(old_dt);
205439
+ if (!new_storage) {
205377
205440
  return;
205378
205441
  }
205379
205442
  // take over the storage from the old entry
205380
- auto new_storage = move(entry->second);
205381
205443
  new_storage->table = new_dt;
205382
- table_storage.erase(entry);
205383
- table_storage[new_dt] = move(new_storage);
205444
+ table_manager.InsertEntry(new_dt, move(new_storage));
205384
205445
  }
205385
205446
 
205386
205447
  void LocalStorage::AddColumn(DataTable *old_dt, DataTable *new_dt, ColumnDefinition &new_column,
205387
205448
  Expression *default_value) {
205388
205449
  // check if there are any pending appends for the old version of the table
205389
- auto entry = table_storage.find(old_dt);
205390
- if (entry == table_storage.end()) {
205450
+ auto storage = table_manager.MoveEntry(old_dt);
205451
+ if (!storage) {
205391
205452
  return;
205392
205453
  }
205393
- auto storage = move(entry->second);
205394
205454
  auto new_storage = make_unique<LocalTableStorage>(*new_dt, *storage, new_column, default_value);
205395
-
205396
- table_storage[new_dt] = move(new_storage);
205397
- table_storage.erase(old_dt);
205455
+ table_manager.InsertEntry(new_dt, move(new_storage));
205398
205456
  }
205399
205457
 
205400
205458
  void LocalStorage::DropColumn(DataTable *old_dt, DataTable *new_dt, idx_t removed_column) {
205401
205459
  // check if there are any pending appends for the old version of the table
205402
- auto entry = table_storage.find(old_dt);
205403
- if (entry == table_storage.end()) {
205460
+ auto storage = table_manager.MoveEntry(old_dt);
205461
+ if (!storage) {
205404
205462
  return;
205405
205463
  }
205406
- auto storage = move(entry->second);
205407
205464
  auto new_storage = make_unique<LocalTableStorage>(*new_dt, *storage, removed_column);
205408
-
205409
- table_storage[new_dt] = move(new_storage);
205410
- table_storage.erase(old_dt);
205465
+ table_manager.InsertEntry(new_dt, move(new_storage));
205411
205466
  }
205412
205467
 
205413
205468
  void LocalStorage::ChangeType(DataTable *old_dt, DataTable *new_dt, idx_t changed_idx, const LogicalType &target_type,
205414
205469
  const vector<column_t> &bound_columns, Expression &cast_expr) {
205415
205470
  // check if there are any pending appends for the old version of the table
205416
- auto entry = table_storage.find(old_dt);
205417
- if (entry == table_storage.end()) {
205471
+ auto storage = table_manager.MoveEntry(old_dt);
205472
+ if (!storage) {
205418
205473
  return;
205419
205474
  }
205420
- auto storage = move(entry->second);
205421
205475
  auto new_storage =
205422
205476
  make_unique<LocalTableStorage>(*new_dt, *storage, changed_idx, target_type, bound_columns, cast_expr);
205423
-
205424
- table_storage[new_dt] = move(new_storage);
205425
- table_storage.erase(old_dt);
205477
+ table_manager.InsertEntry(new_dt, move(new_storage));
205426
205478
  }
205427
205479
 
205428
205480
  void LocalStorage::FetchChunk(DataTable *table, Vector &row_ids, idx_t count, DataChunk &verify_chunk) {
205429
- auto storage = GetStorage(table);
205481
+ auto storage = table_manager.GetStorage(table);
205482
+ if (!storage) {
205483
+ throw InternalException("LocalStorage::FetchChunk - local storage not found");
205484
+ }
205430
205485
 
205431
205486
  ColumnFetchState fetch_state;
205432
205487
  vector<column_t> col_ids;
@@ -205439,13 +205494,15 @@ void LocalStorage::FetchChunk(DataTable *table, Vector &row_ids, idx_t count, Da
205439
205494
  }
205440
205495
 
205441
205496
  TableIndexList &LocalStorage::GetIndexes(DataTable *table) {
205442
- auto storage = GetStorage(table);
205443
-
205497
+ auto storage = table_manager.GetStorage(table);
205498
+ if (!storage) {
205499
+ throw InternalException("LocalStorage::GetIndexes - local storage not found");
205500
+ }
205444
205501
  return storage->indexes;
205445
205502
  }
205446
205503
 
205447
205504
  void LocalStorage::VerifyNewConstraint(DataTable &parent, const BoundConstraint &constraint) {
205448
- auto storage = GetStorage(&parent);
205505
+ auto storage = table_manager.GetStorage(&parent);
205449
205506
  if (!storage) {
205450
205507
  return;
205451
205508
  }
@@ -207771,9 +207828,6 @@ idx_t ChunkVectorInfo::Delete(transaction_t transaction_id, row_t rows[], idx_t
207771
207828
  // tuple was already deleted by another transaction
207772
207829
  throw TransactionException("Conflict on tuple deletion!");
207773
207830
  }
207774
- if (inserted[rows[i]] >= TRANSACTION_ID_START) {
207775
- throw TransactionException("Deleting non-committed tuples is not supported (for now...)");
207776
- }
207777
207831
  // after verifying that there are no conflicts we mark the tuple as deleted
207778
207832
  deleted[rows[i]] = transaction_id;
207779
207833
  rows[deleted_tuples] = rows[i];
@@ -208297,7 +208351,7 @@ ColumnData::ColumnData(ColumnData &other, idx_t start, ColumnData *parent)
208297
208351
  : block_manager(other.block_manager), info(other.info), column_index(other.column_index), start(start),
208298
208352
  type(move(other.type)), parent(parent), updates(move(other.updates)), version(parent ? parent->version + 1 : 0) {
208299
208353
  idx_t offset = 0;
208300
- for (auto segment = other.data.GetRootSegment(); segment; segment = segment->next.get()) {
208354
+ for (auto segment = other.data.GetRootSegment(); segment; segment = segment->Next()) {
208301
208355
  auto &other = (ColumnSegment &)*segment;
208302
208356
  this->data.AppendSegment(ColumnSegment::CreateSegment(other, start + offset));
208303
208357
  offset += segment->count;
@@ -208327,8 +208381,9 @@ void ColumnData::IncrementVersion() {
208327
208381
  }
208328
208382
 
208329
208383
  idx_t ColumnData::GetMaxEntry() {
208330
- auto first_segment = data.GetRootSegment();
208331
- auto last_segment = data.GetLastSegment();
208384
+ auto l = data.Lock();
208385
+ auto first_segment = data.GetRootSegment(l);
208386
+ auto last_segment = data.GetLastSegment(l);
208332
208387
  if (!first_segment) {
208333
208388
  D_ASSERT(!last_segment);
208334
208389
  return 0;
@@ -208380,15 +208435,18 @@ idx_t ColumnData::ScanVector(ColumnScanState &state, Vector &result, idx_t remai
208380
208435
  state.row_index <= state.current->start + state.current->count);
208381
208436
  idx_t scan_count = MinValue<idx_t>(remaining, state.current->start + state.current->count - state.row_index);
208382
208437
  idx_t result_offset = initial_remaining - remaining;
208383
- state.current->Scan(state, scan_count, result, result_offset, scan_count == initial_remaining);
208438
+ if (scan_count > 0) {
208439
+ state.current->Scan(state, scan_count, result, result_offset, scan_count == initial_remaining);
208440
+
208441
+ state.row_index += scan_count;
208442
+ remaining -= scan_count;
208443
+ }
208384
208444
 
208385
- state.row_index += scan_count;
208386
- remaining -= scan_count;
208387
208445
  if (remaining > 0) {
208388
208446
  if (!state.current->next) {
208389
208447
  break;
208390
208448
  }
208391
- state.current = (ColumnSegment *)state.current->next.get();
208449
+ state.current = (ColumnSegment *)state.current->Next();
208392
208450
  state.current->InitializeScan(state);
208393
208451
  state.segment_checked = false;
208394
208452
  D_ASSERT(state.row_index >= state.current->start &&
@@ -208488,17 +208546,17 @@ void ColumnData::Append(BaseStatistics &stats, ColumnAppendState &state, Vector
208488
208546
  }
208489
208547
 
208490
208548
  void ColumnData::InitializeAppend(ColumnAppendState &state) {
208491
- lock_guard<mutex> tree_lock(data.node_lock);
208492
- if (data.nodes.empty()) {
208549
+ auto l = data.Lock();
208550
+ if (data.IsEmpty(l)) {
208493
208551
  // no segments yet, append an empty segment
208494
- AppendTransientSegment(start);
208552
+ AppendTransientSegment(l, start);
208495
208553
  }
208496
- auto segment = (ColumnSegment *)data.GetLastSegment();
208554
+ auto segment = (ColumnSegment *)data.GetLastSegment(l);
208497
208555
  if (segment->segment_type == ColumnSegmentType::PERSISTENT) {
208498
208556
  // no transient segments yet
208499
208557
  auto total_rows = segment->start + segment->count;
208500
- AppendTransientSegment(total_rows);
208501
- state.current = (ColumnSegment *)data.GetLastSegment();
208558
+ AppendTransientSegment(l, total_rows);
208559
+ state.current = (ColumnSegment *)data.GetLastSegment(l);
208502
208560
  } else {
208503
208561
  state.current = (ColumnSegment *)segment;
208504
208562
  }
@@ -208521,9 +208579,9 @@ void ColumnData::AppendData(BaseStatistics &stats, ColumnAppendState &state, Uni
208521
208579
 
208522
208580
  // we couldn't fit everything we wanted in the current column segment, create a new one
208523
208581
  {
208524
- lock_guard<mutex> tree_lock(data.node_lock);
208525
- AppendTransientSegment(state.current->start + state.current->count);
208526
- state.current = (ColumnSegment *)data.GetLastSegment();
208582
+ auto l = data.Lock();
208583
+ AppendTransientSegment(l, state.current->start + state.current->count);
208584
+ state.current = (ColumnSegment *)data.GetLastSegment(l);
208527
208585
  state.current->InitializeAppend(state);
208528
208586
  }
208529
208587
  offset += copied_elements;
@@ -208532,23 +208590,23 @@ void ColumnData::AppendData(BaseStatistics &stats, ColumnAppendState &state, Uni
208532
208590
  }
208533
208591
 
208534
208592
  void ColumnData::RevertAppend(row_t start_row) {
208535
- lock_guard<mutex> tree_lock(data.node_lock);
208593
+ auto l = data.Lock();
208536
208594
  // check if this row is in the segment tree at all
208537
- if (idx_t(start_row) >= data.nodes.back().row_start + data.nodes.back().node->count) {
208595
+ auto last_segment = data.GetLastSegment(l);
208596
+ if (idx_t(start_row) >= last_segment->start + last_segment->count) {
208538
208597
  // the start row is equal to the final portion of the column data: nothing was ever appended here
208539
- D_ASSERT(idx_t(start_row) == data.nodes.back().row_start + data.nodes.back().node->count);
208598
+ D_ASSERT(idx_t(start_row) == last_segment->start + last_segment->count);
208540
208599
  return;
208541
208600
  }
208542
208601
  // find the segment index that the current row belongs to
208543
- idx_t segment_index = data.GetSegmentIndex(start_row);
208544
- auto segment = data.nodes[segment_index].node;
208602
+ idx_t segment_index = data.GetSegmentIndex(l, start_row);
208603
+ auto segment = data.GetSegmentByIndex(l, segment_index);
208545
208604
  auto &transient = (ColumnSegment &)*segment;
208546
208605
  D_ASSERT(transient.segment_type == ColumnSegmentType::TRANSIENT);
208547
208606
 
208548
208607
  // remove any segments AFTER this segment: they should be deleted entirely
208549
- if (segment_index < data.nodes.size() - 1) {
208550
- data.nodes.erase(data.nodes.begin() + segment_index + 1, data.nodes.end());
208551
- }
208608
+ data.EraseSegments(l, segment_index);
208609
+
208552
208610
  segment->next = nullptr;
208553
208611
  transient.RevertAppend(start_row);
208554
208612
  }
@@ -208602,9 +208660,9 @@ unique_ptr<BaseStatistics> ColumnData::GetUpdateStatistics() {
208602
208660
  return updates ? updates->GetStatistics() : nullptr;
208603
208661
  }
208604
208662
 
208605
- void ColumnData::AppendTransientSegment(idx_t start_row) {
208663
+ void ColumnData::AppendTransientSegment(SegmentLock &l, idx_t start_row) {
208606
208664
  auto new_segment = ColumnSegment::CreateTransientSegment(GetDatabase(), type, start_row);
208607
- data.AppendSegment(move(new_segment));
208665
+ data.AppendSegment(l, move(new_segment));
208608
208666
  }
208609
208667
 
208610
208668
  void ColumnData::CommitDropColumn() {
@@ -208616,7 +208674,7 @@ void ColumnData::CommitDropColumn() {
208616
208674
  block_manager.MarkBlockAsModified(block_id);
208617
208675
  }
208618
208676
  }
208619
- segment = (ColumnSegment *)segment->next.get();
208677
+ segment = (ColumnSegment *)segment->Next();
208620
208678
  }
208621
208679
  }
208622
208680
 
@@ -208642,17 +208700,19 @@ unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group,
208642
208700
  auto checkpoint_state = CreateCheckpointState(row_group, partial_block_manager);
208643
208701
  checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
208644
208702
 
208645
- if (!data.root_node) {
208703
+ auto l = data.Lock();
208704
+ auto nodes = data.MoveSegments(l);
208705
+ if (nodes.empty()) {
208646
208706
  // empty table: flush the empty list
208647
208707
  return checkpoint_state;
208648
208708
  }
208649
208709
  lock_guard<mutex> update_guard(update_lock);
208650
208710
 
208651
208711
  ColumnDataCheckpointer checkpointer(*this, row_group, *checkpoint_state, checkpoint_info);
208652
- checkpointer.Checkpoint(move(data.root_node));
208712
+ checkpointer.Checkpoint(move(nodes));
208653
208713
 
208654
208714
  // replace the old tree with the new one
208655
- data.Replace(checkpoint_state->new_tree);
208715
+ data.Replace(l, checkpoint_state->new_tree);
208656
208716
  version++;
208657
208717
 
208658
208718
  return checkpoint_state;
@@ -208743,13 +208803,14 @@ void ColumnData::GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, v
208743
208803
  result.push_back(move(column_info));
208744
208804
 
208745
208805
  segment_idx++;
208746
- segment = (ColumnSegment *)segment->next.get();
208806
+ segment = (ColumnSegment *)segment->Next();
208747
208807
  }
208748
208808
  }
208749
208809
 
208750
208810
  void ColumnData::Verify(RowGroup &parent) {
208751
208811
  #ifdef DEBUG
208752
208812
  D_ASSERT(this->start == parent.start);
208813
+ data.Verify();
208753
208814
  auto root = data.GetRootSegment();
208754
208815
  if (root) {
208755
208816
  D_ASSERT(root != nullptr);
@@ -208761,7 +208822,7 @@ void ColumnData::Verify(RowGroup &parent) {
208761
208822
  if (!root->next) {
208762
208823
  D_ASSERT(prev_end == parent.start + parent.count);
208763
208824
  }
208764
- root = root->next.get();
208825
+ root = root->Next();
208765
208826
  }
208766
208827
  }
208767
208828
  #endif
@@ -208853,7 +208914,8 @@ ColumnCheckpointState &ColumnDataCheckpointer::GetCheckpointState() {
208853
208914
 
208854
208915
  void ColumnDataCheckpointer::ScanSegments(const std::function<void(Vector &, idx_t)> &callback) {
208855
208916
  Vector scan_vector(intermediate.GetType(), nullptr);
208856
- for (auto segment = (ColumnSegment *)owned_segment.get(); segment; segment = (ColumnSegment *)segment->next.get()) {
208917
+ for (idx_t segment_idx = 0; segment_idx < nodes.size(); segment_idx++) {
208918
+ auto segment = (ColumnSegment *)nodes[segment_idx].node.get();
208857
208919
  ColumnScanState scan_state;
208858
208920
  scan_state.current = segment;
208859
208921
  segment->InitializeScan(scan_state);
@@ -208977,7 +209039,8 @@ void ColumnDataCheckpointer::WriteToDisk() {
208977
209039
  // if there are any persistent segments, we will mark their old block ids as modified
208978
209040
  // since the segments will be rewritten their old on disk data is no longer required
208979
209041
  auto &block_manager = col_data.block_manager;
208980
- for (auto segment = (ColumnSegment *)owned_segment.get(); segment; segment = (ColumnSegment *)segment->next.get()) {
209042
+ for (idx_t segment_idx = 0; segment_idx < nodes.size(); segment_idx++) {
209043
+ auto segment = (ColumnSegment *)nodes[segment_idx].node.get();
208981
209044
  if (segment->segment_type == ColumnSegmentType::PERSISTENT) {
208982
209045
  // persistent segment has updates: mark it as modified and rewrite the block with the merged updates
208983
209046
  auto block_id = segment->GetBlockId();
@@ -209003,11 +209066,12 @@ void ColumnDataCheckpointer::WriteToDisk() {
209003
209066
  [&](Vector &scan_vector, idx_t count) { best_function->compress(*compress_state, scan_vector, count); });
209004
209067
  best_function->compress_finalize(*compress_state);
209005
209068
 
209006
- owned_segment.reset();
209069
+ nodes.clear();
209007
209070
  }
209008
209071
 
209009
209072
  bool ColumnDataCheckpointer::HasChanges() {
209010
- for (auto segment = (ColumnSegment *)owned_segment.get(); segment; segment = (ColumnSegment *)segment->next.get()) {
209073
+ for (idx_t segment_idx = 0; segment_idx < nodes.size(); segment_idx++) {
209074
+ auto segment = (ColumnSegment *)nodes[segment_idx].node.get();
209011
209075
  if (segment->segment_type == ColumnSegmentType::TRANSIENT) {
209012
209076
  // transient segment: always need to write to disk
209013
209077
  return true;
@@ -209026,10 +209090,8 @@ bool ColumnDataCheckpointer::HasChanges() {
209026
209090
  void ColumnDataCheckpointer::WritePersistentSegments() {
209027
209091
  // all segments are persistent and there are no updates
209028
209092
  // we only need to write the metadata
209029
- auto segment = (ColumnSegment *)owned_segment.get();
209030
- while (segment) {
209031
- auto next_segment = move(segment->next);
209032
-
209093
+ for (idx_t segment_idx = 0; segment_idx < nodes.size(); segment_idx++) {
209094
+ auto segment = (ColumnSegment *)nodes[segment_idx].node.get();
209033
209095
  D_ASSERT(segment->segment_type == ColumnSegmentType::PERSISTENT);
209034
209096
 
209035
209097
  // set up the data pointer directly using the data from the persistent segment
@@ -209045,19 +209107,15 @@ void ColumnDataCheckpointer::WritePersistentSegments() {
209045
209107
  state.global_stats->Merge(*segment->stats.statistics);
209046
209108
 
209047
209109
  // directly append the current segment to the new tree
209048
- state.new_tree.AppendSegment(move(owned_segment));
209110
+ state.new_tree.AppendSegment(move(nodes[segment_idx].node));
209049
209111
 
209050
209112
  state.data_pointers.push_back(move(pointer));
209051
-
209052
- // move to the next segment in the list
209053
- owned_segment = move(next_segment);
209054
- segment = (ColumnSegment *)owned_segment.get();
209055
209113
  }
209056
209114
  }
209057
209115
 
209058
- void ColumnDataCheckpointer::Checkpoint(unique_ptr<SegmentBase> segment) {
209059
- D_ASSERT(!owned_segment);
209060
- this->owned_segment = move(segment);
209116
+ void ColumnDataCheckpointer::Checkpoint(vector<SegmentNode> nodes) {
209117
+ D_ASSERT(!nodes.empty());
209118
+ this->nodes = move(nodes);
209061
209119
  // first check if any of the segments have changes
209062
209120
  if (!HasChanges()) {
209063
209121
  // no changes: only need to write the metadata for this column
@@ -210917,21 +210975,27 @@ Allocator &RowGroupCollection::GetAllocator() const {
210917
210975
  //===--------------------------------------------------------------------===//
210918
210976
  void RowGroupCollection::Initialize(PersistentTableData &data) {
210919
210977
  D_ASSERT(this->row_start == 0);
210978
+ auto l = row_groups->Lock();
210920
210979
  for (auto &row_group_pointer : data.row_groups) {
210921
210980
  auto new_row_group = make_unique<RowGroup>(info->db, block_manager, *info, types, move(row_group_pointer));
210922
210981
  auto row_group_count = new_row_group->start + new_row_group->count;
210923
210982
  if (row_group_count > this->total_rows) {
210924
210983
  this->total_rows = row_group_count;
210925
210984
  }
210926
- row_groups->AppendSegment(move(new_row_group));
210985
+ row_groups->AppendSegment(l, move(new_row_group));
210927
210986
  }
210987
+ stats.Initialize(types, data);
210928
210988
  }
210929
210989
 
210930
- void RowGroupCollection::AppendRowGroup(idx_t start_row) {
210990
+ void RowGroupCollection::InitializeEmpty() {
210991
+ stats.InitializeEmpty(types);
210992
+ }
210993
+
210994
+ void RowGroupCollection::AppendRowGroup(SegmentLock &l, idx_t start_row) {
210931
210995
  D_ASSERT(start_row >= row_start);
210932
210996
  auto new_row_group = make_unique<RowGroup>(info->db, block_manager, *info, start_row, 0);
210933
210997
  new_row_group->InitializeEmpty(types);
210934
- row_groups->AppendSegment(move(new_row_group));
210998
+ row_groups->AppendSegment(l, move(new_row_group));
210935
210999
  }
210936
211000
 
210937
211001
  RowGroup *RowGroupCollection::GetRowGroup(int64_t index) {
@@ -210941,7 +211005,8 @@ RowGroup *RowGroupCollection::GetRowGroup(int64_t index) {
210941
211005
  void RowGroupCollection::Verify() {
210942
211006
  #ifdef DEBUG
210943
211007
  idx_t current_total_rows = 0;
210944
- for (auto segment = row_groups->GetRootSegment(); segment; segment = segment->next.get()) {
211008
+ row_groups->Verify();
211009
+ for (auto segment = row_groups->GetRootSegment(); segment; segment = segment->Next()) {
210945
211010
  auto &row_group = (RowGroup &)*segment;
210946
211011
  row_group.Verify();
210947
211012
  D_ASSERT(row_group.start == this->row_start + current_total_rows);
@@ -210960,12 +211025,12 @@ void RowGroupCollection::InitializeScan(CollectionScanState &state, const vector
210960
211025
  D_ASSERT(row_group);
210961
211026
  state.max_row = row_start + total_rows;
210962
211027
  while (row_group && !row_group->InitializeScan(state.row_group_state)) {
210963
- row_group = (RowGroup *)row_group->next.get();
211028
+ row_group = (RowGroup *)row_group->Next();
210964
211029
  }
210965
211030
  }
210966
211031
 
210967
211032
  void RowGroupCollection::InitializeCreateIndexScan(CreateIndexScanState &state) {
210968
- state.delete_lock = std::unique_lock<mutex>(row_groups->node_lock);
211033
+ state.segment_lock = row_groups->Lock();
210969
211034
  }
210970
211035
 
210971
211036
  void RowGroupCollection::InitializeScanWithOffset(CollectionScanState &state, const vector<column_t> &column_ids,
@@ -211011,11 +211076,11 @@ bool RowGroupCollection::NextParallelScan(ClientContext &context, ParallelCollec
211011
211076
  if (ClientConfig::GetConfig(context).verify_parallelism) {
211012
211077
  state.vector_index++;
211013
211078
  if (state.vector_index * STANDARD_VECTOR_SIZE >= state.current_row_group->count) {
211014
- state.current_row_group = (RowGroup *)state.current_row_group->next.get();
211079
+ state.current_row_group = (RowGroup *)state.current_row_group->Next();
211015
211080
  state.vector_index = 0;
211016
211081
  }
211017
211082
  } else {
211018
- state.current_row_group = (RowGroup *)state.current_row_group->next.get();
211083
+ state.current_row_group = (RowGroup *)state.current_row_group->Next();
211019
211084
  }
211020
211085
  if (!need_to_scan) {
211021
211086
  // filters allow us to skip this row group: move to the next row group
@@ -211026,6 +211091,41 @@ bool RowGroupCollection::NextParallelScan(ClientContext &context, ParallelCollec
211026
211091
  return false;
211027
211092
  }
211028
211093
 
211094
+ bool RowGroupCollection::Scan(Transaction &transaction, const vector<column_t> &column_ids,
211095
+ const std::function<bool(DataChunk &chunk)> &fun) {
211096
+ vector<LogicalType> scan_types;
211097
+ for (idx_t i = 0; i < column_ids.size(); i++) {
211098
+ scan_types.push_back(types[column_ids[i]]);
211099
+ }
211100
+ DataChunk chunk;
211101
+ chunk.Initialize(GetAllocator(), scan_types);
211102
+
211103
+ // initialize the scan
211104
+ TableScanState state;
211105
+ state.Initialize(column_ids, nullptr);
211106
+ InitializeScan(state.local_state, column_ids, nullptr);
211107
+
211108
+ while (true) {
211109
+ chunk.Reset();
211110
+ state.local_state.Scan(transaction, chunk);
211111
+ if (chunk.size() == 0) {
211112
+ return true;
211113
+ }
211114
+ if (!fun(chunk)) {
211115
+ return false;
211116
+ }
211117
+ }
211118
+ }
211119
+
211120
+ bool RowGroupCollection::Scan(Transaction &transaction, const std::function<bool(DataChunk &chunk)> &fun) {
211121
+ vector<column_t> column_ids;
211122
+ column_ids.reserve(types.size());
211123
+ for (idx_t i = 0; i < types.size(); i++) {
211124
+ column_ids.push_back(i);
211125
+ }
211126
+ return Scan(transaction, column_ids, fun);
211127
+ }
211128
+
211029
211129
  //===--------------------------------------------------------------------===//
211030
211130
  // Fetch
211031
211131
  //===--------------------------------------------------------------------===//
@@ -211036,7 +211136,16 @@ void RowGroupCollection::Fetch(TransactionData transaction, DataChunk &result, c
211036
211136
  idx_t count = 0;
211037
211137
  for (idx_t i = 0; i < fetch_count; i++) {
211038
211138
  auto row_id = row_ids[i];
211039
- auto row_group = (RowGroup *)row_groups->GetSegment(row_id);
211139
+ RowGroup *row_group;
211140
+ {
211141
+ idx_t segment_index;
211142
+ auto l = row_groups->Lock();
211143
+ if (!row_groups->TryGetSegmentIndex(l, row_id, segment_index)) {
211144
+ // in parallel append scenarios it is possible for the row_id
211145
+ continue;
211146
+ }
211147
+ row_group = (RowGroup *)row_groups->GetSegmentByIndex(l, segment_index);
211148
+ }
211040
211149
  if (!row_group->Fetch(transaction, row_id - row_group->start)) {
211041
211150
  continue;
211042
211151
  }
@@ -211058,7 +211167,12 @@ TableAppendState::~TableAppendState() {
211058
211167
  }
211059
211168
 
211060
211169
  bool RowGroupCollection::IsEmpty() const {
211061
- return row_groups->GetRootSegment() == nullptr;
211170
+ auto l = row_groups->Lock();
211171
+ return IsEmpty(l);
211172
+ }
211173
+
211174
+ bool RowGroupCollection::IsEmpty(SegmentLock &l) const {
211175
+ return row_groups->IsEmpty(l);
211062
211176
  }
211063
211177
 
211064
211178
  void RowGroupCollection::InitializeAppend(TransactionData transaction, TableAppendState &state, idx_t append_count) {
@@ -211067,17 +211181,17 @@ void RowGroupCollection::InitializeAppend(TransactionData transaction, TableAppe
211067
211181
  state.total_append_count = 0;
211068
211182
 
211069
211183
  // start writing to the row_groups
211070
- lock_guard<mutex> row_group_lock(row_groups->node_lock);
211071
- if (IsEmpty()) {
211184
+ auto l = row_groups->Lock();
211185
+ if (IsEmpty(l)) {
211072
211186
  // empty row group collection: empty first row group
211073
- AppendRowGroup(row_start);
211187
+ AppendRowGroup(l, row_start);
211074
211188
  }
211075
- state.start_row_group = (RowGroup *)row_groups->GetLastSegment();
211189
+ state.start_row_group = (RowGroup *)row_groups->GetLastSegment(l);
211076
211190
  D_ASSERT(this->row_start + total_rows == state.start_row_group->start + state.start_row_group->count);
211077
211191
  state.start_row_group->InitializeAppend(state.row_group_append_state);
211078
211192
  state.remaining = append_count;
211193
+ state.transaction = transaction;
211079
211194
  if (state.remaining > 0) {
211080
- state.transaction = transaction;
211081
211195
  state.start_row_group->AppendVersionInfo(transaction, state.remaining);
211082
211196
  total_rows += state.remaining;
211083
211197
  }
@@ -211088,7 +211202,7 @@ void RowGroupCollection::InitializeAppend(TableAppendState &state) {
211088
211202
  InitializeAppend(tdata, state, 0);
211089
211203
  }
211090
211204
 
211091
- bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state, TableStatistics &stats) {
211205
+ bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state) {
211092
211206
  D_ASSERT(chunk.ColumnCount() == types.size());
211093
211207
  chunk.Verify();
211094
211208
 
@@ -211128,10 +211242,11 @@ bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state, Table
211128
211242
  // append a new row_group
211129
211243
  new_row_group = true;
211130
211244
  auto next_start = current_row_group->start + state.row_group_append_state.offset_in_row_group;
211131
- AppendRowGroup(next_start);
211245
+
211246
+ auto l = row_groups->Lock();
211247
+ AppendRowGroup(l, next_start);
211132
211248
  // set up the append state for this row_group
211133
- lock_guard<mutex> row_group_lock(row_groups->node_lock);
211134
- auto last_row_group = (RowGroup *)row_groups->GetLastSegment();
211249
+ auto last_row_group = (RowGroup *)row_groups->GetLastSegment(l);
211135
211250
  last_row_group->InitializeAppend(state.row_group_append_state);
211136
211251
  if (state.remaining > 0) {
211137
211252
  last_row_group->AppendVersionInfo(state.transaction, state.remaining);
@@ -211154,19 +211269,20 @@ bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state, Table
211154
211269
  }
211155
211270
 
211156
211271
  void RowGroupCollection::FinalizeAppend(TransactionData transaction, TableAppendState &state) {
211157
- D_ASSERT(state.transaction.transaction_id == 0);
211158
211272
  auto remaining = state.total_append_count;
211159
211273
  auto row_group = state.start_row_group;
211160
211274
  while (remaining > 0) {
211161
211275
  auto append_count = MinValue<idx_t>(remaining, RowGroup::ROW_GROUP_SIZE - row_group->count);
211162
211276
  row_group->AppendVersionInfo(transaction, append_count);
211163
211277
  remaining -= append_count;
211164
- row_group = (RowGroup *)row_group->next.get();
211278
+ row_group = (RowGroup *)row_group->Next();
211165
211279
  }
211166
211280
  total_rows += state.total_append_count;
211167
211281
 
211168
211282
  state.total_append_count = 0;
211169
211283
  state.start_row_group = nullptr;
211284
+
211285
+ Verify();
211170
211286
  }
211171
211287
 
211172
211288
  void RowGroupCollection::CommitAppend(transaction_t commit_id, idx_t row_start, idx_t count) {
@@ -211185,7 +211301,7 @@ void RowGroupCollection::CommitAppend(transaction_t commit_id, idx_t row_start,
211185
211301
  if (remaining == 0) {
211186
211302
  break;
211187
211303
  }
211188
- row_group = (RowGroup *)row_group->next.get();
211304
+ row_group = (RowGroup *)row_group->Next();
211189
211305
  }
211190
211306
  }
211191
211307
 
@@ -211195,16 +211311,15 @@ void RowGroupCollection::RevertAppendInternal(idx_t start_row, idx_t count) {
211195
211311
  }
211196
211312
  total_rows = start_row;
211197
211313
 
211198
- lock_guard<mutex> tree_lock(row_groups->node_lock);
211314
+ auto l = row_groups->Lock();
211199
211315
  // find the segment index that the current row belongs to
211200
- idx_t segment_index = row_groups->GetSegmentIndex(start_row);
211201
- auto segment = row_groups->nodes[segment_index].node;
211316
+ idx_t segment_index = row_groups->GetSegmentIndex(l, start_row);
211317
+ auto segment = row_groups->GetSegmentByIndex(l, segment_index);
211202
211318
  auto &info = (RowGroup &)*segment;
211203
211319
 
211204
211320
  // remove any segments AFTER this segment: they should be deleted entirely
211205
- if (segment_index < row_groups->nodes.size() - 1) {
211206
- row_groups->nodes.erase(row_groups->nodes.begin() + segment_index + 1, row_groups->nodes.end());
211207
- }
211321
+ row_groups->EraseSegments(l, segment_index);
211322
+
211208
211323
  info.next = nullptr;
211209
211324
  info.RevertAppend(start_row);
211210
211325
  }
@@ -211212,12 +211327,13 @@ void RowGroupCollection::RevertAppendInternal(idx_t start_row, idx_t count) {
211212
211327
  void RowGroupCollection::MergeStorage(RowGroupCollection &data) {
211213
211328
  D_ASSERT(data.types == types);
211214
211329
  auto index = row_start + total_rows.load();
211215
- for (auto segment = data.row_groups->GetRootSegment(); segment; segment = segment->next.get()) {
211330
+ for (auto segment = data.row_groups->GetRootSegment(); segment; segment = segment->Next()) {
211216
211331
  auto &row_group = (RowGroup &)*segment;
211217
211332
  auto new_group = make_unique<RowGroup>(row_group, index);
211218
211333
  index += new_group->count;
211219
211334
  row_groups->AppendSegment(move(new_group));
211220
211335
  }
211336
+ stats.MergeStats(data.stats);
211221
211337
  total_rows += data.total_rows.load();
211222
211338
  }
211223
211339
 
@@ -211255,7 +211371,7 @@ idx_t RowGroupCollection::Delete(TransactionData transaction, DataTable *table,
211255
211371
  // Update
211256
211372
  //===--------------------------------------------------------------------===//
211257
211373
  void RowGroupCollection::Update(TransactionData transaction, row_t *ids, const vector<column_t> &column_ids,
211258
- DataChunk &updates, TableStatistics &stats) {
211374
+ DataChunk &updates) {
211259
211375
  idx_t pos = 0;
211260
211376
  do {
211261
211377
  idx_t start = pos;
@@ -211327,7 +211443,7 @@ void RowGroupCollection::RemoveFromIndexes(TableIndexList &indexes, Vector &row_
211327
211443
  }
211328
211444
 
211329
211445
  void RowGroupCollection::UpdateColumn(TransactionData transaction, Vector &row_ids, const vector<column_t> &column_path,
211330
- DataChunk &updates, TableStatistics &stats) {
211446
+ DataChunk &updates) {
211331
211447
  auto first_id = FlatVector::GetValue<row_t>(row_ids, 0);
211332
211448
  if (first_id >= MAX_ROW_ID) {
211333
211449
  throw NotImplementedException("Cannot update a column-path on transaction local data");
@@ -211345,7 +211461,7 @@ void RowGroupCollection::UpdateColumn(TransactionData transaction, Vector &row_i
211345
211461
  //===--------------------------------------------------------------------===//
211346
211462
  void RowGroupCollection::Checkpoint(TableDataWriter &writer, vector<unique_ptr<BaseStatistics>> &global_stats) {
211347
211463
  for (auto row_group = (RowGroup *)row_groups->GetRootSegment(); row_group;
211348
- row_group = (RowGroup *)row_group->next.get()) {
211464
+ row_group = (RowGroup *)row_group->Next()) {
211349
211465
  auto rowg_writer = writer.GetRowGroupWriter(*row_group);
211350
211466
  auto pointer = row_group->Checkpoint(*rowg_writer, global_stats);
211351
211467
  writer.AddRowGroup(move(pointer), move(rowg_writer));
@@ -211359,7 +211475,7 @@ void RowGroupCollection::CommitDropColumn(idx_t index) {
211359
211475
  auto segment = (RowGroup *)row_groups->GetRootSegment();
211360
211476
  while (segment) {
211361
211477
  segment->CommitDropColumn(index);
211362
- segment = (RowGroup *)segment->next.get();
211478
+ segment = (RowGroup *)segment->Next();
211363
211479
  }
211364
211480
  }
211365
211481
 
@@ -211367,7 +211483,7 @@ void RowGroupCollection::CommitDropTable() {
211367
211483
  auto segment = (RowGroup *)row_groups->GetRootSegment();
211368
211484
  while (segment) {
211369
211485
  segment->CommitDrop();
211370
- segment = (RowGroup *)segment->next.get();
211486
+ segment = (RowGroup *)segment->Next();
211371
211487
  }
211372
211488
  }
211373
211489
 
@@ -211383,7 +211499,7 @@ vector<vector<Value>> RowGroupCollection::GetStorageInfo() {
211383
211499
  row_group->GetStorageInfo(row_group_index, result);
211384
211500
  row_group_index++;
211385
211501
 
211386
- row_group = (RowGroup *)row_group->next.get();
211502
+ row_group = (RowGroup *)row_group->Next();
211387
211503
  }
211388
211504
 
211389
211505
  return result;
@@ -211392,8 +211508,7 @@ vector<vector<Value>> RowGroupCollection::GetStorageInfo() {
211392
211508
  //===--------------------------------------------------------------------===//
211393
211509
  // Alter
211394
211510
  //===--------------------------------------------------------------------===//
211395
- shared_ptr<RowGroupCollection> RowGroupCollection::AddColumn(ColumnDefinition &new_column, Expression *default_value,
211396
- ColumnStatistics &stats) {
211511
+ shared_ptr<RowGroupCollection> RowGroupCollection::AddColumn(ColumnDefinition &new_column, Expression *default_value) {
211397
211512
  idx_t new_column_idx = types.size();
211398
211513
  auto new_types = types;
211399
211514
  new_types.push_back(new_column.GetType());
@@ -211408,16 +211523,19 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AddColumn(ColumnDefinition &n
211408
211523
  executor.AddExpression(*default_value);
211409
211524
  }
211410
211525
 
211526
+ result->stats.InitializeAddColumn(stats, new_column.GetType());
211527
+ auto &new_column_stats = result->stats.GetStats(new_column_idx);
211528
+
211411
211529
  // fill the column with its DEFAULT value, or NULL if none is specified
211412
211530
  auto new_stats = make_unique<SegmentStatistics>(new_column.GetType());
211413
211531
  auto current_row_group = (RowGroup *)row_groups->GetRootSegment();
211414
211532
  while (current_row_group) {
211415
211533
  auto new_row_group = current_row_group->AddColumn(new_column, executor, default_value, default_vector);
211416
211534
  // merge in the statistics
211417
- new_row_group->MergeIntoStatistics(new_column_idx, *stats.stats);
211535
+ new_row_group->MergeIntoStatistics(new_column_idx, *new_column_stats.stats);
211418
211536
 
211419
211537
  result->row_groups->AppendSegment(move(new_row_group));
211420
- current_row_group = (RowGroup *)current_row_group->next.get();
211538
+ current_row_group = (RowGroup *)current_row_group->Next();
211421
211539
  }
211422
211540
  return result;
211423
211541
  }
@@ -211428,24 +211546,25 @@ shared_ptr<RowGroupCollection> RowGroupCollection::RemoveColumn(idx_t col_idx) {
211428
211546
  new_types.erase(new_types.begin() + col_idx);
211429
211547
 
211430
211548
  auto result = make_shared<RowGroupCollection>(info, block_manager, move(new_types), row_start, total_rows.load());
211549
+ result->stats.InitializeRemoveColumn(stats, col_idx);
211431
211550
 
211432
211551
  auto current_row_group = (RowGroup *)row_groups->GetRootSegment();
211433
211552
  while (current_row_group) {
211434
211553
  auto new_row_group = current_row_group->RemoveColumn(col_idx);
211435
211554
  result->row_groups->AppendSegment(move(new_row_group));
211436
- current_row_group = (RowGroup *)current_row_group->next.get();
211555
+ current_row_group = (RowGroup *)current_row_group->Next();
211437
211556
  }
211438
211557
  return result;
211439
211558
  }
211440
211559
 
211441
211560
  shared_ptr<RowGroupCollection> RowGroupCollection::AlterType(idx_t changed_idx, const LogicalType &target_type,
211442
- vector<column_t> bound_columns, Expression &cast_expr,
211443
- ColumnStatistics &stats) {
211561
+ vector<column_t> bound_columns, Expression &cast_expr) {
211444
211562
  D_ASSERT(changed_idx < types.size());
211445
211563
  auto new_types = types;
211446
211564
  new_types[changed_idx] = target_type;
211447
211565
 
211448
211566
  auto result = make_shared<RowGroupCollection>(info, block_manager, move(new_types), row_start, total_rows.load());
211567
+ result->stats.InitializeAlterType(stats, changed_idx, target_type);
211449
211568
 
211450
211569
  vector<LogicalType> scan_types;
211451
211570
  for (idx_t i = 0; i < bound_columns.size(); i++) {
@@ -211467,12 +211586,13 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AlterType(idx_t changed_idx,
211467
211586
 
211468
211587
  // now alter the type of the column within all of the row_groups individually
211469
211588
  auto current_row_group = (RowGroup *)row_groups->GetRootSegment();
211589
+ auto &changed_stats = result->stats.GetStats(changed_idx);
211470
211590
  while (current_row_group) {
211471
211591
  auto new_row_group = current_row_group->AlterType(target_type, changed_idx, executor,
211472
211592
  scan_state.table_state.row_group_state, scan_chunk);
211473
- new_row_group->MergeIntoStatistics(changed_idx, *stats.stats);
211593
+ new_row_group->MergeIntoStatistics(changed_idx, *changed_stats.stats);
211474
211594
  result->row_groups->AppendSegment(move(new_row_group));
211475
- current_row_group = (RowGroup *)current_row_group->next.get();
211595
+ current_row_group = (RowGroup *)current_row_group->Next();
211476
211596
  }
211477
211597
 
211478
211598
  return result;
@@ -211494,9 +211614,9 @@ void RowGroupCollection::VerifyNewConstraint(DataTable &parent, const BoundConst
211494
211614
  vector<column_t> cids;
211495
211615
  cids.push_back(not_null_constraint.index);
211496
211616
  // Use ScanCommitted to scan the latest committed data
211497
- InitializeCreateIndexScan(state);
211498
211617
  state.Initialize(cids, nullptr);
211499
211618
  InitializeScan(state.table_state, cids, nullptr);
211619
+ InitializeCreateIndexScan(state);
211500
211620
  while (true) {
211501
211621
  scan_chunk.Reset();
211502
211622
  state.table_state.ScanCommitted(scan_chunk, TableScanType::TABLE_SCAN_COMMITTED_ROWS_OMIT_PERMANENTLY_DELETED);
@@ -211511,6 +211631,19 @@ void RowGroupCollection::VerifyNewConstraint(DataTable &parent, const BoundConst
211511
211631
  }
211512
211632
  }
211513
211633
 
211634
+ //===--------------------------------------------------------------------===//
211635
+ // Statistics
211636
+ //===--------------------------------------------------------------------===//
211637
+ unique_ptr<BaseStatistics> RowGroupCollection::CopyStats(column_t column_id) {
211638
+ return stats.CopyStats(column_id);
211639
+ }
211640
+
211641
+ void RowGroupCollection::SetStatistics(column_t column_id, const std::function<void(BaseStatistics &)> &set_fun) {
211642
+ D_ASSERT(column_id != COLUMN_IDENTIFIER_ROW_ID);
211643
+ auto stats_guard = stats.GetLock();
211644
+ set_fun(*stats.GetStats(column_id).stats);
211645
+ }
211646
+
211514
211647
  } // namespace duckdb
211515
211648
 
211516
211649
 
@@ -211549,7 +211682,7 @@ void ColumnScanState::NextInternal(idx_t count) {
211549
211682
  }
211550
211683
  row_index += count;
211551
211684
  while (row_index >= current->start + current->count) {
211552
- current = (ColumnSegment *)current->next.get();
211685
+ current = (ColumnSegment *)current->Next();
211553
211686
  initialized = false;
211554
211687
  segment_checked = false;
211555
211688
  if (!current) {
@@ -211606,7 +211739,7 @@ bool CollectionScanState::Scan(Transaction &transaction, DataChunk &result) {
211606
211739
  return true;
211607
211740
  } else {
211608
211741
  do {
211609
- current_row_group = row_group_state.row_group = (RowGroup *)current_row_group->next.get();
211742
+ current_row_group = row_group_state.row_group = (RowGroup *)current_row_group->Next();
211610
211743
  if (current_row_group) {
211611
211744
  bool scan_row_group = current_row_group->InitializeScan(row_group_state);
211612
211745
  if (scan_row_group) {
@@ -211627,7 +211760,7 @@ bool CollectionScanState::ScanCommitted(DataChunk &result, TableScanType type) {
211627
211760
  if (result.size() > 0) {
211628
211761
  return true;
211629
211762
  } else {
211630
- current_row_group = row_group_state.row_group = (RowGroup *)current_row_group->next.get();
211763
+ current_row_group = row_group_state.row_group = (RowGroup *)current_row_group->Next();
211631
211764
  if (current_row_group) {
211632
211765
  current_row_group->InitializeScan(row_group_state);
211633
211766
  }
@@ -211640,37 +211773,74 @@ bool CollectionScanState::ScanCommitted(DataChunk &result, TableScanType type) {
211640
211773
 
211641
211774
 
211642
211775
 
211776
+
211643
211777
  namespace duckdb {
211644
211778
 
211779
+ SegmentLock SegmentTree::Lock() {
211780
+ return SegmentLock(node_lock);
211781
+ }
211782
+
211783
+ bool SegmentTree::IsEmpty(SegmentLock &) {
211784
+ return nodes.empty();
211785
+ }
211786
+
211787
+ SegmentBase *SegmentTree::GetRootSegment(SegmentLock &l) {
211788
+ return nodes.empty() ? nullptr : nodes[0].node.get();
211789
+ }
211790
+
211791
+ vector<SegmentNode> SegmentTree::MoveSegments(SegmentLock &) {
211792
+ return move(nodes);
211793
+ }
211794
+
211645
211795
  SegmentBase *SegmentTree::GetRootSegment() {
211646
- return root_node.get();
211796
+ auto l = Lock();
211797
+ return GetRootSegment(l);
211647
211798
  }
211648
211799
 
211649
- SegmentBase *SegmentTree::GetSegmentByIndex(int64_t index) {
211800
+ SegmentBase *SegmentTree::GetSegmentByIndex(SegmentLock &, int64_t index) {
211650
211801
  if (index < 0) {
211651
211802
  index = nodes.size() + index;
211652
211803
  if (index < 0) {
211653
211804
  return nullptr;
211654
211805
  }
211655
- return nodes[index].node;
211806
+ return nodes[index].node.get();
211656
211807
  } else {
211657
211808
  if (idx_t(index) >= nodes.size()) {
211658
211809
  return nullptr;
211659
211810
  }
211660
- return nodes[index].node;
211811
+ return nodes[index].node.get();
211812
+ }
211813
+ }
211814
+ SegmentBase *SegmentTree::GetSegmentByIndex(int64_t index) {
211815
+ auto l = Lock();
211816
+ return GetSegmentByIndex(l, index);
211817
+ }
211818
+
211819
+ SegmentBase *SegmentTree::GetLastSegment(SegmentLock &l) {
211820
+ if (nodes.empty()) {
211821
+ return nullptr;
211661
211822
  }
211823
+ return nodes.back().node.get();
211662
211824
  }
211663
211825
 
211664
211826
  SegmentBase *SegmentTree::GetLastSegment() {
211665
- return nodes.empty() ? nullptr : nodes.back().node;
211827
+ auto l = Lock();
211828
+ return GetLastSegment(l);
211829
+ }
211830
+
211831
+ SegmentBase *SegmentTree::GetSegment(SegmentLock &l, idx_t row_number) {
211832
+ return nodes[GetSegmentIndex(l, row_number)].node.get();
211666
211833
  }
211667
211834
 
211668
211835
  SegmentBase *SegmentTree::GetSegment(idx_t row_number) {
211669
- lock_guard<mutex> tree_lock(node_lock);
211670
- return nodes[GetSegmentIndex(row_number)].node;
211836
+ auto l = Lock();
211837
+ return GetSegment(l, row_number);
211671
211838
  }
211672
211839
 
211673
- idx_t SegmentTree::GetSegmentIndex(idx_t row_number) {
211840
+ bool SegmentTree::TryGetSegmentIndex(SegmentLock &, idx_t row_number, idx_t &result) {
211841
+ if (nodes.empty()) {
211842
+ return false;
211843
+ }
211674
211844
  D_ASSERT(!nodes.empty());
211675
211845
  D_ASSERT(row_number >= nodes[0].row_start);
211676
211846
  D_ASSERT(row_number < nodes.back().row_start + nodes.back().node->count);
@@ -211687,44 +211857,97 @@ idx_t SegmentTree::GetSegmentIndex(idx_t row_number) {
211687
211857
  } else if (row_number >= entry.row_start + entry.node->count) {
211688
211858
  lower = index + 1;
211689
211859
  } else {
211690
- return index;
211860
+ result = index;
211861
+ return true;
211691
211862
  }
211692
211863
  }
211693
- throw InternalException("Could not find node in column segment tree!");
211864
+ return false;
211694
211865
  }
211695
211866
 
211696
- bool SegmentTree::HasSegment(SegmentBase *segment) {
211697
- lock_guard<mutex> tree_lock(node_lock);
211867
+ idx_t SegmentTree::GetSegmentIndex(SegmentLock &l, idx_t row_number) {
211868
+ idx_t segment_index;
211869
+ if (TryGetSegmentIndex(l, row_number, segment_index)) {
211870
+ return segment_index;
211871
+ }
211872
+ string error;
211873
+ error = StringUtil::Format("Attempting to find row number \"%lld\" in %lld nodes\n", row_number, nodes.size());
211874
+ for (idx_t i = 0; i < nodes.size(); i++) {
211875
+ error +=
211876
+ StringUtil::Format("Node %lld: Start %lld, Count %lld", i, nodes[i].row_start, nodes[i].node->count.load());
211877
+ }
211878
+ throw InternalException("Could not find node in column segment tree!\n%s%s", error, Exception::GetStackTrace());
211879
+ }
211880
+
211881
+ idx_t SegmentTree::GetSegmentIndex(idx_t row_number) {
211882
+ auto l = Lock();
211883
+ return GetSegmentIndex(l, row_number);
211884
+ }
211885
+
211886
+ bool SegmentTree::HasSegment(SegmentLock &, SegmentBase *segment) {
211698
211887
  for (auto &node : nodes) {
211699
- if (node.node == segment) {
211888
+ if (node.node.get() == segment) {
211700
211889
  return true;
211701
211890
  }
211702
211891
  }
211703
211892
  return false;
211704
211893
  }
211705
211894
 
211706
- void SegmentTree::AppendSegment(unique_ptr<SegmentBase> segment) {
211895
+ bool SegmentTree::HasSegment(SegmentBase *segment) {
211896
+ auto l = Lock();
211897
+ return HasSegment(l, segment);
211898
+ }
211899
+
211900
+ void SegmentTree::AppendSegment(SegmentLock &, unique_ptr<SegmentBase> segment) {
211707
211901
  D_ASSERT(segment);
211708
211902
  // add the node to the list of nodes
211903
+ if (!nodes.empty()) {
211904
+ nodes.back().node->next = segment.get();
211905
+ }
211709
211906
  SegmentNode node;
211710
211907
  node.row_start = segment->start;
211711
- node.node = segment.get();
211712
- nodes.push_back(node);
211908
+ node.node = move(segment);
211909
+ nodes.push_back(move(node));
211910
+ }
211713
211911
 
211714
- if (nodes.size() > 1) {
211715
- // add the node as the next pointer of the last node
211716
- D_ASSERT(!nodes[nodes.size() - 2].node->next);
211717
- nodes[nodes.size() - 2].node->next = move(segment);
211718
- } else {
211719
- root_node = move(segment);
211912
+ void SegmentTree::AppendSegment(unique_ptr<SegmentBase> segment) {
211913
+ auto l = Lock();
211914
+ AppendSegment(l, move(segment));
211915
+ }
211916
+
211917
+ void SegmentTree::EraseSegments(SegmentLock &, idx_t segment_start) {
211918
+ if (segment_start >= nodes.size() - 1) {
211919
+ return;
211720
211920
  }
211921
+ nodes.erase(nodes.begin() + segment_start + 1, nodes.end());
211721
211922
  }
211722
211923
 
211723
- void SegmentTree::Replace(SegmentTree &other) {
211724
- root_node = move(other.root_node);
211924
+ void SegmentTree::Replace(SegmentLock &, SegmentTree &other) {
211725
211925
  nodes = move(other.nodes);
211726
211926
  }
211727
211927
 
211928
+ void SegmentTree::Replace(SegmentTree &other) {
211929
+ auto l = Lock();
211930
+ Replace(l, other);
211931
+ }
211932
+
211933
+ void SegmentTree::Verify(SegmentLock &) {
211934
+ #ifdef DEBUG
211935
+ idx_t base_start = nodes.empty() ? 0 : nodes[0].node->start;
211936
+ for (idx_t i = 0; i < nodes.size(); i++) {
211937
+ D_ASSERT(nodes[i].row_start == nodes[i].node->start);
211938
+ D_ASSERT(nodes[i].node->start == base_start);
211939
+ base_start += nodes[i].node->count;
211940
+ }
211941
+ #endif
211942
+ }
211943
+
211944
+ void SegmentTree::Verify() {
211945
+ #ifdef DEBUG
211946
+ auto l = Lock();
211947
+ Verify(l);
211948
+ #endif
211949
+ }
211950
+
211728
211951
  } // namespace duckdb
211729
211952
 
211730
211953