duckdb 0.8.2-dev1724.0 → 0.8.2-dev1791.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -100,6 +100,13 @@ var stmt = con.prepare('select ?::INTEGER as fortytwo', function(err, stmt) {
100
100
  });
101
101
  ```
102
102
 
103
+ ## Supported Node versions
104
+ We actively support only LTS and in-support Node versions. As of July 2023, these are Node 16, Node 18 and Node 20.
105
+ Release schedule for Node.js can be checked here: https://github.com/nodejs/release#release-schedule.
106
+
107
+ We currently also bundle and test DuckDB for Node 10, 12, 14, 17 and 19. We plan on doing so going forward as long as the tooling supports it.
108
+ As of July 2023, Node 15 has been removed from the supported versions.
109
+
103
110
  ## Development
104
111
 
105
112
  ### First install:
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.8.2-dev1724.0",
5
+ "version": "0.8.2-dev1791.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -93,6 +93,7 @@ public:
93
93
  shared_ptr<ParquetFileMetadataCache> metadata;
94
94
  ParquetOptions parquet_options;
95
95
  MultiFileReaderData reader_data;
96
+ unique_ptr<ColumnReader> root_reader;
96
97
 
97
98
  public:
98
99
  void InitializeScan(ParquetReaderScanState &state, vector<idx_t> groups_to_read);
@@ -116,6 +116,11 @@ struct ParquetWriteBindData : public TableFunctionData {
116
116
  vector<string> column_names;
117
117
  duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
118
118
  idx_t row_group_size = RowGroup::ROW_GROUP_SIZE;
119
+
120
+ //! If row_group_size_bytes is not set, we default to row_group_size * BYTES_PER_ROW
121
+ static constexpr const idx_t BYTES_PER_ROW = 1024;
122
+ idx_t row_group_size_bytes;
123
+
119
124
  ChildFieldIDs field_ids;
120
125
  };
121
126
 
@@ -741,33 +746,39 @@ static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
741
746
  unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info, vector<string> &names,
742
747
  vector<LogicalType> &sql_types) {
743
748
  D_ASSERT(names.size() == sql_types.size());
749
+ bool row_group_size_bytes_set = false;
744
750
  auto bind_data = make_uniq<ParquetWriteBindData>();
745
751
  for (auto &option : info.options) {
746
- auto loption = StringUtil::Lower(option.first);
752
+ const auto loption = StringUtil::Lower(option.first);
753
+ if (option.second.size() != 1) {
754
+ // All parquet write options require exactly one argument
755
+ throw BinderException("%s requires exactly one argument", StringUtil::Upper(loption));
756
+ }
747
757
  if (loption == "row_group_size" || loption == "chunk_size") {
748
758
  bind_data->row_group_size = option.second[0].GetValue<uint64_t>();
759
+ } else if (loption == "row_group_size_bytes") {
760
+ auto roption = option.second[0];
761
+ if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) {
762
+ bind_data->row_group_size_bytes = DBConfig::ParseMemoryLimit(roption.ToString());
763
+ } else {
764
+ bind_data->row_group_size_bytes = option.second[0].GetValue<uint64_t>();
765
+ }
766
+ row_group_size_bytes_set = true;
749
767
  } else if (loption == "compression" || loption == "codec") {
750
- if (!option.second.empty()) {
751
- auto roption = StringUtil::Lower(option.second[0].ToString());
752
- if (roption == "uncompressed") {
753
- bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED;
754
- continue;
755
- } else if (roption == "snappy") {
756
- bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
757
- continue;
758
- } else if (roption == "gzip") {
759
- bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP;
760
- continue;
761
- } else if (roption == "zstd") {
762
- bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
763
- continue;
764
- }
768
+ const auto roption = StringUtil::Lower(option.second[0].ToString());
769
+ if (roption == "uncompressed") {
770
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED;
771
+ } else if (roption == "snappy") {
772
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
773
+ } else if (roption == "gzip") {
774
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP;
775
+ } else if (roption == "zstd") {
776
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
777
+ } else {
778
+ throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]",
779
+ loption);
765
780
  }
766
- throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]", loption);
767
781
  } else if (loption == "field_ids") {
768
- if (option.second.size() != 1) {
769
- throw BinderException("FIELD_IDS requires exactly one argument");
770
- }
771
782
  if (option.second[0].type().id() == LogicalTypeId::VARCHAR &&
772
783
  StringUtil::Lower(StringValue::Get(option.second[0])) == "auto") {
773
784
  idx_t field_id = 0;
@@ -788,6 +799,9 @@ unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info
788
799
  throw NotImplementedException("Unrecognized option for PARQUET: %s", option.first.c_str());
789
800
  }
790
801
  }
802
+ if (!row_group_size_bytes_set) {
803
+ bind_data->row_group_size_bytes = bind_data->row_group_size * ParquetWriteBindData::BYTES_PER_ROW;
804
+ }
791
805
  bind_data->sql_types = sql_types;
792
806
  bind_data->column_names = names;
793
807
  return std::move(bind_data);
@@ -812,8 +826,10 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, Glob
812
826
 
813
827
  // append data to the local (buffered) chunk collection
814
828
  local_state.buffer.Append(local_state.append_state, input);
815
- if (local_state.buffer.Count() > bind_data.row_group_size) {
816
- // if the chunk collection exceeds a certain size we flush it to the parquet file
829
+
830
+ if (local_state.buffer.Count() > bind_data.row_group_size ||
831
+ local_state.buffer.SizeInBytes() > bind_data.row_group_size_bytes) {
832
+ // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
817
833
  local_state.append_state.current_chunk_state.handles.clear();
818
834
  global_state.writer->Flush(local_state.buffer);
819
835
  local_state.buffer.InitializeAppend(local_state.append_state);
@@ -399,8 +399,7 @@ void ParquetReader::InitializeSchema() {
399
399
  if (file_meta_data->schema.size() < 2) {
400
400
  throw FormatException("Need at least one non-root column in the file");
401
401
  }
402
- auto root_reader = CreateReader();
403
-
402
+ root_reader = CreateReader();
404
403
  auto &root_type = root_reader->Type();
405
404
  auto &child_types = StructType::GetChildTypes(root_type);
406
405
  D_ASSERT(root_type.id() == LogicalTypeId::STRUCT);
@@ -450,7 +449,6 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
450
449
  ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
451
450
  }
452
451
  }
453
-
454
452
  InitializeSchema();
455
453
  }
456
454
 
@@ -483,7 +481,6 @@ unique_ptr<BaseStatistics> ParquetReader::ReadStatistics(const string &name) {
483
481
 
484
482
  unique_ptr<BaseStatistics> column_stats;
485
483
  auto file_meta_data = GetFileMetadata();
486
- auto root_reader = CreateReader();
487
484
  auto column_reader = root_reader->Cast<StructColumnReader>().GetChildReader(file_col_idx);
488
485
 
489
486
  for (idx_t row_group_idx = 0; row_group_idx < file_meta_data->row_groups.size(); row_group_idx++) {
@@ -87,16 +87,22 @@ PartitionGlobalSinkState::PartitionGlobalSinkState(ClientContext &context,
87
87
  const vector<unique_ptr<BaseStatistics>> &partition_stats,
88
88
  idx_t estimated_cardinality)
89
89
  : context(context), buffer_manager(BufferManager::GetBufferManager(context)), allocator(Allocator::Get(context)),
90
- fixed_bits(0), payload_types(payload_types), memory_per_thread(0), count(0) {
90
+ fixed_bits(0), payload_types(payload_types), memory_per_thread(0), max_bits(1), count(0) {
91
91
 
92
92
  GenerateOrderings(partitions, orders, partition_bys, order_bys, partition_stats);
93
93
 
94
94
  memory_per_thread = PhysicalOperator::GetMaxThreadMemory(context);
95
95
  external = ClientConfig::GetConfig(context).force_external;
96
96
 
97
+ const auto thread_pages = PreviousPowerOfTwo(memory_per_thread / (4 * idx_t(Storage::BLOCK_ALLOC_SIZE)));
98
+ while (max_bits < 10 && (thread_pages >> max_bits) > 1) {
99
+ ++max_bits;
100
+ }
101
+
97
102
  if (!orders.empty()) {
98
- grouping_types = payload_types;
99
- grouping_types.push_back(LogicalType::HASH);
103
+ auto types = payload_types;
104
+ types.push_back(LogicalType::HASH);
105
+ grouping_types.Initialize(types);
100
106
 
101
107
  ResizeGroupingData(estimated_cardinality);
102
108
  }
@@ -108,10 +114,15 @@ void PartitionGlobalSinkState::SyncPartitioning(const PartitionGlobalSinkState &
108
114
  const auto old_bits = grouping_data ? grouping_data->GetRadixBits() : 0;
109
115
  if (fixed_bits != old_bits) {
110
116
  const auto hash_col_idx = payload_types.size();
111
- grouping_data = make_uniq<RadixPartitionedColumnData>(context, grouping_types, fixed_bits, hash_col_idx);
117
+ grouping_data = make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, fixed_bits, hash_col_idx);
112
118
  }
113
119
  }
114
120
 
121
+ unique_ptr<RadixPartitionedTupleData> PartitionGlobalSinkState::CreatePartition(idx_t new_bits) const {
122
+ const auto hash_col_idx = payload_types.size();
123
+ return make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, new_bits, hash_col_idx);
124
+ }
125
+
115
126
  void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
116
127
  // Have we started to combine? Then just live with it.
117
128
  if (fixed_bits || (grouping_data && !grouping_data->GetPartitions().empty())) {
@@ -121,47 +132,31 @@ void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
121
132
  const idx_t partition_size = STANDARD_ROW_GROUPS_SIZE;
122
133
  const auto bits = grouping_data ? grouping_data->GetRadixBits() : 0;
123
134
  auto new_bits = bits ? bits : 4;
124
- while (new_bits < 10 && (cardinality / RadixPartitioning::NumberOfPartitions(new_bits)) > partition_size) {
135
+ while (new_bits < max_bits && (cardinality / RadixPartitioning::NumberOfPartitions(new_bits)) > partition_size) {
125
136
  ++new_bits;
126
137
  }
127
138
 
128
139
  // Repartition the grouping data
129
140
  if (new_bits != bits) {
130
- const auto hash_col_idx = payload_types.size();
131
- grouping_data = make_uniq<RadixPartitionedColumnData>(context, grouping_types, new_bits, hash_col_idx);
141
+ grouping_data = CreatePartition(new_bits);
132
142
  }
133
143
  }
134
144
 
135
145
  void PartitionGlobalSinkState::SyncLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append) {
136
146
  // We are done if the local_partition is right sized.
137
- auto &local_radix = local_partition->Cast<RadixPartitionedColumnData>();
138
- if (local_radix.GetRadixBits() == grouping_data->GetRadixBits()) {
147
+ auto &local_radix = local_partition->Cast<RadixPartitionedTupleData>();
148
+ const auto new_bits = grouping_data->GetRadixBits();
149
+ if (local_radix.GetRadixBits() == new_bits) {
139
150
  return;
140
151
  }
141
152
 
142
153
  // If the local partition is now too small, flush it and reallocate
143
- auto new_partition = grouping_data->CreateShared();
144
- auto new_append = make_uniq<PartitionedColumnDataAppendState>();
145
- new_partition->InitializeAppendState(*new_append);
146
-
154
+ auto new_partition = CreatePartition(new_bits);
147
155
  local_partition->FlushAppendState(*local_append);
148
- auto &local_groups = local_partition->GetPartitions();
149
- for (auto &local_group : local_groups) {
150
- ColumnDataScanState scanner;
151
- local_group->InitializeScan(scanner);
152
-
153
- DataChunk scan_chunk;
154
- local_group->InitializeScanChunk(scan_chunk);
155
- for (scan_chunk.Reset(); local_group->Scan(scanner, scan_chunk); scan_chunk.Reset()) {
156
- new_partition->Append(*new_append, scan_chunk);
157
- }
158
- }
159
-
160
- // The append state has stale pointers to the old local partition, so nuke it from orbit.
161
- new_partition->FlushAppendState(*new_append);
156
+ local_partition->Repartition(*new_partition);
162
157
 
163
158
  local_partition = std::move(new_partition);
164
- local_append = make_uniq<PartitionedColumnDataAppendState>();
159
+ local_append = make_uniq<PartitionedTupleDataAppendState>();
165
160
  local_partition->InitializeAppendState(*local_append);
166
161
  }
167
162
 
@@ -170,8 +165,8 @@ void PartitionGlobalSinkState::UpdateLocalPartition(GroupingPartition &local_par
170
165
  lock_guard<mutex> guard(lock);
171
166
 
172
167
  if (!local_partition) {
173
- local_partition = grouping_data->CreateShared();
174
- local_append = make_uniq<PartitionedColumnDataAppendState>();
168
+ local_partition = CreatePartition(grouping_data->GetRadixBits());
169
+ local_append = make_uniq<PartitionedTupleDataAppendState>();
175
170
  local_partition->InitializeAppendState(*local_append);
176
171
  return;
177
172
  }
@@ -196,7 +191,7 @@ void PartitionGlobalSinkState::CombineLocalPartition(GroupingPartition &local_pa
196
191
  grouping_data->Combine(*local_partition);
197
192
  }
198
193
 
199
- void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data, GlobalSortState &global_sort) const {
194
+ void PartitionGlobalSinkState::BuildSortState(TupleDataCollection &group_data, GlobalSortState &global_sort) const {
200
195
  // Set up the sort expression computation.
201
196
  vector<LogicalType> sort_types;
202
197
  ExpressionExecutor executor(context);
@@ -221,16 +216,9 @@ void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data,
221
216
  for (column_t i = 0; i < payload_types.size(); ++i) {
222
217
  column_ids.emplace_back(i);
223
218
  }
224
- ColumnDataConsumer scanner(group_data, column_ids);
225
- ColumnDataConsumerScanState chunk_state;
226
- chunk_state.current_chunk_state.properties = ColumnDataScanProperties::ALLOW_ZERO_COPY;
227
- scanner.InitializeScan();
228
- for (auto chunk_idx = scanner.ChunkCount(); chunk_idx-- > 0;) {
229
- if (!scanner.AssignChunk(chunk_state)) {
230
- break;
231
- }
232
- scanner.ScanChunk(chunk_state, payload_chunk);
233
-
219
+ TupleDataScanState chunk_state;
220
+ group_data.InitializeScan(chunk_state, column_ids);
221
+ while (group_data.Scan(chunk_state, payload_chunk)) {
234
222
  sort_chunk.Reset();
235
223
  executor.Execute(payload_chunk, sort_chunk);
236
224
 
@@ -238,13 +226,12 @@ void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data,
238
226
  if (local_sort.SizeInBytes() > memory_per_thread) {
239
227
  local_sort.Sort(global_sort, true);
240
228
  }
241
- scanner.FinishChunk(chunk_state);
242
229
  }
243
230
 
244
231
  global_sort.AddLocalState(local_sort);
245
232
  }
246
233
 
247
- void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data, PartitionGlobalHashGroup &hash_group) {
234
+ void PartitionGlobalSinkState::BuildSortState(TupleDataCollection &group_data, PartitionGlobalHashGroup &hash_group) {
248
235
  BuildSortState(group_data, *hash_group.global_sort);
249
236
 
250
237
  hash_group.count += group_data.Count();
@@ -315,7 +315,7 @@ void LocalSortState::ReOrder(SortedData &sd, data_ptr_t sorting_ptr, RowDataColl
315
315
  sd.data_blocks.back()->block->SetSwizzling(nullptr);
316
316
  // Create a single heap block to store the ordered heap
317
317
  idx_t total_byte_offset =
318
- std::accumulate(heap.blocks.begin(), heap.blocks.end(), 0,
318
+ std::accumulate(heap.blocks.begin(), heap.blocks.end(), (idx_t)0,
319
319
  [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->byte_offset; });
320
320
  idx_t heap_block_size = MaxValue(total_byte_offset, (idx_t)Storage::BLOCK_SIZE);
321
321
  auto ordered_heap_block = make_uniq<RowDataBlock>(*buffer_manager, heap_block_size, 1);
@@ -85,7 +85,7 @@ SortedBlock::SortedBlock(BufferManager &buffer_manager, GlobalSortState &state)
85
85
  }
86
86
 
87
87
  idx_t SortedBlock::Count() const {
88
- idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), 0,
88
+ idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), (idx_t)0,
89
89
  [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->count; });
90
90
  if (!sort_layout.all_constant) {
91
91
  D_ASSERT(count == blob_sorting_data->Count());
@@ -100,6 +100,14 @@ Allocator &ColumnDataCollection::GetAllocator() const {
100
100
  return allocator->GetAllocator();
101
101
  }
102
102
 
103
+ idx_t ColumnDataCollection::SizeInBytes() const {
104
+ idx_t total_size = 0;
105
+ for (const auto &segment : segments) {
106
+ total_size += segment->SizeInBytes();
107
+ }
108
+ return total_size;
109
+ }
110
+
103
111
  //===--------------------------------------------------------------------===//
104
112
  // ColumnDataRow
105
113
  //===--------------------------------------------------------------------===//
@@ -243,6 +243,11 @@ idx_t ColumnDataCollectionSegment::ChunkCount() const {
243
243
  return chunk_data.size();
244
244
  }
245
245
 
246
+ idx_t ColumnDataCollectionSegment::SizeInBytes() const {
247
+ D_ASSERT(!allocator->IsShared());
248
+ return allocator->SizeInBytes() + heap->SizeInBytes();
249
+ }
250
+
246
251
  void ColumnDataCollectionSegment::FetchChunk(idx_t chunk_idx, DataChunk &result) {
247
252
  vector<column_t> column_ids;
248
253
  column_ids.reserve(types.size());
@@ -55,4 +55,8 @@ string_t StringHeap::EmptyString(idx_t len) {
55
55
  return string_t(insert_pos, len);
56
56
  }
57
57
 
58
+ idx_t StringHeap::SizeInBytes() const {
59
+ return allocator.SizeInBytes();
60
+ }
61
+
58
62
  } // namespace duckdb
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.8.2-dev1724"
2
+ #define DUCKDB_VERSION "0.8.2-dev1791"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "0e0fd210cd"
5
+ #define DUCKDB_SOURCE_ID "ecae3d0c87"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -42,8 +42,8 @@ public:
42
42
  using Orders = vector<BoundOrderByNode>;
43
43
  using Types = vector<LogicalType>;
44
44
 
45
- using GroupingPartition = unique_ptr<PartitionedColumnData>;
46
- using GroupingAppend = unique_ptr<PartitionedColumnDataAppendState>;
45
+ using GroupingPartition = unique_ptr<PartitionedTupleData>;
46
+ using GroupingAppend = unique_ptr<PartitionedTupleDataAppendState>;
47
47
 
48
48
  static void GenerateOrderings(Orders &partitions, Orders &orders,
49
49
  const vector<unique_ptr<Expression>> &partition_bys, const Orders &order_bys,
@@ -53,13 +53,14 @@ public:
53
53
  const vector<BoundOrderByNode> &order_bys, const Types &payload_types,
54
54
  const vector<unique_ptr<BaseStatistics>> &partitions_stats, idx_t estimated_cardinality);
55
55
 
56
+ unique_ptr<RadixPartitionedTupleData> CreatePartition(idx_t new_bits) const;
56
57
  void SyncPartitioning(const PartitionGlobalSinkState &other);
57
58
 
58
59
  void UpdateLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append);
59
60
  void CombineLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append);
60
61
 
61
- void BuildSortState(ColumnDataCollection &group_data, GlobalSortState &global_sort) const;
62
- void BuildSortState(ColumnDataCollection &group_data, PartitionGlobalHashGroup &global_sort);
62
+ void BuildSortState(TupleDataCollection &group_data, GlobalSortState &global_sort) const;
63
+ void BuildSortState(TupleDataCollection &group_data, PartitionGlobalHashGroup &global_sort);
63
64
 
64
65
  ClientContext &context;
65
66
  BufferManager &buffer_manager;
@@ -67,9 +68,9 @@ public:
67
68
  mutex lock;
68
69
 
69
70
  // OVER(PARTITION BY...) (hash grouping)
70
- unique_ptr<RadixPartitionedColumnData> grouping_data;
71
+ unique_ptr<RadixPartitionedTupleData> grouping_data;
71
72
  //! Payload plus hash column
72
- Types grouping_types;
73
+ TupleDataLayout grouping_types;
73
74
  //! The number of radix bits if this partition is being synced with another
74
75
  idx_t fixed_bits;
75
76
 
@@ -88,6 +89,7 @@ public:
88
89
 
89
90
  // Threading
90
91
  idx_t memory_per_thread;
92
+ idx_t max_bits;
91
93
  atomic<idx_t> count;
92
94
 
93
95
  private:
@@ -107,8 +109,8 @@ public:
107
109
  ExpressionExecutor executor;
108
110
  DataChunk group_chunk;
109
111
  DataChunk payload_chunk;
110
- unique_ptr<PartitionedColumnData> local_partition;
111
- unique_ptr<PartitionedColumnDataAppendState> local_append;
112
+ unique_ptr<PartitionedTupleData> local_partition;
113
+ unique_ptr<PartitionedTupleDataAppendState> local_append;
112
114
 
113
115
  // OVER(...) (sorting)
114
116
  size_t sort_cols;
@@ -132,7 +134,7 @@ class PartitionLocalMergeState;
132
134
 
133
135
  class PartitionGlobalMergeState {
134
136
  public:
135
- using GroupDataPtr = unique_ptr<ColumnDataCollection>;
137
+ using GroupDataPtr = unique_ptr<TupleDataCollection>;
136
138
 
137
139
  PartitionGlobalMergeState(PartitionGlobalSinkState &sink, GroupDataPtr group_data, hash_t hash_bin);
138
140
 
@@ -43,9 +43,19 @@ public:
43
43
  void MakeShared() {
44
44
  shared = true;
45
45
  }
46
+ bool IsShared() const {
47
+ return shared;
48
+ }
46
49
  idx_t BlockCount() const {
47
50
  return blocks.size();
48
51
  }
52
+ idx_t SizeInBytes() const {
53
+ idx_t total_size = 0;
54
+ for (const auto &block : blocks) {
55
+ total_size += block.size;
56
+ }
57
+ return total_size;
58
+ }
49
59
 
50
60
  public:
51
61
  void AllocateData(idx_t size, uint32_t &block_id, uint32_t &offset, ChunkManagementState *chunk_state);
@@ -61,6 +61,9 @@ public:
61
61
  return types.size();
62
62
  }
63
63
 
64
+ //! The size (in bytes) of this ColumnDataCollection
65
+ idx_t SizeInBytes() const;
66
+
64
67
  //! Get the allocator
65
68
  DUCKDB_API Allocator &GetAllocator() const;
66
69
 
@@ -126,6 +126,8 @@ public:
126
126
  }
127
127
 
128
128
  idx_t ChunkCount() const;
129
+ idx_t SizeInBytes() const;
130
+
129
131
  void FetchChunk(idx_t chunk_idx, DataChunk &result);
130
132
  void FetchChunk(idx_t chunk_idx, DataChunk &result, const vector<column_t> &column_ids);
131
133
 
@@ -123,7 +123,11 @@ protected:
123
123
  void BuildBufferSpace(PartitionedTupleDataAppendState &state);
124
124
  //! Create a collection for a specific partition
125
125
  unique_ptr<TupleDataCollection> CreatePartitionCollection(idx_t partition_index) const {
126
- return make_uniq<TupleDataCollection>(allocators->allocators[partition_index]);
126
+ if (allocators) {
127
+ return make_uniq<TupleDataCollection>(allocators->allocators[partition_index]);
128
+ } else {
129
+ return make_uniq<TupleDataCollection>(buffer_manager, layout);
130
+ }
127
131
  }
128
132
 
129
133
  protected:
@@ -38,6 +38,9 @@ public:
38
38
  //! Allocates space for an empty string of size "len" on the heap
39
39
  DUCKDB_API string_t EmptyString(idx_t len);
40
40
 
41
+ //! Size of strings
42
+ DUCKDB_API idx_t SizeInBytes() const;
43
+
41
44
  private:
42
45
  ArenaAllocator allocator;
43
46
  };
@@ -46,6 +46,7 @@ public:
46
46
  DUCKDB_API ArenaChunk *GetTail();
47
47
 
48
48
  DUCKDB_API bool IsEmpty() const;
49
+ DUCKDB_API idx_t SizeInBytes() const;
49
50
 
50
51
  //! Returns an "Allocator" wrapper for this arena allocator
51
52
  Allocator &GetAllocator() {
@@ -81,9 +81,7 @@ unique_ptr<LogicalOperator> Optimizer::Optimize(unique_ptr<LogicalOperator> plan
81
81
 
82
82
  switch (plan_p->type) {
83
83
  case LogicalOperatorType::LOGICAL_TRANSACTION:
84
- case LogicalOperatorType::LOGICAL_SET:
85
- case LogicalOperatorType::LOGICAL_PRAGMA:
86
- return plan_p;
84
+ return plan_p; // skip optimizing simple & often-occurring plans unaffected by rewrites
87
85
  default:
88
86
  break;
89
87
  }
@@ -193,48 +193,53 @@ void Parser::ParseQuery(const string &query) {
193
193
  auto query_statements = SplitQueryStringIntoStatements(query);
194
194
  auto stmt_loc = 0;
195
195
  for (auto const &query_statement : query_statements) {
196
- PostgresParser another_parser;
197
- another_parser.Parse(query_statement);
198
- // LCOV_EXCL_START
199
- // first see if DuckDB can parse this individual query statement
200
- if (another_parser.success) {
201
- if (!another_parser.parse_tree) {
202
- // empty statement
203
- continue;
204
- }
205
- transformer.TransformParseTree(another_parser.parse_tree, statements);
206
- // important to set in the case of a mixture of DDB and parser ext statements
207
- statements.back()->stmt_length = query_statement.size() - 1;
208
- statements.back()->stmt_location = stmt_loc;
209
- stmt_loc += query_statement.size();
210
- } else {
211
- // let extensions parse the statement which DuckDB failed to parse
212
- bool parsed_single_statement = false;
213
- for (auto &ext : *options.extensions) {
214
- D_ASSERT(!parsed_single_statement);
215
- D_ASSERT(ext.parse_function);
216
- auto result = ext.parse_function(ext.parser_info.get(), query_statement);
217
- if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
218
- auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
219
- statement->stmt_length = query_statement.size() - 1;
220
- statement->stmt_location = stmt_loc;
221
- stmt_loc += query_statement.size();
222
- statements.push_back(std::move(statement));
223
- parsed_single_statement = true;
224
- break;
225
- } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
226
- throw ParserException(result.error);
227
- } else {
228
- // We move to the next one!
196
+ string another_parser_error;
197
+ // Creating a new scope to allow extensions to use PostgresParser, which is not reentrant
198
+ {
199
+ PostgresParser another_parser;
200
+ another_parser.Parse(query_statement);
201
+ // LCOV_EXCL_START
202
+ // first see if DuckDB can parse this individual query statement
203
+ if (another_parser.success) {
204
+ if (!another_parser.parse_tree) {
205
+ // empty statement
206
+ continue;
229
207
  }
208
+ transformer.TransformParseTree(another_parser.parse_tree, statements);
209
+ // important to set in the case of a mixture of DDB and parser ext statements
210
+ statements.back()->stmt_length = query_statement.size() - 1;
211
+ statements.back()->stmt_location = stmt_loc;
212
+ stmt_loc += query_statement.size();
213
+ continue;
214
+ } else {
215
+ another_parser_error = QueryErrorContext::Format(query, another_parser.error_message,
216
+ another_parser.error_location - 1);
230
217
  }
231
- if (!parsed_single_statement) {
232
- parser_error = QueryErrorContext::Format(query, another_parser.error_message,
233
- another_parser.error_location - 1);
234
- throw ParserException(parser_error);
218
+ } // LCOV_EXCL_STOP
219
+ // LCOV_EXCL_START
220
+ // let extensions parse the statement which DuckDB failed to parse
221
+ bool parsed_single_statement = false;
222
+ for (auto &ext : *options.extensions) {
223
+ D_ASSERT(!parsed_single_statement);
224
+ D_ASSERT(ext.parse_function);
225
+ auto result = ext.parse_function(ext.parser_info.get(), query_statement);
226
+ if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
227
+ auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
228
+ statement->stmt_length = query_statement.size() - 1;
229
+ statement->stmt_location = stmt_loc;
230
+ stmt_loc += query_statement.size();
231
+ statements.push_back(std::move(statement));
232
+ parsed_single_statement = true;
233
+ break;
234
+ } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
235
+ throw ParserException(result.error);
236
+ } else {
237
+ // We move to the next one!
235
238
  }
236
239
  }
237
- // LCOV_EXCL_STOP
240
+ if (!parsed_single_statement) {
241
+ throw ParserException(parser_error);
242
+ } // LCOV_EXCL_STOP
238
243
  }
239
244
  }
240
245
  }
@@ -151,4 +151,16 @@ bool ArenaAllocator::IsEmpty() const {
151
151
  return head == nullptr;
152
152
  }
153
153
 
154
+ idx_t ArenaAllocator::SizeInBytes() const {
155
+ idx_t total_size = 0;
156
+ if (!IsEmpty()) {
157
+ auto current = head.get();
158
+ while (current != nullptr) {
159
+ total_size += current->current_position;
160
+ current = current->next.get();
161
+ }
162
+ }
163
+ return total_size;
164
+ }
165
+
154
166
  } // namespace duckdb