duckdb 0.7.2-dev1034.0 → 0.7.2-dev1138.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
  3. package/src/duckdb/src/common/hive_partitioning.cpp +3 -1
  4. package/src/duckdb/src/common/progress_bar/progress_bar.cpp +7 -0
  5. package/src/duckdb/src/common/serializer/enum_serializer.cpp +6 -6
  6. package/src/duckdb/src/common/sort/comparators.cpp +14 -5
  7. package/src/duckdb/src/common/types/interval.cpp +0 -41
  8. package/src/duckdb/src/common/types/list_segment.cpp +658 -0
  9. package/src/duckdb/src/common/types/string_heap.cpp +1 -1
  10. package/src/duckdb/src/common/types/string_type.cpp +1 -1
  11. package/src/duckdb/src/common/types/vector.cpp +1 -1
  12. package/src/duckdb/src/common/value_operations/comparison_operations.cpp +14 -22
  13. package/src/duckdb/src/common/vector_operations/comparison_operators.cpp +10 -10
  14. package/src/duckdb/src/common/vector_operations/is_distinct_from.cpp +11 -10
  15. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +2 -2
  16. package/src/duckdb/src/execution/index/art/art.cpp +13 -0
  17. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +1 -1
  18. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +2 -0
  19. package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +1 -0
  20. package/src/duckdb/src/execution/operator/join/physical_join.cpp +0 -3
  21. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +5 -1
  22. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +18 -5
  23. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +3 -0
  24. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +2 -1
  25. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +1 -3
  26. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -0
  27. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +0 -4
  28. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +1 -0
  29. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +1 -1
  30. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +2 -1
  31. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -0
  32. package/src/duckdb/src/function/aggregate/nested/list.cpp +6 -712
  33. package/src/duckdb/src/function/scalar/list/list_sort.cpp +25 -18
  34. package/src/duckdb/src/function/table/read_csv.cpp +5 -0
  35. package/src/duckdb/src/function/table/table_scan.cpp +8 -11
  36. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  37. package/src/duckdb/src/include/duckdb/common/helper.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +45 -149
  39. package/src/duckdb/src/include/duckdb/common/progress_bar/progress_bar.hpp +2 -0
  40. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +39 -3
  41. package/src/duckdb/src/include/duckdb/common/types/list_segment.hpp +70 -0
  42. package/src/duckdb/src/include/duckdb/common/types/string_type.hpp +73 -3
  43. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +1 -12
  44. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +4 -0
  45. package/src/duckdb/src/include/duckdb/main/client_config.hpp +2 -0
  46. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_scan.hpp +1 -0
  47. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +1 -0
  48. package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +0 -2
  49. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +1 -0
  50. package/src/duckdb/src/include/duckdb/storage/index.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +18 -7
  53. package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +0 -3
  54. package/src/duckdb/src/include/duckdb/storage/table/column_segment_tree.hpp +18 -0
  55. package/src/duckdb/src/include/duckdb/storage/table/persistent_table_data.hpp +0 -1
  56. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +35 -43
  57. package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +18 -5
  58. package/src/duckdb/src/include/duckdb/storage/table/row_group_segment_tree.hpp +2 -4
  59. package/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp +12 -29
  60. package/src/duckdb/src/include/duckdb/storage/table/segment_base.hpp +2 -3
  61. package/src/duckdb/src/include/duckdb/storage/table/segment_tree.hpp +11 -1
  62. package/src/duckdb/src/include/duckdb/storage/table/standard_column_data.hpp +0 -4
  63. package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +4 -1
  64. package/src/duckdb/src/include/duckdb.h +21 -0
  65. package/src/duckdb/src/main/capi/table_function-c.cpp +23 -0
  66. package/src/duckdb/src/main/settings/settings.cpp +20 -8
  67. package/src/duckdb/src/optimizer/filter_combiner.cpp +2 -5
  68. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +2 -0
  69. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +1 -0
  70. package/src/duckdb/src/parallel/meta_pipeline.cpp +0 -3
  71. package/src/duckdb/src/parser/transform/expression/transform_function.cpp +22 -0
  72. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
  73. package/src/duckdb/src/storage/compression/bitpacking.cpp +1 -1
  74. package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +2 -1
  75. package/src/duckdb/src/storage/compression/numeric_constant.cpp +1 -1
  76. package/src/duckdb/src/storage/compression/rle.cpp +1 -0
  77. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +1 -1
  78. package/src/duckdb/src/storage/data_table.cpp +3 -3
  79. package/src/duckdb/src/storage/local_storage.cpp +7 -0
  80. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  81. package/src/duckdb/src/storage/table/column_data.cpp +75 -18
  82. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -1
  83. package/src/duckdb/src/storage/table/column_segment.cpp +17 -31
  84. package/src/duckdb/src/storage/table/list_column_data.cpp +9 -12
  85. package/src/duckdb/src/storage/table/row_group.cpp +200 -136
  86. package/src/duckdb/src/storage/table/row_group_collection.cpp +75 -45
  87. package/src/duckdb/src/storage/table/scan_state.cpp +31 -38
  88. package/src/duckdb/src/storage/table/standard_column_data.cpp +4 -6
  89. package/src/duckdb/src/storage/table/struct_column_data.cpp +11 -18
  90. package/src/duckdb/src/storage/table/update_segment.cpp +3 -0
  91. package/src/duckdb/ub_src_common_types.cpp +2 -0
@@ -15,49 +15,40 @@
15
15
  #include "duckdb/main/database.hpp"
16
16
  #include "duckdb/main/attached_database.hpp"
17
17
  #include "duckdb/transaction/duck_transaction.hpp"
18
+ #include "duckdb/storage/table/append_state.hpp"
19
+ #include "duckdb/storage/table/scan_state.hpp"
18
20
 
19
21
  namespace duckdb {
20
22
 
21
23
  constexpr const idx_t RowGroup::ROW_GROUP_VECTOR_COUNT;
22
24
  constexpr const idx_t RowGroup::ROW_GROUP_SIZE;
23
25
 
24
- RowGroup::RowGroup(AttachedDatabase &db, BlockManager &block_manager, DataTableInfo &table_info, idx_t start,
25
- idx_t count)
26
- : SegmentBase<RowGroup>(start, count), db(db), block_manager(block_manager), table_info(table_info) {
27
-
26
+ RowGroup::RowGroup(RowGroupCollection &collection, idx_t start, idx_t count)
27
+ : SegmentBase<RowGroup>(start, count), collection(collection) {
28
28
  Verify();
29
29
  }
30
30
 
31
- RowGroup::RowGroup(AttachedDatabase &db, BlockManager &block_manager, DataTableInfo &table_info,
32
- const vector<LogicalType> &types, RowGroupPointer &&pointer)
33
- : SegmentBase<RowGroup>(pointer.row_start, pointer.tuple_count), db(db), block_manager(block_manager),
34
- table_info(table_info) {
31
+ RowGroup::RowGroup(RowGroupCollection &collection, RowGroupPointer &&pointer)
32
+ : SegmentBase<RowGroup>(pointer.row_start, pointer.tuple_count), collection(collection) {
35
33
  // deserialize the columns
36
- if (pointer.data_pointers.size() != types.size()) {
34
+ if (pointer.data_pointers.size() != collection.GetTypes().size()) {
37
35
  throw IOException("Row group column count is unaligned with table column count. Corrupt file?");
38
36
  }
39
- for (idx_t i = 0; i < pointer.data_pointers.size(); i++) {
40
- auto &block_pointer = pointer.data_pointers[i];
41
- MetaBlockReader column_data_reader(block_manager, block_pointer.block_id);
42
- column_data_reader.offset = block_pointer.offset;
43
- this->columns.push_back(
44
- ColumnData::Deserialize(block_manager, table_info, i, start, column_data_reader, types[i], nullptr));
45
- }
46
-
47
- // set up the statistics
48
- for (auto &stats : pointer.statistics) {
49
- this->stats.emplace_back(std::move(stats));
37
+ this->column_pointers = std::move(pointer.data_pointers);
38
+ this->columns.resize(column_pointers.size());
39
+ this->is_loaded = unique_ptr<atomic<bool>[]>(new atomic<bool>[columns.size()]);
40
+ for (idx_t c = 0; c < columns.size(); c++) {
41
+ this->is_loaded[c] = false;
50
42
  }
51
43
  this->version_info = std::move(pointer.versions);
52
44
 
53
45
  Verify();
54
46
  }
55
47
 
56
- RowGroup::RowGroup(RowGroup &row_group, idx_t start)
57
- : SegmentBase<RowGroup>(start, row_group.count.load()), db(row_group.db), block_manager(row_group.block_manager),
58
- table_info(row_group.table_info), version_info(std::move(row_group.version_info)),
59
- stats(std::move(row_group.stats)) {
60
- for (auto &column : row_group.columns) {
48
+ RowGroup::RowGroup(RowGroup &row_group, RowGroupCollection &collection, idx_t start)
49
+ : SegmentBase<RowGroup>(start, row_group.count.load()), collection(collection),
50
+ version_info(std::move(row_group.version_info)) {
51
+ for (auto &column : row_group.GetColumns()) {
61
52
  this->columns.push_back(ColumnData::CreateColumn(*column, start));
62
53
  }
63
54
  if (version_info) {
@@ -79,23 +70,104 @@ void VersionNode::SetStart(idx_t start) {
79
70
  RowGroup::~RowGroup() {
80
71
  }
81
72
 
73
+ vector<shared_ptr<ColumnData>> &RowGroup::GetColumns() {
74
+ // ensure all columns are loaded
75
+ for (idx_t c = 0; c < GetColumnCount(); c++) {
76
+ GetColumn(c);
77
+ }
78
+ return columns;
79
+ }
80
+
81
+ idx_t RowGroup::GetColumnCount() const {
82
+ return columns.size();
83
+ }
84
+
85
+ ColumnData &RowGroup::GetColumn(idx_t c) {
86
+ D_ASSERT(c < columns.size());
87
+ if (!is_loaded) {
88
+ // not being lazy loaded
89
+ D_ASSERT(columns[c]);
90
+ return *columns[c];
91
+ }
92
+ if (is_loaded[c]) {
93
+ D_ASSERT(columns[c]);
94
+ return *columns[c];
95
+ }
96
+ lock_guard<mutex> l(row_group_lock);
97
+ if (columns[c]) {
98
+ D_ASSERT(is_loaded[c]);
99
+ return *columns[c];
100
+ }
101
+ if (column_pointers.size() != columns.size()) {
102
+ throw InternalException("Lazy loading a column but the pointer was not set");
103
+ }
104
+ auto &block_manager = collection.GetBlockManager();
105
+ auto &types = collection.GetTypes();
106
+ auto &block_pointer = column_pointers[c];
107
+ MetaBlockReader column_data_reader(block_manager, block_pointer.block_id);
108
+ column_data_reader.offset = block_pointer.offset;
109
+ this->columns[c] =
110
+ ColumnData::Deserialize(GetBlockManager(), GetTableInfo(), c, start, column_data_reader, types[c], nullptr);
111
+ is_loaded[c] = true;
112
+ return *columns[c];
113
+ }
114
+
82
115
  DatabaseInstance &RowGroup::GetDatabase() {
83
- return db.GetDatabase();
116
+ return collection.GetDatabase();
117
+ }
118
+
119
+ BlockManager &RowGroup::GetBlockManager() {
120
+ return collection.GetBlockManager();
121
+ }
122
+ DataTableInfo &RowGroup::GetTableInfo() {
123
+ return collection.GetTableInfo();
84
124
  }
85
125
 
86
126
  void RowGroup::InitializeEmpty(const vector<LogicalType> &types) {
87
127
  // set up the segment trees for the column segments
128
+ D_ASSERT(columns.empty());
88
129
  for (idx_t i = 0; i < types.size(); i++) {
89
- auto column_data = ColumnData::CreateColumn(block_manager, GetTableInfo(), i, start, types[i]);
90
- stats.emplace_back(types[i]);
130
+ auto column_data = ColumnData::CreateColumn(GetBlockManager(), GetTableInfo(), i, start, types[i]);
91
131
  columns.push_back(std::move(column_data));
92
132
  }
93
133
  }
94
134
 
95
- bool RowGroup::InitializeScanWithOffset(RowGroupScanState &state, idx_t vector_offset) {
135
+ void ColumnScanState::Initialize(const LogicalType &type) {
136
+ if (type.id() == LogicalTypeId::VALIDITY) {
137
+ // validity - nothing to initialize
138
+ return;
139
+ }
140
+ if (type.InternalType() == PhysicalType::STRUCT) {
141
+ // validity + struct children
142
+ auto &struct_children = StructType::GetChildTypes(type);
143
+ child_states.resize(struct_children.size() + 1);
144
+ for (idx_t i = 0; i < struct_children.size(); i++) {
145
+ child_states[i + 1].Initialize(struct_children[i].second);
146
+ }
147
+ } else if (type.InternalType() == PhysicalType::LIST) {
148
+ // validity + list child
149
+ child_states.resize(2);
150
+ child_states[1].Initialize(ListType::GetChildType(type));
151
+ } else {
152
+ // validity
153
+ child_states.resize(1);
154
+ }
155
+ }
156
+
157
+ void CollectionScanState::Initialize(const vector<LogicalType> &types) {
158
+ auto &column_ids = GetColumnIds();
159
+ column_scans = unique_ptr<ColumnScanState[]>(new ColumnScanState[column_ids.size()]);
160
+ for (idx_t i = 0; i < column_ids.size(); i++) {
161
+ if (column_ids[i] == COLUMN_IDENTIFIER_ROW_ID) {
162
+ continue;
163
+ }
164
+ column_scans[i].Initialize(types[column_ids[i]]);
165
+ }
166
+ }
167
+
168
+ bool RowGroup::InitializeScanWithOffset(CollectionScanState &state, idx_t vector_offset) {
96
169
  auto &column_ids = state.GetColumnIds();
97
170
  auto filters = state.GetFilters();
98
- auto parent_max_row = state.GetParentMaxRow();
99
171
  if (filters) {
100
172
  if (!CheckZonemap(*filters, column_ids)) {
101
173
  return false;
@@ -104,13 +176,14 @@ bool RowGroup::InitializeScanWithOffset(RowGroupScanState &state, idx_t vector_o
104
176
 
105
177
  state.row_group = this;
106
178
  state.vector_index = vector_offset;
107
- state.max_row = this->start > parent_max_row ? 0 : MinValue<idx_t>(this->count, parent_max_row - this->start);
108
- state.column_scans = unique_ptr<ColumnScanState[]>(new ColumnScanState[column_ids.size()]);
179
+ state.max_row_group_row =
180
+ this->start > state.max_row ? 0 : MinValue<idx_t>(this->count, state.max_row - this->start);
181
+ D_ASSERT(state.column_scans);
109
182
  for (idx_t i = 0; i < column_ids.size(); i++) {
110
183
  auto column = column_ids[i];
111
184
  if (column != COLUMN_IDENTIFIER_ROW_ID) {
112
- columns[column]->InitializeScanWithOffset(state.column_scans[i],
113
- start + vector_offset * STANDARD_VECTOR_SIZE);
185
+ auto &column_data = GetColumn(column);
186
+ column_data.InitializeScanWithOffset(state.column_scans[i], start + vector_offset * STANDARD_VECTOR_SIZE);
114
187
  } else {
115
188
  state.column_scans[i].current = nullptr;
116
189
  }
@@ -118,10 +191,9 @@ bool RowGroup::InitializeScanWithOffset(RowGroupScanState &state, idx_t vector_o
118
191
  return true;
119
192
  }
120
193
 
121
- bool RowGroup::InitializeScan(RowGroupScanState &state) {
194
+ bool RowGroup::InitializeScan(CollectionScanState &state) {
122
195
  auto &column_ids = state.GetColumnIds();
123
196
  auto filters = state.GetFilters();
124
- auto parent_max_row = state.GetParentMaxRow();
125
197
  if (filters) {
126
198
  if (!CheckZonemap(*filters, column_ids)) {
127
199
  return false;
@@ -129,12 +201,17 @@ bool RowGroup::InitializeScan(RowGroupScanState &state) {
129
201
  }
130
202
  state.row_group = this;
131
203
  state.vector_index = 0;
132
- state.max_row = this->start > parent_max_row ? 0 : MinValue<idx_t>(this->count, parent_max_row - this->start);
133
- state.column_scans = unique_ptr<ColumnScanState[]>(new ColumnScanState[column_ids.size()]);
204
+ state.max_row_group_row =
205
+ this->start > state.max_row ? 0 : MinValue<idx_t>(this->count, state.max_row - this->start);
206
+ if (state.max_row_group_row == 0) {
207
+ return false;
208
+ }
209
+ D_ASSERT(state.column_scans);
134
210
  for (idx_t i = 0; i < column_ids.size(); i++) {
135
211
  auto column = column_ids[i];
136
212
  if (column != COLUMN_IDENTIFIER_ROW_ID) {
137
- columns[column]->InitializeScan(state.column_scans[i]);
213
+ auto &column_data = GetColumn(column);
214
+ column_data.InitializeScan(state.column_scans[i]);
138
215
  } else {
139
216
  state.column_scans[i].current = nullptr;
140
217
  }
@@ -142,18 +219,19 @@ bool RowGroup::InitializeScan(RowGroupScanState &state) {
142
219
  return true;
143
220
  }
144
221
 
145
- unique_ptr<RowGroup> RowGroup::AlterType(const LogicalType &target_type, idx_t changed_idx,
146
- ExpressionExecutor &executor, RowGroupScanState &scan_state,
147
- DataChunk &scan_chunk) {
222
+ unique_ptr<RowGroup> RowGroup::AlterType(RowGroupCollection &new_collection, const LogicalType &target_type,
223
+ idx_t changed_idx, ExpressionExecutor &executor,
224
+ CollectionScanState &scan_state, DataChunk &scan_chunk) {
148
225
  Verify();
149
226
 
150
227
  // construct a new column data for this type
151
- auto column_data = ColumnData::CreateColumn(block_manager, GetTableInfo(), changed_idx, start, target_type);
228
+ auto column_data = ColumnData::CreateColumn(GetBlockManager(), GetTableInfo(), changed_idx, start, target_type);
152
229
 
153
230
  ColumnAppendState append_state;
154
231
  column_data->InitializeAppend(append_state);
155
232
 
156
233
  // scan the original table, and fill the new column with the transformed value
234
+ scan_state.Initialize(collection.GetTypes());
157
235
  InitializeScan(scan_state);
158
236
 
159
237
  DataChunk append_chunk;
@@ -161,7 +239,6 @@ unique_ptr<RowGroup> RowGroup::AlterType(const LogicalType &target_type, idx_t c
161
239
  append_types.push_back(target_type);
162
240
  append_chunk.Initialize(Allocator::DefaultAllocator(), append_types);
163
241
  auto &append_vector = append_chunk.data[0];
164
- SegmentStatistics altered_col_stats(target_type);
165
242
  while (true) {
166
243
  // scan the table
167
244
  scan_chunk.Reset();
@@ -172,35 +249,33 @@ unique_ptr<RowGroup> RowGroup::AlterType(const LogicalType &target_type, idx_t c
172
249
  // execute the expression
173
250
  append_chunk.Reset();
174
251
  executor.ExecuteExpression(scan_chunk, append_vector);
175
- column_data->Append(altered_col_stats.statistics, append_state, append_vector, scan_chunk.size());
252
+ column_data->Append(append_state, append_vector, scan_chunk.size());
176
253
  }
177
254
 
178
255
  // set up the row_group based on this row_group
179
- auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
256
+ auto row_group = make_unique<RowGroup>(new_collection, this->start, this->count);
180
257
  row_group->version_info = version_info;
181
- for (idx_t i = 0; i < columns.size(); i++) {
258
+ auto &cols = GetColumns();
259
+ for (idx_t i = 0; i < cols.size(); i++) {
182
260
  if (i == changed_idx) {
183
261
  // this is the altered column: use the new column
184
262
  row_group->columns.push_back(std::move(column_data));
185
- row_group->stats.push_back(std::move(altered_col_stats)); // NOLINT: false positive
186
263
  } else {
187
264
  // this column was not altered: use the data directly
188
- row_group->columns.push_back(columns[i]);
189
- row_group->stats.emplace_back(stats[i].statistics.Copy());
265
+ row_group->columns.push_back(cols[i]);
190
266
  }
191
267
  }
192
268
  row_group->Verify();
193
269
  return row_group;
194
270
  }
195
271
 
196
- unique_ptr<RowGroup> RowGroup::AddColumn(ColumnDefinition &new_column, ExpressionExecutor &executor,
197
- Expression *default_value, Vector &result) {
272
+ unique_ptr<RowGroup> RowGroup::AddColumn(RowGroupCollection &new_collection, ColumnDefinition &new_column,
273
+ ExpressionExecutor &executor, Expression *default_value, Vector &result) {
198
274
  Verify();
199
275
 
200
276
  // construct a new column data for the new column
201
277
  auto added_column =
202
- ColumnData::CreateColumn(block_manager, GetTableInfo(), columns.size(), start, new_column.Type());
203
- SegmentStatistics added_col_stats(new_column.Type());
278
+ ColumnData::CreateColumn(GetBlockManager(), GetTableInfo(), GetColumnCount(), start, new_column.Type());
204
279
 
205
280
  idx_t rows_to_write = this->count;
206
281
  if (rows_to_write > 0) {
@@ -214,56 +289,51 @@ unique_ptr<RowGroup> RowGroup::AddColumn(ColumnDefinition &new_column, Expressio
214
289
  dummy_chunk.SetCardinality(rows_in_this_vector);
215
290
  executor.ExecuteExpression(dummy_chunk, result);
216
291
  }
217
- added_column->Append(added_col_stats.statistics, state, result, rows_in_this_vector);
292
+ added_column->Append(state, result, rows_in_this_vector);
218
293
  }
219
294
  }
220
295
 
221
296
  // set up the row_group based on this row_group
222
- auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
297
+ auto row_group = make_unique<RowGroup>(new_collection, this->start, this->count);
223
298
  row_group->version_info = version_info;
224
- row_group->columns = columns;
225
- for (auto &stat : stats) {
226
- row_group->stats.emplace_back(stat.statistics.Copy());
227
- }
299
+ row_group->columns = GetColumns();
228
300
  // now add the new column
229
301
  row_group->columns.push_back(std::move(added_column));
230
- row_group->stats.push_back(std::move(added_col_stats));
231
302
 
232
303
  row_group->Verify();
233
304
  return row_group;
234
305
  }
235
306
 
236
- unique_ptr<RowGroup> RowGroup::RemoveColumn(idx_t removed_column) {
307
+ unique_ptr<RowGroup> RowGroup::RemoveColumn(RowGroupCollection &new_collection, idx_t removed_column) {
237
308
  Verify();
238
309
 
239
310
  D_ASSERT(removed_column < columns.size());
240
311
 
241
- auto row_group = make_unique<RowGroup>(db, block_manager, table_info, this->start, this->count);
312
+ auto row_group = make_unique<RowGroup>(new_collection, this->start, this->count);
242
313
  row_group->version_info = version_info;
243
- row_group->columns = columns;
244
- for (auto &stat : stats) {
245
- row_group->stats.emplace_back(stat.statistics.Copy());
314
+ // copy over all columns except for the removed one
315
+ auto &cols = GetColumns();
316
+ for (idx_t i = 0; i < cols.size(); i++) {
317
+ if (i != removed_column) {
318
+ row_group->columns.push_back(cols[i]);
319
+ }
246
320
  }
247
- // now remove the column
248
- row_group->columns.erase(row_group->columns.begin() + removed_column);
249
- row_group->stats.erase(row_group->stats.begin() + removed_column);
250
321
 
251
322
  row_group->Verify();
252
323
  return row_group;
253
324
  }
254
325
 
255
326
  void RowGroup::CommitDrop() {
256
- for (idx_t column_idx = 0; column_idx < columns.size(); column_idx++) {
327
+ for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) {
257
328
  CommitDropColumn(column_idx);
258
329
  }
259
330
  }
260
331
 
261
332
  void RowGroup::CommitDropColumn(idx_t column_idx) {
262
- D_ASSERT(column_idx < columns.size());
263
- columns[column_idx]->CommitDropColumn();
333
+ GetColumn(column_idx).CommitDropColumn();
264
334
  }
265
335
 
266
- void RowGroup::NextVector(RowGroupScanState &state) {
336
+ void RowGroup::NextVector(CollectionScanState &state) {
267
337
  state.vector_index++;
268
338
  auto &column_ids = state.GetColumnIds();
269
339
  for (idx_t i = 0; i < column_ids.size(); i++) {
@@ -272,7 +342,7 @@ void RowGroup::NextVector(RowGroupScanState &state) {
272
342
  continue;
273
343
  }
274
344
  D_ASSERT(column < columns.size());
275
- columns[column]->Skip(state.column_scans[i]);
345
+ GetColumn(column).Skip(state.column_scans[i]);
276
346
  }
277
347
  }
278
348
 
@@ -281,17 +351,14 @@ bool RowGroup::CheckZonemap(TableFilterSet &filters, const vector<column_t> &col
281
351
  auto column_index = entry.first;
282
352
  auto &filter = entry.second;
283
353
  auto base_column_index = column_ids[column_index];
284
-
285
- auto propagate_result = filter->CheckStatistics(stats[base_column_index].statistics);
286
- if (propagate_result == FilterPropagateResult::FILTER_ALWAYS_FALSE ||
287
- propagate_result == FilterPropagateResult::FILTER_FALSE_OR_NULL) {
354
+ if (!GetColumn(base_column_index).CheckZonemap(*filter)) {
288
355
  return false;
289
356
  }
290
357
  }
291
358
  return true;
292
359
  }
293
360
 
294
- bool RowGroup::CheckZonemapSegments(RowGroupScanState &state) {
361
+ bool RowGroup::CheckZonemapSegments(CollectionScanState &state) {
295
362
  auto &column_ids = state.GetColumnIds();
296
363
  auto filters = state.GetFilters();
297
364
  if (!filters) {
@@ -301,7 +368,7 @@ bool RowGroup::CheckZonemapSegments(RowGroupScanState &state) {
301
368
  D_ASSERT(entry.first < column_ids.size());
302
369
  auto column_idx = entry.first;
303
370
  auto base_column_idx = column_ids[column_idx];
304
- bool read_segment = columns[base_column_idx]->CheckZonemap(state.column_scans[column_idx], *entry.second);
371
+ bool read_segment = GetColumn(base_column_idx).CheckZonemap(state.column_scans[column_idx], *entry.second);
305
372
  if (!read_segment) {
306
373
  idx_t target_row =
307
374
  state.column_scans[column_idx].current->start + state.column_scans[column_idx].current->count;
@@ -328,19 +395,19 @@ bool RowGroup::CheckZonemapSegments(RowGroupScanState &state) {
328
395
  }
329
396
 
330
397
  template <TableScanType TYPE>
331
- void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &state, DataChunk &result) {
398
+ void RowGroup::TemplatedScan(TransactionData transaction, CollectionScanState &state, DataChunk &result) {
332
399
  const bool ALLOW_UPDATES = TYPE != TableScanType::TABLE_SCAN_COMMITTED_ROWS_DISALLOW_UPDATES &&
333
400
  TYPE != TableScanType::TABLE_SCAN_COMMITTED_ROWS_OMIT_PERMANENTLY_DELETED;
334
401
  auto table_filters = state.GetFilters();
335
402
  auto &column_ids = state.GetColumnIds();
336
403
  auto adaptive_filter = state.GetAdaptiveFilter();
337
404
  while (true) {
338
- if (state.vector_index * STANDARD_VECTOR_SIZE >= state.max_row) {
405
+ if (state.vector_index * STANDARD_VECTOR_SIZE >= state.max_row_group_row) {
339
406
  // exceeded the amount of rows to scan
340
407
  return;
341
408
  }
342
409
  idx_t current_row = state.vector_index * STANDARD_VECTOR_SIZE;
343
- auto max_count = MinValue<idx_t>(STANDARD_VECTOR_SIZE, state.max_row - current_row);
410
+ auto max_count = MinValue<idx_t>(STANDARD_VECTOR_SIZE, state.max_row_group_row - current_row);
344
411
 
345
412
  //! first check the zonemap if we have to scan this partition
346
413
  if (!CheckZonemapSegments(state)) {
@@ -376,11 +443,12 @@ void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &sta
376
443
  D_ASSERT(result.data[i].GetType().InternalType() == ROW_TYPE);
377
444
  result.data[i].Sequence(this->start + current_row, 1, count);
378
445
  } else {
446
+ auto &col_data = GetColumn(column);
379
447
  if (TYPE != TableScanType::TABLE_SCAN_REGULAR) {
380
- columns[column]->ScanCommitted(state.vector_index, state.column_scans[i], result.data[i],
381
- ALLOW_UPDATES);
448
+ col_data.ScanCommitted(state.vector_index, state.column_scans[i], result.data[i],
449
+ ALLOW_UPDATES);
382
450
  } else {
383
- columns[column]->Scan(transaction, state.vector_index, state.column_scans[i], result.data[i]);
451
+ col_data.Scan(transaction, state.vector_index, state.column_scans[i], result.data[i]);
384
452
  }
385
453
  }
386
454
  }
@@ -402,9 +470,9 @@ void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &sta
402
470
  for (idx_t i = 0; i < table_filters->filters.size(); i++) {
403
471
  auto tf_idx = adaptive_filter->permutation[i];
404
472
  auto col_idx = column_ids[tf_idx];
405
- columns[col_idx]->Select(transaction, state.vector_index, state.column_scans[tf_idx],
406
- result.data[tf_idx], sel, approved_tuple_count,
407
- *table_filters->filters[tf_idx]);
473
+ auto &col_data = GetColumn(col_idx);
474
+ col_data.Select(transaction, state.vector_index, state.column_scans[tf_idx], result.data[tf_idx],
475
+ sel, approved_tuple_count, *table_filters->filters[tf_idx]);
408
476
  }
409
477
  for (auto &table_filter : table_filters->filters) {
410
478
  result.data[table_filter.first].Slice(sel, approved_tuple_count);
@@ -421,7 +489,8 @@ void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &sta
421
489
  continue;
422
490
  }
423
491
  if (table_filters->filters.find(i) == table_filters->filters.end()) {
424
- columns[col_idx]->Skip(state.column_scans[i]);
492
+ auto &col_data = GetColumn(col_idx);
493
+ col_data.Skip(state.column_scans[i]);
425
494
  }
426
495
  }
427
496
  state.vector_index++;
@@ -439,13 +508,13 @@ void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &sta
439
508
  result_data[sel_idx] = this->start + current_row + sel.get_index(sel_idx);
440
509
  }
441
510
  } else {
511
+ auto &col_data = GetColumn(column);
442
512
  if (TYPE == TableScanType::TABLE_SCAN_REGULAR) {
443
- columns[column]->FilterScan(transaction, state.vector_index, state.column_scans[i],
444
- result.data[i], sel, approved_tuple_count);
513
+ col_data.FilterScan(transaction, state.vector_index, state.column_scans[i], result.data[i],
514
+ sel, approved_tuple_count);
445
515
  } else {
446
- columns[column]->FilterScanCommitted(state.vector_index, state.column_scans[i],
447
- result.data[i], sel, approved_tuple_count,
448
- ALLOW_UPDATES);
516
+ col_data.FilterScanCommitted(state.vector_index, state.column_scans[i], result.data[i], sel,
517
+ approved_tuple_count, ALLOW_UPDATES);
449
518
  }
450
519
  }
451
520
  }
@@ -463,12 +532,12 @@ void RowGroup::TemplatedScan(TransactionData transaction, RowGroupScanState &sta
463
532
  }
464
533
  }
465
534
 
466
- void RowGroup::Scan(TransactionData transaction, RowGroupScanState &state, DataChunk &result) {
535
+ void RowGroup::Scan(TransactionData transaction, CollectionScanState &state, DataChunk &result) {
467
536
  TemplatedScan<TableScanType::TABLE_SCAN_REGULAR>(transaction, state, result);
468
537
  }
469
538
 
470
- void RowGroup::ScanCommitted(RowGroupScanState &state, DataChunk &result, TableScanType type) {
471
- auto &transaction_manager = DuckTransactionManager::Get(db);
539
+ void RowGroup::ScanCommitted(CollectionScanState &state, DataChunk &result, TableScanType type) {
540
+ auto &transaction_manager = DuckTransactionManager::Get(collection.GetAttached());
472
541
 
473
542
  auto lowest_active_start = transaction_manager.LowestActiveStart();
474
543
  auto lowest_active_id = transaction_manager.LowestActiveId();
@@ -541,7 +610,8 @@ void RowGroup::FetchRow(TransactionData transaction, ColumnFetchState &state, co
541
610
  data[result_idx] = row_id;
542
611
  } else {
543
612
  // regular column: fetch data from the base column
544
- columns[column]->FetchRow(transaction, state, row_id, result.data[col_idx], result_idx);
613
+ auto &col_data = GetColumn(column);
614
+ col_data.FetchRow(transaction, state, row_id, result.data[col_idx], result_idx);
545
615
  }
546
616
  }
547
617
  }
@@ -626,16 +696,18 @@ void RowGroup::InitializeAppend(RowGroupAppendState &append_state) {
626
696
  append_state.row_group = this;
627
697
  append_state.offset_in_row_group = this->count;
628
698
  // for each column, initialize the append state
629
- append_state.states = unique_ptr<ColumnAppendState[]>(new ColumnAppendState[columns.size()]);
630
- for (idx_t i = 0; i < columns.size(); i++) {
631
- columns[i]->InitializeAppend(append_state.states[i]);
699
+ append_state.states = unique_ptr<ColumnAppendState[]>(new ColumnAppendState[GetColumnCount()]);
700
+ for (idx_t i = 0; i < GetColumnCount(); i++) {
701
+ auto &col_data = GetColumn(i);
702
+ col_data.InitializeAppend(append_state.states[i]);
632
703
  }
633
704
  }
634
705
 
635
706
  void RowGroup::Append(RowGroupAppendState &state, DataChunk &chunk, idx_t append_count) {
636
707
  // append to the current row_group
637
- for (idx_t i = 0; i < columns.size(); i++) {
638
- columns[i]->Append(stats[i].statistics, state.states[i], chunk.data[i], append_count);
708
+ for (idx_t i = 0; i < GetColumnCount(); i++) {
709
+ auto &col_data = GetColumn(i);
710
+ col_data.Append(state.states[i], chunk.data[i], append_count);
639
711
  }
640
712
  state.offset_in_row_group += append_count;
641
713
  }
@@ -650,15 +722,16 @@ void RowGroup::Update(TransactionData transaction, DataChunk &update_chunk, row_
650
722
  for (idx_t i = 0; i < column_ids.size(); i++) {
651
723
  auto column = column_ids[i];
652
724
  D_ASSERT(column.index != COLUMN_IDENTIFIER_ROW_ID);
653
- D_ASSERT(columns[column.index]->type.id() == update_chunk.data[i].GetType().id());
725
+ auto &col_data = GetColumn(column.index);
726
+ D_ASSERT(col_data.type.id() == update_chunk.data[i].GetType().id());
654
727
  if (offset > 0) {
655
728
  Vector sliced_vector(update_chunk.data[i], offset, offset + count);
656
729
  sliced_vector.Flatten(count);
657
- columns[column.index]->Update(transaction, column.index, sliced_vector, ids + offset, count);
730
+ col_data.Update(transaction, column.index, sliced_vector, ids + offset, count);
658
731
  } else {
659
- columns[column.index]->Update(transaction, column.index, update_chunk.data[i], ids, count);
732
+ col_data.Update(transaction, column.index, update_chunk.data[i], ids, count);
660
733
  }
661
- MergeStatistics(column.index, *columns[column.index]->GetUpdateStatistics());
734
+ MergeStatistics(column.index, *col_data.GetUpdateStatistics());
662
735
  }
663
736
  }
664
737
 
@@ -670,29 +743,27 @@ void RowGroup::UpdateColumn(TransactionData transaction, DataChunk &updates, Vec
670
743
  auto primary_column_idx = column_path[0];
671
744
  D_ASSERT(primary_column_idx != COLUMN_IDENTIFIER_ROW_ID);
672
745
  D_ASSERT(primary_column_idx < columns.size());
673
- columns[primary_column_idx]->UpdateColumn(transaction, column_path, updates.data[0], ids, updates.size(), 1);
674
- MergeStatistics(primary_column_idx, *columns[primary_column_idx]->GetUpdateStatistics());
746
+ auto &col_data = GetColumn(primary_column_idx);
747
+ col_data.UpdateColumn(transaction, column_path, updates.data[0], ids, updates.size(), 1);
748
+ MergeStatistics(primary_column_idx, *col_data.GetUpdateStatistics());
675
749
  }
676
750
 
677
751
  unique_ptr<BaseStatistics> RowGroup::GetStatistics(idx_t column_idx) {
678
- D_ASSERT(column_idx < stats.size());
679
-
752
+ auto &col_data = GetColumn(column_idx);
680
753
  lock_guard<mutex> slock(stats_lock);
681
- return stats[column_idx].statistics.ToUnique();
754
+ return col_data.GetStatistics();
682
755
  }
683
756
 
684
757
  void RowGroup::MergeStatistics(idx_t column_idx, const BaseStatistics &other) {
685
- D_ASSERT(column_idx < stats.size());
686
-
758
+ auto &col_data = GetColumn(column_idx);
687
759
  lock_guard<mutex> slock(stats_lock);
688
- stats[column_idx].statistics.Merge(other);
760
+ col_data.MergeStatistics(other);
689
761
  }
690
762
 
691
763
  void RowGroup::MergeIntoStatistics(idx_t column_idx, BaseStatistics &other) {
692
- D_ASSERT(column_idx < stats.size());
693
-
764
+ auto &col_data = GetColumn(column_idx);
694
765
  lock_guard<mutex> slock(stats_lock);
695
- other.Merge(stats[column_idx].statistics);
766
+ col_data.MergeIntoStatistics(other);
696
767
  }
697
768
 
698
769
  RowGroupWriteData RowGroup::WriteToDisk(PartialBlockManager &manager,
@@ -709,10 +780,10 @@ RowGroupWriteData RowGroup::WriteToDisk(PartialBlockManager &manager,
709
780
  // Some of these columns are composite (list, struct). The data is written
710
781
  // first sequentially, and the pointers are written later, so that the
711
782
  // pointers all end up densely packed, and thus more cache-friendly.
712
- for (idx_t column_idx = 0; column_idx < columns.size(); column_idx++) {
713
- auto &column = columns[column_idx];
783
+ for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) {
784
+ auto &column = GetColumn(column_idx);
714
785
  ColumnCheckpointInfo checkpoint_info {compression_types[column_idx]};
715
- auto checkpoint_state = column->Checkpoint(*this, manager, checkpoint_info);
786
+ auto checkpoint_state = column.Checkpoint(*this, manager, checkpoint_info);
716
787
  D_ASSERT(checkpoint_state);
717
788
 
718
789
  auto stats = checkpoint_state->GetStatistics();
@@ -730,14 +801,13 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriter &writer, TableStatistics &gl
730
801
 
731
802
  vector<CompressionType> compression_types;
732
803
  compression_types.reserve(columns.size());
733
- for (idx_t column_idx = 0; column_idx < columns.size(); column_idx++) {
804
+ for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) {
734
805
  compression_types.push_back(writer.GetColumnCompressionType(column_idx));
735
806
  }
736
807
  auto result = WriteToDisk(writer.GetPartialBlockManager(), compression_types);
737
- for (idx_t column_idx = 0; column_idx < columns.size(); column_idx++) {
808
+ for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) {
738
809
  global_stats.GetStats(column_idx).Statistics().Merge(result.statistics[column_idx]);
739
810
  }
740
- row_group_pointer.statistics = std::move(result.statistics);
741
811
 
742
812
  // construct the row group pointer and write the column meta data to disk
743
813
  D_ASSERT(result.states.size() == columns.size());
@@ -811,9 +881,6 @@ void RowGroup::Serialize(RowGroupPointer &pointer, Serializer &main_serializer)
811
881
  writer.WriteField<uint64_t>(pointer.row_start);
812
882
  writer.WriteField<uint64_t>(pointer.tuple_count);
813
883
  auto &serializer = writer.GetSerializer();
814
- for (auto &stats : pointer.statistics) {
815
- stats.Serialize(serializer);
816
- }
817
884
  for (auto &data_pointer : pointer.data_pointers) {
818
885
  serializer.Write<block_id_t>(data_pointer.block_id);
819
886
  serializer.Write<uint64_t>(data_pointer.offset);
@@ -831,13 +898,9 @@ RowGroupPointer RowGroup::Deserialize(Deserializer &main_source, const vector<Lo
831
898
 
832
899
  auto physical_columns = columns.size();
833
900
  result.data_pointers.reserve(physical_columns);
834
- result.statistics.reserve(physical_columns);
835
901
 
836
902
  auto &source = reader.GetSource();
837
- for (auto &col_type : columns) {
838
- result.statistics.push_back(BaseStatistics::Deserialize(source, col_type));
839
- }
840
- for (idx_t i = 0; i < columns.size(); i++) {
903
+ for (idx_t i = 0; i < physical_columns; i++) {
841
904
  BlockPointer pointer;
842
905
  pointer.block_id = source.Read<block_id_t>();
843
906
  pointer.offset = source.Read<uint64_t>();
@@ -853,8 +916,9 @@ RowGroupPointer RowGroup::Deserialize(Deserializer &main_source, const vector<Lo
853
916
  // GetStorageInfo
854
917
  //===--------------------------------------------------------------------===//
855
918
  void RowGroup::GetStorageInfo(idx_t row_group_index, TableStorageInfo &result) {
856
- for (idx_t col_idx = 0; col_idx < columns.size(); col_idx++) {
857
- columns[col_idx]->GetStorageInfo(row_group_index, {col_idx}, result);
919
+ for (idx_t col_idx = 0; col_idx < GetColumnCount(); col_idx++) {
920
+ auto &col_data = GetColumn(col_idx);
921
+ col_data.GetStorageInfo(row_group_index, {col_idx}, result);
858
922
  }
859
923
  }
860
924
 
@@ -900,7 +964,7 @@ idx_t RowGroup::Delete(TransactionData transaction, DataTable *table, row_t *ids
900
964
 
901
965
  void RowGroup::Verify() {
902
966
  #ifdef DEBUG
903
- for (auto &column : columns) {
967
+ for (auto &column : GetColumns()) {
904
968
  column->Verify(*this);
905
969
  }
906
970
  #endif