duckdb 0.7.2-dev1671.0 → 0.7.2-dev1734.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu-datefunc.cpp +20 -8
- package/src/duckdb/extension/icu/icu-strptime.cpp +117 -29
- package/src/duckdb/extension/icu/include/icu-datefunc.hpp +2 -0
- package/src/duckdb/src/common/local_file_system.cpp +13 -2
- package/src/duckdb/src/common/sort/partition_state.cpp +644 -0
- package/src/duckdb/src/execution/expression_executor.cpp +1 -1
- package/src/duckdb/src/execution/expression_executor_state.cpp +2 -3
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +77 -849
- package/src/duckdb/src/function/table/system/duckdb_extensions.cpp +2 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +247 -0
- package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -3
- package/src/duckdb/src/include/duckdb/planner/pragma_handler.hpp +3 -2
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +1 -2
- package/src/duckdb/src/include/duckdb/storage/buffer/buffer_pool.hpp +77 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/temporary_file_information.hpp +12 -0
- package/src/duckdb/src/include/duckdb/storage/buffer_manager.hpp +3 -59
- package/src/duckdb/src/main/extension/extension_install.cpp +11 -0
- package/src/duckdb/src/main/extension/extension_load.cpp +29 -3
- package/src/duckdb/src/main/query_profiler.cpp +1 -1
- package/src/duckdb/src/planner/pragma_handler.cpp +7 -5
- package/src/duckdb/src/storage/buffer/block_handle.cpp +128 -0
- package/src/duckdb/src/storage/buffer/block_manager.cpp +81 -0
- package/src/duckdb/src/storage/buffer/buffer_pool.cpp +132 -0
- package/src/duckdb/src/storage/buffer/buffer_pool_reservation.cpp +32 -0
- package/src/duckdb/src/storage/buffer_manager.cpp +0 -351
- package/src/duckdb/third_party/libpg_query/postgres_parser.cpp +3 -5
- package/src/duckdb/ub_src_common_sort.cpp +2 -0
- package/src/duckdb/ub_src_storage_buffer.cpp +8 -0
package/src/duckdb/src/common/sort/partition_state.cpp
@@ -0,0 +1,644 @@
+#include "duckdb/common/sort/partition_state.hpp"
+
+#include "duckdb/common/types/column_data_consumer.hpp"
+#include "duckdb/common/row_operations/row_operations.hpp"
+#include "duckdb/main/config.hpp"
+#include "duckdb/parallel/event.hpp"
+
+#include <numeric>
+
+namespace duckdb {
+
+PartitionGlobalHashGroup::PartitionGlobalHashGroup(BufferManager &buffer_manager, const Orders &partitions,
+                                                   const Orders &orders, const Types &payload_types, bool external)
+    : count(0) {
+
+    RowLayout payload_layout;
+    payload_layout.Initialize(payload_types);
+    global_sort = make_uniq<GlobalSortState>(buffer_manager, orders, payload_layout);
+    global_sort->external = external;
+
+    partition_layout = global_sort->sort_layout.GetPrefixComparisonLayout(partitions.size());
+}
+
+void PartitionGlobalHashGroup::ComputeMasks(ValidityMask &partition_mask, ValidityMask &order_mask) {
+    D_ASSERT(count > 0);
+
+    // Set up a comparator for the partition subset
+    const auto partition_size = partition_layout.comparison_size;
+
+    SBIterator prev(*global_sort, ExpressionType::COMPARE_LESSTHAN);
+    SBIterator curr(*global_sort, ExpressionType::COMPARE_LESSTHAN);
+
+    partition_mask.SetValidUnsafe(0);
+    order_mask.SetValidUnsafe(0);
+    for (++curr; curr.GetIndex() < count; ++curr) {
+        // Compare the partition subset first because if that differs, then so does the full ordering
+        int part_cmp = 0;
+        if (partition_layout.all_constant) {
+            part_cmp = FastMemcmp(prev.entry_ptr, curr.entry_ptr, partition_size);
+        } else {
+            part_cmp = Comparators::CompareTuple(prev.scan, curr.scan, prev.entry_ptr, curr.entry_ptr, partition_layout,
+                                                 prev.external);
+        }
+
+        if (part_cmp) {
+            partition_mask.SetValidUnsafe(curr.GetIndex());
+            order_mask.SetValidUnsafe(curr.GetIndex());
+        } else if (prev.Compare(curr)) {
+            order_mask.SetValidUnsafe(curr.GetIndex());
+        }
+        ++prev;
+    }
+}
+
+PartitionGlobalSinkState::PartitionGlobalSinkState(ClientContext &context,
+                                                   const vector<unique_ptr<Expression>> &partitions_p,
+                                                   const vector<BoundOrderByNode> &orders_p, const Types &payload_types,
+                                                   const vector<unique_ptr<BaseStatistics>> &partitions_stats,
+                                                   idx_t estimated_cardinality)
+    : context(context), buffer_manager(BufferManager::GetBufferManager(context)), allocator(Allocator::Get(context)),
+      payload_types(payload_types), memory_per_thread(0), count(0) {
+
+    // we sort by both 1) partition by expression list and 2) order by expressions
+    const auto partition_cols = partitions_p.size();
+    for (idx_t prt_idx = 0; prt_idx < partition_cols; prt_idx++) {
+        auto &pexpr = partitions_p[prt_idx];
+
+        if (partitions_stats.empty() || !partitions_stats[prt_idx]) {
+            orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(), nullptr);
+        } else {
+            orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_FIRST, pexpr->Copy(),
+                                partitions_stats[prt_idx]->ToUnique());
+        }
+        partitions.emplace_back(orders.back().Copy());
+    }
+
+    for (const auto &order : orders_p) {
+        orders.emplace_back(order.Copy());
+    }
+
+    memory_per_thread = PhysicalOperator::GetMaxThreadMemory(context);
+    external = ClientConfig::GetConfig(context).force_external;
+
+    if (!orders.empty()) {
+        grouping_types = payload_types;
+        grouping_types.push_back(LogicalType::HASH);
+
+        ResizeGroupingData(estimated_cardinality);
+    }
+}
+
+void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
+    // Have we started to combine? Then just live with it.
+    if (grouping_data && !grouping_data->GetPartitions().empty()) {
+        return;
+    }
+    // Is the average partition size too large?
+    const idx_t partition_size = STANDARD_ROW_GROUPS_SIZE;
+    const auto bits = grouping_data ? grouping_data->GetRadixBits() : 0;
+    auto new_bits = bits ? bits : 4;
+    while (new_bits < 10 && (cardinality / RadixPartitioning::NumberOfPartitions(new_bits)) > partition_size) {
+        ++new_bits;
+    }
+
+    // Repartition the grouping data
+    if (new_bits != bits) {
+        const auto hash_col_idx = payload_types.size();
+        grouping_data = make_uniq<RadixPartitionedColumnData>(context, grouping_types, new_bits, hash_col_idx);
+    }
+}
+
+void PartitionGlobalSinkState::SyncLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append) {
+    // We are done if the local_partition is right sized.
+    auto local_radix = (RadixPartitionedColumnData *)local_partition.get();
+    if (local_radix->GetRadixBits() == grouping_data->GetRadixBits()) {
+        return;
+    }
+
+    // If the local partition is now too small, flush it and reallocate
+    auto new_partition = grouping_data->CreateShared();
+    auto new_append = make_uniq<PartitionedColumnDataAppendState>();
+    new_partition->InitializeAppendState(*new_append);
+
+    local_partition->FlushAppendState(*local_append);
+    auto &local_groups = local_partition->GetPartitions();
+    for (auto &local_group : local_groups) {
+        ColumnDataScanState scanner;
+        local_group->InitializeScan(scanner);
+
+        DataChunk scan_chunk;
+        local_group->InitializeScanChunk(scan_chunk);
+        for (scan_chunk.Reset(); local_group->Scan(scanner, scan_chunk); scan_chunk.Reset()) {
+            new_partition->Append(*new_append, scan_chunk);
+        }
+    }
+
+    // The append state has stale pointers to the old local partition, so nuke it from orbit.
+    new_partition->FlushAppendState(*new_append);
+
+    local_partition = std::move(new_partition);
+    local_append = make_uniq<PartitionedColumnDataAppendState>();
+    local_partition->InitializeAppendState(*local_append);
+}
+
+void PartitionGlobalSinkState::UpdateLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append) {
+    // Make sure grouping_data doesn't change under us.
+    lock_guard<mutex> guard(lock);
+
+    if (!local_partition) {
+        local_partition = grouping_data->CreateShared();
+        local_append = make_uniq<PartitionedColumnDataAppendState>();
+        local_partition->InitializeAppendState(*local_append);
+        return;
+    }
+
+    // Grow the groups if they are too big
+    ResizeGroupingData(count);
+
+    // Sync local partition to have the same bit count
+    SyncLocalPartition(local_partition, local_append);
+}
+
+void PartitionGlobalSinkState::CombineLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append) {
+    if (!local_partition) {
+        return;
+    }
+    local_partition->FlushAppendState(*local_append);
+
+    // Make sure grouping_data doesn't change under us.
+    // Combine has an internal mutex, so this is single-threaded anyway.
+    lock_guard<mutex> guard(lock);
+    SyncLocalPartition(local_partition, local_append);
+    grouping_data->Combine(*local_partition);
+}
+
+void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data, PartitionGlobalHashGroup &hash_group) {
+    auto &global_sort = *hash_group.global_sort;
+
+    // Set up the sort expression computation.
+    vector<LogicalType> sort_types;
+    ExpressionExecutor executor(context);
+    for (auto &order : orders) {
+        auto &oexpr = order.expression;
+        sort_types.emplace_back(oexpr->return_type);
+        executor.AddExpression(*oexpr);
+    }
+    DataChunk sort_chunk;
+    sort_chunk.Initialize(allocator, sort_types);
+
+    // Copy the data from the group into the sort code.
+    LocalSortState local_sort;
+    local_sort.Initialize(global_sort, global_sort.buffer_manager);
+
+    // Strip hash column
+    DataChunk payload_chunk;
+    payload_chunk.Initialize(allocator, payload_types);
+
+    vector<column_t> column_ids;
+    column_ids.reserve(payload_types.size());
+    for (column_t i = 0; i < payload_types.size(); ++i) {
+        column_ids.emplace_back(i);
+    }
+    ColumnDataConsumer scanner(group_data, column_ids);
+    ColumnDataConsumerScanState chunk_state;
+    chunk_state.current_chunk_state.properties = ColumnDataScanProperties::ALLOW_ZERO_COPY;
+    scanner.InitializeScan();
+    for (auto chunk_idx = scanner.ChunkCount(); chunk_idx-- > 0;) {
+        if (!scanner.AssignChunk(chunk_state)) {
+            break;
+        }
+        scanner.ScanChunk(chunk_state, payload_chunk);
+
+        sort_chunk.Reset();
+        executor.Execute(payload_chunk, sort_chunk);
+
+        local_sort.SinkChunk(sort_chunk, payload_chunk);
+        if (local_sort.SizeInBytes() > memory_per_thread) {
+            local_sort.Sort(global_sort, true);
+        }
+        scanner.FinishChunk(chunk_state);
+    }
+
+    global_sort.AddLocalState(local_sort);
+
+    hash_group.count += group_data.Count();
+}
+
+// Per-thread sink state
+PartitionLocalSinkState::PartitionLocalSinkState(ClientContext &context, PartitionGlobalSinkState &gstate_p)
+    : gstate(gstate_p), allocator(Allocator::Get(context)), executor(context) {
+
+    vector<LogicalType> group_types;
+    for (idx_t prt_idx = 0; prt_idx < gstate.partitions.size(); prt_idx++) {
+        auto &pexpr = *gstate.partitions[prt_idx].expression.get();
+        group_types.push_back(pexpr.return_type);
+        executor.AddExpression(pexpr);
+    }
+    sort_cols = gstate.orders.size() + group_types.size();
+
+    if (sort_cols) {
+        if (!group_types.empty()) {
+            // OVER(PARTITION BY...)
+            group_chunk.Initialize(allocator, group_types);
+        }
+        // OVER(...)
+        auto payload_types = gstate.payload_types;
+        payload_types.emplace_back(LogicalType::HASH);
+        payload_chunk.Initialize(allocator, payload_types);
+    } else {
+        // OVER()
+        payload_layout.Initialize(gstate.payload_types);
+    }
+}
+
+void PartitionLocalSinkState::Hash(DataChunk &input_chunk, Vector &hash_vector) {
+    const auto count = input_chunk.size();
+    if (group_chunk.ColumnCount() > 0) {
+        // OVER(PARTITION BY...) (hash grouping)
+        group_chunk.Reset();
+        executor.Execute(input_chunk, group_chunk);
+        VectorOperations::Hash(group_chunk.data[0], hash_vector, count);
+        for (idx_t prt_idx = 1; prt_idx < group_chunk.ColumnCount(); ++prt_idx) {
+            VectorOperations::CombineHash(hash_vector, group_chunk.data[prt_idx], count);
+        }
+    } else {
+        // OVER(...) (sorting)
+        // Single partition => single hash value
+        hash_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
+        auto hashes = ConstantVector::GetData<hash_t>(hash_vector);
+        hashes[0] = 0;
+    }
+}
+
+void PartitionLocalSinkState::Sink(DataChunk &input_chunk) {
+    gstate.count += input_chunk.size();
+
+    // OVER()
+    if (sort_cols == 0) {
+        // No sorts, so build paged row chunks
+        if (!rows) {
+            const auto entry_size = payload_layout.GetRowWidth();
+            const auto capacity = MaxValue<idx_t>(STANDARD_VECTOR_SIZE, (Storage::BLOCK_SIZE / entry_size) + 1);
+            rows = make_uniq<RowDataCollection>(gstate.buffer_manager, capacity, entry_size);
+            strings = make_uniq<RowDataCollection>(gstate.buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1, true);
+        }
+        const auto row_count = input_chunk.size();
+        const auto row_sel = FlatVector::IncrementalSelectionVector();
+        Vector addresses(LogicalType::POINTER);
+        auto key_locations = FlatVector::GetData<data_ptr_t>(addresses);
+        const auto prev_rows_blocks = rows->blocks.size();
+        auto handles = rows->Build(row_count, key_locations, nullptr, row_sel);
+        auto input_data = input_chunk.ToUnifiedFormat();
+        RowOperations::Scatter(input_chunk, input_data.get(), payload_layout, addresses, *strings, *row_sel, row_count);
+        // Mark that row blocks contain pointers (heap blocks are pinned)
+        if (!payload_layout.AllConstant()) {
+            D_ASSERT(strings->keep_pinned);
+            for (size_t i = prev_rows_blocks; i < rows->blocks.size(); ++i) {
+                rows->blocks[i]->block->SetSwizzling("PartitionLocalSinkState::Sink");
+            }
+        }
+        return;
+    }
+
+    // OVER(...)
+    payload_chunk.Reset();
+    auto &hash_vector = payload_chunk.data.back();
+    Hash(input_chunk, hash_vector);
+    for (idx_t col_idx = 0; col_idx < input_chunk.ColumnCount(); ++col_idx) {
+        payload_chunk.data[col_idx].Reference(input_chunk.data[col_idx]);
+    }
+    payload_chunk.SetCardinality(input_chunk);
+
+    gstate.UpdateLocalPartition(local_partition, local_append);
+    local_partition->Append(*local_append, payload_chunk);
+}
+
+void PartitionLocalSinkState::Combine() {
+    // OVER()
+    if (sort_cols == 0) {
+        // Only one partition again, so need a global lock.
+        lock_guard<mutex> glock(gstate.lock);
+        if (gstate.rows) {
+            if (rows) {
+                gstate.rows->Merge(*rows);
+                gstate.strings->Merge(*strings);
+                rows.reset();
+                strings.reset();
+            }
+        } else {
+            gstate.rows = std::move(rows);
+            gstate.strings = std::move(strings);
+        }
+        return;
+    }
+
+    // OVER(...)
+    gstate.CombineLocalPartition(local_partition, local_append);
+}
+
+PartitionGlobalMergeState::PartitionGlobalMergeState(PartitionGlobalSinkState &sink, GroupDataPtr group_data)
+    : sink(sink), group_data(std::move(group_data)), stage(PartitionSortStage::INIT), total_tasks(0), tasks_assigned(0),
+      tasks_completed(0) {
+
+    const auto group_idx = sink.hash_groups.size();
+    auto new_group = make_uniq<PartitionGlobalHashGroup>(sink.buffer_manager, sink.partitions, sink.orders,
+                                                         sink.payload_types, sink.external);
+    sink.hash_groups.emplace_back(std::move(new_group));
+
+    hash_group = sink.hash_groups[group_idx].get();
+    global_sort = sink.hash_groups[group_idx]->global_sort.get();
+}
+
+void PartitionLocalMergeState::Prepare() {
+    auto &global_sort = *merge_state->global_sort;
+    merge_state->sink.BuildSortState(*merge_state->group_data, *merge_state->hash_group);
+    merge_state->group_data.reset();
+
+    global_sort.PrepareMergePhase();
+}
+
+void PartitionLocalMergeState::Merge() {
+    auto &global_sort = *merge_state->global_sort;
+    MergeSorter merge_sorter(global_sort, global_sort.buffer_manager);
+    merge_sorter.PerformInMergeRound();
+}
+
+void PartitionLocalMergeState::ExecuteTask() {
+    switch (stage) {
+    case PartitionSortStage::PREPARE:
+        Prepare();
+        break;
+    case PartitionSortStage::MERGE:
+        Merge();
+        break;
+    default:
+        throw InternalException("Unexpected PartitionGlobalMergeState in ExecuteTask!");
+    }
+
+    merge_state->CompleteTask();
+    finished = true;
+}
+
+bool PartitionGlobalMergeState::AssignTask(PartitionLocalMergeState &local_state) {
+    lock_guard<mutex> guard(lock);
+
+    if (tasks_assigned >= total_tasks) {
+        return false;
+    }
+
+    local_state.merge_state = this;
+    local_state.stage = stage;
+    local_state.finished = false;
+    tasks_assigned++;
+
+    return true;
+}
+
+void PartitionGlobalMergeState::CompleteTask() {
+    lock_guard<mutex> guard(lock);
+
+    ++tasks_completed;
+}
+
+bool PartitionGlobalMergeState::TryPrepareNextStage() {
+    lock_guard<mutex> guard(lock);
+
+    if (tasks_completed < total_tasks) {
+        return false;
+    }
+
+    tasks_assigned = tasks_completed = 0;
+
+    switch (stage) {
+    case PartitionSortStage::INIT:
+        total_tasks = 1;
+        stage = PartitionSortStage::PREPARE;
+        return true;
+
+    case PartitionSortStage::PREPARE:
+        total_tasks = global_sort->sorted_blocks.size() / 2;
+        if (!total_tasks) {
+            break;
+        }
+        stage = PartitionSortStage::MERGE;
+        global_sort->InitializeMergeRound();
+        return true;
+
+    case PartitionSortStage::MERGE:
+        global_sort->CompleteMergeRound(true);
+        total_tasks = global_sort->sorted_blocks.size() / 2;
+        if (!total_tasks) {
+            break;
+        }
+        global_sort->InitializeMergeRound();
+        return true;
+
+    case PartitionSortStage::SORTED:
+        break;
+    }
+
+    stage = PartitionSortStage::SORTED;
+
+    return false;
+}
+
+PartitionGlobalMergeStates::PartitionGlobalMergeStates(PartitionGlobalSinkState &sink) {
+    // Schedule all the sorts for maximum thread utilisation
+    for (auto &group_data : sink.grouping_data->GetPartitions()) {
+        // Prepare for merge sort phase
+        if (group_data->Count()) {
+            auto state = make_uniq<PartitionGlobalMergeState>(sink, std::move(group_data));
+            states.emplace_back(std::move(state));
+        }
+    }
+}
+
+class PartitionMergeTask : public ExecutorTask {
+public:
+    PartitionMergeTask(shared_ptr<Event> event_p, ClientContext &context_p, PartitionGlobalMergeStates &hash_groups_p)
+        : ExecutorTask(context_p), event(std::move(event_p)), hash_groups(hash_groups_p) {
+    }
+
+    TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override;
+
+private:
+    shared_ptr<Event> event;
+    PartitionLocalMergeState local_state;
+    PartitionGlobalMergeStates &hash_groups;
+};
+
+TaskExecutionResult PartitionMergeTask::ExecuteTask(TaskExecutionMode mode) {
+    // Loop until all hash groups are done
+    size_t sorted = 0;
+    while (sorted < hash_groups.states.size()) {
+        // First check if there is an unfinished task for this thread
+        if (executor.HasError()) {
+            return TaskExecutionResult::TASK_ERROR;
+        }
+        if (!local_state.TaskFinished()) {
+            local_state.ExecuteTask();
+            continue;
+        }
+
+        // Thread is done with its assigned task, try to fetch new work
+        for (auto group = sorted; group < hash_groups.states.size(); ++group) {
+            auto &global_state = hash_groups.states[group];
+            if (global_state->IsSorted()) {
+                // This hash group is done
+                // Update the high water mark of densely completed groups
+                if (sorted == group) {
+                    ++sorted;
+                }
+                continue;
+            }
+
+            // Try to assign work for this hash group to this thread
+            if (global_state->AssignTask(local_state)) {
+                // We assigned a task to this thread!
+                // Break out of this loop to re-enter the top-level loop and execute the task
+                break;
+            }
+
+            // Hash group global state couldn't assign a task to this thread
+            // Try to prepare the next stage
+            if (!global_state->TryPrepareNextStage()) {
+                // This current hash group is not yet done
+                // But we were not able to assign a task for it to this thread
+                // See if the next hash group is better
+                continue;
+            }
+
+            // We were able to prepare the next stage for this hash group!
+            // Try to assign a task once more
+            if (global_state->AssignTask(local_state)) {
+                // We assigned a task to this thread!
+                // Break out of this loop to re-enter the top-level loop and execute the task
+                break;
+            }
+
+            // We were able to prepare the next merge round,
+            // but we were not able to assign a task for it to this thread
+            // The tasks were assigned to other threads while this thread waited for the lock
+            // Go to the next iteration to see if another hash group has a task
+        }
+    }
+
+    event->FinishTask();
+    return TaskExecutionResult::TASK_FINISHED;
+}
+
+void PartitionMergeEvent::Schedule() {
+    auto &context = pipeline->GetClientContext();
+
+    // Schedule tasks equal to the number of threads, which will each merge multiple partitions
+    auto &ts = TaskScheduler::GetScheduler(context);
+    idx_t num_threads = ts.NumberOfThreads();
+
+    vector<unique_ptr<Task>> merge_tasks;
+    for (idx_t tnum = 0; tnum < num_threads; tnum++) {
+        merge_tasks.emplace_back(make_uniq<PartitionMergeTask>(shared_from_this(), context, merge_states));
+    }
+    SetTasks(std::move(merge_tasks));
+}
+
+PartitionLocalSourceState::PartitionLocalSourceState(PartitionGlobalSinkState &gstate_p) : gstate(gstate_p) {
+    const auto &input_types = gstate.payload_types;
+    layout.Initialize(input_types);
+    input_chunk.Initialize(gstate.allocator, input_types);
+}
+
+void PartitionLocalSourceState::MaterializeSortedData() {
+    auto &global_sort_state = *hash_group->global_sort;
+    if (global_sort_state.sorted_blocks.empty()) {
+        return;
+    }
+
+    // scan the sorted row data
+    D_ASSERT(global_sort_state.sorted_blocks.size() == 1);
+    auto &sb = *global_sort_state.sorted_blocks[0];
+
+    // Free up some memory before allocating more
+    sb.radix_sorting_data.clear();
+    sb.blob_sorting_data = nullptr;
+
+    // Move the sorting row blocks into our RDCs
+    auto &buffer_manager = global_sort_state.buffer_manager;
+    auto &sd = *sb.payload_data;
+
+    // Data blocks are required
+    D_ASSERT(!sd.data_blocks.empty());
+    auto &block = sd.data_blocks[0];
+    rows = make_uniq<RowDataCollection>(buffer_manager, block->capacity, block->entry_size);
+    rows->blocks = std::move(sd.data_blocks);
+    rows->count = std::accumulate(rows->blocks.begin(), rows->blocks.end(), idx_t(0),
+                                  [&](idx_t c, const unique_ptr<RowDataBlock> &b) { return c + b->count; });
+
+    // Heap blocks are optional, but we want both for iteration.
+    if (!sd.heap_blocks.empty()) {
+        auto &block = sd.heap_blocks[0];
+        heap = make_uniq<RowDataCollection>(buffer_manager, block->capacity, block->entry_size);
+        heap->blocks = std::move(sd.heap_blocks);
+        hash_group.reset();
+    } else {
+        heap = make_uniq<RowDataCollection>(buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1, true);
+    }
+    heap->count = std::accumulate(heap->blocks.begin(), heap->blocks.end(), idx_t(0),
+                                  [&](idx_t c, const unique_ptr<RowDataBlock> &b) { return c + b->count; });
+}
+
+idx_t PartitionLocalSourceState::GeneratePartition(const idx_t hash_bin_p) {
+    // Get rid of any stale data
+    hash_bin = hash_bin_p;
+
+    // There are three types of partitions:
+    // 1. No partition (no sorting)
+    // 2. One partition (sorting, but no hashing)
+    // 3. Multiple partitions (sorting and hashing)
+
+    // How big is the partition?
+    idx_t count = 0;
+    if (hash_bin < gstate.hash_groups.size() && gstate.hash_groups[hash_bin]) {
+        count = gstate.hash_groups[hash_bin]->count;
+    } else if (gstate.rows && !hash_bin) {
+        count = gstate.count;
+    } else {
+        return count;
+    }
+
+    // Initialise masks to false
+    const auto bit_count = ValidityMask::ValidityMaskSize(count);
+    partition_bits.clear();
+    partition_bits.resize(bit_count, 0);
+    partition_mask.Initialize(partition_bits.data());
+
+    order_bits.clear();
+    order_bits.resize(bit_count, 0);
+    order_mask.Initialize(order_bits.data());
+
+    // Scan the sorted data into new Collections
+    auto external = gstate.external;
+    if (gstate.rows && !hash_bin) {
+        // Simple mask
+        partition_mask.SetValidUnsafe(0);
+        order_mask.SetValidUnsafe(0);
+        // No partition - align the heap blocks with the row blocks
+        rows = gstate.rows->CloneEmpty(gstate.rows->keep_pinned);
+        heap = gstate.strings->CloneEmpty(gstate.strings->keep_pinned);
+        RowDataCollectionScanner::AlignHeapBlocks(*rows, *heap, *gstate.rows, *gstate.strings, layout);
+        external = true;
+    } else if (hash_bin < gstate.hash_groups.size() && gstate.hash_groups[hash_bin]) {
+        // Overwrite the collections with the sorted data
+        hash_group = std::move(gstate.hash_groups[hash_bin]);
+        hash_group->ComputeMasks(partition_mask, order_mask);
+        MaterializeSortedData();
+    } else {
+        return count;
+    }
+
+    scanner = make_uniq<RowDataCollectionScanner>(*rows, *heap, layout, external, false);
+
+    return count;
+}
+
+} // namespace duckdb
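The new file above extracts the window operator's partitioning and sorting machinery into a reusable `common/sort` component (note the corresponding `-849` lines removed from `physical_window.cpp`). For orientation only, the sketch below strings the exposed pieces together the way a caller might. It assumes DuckDB-internal headers and already-bound expressions are available; the function name and placeholder arguments are illustrative and not part of this diff.

```cpp
// Minimal sketch (not from the diff): driving the new partition sink API.
#include "duckdb/common/sort/partition_state.hpp"

using namespace duckdb;

// Illustrative helper: one thread sinking a single chunk and combining.
static void SinkOneChunk(ClientContext &context, DataChunk &input,
                         const vector<unique_ptr<Expression>> &partitions,
                         const vector<BoundOrderByNode> &orders,
                         const vector<LogicalType> &payload_types) {
    // Shared across threads; partition statistics and cardinality are optional hints.
    vector<unique_ptr<BaseStatistics>> no_stats;
    PartitionGlobalSinkState gstate(context, partitions, orders, payload_types, no_stats,
                                    /* estimated_cardinality = */ 0);

    // Per-thread state: hashes the PARTITION BY columns and appends rows.
    PartitionLocalSinkState lstate(context, gstate);
    lstate.Sink(input);   // hash + append into the radix-partitioned collection
    lstate.Combine();     // merge thread-local data into the global state

    // Once every thread has combined, the merge phase sorts each hash group;
    // PartitionMergeEvent/PartitionMergeTask schedule this across worker threads.
    PartitionGlobalMergeStates merge_states(gstate);
    (void)merge_states;
}
```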
package/src/duckdb/src/execution/expression_executor.cpp
@@ -56,7 +56,7 @@ Allocator &ExpressionExecutor::GetAllocator() {
 
 void ExpressionExecutor::AddExpression(const Expression &expr) {
     expressions.push_back(&expr);
-    auto state = make_uniq<ExpressionExecutorState>(
+    auto state = make_uniq<ExpressionExecutorState>();
     Initialize(expr, *state);
     state->Verify();
     states.push_back(std::move(state));
package/src/duckdb/src/execution/expression_executor_state.cpp
@@ -31,11 +31,10 @@ ClientContext &ExpressionState::GetContext() {
     return root.executor->GetContext();
 }
 
-ExpressionState::ExpressionState(const Expression &expr, ExpressionExecutorState &root)
-    : expr(expr), root(root), name(expr.ToString()) {
+ExpressionState::ExpressionState(const Expression &expr, ExpressionExecutorState &root) : expr(expr), root(root) {
 }
 
-ExpressionExecutorState::ExpressionExecutorState(
+ExpressionExecutorState::ExpressionExecutorState() : profiler() {
 }
 
 void ExpressionState::Verify(ExpressionExecutorState &root_executor) {
|