duckdb 0.7.2-dev3117.0 → 0.7.2-dev3154.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +7 -0
- package/src/duckdb/extension/parquet/parquet-extension.cpp +42 -0
- package/src/duckdb/extension/parquet/parquet_writer.cpp +23 -9
- package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
- package/src/duckdb/src/common/types/vector.cpp +4 -5
- package/src/duckdb/src/common/types/vector_buffer.cpp +1 -1
- package/src/duckdb/src/core_functions/function_list.cpp +1 -0
- package/src/duckdb/src/core_functions/scalar/map/map_concat.cpp +186 -0
- package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +65 -21
- package/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp +494 -0
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +16 -6
- package/src/duckdb/src/execution/window_segment_tree.cpp +17 -13
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
- package/src/duckdb/src/include/duckdb/core_functions/scalar/map_functions.hpp +14 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp +13 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp +72 -0
- package/src/duckdb/src/include/duckdb/function/copy_function.hpp +3 -1
- package/src/duckdb/src/planner/operator/logical_delete.cpp +2 -0
- package/src/duckdb/src/planner/operator/logical_update.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +1 -0
- package/src/duckdb/ub_src_core_functions_scalar_map.cpp +2 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +2 -0
package/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp
ADDED
@@ -0,0 +1,494 @@
+#include "duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp"
+#include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
+#include "duckdb/parallel/base_pipeline_event.hpp"
+#include "duckdb/common/vector_operations/vector_operations.hpp"
+#include "duckdb/common/types/batched_data_collection.hpp"
+#include "duckdb/common/allocator.hpp"
+#include "duckdb/common/queue.hpp"
+#include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
+
+#include <algorithm>
+
+namespace duckdb {
+
+PhysicalFixedBatchCopy::PhysicalFixedBatchCopy(vector<LogicalType> types, CopyFunction function_p,
+                                               unique_ptr<FunctionData> bind_data_p, idx_t estimated_cardinality)
+    : PhysicalOperator(PhysicalOperatorType::BATCH_COPY_TO_FILE, std::move(types), estimated_cardinality),
+      function(std::move(function_p)), bind_data(std::move(bind_data_p)) {
+	if (!function.flush_batch || !function.prepare_batch || !function.desired_batch_size) {
+		throw InternalException("PhysicalFixedBatchCopy created for copy function that does not have "
+		                        "prepare_batch/flush_batch/desired_batch_size defined");
+	}
+}
+
+//===--------------------------------------------------------------------===//
+// Sink
+//===--------------------------------------------------------------------===//
+class BatchCopyTask {
+public:
+	virtual ~BatchCopyTask() {
+	}
+
+	virtual void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) = 0;
+};
+
+//===--------------------------------------------------------------------===//
+// States
+//===--------------------------------------------------------------------===//
+class FixedBatchCopyGlobalState : public GlobalSinkState {
+public:
+	explicit FixedBatchCopyGlobalState(unique_ptr<GlobalFunctionData> global_state)
+	    : rows_copied(0), global_state(std::move(global_state)), batch_size(0), scheduled_batch_index(0),
+	      flushed_batch_index(0), any_flushing(false), any_finished(false) {
+	}
+
+	mutex lock;
+	mutex flush_lock;
+	//! The total number of rows copied to the file
+	atomic<idx_t> rows_copied;
+	//! Global copy state
+	unique_ptr<GlobalFunctionData> global_state;
+	//! The desired batch size (if any)
+	idx_t batch_size;
+	//! Unpartitioned batches - only used in case batch_size is required
+	map<idx_t, unique_ptr<ColumnDataCollection>> raw_batches;
+	//! The prepared batch data by batch index - ready to flush
+	map<idx_t, unique_ptr<PreparedBatchData>> batch_data;
+	//! The index of the latest batch index that has been scheduled
+	atomic<idx_t> scheduled_batch_index;
+	//! The index of the latest batch index that has been flushed
+	atomic<idx_t> flushed_batch_index;
+	//! Whether or not any thread is flushing
+	atomic<bool> any_flushing;
+	//! Whether or not any threads are finished
+	atomic<bool> any_finished;
+
+	void AddTask(unique_ptr<BatchCopyTask> task) {
+		lock_guard<mutex> l(task_lock);
+		task_queue.push(std::move(task));
+	}
+
+	unique_ptr<BatchCopyTask> GetTask() {
+		lock_guard<mutex> l(task_lock);
+		if (task_queue.empty()) {
+			return nullptr;
+		}
+		auto entry = std::move(task_queue.front());
+		task_queue.pop();
+		return entry;
+	}
+
+	idx_t TaskCount() {
+		lock_guard<mutex> l(task_lock);
+		return task_queue.size();
+	}
+
+	void AddBatchData(idx_t batch_index, unique_ptr<PreparedBatchData> new_batch) {
+		// move the batch data to the set of prepared batch data
+		lock_guard<mutex> l(lock);
+		auto entry = batch_data.insert(make_pair(batch_index, std::move(new_batch)));
+		if (!entry.second) {
+			throw InternalException("Duplicate batch index %llu encountered in PhysicalFixedBatchCopy", batch_index);
+		}
+	}
+
+private:
+	mutex task_lock;
+	//! The task queue for the batch copy to file
+	queue<unique_ptr<BatchCopyTask>> task_queue;
+};
+
+class FixedBatchCopyLocalState : public LocalSinkState {
+public:
+	explicit FixedBatchCopyLocalState(unique_ptr<LocalFunctionData> local_state_p)
+	    : local_state(std::move(local_state_p)), rows_copied(0) {
+	}
+
+	//! Local copy state
+	unique_ptr<LocalFunctionData> local_state;
+	//! The current collection we are appending to
+	unique_ptr<ColumnDataCollection> collection;
+	//! The append state of the collection
+	ColumnDataAppendState append_state;
+	//! How many rows have been copied in total
+	idx_t rows_copied;
+	//! The current batch index
+	optional_idx batch_index;
+
+	void InitializeCollection(ClientContext &context, const PhysicalOperator &op) {
+		collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), op.children[0]->types);
+		collection->InitializeAppend(append_state);
+	}
+};
+
+//===--------------------------------------------------------------------===//
+// Sink
+//===--------------------------------------------------------------------===//
+SinkResultType PhysicalFixedBatchCopy::Sink(ExecutionContext &context, DataChunk &chunk,
+                                            OperatorSinkInput &input) const {
+	auto &state = input.local_state.Cast<FixedBatchCopyLocalState>();
+	if (!state.collection) {
+		state.InitializeCollection(context.client, *this);
+		state.batch_index = state.partition_info.batch_index.GetIndex();
+	}
+	state.rows_copied += chunk.size();
+	state.collection->Append(state.append_state, chunk);
+	return SinkResultType::NEED_MORE_INPUT;
+}
+
+void PhysicalFixedBatchCopy::Combine(ExecutionContext &context, GlobalSinkState &gstate_p,
+                                     LocalSinkState &lstate) const {
+	auto &state = lstate.Cast<FixedBatchCopyLocalState>();
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+	gstate.rows_copied += state.rows_copied;
+	if (!gstate.any_finished) {
+		// signal that this thread is finished processing batches and that we should move on to Finalize
+		lock_guard<mutex> l(gstate.lock);
+		gstate.any_finished = true;
+	}
+	ExecuteTasks(context.client, gstate);
+}
+
+//===--------------------------------------------------------------------===//
+// ProcessRemainingBatchesEvent
+//===--------------------------------------------------------------------===//
+class ProcessRemainingBatchesTask : public ExecutorTask {
+public:
+	ProcessRemainingBatchesTask(Executor &executor, shared_ptr<Event> event_p, FixedBatchCopyGlobalState &state_p,
+	                            ClientContext &context, const PhysicalFixedBatchCopy &op)
+	    : ExecutorTask(executor), event(std::move(event_p)), op(op), gstate(state_p), context(context) {
+	}
+
+	TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
+		while (op.ExecuteTask(context, gstate)) {
+			op.FlushBatchData(context, gstate, 0);
+		}
+		event->FinishTask();
+		return TaskExecutionResult::TASK_FINISHED;
+	}
+
+private:
+	shared_ptr<Event> event;
+	const PhysicalFixedBatchCopy &op;
+	FixedBatchCopyGlobalState &gstate;
+	ClientContext &context;
+};
+
+class ProcessRemainingBatchesEvent : public BasePipelineEvent {
+public:
+	ProcessRemainingBatchesEvent(const PhysicalFixedBatchCopy &op_p, FixedBatchCopyGlobalState &gstate_p,
+	                             Pipeline &pipeline_p, ClientContext &context)
+	    : BasePipelineEvent(pipeline_p), op(op_p), gstate(gstate_p), context(context) {
+	}
+	const PhysicalFixedBatchCopy &op;
+	FixedBatchCopyGlobalState &gstate;
+	ClientContext &context;
+
+public:
+	void Schedule() override {
+		vector<shared_ptr<Task>> tasks;
+		for (idx_t i = 0; i < idx_t(TaskScheduler::GetScheduler(context).NumberOfThreads()); i++) {
+			auto process_task =
+			    make_uniq<ProcessRemainingBatchesTask>(pipeline->executor, shared_from_this(), gstate, context, op);
+			tasks.push_back(std::move(process_task));
+		}
+		D_ASSERT(!tasks.empty());
+		SetTasks(std::move(tasks));
+	}
+
+	void FinishEvent() override {
+		//! Now that all batches are processed we finish flushing the file to disk
+		op.FinalFlush(context, gstate);
+	}
+};
+//===--------------------------------------------------------------------===//
+// Finalize
+//===--------------------------------------------------------------------===//
+SinkFinalizeType PhysicalFixedBatchCopy::FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+	if (gstate.TaskCount() != 0) {
+		throw InternalException("Unexecuted tasks are remaining in PhysicalFixedBatchCopy::FinalFlush!?");
+	}
+	idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+	FlushBatchData(context, gstate_p, min_batch_index);
+	if (gstate.scheduled_batch_index != gstate.flushed_batch_index) {
+		throw InternalException("Not all batches were flushed to disk - incomplete file?");
+	}
+	if (function.copy_to_finalize) {
+		function.copy_to_finalize(context, *bind_data, *gstate.global_state);
+
+		if (use_tmp_file) {
+			PhysicalCopyToFile::MoveTmpFile(context, file_path);
+		}
+	}
+	return SinkFinalizeType::READY;
+}
+
+SinkFinalizeType PhysicalFixedBatchCopy::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
+                                                  GlobalSinkState &gstate_p) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+	idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+	// repartition any remaining batches
+	RepartitionBatches(context, gstate_p, min_batch_index, true);
+	// check if we have multiple tasks to execute
+	if (gstate.TaskCount() <= 1) {
+		// we don't - just execute the remaining task and finish flushing to disk
+		ExecuteTasks(context, gstate_p);
+		FinalFlush(context, gstate_p);
+		return SinkFinalizeType::READY;
+	}
+	// we have multiple tasks remaining - launch an event to execute the tasks in parallel
+	auto new_event = make_shared<ProcessRemainingBatchesEvent>(*this, gstate, pipeline, context);
+	event.InsertEvent(std::move(new_event));
+	return SinkFinalizeType::READY;
+}
+
+//===--------------------------------------------------------------------===//
+// Tasks
+//===--------------------------------------------------------------------===//
+class RepartitionedFlushTask : public BatchCopyTask {
+public:
+	RepartitionedFlushTask() {
+	}
+
+	void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) override {
+		op.FlushBatchData(context, gstate_p, 0);
+	}
+};
+
+class PrepareBatchTask : public BatchCopyTask {
+public:
+	PrepareBatchTask(idx_t batch_index, unique_ptr<ColumnDataCollection> collection_p)
+	    : batch_index(batch_index), collection(std::move(collection_p)) {
+	}
+
+	idx_t batch_index;
+	unique_ptr<ColumnDataCollection> collection;
+
+	void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) override {
+		auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+		auto batch_data =
+		    op.function.prepare_batch(context, *op.bind_data, *gstate.global_state, std::move(collection));
+		gstate.AddBatchData(batch_index, std::move(batch_data));
+		if (batch_index == gstate.flushed_batch_index) {
+			gstate.AddTask(make_uniq<RepartitionedFlushTask>());
+		}
+	}
+};
+
+//===--------------------------------------------------------------------===//
+// Batch Data Handling
+//===--------------------------------------------------------------------===//
+void PhysicalFixedBatchCopy::AddRawBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
+                                             unique_ptr<ColumnDataCollection> collection) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+	// add the batch index to the set of raw batches
+	lock_guard<mutex> l(gstate.lock);
+	auto entry = gstate.raw_batches.insert(make_pair(batch_index, std::move(collection)));
+	if (!entry.second) {
+		throw InternalException("Duplicate batch index %llu encountered in PhysicalFixedBatchCopy", batch_index);
+	}
+}
+
+static bool CorrectSizeForBatch(idx_t collection_size, idx_t desired_size) {
+	return idx_t(AbsValue<int64_t>(int64_t(collection_size) - int64_t(desired_size))) < STANDARD_VECTOR_SIZE;
+}
+
+void PhysicalFixedBatchCopy::RepartitionBatches(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index,
+                                                bool final) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+	// repartition batches until the min index is reached
+	lock_guard<mutex> l(gstate.lock);
+	if (gstate.raw_batches.empty()) {
+		return;
+	}
+	if (!final) {
+		if (gstate.any_finished) {
+			// we only repartition in ::NextBatch if all threads are still busy processing batches
+			// otherwise we might end up repartitioning a lot of data with only a few threads remaining
+			// which causes erratic performance
+			return;
+		}
+		// if this is not the final flush we first check if we have enough data to merge past the batch threshold
+		idx_t candidate_rows = 0;
+		for (auto entry = gstate.raw_batches.begin(); entry != gstate.raw_batches.end(); entry++) {
+			if (entry->first >= min_index) {
+				// we have exceeded the minimum batch
+				break;
+			}
+			candidate_rows += entry->second->Count();
+		}
+		if (candidate_rows < gstate.batch_size) {
+			// not enough rows - cancel!
+			return;
+		}
+	}
+	// gather all collections we can repartition
+	idx_t max_batch_index = 0;
+	vector<unique_ptr<ColumnDataCollection>> collections;
+	for (auto entry = gstate.raw_batches.begin(); entry != gstate.raw_batches.end();) {
+		if (entry->first >= min_index) {
+			break;
+		}
+		max_batch_index = entry->first;
+		collections.push_back(std::move(entry->second));
+		entry = gstate.raw_batches.erase(entry);
+	}
+	unique_ptr<ColumnDataCollection> current_collection;
+	ColumnDataAppendState append_state;
+	// now perform the actual repartitioning
+	for (auto &collection : collections) {
+		if (!current_collection) {
+			if (CorrectSizeForBatch(collection->Count(), gstate.batch_size)) {
+				// the collection is ~approximately equal to the batch size (off by at most one vector)
+				// use it directly
+				gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(collection)));
+				collection.reset();
+			} else if (collection->Count() < gstate.batch_size) {
+				// the collection is smaller than the batch size - use it as a starting point
+				current_collection = std::move(collection);
+				collection.reset();
+			} else {
+				// the collection is too large for a batch - we need to repartition
+				// create an empty collection
+				current_collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), children[0]->types);
+			}
+			if (current_collection) {
+				current_collection->InitializeAppend(append_state);
+			}
+		}
+		if (!collection) {
+			// we have consumed the collection already - no need to append
+			continue;
+		}
+		// iterate the collection while appending
+		for (auto &chunk : collection->Chunks()) {
+			// append the chunk to the collection
+			current_collection->Append(append_state, chunk);
+			if (current_collection->Count() < gstate.batch_size) {
+				// the collection is still under the batch size - continue
+				continue;
+			}
+			// the collection is full - move it to the result and create a new one
+			gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(current_collection)));
+			current_collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), children[0]->types);
+			current_collection->InitializeAppend(append_state);
+		}
+	}
+	if (current_collection && current_collection->Count() > 0) {
+		// if there are any remaining batches that are not filled up to the batch size
+		// AND this is not the final collection
+		// re-add it to the set of raw (to-be-merged) batches
+		if (final || CorrectSizeForBatch(current_collection->Count(), gstate.batch_size)) {
+			gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(current_collection)));
+		} else {
+			gstate.raw_batches[max_batch_index] = std::move(current_collection);
+		}
+	}
+}
+
+void PhysicalFixedBatchCopy::FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+	// flush batch data to disk (if there are any to flush)
+	// grab the flush lock - we can only call flush_batch with this lock
+	// otherwise the data might end up in the wrong order
+	{
+		lock_guard<mutex> l(gstate.flush_lock);
+		if (gstate.any_flushing) {
+			return;
+		}
+		gstate.any_flushing = true;
+	}
+	ActiveFlushGuard active_flush(gstate.any_flushing);
+	while (true) {
+		unique_ptr<PreparedBatchData> batch_data;
+		{
+			lock_guard<mutex> l(gstate.lock);
+			if (gstate.batch_data.empty()) {
+				// no batch data left to flush
+				break;
+			}
+			auto entry = gstate.batch_data.begin();
+			if (entry->first != gstate.flushed_batch_index) {
+				// this entry is not yet ready to be flushed
+				break;
+			}
+			if (entry->first < gstate.flushed_batch_index) {
+				throw InternalException("Batch index was out of order!?");
+			}
+			batch_data = std::move(entry->second);
+			gstate.batch_data.erase(entry);
+		}
+		function.flush_batch(context, *bind_data, *gstate.global_state, *batch_data);
+		gstate.flushed_batch_index++;
+	}
+}
+
+//===--------------------------------------------------------------------===//
+// Tasks
+//===--------------------------------------------------------------------===//
+bool PhysicalFixedBatchCopy::ExecuteTask(ClientContext &context, GlobalSinkState &gstate_p) const {
+	auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+	auto task = gstate.GetTask();
+	if (!task) {
+		return false;
+	}
+	task->Execute(*this, context, gstate_p);
+	return true;
+}
+
+void PhysicalFixedBatchCopy::ExecuteTasks(ClientContext &context, GlobalSinkState &gstate_p) const {
+	while (ExecuteTask(context, gstate_p)) {
+	}
+}
+
+//===--------------------------------------------------------------------===//
+// Next Batch
+//===--------------------------------------------------------------------===//
+void PhysicalFixedBatchCopy::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p,
+                                       LocalSinkState &lstate) const {
+	auto &state = lstate.Cast<FixedBatchCopyLocalState>();
+	if (state.collection && state.collection->Count() > 0) {
+		// we finished processing this batch
+		// start flushing data
+		auto min_batch_index = lstate.partition_info.min_batch_index.GetIndex();
+		// push the raw batch data into the set of unprocessed batches
+		AddRawBatchData(context.client, gstate_p, state.batch_index.GetIndex(), std::move(state.collection));
+		// attempt to repartition to our desired batch size
+		RepartitionBatches(context.client, gstate_p, min_batch_index);
+		// execute a single batch task
+		ExecuteTask(context.client, gstate_p);
+		FlushBatchData(context.client, gstate_p, min_batch_index);
+	}
+	state.batch_index = lstate.partition_info.batch_index.GetIndex();
+
+	state.InitializeCollection(context.client, *this);
+}
+
+unique_ptr<LocalSinkState> PhysicalFixedBatchCopy::GetLocalSinkState(ExecutionContext &context) const {
+	return make_uniq<FixedBatchCopyLocalState>(function.copy_to_initialize_local(context, *bind_data));
+}
+
+unique_ptr<GlobalSinkState> PhysicalFixedBatchCopy::GetGlobalSinkState(ClientContext &context) const {
+	auto result =
+	    make_uniq<FixedBatchCopyGlobalState>(function.copy_to_initialize_global(context, *bind_data, file_path));
+	result->batch_size = function.desired_batch_size(context, *bind_data);
+	return std::move(result);
+}
+
+//===--------------------------------------------------------------------===//
+// Source
+//===--------------------------------------------------------------------===//
+SourceResultType PhysicalFixedBatchCopy::GetData(ExecutionContext &context, DataChunk &chunk,
+                                                 OperatorSourceInput &input) const {
+	auto &g = sink_state->Cast<FixedBatchCopyGlobalState>();
+
+	chunk.SetCardinality(1);
+	chunk.SetValue(0, 0, Value::BIGINT(g.rows_copied));
+	return SourceResultType::FINISHED;
+}
+
+} // namespace duckdb
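Note: RepartitionBatches above merges variable-size buffered batches into batches of roughly the copy function's desired size, emitting a batch as soon as the running row count reaches the target and only emitting a partially filled remainder on the final pass. The following is a minimal standalone sketch of that merging strategy, illustrative only; it uses plain row counts instead of DuckDB's ColumnDataCollection, and all names (VECTOR_SIZE, CorrectSize, Repartition) are hypothetical rather than DuckDB API.

#include <cstddef>
#include <vector>

static constexpr std::size_t VECTOR_SIZE = 2048; // stand-in for STANDARD_VECTOR_SIZE

// a batch is "correctly sized" if it is within one vector of the desired size
static bool CorrectSize(std::size_t count, std::size_t desired) {
	auto diff = count > desired ? count - desired : desired - count;
	return diff < VECTOR_SIZE;
}

// inputs: one entry per buffered collection, holding the sizes of its chunks
// returns: the row counts of the merged batches that would be prepared and flushed
static std::vector<std::size_t> Repartition(const std::vector<std::vector<std::size_t>> &inputs,
                                            std::size_t batch_size, bool final) {
	std::vector<std::size_t> output;
	std::size_t current = 0;
	for (const auto &collection : inputs) {
		std::size_t total = 0;
		for (auto chunk : collection) {
			total += chunk;
		}
		if (current == 0 && CorrectSize(total, batch_size)) {
			output.push_back(total); // already approximately the right size - use it directly
			continue;
		}
		for (auto chunk : collection) {
			current += chunk; // append chunk-wise, emitting once the target is reached
			if (current >= batch_size) {
				output.push_back(current);
				current = 0;
			}
		}
	}
	if (current > 0 && (final || CorrectSize(current, batch_size))) {
		output.push_back(current); // leftover rows are only emitted on the final pass
	}
	return output;
}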
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp
CHANGED
@@ -1,6 +1,7 @@
 #include "duckdb/execution/physical_plan_generator.hpp"
 #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
 #include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
+#include "duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp"
 #include "duckdb/planner/operator/logical_copy_to_file.hpp"
 
 namespace duckdb {
@@ -28,12 +29,21 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
 			throw InternalException("BATCH_COPY_TO_FILE can only be used if batch indexes are supported");
 		}
 		// batched copy to file
-		auto copy = make_uniq<PhysicalBatchCopyToFile>(op.types, op.function, std::move(op.bind_data),
-		                                               op.estimated_cardinality);
-		copy->file_path = op.file_path;
-		copy->use_tmp_file = op.use_tmp_file;
-		copy->children.push_back(std::move(plan));
-		return std::move(copy);
+		if (op.function.desired_batch_size) {
+			auto copy = make_uniq<PhysicalFixedBatchCopy>(op.types, op.function, std::move(op.bind_data),
+			                                              op.estimated_cardinality);
+			copy->file_path = op.file_path;
+			copy->use_tmp_file = op.use_tmp_file;
+			copy->children.push_back(std::move(plan));
+			return std::move(copy);
+		} else {
+			auto copy = make_uniq<PhysicalBatchCopyToFile>(op.types, op.function, std::move(op.bind_data),
+			                                               op.estimated_cardinality);
+			copy->file_path = op.file_path;
+			copy->use_tmp_file = op.use_tmp_file;
+			copy->children.push_back(std::move(plan));
+			return std::move(copy);
+		}
 	}
 	// COPY from select statement to file
 	auto copy = make_uniq<PhysicalCopyToFile>(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality);
package/src/duckdb/src/execution/window_segment_tree.cpp
CHANGED
@@ -106,18 +106,10 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
 	auto end = partition_end - chunk_begin;
 
 	inputs.Reset();
-	if (begin) {
-		for (idx_t c = 0; c < payload_chunk.ColumnCount(); ++c) {
-			inputs.data[c].Slice(payload_chunk.data[c], begin, end);
-		}
-	} else {
-		inputs.Reference(payload_chunk);
-	}
-	inputs.SetCardinality(end - begin);
-
-	// Slice to any filtered rows
-	SelectionVector sel;
 	if (filter_sel) {
+		// Slice to any filtered rows in [begin, end)
+		SelectionVector sel;
+
 		// Find the first value in [begin, end)
 		for (; filter_idx < filtered; ++filter_idx) {
 			auto idx = filter_sel->get_index(filter_idx);
@@ -125,7 +117,9 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
 				break;
 			}
 		}
-
+
+		// Find the first value in [end, filtered)
+		sel.Initialize(filter_sel->data() + filter_idx);
 		idx_t nsel = 0;
 		for (; filter_idx < filtered; ++filter_idx, ++nsel) {
 			auto idx = filter_sel->get_index(filter_idx);
@@ -135,8 +129,18 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
 			}
 		}
 		if (nsel != inputs.size()) {
-			inputs.Slice(sel, nsel);
+			inputs.Slice(payload_chunk, sel, nsel);
+		}
+	} else {
+		// Slice to [begin, end)
+		if (begin) {
+			for (idx_t c = 0; c < payload_chunk.ColumnCount(); ++c) {
+				inputs.data[c].Slice(payload_chunk.data[c], begin, end);
+			}
+		} else {
+			inputs.Reference(payload_chunk);
 		}
+		inputs.SetCardinality(end - begin);
 	}
 
 	// Aggregate the filtered rows into a single state
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev3117"
+#define DUCKDB_VERSION "0.7.2-dev3154"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "eddb84d5ca"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/core_functions/scalar/map_functions.hpp
CHANGED
@@ -43,7 +43,10 @@ struct MapEntriesFun {
 struct MapExtractFun {
 	static constexpr const char *Name = "map_extract";
 	static constexpr const char *Parameters = "map,key";
-	static constexpr const char *Description =
+	static constexpr const char *Description =
+	    "Return a list containing the value for a given key or an empty list if the key is not contained in the map. "
+	    "The type of the key provided in the second parameter must match the type of the map’s keys else an error is "
+	    "returned.";
 	static constexpr const char *Example = "map_extract(map(['key'], ['val']), 'key')";
 
 	static ScalarFunction GetFunction();
@@ -64,6 +67,16 @@ struct MapFromEntriesFun {
 	static ScalarFunction GetFunction();
 };
 
+struct MapConcatFun {
+	static constexpr const char *Name = "map_concat";
+	static constexpr const char *Parameters = "any,...";
+	static constexpr const char *Description = "Returns a map created from merging the input maps, on key collision "
+	                                           "the value is taken from the last map with that key";
+	static constexpr const char *Example = "map_concat(map([1,2], ['a', 'b']), map([2,3], ['c', 'd']));";
+
+	static ScalarFunction GetFunction();
+};
+
 struct MapKeysFun {
 	static constexpr const char *Name = "map_keys";
 	static constexpr const char *Parameters = "map";
package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp
CHANGED
@@ -64,5 +64,18 @@ private:
 	void PrepareBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
 	                      unique_ptr<ColumnDataCollection> collection) const;
 	void FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const;
+	SinkFinalizeType FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const;
 };
+
+struct ActiveFlushGuard {
+	explicit ActiveFlushGuard(atomic<bool> &bool_value_p) : bool_value(bool_value_p) {
+		bool_value = true;
+	}
+	~ActiveFlushGuard() {
+		bool_value = false;
+	}
+
+	atomic<bool> &bool_value;
+};
+
 } // namespace duckdb
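Note: the ActiveFlushGuard added above is a small RAII helper. FlushBatchData sets any_flushing under the flush lock and relies on the guard's destructor to clear the flag on every exit path, so at most one thread is ever marked as flushing. A minimal standalone sketch of that pattern follows; it is illustrative only, and FlagGuard, FlushAll, and the atomic flag are hypothetical names rather than DuckDB API.

#include <atomic>

struct FlagGuard { // simplified stand-in for duckdb::ActiveFlushGuard
	explicit FlagGuard(std::atomic<bool> &flag_p) : flag(flag_p) {
		flag = true;
	}
	~FlagGuard() {
		flag = false; // runs on every exit path, including early returns and exceptions
	}
	std::atomic<bool> &flag;
};

static std::atomic<bool> any_flushing {false};

void FlushAll() {
	if (any_flushing.exchange(true)) {
		return; // another thread is already flushing - nothing to do
	}
	FlagGuard guard(any_flushing); // the flag is cleared again when 'guard' goes out of scope
	// ... flush prepared batches in batch-index order ...
}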