duckdb 0.7.2-dev3117.0 → 0.7.2-dev3154.0

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
Files changed (24)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +7 -0
  3. package/src/duckdb/extension/parquet/parquet-extension.cpp +42 -0
  4. package/src/duckdb/extension/parquet/parquet_writer.cpp +23 -9
  5. package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
  6. package/src/duckdb/src/common/types/vector.cpp +4 -5
  7. package/src/duckdb/src/common/types/vector_buffer.cpp +1 -1
  8. package/src/duckdb/src/core_functions/function_list.cpp +1 -0
  9. package/src/duckdb/src/core_functions/scalar/map/map_concat.cpp +186 -0
  10. package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +65 -21
  11. package/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp +494 -0
  12. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +16 -6
  13. package/src/duckdb/src/execution/window_segment_tree.cpp +17 -13
  14. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  15. package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
  16. package/src/duckdb/src/include/duckdb/core_functions/scalar/map_functions.hpp +14 -1
  17. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp +13 -0
  18. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp +72 -0
  19. package/src/duckdb/src/include/duckdb/function/copy_function.hpp +3 -1
  20. package/src/duckdb/src/planner/operator/logical_delete.cpp +2 -0
  21. package/src/duckdb/src/planner/operator/logical_update.cpp +2 -0
  22. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +1 -0
  23. package/src/duckdb/ub_src_core_functions_scalar_map.cpp +2 -0
  24. package/src/duckdb/ub_src_execution_operator_persistent.cpp +2 -0
package/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp (new file)
@@ -0,0 +1,494 @@
+ #include "duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp"
+ #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
+ #include "duckdb/parallel/base_pipeline_event.hpp"
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
+ #include "duckdb/common/types/batched_data_collection.hpp"
+ #include "duckdb/common/allocator.hpp"
+ #include "duckdb/common/queue.hpp"
+ #include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
+
+ #include <algorithm>
+
+ namespace duckdb {
+
+ PhysicalFixedBatchCopy::PhysicalFixedBatchCopy(vector<LogicalType> types, CopyFunction function_p,
+ unique_ptr<FunctionData> bind_data_p, idx_t estimated_cardinality)
+ : PhysicalOperator(PhysicalOperatorType::BATCH_COPY_TO_FILE, std::move(types), estimated_cardinality),
+ function(std::move(function_p)), bind_data(std::move(bind_data_p)) {
+ if (!function.flush_batch || !function.prepare_batch || !function.desired_batch_size) {
+ throw InternalException("PhysicalFixedBatchCopy created for copy function that does not have "
+ "prepare_batch/flush_batch/desired_batch_size defined");
+ }
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Sink
+ //===--------------------------------------------------------------------===//
+ class BatchCopyTask {
+ public:
+ virtual ~BatchCopyTask() {
+ }
+
+ virtual void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) = 0;
+ };
+
+ //===--------------------------------------------------------------------===//
+ // States
+ //===--------------------------------------------------------------------===//
+ class FixedBatchCopyGlobalState : public GlobalSinkState {
+ public:
+ explicit FixedBatchCopyGlobalState(unique_ptr<GlobalFunctionData> global_state)
+ : rows_copied(0), global_state(std::move(global_state)), batch_size(0), scheduled_batch_index(0),
+ flushed_batch_index(0), any_flushing(false), any_finished(false) {
+ }
+
+ mutex lock;
+ mutex flush_lock;
+ //! The total number of rows copied to the file
+ atomic<idx_t> rows_copied;
+ //! Global copy state
+ unique_ptr<GlobalFunctionData> global_state;
+ //! The desired batch size (if any)
+ idx_t batch_size;
+ //! Unpartitioned batches - only used in case batch_size is required
+ map<idx_t, unique_ptr<ColumnDataCollection>> raw_batches;
+ //! The prepared batch data by batch index - ready to flush
+ map<idx_t, unique_ptr<PreparedBatchData>> batch_data;
+ //! The index of the latest batch index that has been scheduled
+ atomic<idx_t> scheduled_batch_index;
+ //! The index of the latest batch index that has been flushed
+ atomic<idx_t> flushed_batch_index;
+ //! Whether or not any thread is flushing
+ atomic<bool> any_flushing;
+ //! Whether or not any threads are finished
+ atomic<bool> any_finished;
+
+ void AddTask(unique_ptr<BatchCopyTask> task) {
+ lock_guard<mutex> l(task_lock);
+ task_queue.push(std::move(task));
+ }
+
+ unique_ptr<BatchCopyTask> GetTask() {
+ lock_guard<mutex> l(task_lock);
+ if (task_queue.empty()) {
+ return nullptr;
+ }
+ auto entry = std::move(task_queue.front());
+ task_queue.pop();
+ return entry;
+ }
+
+ idx_t TaskCount() {
+ lock_guard<mutex> l(task_lock);
+ return task_queue.size();
+ }
+
+ void AddBatchData(idx_t batch_index, unique_ptr<PreparedBatchData> new_batch) {
+ // move the batch data to the set of prepared batch data
+ lock_guard<mutex> l(lock);
+ auto entry = batch_data.insert(make_pair(batch_index, std::move(new_batch)));
+ if (!entry.second) {
+ throw InternalException("Duplicate batch index %llu encountered in PhysicalFixedBatchCopy", batch_index);
+ }
+ }
+
+ private:
+ mutex task_lock;
+ //! The task queue for the batch copy to file
+ queue<unique_ptr<BatchCopyTask>> task_queue;
+ };
+
+ class FixedBatchCopyLocalState : public LocalSinkState {
+ public:
+ explicit FixedBatchCopyLocalState(unique_ptr<LocalFunctionData> local_state_p)
+ : local_state(std::move(local_state_p)), rows_copied(0) {
+ }
+
+ //! Local copy state
+ unique_ptr<LocalFunctionData> local_state;
+ //! The current collection we are appending to
+ unique_ptr<ColumnDataCollection> collection;
+ //! The append state of the collection
+ ColumnDataAppendState append_state;
+ //! How many rows have been copied in total
+ idx_t rows_copied;
+ //! The current batch index
+ optional_idx batch_index;
+
+ void InitializeCollection(ClientContext &context, const PhysicalOperator &op) {
+ collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), op.children[0]->types);
+ collection->InitializeAppend(append_state);
+ }
+ };
+
+ //===--------------------------------------------------------------------===//
+ // Sink
+ //===--------------------------------------------------------------------===//
+ SinkResultType PhysicalFixedBatchCopy::Sink(ExecutionContext &context, DataChunk &chunk,
+ OperatorSinkInput &input) const {
+ auto &state = input.local_state.Cast<FixedBatchCopyLocalState>();
+ if (!state.collection) {
+ state.InitializeCollection(context.client, *this);
+ state.batch_index = state.partition_info.batch_index.GetIndex();
+ }
+ state.rows_copied += chunk.size();
+ state.collection->Append(state.append_state, chunk);
+ return SinkResultType::NEED_MORE_INPUT;
+ }
+
+ void PhysicalFixedBatchCopy::Combine(ExecutionContext &context, GlobalSinkState &gstate_p,
+ LocalSinkState &lstate) const {
+ auto &state = lstate.Cast<FixedBatchCopyLocalState>();
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+ gstate.rows_copied += state.rows_copied;
+ if (!gstate.any_finished) {
+ // signal that this thread is finished processing batches and that we should move on to Finalize
+ lock_guard<mutex> l(gstate.lock);
+ gstate.any_finished = true;
+ }
+ ExecuteTasks(context.client, gstate);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // ProcessRemainingBatchesEvent
+ //===--------------------------------------------------------------------===//
+ class ProcessRemainingBatchesTask : public ExecutorTask {
+ public:
+ ProcessRemainingBatchesTask(Executor &executor, shared_ptr<Event> event_p, FixedBatchCopyGlobalState &state_p,
+ ClientContext &context, const PhysicalFixedBatchCopy &op)
+ : ExecutorTask(executor), event(std::move(event_p)), op(op), gstate(state_p), context(context) {
+ }
+
+ TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
+ while (op.ExecuteTask(context, gstate)) {
+ op.FlushBatchData(context, gstate, 0);
+ }
+ event->FinishTask();
+ return TaskExecutionResult::TASK_FINISHED;
+ }
+
+ private:
+ shared_ptr<Event> event;
+ const PhysicalFixedBatchCopy &op;
+ FixedBatchCopyGlobalState &gstate;
+ ClientContext &context;
+ };
+
+ class ProcessRemainingBatchesEvent : public BasePipelineEvent {
+ public:
+ ProcessRemainingBatchesEvent(const PhysicalFixedBatchCopy &op_p, FixedBatchCopyGlobalState &gstate_p,
+ Pipeline &pipeline_p, ClientContext &context)
+ : BasePipelineEvent(pipeline_p), op(op_p), gstate(gstate_p), context(context) {
+ }
+ const PhysicalFixedBatchCopy &op;
+ FixedBatchCopyGlobalState &gstate;
+ ClientContext &context;
+
+ public:
+ void Schedule() override {
+ vector<shared_ptr<Task>> tasks;
+ for (idx_t i = 0; i < idx_t(TaskScheduler::GetScheduler(context).NumberOfThreads()); i++) {
+ auto process_task =
+ make_uniq<ProcessRemainingBatchesTask>(pipeline->executor, shared_from_this(), gstate, context, op);
+ tasks.push_back(std::move(process_task));
+ }
+ D_ASSERT(!tasks.empty());
+ SetTasks(std::move(tasks));
+ }
+
+ void FinishEvent() override {
+ //! Now that all batches are processed we finish flushing the file to disk
+ op.FinalFlush(context, gstate);
+ }
+ };
+ //===--------------------------------------------------------------------===//
+ // Finalize
+ //===--------------------------------------------------------------------===//
+ SinkFinalizeType PhysicalFixedBatchCopy::FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+ if (gstate.TaskCount() != 0) {
+ throw InternalException("Unexecuted tasks are remaining in PhysicalFixedBatchCopy::FinalFlush!?");
+ }
+ idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+ FlushBatchData(context, gstate_p, min_batch_index);
+ if (gstate.scheduled_batch_index != gstate.flushed_batch_index) {
+ throw InternalException("Not all batches were flushed to disk - incomplete file?");
+ }
+ if (function.copy_to_finalize) {
+ function.copy_to_finalize(context, *bind_data, *gstate.global_state);
+
+ if (use_tmp_file) {
+ PhysicalCopyToFile::MoveTmpFile(context, file_path);
+ }
+ }
+ return SinkFinalizeType::READY;
+ }
+
+ SinkFinalizeType PhysicalFixedBatchCopy::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
+ GlobalSinkState &gstate_p) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+ idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+ // repartition any remaining batches
+ RepartitionBatches(context, gstate_p, min_batch_index, true);
+ // check if we have multiple tasks to execute
+ if (gstate.TaskCount() <= 1) {
+ // we don't - just execute the remaining task and finish flushing to disk
+ ExecuteTasks(context, gstate_p);
+ FinalFlush(context, gstate_p);
+ return SinkFinalizeType::READY;
+ }
+ // we have multiple tasks remaining - launch an event to execute the tasks in parallel
+ auto new_event = make_shared<ProcessRemainingBatchesEvent>(*this, gstate, pipeline, context);
+ event.InsertEvent(std::move(new_event));
+ return SinkFinalizeType::READY;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Tasks
+ //===--------------------------------------------------------------------===//
+ class RepartitionedFlushTask : public BatchCopyTask {
+ public:
+ RepartitionedFlushTask() {
+ }
+
+ void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) override {
+ op.FlushBatchData(context, gstate_p, 0);
+ }
+ };
+
+ class PrepareBatchTask : public BatchCopyTask {
+ public:
+ PrepareBatchTask(idx_t batch_index, unique_ptr<ColumnDataCollection> collection_p)
+ : batch_index(batch_index), collection(std::move(collection_p)) {
+ }
+
+ idx_t batch_index;
+ unique_ptr<ColumnDataCollection> collection;
+
+ void Execute(const PhysicalFixedBatchCopy &op, ClientContext &context, GlobalSinkState &gstate_p) override {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+ auto batch_data =
+ op.function.prepare_batch(context, *op.bind_data, *gstate.global_state, std::move(collection));
+ gstate.AddBatchData(batch_index, std::move(batch_data));
+ if (batch_index == gstate.flushed_batch_index) {
+ gstate.AddTask(make_uniq<RepartitionedFlushTask>());
+ }
+ }
+ };
+
+ //===--------------------------------------------------------------------===//
+ // Batch Data Handling
+ //===--------------------------------------------------------------------===//
+ void PhysicalFixedBatchCopy::AddRawBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
+ unique_ptr<ColumnDataCollection> collection) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+ // add the batch index to the set of raw batches
+ lock_guard<mutex> l(gstate.lock);
+ auto entry = gstate.raw_batches.insert(make_pair(batch_index, std::move(collection)));
+ if (!entry.second) {
+ throw InternalException("Duplicate batch index %llu encountered in PhysicalFixedBatchCopy", batch_index);
+ }
+ }
+
+ static bool CorrectSizeForBatch(idx_t collection_size, idx_t desired_size) {
+ return idx_t(AbsValue<int64_t>(int64_t(collection_size) - int64_t(desired_size))) < STANDARD_VECTOR_SIZE;
+ }
+
+ void PhysicalFixedBatchCopy::RepartitionBatches(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index,
+ bool final) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+ // repartition batches until the min index is reached
+ lock_guard<mutex> l(gstate.lock);
+ if (gstate.raw_batches.empty()) {
+ return;
+ }
+ if (!final) {
+ if (gstate.any_finished) {
+ // we only repartition in ::NextBatch if all threads are still busy processing batches
+ // otherwise we might end up repartitioning a lot of data with only a few threads remaining
+ // which causes erratic performance
+ return;
+ }
+ // if this is not the final flush we first check if we have enough data to merge past the batch threshold
+ idx_t candidate_rows = 0;
+ for (auto entry = gstate.raw_batches.begin(); entry != gstate.raw_batches.end(); entry++) {
+ if (entry->first >= min_index) {
+ // we have exceeded the minimum batch
+ break;
+ }
+ candidate_rows += entry->second->Count();
+ }
+ if (candidate_rows < gstate.batch_size) {
+ // not enough rows - cancel!
+ return;
+ }
+ }
+ // gather all collections we can repartition
+ idx_t max_batch_index = 0;
+ vector<unique_ptr<ColumnDataCollection>> collections;
+ for (auto entry = gstate.raw_batches.begin(); entry != gstate.raw_batches.end();) {
+ if (entry->first >= min_index) {
+ break;
+ }
+ max_batch_index = entry->first;
+ collections.push_back(std::move(entry->second));
+ entry = gstate.raw_batches.erase(entry);
+ }
+ unique_ptr<ColumnDataCollection> current_collection;
+ ColumnDataAppendState append_state;
+ // now perform the actual repartitioning
+ for (auto &collection : collections) {
+ if (!current_collection) {
+ if (CorrectSizeForBatch(collection->Count(), gstate.batch_size)) {
+ // the collection is ~approximately equal to the batch size (off by at most one vector)
+ // use it directly
+ gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(collection)));
+ collection.reset();
+ } else if (collection->Count() < gstate.batch_size) {
+ // the collection is smaller than the batch size - use it as a starting point
+ current_collection = std::move(collection);
+ collection.reset();
+ } else {
+ // the collection is too large for a batch - we need to repartition
+ // create an empty collection
+ current_collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), children[0]->types);
+ }
+ if (current_collection) {
+ current_collection->InitializeAppend(append_state);
+ }
+ }
+ if (!collection) {
+ // we have consumed the collection already - no need to append
+ continue;
+ }
+ // iterate the collection while appending
+ for (auto &chunk : collection->Chunks()) {
+ // append the chunk to the collection
+ current_collection->Append(append_state, chunk);
+ if (current_collection->Count() < gstate.batch_size) {
+ // the collection is still under the batch size - continue
+ continue;
+ }
+ // the collection is full - move it to the result and create a new one
+ gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(current_collection)));
+ current_collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), children[0]->types);
+ current_collection->InitializeAppend(append_state);
+ }
+ }
+ if (current_collection && current_collection->Count() > 0) {
+ // if there are any remaining batches that are not filled up to the batch size
+ // AND this is not the final collection
+ // re-add it to the set of raw (to-be-merged) batches
+ if (final || CorrectSizeForBatch(current_collection->Count(), gstate.batch_size)) {
+ gstate.AddTask(make_uniq<PrepareBatchTask>(gstate.scheduled_batch_index++, std::move(current_collection)));
+ } else {
+ gstate.raw_batches[max_batch_index] = std::move(current_collection);
+ }
+ }
+ }
+
+ void PhysicalFixedBatchCopy::FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+
+ // flush batch data to disk (if there are any to flush)
+ // grab the flush lock - we can only call flush_batch with this lock
+ // otherwise the data might end up in the wrong order
+ {
+ lock_guard<mutex> l(gstate.flush_lock);
+ if (gstate.any_flushing) {
+ return;
+ }
+ gstate.any_flushing = true;
+ }
+ ActiveFlushGuard active_flush(gstate.any_flushing);
+ while (true) {
+ unique_ptr<PreparedBatchData> batch_data;
+ {
+ lock_guard<mutex> l(gstate.lock);
+ if (gstate.batch_data.empty()) {
+ // no batch data left to flush
+ break;
+ }
+ auto entry = gstate.batch_data.begin();
+ if (entry->first != gstate.flushed_batch_index) {
+ // this entry is not yet ready to be flushed
+ break;
+ }
+ if (entry->first < gstate.flushed_batch_index) {
+ throw InternalException("Batch index was out of order!?");
+ }
+ batch_data = std::move(entry->second);
+ gstate.batch_data.erase(entry);
+ }
+ function.flush_batch(context, *bind_data, *gstate.global_state, *batch_data);
+ gstate.flushed_batch_index++;
+ }
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Tasks
+ //===--------------------------------------------------------------------===//
+ bool PhysicalFixedBatchCopy::ExecuteTask(ClientContext &context, GlobalSinkState &gstate_p) const {
+ auto &gstate = gstate_p.Cast<FixedBatchCopyGlobalState>();
+ auto task = gstate.GetTask();
+ if (!task) {
+ return false;
+ }
+ task->Execute(*this, context, gstate_p);
+ return true;
+ }
+
+ void PhysicalFixedBatchCopy::ExecuteTasks(ClientContext &context, GlobalSinkState &gstate_p) const {
+ while (ExecuteTask(context, gstate_p)) {
+ }
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Next Batch
+ //===--------------------------------------------------------------------===//
+ void PhysicalFixedBatchCopy::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p,
+ LocalSinkState &lstate) const {
+ auto &state = lstate.Cast<FixedBatchCopyLocalState>();
+ if (state.collection && state.collection->Count() > 0) {
+ // we finished processing this batch
+ // start flushing data
+ auto min_batch_index = lstate.partition_info.min_batch_index.GetIndex();
+ // push the raw batch data into the set of unprocessed batches
+ AddRawBatchData(context.client, gstate_p, state.batch_index.GetIndex(), std::move(state.collection));
+ // attempt to repartition to our desired batch size
+ RepartitionBatches(context.client, gstate_p, min_batch_index);
+ // execute a single batch task
+ ExecuteTask(context.client, gstate_p);
+ FlushBatchData(context.client, gstate_p, min_batch_index);
+ }
+ state.batch_index = lstate.partition_info.batch_index.GetIndex();
+
+ state.InitializeCollection(context.client, *this);
+ }
+
+ unique_ptr<LocalSinkState> PhysicalFixedBatchCopy::GetLocalSinkState(ExecutionContext &context) const {
+ return make_uniq<FixedBatchCopyLocalState>(function.copy_to_initialize_local(context, *bind_data));
+ }
+
+ unique_ptr<GlobalSinkState> PhysicalFixedBatchCopy::GetGlobalSinkState(ClientContext &context) const {
+ auto result =
+ make_uniq<FixedBatchCopyGlobalState>(function.copy_to_initialize_global(context, *bind_data, file_path));
+ result->batch_size = function.desired_batch_size(context, *bind_data);
+ return std::move(result);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Source
+ //===--------------------------------------------------------------------===//
+ SourceResultType PhysicalFixedBatchCopy::GetData(ExecutionContext &context, DataChunk &chunk,
+ OperatorSourceInput &input) const {
+ auto &g = sink_state->Cast<FixedBatchCopyGlobalState>();
+
+ chunk.SetCardinality(1);
+ chunk.SetValue(0, 0, Value::BIGINT(g.rows_copied));
+ return SourceResultType::FINISHED;
+ }
+
+ } // namespace duckdb
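
PhysicalFixedBatchCopy above lets worker threads prepare batches in any order (PrepareBatchTask), but FlushBatchData only ever writes the batch whose index equals flushed_batch_index, so the output file preserves batch order. A minimal self-contained sketch of that ordering scheme in standard C++ follows; OrderedFlusher, PreparedBatch and the driver in main are illustrative names, not DuckDB API.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <mutex>

// Illustrative stand-in for a batch that has already been prepared for writing.
struct PreparedBatch {
	uint64_t rows = 0;
};

class OrderedFlusher {
public:
	// producers: register a prepared batch under its batch index, in any order
	void AddBatch(uint64_t batch_index, std::unique_ptr<PreparedBatch> batch) {
		std::lock_guard<std::mutex> guard(lock);
		batch_data.emplace(batch_index, std::move(batch));
	}

	// consumer: flush batches strictly in ascending index order; stop as soon as
	// the next expected index has not been prepared yet
	void FlushAvailable() {
		while (true) {
			std::unique_ptr<PreparedBatch> next;
			{
				std::lock_guard<std::mutex> guard(lock);
				auto entry = batch_data.find(flushed_batch_index);
				if (entry == batch_data.end()) {
					break; // keep order: wait until the missing batch arrives
				}
				next = std::move(entry->second);
				batch_data.erase(entry);
			}
			std::printf("flushing batch %llu (%llu rows)\n",
			            static_cast<unsigned long long>(flushed_batch_index.load()),
			            static_cast<unsigned long long>(next->rows));
			flushed_batch_index++;
		}
	}

private:
	std::mutex lock;
	std::map<uint64_t, std::unique_ptr<PreparedBatch>> batch_data;
	std::atomic<uint64_t> flushed_batch_index {0};
};

int main() {
	OrderedFlusher flusher;
	auto batch1 = std::make_unique<PreparedBatch>();
	batch1->rows = 2048;
	flusher.AddBatch(1, std::move(batch1)); // batch 1 finishes first
	flusher.FlushAvailable();               // flushes nothing: batch 0 is missing
	auto batch0 = std::make_unique<PreparedBatch>();
	batch0->rows = 1000;
	flusher.AddBatch(0, std::move(batch0));
	flusher.FlushAvailable();               // now flushes batch 0, then batch 1
	return 0;
}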
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp
@@ -1,6 +1,7 @@
  #include "duckdb/execution/physical_plan_generator.hpp"
  #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
  #include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
+ #include "duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp"
  #include "duckdb/planner/operator/logical_copy_to_file.hpp"

  namespace duckdb {
@@ -28,12 +29,21 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
  throw InternalException("BATCH_COPY_TO_FILE can only be used if batch indexes are supported");
  }
  // batched copy to file
- auto copy = make_uniq<PhysicalBatchCopyToFile>(op.types, op.function, std::move(op.bind_data),
- op.estimated_cardinality);
- copy->file_path = op.file_path;
- copy->use_tmp_file = op.use_tmp_file;
- copy->children.push_back(std::move(plan));
- return std::move(copy);
+ if (op.function.desired_batch_size) {
+ auto copy = make_uniq<PhysicalFixedBatchCopy>(op.types, op.function, std::move(op.bind_data),
+ op.estimated_cardinality);
+ copy->file_path = op.file_path;
+ copy->use_tmp_file = op.use_tmp_file;
+ copy->children.push_back(std::move(plan));
+ return std::move(copy);
+ } else {
+ auto copy = make_uniq<PhysicalBatchCopyToFile>(op.types, op.function, std::move(op.bind_data),
+ op.estimated_cardinality);
+ copy->file_path = op.file_path;
+ copy->use_tmp_file = op.use_tmp_file;
+ copy->children.push_back(std::move(plan));
+ return std::move(copy);
+ }
  }
  // COPY from select statement to file
  auto copy = make_uniq<PhysicalCopyToFile>(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality);
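
The planner change above routes batched COPY through the new operator only when the copy function exposes a desired_batch_size callback (which the Parquet writer changes elsewhere in this diff appear to add), and otherwise keeps using PhysicalBatchCopyToFile. A condensed standalone sketch of dispatching on such an optional callback; CopyFunction here is a toy stand-in, not DuckDB's struct.

#include <cstdio>
#include <functional>

// Toy stand-in: an empty std::function plays the role of an unset callback.
struct CopyFunction {
	std::function<int()> desired_batch_size;
};

static const char *ChooseCopyOperator(const CopyFunction &function) {
	if (function.desired_batch_size) {
		return "PhysicalFixedBatchCopy";  // repartitions batches to a fixed size
	}
	return "PhysicalBatchCopyToFile";     // plain batched copy, no repartitioning
}

int main() {
	CopyFunction with_batch_size;
	with_batch_size.desired_batch_size = [] { return 122880; }; // e.g. a row-group target
	CopyFunction without_batch_size;

	std::printf("%s\n", ChooseCopyOperator(with_batch_size));    // PhysicalFixedBatchCopy
	std::printf("%s\n", ChooseCopyOperator(without_batch_size)); // PhysicalBatchCopyToFile
	return 0;
}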
package/src/duckdb/src/execution/window_segment_tree.cpp
@@ -106,18 +106,10 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
  auto end = partition_end - chunk_begin;

  inputs.Reset();
- if (begin) {
- for (idx_t c = 0; c < payload_chunk.ColumnCount(); ++c) {
- inputs.data[c].Slice(payload_chunk.data[c], begin, end);
- }
- } else {
- inputs.Reference(payload_chunk);
- }
- inputs.SetCardinality(end - begin);
-
- // Slice to any filtered rows
- SelectionVector sel;
  if (filter_sel) {
+ // Slice to any filtered rows in [begin, end)
+ SelectionVector sel;
+
  // Find the first value in [begin, end)
  for (; filter_idx < filtered; ++filter_idx) {
  auto idx = filter_sel->get_index(filter_idx);
@@ -125,7 +117,9 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
  break;
  }
  }
- sel.Initialize(filter_sel->data());
+
+ // Find the first value in [end, filtered)
+ sel.Initialize(filter_sel->data() + filter_idx);
  idx_t nsel = 0;
  for (; filter_idx < filtered; ++filter_idx, ++nsel) {
  auto idx = filter_sel->get_index(filter_idx);
@@ -135,8 +129,18 @@ void WindowConstantAggregate::Sink(DataChunk &payload_chunk, SelectionVector *fi
  }

  if (nsel != inputs.size()) {
- inputs.Slice(sel, nsel);
+ inputs.Slice(payload_chunk, sel, nsel);
+ }
+ } else {
+ // Slice to [begin, end)
+ if (begin) {
+ for (idx_t c = 0; c < payload_chunk.ColumnCount(); ++c) {
+ inputs.data[c].Slice(payload_chunk.data[c], begin, end);
+ }
+ } else {
+ inputs.Reference(payload_chunk);
  }
+ inputs.SetCardinality(end - begin);
  }

  // Aggregate the filtered rows into a single state
package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.7.2-dev3117"
+ #define DUCKDB_VERSION "0.7.2-dev3154"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "dd0e0da3f0"
+ #define DUCKDB_SOURCE_ID "eddb84d5ca"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp
@@ -31,6 +31,7 @@ enum class PhysicalOperatorType : uint8_t {
  PROJECTION,
  COPY_TO_FILE,
  BATCH_COPY_TO_FILE,
+ FIXED_BATCH_COPY_TO_FILE,
  RESERVOIR_SAMPLE,
  STREAMING_SAMPLE,
  STREAMING_WINDOW,
package/src/duckdb/src/include/duckdb/core_functions/scalar/map_functions.hpp
@@ -43,7 +43,10 @@ struct MapEntriesFun {
  struct MapExtractFun {
  static constexpr const char *Name = "map_extract";
  static constexpr const char *Parameters = "map,key";
- static constexpr const char *Description = "Return a list containing the value for a given key or an empty list if the key is not contained in the map. The type of the key provided in the second parameter must match the type of the map’s keys else an error is returned.";
+ static constexpr const char *Description =
+ "Return a list containing the value for a given key or an empty list if the key is not contained in the map. "
+ "The type of the key provided in the second parameter must match the type of the map’s keys else an error is "
+ "returned.";
  static constexpr const char *Example = "map_extract(map(['key'], ['val']), 'key')";

  static ScalarFunction GetFunction();
@@ -64,6 +67,16 @@ struct MapFromEntriesFun {
  static ScalarFunction GetFunction();
  };

+ struct MapConcatFun {
+ static constexpr const char *Name = "map_concat";
+ static constexpr const char *Parameters = "any,...";
+ static constexpr const char *Description = "Returns a map created from merging the input maps, on key collision "
+ "the value is taken from the last map with that key";
+ static constexpr const char *Example = "map_concat(map([1,2], ['a', 'b']), map([2,3], ['c', 'd']));";
+
+ static ScalarFunction GetFunction();
+ };
+
  struct MapKeysFun {
  static constexpr const char *Name = "map_keys";
  static constexpr const char *Parameters = "map";
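
The MapConcatFun declaration above documents the merge rule: maps are merged left to right and, on a key collision, the value from the last map that contains the key wins. A standalone C++ sketch of those semantics over std::map, mirroring the map_concat example string; this is an illustration only, not the DuckDB implementation in map_concat.cpp.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Merge maps left to right; later maps overwrite earlier values on key collisions.
static std::map<int, std::string> MapConcat(const std::vector<std::map<int, std::string>> &inputs) {
	std::map<int, std::string> result;
	for (auto &input : inputs) {
		for (auto &entry : input) {
			result[entry.first] = entry.second;
		}
	}
	return result;
}

int main() {
	// mirrors map_concat(map([1,2], ['a', 'b']), map([2,3], ['c', 'd']))
	auto merged = MapConcat({{{1, "a"}, {2, "b"}}, {{2, "c"}, {3, "d"}}});
	for (auto &entry : merged) {
		std::printf("%d -> %s\n", entry.first, entry.second.c_str());
	}
	// output: 1 -> a, 2 -> c, 3 -> d  (key 2 takes its value from the last map)
	return 0;
}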
package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp
@@ -64,5 +64,18 @@ private:
  void PrepareBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
  unique_ptr<ColumnDataCollection> collection) const;
  void FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const;
+ SinkFinalizeType FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const;
  };
+
+ struct ActiveFlushGuard {
+ explicit ActiveFlushGuard(atomic<bool> &bool_value_p) : bool_value(bool_value_p) {
+ bool_value = true;
+ }
+ ~ActiveFlushGuard() {
+ bool_value = false;
+ }
+
+ atomic<bool> &bool_value;
+ };
+
  } // namespace duckdb
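
ActiveFlushGuard above is a small RAII helper that pairs with the any_flushing check in FlushBatchData: the first thread to pass the check becomes the flusher, and the guard clears the flag again on every exit path, including exceptions. A self-contained sketch of that usage pattern in standard C++; the driver code is illustrative, not DuckDB's.

#include <atomic>
#include <cstdio>
#include <mutex>

// Same shape as the guard added above, spelled out with std:: types.
struct ActiveFlushGuard {
	explicit ActiveFlushGuard(std::atomic<bool> &bool_value_p) : bool_value(bool_value_p) {
		bool_value = true;
	}
	~ActiveFlushGuard() {
		bool_value = false;
	}
	std::atomic<bool> &bool_value;
};

static std::mutex flush_lock;
static std::atomic<bool> any_flushing {false};

static void FlushBatchData() {
	{
		// only one thread may flush at a time; everyone else returns immediately
		std::lock_guard<std::mutex> guard(flush_lock);
		if (any_flushing) {
			return;
		}
		any_flushing = true;
	}
	ActiveFlushGuard active_flush(any_flushing); // resets the flag on every exit path
	std::printf("flushing batches...\n");
}

int main() {
	FlushBatchData();
	FlushBatchData(); // the guard cleared the flag, so this call flushes again
	return 0;
}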