npm - duckdb - Versions diffs - 0.7.2-dev3117.0 → 0.7.2-dev3154.0 - Mend

duckdb 0.7.2-dev3117.0 → 0.7.2-dev3154.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.7.2-dev3117.0",
+  "version": "0.7.2-dev3154.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/extension/parquet/include/parquet_writer.hpp CHANGED Viewed

@@ -25,12 +25,19 @@ namespace duckdb {
 class FileSystem;
 class FileOpener;
+struct PreparedRowGroup { //! Output of ParquetWriter::PrepareRowGroup, consumed by ParquetWriter::FlushRowGroup
+	duckdb_parquet::format::RowGroup row_group; //! Row group metadata; appended to the file metadata on flush
+	vector<duckdb::unique_ptr<ColumnWriterState>> states; //! One writer state per column, finalized on flush
+};
 class ParquetWriter {
 public:
 	ParquetWriter(FileSystem &fs, string file_name, FileOpener *file_opener, vector<LogicalType> types,
 	              vector<string> names, duckdb_parquet::format::CompressionCodec::type codec);
 public:
+	void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
+	void FlushRowGroup(PreparedRowGroup &row_group);
 	void Flush(ColumnDataCollection &buffer);
 	void Finalize();

package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED Viewed

@@ -674,9 +674,48 @@ CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_orde
 	if (!preserve_insertion_order) {
 		return CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE;
 	}
+	if (supports_batch_index) {
+		return CopyFunctionExecutionMode::BATCH_COPY_TO_FILE;
+	}
 	return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
 }
+//===--------------------------------------------------------------------===//
+// Prepare Batch
+//===--------------------------------------------------------------------===//
+struct ParquetWriteBatchData : public PreparedBatchData { //! Batch payload produced by ParquetWritePrepareBatch
+	PreparedRowGroup prepared_row_group; //! Filled by PrepareRowGroup, handed to FlushRowGroup in ParquetWriteFlushBatch
+};
+//! prepare_batch callback of the parquet copy function.
+//! Runs ParquetWriter::PrepareRowGroup on the given collection and returns the prepared row group
+//! wrapped in a ParquetWriteBatchData.
+unique_ptr<PreparedBatchData> ParquetWritePrepareBatch(ClientContext &context, FunctionData &bind_data,
+                                                       GlobalFunctionData &gstate,
+                                                       unique_ptr<ColumnDataCollection> collection) {
+	auto &parquet_state = gstate.Cast<ParquetWriteGlobalState>();
+	auto batch = make_uniq<ParquetWriteBatchData>();
+	auto &target = batch->prepared_row_group;
+	parquet_state.writer->PrepareRowGroup(*collection, target);
+	return std::move(batch);
+}
+//===--------------------------------------------------------------------===//
+// Flush Batch
+//===--------------------------------------------------------------------===//
+//! flush_batch callback of the parquet copy function.
+//! Passes the row group prepared by ParquetWritePrepareBatch to ParquetWriter::FlushRowGroup.
+void ParquetWriteFlushBatch(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate,
+                            PreparedBatchData &batch_p) {
+	auto &parquet_state = gstate.Cast<ParquetWriteGlobalState>();
+	auto &prepared = batch_p.Cast<ParquetWriteBatchData>().prepared_row_group;
+	parquet_state.writer->FlushRowGroup(prepared);
+}
+//===--------------------------------------------------------------------===//
+// Desired Batch Size
+//===--------------------------------------------------------------------===//
+//! desired_batch_size callback of the parquet copy function: one batch per row group,
+//! so the bound row_group_size is reported as the batch size.
+idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_data_p) {
+	return bind_data_p.Cast<ParquetWriteBindData>().row_group_size;
+}
+//===--------------------------------------------------------------------===//
+// Scan Replacement
+//===--------------------------------------------------------------------===//
 unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
                                             ReplacementScanData *data) {
 	auto lower_name = StringUtil::Lower(table_name);
@@ -719,6 +758,9 @@ void ParquetExtension::Load(DuckDB &db) {
 	function.execution_mode = ParquetWriteExecutionMode;
 	function.copy_from_bind = ParquetScanFunction::ParquetReadBind;
 	function.copy_from_function = scan_fun.functions[0];
+	function.prepare_batch = ParquetWritePrepareBatch;
+	function.flush_batch = ParquetWriteFlushBatch;
+	function.desired_batch_size = ParquetWriteDesiredBatchSize;
 	function.extension = "parquet";
 	ExtensionUtil::RegisterFunction(db_instance, function);

package/src/duckdb/extension/parquet/parquet_writer.cpp CHANGED Viewed

@@ -261,17 +261,13 @@ ParquetWriter::ParquetWriter(FileSystem &fs, string file_name_p, FileOpener *fil
 	}
 }
-void ParquetWriter::Flush(ColumnDataCollection &buffer) {
-	if (buffer.Count() == 0) {
-		return;
-	}
+void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result) {
 	// set up a new row group for this chunk collection
-	ParquetRowGroup row_group;
+	auto &row_group = result.row_group;
 	row_group.num_rows = buffer.Count();
 	row_group.__isset.file_offset = true;
-	vector<duckdb::unique_ptr<ColumnWriterState>> states;
+	auto &states = result.states;
 	// iterate over each of the columns of the chunk collection and write them
 	D_ASSERT(buffer.ColumnCount() == column_writers.size());
 	for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
@@ -292,10 +288,17 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
 		}
 		states.push_back(std::move(write_state));
 	}
+}
+void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) {
 	lock_guard<mutex> glock(lock);
+	auto &row_group = prepared.row_group;
+	auto &states = prepared.states;
+	if (states.empty()) {
+		throw InternalException("Attempting to flush a row group with no rows");
+	}
 	row_group.file_offset = writer->GetTotalWritten();
-	for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
+	for (idx_t col_idx = 0; col_idx < states.size(); col_idx++) {
 		const auto &col_writer = column_writers[col_idx];
 		auto write_state = std::move(states[col_idx]);
 		col_writer->FinalizeWrite(*write_state);
@@ -303,7 +306,18 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
 	// append the row group to the file meta data
 	file_meta_data.row_groups.push_back(row_group);
-	file_meta_data.num_rows += buffer.Count();
+	file_meta_data.num_rows += row_group.num_rows;
+}
+//! Writes the buffered rows as one row group: prepare first, then flush.
+//! An empty buffer is ignored, so no row group is produced for it.
+void ParquetWriter::Flush(ColumnDataCollection &buffer) {
+	if (buffer.Count() > 0) {
+		PreparedRowGroup pending;
+		PrepareRowGroup(buffer, pending);
+		FlushRowGroup(pending);
+	}
 }
 void ParquetWriter::Finalize() {

package/src/duckdb/src/common/enums/physical_operator_type.cpp CHANGED Viewed

@@ -49,6 +49,8 @@ string PhysicalOperatorToString(PhysicalOperatorType type) {
 		return "COPY_TO_FILE";
 	case PhysicalOperatorType::BATCH_COPY_TO_FILE:
 		return "BATCH_COPY_TO_FILE";
+	case PhysicalOperatorType::FIXED_BATCH_COPY_TO_FILE:
+		return "FIXED_BATCH_COPY_TO_FILE";
 	case PhysicalOperatorType::DELIM_JOIN:
 		return "DELIM_JOIN";
 	case PhysicalOperatorType::BLOCKWISE_NL_JOIN:

package/src/duckdb/src/common/types/vector.cpp CHANGED Viewed

@@ -1763,15 +1763,14 @@ MapInvalidReason MapVector::CheckMapValidity(Vector &map, idx_t count, const Sel
 	for (idx_t row = 0; row < count; row++) {
 		auto mapped_row = sel.get_index(row);
-		auto row_idx = map_vdata.sel->get_index(mapped_row);
+		auto map_idx = map_vdata.sel->get_index(mapped_row);
 		// map is allowed to be NULL
-		if (!map_validity.RowIsValid(row_idx)) {
+		if (!map_validity.RowIsValid(map_idx)) {
 			continue;
 		}
-		row_idx = key_vdata.sel->get_index(row);
 		value_set_t unique_keys;
-		for (idx_t i = 0; i < list_data[row_idx].length; i++) {
-			auto index = list_data[row_idx].offset + i;
+		for (idx_t i = 0; i < list_data[map_idx].length; i++) {
+			auto index = list_data[map_idx].offset + i;
 			index = key_vdata.sel->get_index(index);
 			if (!key_validity.RowIsValid(index)) {
 				return MapInvalidReason::NULL_KEY;

package/src/duckdb/src/common/types/vector_buffer.cpp CHANGED Viewed

@@ -89,7 +89,7 @@ void VectorListBuffer::Append(const Vector &to_append, const SelectionVector &se
 }
 void VectorListBuffer::PushBack(const Value &insert) {
-	if (size + 1 > capacity) {
+	while (size + 1 > capacity) {
 		child->Resize(capacity, capacity * 2);
 		capacity *= 2;
 	}

package/src/duckdb/src/core_functions/function_list.cpp CHANGED Viewed

@@ -212,6 +212,7 @@ static StaticFunctionDefinition internal_functions[] = {
 	DUCKDB_SCALAR_FUNCTION(MakeTimeFun),
 	DUCKDB_SCALAR_FUNCTION(MakeTimestampFun),
 	DUCKDB_SCALAR_FUNCTION(MapFun),
+	DUCKDB_SCALAR_FUNCTION(MapConcatFun),
 	DUCKDB_SCALAR_FUNCTION(MapEntriesFun),
 	DUCKDB_SCALAR_FUNCTION(MapExtractFun),
 	DUCKDB_SCALAR_FUNCTION(MapFromEntriesFun),

package/src/duckdb/src/core_functions/scalar/map/map_concat.cpp ADDED Viewed

@@ -0,0 +1,186 @@
+#include "duckdb/planner/expression/bound_function_expression.hpp"
+#include "duckdb/common/string_util.hpp"
+#include "duckdb/parser/expression/bound_expression.hpp"
+#include "duckdb/function/scalar/nested_functions.hpp"
+#include "duckdb/common/types/data_chunk.hpp"
+#include "duckdb/common/pair.hpp"
+#include "duckdb/common/types.hpp"
+#include "duckdb/common/unordered_map.hpp"
+#include "duckdb/core_functions/scalar/map_functions.hpp"
+namespace duckdb {
+namespace {
+struct MapKeyIndexPair { // Records where the value for one result key has to be fetched from
+	MapKeyIndexPair(idx_t map, idx_t key) : map_index(map), key_index(key) {
+	}
+	// Index of the argument (args.data[map_index]) whose map currently supplies the value for this key
+	idx_t map_index;
+	// Position of the entry (list offset + position in the row) inside that map's key/value child vectors
+	idx_t key_index;
+};
+} // namespace
+//! Zips two parallel lists into STRUCT values with the fields "key" and "value".
+//! Both lists must have the same length; their elements are moved into the produced structs.
+vector<Value> GetListEntries(vector<Value> keys, vector<Value> values) {
+	D_ASSERT(keys.size() == values.size());
+	const idx_t entry_count = keys.size();
+	vector<Value> entries;
+	for (idx_t pos = 0; pos < entry_count; pos++) {
+		child_list_t<Value> fields;
+		fields.emplace_back("key", std::move(keys[pos]));
+		fields.emplace_back("value", std::move(values[pos]));
+		entries.push_back(Value::STRUCT(std::move(fields)));
+	}
+	return entries;
+}
+//! Executes map_concat: for every row, merges the entries of all MAP arguments into one result map.
+//! Keys keep the order of their first appearance; if a key occurs more than once, the value of its
+//! last occurrence is used. Arguments of type SQLNULL and rows in which a map is NULL contribute
+//! no entries.
+static void MapConcatFunction(DataChunk &args, ExpressionState &state, Vector &result) {
+	if (result.GetType().id() == LogicalTypeId::SQLNULL) {
+		// All inputs are NULL, just return NULL
+		auto &validity = FlatVector::Validity(result);
+		validity.SetInvalid(0);
+		result.SetVectorType(VectorType::CONSTANT_VECTOR);
+		return;
+	}
+	D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);
+	auto count = args.size();
+	auto map_count = args.ColumnCount();
+	vector<UnifiedVectorFormat> map_formats(map_count);
+	for (idx_t i = 0; i < map_count; i++) {
+		auto &map = args.data[i];
+		map.ToUnifiedFormat(count, map_formats[i]);
+	}
+	auto result_data = FlatVector::GetData<list_entry_t>(result);
+	for (idx_t i = 0; i < count; i++) {
+		// Loop through all the maps per list
+		// we can't do better because all the entries of the child vector have to be contiguous,
+		// so we can't start the next row before we have finished the one before it
+		auto &result_entry = result_data[i];
+		// keys_list[k] is the k-th distinct key of this row, index_to_map[k] says where its value lives
+		vector<MapKeyIndexPair> index_to_map;
+		vector<Value> keys_list;
+		for (idx_t map_idx = 0; map_idx < map_count; map_idx++) {
+			if (args.data[map_idx].GetType().id() == LogicalTypeId::SQLNULL) {
+				continue;
+			}
+			auto &map_format = map_formats[map_idx];
+			auto index = map_format.sel->get_index(i);
+			if (!map_format.validity.RowIsValid(index)) {
+				// the map is NULL in this row: its list entry holds no meaningful offset/length,
+				// so treat it like a NULL argument and skip it
+				continue;
+			}
+			auto &keys = MapVector::GetKeys(args.data[map_idx]);
+			auto map_entry = ((list_entry_t *)map_format.data)[index];
+			// Update the list for this row
+			for (idx_t list_idx = 0; list_idx < map_entry.length; list_idx++) {
+				auto key_index = map_entry.offset + list_idx;
+				auto key = keys.GetValue(key_index);
+				auto existing = std::find(keys_list.begin(), keys_list.end(), key);
+				if (existing == keys_list.end()) {
+					// Result list does not contain this value yet
+					keys_list.push_back(key);
+					index_to_map.emplace_back(map_idx, key_index);
+				} else {
+					// Result list already contains this, update where to find the value at
+					auto distance = std::distance(keys_list.begin(), existing);
+					auto &mapping = *(index_to_map.begin() + distance);
+					mapping.key_index = key_index;
+					mapping.map_index = map_idx;
+				}
+			}
+		}
+		vector<Value> values_list;
+		D_ASSERT(keys_list.size() == index_to_map.size());
+		// Get the values from the mapping
+		for (auto &mapping : index_to_map) {
+			auto &map = args.data[mapping.map_index];
+			auto &values = MapVector::GetValues(map);
+			values_list.push_back(values.GetValue(mapping.key_index));
+		}
+		idx_t entries_count = keys_list.size();
+		D_ASSERT(values_list.size() == keys_list.size());
+		result_entry.offset = ListVector::GetListSize(result);
+		result_entry.length = values_list.size();
+		auto list_entries = GetListEntries(std::move(keys_list), std::move(values_list));
+		for (auto &list_entry : list_entries) {
+			ListVector::PushBack(result, list_entry);
+		}
+		// NOTE(review): if ListVector::PushBack already advances the list size, adding entries_count
+		// here counts every entry twice and leaves gaps in the child vector - confirm against PushBack
+		ListVector::SetListSize(result, ListVector::GetListSize(result) + entries_count);
+	}
+	if (args.AllConstant()) {
+		result.SetVectorType(VectorType::CONSTANT_VECTOR);
+	}
+	result.Verify(count);
+}
+//! Returns true if the given MAP type has SQLNULL as both its key type and its value type.
+static bool IsEmptyMap(const LogicalType &map) {
+	D_ASSERT(map.id() == LogicalTypeId::MAP);
+	if (MapType::KeyType(map).id() != LogicalType::SQLNULL) {
+		return false;
+	}
+	return MapType::ValueType(map).id() == LogicalType::SQLNULL;
+}
+//! Binds map_concat and resolves its return type.
+//! - fewer than two arguments: InvalidInputException
+//! - any argument of type UNKNOWN (prepared statement): return type SQLNULL, no bind data
+//! - NULL-typed arguments and MAP(NULL, NULL) arguments do not constrain the type
+//! - every other argument must have exactly the same MAP type, which becomes the return type
+//! If only NULL-typed arguments are given the return type stays SQLNULL; if at least one map was
+//! given but none constrained the type, the return type is MAP(NULL, NULL).
+static unique_ptr<FunctionData> MapConcatBind(ClientContext &context, ScalarFunction &bound_function,
+                                              vector<unique_ptr<Expression>> &arguments) {
+	if (arguments.size() < 2) {
+		throw InvalidInputException("The provided amount of arguments is incorrect, please provide 2 or more maps");
+	}
+	LogicalType expected = LogicalType::SQLNULL;
+	bool saw_map_argument = false;
+	// Check and verify that all the maps are of the same type
+	for (auto &argument : arguments) {
+		const auto &map_type = argument->return_type;
+		const auto type_id = map_type.id();
+		if (type_id == LogicalTypeId::UNKNOWN) {
+			// Prepared statement
+			bound_function.arguments.emplace_back(LogicalTypeId::UNKNOWN);
+			bound_function.return_type = LogicalType(LogicalTypeId::SQLNULL);
+			return nullptr;
+		}
+		if (type_id == LogicalTypeId::SQLNULL) {
+			// The maps are allowed to be NULL
+			continue;
+		}
+		saw_map_argument = true;
+		if (IsEmptyMap(map_type)) {
+			// Map is allowed to be empty
+			continue;
+		}
+		if (expected.id() == LogicalTypeId::SQLNULL) {
+			// first argument with a concrete type decides what all others have to match
+			expected = map_type;
+			continue;
+		}
+		if (map_type != expected) {
+			throw InvalidInputException(
+			    "'value' type of map differs between arguments, expected '%s', found '%s' instead", expected.ToString(),
+			    map_type.ToString());
+		}
+	}
+	if (saw_map_argument && expected.id() == LogicalTypeId::SQLNULL) {
+		expected = LogicalType::MAP(LogicalType::SQLNULL, LogicalType::SQLNULL);
+	}
+	bound_function.return_type = expected;
+	return make_uniq<VariableReturnBindData>(bound_function.return_type);
+}
+//! Builds the scalar function definition of map_concat.
+ScalarFunction MapConcatFun::GetFunction() {
+	// the argument list and LIST return type given here are placeholders: MapConcatBind sets the real ones
+	ScalarFunction map_concat("map_concat", {}, LogicalTypeId::LIST, MapConcatFunction, MapConcatBind);
+	// variadic, arguments of any type are accepted here and validated by the binder
+	map_concat.varargs = LogicalType::ANY;
+	// presumably lets NULL inputs reach MapConcatFunction, which skips them itself - confirm
+	map_concat.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
+	return map_concat;
+}
+} // namespace duckdb

package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp CHANGED Viewed

@@ -1,18 +1,17 @@
 #include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
 #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
+#include "duckdb/parallel/base_pipeline_event.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/common/types/batched_data_collection.hpp"
-#include "duckdb/common/file_system.hpp"
-#include "duckdb/common/file_opener.hpp"
 #include "duckdb/common/allocator.hpp"
 #include <algorithm>
 namespace duckdb {
 PhysicalBatchCopyToFile::PhysicalBatchCopyToFile(vector<LogicalType> types, CopyFunction function_p,
-                                                 unique_ptr<FunctionData> bind_data, idx_t estimated_cardinality)
+                                                 unique_ptr<FunctionData> bind_data_p, idx_t estimated_cardinality)
     : PhysicalOperator(PhysicalOperatorType::BATCH_COPY_TO_FILE, std::move(types), estimated_cardinality),
-      function(std::move(function_p)), bind_data(std::move(bind_data)) {
+      function(std::move(function_p)), bind_data(std::move(bind_data_p)) {
 	if (!function.flush_batch || !function.prepare_batch) {
 		throw InternalException(
 		    "PhysicalBatchCopyToFile created for copy function that does not have prepare_batch/flush_batch defined");
@@ -20,32 +19,52 @@ PhysicalBatchCopyToFile::PhysicalBatchCopyToFile(vector<LogicalType> types, Copy
 }
 //===--------------------------------------------------------------------===//
-// Sink
+// States
 //===--------------------------------------------------------------------===//
 class BatchCopyToGlobalState : public GlobalSinkState {
 public:
 	explicit BatchCopyToGlobalState(unique_ptr<GlobalFunctionData> global_state)
-	    : rows_copied(0), global_state(std::move(global_state)) {
+	    : rows_copied(0), global_state(std::move(global_state)), any_flushing(false) {
 	}
 	mutex lock;
-	mutex flush_lock;
+	//! The total number of rows copied to the file
 	atomic<idx_t> rows_copied;
+	//! Global copy state
 	unique_ptr<GlobalFunctionData> global_state;
+	//! The prepared batch data by batch index - ready to flush
 	map<idx_t, unique_ptr<PreparedBatchData>> batch_data;
+	//! Lock for flushing to disk
+	mutex flush_lock;
+	//! Whether or not any threads are flushing (only one thread can flush at a time)
+	atomic<bool> any_flushing;
+	//! Stores a prepared batch under its batch index (batch_data is only modified while holding lock).
+	//! Throws an InternalException if a batch with the same index is already stored.
+	void AddBatchData(idx_t batch_index, unique_ptr<PreparedBatchData> new_batch) {
+		lock_guard<mutex> guard(lock);
+		const bool inserted = batch_data.emplace(batch_index, std::move(new_batch)).second;
+		if (!inserted) {
+			throw InternalException("Duplicate batch index %llu encountered in PhysicalBatchCopyToFile", batch_index);
+		}
+	}
 };
 class BatchCopyToLocalState : public LocalSinkState {
 public:
 	explicit BatchCopyToLocalState(unique_ptr<LocalFunctionData> local_state_p)
-	    : local_state(std::move(local_state_p)), rows_copied(0), batch_index(0) {
+	    : local_state(std::move(local_state_p)), rows_copied(0) {
 	}
+	//! Local copy state
 	unique_ptr<LocalFunctionData> local_state;
+	//! The current collection we are appending to
 	unique_ptr<ColumnDataCollection> collection;
+	//! The append state of the collection
 	ColumnDataAppendState append_state;
+	//! How many rows have been copied in total
 	idx_t rows_copied;
-	idx_t batch_index;
+	//! The current batch index
+	optional_idx batch_index;
 	void InitializeCollection(ClientContext &context, const PhysicalOperator &op) {
 		collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), op.children[0]->types);
@@ -53,11 +72,15 @@ public:
 	}
 };
+//===--------------------------------------------------------------------===//
+// Sink
+//===--------------------------------------------------------------------===//
 SinkResultType PhysicalBatchCopyToFile::Sink(ExecutionContext &context, DataChunk &chunk,
                                              OperatorSinkInput &input) const {
 	auto &state = input.local_state.Cast<BatchCopyToLocalState>();
 	if (!state.collection) {
 		state.InitializeCollection(context.client, *this);
+		state.batch_index = state.partition_info.batch_index.GetIndex();
 	}
 	state.rows_copied += chunk.size();
 	state.collection->Append(state.append_state, chunk);
@@ -71,10 +94,13 @@ void PhysicalBatchCopyToFile::Combine(ExecutionContext &context, GlobalSinkState
 	gstate.rows_copied += state.rows_copied;
 }
-SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
-                                                   GlobalSinkState &gstate_p) const {
+//===--------------------------------------------------------------------===//
+// Finalize
+//===--------------------------------------------------------------------===//
+SinkFinalizeType PhysicalBatchCopyToFile::FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const {
 	auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();
-	FlushBatchData(context, gstate_p, NumericLimits<int64_t>::Maximum());
+	idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+	FlushBatchData(context, gstate_p, min_batch_index);
 	if (function.copy_to_finalize) {
 		function.copy_to_finalize(context, *bind_data, *gstate.global_state);
@@ -85,25 +111,39 @@ SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &ev
 	return SinkFinalizeType::READY;
 }
+SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
+                                                   GlobalSinkState &gstate_p) const {
+	FinalFlush(context, gstate_p); // flushes the remaining batches; its return value is not used here
+	return SinkFinalizeType::READY; // READY is reported regardless of what FinalFlush returned
+}
+//===--------------------------------------------------------------------===//
+// Batch Data Handling
+//===--------------------------------------------------------------------===//
 void PhysicalBatchCopyToFile::PrepareBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
                                                unique_ptr<ColumnDataCollection> collection) const {
 	auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();
 	// prepare the batch
 	auto batch_data = function.prepare_batch(context, *bind_data, *gstate.global_state, std::move(collection));
-	// move the batch data to the set of prepared batch data
-	lock_guard<mutex> l(gstate.lock);
-	gstate.batch_data[batch_index] = std::move(batch_data);
+	gstate.AddBatchData(batch_index, std::move(batch_data));
 }
 void PhysicalBatchCopyToFile::FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const {
 	auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();
 	// flush batch data to disk (if there are any to flush)
-	while (true) {
-		// grab the flush lock - we can only call flush_batch with this lock
-		// otherwise the data might end up in the wrong order
+	// grab the flush lock - we can only call flush_batch with this lock
+	// otherwise the data might end up in the wrong order
+	{
 		lock_guard<mutex> l(gstate.flush_lock);
+		if (gstate.any_flushing) {
+			return;
+		}
+		gstate.any_flushing = true;
+	}
+	ActiveFlushGuard active_flush(gstate.any_flushing);
+	while (true) {
 		unique_ptr<PreparedBatchData> batch_data;
 		{
 			// fetch the next batch to flush (if any)
@@ -128,14 +168,18 @@ void PhysicalBatchCopyToFile::FlushBatchData(ClientContext &context, GlobalSinkS
 	}
 }
+//===--------------------------------------------------------------------===//
+// Next Batch
+//===--------------------------------------------------------------------===//
 void PhysicalBatchCopyToFile::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p,
                                         LocalSinkState &lstate) const {
 	auto &state = lstate.Cast<BatchCopyToLocalState>();
-	if (state.collection) {
+	if (state.collection && state.collection->Count() > 0) {
 		// we finished processing this batch
 		// start flushing data
-		PrepareBatchData(context.client, gstate_p, state.batch_index, std::move(state.collection));
-		FlushBatchData(context.client, gstate_p, lstate.partition_info.min_batch_index.GetIndex());
+		auto min_batch_index = lstate.partition_info.min_batch_index.GetIndex();
+		PrepareBatchData(context.client, gstate_p, state.batch_index.GetIndex(), std::move(state.collection));
+		FlushBatchData(context.client, gstate_p, min_batch_index);
 	}
 	state.batch_index = lstate.partition_info.batch_index.GetIndex();