duckdb 0.6.2-dev1832.0 → 0.6.2-dev1873.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/bind_helpers.cpp +67 -0
- package/src/duckdb/src/common/file_system.cpp +2 -1
- package/src/duckdb/src/common/hive_partitioning.cpp +129 -4
- package/src/duckdb/src/common/local_file_system.cpp +4 -2
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -0
- package/src/duckdb/src/common/string_util.cpp +9 -1
- package/src/duckdb/src/common/types/data_chunk.cpp +10 -0
- package/src/duckdb/src/common/types/partitioned_column_data.cpp +5 -0
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +1 -50
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +110 -15
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +5 -0
- package/src/duckdb/src/function/table/copy_csv.cpp +1 -7
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/bind_helpers.hpp +21 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +3 -1
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +71 -2
- package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/partitioned_column_data.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/virtual_file_system.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +7 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +6 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +36 -1
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +8159 -8028
- package/src/duckdb/ub_src_common.cpp +2 -0
package/package.json
CHANGED

package/src/duckdb/src/common/bind_helpers.cpp
ADDED

@@ -0,0 +1,67 @@
+#include "duckdb/common/bind_helpers.hpp"
+#include "duckdb/common/common.hpp"
+#include "duckdb/common/types.hpp"
+#include "duckdb/common/exception.hpp"
+#include "duckdb/common/types/value.hpp"
+#include "duckdb/common/case_insensitive_map.hpp"
+
+namespace duckdb {
+
+Value ConvertVectorToValue(vector<Value> set) {
+	if (set.empty()) {
+		return Value::EMPTYLIST(LogicalType::BOOLEAN);
+	}
+	return Value::LIST(move(set));
+}
+
+vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
+	vector<bool> result;
+
+	if (set.empty()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+	}
+	// list of options: parse the list
+	case_insensitive_map_t<bool> option_map;
+	for (idx_t i = 0; i < set.size(); i++) {
+		option_map[set[i].ToString()] = false;
+	}
+	result.resize(names.size(), false);
+	for (idx_t i = 0; i < names.size(); i++) {
+		auto entry = option_map.find(names[i]);
+		if (entry != option_map.end()) {
+			result[i] = true;
+			entry->second = true;
+		}
+	}
+	for (auto &entry : option_map) {
+		if (!entry.second) {
+			throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
+			                      entry.first.c_str());
+		}
+	}
+	return result;
+}
+
+vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
+	vector<bool> result;
+
+	// Only accept a list of arguments
+	if (value.type().id() != LogicalTypeId::LIST) {
+		// Support a single argument if it's '*'
+		if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
+			result.resize(names.size(), true);
+			return result;
+		}
+		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+	}
+	auto &children = ListValue::GetChildren(value);
+	// accept '*' as single argument
+	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
+	    children[0].GetValue<string>() == "*") {
+		result.resize(names.size(), true);
+		return result;
+	}
+	return ParseColumnList(children, names, loption);
+}
+
+} // namespace duckdb
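
For context: the helpers added above back the new COPY options. ConvertVectorToValue wraps a parsed option list in a LIST Value (an empty BOOLEAN list when no arguments were given), and ParseColumnList maps an option list onto the table's columns, accepting either the literal '*' or an explicit, case-insensitive column list, and rejecting names that do not exist in the table. A minimal standalone sketch of the same matching semantics (std-library only, case-sensitive for brevity — illustrative, not DuckDB code):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// One flag per table column: true if that column was named in the option
// list. "*" selects every column; naming an unknown column is an error.
std::vector<bool> ParseColumnListSketch(const std::vector<std::string> &options,
                                        const std::vector<std::string> &names) {
	if (options.size() == 1 && options[0] == "*") {
		return std::vector<bool>(names.size(), true);
	}
	std::map<std::string, bool> matched; // option name -> seen in table?
	for (const auto &opt : options) {
		matched[opt] = false;
	}
	std::vector<bool> result(names.size(), false);
	for (size_t i = 0; i < names.size(); i++) {
		auto it = matched.find(names[i]);
		if (it != matched.end()) {
			result[i] = true;
			it->second = true;
		}
	}
	for (const auto &entry : matched) {
		if (!entry.second) {
			throw std::invalid_argument("column not found: " + entry.first);
		}
	}
	return result;
}

int main() {
	auto flags = ParseColumnListSketch({"b"}, {"a", "b", "c"});
	for (bool f : flags) {
		std::cout << f << ' '; // prints: 0 1 0
	}
	std::cout << '\n';
}

The real helper additionally uses DuckDB's case_insensitive_map_t, so FORCE_QUOTE (name_a) matches a column declared as NAME_A.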
package/src/duckdb/src/common/file_system.cpp
CHANGED

@@ -282,7 +282,8 @@ void FileSystem::RemoveDirectory(const string &directory) {
 	throw NotImplementedException("%s: RemoveDirectory is not implemented!", GetName());
 }
 
-bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                           FileOpener *opener) {
 	throw NotImplementedException("%s: ListFiles is not implemented!", GetName());
 }
 
package/src/duckdb/src/common/hive_partitioning.cpp
CHANGED

@@ -6,8 +6,6 @@
 #include "duckdb/planner/expression_iterator.hpp"
 #include "re2/re2.h"
 
-#include <iostream>
-
 namespace duckdb {
 
 static unordered_map<column_t, string> GetKnownColumnValues(string &filename,
@@ -88,6 +86,7 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
                                               unordered_map<string, column_t> &column_map, idx_t table_index,
                                               bool hive_enabled, bool filename_enabled) {
 	vector<string> pruned_files;
+	vector<bool> have_preserved_filter(filters.size(), false);
 	vector<unique_ptr<Expression>> pruned_filters;
 	duckdb_re2::RE2 regex(REGEX_STRING);
 
@@ -101,15 +100,21 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
 		auto known_values = GetKnownColumnValues(file, column_map, regex, filename_enabled, hive_enabled);
 
 		FilterCombiner combiner(context);
-		for (auto &filter : filters) {
+
+		for (idx_t j = 0; j < filters.size(); j++) {
+			auto &filter = filters[j];
 			unique_ptr<Expression> filter_copy = filter->Copy();
 			ConvertKnownColRefToConstants(filter_copy, known_values, table_index);
 			// Evaluate the filter, if it can be evaluated here, we can not prune this filter
 			Value result_value;
+
 			if (!filter_copy->IsScalar() || !filter_copy->IsFoldable() ||
 			    !ExpressionExecutor::TryEvaluateScalar(context, *filter_copy, result_value)) {
 				// can not be evaluated only with the filename/hive columns added, we can not prune this filter
-				pruned_filters.emplace_back(filter->Copy());
+				if (!have_preserved_filter[j]) {
+					pruned_filters.emplace_back(filter->Copy());
+					have_preserved_filter[j] = true;
+				}
 			} else if (!result_value.GetValue<bool>()) {
 				// filter evaluates to false
 				should_prune_file = true;
@@ -126,8 +131,128 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
 		}
 	}
 
+	D_ASSERT(filters.size() >= pruned_filters.size());
+
 	filters = std::move(pruned_filters);
 	files = std::move(pruned_files);
 }
 
+HivePartitionedColumnData::HivePartitionedColumnData(const HivePartitionedColumnData &other)
+    : PartitionedColumnData(other) {
+	// Synchronize to ensure consistency of shared partition map
+	if (other.global_state) {
+		global_state = other.global_state;
+		unique_lock<mutex> lck(global_state->lock);
+		SynchronizeLocalMap();
+	}
+}
+
+void HivePartitionedColumnData::ComputePartitionIndices(PartitionedColumnDataAppendState &state, DataChunk &input) {
+	Vector hashes(LogicalType::HASH, input.size());
+	input.Hash(group_by_columns, hashes);
+
+	for (idx_t i = 0; i < input.size(); i++) {
+		HivePartitionKey key;
+		key.hash = FlatVector::GetData<hash_t>(hashes)[i];
+		for (auto &col : group_by_columns) {
+			key.values.emplace_back(input.GetValue(col, i));
+		}
+
+		auto lookup = local_partition_map.find(key);
+		const auto partition_indices = FlatVector::GetData<idx_t>(state.partition_indices);
+		if (lookup == local_partition_map.end()) {
+			idx_t new_partition_id = RegisterNewPartition(key, state);
+			partition_indices[i] = new_partition_id;
+		} else {
+			partition_indices[i] = lookup->second;
+		}
+	}
+}
+
+std::map<idx_t, const HivePartitionKey *> HivePartitionedColumnData::GetReverseMap() {
+	std::map<idx_t, const HivePartitionKey *> ret;
+	for (const auto &pair : local_partition_map) {
+		ret[pair.second] = &(pair.first);
+	}
+	return ret;
+}
+
+void HivePartitionedColumnData::GrowAllocators() {
+	unique_lock<mutex> lck_gstate(allocators->lock);
+
+	idx_t current_allocator_size = allocators->allocators.size();
+	idx_t required_allocators = local_partition_map.size();
+
+	allocators->allocators.reserve(current_allocator_size);
+	for (idx_t i = current_allocator_size; i < required_allocators; i++) {
+		CreateAllocator();
+	}
+
+	D_ASSERT(allocators->allocators.size() == local_partition_map.size());
+}
+
+void HivePartitionedColumnData::GrowAppendState(PartitionedColumnDataAppendState &state) {
+	idx_t current_append_state_size = state.partition_append_states.size();
+	idx_t required_append_state_size = local_partition_map.size();
+
+	for (idx_t i = current_append_state_size; i < required_append_state_size; i++) {
+		state.partition_append_states.emplace_back(make_unique<ColumnDataAppendState>());
+		state.partition_buffers.emplace_back(CreatePartitionBuffer());
+	}
+}
+
+void HivePartitionedColumnData::GrowPartitions(PartitionedColumnDataAppendState &state) {
+	idx_t current_partitions = partitions.size();
+	idx_t required_partitions = local_partition_map.size();
+
+	D_ASSERT(allocators->allocators.size() == required_partitions);
+
+	for (idx_t i = current_partitions; i < required_partitions; i++) {
+		partitions.emplace_back(CreatePartitionCollection(i));
+		partitions[i]->InitializeAppend(*state.partition_append_states[i]);
+	}
+	D_ASSERT(partitions.size() == local_partition_map.size());
+}
+
+void HivePartitionedColumnData::SynchronizeLocalMap() {
+	// Synchronise global map into local, may contain changes from other threads too
+	for (auto it = global_state->partitions.begin() + local_partition_map.size(); it < global_state->partitions.end();
+	     it++) {
+		local_partition_map[(*it)->first] = (*it)->second;
+	}
+}
+
+idx_t HivePartitionedColumnData::RegisterNewPartition(HivePartitionKey key, PartitionedColumnDataAppendState &state) {
+	if (global_state) {
+		idx_t partition_id;
+
+		// Synchronize Global state with our local state with the newly discovered partition
+		{
+			unique_lock<mutex> lck_gstate(global_state->lock);
+
+			// Insert into global map, or return partition if already present
+			auto res =
+			    global_state->partition_map.emplace(std::make_pair(std::move(key), global_state->partition_map.size()));
+			auto it = res.first;
+			partition_id = it->second;
+
+			// Add iterator to vector to allow incrementally updating local states from global state
+			global_state->partitions.emplace_back(it);
+			SynchronizeLocalMap();
+		}
+
+		// After synchronizing with the global state, we need to grow the shared allocators to support
+		// the number of partitions, which guarantees that there's always enough allocators available to each thread
+		GrowAllocators();
+
+		// Grow local partition data
+		GrowAppendState(state);
+		GrowPartitions(state);
+
+		return partition_id;
+	} else {
+		return local_partition_map.emplace(std::make_pair(std::move(key), local_partition_map.size())).first->second;
+	}
+}
+
 } // namespace duckdb
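
The heart of the new HivePartitionedColumnData is RegisterNewPartition: partition ids are handed out by one shared map under the global lock, and map emplace doubles as "insert or look up", so two threads that discover the same key concurrently still agree on a single id. A minimal sketch of that discover-or-reuse pattern (simplified stand-ins, not the DuckDB types):

#include <iostream>
#include <map>
#include <mutex>
#include <string>

struct SharedPartitionState {
	std::mutex lock;
	std::map<std::string, size_t> partition_map; // partition key -> id
};

size_t RegisterPartition(SharedPartitionState &global, const std::string &key) {
	std::lock_guard<std::mutex> guard(global.lock);
	// emplace inserts only if the key is new; either way res.first points
	// at the authoritative (key, id) entry, so each key gets exactly one id
	auto res = global.partition_map.emplace(key, global.partition_map.size());
	return res.first->second;
}

int main() {
	SharedPartitionState state;
	std::cout << RegisterPartition(state, "year=2022") << '\n'; // 0
	std::cout << RegisterPartition(state, "year=2023") << '\n'; // 1
	std::cout << RegisterPartition(state, "year=2022") << '\n'; // 0 again
}

On top of this, global_state->partitions records entries in insertion order, which is what lets SynchronizeLocalMap copy only the entries a local map has not seen yet.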
package/src/duckdb/src/common/local_file_system.cpp
CHANGED

@@ -407,7 +407,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
 	}
 }
 
-bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                FileOpener *opener) {
 	if (!DirectoryExists(directory)) {
 		return false;
 	}
@@ -734,7 +735,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
 	}
 }
 
-bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                FileOpener *opener) {
 	string search_dir = JoinPath(directory, "*");
 
 	auto unicode_path = WindowsUtil::UTF8ToUnicode(search_dir.c_str());
package/src/duckdb/src/common/radix_partitioning.cpp
CHANGED

@@ -435,6 +435,7 @@ RadixPartitionedColumnData::RadixPartitionedColumnData(ClientContext &context_p,
 
 RadixPartitionedColumnData::RadixPartitionedColumnData(const RadixPartitionedColumnData &other)
     : PartitionedColumnData(other), radix_bits(other.radix_bits), hash_col_idx(other.hash_col_idx) {
+
 	for (idx_t i = 0; i < RadixPartitioning::NumberOfPartitions(radix_bits); i++) {
 		partitions.emplace_back(CreatePartitionCollection(i));
 	}
package/src/duckdb/src/common/string_util.cpp
CHANGED

@@ -1,7 +1,8 @@
 #include "duckdb/common/string_util.hpp"
+
+#include "duckdb/common/exception.hpp"
 #include "duckdb/common/pair.hpp"
 #include "duckdb/common/to_string.hpp"
-#include "duckdb/common/exception.hpp"
 
 #include <algorithm>
 #include <cctype>
@@ -31,6 +32,13 @@ void StringUtil::RTrim(string &str) {
 	          str.end());
 }
 
+void StringUtil::RTrim(string &str, const string &chars_to_trim) {
+	str.erase(find_if(str.rbegin(), str.rend(),
+	                  [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(ch) == string::npos; })
+	              .base(),
+	          str.end());
+}
+
 void StringUtil::Trim(string &str) {
 	StringUtil::LTrim(str);
 	StringUtil::RTrim(str);
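
The new RTrim overload strips any trailing run of caller-supplied characters; physical_copy_to_file.cpp below uses it to drop a trailing path separator before building hive directories. A usage sketch (behavior inferred from the implementation above; not DuckDB's test code):

#include <algorithm>
#include <iostream>
#include <string>

static void RTrimSketch(std::string &str, const std::string &chars_to_trim) {
	// erase from just past the last character NOT in chars_to_trim to the end
	str.erase(std::find_if(str.rbegin(), str.rend(),
	                       [&](char ch) { return chars_to_trim.find(ch) == std::string::npos; })
	              .base(),
	          str.end());
}

int main() {
	std::string path = "/tmp/output///";
	RTrimSketch(path, "/");
	std::cout << path << '\n'; // prints: /tmp/output
}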
package/src/duckdb/src/common/types/data_chunk.cpp
CHANGED

@@ -307,6 +307,16 @@ void DataChunk::Hash(Vector &result) {
 	}
 }
 
+void DataChunk::Hash(vector<idx_t> &column_ids, Vector &result) {
+	D_ASSERT(result.GetType().id() == LogicalType::HASH);
+	D_ASSERT(column_ids.size() > 0);
+
+	VectorOperations::Hash(data[column_ids[0]], result, size());
+	for (idx_t i = 1; i < column_ids.size(); i++) {
+		VectorOperations::CombineHash(result, data[column_ids[i]], size());
+	}
+}
+
 void DataChunk::Verify() {
 #ifdef DEBUG
 	D_ASSERT(size() <= capacity);
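
The new Hash overload hashes only the requested key columns: it seeds the result with the first column's hashes and folds each further column in with CombineHash. This is what ComputePartitionIndices above uses to fill HivePartitionKey::hash for every row in one vectorized pass. A scalar sketch of the same shape (the mixing constant is illustrative, not DuckDB's vectorized kernel):

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Boost-style 64-bit hash combiner, standing in for VectorOperations::CombineHash.
uint64_t CombineHashSketch(uint64_t a, uint64_t b) {
	return a ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2));
}

// Hash the first key column, then fold the remaining key columns in,
// mirroring DataChunk::Hash(column_ids, result) row by row.
uint64_t HashRow(const std::vector<std::string> &row, const std::vector<size_t> &key_cols) {
	uint64_t h = std::hash<std::string>{}(row[key_cols[0]]);
	for (size_t i = 1; i < key_cols.size(); i++) {
		h = CombineHashSketch(h, std::hash<std::string>{}(row[key_cols[i]]));
	}
	return h;
}

int main() {
	std::vector<std::string> row = {"2022", "12", "payload"};
	std::cout << HashRow(row, {0, 1}) << '\n'; // hash over the two key columns only
}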
package/src/duckdb/src/common/types/partitioned_column_data.cpp
CHANGED

@@ -1,6 +1,7 @@
 #include "duckdb/common/types/partitioned_column_data.hpp"
 
 #include "duckdb/common/radix_partitioning.hpp"
+#include "duckdb/common/hive_partitioning.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
 
 namespace duckdb {
@@ -18,6 +19,8 @@ unique_ptr<PartitionedColumnData> PartitionedColumnData::CreateShared() {
 	switch (type) {
 	case PartitionedColumnDataType::RADIX:
 		return make_unique<RadixPartitionedColumnData>((RadixPartitionedColumnData &)*this);
+	case PartitionedColumnDataType::HIVE:
+		return make_unique<HivePartitionedColumnData>((HivePartitionedColumnData &)*this);
 	default:
 		throw NotImplementedException("CreateShared for this type of PartitionedColumnData");
 	}
@@ -141,10 +144,12 @@ void PartitionedColumnData::FlushAppendState(PartitionedColumnDataAppendState &s
 void PartitionedColumnData::Combine(PartitionedColumnData &other) {
 	// Now combine the state's partitions into this
 	lock_guard<mutex> guard(lock);
+
 	if (partitions.empty()) {
 		// This is the first merge, we just copy them over
 		partitions = std::move(other.partitions);
 	} else {
+		D_ASSERT(partitions.size() == other.partitions.size());
 		// Combine the append state's partitions into this PartitionedColumnData
 		for (idx_t i = 0; i < other.partitions.size(); i++) {
 			partitions[i]->Combine(*other.partitions[i]);
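
Combine is how thread-local partition buffers are merged into the shared result: the first caller simply donates its vector, and later callers merge partition-by-partition. The new D_ASSERT documents that by then both sides must have grown to the same partition count, which HivePartitionedColumnData guarantees via GrowPartitions. A simplified sketch of the pattern (stand-in types, not DuckDB's):

#include <cassert>
#include <mutex>
#include <vector>

struct PartitionedSketch {
	std::mutex lock;
	std::vector<std::vector<int>> partitions;

	void Combine(PartitionedSketch &other) {
		std::lock_guard<std::mutex> guard(lock);
		if (partitions.empty()) {
			// first merge: take the other side's buffers wholesale
			partitions = std::move(other.partitions);
		} else {
			// later merges: both sides must agree on the partition count
			assert(partitions.size() == other.partitions.size());
			for (size_t i = 0; i < other.partitions.size(); i++) {
				auto &src = other.partitions[i];
				partitions[i].insert(partitions[i].end(), src.begin(), src.end());
			}
		}
	}
};

int main() {
	PartitionedSketch a, b, target;
	a.partitions = {{1}, {2}};
	b.partitions = {{3}, {4}};
	target.Combine(a); // moves a's buffers
	target.Combine(b); // merges pairwise
}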
package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp
CHANGED

@@ -1,4 +1,5 @@
 #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
+#include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/vector_size.hpp"
 #include "duckdb/common/string_util.hpp"
 
@@ -59,56 +60,6 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
 	return value.GetValue<int64_t>();
 }
 
-static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
-	vector<bool> result;
-
-	if (set.empty()) {
-		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-	}
-	// list of options: parse the list
-	unordered_map<string, bool> option_map;
-	for (idx_t i = 0; i < set.size(); i++) {
-		option_map[set[i].ToString()] = false;
-	}
-	result.resize(names.size(), false);
-	for (idx_t i = 0; i < names.size(); i++) {
-		auto entry = option_map.find(names[i]);
-		if (entry != option_map.end()) {
-			result[i] = true;
-			entry->second = true;
-		}
-	}
-	for (auto &entry : option_map) {
-		if (!entry.second) {
-			throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
-			                      entry.first.c_str());
-		}
-	}
-	return result;
-}
-
-static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
-	vector<bool> result;
-
-	// Only accept a list of arguments
-	if (value.type().id() != LogicalTypeId::LIST) {
-		// Support a single argument if it's '*'
-		if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
-			result.resize(names.size(), true);
-			return result;
-		}
-		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-	}
-	auto &children = ListValue::GetChildren(value);
-	// accept '*' as single argument
-	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
-	    children[0].GetValue<string>() == "*") {
-		result.resize(names.size(), true);
-		return result;
-	}
-	return ParseColumnList(children, names, loption);
-}
-
 void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
 	this->delimiter = StringUtil::Replace(input, "\\t", "\t");
 	this->has_delimiter = true;
package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp
CHANGED

@@ -1,6 +1,8 @@
 #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
+#include "duckdb/common/hive_partitioning.hpp"
 #include "duckdb/common/file_system.hpp"
+#include "duckdb/common/file_opener.hpp"
 
 #include <algorithm>
 
@@ -15,14 +17,24 @@ public:
 	idx_t rows_copied;
 	idx_t last_file_offset;
 	unique_ptr<GlobalFunctionData> global_state;
+
+	//! shared state for HivePartitionedColumnData
+	shared_ptr<GlobalHivePartitionState> partition_state;
 };
 
 class CopyToFunctionLocalState : public LocalSinkState {
 public:
-	explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state)
+	explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state)
+	    : local_state(std::move(local_state)), writer_offset(0) {
 	}
 	unique_ptr<GlobalFunctionData> global_state;
 	unique_ptr<LocalFunctionData> local_state;
+
+	//! Buffers the tuples in partitions before writing
+	unique_ptr<HivePartitionedColumnData> part_buffer;
+	unique_ptr<PartitionedColumnDataAppendState> part_buffer_append_state;
+
+	idx_t writer_offset;
 };
 
 //===--------------------------------------------------------------------===//
@@ -48,6 +60,11 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
 	auto &g = (CopyToFunctionGlobalState &)gstate;
 	auto &l = (CopyToFunctionLocalState &)lstate;
 
+	if (partition_output) {
+		l.part_buffer->Append(*l.part_buffer_append_state, input);
+		return SinkResultType::NEED_MORE_INPUT;
+	}
+
 	{
 		lock_guard<mutex> glock(g.lock);
 		g.rows_copied += input.size();
@@ -57,13 +74,67 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
 	return SinkResultType::NEED_MORE_INPUT;
 }
 
+static void CreateDir(const string &dir_path, FileSystem &fs) {
+	if (!fs.DirectoryExists(dir_path)) {
+		fs.CreateDirectory(dir_path);
+	}
+}
+
+static string CreateDirRecursive(const vector<idx_t> &cols, const vector<string> &names, const vector<Value> &values,
+                                 string path, FileSystem &fs) {
+	CreateDir(path, fs);
+
+	for (idx_t i = 0; i < cols.size(); i++) {
+		auto partition_col_name = names[cols[i]];
+		auto partition_value = values[i];
+		string p_dir = partition_col_name + "=" + partition_value.ToString();
+		path = fs.JoinPath(path, p_dir);
+		CreateDir(path, fs);
+	}
+
+	return path;
+}
+
 void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const {
 	auto &g = (CopyToFunctionGlobalState &)gstate;
 	auto &l = (CopyToFunctionLocalState &)lstate;
 
+	if (partition_output) {
+		auto &fs = FileSystem::GetFileSystem(context.client);
+		l.part_buffer->FlushAppendState(*l.part_buffer_append_state);
+		auto &partitions = l.part_buffer->GetPartitions();
+		auto partition_key_map = l.part_buffer->GetReverseMap();
+
+		string trimmed_path = file_path;
+		StringUtil::RTrim(trimmed_path, fs.PathSeparator());
+
+		for (idx_t i = 0; i < partitions.size(); i++) {
+			string hive_path =
+			    CreateDirRecursive(partition_columns, names, partition_key_map[i]->values, trimmed_path, fs);
+			string full_path = fs.JoinPath(hive_path, "data_" + to_string(l.writer_offset) + "." + function.extension);
+			if (fs.FileExists(full_path) && !allow_overwrite) {
+				throw IOException("failed to create " + full_path +
+				                  ", file exists! Enable ALLOW_OVERWRITE option to force writing");
+			}
+			// Create a writer for the current file
+			auto fun_data_global = function.copy_to_initialize_global(context.client, *bind_data, full_path);
+			auto fun_data_local = function.copy_to_initialize_local(context, *bind_data);
+
+			for (auto &chunk : partitions[i]->Chunks()) {
+				function.copy_to_sink(context, *bind_data, *fun_data_global, *fun_data_local, chunk);
+			}
+
+			function.copy_to_combine(context, *bind_data, *fun_data_global, *fun_data_local);
+			function.copy_to_finalize(context.client, *bind_data, *fun_data_global);
+		}
+
+		return;
+	}
+
 	if (function.copy_to_combine) {
 		function.copy_to_combine(context, *bind_data, per_thread_output ? *l.global_state : *g.global_state,
 		                         *l.local_state);
+
 		if (per_thread_output) {
 			function.copy_to_finalize(context.client, *bind_data, *l.global_state);
 		}
@@ -73,7 +144,7 @@ void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gst
 SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
                                               GlobalSinkState &gstate_p) const {
 	auto &gstate = (CopyToFunctionGlobalState &)gstate_p;
-	if (per_thread_output) {
+	if (per_thread_output || partition_output) {
 		// already happened in combine
 		return SinkFinalizeType::READY;
 	}
@@ -82,6 +153,7 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
 
 	if (use_tmp_file) {
 		D_ASSERT(!per_thread_output); // FIXME
+		D_ASSERT(!partition_output);  // FIXME
 		MoveTmpFile(context, file_path);
 	}
 }
@@ -89,6 +161,20 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
 }
 
 unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContext &context) const {
+	if (partition_output) {
+		auto state = make_unique<CopyToFunctionLocalState>(nullptr);
+		{
+			auto &g = (CopyToFunctionGlobalState &)*sink_state;
+			lock_guard<mutex> glock(g.lock);
+			state->writer_offset = g.last_file_offset++;
+
+			state->part_buffer = make_unique<HivePartitionedColumnData>(context.client, expected_types,
+			                                                            partition_columns, g.partition_state);
+			state->part_buffer_append_state = make_unique<PartitionedColumnDataAppendState>();
+			state->part_buffer->InitializeAppendState(*state->part_buffer_append_state);
+		}
+		return std::move(state);
+	}
 	auto res = make_unique<CopyToFunctionLocalState>(function.copy_to_initialize_local(context, *bind_data));
 	if (per_thread_output) {
 		idx_t this_file_offset;
@@ -98,9 +184,10 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
 		this_file_offset = g.last_file_offset++;
 		}
 		auto &fs = FileSystem::GetFileSystem(context.client);
 		string output_path =
 		    fs.JoinPath(file_path, StringUtil::Format("out_%llu", this_file_offset) + "." + function.extension);
-		if (fs.FileExists(output_path)) {
-			throw IOException("%s exists", output_path);
+		if (fs.FileExists(output_path) && !allow_overwrite) {
+			throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", output_path);
 		}
 		res->global_state = function.copy_to_initialize_global(context.client, *bind_data, output_path);
 	}
@@ -108,27 +195,35 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
 }
 
 unique_ptr<GlobalSinkState> PhysicalCopyToFile::GetGlobalSinkState(ClientContext &context) const {
-	if (per_thread_output) {
+
+	if (partition_output || per_thread_output) {
 		auto &fs = FileSystem::GetFileSystem(context);
 
-		if (fs.FileExists(file_path)) {
-			throw IOException("%s exists", file_path);
+		if (fs.FileExists(file_path) && !allow_overwrite) {
+			throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", file_path);
 		}
 		if (!fs.DirectoryExists(file_path)) {
 			fs.CreateDirectory(file_path);
-		} else {
+		} else if (!allow_overwrite) {
 			idx_t n_files = 0;
-			fs.ListFiles(file_path, [&n_files](const string &path, bool) { n_files++; });
+			fs.ListFiles(
+			    file_path, [&n_files](const string &path, bool) { n_files++; }, FileOpener::Get(context));
 			if (n_files > 0) {
-				throw IOException("Directory %s is not empty", file_path);
+				throw IOException("Directory %s is not empty! Enable ALLOW_OVERWRITE option to force writing",
				                  file_path);
 			}
 		}
 
-		return make_unique<CopyToFunctionGlobalState>(nullptr);
+		auto state = make_unique<CopyToFunctionGlobalState>(nullptr);
+
+		if (partition_output) {
+			state->partition_state = make_shared<GlobalHivePartitionState>();
+		}
+
+		return std::move(state);
 	}
+
 	return make_unique<CopyToFunctionGlobalState>(function.copy_to_initialize_global(context, *bind_data, file_path));
 }
 
 //===--------------------------------------------------------------------===//
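
Taken together, the partition_output path buffers each thread's rows per partition key, then Combine writes one file per (partition, thread): CreateDirRecursive materializes the key=value directory chain and the file name embeds the thread's writer_offset, so concurrent writers never collide. A sketch of the resulting path layout (plain string handling for illustration; DuckDB goes through its FileSystem API and creates each directory level):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

std::string HivePath(std::string root, const std::vector<std::pair<std::string, std::string>> &parts,
                     size_t writer_offset, const std::string &ext) {
	for (const auto &p : parts) {
		root += "/" + p.first + "=" + p.second; // one directory per partition column
	}
	return root + "/data_" + std::to_string(writer_offset) + "." + ext;
}

int main() {
	std::cout << HivePath("out", {{"year", "2022"}, {"month", "12"}}, 0, "parquet") << '\n';
	// prints: out/year=2022/month=12/data_0.parquet
}

The ALLOW_OVERWRITE checks in the hunks above gate every place an existing file or non-empty directory could be clobbered: the target directory in the global sink state, the per-thread output file, and each partition file.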
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp
CHANGED

@@ -17,7 +17,12 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
 	    make_unique<PhysicalCopyToFile>(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality);
 	copy->file_path = op.file_path;
 	copy->use_tmp_file = op.use_tmp_file;
+	copy->allow_overwrite = op.allow_overwrite;
 	copy->per_thread_output = op.per_thread_output;
+	copy->partition_output = op.partition_output;
+	copy->partition_columns = op.partition_columns;
+	copy->names = op.names;
+	copy->expected_types = op.expected_types;
 	if (op.function.parallel) {
 		copy->parallel = op.function.parallel(context, *copy->bind_data);
 	}
package/src/duckdb/src/function/table/copy_csv.cpp
CHANGED

@@ -3,6 +3,7 @@
 #include "duckdb/common/serializer/buffered_serializer.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/parser/parsed_data/copy_info.hpp"
+#include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/string_util.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/types/string_type.hpp"
@@ -58,13 +59,6 @@ void BaseCSVData::Finalize() {
 	}
 }
 
-static Value ConvertVectorToValue(vector<Value> set) {
-	if (set.empty()) {
-		return Value::EMPTYLIST(LogicalType::BOOLEAN);
-	}
-	return Value::LIST(std::move(set));
-}
-
 static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &info, vector<string> &names,
                                              vector<LogicalType> &sql_types) {
 	auto bind_data = make_unique<WriteCSVData>(info.file_path, sql_types, names);