duckdb 0.6.2-dev1832.0 → 0.6.2-dev1873.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/package.json +1 -1
  2. package/src/duckdb/src/common/bind_helpers.cpp +67 -0
  3. package/src/duckdb/src/common/file_system.cpp +2 -1
  4. package/src/duckdb/src/common/hive_partitioning.cpp +129 -4
  5. package/src/duckdb/src/common/local_file_system.cpp +4 -2
  6. package/src/duckdb/src/common/radix_partitioning.cpp +1 -0
  7. package/src/duckdb/src/common/string_util.cpp +9 -1
  8. package/src/duckdb/src/common/types/data_chunk.cpp +10 -0
  9. package/src/duckdb/src/common/types/partitioned_column_data.cpp +5 -0
  10. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +1 -50
  11. package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +110 -15
  12. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +5 -0
  13. package/src/duckdb/src/function/table/copy_csv.cpp +1 -7
  14. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  15. package/src/duckdb/src/include/duckdb/common/bind_helpers.hpp +21 -0
  16. package/src/duckdb/src/include/duckdb/common/file_system.hpp +3 -1
  17. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +71 -2
  18. package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -1
  19. package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
  20. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +2 -0
  21. package/src/duckdb/src/include/duckdb/common/types/partitioned_column_data.hpp +1 -1
  22. package/src/duckdb/src/include/duckdb/common/virtual_file_system.hpp +3 -2
  23. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +7 -1
  24. package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +6 -0
  25. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +36 -1
  26. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
  27. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +8159 -8028
  28. package/src/duckdb/ub_src_common.cpp +2 -0
package/package.json CHANGED
@@ -2,7 +2,7 @@
    "name": "duckdb",
    "main": "./lib/duckdb.js",
    "types": "./lib/duckdb.d.ts",
-   "version": "0.6.2-dev1832.0",
+   "version": "0.6.2-dev1873.0",
    "description": "DuckDB node.js API",
    "gypfile": true,
    "dependencies": {
package/src/duckdb/src/common/bind_helpers.cpp ADDED
@@ -0,0 +1,67 @@
+ #include "duckdb/common/bind_helpers.hpp"
+ #include "duckdb/common/common.hpp"
+ #include "duckdb/common/types.hpp"
+ #include "duckdb/common/exception.hpp"
+ #include "duckdb/common/types/value.hpp"
+ #include "duckdb/common/case_insensitive_map.hpp"
+
+ namespace duckdb {
+
+ Value ConvertVectorToValue(vector<Value> set) {
+     if (set.empty()) {
+         return Value::EMPTYLIST(LogicalType::BOOLEAN);
+     }
+     return Value::LIST(move(set));
+ }
+
+ vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
+     vector<bool> result;
+
+     if (set.empty()) {
+         throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+     }
+     // list of options: parse the list
+     case_insensitive_map_t<bool> option_map;
+     for (idx_t i = 0; i < set.size(); i++) {
+         option_map[set[i].ToString()] = false;
+     }
+     result.resize(names.size(), false);
+     for (idx_t i = 0; i < names.size(); i++) {
+         auto entry = option_map.find(names[i]);
+         if (entry != option_map.end()) {
+             result[i] = true;
+             entry->second = true;
+         }
+     }
+     for (auto &entry : option_map) {
+         if (!entry.second) {
+             throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
+                                   entry.first.c_str());
+         }
+     }
+     return result;
+ }
+
+ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
+     vector<bool> result;
+
+     // Only accept a list of arguments
+     if (value.type().id() != LogicalTypeId::LIST) {
+         // Support a single argument if it's '*'
+         if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
+             result.resize(names.size(), true);
+             return result;
+         }
+         throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+     }
+     auto &children = ListValue::GetChildren(value);
+     // accept '*' as single argument
+     if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
+         children[0].GetValue<string>() == "*") {
+         result.resize(names.size(), true);
+         return result;
+     }
+     return ParseColumnList(children, names, loption);
+ }
+
+ } // namespace duckdb
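These helpers previously lived as statics in csv_reader_options.cpp (removed below); hoisting them into bind_helpers.cpp lets other COPY options, such as the new PARTITION_BY, reuse the same column-list-or-'*' parsing. The core is a per-column boolean mask over the table's columns, with an error for any requested name that does not exist. A minimal standalone sketch of that mask-building step, assuming plain std types (DuckDB matches case-insensitively via case_insensitive_map_t; this sketch is case-sensitive, and BuildColumnMask is a hypothetical name):

#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Build a per-column mask: mask[i] == true iff table_columns[i] was requested.
// Throws if a requested column does not exist, mirroring the BinderException above.
static std::vector<bool> BuildColumnMask(const std::vector<std::string> &requested,
                                         const std::vector<std::string> &table_columns) {
    std::unordered_map<std::string, bool> seen; // tracks which requested names were found
    for (auto &col : requested) {
        seen[col] = false;
    }
    std::vector<bool> mask(table_columns.size(), false);
    for (std::size_t i = 0; i < table_columns.size(); i++) {
        auto it = seen.find(table_columns[i]);
        if (it != seen.end()) {
            mask[i] = true;
            it->second = true;
        }
    }
    for (auto &entry : seen) {
        if (!entry.second) {
            throw std::runtime_error("column \"" + entry.first + "\" was not found in the table");
        }
    }
    return mask;
}

For example, BuildColumnMask({"b"}, {"a", "b", "c"}) yields {false, true, false}.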
package/src/duckdb/src/common/file_system.cpp CHANGED
@@ -282,7 +282,8 @@ void FileSystem::RemoveDirectory(const string &directory) {
      throw NotImplementedException("%s: RemoveDirectory is not implemented!", GetName());
  }
 
- bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+ bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                            FileOpener *opener) {
      throw NotImplementedException("%s: ListFiles is not implemented!", GetName());
  }
 
package/src/duckdb/src/common/hive_partitioning.cpp CHANGED
@@ -6,8 +6,6 @@
  #include "duckdb/planner/expression_iterator.hpp"
  #include "re2/re2.h"
 
- #include <iostream>
-
  namespace duckdb {
 
  static unordered_map<column_t, string> GetKnownColumnValues(string &filename,
@@ -88,6 +86,7 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
                                                unordered_map<string, column_t> &column_map, idx_t table_index,
                                                bool hive_enabled, bool filename_enabled) {
      vector<string> pruned_files;
+     vector<bool> have_preserved_filter(filters.size(), false);
      vector<unique_ptr<Expression>> pruned_filters;
      duckdb_re2::RE2 regex(REGEX_STRING);
 
@@ -101,15 +100,21 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
          auto known_values = GetKnownColumnValues(file, column_map, regex, filename_enabled, hive_enabled);
 
          FilterCombiner combiner(context);
-         for (auto &filter : filters) {
+
+         for (idx_t j = 0; j < filters.size(); j++) {
+             auto &filter = filters[j];
              unique_ptr<Expression> filter_copy = filter->Copy();
              ConvertKnownColRefToConstants(filter_copy, known_values, table_index);
              // Evaluate the filter, if it can be evaluated here, we can not prune this filter
              Value result_value;
+
              if (!filter_copy->IsScalar() || !filter_copy->IsFoldable() ||
                  !ExpressionExecutor::TryEvaluateScalar(context, *filter_copy, result_value)) {
                  // can not be evaluated only with the filename/hive columns added, we can not prune this filter
-                 pruned_filters.emplace_back(filter->Copy());
+                 if (!have_preserved_filter[j]) {
+                     pruned_filters.emplace_back(filter->Copy());
+                     have_preserved_filter[j] = true;
+                 }
              } else if (!result_value.GetValue<bool>()) {
                  // filter evaluates to false
                  should_prune_file = true;
@@ -126,8 +131,128 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
          }
      }
 
+     D_ASSERT(filters.size() >= pruned_filters.size());
+
      filters = std::move(pruned_filters);
      files = std::move(pruned_files);
  }
 
+ HivePartitionedColumnData::HivePartitionedColumnData(const HivePartitionedColumnData &other)
+     : PartitionedColumnData(other) {
+     // Synchronize to ensure consistency of shared partition map
+     if (other.global_state) {
+         global_state = other.global_state;
+         unique_lock<mutex> lck(global_state->lock);
+         SynchronizeLocalMap();
+     }
+ }
+
+ void HivePartitionedColumnData::ComputePartitionIndices(PartitionedColumnDataAppendState &state, DataChunk &input) {
+     Vector hashes(LogicalType::HASH, input.size());
+     input.Hash(group_by_columns, hashes);
+
+     for (idx_t i = 0; i < input.size(); i++) {
+         HivePartitionKey key;
+         key.hash = FlatVector::GetData<hash_t>(hashes)[i];
+         for (auto &col : group_by_columns) {
+             key.values.emplace_back(input.GetValue(col, i));
+         }
+
+         auto lookup = local_partition_map.find(key);
+         const auto partition_indices = FlatVector::GetData<idx_t>(state.partition_indices);
+         if (lookup == local_partition_map.end()) {
+             idx_t new_partition_id = RegisterNewPartition(key, state);
+             partition_indices[i] = new_partition_id;
+         } else {
+             partition_indices[i] = lookup->second;
+         }
+     }
+ }
+
+ std::map<idx_t, const HivePartitionKey *> HivePartitionedColumnData::GetReverseMap() {
+     std::map<idx_t, const HivePartitionKey *> ret;
+     for (const auto &pair : local_partition_map) {
+         ret[pair.second] = &(pair.first);
+     }
+     return ret;
+ }
+
+ void HivePartitionedColumnData::GrowAllocators() {
+     unique_lock<mutex> lck_gstate(allocators->lock);
+
+     idx_t current_allocator_size = allocators->allocators.size();
+     idx_t required_allocators = local_partition_map.size();
+
+     allocators->allocators.reserve(current_allocator_size);
+     for (idx_t i = current_allocator_size; i < required_allocators; i++) {
+         CreateAllocator();
+     }
+
+     D_ASSERT(allocators->allocators.size() == local_partition_map.size());
+ }
+
+ void HivePartitionedColumnData::GrowAppendState(PartitionedColumnDataAppendState &state) {
+     idx_t current_append_state_size = state.partition_append_states.size();
+     idx_t required_append_state_size = local_partition_map.size();
+
+     for (idx_t i = current_append_state_size; i < required_append_state_size; i++) {
+         state.partition_append_states.emplace_back(make_unique<ColumnDataAppendState>());
+         state.partition_buffers.emplace_back(CreatePartitionBuffer());
+     }
+ }
+
+ void HivePartitionedColumnData::GrowPartitions(PartitionedColumnDataAppendState &state) {
+     idx_t current_partitions = partitions.size();
+     idx_t required_partitions = local_partition_map.size();
+
+     D_ASSERT(allocators->allocators.size() == required_partitions);
+
+     for (idx_t i = current_partitions; i < required_partitions; i++) {
+         partitions.emplace_back(CreatePartitionCollection(i));
+         partitions[i]->InitializeAppend(*state.partition_append_states[i]);
+     }
+     D_ASSERT(partitions.size() == local_partition_map.size());
+ }
+
+ void HivePartitionedColumnData::SynchronizeLocalMap() {
+     // Synchronise global map into local, may contain changes from other threads too
+     for (auto it = global_state->partitions.begin() + local_partition_map.size(); it < global_state->partitions.end();
+          it++) {
+         local_partition_map[(*it)->first] = (*it)->second;
+     }
+ }
+
+ idx_t HivePartitionedColumnData::RegisterNewPartition(HivePartitionKey key, PartitionedColumnDataAppendState &state) {
+     if (global_state) {
+         idx_t partition_id;
+
+         // Synchronize global state with our local state with the newly discovered partition
+         {
+             unique_lock<mutex> lck_gstate(global_state->lock);
+
+             // Insert into global map, or return partition if already present
+             auto res =
+                 global_state->partition_map.emplace(std::make_pair(std::move(key), global_state->partition_map.size()));
+             auto it = res.first;
+             partition_id = it->second;
+
+             // Add iterator to vector to allow incrementally updating local states from global state
+             global_state->partitions.emplace_back(it);
+             SynchronizeLocalMap();
+         }
+
+         // After synchronizing with the global state, we need to grow the shared allocators to support
+         // the number of partitions, which guarantees that there's always enough allocators available to each thread
+         GrowAllocators();
+
+         // Grow local partition data
+         GrowAppendState(state);
+         GrowPartitions(state);
+
+         return partition_id;
+     } else {
+         return local_partition_map.emplace(std::make_pair(std::move(key), local_partition_map.size())).first->second;
+     }
+ }
+
  } // namespace duckdb
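RegisterNewPartition above relies on a map-emplace idiom: emplacing key -> map.size() only takes effect when the key is new, and either way .first->second yields the canonical partition id, so every thread that discovers the same key (under the global lock) agrees on one id. A small self-contained illustration, with hypothetical string keys standing in for HivePartitionKey:

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

int main() {
    std::map<std::string, std::size_t> partition_map; // key -> partition id

    // Insert-or-get: emplace is a no-op when the key already exists,
    // so the id assigned on first discovery is returned ever after.
    auto get_partition_id = [&](const std::string &key) {
        return partition_map.emplace(key, partition_map.size()).first->second;
    };

    std::cout << get_partition_id("year=2022") << '\n'; // 0 (newly registered)
    std::cout << get_partition_id("year=2023") << '\n'; // 1 (newly registered)
    std::cout << get_partition_id("year=2022") << '\n'; // 0 (already known)
}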
package/src/duckdb/src/common/local_file_system.cpp CHANGED
@@ -407,7 +407,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
      }
  }
 
- bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+ bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                 FileOpener *opener) {
      if (!DirectoryExists(directory)) {
          return false;
      }
@@ -734,7 +735,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
      }
  }
 
- bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+ bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                 FileOpener *opener) {
      string search_dir = JoinPath(directory, "*");
 
      auto unicode_path = WindowsUtil::UTF8ToUnicode(search_dir.c_str());
package/src/duckdb/src/common/radix_partitioning.cpp CHANGED
@@ -435,6 +435,7 @@ RadixPartitionedColumnData::RadixPartitionedColumnData(ClientContext &context_p,
 
  RadixPartitionedColumnData::RadixPartitionedColumnData(const RadixPartitionedColumnData &other)
      : PartitionedColumnData(other), radix_bits(other.radix_bits), hash_col_idx(other.hash_col_idx) {
+
      for (idx_t i = 0; i < RadixPartitioning::NumberOfPartitions(radix_bits); i++) {
          partitions.emplace_back(CreatePartitionCollection(i));
      }
package/src/duckdb/src/common/string_util.cpp CHANGED
@@ -1,7 +1,8 @@
  #include "duckdb/common/string_util.hpp"
+
+ #include "duckdb/common/exception.hpp"
  #include "duckdb/common/pair.hpp"
  #include "duckdb/common/to_string.hpp"
- #include "duckdb/common/exception.hpp"
 
  #include <algorithm>
  #include <cctype>
@@ -31,6 +32,13 @@ void StringUtil::RTrim(string &str) {
                str.end());
  }
 
+ void StringUtil::RTrim(string &str, const string &chars_to_trim) {
+     str.erase(find_if(str.rbegin(), str.rend(),
+                       [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(ch) == string::npos; })
+                   .base(),
+               str.end());
+ }
+
  void StringUtil::Trim(string &str) {
      StringUtil::LTrim(str);
      StringUtil::RTrim(str);
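The new overload trims an arbitrary trailing character set rather than just whitespace; physical_copy_to_file.cpp below uses it to strip trailing path separators from the COPY target before appending partition directories. A self-contained sketch of the same reverse-iterator erase idiom (DuckDB's version also requires ch > 0, so it additionally trims trailing non-ASCII bytes; this sketch trims only the given characters):

#include <algorithm>
#include <cassert>
#include <string>

// Erase trailing characters contained in chars_to_trim. find_if scans from the
// back for the first character to keep; .base() converts that reverse iterator
// to the forward iterator just past it, and everything from there is erased.
static void RTrimChars(std::string &str, const std::string &chars_to_trim) {
    str.erase(std::find_if(str.rbegin(), str.rend(),
                           [&](unsigned char ch) { return chars_to_trim.find(ch) == std::string::npos; })
                  .base(),
              str.end());
}

int main() {
    std::string path = "/tmp/output///";
    RTrimChars(path, "/");
    assert(path == "/tmp/output");
}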
package/src/duckdb/src/common/types/data_chunk.cpp CHANGED
@@ -307,6 +307,16 @@ void DataChunk::Hash(Vector &result) {
      }
  }
 
+ void DataChunk::Hash(vector<idx_t> &column_ids, Vector &result) {
+     D_ASSERT(result.GetType().id() == LogicalType::HASH);
+     D_ASSERT(column_ids.size() > 0);
+
+     VectorOperations::Hash(data[column_ids[0]], result, size());
+     for (idx_t i = 1; i < column_ids.size(); i++) {
+         VectorOperations::CombineHash(result, data[column_ids[i]], size());
+     }
+ }
+
  void DataChunk::Verify() {
  #ifdef DEBUG
      D_ASSERT(size() <= capacity);
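The overload above hashes only the requested key columns: the first column seeds the result vector, and each further column is folded in with VectorOperations::CombineHash. A row-wise scalar sketch of the same seed-then-combine pattern (the mixer below is a generic boost-style hash combine, not DuckDB's actual CombineHash):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Generic hash combine; stands in for VectorOperations::CombineHash.
static std::uint64_t CombineHash(std::uint64_t seed, std::uint64_t value) {
    return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

// Hash one row's key columns: seed with the first, combine in the rest.
static std::uint64_t HashRow(const std::vector<std::string> &key_values) {
    std::uint64_t result = std::hash<std::string> {}(key_values[0]);
    for (std::size_t i = 1; i < key_values.size(); i++) {
        result = CombineHash(result, std::hash<std::string> {}(key_values[i]));
    }
    return result;
}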
package/src/duckdb/src/common/types/partitioned_column_data.cpp CHANGED
@@ -1,6 +1,7 @@
  #include "duckdb/common/types/partitioned_column_data.hpp"
 
  #include "duckdb/common/radix_partitioning.hpp"
+ #include "duckdb/common/hive_partitioning.hpp"
  #include "duckdb/storage/buffer_manager.hpp"
 
  namespace duckdb {
@@ -18,6 +19,8 @@ unique_ptr<PartitionedColumnData> PartitionedColumnData::CreateShared() {
      switch (type) {
      case PartitionedColumnDataType::RADIX:
          return make_unique<RadixPartitionedColumnData>((RadixPartitionedColumnData &)*this);
+     case PartitionedColumnDataType::HIVE:
+         return make_unique<HivePartitionedColumnData>((HivePartitionedColumnData &)*this);
      default:
          throw NotImplementedException("CreateShared for this type of PartitionedColumnData");
      }
@@ -141,10 +144,12 @@ void PartitionedColumnData::FlushAppendState(PartitionedColumnDataAppendState &s
  void PartitionedColumnData::Combine(PartitionedColumnData &other) {
      // Now combine the state's partitions into this
      lock_guard<mutex> guard(lock);
+
      if (partitions.empty()) {
          // This is the first merge, we just copy them over
          partitions = std::move(other.partitions);
      } else {
+         D_ASSERT(partitions.size() == other.partitions.size());
          // Combine the append state's partitions into this PartitionedColumnData
          for (idx_t i = 0; i < other.partitions.size(); i++) {
              partitions[i]->Combine(*other.partitions[i]);
package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp CHANGED
@@ -1,4 +1,5 @@
  #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
+ #include "duckdb/common/bind_helpers.hpp"
  #include "duckdb/common/vector_size.hpp"
  #include "duckdb/common/string_util.hpp"
 
@@ -59,56 +60,6 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
      return value.GetValue<int64_t>();
  }
 
- static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
-     vector<bool> result;
-
-     if (set.empty()) {
-         throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-     }
-     // list of options: parse the list
-     unordered_map<string, bool> option_map;
-     for (idx_t i = 0; i < set.size(); i++) {
-         option_map[set[i].ToString()] = false;
-     }
-     result.resize(names.size(), false);
-     for (idx_t i = 0; i < names.size(); i++) {
-         auto entry = option_map.find(names[i]);
-         if (entry != option_map.end()) {
-             result[i] = true;
-             entry->second = true;
-         }
-     }
-     for (auto &entry : option_map) {
-         if (!entry.second) {
-             throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
-                                   entry.first.c_str());
-         }
-     }
-     return result;
- }
-
- static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
-     vector<bool> result;
-
-     // Only accept a list of arguments
-     if (value.type().id() != LogicalTypeId::LIST) {
-         // Support a single argument if it's '*'
-         if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
-             result.resize(names.size(), true);
-             return result;
-         }
-         throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-     }
-     auto &children = ListValue::GetChildren(value);
-     // accept '*' as single argument
-     if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
-         children[0].GetValue<string>() == "*") {
-         result.resize(names.size(), true);
-         return result;
-     }
-     return ParseColumnList(children, names, loption);
- }
-
  void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
      this->delimiter = StringUtil::Replace(input, "\\t", "\t");
      this->has_delimiter = true;
package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp CHANGED
@@ -1,6 +1,8 @@
  #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
  #include "duckdb/common/vector_operations/vector_operations.hpp"
+ #include "duckdb/common/hive_partitioning.hpp"
  #include "duckdb/common/file_system.hpp"
+ #include "duckdb/common/file_opener.hpp"
 
  #include <algorithm>
 
@@ -15,14 +17,24 @@ public:
      idx_t rows_copied;
      idx_t last_file_offset;
      unique_ptr<GlobalFunctionData> global_state;
+
+     //! shared state for HivePartitionedColumnData
+     shared_ptr<GlobalHivePartitionState> partition_state;
  };
 
  class CopyToFunctionLocalState : public LocalSinkState {
  public:
-     explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state) : local_state(std::move(local_state)) {
+     explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state)
+         : local_state(std::move(local_state)), writer_offset(0) {
      }
      unique_ptr<GlobalFunctionData> global_state;
      unique_ptr<LocalFunctionData> local_state;
+
+     //! Buffers the tuples in partitions before writing
+     unique_ptr<HivePartitionedColumnData> part_buffer;
+     unique_ptr<PartitionedColumnDataAppendState> part_buffer_append_state;
+
+     idx_t writer_offset;
  };
 
  //===--------------------------------------------------------------------===//
@@ -48,6 +60,11 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
      auto &g = (CopyToFunctionGlobalState &)gstate;
      auto &l = (CopyToFunctionLocalState &)lstate;
 
+     if (partition_output) {
+         l.part_buffer->Append(*l.part_buffer_append_state, input);
+         return SinkResultType::NEED_MORE_INPUT;
+     }
+
      {
          lock_guard<mutex> glock(g.lock);
          g.rows_copied += input.size();
@@ -57,13 +74,67 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
      return SinkResultType::NEED_MORE_INPUT;
  }
 
+ static void CreateDir(const string &dir_path, FileSystem &fs) {
+     if (!fs.DirectoryExists(dir_path)) {
+         fs.CreateDirectory(dir_path);
+     }
+ }
+
+ static string CreateDirRecursive(const vector<idx_t> &cols, const vector<string> &names, const vector<Value> &values,
+                                  string path, FileSystem &fs) {
+     CreateDir(path, fs);
+
+     for (idx_t i = 0; i < cols.size(); i++) {
+         auto partition_col_name = names[cols[i]];
+         auto partition_value = values[i];
+         string p_dir = partition_col_name + "=" + partition_value.ToString();
+         path = fs.JoinPath(path, p_dir);
+         CreateDir(path, fs);
+     }
+
+     return path;
+ }
+
  void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const {
      auto &g = (CopyToFunctionGlobalState &)gstate;
      auto &l = (CopyToFunctionLocalState &)lstate;
 
+     if (partition_output) {
+         auto &fs = FileSystem::GetFileSystem(context.client);
+         l.part_buffer->FlushAppendState(*l.part_buffer_append_state);
+         auto &partitions = l.part_buffer->GetPartitions();
+         auto partition_key_map = l.part_buffer->GetReverseMap();
+
+         string trimmed_path = file_path;
+         StringUtil::RTrim(trimmed_path, fs.PathSeparator());
+
+         for (idx_t i = 0; i < partitions.size(); i++) {
+             string hive_path =
+                 CreateDirRecursive(partition_columns, names, partition_key_map[i]->values, trimmed_path, fs);
+             string full_path = fs.JoinPath(hive_path, "data_" + to_string(l.writer_offset) + "." + function.extension);
+             if (fs.FileExists(full_path) && !allow_overwrite) {
+                 throw IOException("failed to create " + full_path +
+                                   ", file exists! Enable ALLOW_OVERWRITE option to force writing");
+             }
+             // Create a writer for the current file
+             auto fun_data_global = function.copy_to_initialize_global(context.client, *bind_data, full_path);
+             auto fun_data_local = function.copy_to_initialize_local(context, *bind_data);
+
+             for (auto &chunk : partitions[i]->Chunks()) {
+                 function.copy_to_sink(context, *bind_data, *fun_data_global, *fun_data_local, chunk);
+             }
+
+             function.copy_to_combine(context, *bind_data, *fun_data_global, *fun_data_local);
+             function.copy_to_finalize(context.client, *bind_data, *fun_data_global);
+         }
+
+         return;
+     }
+
      if (function.copy_to_combine) {
          function.copy_to_combine(context, *bind_data, per_thread_output ? *l.global_state : *g.global_state,
                                   *l.local_state);
+
          if (per_thread_output) {
              function.copy_to_finalize(context.client, *bind_data, *l.global_state);
          }
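For PARTITION_BY, Combine above is where the buffered partitions are actually written: each thread flushes its HivePartitionedColumnData, creates one <column>=<value> directory level per partition column, and writes its rows as data_<writer_offset>.<extension> inside the leaf directory. A sketch of just the path construction (no directory creation, hypothetical inputs):

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Build <base>/<col>=<value>/.../data_<writer_offset>.<extension>,
// mirroring CreateDirRecursive plus the JoinPath for the data file above.
static std::string HivePath(std::string base,
                            const std::vector<std::pair<std::string, std::string>> &partition_key,
                            std::size_t writer_offset, const std::string &extension) {
    for (auto &kv : partition_key) {
        base += "/" + kv.first + "=" + kv.second;
    }
    return base + "/data_" + std::to_string(writer_offset) + "." + extension;
}

int main() {
    // Prints: orders/year=2022/month=11/data_3.csv
    std::cout << HivePath("orders", {{"year", "2022"}, {"month", "11"}}, 3, "csv") << '\n';
}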
@@ -73,7 +144,7 @@ void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gst
  SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
                                                GlobalSinkState &gstate_p) const {
      auto &gstate = (CopyToFunctionGlobalState &)gstate_p;
-     if (per_thread_output) {
+     if (per_thread_output || partition_output) {
          // already happened in combine
          return SinkFinalizeType::READY;
      }
@@ -82,6 +153,7 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
 
      if (use_tmp_file) {
          D_ASSERT(!per_thread_output); // FIXME
+         D_ASSERT(!partition_output);  // FIXME
          MoveTmpFile(context, file_path);
      }
  }
@@ -89,6 +161,20 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
  }
 
  unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContext &context) const {
+     if (partition_output) {
+         auto state = make_unique<CopyToFunctionLocalState>(nullptr);
+         {
+             auto &g = (CopyToFunctionGlobalState &)*sink_state;
+             lock_guard<mutex> glock(g.lock);
+             state->writer_offset = g.last_file_offset++;
+
+             state->part_buffer = make_unique<HivePartitionedColumnData>(context.client, expected_types,
+                                                                         partition_columns, g.partition_state);
+             state->part_buffer_append_state = make_unique<PartitionedColumnDataAppendState>();
+             state->part_buffer->InitializeAppendState(*state->part_buffer_append_state);
+         }
+         return std::move(state);
+     }
      auto res = make_unique<CopyToFunctionLocalState>(function.copy_to_initialize_local(context, *bind_data));
      if (per_thread_output) {
          idx_t this_file_offset;
@@ -98,9 +184,10 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
          this_file_offset = g.last_file_offset++;
      }
      auto &fs = FileSystem::GetFileSystem(context.client);
-     string output_path = fs.JoinPath(file_path, StringUtil::Format("out_%llu", this_file_offset));
-     if (fs.FileExists(output_path)) {
-         throw IOException("%s exists", output_path);
+     string output_path =
+         fs.JoinPath(file_path, StringUtil::Format("out_%llu", this_file_offset) + "." + function.extension);
+     if (fs.FileExists(output_path) && !allow_overwrite) {
+         throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", output_path);
      }
      res->global_state = function.copy_to_initialize_global(context.client, *bind_data, output_path);
  }
@@ -108,27 +195,35 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
  }
 
  unique_ptr<GlobalSinkState> PhysicalCopyToFile::GetGlobalSinkState(ClientContext &context) const {
-     if (per_thread_output) {
+
+     if (partition_output || per_thread_output) {
          auto &fs = FileSystem::GetFileSystem(context);
 
-         if (fs.FileExists(file_path)) {
-             throw IOException("%s exists", file_path);
+         if (fs.FileExists(file_path) && !allow_overwrite) {
+             throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", file_path);
          }
          if (!fs.DirectoryExists(file_path)) {
              fs.CreateDirectory(file_path);
-         } else {
+         } else if (!allow_overwrite) {
              idx_t n_files = 0;
-             fs.ListFiles(file_path, [&n_files](const string &path, bool) { n_files++; });
+             fs.ListFiles(
+                 file_path, [&n_files](const string &path, bool) { n_files++; }, FileOpener::Get(context));
              if (n_files > 0) {
-                 throw IOException("Directory %s is not empty", file_path);
+                 throw IOException("Directory %s is not empty! Enable ALLOW_OVERWRITE option to force writing",
+                                   file_path);
              }
          }
 
-         return make_unique<CopyToFunctionGlobalState>(nullptr);
-     } else {
-         return make_unique<CopyToFunctionGlobalState>(
-             function.copy_to_initialize_global(context, *bind_data, file_path));
+         auto state = make_unique<CopyToFunctionGlobalState>(nullptr);
+
+         if (partition_output) {
+             state->partition_state = make_shared<GlobalHivePartitionState>();
+         }
+
+         return std::move(state);
      }
+
+     return make_unique<CopyToFunctionGlobalState>(function.copy_to_initialize_global(context, *bind_data, file_path));
  }
 
  //===--------------------------------------------------------------------===//
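GetGlobalSinkState now treats the COPY target as a directory for both PER_THREAD_OUTPUT and PARTITION_BY: an existing regular file at the target is an error, a missing directory is created, and a non-empty directory is rejected unless ALLOW_OVERWRITE is set. A sketch of the same validation using std::filesystem (C++17) in place of DuckDB's FileSystem abstraction:

#include <filesystem>
#include <stdexcept>
#include <string>

// Validate/prepare the output directory the way the code above does,
// minus the FileOpener plumbing that DuckDB threads through ListFiles.
static void PrepareOutputDirectory(const std::string &path, bool allow_overwrite) {
    namespace fs = std::filesystem;
    if (fs::is_regular_file(path) && !allow_overwrite) {
        throw std::runtime_error(path + " exists! Enable ALLOW_OVERWRITE option to force writing");
    }
    if (!fs::is_directory(path)) {
        fs::create_directory(path);
    } else if (!allow_overwrite && !fs::is_empty(path)) {
        throw std::runtime_error("Directory " + path +
                                 " is not empty! Enable ALLOW_OVERWRITE option to force writing");
    }
}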
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp CHANGED
@@ -17,7 +17,12 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
          make_unique<PhysicalCopyToFile>(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality);
      copy->file_path = op.file_path;
      copy->use_tmp_file = op.use_tmp_file;
+     copy->allow_overwrite = op.allow_overwrite;
      copy->per_thread_output = op.per_thread_output;
+     copy->partition_output = op.partition_output;
+     copy->partition_columns = op.partition_columns;
+     copy->names = op.names;
+     copy->expected_types = op.expected_types;
      if (op.function.parallel) {
          copy->parallel = op.function.parallel(context, *copy->bind_data);
      }
package/src/duckdb/src/function/table/copy_csv.cpp CHANGED
@@ -3,6 +3,7 @@
  #include "duckdb/common/serializer/buffered_serializer.hpp"
  #include "duckdb/function/copy_function.hpp"
  #include "duckdb/parser/parsed_data/copy_info.hpp"
+ #include "duckdb/common/bind_helpers.hpp"
  #include "duckdb/common/string_util.hpp"
  #include "duckdb/common/file_system.hpp"
  #include "duckdb/common/types/string_type.hpp"
@@ -58,13 +59,6 @@ void BaseCSVData::Finalize() {
      }
  }
 
- static Value ConvertVectorToValue(vector<Value> set) {
-     if (set.empty()) {
-         return Value::EMPTYLIST(LogicalType::BOOLEAN);
-     }
-     return Value::LIST(std::move(set));
- }
-
  static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &info, vector<string> &names,
                                               vector<LogicalType> &sql_types) {
      auto bind_data = make_unique<WriteCSVData>(info.file_path, sql_types, names);