duckdb 0.6.2-dev1832.0 → 0.6.2-dev1873.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/bind_helpers.cpp +67 -0
- package/src/duckdb/src/common/file_system.cpp +2 -1
- package/src/duckdb/src/common/hive_partitioning.cpp +129 -4
- package/src/duckdb/src/common/local_file_system.cpp +4 -2
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -0
- package/src/duckdb/src/common/string_util.cpp +9 -1
- package/src/duckdb/src/common/types/data_chunk.cpp +10 -0
- package/src/duckdb/src/common/types/partitioned_column_data.cpp +5 -0
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +1 -50
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +110 -15
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +5 -0
- package/src/duckdb/src/function/table/copy_csv.cpp +1 -7
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/bind_helpers.hpp +21 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +3 -1
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +71 -2
- package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/partitioned_column_data.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/virtual_file_system.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +7 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +6 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +36 -1
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +8 -0
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +8159 -8028
- package/src/duckdb/ub_src_common.cpp +2 -0
package/package.json
CHANGED

package/src/duckdb/src/common/bind_helpers.cpp
ADDED

@@ -0,0 +1,67 @@
+#include "duckdb/common/bind_helpers.hpp"
+#include "duckdb/common/common.hpp"
+#include "duckdb/common/types.hpp"
+#include "duckdb/common/exception.hpp"
+#include "duckdb/common/types/value.hpp"
+#include "duckdb/common/case_insensitive_map.hpp"
+
+namespace duckdb {
+
+Value ConvertVectorToValue(vector<Value> set) {
+	if (set.empty()) {
+		return Value::EMPTYLIST(LogicalType::BOOLEAN);
+	}
+	return Value::LIST(move(set));
+}
+
+vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
+	vector<bool> result;
+
+	if (set.empty()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+	}
+	// list of options: parse the list
+	case_insensitive_map_t<bool> option_map;
+	for (idx_t i = 0; i < set.size(); i++) {
+		option_map[set[i].ToString()] = false;
+	}
+	result.resize(names.size(), false);
+	for (idx_t i = 0; i < names.size(); i++) {
+		auto entry = option_map.find(names[i]);
+		if (entry != option_map.end()) {
+			result[i] = true;
+			entry->second = true;
+		}
+	}
+	for (auto &entry : option_map) {
+		if (!entry.second) {
+			throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
+			                      entry.first.c_str());
+		}
+	}
+	return result;
+}
+
+vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
+	vector<bool> result;
+
+	// Only accept a list of arguments
+	if (value.type().id() != LogicalTypeId::LIST) {
+		// Support a single argument if it's '*'
+		if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
+			result.resize(names.size(), true);
+			return result;
+		}
+		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
+	}
+	auto &children = ListValue::GetChildren(value);
+	// accept '*' as single argument
+	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
+	    children[0].GetValue<string>() == "*") {
+		result.resize(names.size(), true);
+		return result;
+	}
+	return ParseColumnList(children, names, loption);
+}
+
+} // namespace duckdb
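
For context: the helpers added above back the new COPY options. ConvertVectorToValue wraps a parsed option list in a LIST Value (an empty BOOLEAN list when no arguments were given), and ParseColumnList maps an option list onto the table's columns, accepting either the literal '*' or an explicit, case-insensitive column list, and rejecting names that do not exist in the table. A minimal standalone sketch of the same matching semantics (std-library only, case-sensitive for brevity — illustrative, not DuckDB code):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// One flag per table column: true if that column was named in the option
// list. "*" selects every column; naming an unknown column is an error.
std::vector<bool> ParseColumnListSketch(const std::vector<std::string> &options,
                                        const std::vector<std::string> &names) {
	if (options.size() == 1 && options[0] == "*") {
		return std::vector<bool>(names.size(), true);
	}
	std::map<std::string, bool> matched; // option name -> seen in table?
	for (const auto &opt : options) {
		matched[opt] = false;
	}
	std::vector<bool> result(names.size(), false);
	for (size_t i = 0; i < names.size(); i++) {
		auto it = matched.find(names[i]);
		if (it != matched.end()) {
			result[i] = true;
			it->second = true;
		}
	}
	for (const auto &entry : matched) {
		if (!entry.second) {
			throw std::invalid_argument("column not found: " + entry.first);
		}
	}
	return result;
}

int main() {
	auto flags = ParseColumnListSketch({"b"}, {"a", "b", "c"});
	for (bool f : flags) {
		std::cout << f << ' '; // prints: 0 1 0
	}
	std::cout << '\n';
}

The real helper additionally uses DuckDB's case_insensitive_map_t, so FORCE_QUOTE (name_a) matches a column declared as NAME_A.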
package/src/duckdb/src/common/file_system.cpp
CHANGED

@@ -282,7 +282,8 @@ void FileSystem::RemoveDirectory(const string &directory) {
 	throw NotImplementedException("%s: RemoveDirectory is not implemented!", GetName());
 }
 
-bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool FileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                           FileOpener *opener) {
 	throw NotImplementedException("%s: ListFiles is not implemented!", GetName());
 }
 
package/src/duckdb/src/common/hive_partitioning.cpp
CHANGED

@@ -6,8 +6,6 @@
 #include "duckdb/planner/expression_iterator.hpp"
 #include "re2/re2.h"
 
-#include <iostream>
-
 namespace duckdb {
 
 static unordered_map<column_t, string> GetKnownColumnValues(string &filename,
@@ -88,6 +86,7 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
                                               unordered_map<string, column_t> &column_map, idx_t table_index,
                                               bool hive_enabled, bool filename_enabled) {
 	vector<string> pruned_files;
+	vector<bool> have_preserved_filter(filters.size(), false);
 	vector<unique_ptr<Expression>> pruned_filters;
 	duckdb_re2::RE2 regex(REGEX_STRING);
 
@@ -101,15 +100,21 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
 		auto known_values = GetKnownColumnValues(file, column_map, regex, filename_enabled, hive_enabled);
 
 		FilterCombiner combiner(context);
-		for (auto &filter : filters) {
+
+		for (idx_t j = 0; j < filters.size(); j++) {
+			auto &filter = filters[j];
 			unique_ptr<Expression> filter_copy = filter->Copy();
 			ConvertKnownColRefToConstants(filter_copy, known_values, table_index);
 			// Evaluate the filter, if it can be evaluated here, we can not prune this filter
 			Value result_value;
+
 			if (!filter_copy->IsScalar() || !filter_copy->IsFoldable() ||
 			    !ExpressionExecutor::TryEvaluateScalar(context, *filter_copy, result_value)) {
 				// can not be evaluated only with the filename/hive columns added, we can not prune this filter
-				pruned_filters.emplace_back(filter->Copy());
+				if (!have_preserved_filter[j]) {
+					pruned_filters.emplace_back(filter->Copy());
+					have_preserved_filter[j] = true;
+				}
 			} else if (!result_value.GetValue<bool>()) {
 				// filter evaluates to false
 				should_prune_file = true;
@@ -126,8 +131,128 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
 		}
 	}
 
+	D_ASSERT(filters.size() >= pruned_filters.size());
+
 	filters = std::move(pruned_filters);
 	files = std::move(pruned_files);
 }
 
+HivePartitionedColumnData::HivePartitionedColumnData(const HivePartitionedColumnData &other)
+    : PartitionedColumnData(other) {
+	// Synchronize to ensure consistency of shared partition map
+	if (other.global_state) {
+		global_state = other.global_state;
+		unique_lock<mutex> lck(global_state->lock);
+		SynchronizeLocalMap();
+	}
+}
+
+void HivePartitionedColumnData::ComputePartitionIndices(PartitionedColumnDataAppendState &state, DataChunk &input) {
+	Vector hashes(LogicalType::HASH, input.size());
+	input.Hash(group_by_columns, hashes);
+
+	for (idx_t i = 0; i < input.size(); i++) {
+		HivePartitionKey key;
+		key.hash = FlatVector::GetData<hash_t>(hashes)[i];
+		for (auto &col : group_by_columns) {
+			key.values.emplace_back(input.GetValue(col, i));
+		}
+
+		auto lookup = local_partition_map.find(key);
+		const auto partition_indices = FlatVector::GetData<idx_t>(state.partition_indices);
+		if (lookup == local_partition_map.end()) {
+			idx_t new_partition_id = RegisterNewPartition(key, state);
+			partition_indices[i] = new_partition_id;
+		} else {
+			partition_indices[i] = lookup->second;
+		}
+	}
+}
+
+std::map<idx_t, const HivePartitionKey *> HivePartitionedColumnData::GetReverseMap() {
+	std::map<idx_t, const HivePartitionKey *> ret;
+	for (const auto &pair : local_partition_map) {
+		ret[pair.second] = &(pair.first);
+	}
+	return ret;
+}
+
+void HivePartitionedColumnData::GrowAllocators() {
+	unique_lock<mutex> lck_gstate(allocators->lock);
+
+	idx_t current_allocator_size = allocators->allocators.size();
+	idx_t required_allocators = local_partition_map.size();
+
+	allocators->allocators.reserve(current_allocator_size);
+	for (idx_t i = current_allocator_size; i < required_allocators; i++) {
+		CreateAllocator();
+	}
+
+	D_ASSERT(allocators->allocators.size() == local_partition_map.size());
+}
+
+void HivePartitionedColumnData::GrowAppendState(PartitionedColumnDataAppendState &state) {
+	idx_t current_append_state_size = state.partition_append_states.size();
+	idx_t required_append_state_size = local_partition_map.size();
+
+	for (idx_t i = current_append_state_size; i < required_append_state_size; i++) {
+		state.partition_append_states.emplace_back(make_unique<ColumnDataAppendState>());
+		state.partition_buffers.emplace_back(CreatePartitionBuffer());
+	}
+}
+
+void HivePartitionedColumnData::GrowPartitions(PartitionedColumnDataAppendState &state) {
+	idx_t current_partitions = partitions.size();
+	idx_t required_partitions = local_partition_map.size();
+
+	D_ASSERT(allocators->allocators.size() == required_partitions);
+
+	for (idx_t i = current_partitions; i < required_partitions; i++) {
+		partitions.emplace_back(CreatePartitionCollection(i));
+		partitions[i]->InitializeAppend(*state.partition_append_states[i]);
+	}
+	D_ASSERT(partitions.size() == local_partition_map.size());
+}
+
+void HivePartitionedColumnData::SynchronizeLocalMap() {
+	// Synchronise global map into local, may contain changes from other threads too
+	for (auto it = global_state->partitions.begin() + local_partition_map.size(); it < global_state->partitions.end();
+	     it++) {
+		local_partition_map[(*it)->first] = (*it)->second;
+	}
+}
+
+idx_t HivePartitionedColumnData::RegisterNewPartition(HivePartitionKey key, PartitionedColumnDataAppendState &state) {
+	if (global_state) {
+		idx_t partition_id;
+
+		// Synchronize Global state with our local state with the newly discovered partition
+		{
+			unique_lock<mutex> lck_gstate(global_state->lock);
+
+			// Insert into global map, or return partition if already present
+			auto res =
+			    global_state->partition_map.emplace(std::make_pair(std::move(key), global_state->partition_map.size()));
+			auto it = res.first;
+			partition_id = it->second;
+
+			// Add iterator to vector to allow incrementally updating local states from global state
+			global_state->partitions.emplace_back(it);
+			SynchronizeLocalMap();
+		}
+
+		// After synchronizing with the global state, we need to grow the shared allocators to support
+		// the number of partitions, which guarantees that there's always enough allocators available to each thread
+		GrowAllocators();
+
+		// Grow local partition data
+		GrowAppendState(state);
+		GrowPartitions(state);
+
+		return partition_id;
+	} else {
+		return local_partition_map.emplace(std::make_pair(std::move(key), local_partition_map.size())).first->second;
+	}
+}
+
 } // namespace duckdb
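
The heart of the new HivePartitionedColumnData is RegisterNewPartition: partition ids are handed out by one shared map under the global lock, and map emplace doubles as "insert or look up", so two threads that discover the same key concurrently still agree on a single id. A minimal sketch of that discover-or-reuse pattern (simplified stand-ins, not the DuckDB types):

#include <iostream>
#include <map>
#include <mutex>
#include <string>

struct SharedPartitionState {
	std::mutex lock;
	std::map<std::string, size_t> partition_map; // partition key -> id
};

size_t RegisterPartition(SharedPartitionState &global, const std::string &key) {
	std::lock_guard<std::mutex> guard(global.lock);
	// emplace inserts only if the key is new; either way res.first points
	// at the authoritative (key, id) entry, so each key gets exactly one id
	auto res = global.partition_map.emplace(key, global.partition_map.size());
	return res.first->second;
}

int main() {
	SharedPartitionState state;
	std::cout << RegisterPartition(state, "year=2022") << '\n'; // 0
	std::cout << RegisterPartition(state, "year=2023") << '\n'; // 1
	std::cout << RegisterPartition(state, "year=2022") << '\n'; // 0 again
}

On top of this, global_state->partitions records entries in insertion order, which is what lets SynchronizeLocalMap copy only the entries a local map has not seen yet.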
package/src/duckdb/src/common/local_file_system.cpp
CHANGED

@@ -407,7 +407,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
 	}
 }
 
-bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                FileOpener *opener) {
 	if (!DirectoryExists(directory)) {
 		return false;
 	}
@@ -734,7 +735,8 @@ void LocalFileSystem::RemoveFile(const string &filename) {
 	}
 }
 
-bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback) {
+bool LocalFileSystem::ListFiles(const string &directory, const std::function<void(const string &, bool)> &callback,
+                                FileOpener *opener) {
 	string search_dir = JoinPath(directory, "*");
 
 	auto unicode_path = WindowsUtil::UTF8ToUnicode(search_dir.c_str());
package/src/duckdb/src/common/radix_partitioning.cpp
CHANGED

@@ -435,6 +435,7 @@ RadixPartitionedColumnData::RadixPartitionedColumnData(ClientContext &context_p,
 
 RadixPartitionedColumnData::RadixPartitionedColumnData(const RadixPartitionedColumnData &other)
     : PartitionedColumnData(other), radix_bits(other.radix_bits), hash_col_idx(other.hash_col_idx) {
+
 	for (idx_t i = 0; i < RadixPartitioning::NumberOfPartitions(radix_bits); i++) {
 		partitions.emplace_back(CreatePartitionCollection(i));
 	}
package/src/duckdb/src/common/string_util.cpp
CHANGED

@@ -1,7 +1,8 @@
 #include "duckdb/common/string_util.hpp"
+
+#include "duckdb/common/exception.hpp"
 #include "duckdb/common/pair.hpp"
 #include "duckdb/common/to_string.hpp"
-#include "duckdb/common/exception.hpp"
 
 #include <algorithm>
 #include <cctype>
@@ -31,6 +32,13 @@ void StringUtil::RTrim(string &str) {
 	          str.end());
 }
 
+void StringUtil::RTrim(string &str, const string &chars_to_trim) {
+	str.erase(find_if(str.rbegin(), str.rend(),
+	                  [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(ch) == string::npos; })
+	              .base(),
+	          str.end());
+}
+
 void StringUtil::Trim(string &str) {
 	StringUtil::LTrim(str);
 	StringUtil::RTrim(str);
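
The new RTrim overload strips any trailing run of caller-supplied characters; physical_copy_to_file.cpp below uses it to drop a trailing path separator before building hive directories. A usage sketch (behavior inferred from the implementation above; not DuckDB's test code):

#include <algorithm>
#include <iostream>
#include <string>

static void RTrimSketch(std::string &str, const std::string &chars_to_trim) {
	// erase from just past the last character NOT in chars_to_trim to the end
	str.erase(std::find_if(str.rbegin(), str.rend(),
	                       [&](char ch) { return chars_to_trim.find(ch) == std::string::npos; })
	              .base(),
	          str.end());
}

int main() {
	std::string path = "/tmp/output///";
	RTrimSketch(path, "/");
	std::cout << path << '\n'; // prints: /tmp/output
}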
package/src/duckdb/src/common/types/data_chunk.cpp
CHANGED

@@ -307,6 +307,16 @@ void DataChunk::Hash(Vector &result) {
 	}
 }
 
+void DataChunk::Hash(vector<idx_t> &column_ids, Vector &result) {
+	D_ASSERT(result.GetType().id() == LogicalType::HASH);
+	D_ASSERT(column_ids.size() > 0);
+
+	VectorOperations::Hash(data[column_ids[0]], result, size());
+	for (idx_t i = 1; i < column_ids.size(); i++) {
+		VectorOperations::CombineHash(result, data[column_ids[i]], size());
+	}
+}
+
 void DataChunk::Verify() {
 #ifdef DEBUG
 	D_ASSERT(size() <= capacity);
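
The new Hash overload hashes only the requested key columns: it seeds the result with the first column's hashes and folds each further column in with CombineHash. This is what ComputePartitionIndices above uses to fill HivePartitionKey::hash for every row in one vectorized pass. A scalar sketch of the same shape (the mixing constant is illustrative, not DuckDB's vectorized kernel):

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Boost-style 64-bit hash combiner, standing in for VectorOperations::CombineHash.
uint64_t CombineHashSketch(uint64_t a, uint64_t b) {
	return a ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2));
}

// Hash the first key column, then fold the remaining key columns in,
// mirroring DataChunk::Hash(column_ids, result) row by row.
uint64_t HashRow(const std::vector<std::string> &row, const std::vector<size_t> &key_cols) {
	uint64_t h = std::hash<std::string>{}(row[key_cols[0]]);
	for (size_t i = 1; i < key_cols.size(); i++) {
		h = CombineHashSketch(h, std::hash<std::string>{}(row[key_cols[i]]));
	}
	return h;
}

int main() {
	std::vector<std::string> row = {"2022", "12", "payload"};
	std::cout << HashRow(row, {0, 1}) << '\n'; // hash over the two key columns only
}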
package/src/duckdb/src/common/types/partitioned_column_data.cpp
CHANGED

@@ -1,6 +1,7 @@
 #include "duckdb/common/types/partitioned_column_data.hpp"
 
 #include "duckdb/common/radix_partitioning.hpp"
+#include "duckdb/common/hive_partitioning.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
 
 namespace duckdb {
@@ -18,6 +19,8 @@ unique_ptr<PartitionedColumnData> PartitionedColumnData::CreateShared() {
 	switch (type) {
 	case PartitionedColumnDataType::RADIX:
 		return make_unique<RadixPartitionedColumnData>((RadixPartitionedColumnData &)*this);
+	case PartitionedColumnDataType::HIVE:
+		return make_unique<HivePartitionedColumnData>((HivePartitionedColumnData &)*this);
 	default:
 		throw NotImplementedException("CreateShared for this type of PartitionedColumnData");
 	}
@@ -141,10 +144,12 @@ void PartitionedColumnData::FlushAppendState(PartitionedColumnDataAppendState &s
 void PartitionedColumnData::Combine(PartitionedColumnData &other) {
 	// Now combine the state's partitions into this
 	lock_guard<mutex> guard(lock);
+
 	if (partitions.empty()) {
 		// This is the first merge, we just copy them over
 		partitions = std::move(other.partitions);
 	} else {
+		D_ASSERT(partitions.size() == other.partitions.size());
 		// Combine the append state's partitions into this PartitionedColumnData
 		for (idx_t i = 0; i < other.partitions.size(); i++) {
 			partitions[i]->Combine(*other.partitions[i]);
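
Combine is how thread-local partition buffers are merged into the shared result: the first caller simply donates its vector, and later callers merge partition-by-partition. The new D_ASSERT documents that by then both sides must have grown to the same partition count, which HivePartitionedColumnData guarantees via GrowPartitions. A simplified sketch of the pattern (stand-in types, not DuckDB's):

#include <cassert>
#include <mutex>
#include <vector>

struct PartitionedSketch {
	std::mutex lock;
	std::vector<std::vector<int>> partitions;

	void Combine(PartitionedSketch &other) {
		std::lock_guard<std::mutex> guard(lock);
		if (partitions.empty()) {
			// first merge: take the other side's buffers wholesale
			partitions = std::move(other.partitions);
		} else {
			// later merges: both sides must agree on the partition count
			assert(partitions.size() == other.partitions.size());
			for (size_t i = 0; i < other.partitions.size(); i++) {
				auto &src = other.partitions[i];
				partitions[i].insert(partitions[i].end(), src.begin(), src.end());
			}
		}
	}
};

int main() {
	PartitionedSketch a, b, target;
	a.partitions = {{1}, {2}};
	b.partitions = {{3}, {4}};
	target.Combine(a); // moves a's buffers
	target.Combine(b); // merges pairwise
}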
package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp
CHANGED

@@ -1,4 +1,5 @@
 #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
+#include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/vector_size.hpp"
 #include "duckdb/common/string_util.hpp"
 
@@ -59,56 +60,6 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
 	return value.GetValue<int64_t>();
 }
 
-static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
-	vector<bool> result;
-
-	if (set.empty()) {
-		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-	}
-	// list of options: parse the list
-	unordered_map<string, bool> option_map;
-	for (idx_t i = 0; i < set.size(); i++) {
-		option_map[set[i].ToString()] = false;
-	}
-	result.resize(names.size(), false);
-	for (idx_t i = 0; i < names.size(); i++) {
-		auto entry = option_map.find(names[i]);
-		if (entry != option_map.end()) {
-			result[i] = true;
-			entry->second = true;
-		}
-	}
-	for (auto &entry : option_map) {
-		if (!entry.second) {
-			throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
-			                      entry.first.c_str());
-		}
-	}
-	return result;
-}
-
-static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
-	vector<bool> result;
-
-	// Only accept a list of arguments
-	if (value.type().id() != LogicalTypeId::LIST) {
-		// Support a single argument if it's '*'
-		if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
-			result.resize(names.size(), true);
-			return result;
-		}
-		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
-	}
-	auto &children = ListValue::GetChildren(value);
-	// accept '*' as single argument
-	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
-	    children[0].GetValue<string>() == "*") {
-		result.resize(names.size(), true);
-		return result;
-	}
-	return ParseColumnList(children, names, loption);
-}
-
 void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
 	this->delimiter = StringUtil::Replace(input, "\\t", "\t");
 	this->has_delimiter = true;
package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp
CHANGED

@@ -1,6 +1,8 @@
 #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
+#include "duckdb/common/hive_partitioning.hpp"
 #include "duckdb/common/file_system.hpp"
+#include "duckdb/common/file_opener.hpp"
 
 #include <algorithm>
 
@@ -15,14 +17,24 @@ public:
 	idx_t rows_copied;
 	idx_t last_file_offset;
 	unique_ptr<GlobalFunctionData> global_state;
+
+	//! shared state for HivePartitionedColumnData
+	shared_ptr<GlobalHivePartitionState> partition_state;
 };
 
 class CopyToFunctionLocalState : public LocalSinkState {
 public:
-	explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state)
+	explicit CopyToFunctionLocalState(unique_ptr<LocalFunctionData> local_state)
+	    : local_state(std::move(local_state)), writer_offset(0) {
 	}
 	unique_ptr<GlobalFunctionData> global_state;
 	unique_ptr<LocalFunctionData> local_state;
+
+	//! Buffers the tuples in partitions before writing
+	unique_ptr<HivePartitionedColumnData> part_buffer;
+	unique_ptr<PartitionedColumnDataAppendState> part_buffer_append_state;
+
+	idx_t writer_offset;
 };
 
 //===--------------------------------------------------------------------===//
@@ -48,6 +60,11 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
 	auto &g = (CopyToFunctionGlobalState &)gstate;
 	auto &l = (CopyToFunctionLocalState &)lstate;
 
+	if (partition_output) {
+		l.part_buffer->Append(*l.part_buffer_append_state, input);
+		return SinkResultType::NEED_MORE_INPUT;
+	}
+
 	{
 		lock_guard<mutex> glock(g.lock);
 		g.rows_copied += input.size();
@@ -57,13 +74,67 @@ SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkSta
 	return SinkResultType::NEED_MORE_INPUT;
 }
 
+static void CreateDir(const string &dir_path, FileSystem &fs) {
+	if (!fs.DirectoryExists(dir_path)) {
+		fs.CreateDirectory(dir_path);
+	}
+}
+
+static string CreateDirRecursive(const vector<idx_t> &cols, const vector<string> &names, const vector<Value> &values,
+                                 string path, FileSystem &fs) {
+	CreateDir(path, fs);
+
+	for (idx_t i = 0; i < cols.size(); i++) {
+		auto partition_col_name = names[cols[i]];
+		auto partition_value = values[i];
+		string p_dir = partition_col_name + "=" + partition_value.ToString();
+		path = fs.JoinPath(path, p_dir);
+		CreateDir(path, fs);
+	}
+
+	return path;
+}
+
 void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const {
 	auto &g = (CopyToFunctionGlobalState &)gstate;
 	auto &l = (CopyToFunctionLocalState &)lstate;
 
+	if (partition_output) {
+		auto &fs = FileSystem::GetFileSystem(context.client);
+		l.part_buffer->FlushAppendState(*l.part_buffer_append_state);
+		auto &partitions = l.part_buffer->GetPartitions();
+		auto partition_key_map = l.part_buffer->GetReverseMap();
+
+		string trimmed_path = file_path;
+		StringUtil::RTrim(trimmed_path, fs.PathSeparator());
+
+		for (idx_t i = 0; i < partitions.size(); i++) {
+			string hive_path =
+			    CreateDirRecursive(partition_columns, names, partition_key_map[i]->values, trimmed_path, fs);
+			string full_path = fs.JoinPath(hive_path, "data_" + to_string(l.writer_offset) + "." + function.extension);
+			if (fs.FileExists(full_path) && !allow_overwrite) {
+				throw IOException("failed to create " + full_path +
+				                  ", file exists! Enable ALLOW_OVERWRITE option to force writing");
+			}
+			// Create a writer for the current file
+			auto fun_data_global = function.copy_to_initialize_global(context.client, *bind_data, full_path);
+			auto fun_data_local = function.copy_to_initialize_local(context, *bind_data);
+
+			for (auto &chunk : partitions[i]->Chunks()) {
+				function.copy_to_sink(context, *bind_data, *fun_data_global, *fun_data_local, chunk);
+			}
+
+			function.copy_to_combine(context, *bind_data, *fun_data_global, *fun_data_local);
+			function.copy_to_finalize(context.client, *bind_data, *fun_data_global);
+		}
+
+		return;
+	}
+
 	if (function.copy_to_combine) {
 		function.copy_to_combine(context, *bind_data, per_thread_output ? *l.global_state : *g.global_state,
 		                         *l.local_state);
+
 		if (per_thread_output) {
 			function.copy_to_finalize(context.client, *bind_data, *l.global_state);
 		}
@@ -73,7 +144,7 @@ void PhysicalCopyToFile::Combine(ExecutionContext &context, GlobalSinkState &gst
 SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
                                               GlobalSinkState &gstate_p) const {
 	auto &gstate = (CopyToFunctionGlobalState &)gstate_p;
-	if (per_thread_output) {
+	if (per_thread_output || partition_output) {
 		// already happened in combine
 		return SinkFinalizeType::READY;
 	}
@@ -82,6 +153,7 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
 
 	if (use_tmp_file) {
 		D_ASSERT(!per_thread_output); // FIXME
+		D_ASSERT(!partition_output);  // FIXME
 		MoveTmpFile(context, file_path);
 	}
 }
@@ -89,6 +161,20 @@ SinkFinalizeType PhysicalCopyToFile::Finalize(Pipeline &pipeline, Event &event,
 }
 
 unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContext &context) const {
+	if (partition_output) {
+		auto state = make_unique<CopyToFunctionLocalState>(nullptr);
+		{
+			auto &g = (CopyToFunctionGlobalState &)*sink_state;
+			lock_guard<mutex> glock(g.lock);
+			state->writer_offset = g.last_file_offset++;
+
+			state->part_buffer = make_unique<HivePartitionedColumnData>(context.client, expected_types,
+			                                                            partition_columns, g.partition_state);
+			state->part_buffer_append_state = make_unique<PartitionedColumnDataAppendState>();
+			state->part_buffer->InitializeAppendState(*state->part_buffer_append_state);
+		}
+		return std::move(state);
+	}
 	auto res = make_unique<CopyToFunctionLocalState>(function.copy_to_initialize_local(context, *bind_data));
 	if (per_thread_output) {
 		idx_t this_file_offset;
@@ -98,9 +184,10 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
 		this_file_offset = g.last_file_offset++;
 		}
 		auto &fs = FileSystem::GetFileSystem(context.client);
 		string output_path =
 		    fs.JoinPath(file_path, StringUtil::Format("out_%llu", this_file_offset) + "." + function.extension);
-		if (fs.FileExists(output_path)) {
-			throw IOException("%s exists", output_path);
+		if (fs.FileExists(output_path) && !allow_overwrite) {
+			throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", output_path);
 		}
 		res->global_state = function.copy_to_initialize_global(context.client, *bind_data, output_path);
 	}
@@ -108,27 +195,35 @@ unique_ptr<LocalSinkState> PhysicalCopyToFile::GetLocalSinkState(ExecutionContex
 }
 
 unique_ptr<GlobalSinkState> PhysicalCopyToFile::GetGlobalSinkState(ClientContext &context) const {
-	if (per_thread_output) {
+
+	if (partition_output || per_thread_output) {
 		auto &fs = FileSystem::GetFileSystem(context);
 
-		if (fs.FileExists(file_path)) {
-			throw IOException("%s exists", file_path);
+		if (fs.FileExists(file_path) && !allow_overwrite) {
+			throw IOException("%s exists! Enable ALLOW_OVERWRITE option to force writing", file_path);
 		}
 		if (!fs.DirectoryExists(file_path)) {
 			fs.CreateDirectory(file_path);
-		} else {
+		} else if (!allow_overwrite) {
 			idx_t n_files = 0;
-			fs.ListFiles(file_path, [&n_files](const string &path, bool) { n_files++; });
+			fs.ListFiles(
+			    file_path, [&n_files](const string &path, bool) { n_files++; }, FileOpener::Get(context));
 			if (n_files > 0) {
-				throw IOException("Directory %s is not empty", file_path);
+				throw IOException("Directory %s is not empty! Enable ALLOW_OVERWRITE option to force writing",
				                  file_path);
 			}
 		}
 
-		return make_unique<CopyToFunctionGlobalState>(nullptr);
+		auto state = make_unique<CopyToFunctionGlobalState>(nullptr);
+
+		if (partition_output) {
+			state->partition_state = make_shared<GlobalHivePartitionState>();
+		}
+
+		return std::move(state);
 	}
+
 	return make_unique<CopyToFunctionGlobalState>(function.copy_to_initialize_global(context, *bind_data, file_path));
 }
 
 //===--------------------------------------------------------------------===//
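
Taken together, the partition_output path buffers each thread's rows per partition key, then Combine writes one file per (partition, thread): CreateDirRecursive materializes the key=value directory chain and the file name embeds the thread's writer_offset, so concurrent writers never collide. A sketch of the resulting path layout (plain string handling for illustration; DuckDB goes through its FileSystem API and creates each directory level):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

std::string HivePath(std::string root, const std::vector<std::pair<std::string, std::string>> &parts,
                     size_t writer_offset, const std::string &ext) {
	for (const auto &p : parts) {
		root += "/" + p.first + "=" + p.second; // one directory per partition column
	}
	return root + "/data_" + std::to_string(writer_offset) + "." + ext;
}

int main() {
	std::cout << HivePath("out", {{"year", "2022"}, {"month", "12"}}, 0, "parquet") << '\n';
	// prints: out/year=2022/month=12/data_0.parquet
}

The ALLOW_OVERWRITE checks in the hunks above gate every place an existing file or non-empty directory could be clobbered: the target directory in the global sink state, the per-thread output file, and each partition file.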
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp
CHANGED

@@ -17,7 +17,12 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
 	    make_unique<PhysicalCopyToFile>(op.types, op.function, std::move(op.bind_data), op.estimated_cardinality);
 	copy->file_path = op.file_path;
 	copy->use_tmp_file = op.use_tmp_file;
+	copy->allow_overwrite = op.allow_overwrite;
 	copy->per_thread_output = op.per_thread_output;
+	copy->partition_output = op.partition_output;
+	copy->partition_columns = op.partition_columns;
+	copy->names = op.names;
+	copy->expected_types = op.expected_types;
 	if (op.function.parallel) {
 		copy->parallel = op.function.parallel(context, *copy->bind_data);
 	}
package/src/duckdb/src/function/table/copy_csv.cpp
CHANGED

@@ -3,6 +3,7 @@
 #include "duckdb/common/serializer/buffered_serializer.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/parser/parsed_data/copy_info.hpp"
+#include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/string_util.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/types/string_type.hpp"
@@ -58,13 +59,6 @@ void BaseCSVData::Finalize() {
 	}
 }
 
-static Value ConvertVectorToValue(vector<Value> set) {
-	if (set.empty()) {
-		return Value::EMPTYLIST(LogicalType::BOOLEAN);
-	}
-	return Value::LIST(std::move(set));
-}
-
 static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &info, vector<string> &names,
                                              vector<LogicalType> &sql_types) {
 	auto bind_data = make_unique<WriteCSVData>(info.file_path, sql_types, names);