duckdb 0.6.2-dev1124.0 → 0.6.2-dev1160.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +38 -4
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +13 -0
- package/src/duckdb/extension/parquet/parquet-extension.cpp +88 -23
- package/src/duckdb/extension/parquet/parquet_reader.cpp +35 -3
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +16 -22
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +30 -20
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +4 -4
- package/src/duckdb/src/function/table/read_csv.cpp +16 -47
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/union_by_name.hpp +93 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +6 -8
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/table/segment_base.hpp +9 -1
- package/src/duckdb/src/main/connection.cpp +2 -2
- package/src/duckdb/src/planner/expression/bound_lambdaref_expression.cpp +4 -4
- package/src/duckdb/src/storage/table/update_segment.cpp +3 -3
package/package.json
CHANGED
|
@@ -1723,6 +1723,38 @@ void ListColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
|
|
|
1723
1723
|
child_writer->FinalizeAnalyze(*state.child_state);
|
|
1724
1724
|
}
|
|
1725
1725
|
|
|
1726
|
+
idx_t GetConsecutiveChildList(Vector &list, idx_t count, Vector &result) {
|
|
1727
|
+
auto list_data = FlatVector::GetData<list_entry_t>(list);
|
|
1728
|
+
auto &validity = FlatVector::Validity(list);
|
|
1729
|
+
bool consecutive_flat_list = true;
|
|
1730
|
+
idx_t child_count = 0;
|
|
1731
|
+
for (idx_t i = 0; i < count; i++) {
|
|
1732
|
+
if (!validity.RowIsValid(i)) {
|
|
1733
|
+
continue;
|
|
1734
|
+
}
|
|
1735
|
+
if (list_data[i].offset != child_count) {
|
|
1736
|
+
consecutive_flat_list = false;
|
|
1737
|
+
}
|
|
1738
|
+
child_count += list_data[i].length;
|
|
1739
|
+
}
|
|
1740
|
+
if (!consecutive_flat_list) {
|
|
1741
|
+
SelectionVector child_sel(child_count);
|
|
1742
|
+
idx_t entry = 0;
|
|
1743
|
+
for (idx_t i = 0; i < count; i++) {
|
|
1744
|
+
if (!validity.RowIsValid(i)) {
|
|
1745
|
+
continue;
|
|
1746
|
+
}
|
|
1747
|
+
for (idx_t k = 0; k < list_data[i].length; k++) {
|
|
1748
|
+
child_sel.set_index(entry++, list_data[i].offset + k);
|
|
1749
|
+
}
|
|
1750
|
+
}
|
|
1751
|
+
|
|
1752
|
+
result.Slice(child_sel, child_count);
|
|
1753
|
+
result.Flatten(child_count);
|
|
1754
|
+
}
|
|
1755
|
+
return child_count;
|
|
1756
|
+
}
|
|
1757
|
+
|
|
1726
1758
|
void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
|
|
1727
1759
|
auto &state = (ListColumnWriterState &)state_p;
|
|
1728
1760
|
|
|
@@ -1775,8 +1807,9 @@ void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *pa
|
|
|
1775
1807
|
state.parent_index += vcount;
|
|
1776
1808
|
|
|
1777
1809
|
auto &list_child = ListVector::GetEntry(vector);
|
|
1778
|
-
|
|
1779
|
-
|
|
1810
|
+
Vector child_list(list_child);
|
|
1811
|
+
idx_t child_count = GetConsecutiveChildList(vector, count, child_list);
|
|
1812
|
+
child_writer->Prepare(*state.child_state, &state_p, child_list, child_count);
|
|
1780
1813
|
}
|
|
1781
1814
|
|
|
1782
1815
|
void ListColumnWriter::BeginWrite(ColumnWriterState &state_p) {
|
|
@@ -1788,8 +1821,9 @@ void ListColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t c
|
|
|
1788
1821
|
auto &state = (ListColumnWriterState &)state_p;
|
|
1789
1822
|
|
|
1790
1823
|
auto &list_child = ListVector::GetEntry(vector);
|
|
1791
|
-
|
|
1792
|
-
|
|
1824
|
+
Vector child_list(list_child);
|
|
1825
|
+
idx_t child_count = GetConsecutiveChildList(vector, count, child_list);
|
|
1826
|
+
child_writer->Write(*state.child_state, child_list, child_count);
|
|
1793
1827
|
}
|
|
1794
1828
|
|
|
1795
1829
|
void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
|
|
@@ -73,6 +73,7 @@ struct ParquetOptions {
|
|
|
73
73
|
bool filename = false;
|
|
74
74
|
bool file_row_number = false;
|
|
75
75
|
bool hive_partitioning = false;
|
|
76
|
+
bool union_by_name = false;
|
|
76
77
|
|
|
77
78
|
public:
|
|
78
79
|
void Serialize(FieldWriter &writer) const;
|
|
@@ -109,6 +110,17 @@ public:
|
|
|
109
110
|
shared_ptr<ParquetFileMetadataCache> metadata;
|
|
110
111
|
ParquetOptions parquet_options;
|
|
111
112
|
|
|
113
|
+
//! when reading multiple parquet files (with union by name option)
|
|
114
|
+
//! TableFunction might return more cols than any single parquet file. Even if all parquet files have the same
|
|
115
|
+
//! cols, those files might have cols at different positions and with different logical types.
|
|
116
|
+
//! e.g. p1.parquet (a INT , b VARCHAR) p2.parquet (c VARCHAR, a VARCHAR)
|
|
117
|
+
vector<idx_t> union_idx_map;
|
|
118
|
+
//! If the parquet file doesn't have union column 5, union_null_cols[5] will be true.
|
|
119
|
+
//! some parquet files may not have all union cols.
|
|
120
|
+
vector<bool> union_null_cols;
|
|
121
|
+
//! All union cols will be cast to the same type.
|
|
122
|
+
vector<LogicalType> union_col_types;
|
|
123
|
+
|
|
112
124
|
public:
|
|
113
125
|
void InitializeScan(ParquetReaderScanState &state, vector<column_t> column_ids, vector<idx_t> groups_to_read,
|
|
114
126
|
TableFilterSet *table_filters);
|
|
@@ -139,6 +151,7 @@ private:
|
|
|
139
151
|
uint64_t GetGroupSpan(ParquetReaderScanState &state);
|
|
140
152
|
void PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t out_col_idx);
|
|
141
153
|
LogicalType DeriveLogicalType(const SchemaElement &s_ele);
|
|
154
|
+
void RearrangeChildReaders(unique_ptr<duckdb::ColumnReader> &root_reader, vector<column_t> &column_ids);
|
|
142
155
|
|
|
143
156
|
template <typename... Args>
|
|
144
157
|
std::runtime_error FormatException(const string fmt_str, Args... params) {
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include "duckdb/common/field_writer.hpp"
|
|
21
21
|
#include "duckdb/common/file_system.hpp"
|
|
22
22
|
#include "duckdb/common/hive_partitioning.hpp"
|
|
23
|
+
#include "duckdb/common/union_by_name.hpp"
|
|
23
24
|
#include "duckdb/common/types/chunk_collection.hpp"
|
|
24
25
|
#include "duckdb/function/copy_function.hpp"
|
|
25
26
|
#include "duckdb/function/table_function.hpp"
|
|
@@ -45,6 +46,10 @@ struct ParquetReadBindData : public TableFunctionData {
|
|
|
45
46
|
vector<string> names;
|
|
46
47
|
vector<LogicalType> types;
|
|
47
48
|
|
|
49
|
+
// The union readers are created (when parquet union_by_name option is on) during binding
|
|
50
|
+
// Those readers can be re-used during ParquetParallelStateNext
|
|
51
|
+
vector<shared_ptr<ParquetReader>> union_readers;
|
|
52
|
+
|
|
48
53
|
// These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter
|
|
49
54
|
idx_t initial_file_cardinality;
|
|
50
55
|
idx_t initial_file_row_groups;
|
|
@@ -127,6 +132,7 @@ void ParquetOptions::Serialize(FieldWriter &writer) const {
|
|
|
127
132
|
writer.WriteField<bool>(filename);
|
|
128
133
|
writer.WriteField<bool>(file_row_number);
|
|
129
134
|
writer.WriteField<bool>(hive_partitioning);
|
|
135
|
+
writer.WriteField<bool>(union_by_name);
|
|
130
136
|
}
|
|
131
137
|
|
|
132
138
|
void ParquetOptions::Deserialize(FieldReader &reader) {
|
|
@@ -134,6 +140,7 @@ void ParquetOptions::Deserialize(FieldReader &reader) {
|
|
|
134
140
|
filename = reader.ReadRequired<bool>();
|
|
135
141
|
file_row_number = reader.ReadRequired<bool>();
|
|
136
142
|
hive_partitioning = reader.ReadRequired<bool>();
|
|
143
|
+
union_by_name = reader.ReadRequired<bool>();
|
|
137
144
|
}
|
|
138
145
|
|
|
139
146
|
BindInfo ParquetGetBatchInfo(const FunctionData *bind_data) {
|
|
@@ -148,6 +155,7 @@ BindInfo ParquetGetBatchInfo(const FunctionData *bind_data) {
|
|
|
148
155
|
bind_info.InsertOption("filename", Value::BOOLEAN(parquet_bind->parquet_options.filename));
|
|
149
156
|
bind_info.InsertOption("file_row_number", Value::BOOLEAN(parquet_bind->parquet_options.file_row_number));
|
|
150
157
|
bind_info.InsertOption("hive_partitioning", Value::BOOLEAN(parquet_bind->parquet_options.hive_partitioning));
|
|
158
|
+
bind_info.InsertOption("union_by_name", Value::BOOLEAN(parquet_bind->parquet_options.union_by_name));
|
|
151
159
|
return bind_info;
|
|
152
160
|
}
|
|
153
161
|
|
|
@@ -164,6 +172,7 @@ public:
|
|
|
164
172
|
table_function.named_parameters["filename"] = LogicalType::BOOLEAN;
|
|
165
173
|
table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN;
|
|
166
174
|
table_function.named_parameters["hive_partitioning"] = LogicalType::BOOLEAN;
|
|
175
|
+
table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
|
|
167
176
|
table_function.get_batch_index = ParquetScanGetBatchIndex;
|
|
168
177
|
table_function.serialize = ParquetScanSerialize;
|
|
169
178
|
table_function.deserialize = ParquetScanDeserialize;
|
|
@@ -180,6 +189,7 @@ public:
|
|
|
180
189
|
table_function.named_parameters["filename"] = LogicalType::BOOLEAN;
|
|
181
190
|
table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN;
|
|
182
191
|
table_function.named_parameters["hive_partitioning"] = LogicalType::BOOLEAN;
|
|
192
|
+
table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
|
|
183
193
|
set.AddFunction(table_function);
|
|
184
194
|
return set;
|
|
185
195
|
}
|
|
@@ -201,22 +211,31 @@ public:
|
|
|
201
211
|
parquet_options.file_row_number = true;
|
|
202
212
|
} else if (loption == "hive_partitioning") {
|
|
203
213
|
parquet_options.hive_partitioning = true;
|
|
214
|
+
} else if (loption == "union_by_name") {
|
|
215
|
+
parquet_options.union_by_name = true;
|
|
204
216
|
} else {
|
|
205
217
|
throw NotImplementedException("Unsupported option for COPY FROM parquet: %s", option.first);
|
|
206
218
|
}
|
|
207
219
|
}
|
|
208
|
-
auto result = make_unique<ParquetReadBindData>();
|
|
209
220
|
|
|
210
221
|
FileSystem &fs = FileSystem::GetFileSystem(context);
|
|
211
|
-
|
|
212
|
-
if (
|
|
222
|
+
auto files = fs.Glob(info.file_path, context);
|
|
223
|
+
if (files.empty()) {
|
|
213
224
|
throw IOException("No files found that match the pattern \"%s\"", info.file_path);
|
|
214
225
|
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
226
|
+
|
|
227
|
+
// The most likely path (Parquet read without union by name option)
|
|
228
|
+
if (!parquet_options.union_by_name) {
|
|
229
|
+
auto result = make_unique<ParquetReadBindData>();
|
|
230
|
+
result->files = std::move(files);
|
|
231
|
+
result->SetInitialReader(
|
|
232
|
+
make_shared<ParquetReader>(context, result->files[0], expected_types, parquet_options));
|
|
233
|
+
result->names = result->initial_reader->names;
|
|
234
|
+
result->types = result->initial_reader->return_types;
|
|
235
|
+
return std::move(result);
|
|
236
|
+
} else {
|
|
237
|
+
return ParquetUnionNamesBind(context, files, expected_types, expected_names, parquet_options);
|
|
238
|
+
}
|
|
220
239
|
}
|
|
221
240
|
|
|
222
241
|
static unique_ptr<BaseStatistics> ParquetScanStats(ClientContext &context, const FunctionData *bind_data_p,
|
|
@@ -303,11 +322,40 @@ public:
|
|
|
303
322
|
vector<LogicalType> &return_types, vector<string> &names,
|
|
304
323
|
ParquetOptions parquet_options) {
|
|
305
324
|
auto result = make_unique<ParquetReadBindData>();
|
|
325
|
+
|
|
326
|
+
// The most likely path (Parquet Scan without union by name option)
|
|
327
|
+
if (!parquet_options.union_by_name) {
|
|
328
|
+
result->files = std::move(files);
|
|
329
|
+
result->SetInitialReader(make_shared<ParquetReader>(context, result->files[0], parquet_options));
|
|
330
|
+
return_types = result->types = result->initial_reader->return_types;
|
|
331
|
+
names = result->names = result->initial_reader->names;
|
|
332
|
+
return std::move(result);
|
|
333
|
+
} else {
|
|
334
|
+
return ParquetUnionNamesBind(context, files, return_types, names, parquet_options);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
static unique_ptr<FunctionData> ParquetUnionNamesBind(ClientContext &context, vector<string> files,
|
|
339
|
+
vector<LogicalType> &return_types, vector<string> &names,
|
|
340
|
+
ParquetOptions parquet_options) {
|
|
341
|
+
auto result = make_unique<ParquetReadBindData>();
|
|
306
342
|
result->files = std::move(files);
|
|
307
343
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
344
|
+
case_insensitive_map_t<idx_t> union_names_map;
|
|
345
|
+
vector<string> union_col_names;
|
|
346
|
+
vector<LogicalType> union_col_types;
|
|
347
|
+
auto dummy_readers = UnionByName<ParquetReader, ParquetOptions>::UnionCols(
|
|
348
|
+
context, result->files, union_col_types, union_col_names, union_names_map, parquet_options);
|
|
349
|
+
|
|
350
|
+
dummy_readers = UnionByName<ParquetReader, ParquetOptions>::CreateUnionMap(
|
|
351
|
+
std::move(dummy_readers), union_col_types, union_col_names, union_names_map);
|
|
352
|
+
|
|
353
|
+
std::move(dummy_readers.begin(), dummy_readers.end(), std::back_inserter(result->union_readers));
|
|
354
|
+
names.assign(union_col_names.begin(), union_col_names.end());
|
|
355
|
+
return_types.assign(union_col_types.begin(), union_col_types.end());
|
|
356
|
+
result->SetInitialReader(result->union_readers[0]);
|
|
357
|
+
D_ASSERT(names.size() == return_types.size());
|
|
358
|
+
|
|
311
359
|
return std::move(result);
|
|
312
360
|
}
|
|
313
361
|
|
|
@@ -337,6 +385,8 @@ public:
|
|
|
337
385
|
parquet_options.file_row_number = BooleanValue::Get(kv.second);
|
|
338
386
|
} else if (loption == "hive_partitioning") {
|
|
339
387
|
parquet_options.hive_partitioning = BooleanValue::Get(kv.second);
|
|
388
|
+
} else if (loption == "union_by_name") {
|
|
389
|
+
parquet_options.union_by_name = BooleanValue::Get(kv.second);
|
|
340
390
|
}
|
|
341
391
|
}
|
|
342
392
|
FileSystem &fs = FileSystem::GetFileSystem(context);
|
|
@@ -370,6 +420,8 @@ public:
|
|
|
370
420
|
parquet_options.file_row_number = BooleanValue::Get(kv.second);
|
|
371
421
|
} else if (loption == "hive_partitioning") {
|
|
372
422
|
parquet_options.hive_partitioning = BooleanValue::Get(kv.second);
|
|
423
|
+
} else if (loption == "union_by_name") {
|
|
424
|
+
parquet_options.union_by_name = true;
|
|
373
425
|
}
|
|
374
426
|
}
|
|
375
427
|
return ParquetScanBindInternal(context, std::move(files), return_types, names, parquet_options);
|
|
@@ -417,20 +469,24 @@ public:
|
|
|
417
469
|
|
|
418
470
|
result->file_opening = std::vector<bool>(bind_data.files.size(), false);
|
|
419
471
|
result->file_mutexes = std::unique_ptr<mutex[]>(new mutex[bind_data.files.size()]);
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
} else {
|
|
426
|
-
if (bind_data.files.empty()) {
|
|
427
|
-
result->initial_reader = nullptr;
|
|
472
|
+
if (!bind_data.parquet_options.union_by_name) {
|
|
473
|
+
result->readers = std::vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
|
|
474
|
+
if (bind_data.initial_reader) {
|
|
475
|
+
result->initial_reader = bind_data.initial_reader;
|
|
476
|
+
result->readers[0] = bind_data.initial_reader;
|
|
428
477
|
} else {
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
478
|
+
if (bind_data.files.empty()) {
|
|
479
|
+
result->initial_reader = nullptr;
|
|
480
|
+
} else {
|
|
481
|
+
result->initial_reader =
|
|
482
|
+
make_shared<ParquetReader>(context, bind_data.files[0], bind_data.names, bind_data.types,
|
|
483
|
+
input.column_ids, bind_data.parquet_options, bind_data.files[0]);
|
|
484
|
+
result->readers[0] = result->initial_reader;
|
|
485
|
+
}
|
|
433
486
|
}
|
|
487
|
+
} else {
|
|
488
|
+
result->readers = std::move(bind_data.union_readers);
|
|
489
|
+
result->initial_reader = result->readers[0];
|
|
434
490
|
}
|
|
435
491
|
|
|
436
492
|
result->row_group_index = 0;
|
|
@@ -497,6 +553,9 @@ public:
|
|
|
497
553
|
|
|
498
554
|
bind_data.chunk_count++;
|
|
499
555
|
if (output.size() > 0) {
|
|
556
|
+
if (bind_data.parquet_options.union_by_name) {
|
|
557
|
+
UnionByName<ParquetReader, ParquetOptions>::SetNullUnionCols(output, data.reader->union_null_cols);
|
|
558
|
+
}
|
|
500
559
|
return;
|
|
501
560
|
}
|
|
502
561
|
if (!ParquetParallelStateNext(context, bind_data, data, gstate)) {
|
|
@@ -533,6 +592,12 @@ public:
|
|
|
533
592
|
D_ASSERT(parallel_state.initial_reader);
|
|
534
593
|
|
|
535
594
|
if (parallel_state.readers[parallel_state.file_index]) {
|
|
595
|
+
const auto &current_reader = parallel_state.readers[parallel_state.file_index];
|
|
596
|
+
if (current_reader->union_null_cols.empty()) {
|
|
597
|
+
current_reader->union_null_cols.resize(current_reader->return_types.size());
|
|
598
|
+
std::fill(current_reader->union_null_cols.begin(), current_reader->union_null_cols.end(), false);
|
|
599
|
+
}
|
|
600
|
+
|
|
536
601
|
if (parallel_state.row_group_index <
|
|
537
602
|
parallel_state.readers[parallel_state.file_index]->NumRowGroups()) {
|
|
538
603
|
// The current reader has rowgroups left to be scanned
|
|
@@ -352,7 +352,6 @@ unique_ptr<ColumnReader> ParquetReader::CreateReader(const duckdb_parquet::forma
|
|
|
352
352
|
D_ASSERT(file_meta_data->row_groups.empty() || next_file_idx == file_meta_data->row_groups[0].columns.size());
|
|
353
353
|
|
|
354
354
|
auto &root_struct_reader = (StructColumnReader &)*ret;
|
|
355
|
-
|
|
356
355
|
// add casts if required
|
|
357
356
|
for (auto &entry : cast_map) {
|
|
358
357
|
auto column_idx = entry.first;
|
|
@@ -700,11 +699,35 @@ void ParquetReader::InitializeScan(ParquetReaderScanState &state, vector<column_
|
|
|
700
699
|
|
|
701
700
|
state.thrift_file_proto = CreateThriftProtocol(allocator, *state.file_handle, *file_opener, state.prefetch_mode);
|
|
702
701
|
state.root_reader = CreateReader(GetFileMetadata());
|
|
702
|
+
if (parquet_options.union_by_name) {
|
|
703
|
+
RearrangeChildReaders(state.root_reader, state.column_ids);
|
|
704
|
+
}
|
|
703
705
|
|
|
704
706
|
state.define_buf.resize(allocator, STANDARD_VECTOR_SIZE);
|
|
705
707
|
state.repeat_buf.resize(allocator, STANDARD_VECTOR_SIZE);
|
|
706
708
|
}
|
|
707
709
|
|
|
710
|
+
void ParquetReader::RearrangeChildReaders(unique_ptr<duckdb::ColumnReader> &root_reader, vector<column_t> &column_ids) {
|
|
711
|
+
auto &root_struct_reader = (StructColumnReader &)*root_reader;
|
|
712
|
+
unordered_map<idx_t, idx_t> reverse_union_idx;
|
|
713
|
+
|
|
714
|
+
for (idx_t col = 0; col < union_idx_map.size(); ++col) {
|
|
715
|
+
auto child_reader = move(root_struct_reader.child_readers[col]);
|
|
716
|
+
auto cast_reader = make_unique<CastColumnReader>(move(child_reader), union_col_types[union_idx_map[col]]);
|
|
717
|
+
root_struct_reader.child_readers[col] = move(cast_reader);
|
|
718
|
+
reverse_union_idx[union_idx_map[col]] = col;
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
vector<bool> column_id_nulls(column_ids.size(), true);
|
|
722
|
+
for (idx_t col = 0; col < column_ids.size(); ++col) {
|
|
723
|
+
auto find = reverse_union_idx.find(column_ids[col]);
|
|
724
|
+
if (find != reverse_union_idx.end()) {
|
|
725
|
+
column_ids[col] = find->second;
|
|
726
|
+
column_id_nulls[col] = false;
|
|
727
|
+
}
|
|
728
|
+
}
|
|
729
|
+
union_null_cols = move(column_id_nulls);
|
|
730
|
+
}
|
|
708
731
|
void FilterIsNull(Vector &v, parquet_filter_t &filter_mask, idx_t count) {
|
|
709
732
|
if (v.GetVectorType() == VectorType::CONSTANT_VECTOR) {
|
|
710
733
|
auto &mask = ConstantVector::Validity(v);
|
|
@@ -898,6 +921,8 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
|
|
|
898
921
|
return false;
|
|
899
922
|
}
|
|
900
923
|
|
|
924
|
+
D_ASSERT(union_null_cols.size() >= result.ColumnCount());
|
|
925
|
+
|
|
901
926
|
// see if we have to switch to the next row group in the parquet file
|
|
902
927
|
if (state.current_group < 0 || (int64_t)state.group_offset >= GetGroup(state).num_rows) {
|
|
903
928
|
state.current_group++;
|
|
@@ -915,7 +940,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
|
|
|
915
940
|
uint64_t to_scan_compressed_bytes = 0;
|
|
916
941
|
for (idx_t out_col_idx = 0; out_col_idx < result.ColumnCount(); out_col_idx++) {
|
|
917
942
|
// this is a special case where we are not interested in the actual contents of the file
|
|
918
|
-
if (IsRowIdColumnId(state.column_ids[out_col_idx])) {
|
|
943
|
+
if (IsRowIdColumnId(state.column_ids[out_col_idx]) || union_null_cols[out_col_idx]) {
|
|
919
944
|
continue;
|
|
920
945
|
}
|
|
921
946
|
|
|
@@ -956,7 +981,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
|
|
|
956
981
|
// Prefetch column-wise
|
|
957
982
|
for (idx_t out_col_idx = 0; out_col_idx < result.ColumnCount(); out_col_idx++) {
|
|
958
983
|
|
|
959
|
-
if (IsRowIdColumnId(state.column_ids[out_col_idx])) {
|
|
984
|
+
if (IsRowIdColumnId(state.column_ids[out_col_idx]) || union_null_cols[out_col_idx]) {
|
|
960
985
|
continue;
|
|
961
986
|
}
|
|
962
987
|
|
|
@@ -1007,6 +1032,10 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
|
|
|
1007
1032
|
if (state.filters) {
|
|
1008
1033
|
vector<bool> need_to_read(result.ColumnCount(), true);
|
|
1009
1034
|
|
|
1035
|
+
for (idx_t col = 0; col < need_to_read.size(); ++col) {
|
|
1036
|
+
need_to_read[col] = need_to_read[col] && !union_null_cols[col];
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1010
1039
|
// first load the columns that are used in filters
|
|
1011
1040
|
for (auto &filter_col : state.filters->filters) {
|
|
1012
1041
|
auto file_col_idx = state.column_ids[filter_col.first];
|
|
@@ -1058,6 +1087,9 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
|
|
|
1058
1087
|
result.data[out_col_idx].Reference(constant_42);
|
|
1059
1088
|
continue;
|
|
1060
1089
|
}
|
|
1090
|
+
if (union_null_cols[out_col_idx]) {
|
|
1091
|
+
continue;
|
|
1092
|
+
}
|
|
1061
1093
|
|
|
1062
1094
|
auto rows_read = root_reader->GetChildReader(file_col_idx)
|
|
1063
1095
|
->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]);
|
|
@@ -160,7 +160,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
|
160
160
|
} else {
|
|
161
161
|
row_empty = false;
|
|
162
162
|
}
|
|
163
|
-
if (!
|
|
163
|
+
if (!return_types.empty() && column == return_types.size() && length == 0) {
|
|
164
164
|
// skip a single trailing delimiter in last column
|
|
165
165
|
return;
|
|
166
166
|
}
|
|
@@ -168,14 +168,14 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
|
168
168
|
column++;
|
|
169
169
|
return;
|
|
170
170
|
}
|
|
171
|
-
if (column >=
|
|
171
|
+
if (column >= return_types.size()) {
|
|
172
172
|
if (options.ignore_errors) {
|
|
173
173
|
error_column_overflow = true;
|
|
174
174
|
return;
|
|
175
175
|
} else {
|
|
176
176
|
throw InvalidInputException(
|
|
177
177
|
"Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
|
|
178
|
-
GetLineNumberStr(linenr, linenr_estimated).c_str(),
|
|
178
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), options.ToString());
|
|
179
179
|
}
|
|
180
180
|
}
|
|
181
181
|
|
|
@@ -183,7 +183,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
|
183
183
|
idx_t row_entry = parse_chunk.size();
|
|
184
184
|
|
|
185
185
|
// test against null string, but only if the value was not quoted
|
|
186
|
-
if ((!has_quotes ||
|
|
186
|
+
if ((!has_quotes || return_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
|
|
187
187
|
Equals::Operation(str_val, string_t(options.null_str))) {
|
|
188
188
|
FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
|
|
189
189
|
} else {
|
|
@@ -221,7 +221,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
|
221
221
|
|
|
222
222
|
if (row_empty) {
|
|
223
223
|
row_empty = false;
|
|
224
|
-
if (
|
|
224
|
+
if (return_types.size() != 1) {
|
|
225
225
|
if (mode == ParserMode::PARSING) {
|
|
226
226
|
FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
|
|
227
227
|
}
|
|
@@ -238,7 +238,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
|
238
238
|
return false;
|
|
239
239
|
}
|
|
240
240
|
|
|
241
|
-
if (column <
|
|
241
|
+
if (column < return_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
|
|
242
242
|
if (options.ignore_errors) {
|
|
243
243
|
column = 0;
|
|
244
244
|
return false;
|
|
@@ -249,7 +249,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
|
249
249
|
} else {
|
|
250
250
|
throw InvalidInputException(
|
|
251
251
|
"Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
|
|
252
|
-
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(),
|
|
252
|
+
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), column,
|
|
253
253
|
options.ToString());
|
|
254
254
|
}
|
|
255
255
|
}
|
|
@@ -282,13 +282,6 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
|
282
282
|
return false;
|
|
283
283
|
}
|
|
284
284
|
|
|
285
|
-
void BaseCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
|
|
286
|
-
for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
|
|
287
|
-
insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
288
|
-
ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
|
|
292
285
|
void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
|
|
293
286
|
D_ASSERT(col_idx < chunk.data.size());
|
|
294
287
|
D_ASSERT(row_idx < chunk.size());
|
|
@@ -302,8 +295,8 @@ void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, i
|
|
|
302
295
|
auto utf_type = Utf8Proc::Analyze(s.GetDataUnsafe(), s.GetSize());
|
|
303
296
|
if (utf_type == UnicodeType::INVALID) {
|
|
304
297
|
string col_name = to_string(col_idx);
|
|
305
|
-
if (col_idx <
|
|
306
|
-
col_name = "\"" +
|
|
298
|
+
if (col_idx < names.size()) {
|
|
299
|
+
col_name = "\"" + names[col_idx] + "\"";
|
|
307
300
|
}
|
|
308
301
|
int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
|
|
309
302
|
D_ASSERT(error_line >= 0);
|
|
@@ -330,9 +323,9 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
|
330
323
|
|
|
331
324
|
// convert the columns in the parsed chunk to the types of the table
|
|
332
325
|
insert_chunk.SetCardinality(parse_chunk);
|
|
333
|
-
for (idx_t col_idx = 0; col_idx <
|
|
326
|
+
for (idx_t col_idx = 0; col_idx < return_types.size(); col_idx++) {
|
|
334
327
|
auto insert_idx = insert_cols_idx[col_idx];
|
|
335
|
-
auto &type =
|
|
328
|
+
auto &type = return_types[col_idx];
|
|
336
329
|
if (type.id() == LogicalTypeId::VARCHAR) {
|
|
337
330
|
// target type is varchar: no need to convert
|
|
338
331
|
// just test that all strings are valid utf-8 strings
|
|
@@ -345,7 +338,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
|
345
338
|
// use the date format to cast the chunk
|
|
346
339
|
success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
|
|
347
340
|
parse_chunk.size(), error_message);
|
|
348
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
|
341
|
+
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
|
342
|
+
return_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
|
|
349
343
|
// use the timestamp format to cast the chunk
|
|
350
344
|
success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
|
|
351
345
|
parse_chunk.size(), error_message);
|
|
@@ -365,8 +359,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
|
365
359
|
continue;
|
|
366
360
|
}
|
|
367
361
|
string col_name = to_string(col_idx);
|
|
368
|
-
if (col_idx <
|
|
369
|
-
col_name = "\"" +
|
|
362
|
+
if (col_idx < names.size()) {
|
|
363
|
+
col_name = "\"" + names[col_idx] + "\"";
|
|
370
364
|
}
|
|
371
365
|
|
|
372
366
|
// figure out the exact line number
|
|
@@ -401,7 +395,7 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
|
401
395
|
|
|
402
396
|
for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
|
|
403
397
|
bool failed = false;
|
|
404
|
-
for (idx_t column_idx = 0; column_idx <
|
|
398
|
+
for (idx_t column_idx = 0; column_idx < return_types.size(); column_idx++) {
|
|
405
399
|
|
|
406
400
|
auto &inserted_column = insert_chunk.data[column_idx];
|
|
407
401
|
auto &parsed_column = parse_chunk.data[column_idx];
|
|
@@ -37,6 +37,16 @@ BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOp
|
|
|
37
37
|
std::move(options_p), requested_types) {
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options_p,
|
|
41
|
+
const vector<LogicalType> &requested_types)
|
|
42
|
+
: BaseCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
|
|
43
|
+
move(options_p), requested_types),
|
|
44
|
+
buffer_size(0), position(0), start(0) {
|
|
45
|
+
options.file_path = move(filename);
|
|
46
|
+
file_handle = OpenCSV(options);
|
|
47
|
+
Initialize(requested_types);
|
|
48
|
+
}
|
|
49
|
+
|
|
40
50
|
BufferedCSVReader::~BufferedCSVReader() {
|
|
41
51
|
}
|
|
42
52
|
|
|
@@ -236,20 +246,20 @@ static string NormalizeColumnName(const string &col_name) {
|
|
|
236
246
|
void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
237
247
|
PrepareComplexParser();
|
|
238
248
|
if (options.auto_detect) {
|
|
239
|
-
|
|
240
|
-
if (
|
|
249
|
+
return_types = SniffCSV(requested_types);
|
|
250
|
+
if (return_types.empty()) {
|
|
241
251
|
throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
|
|
242
252
|
}
|
|
243
253
|
if (cached_chunks.empty()) {
|
|
244
254
|
JumpToBeginning(options.skip_rows, options.header);
|
|
245
255
|
}
|
|
246
256
|
} else {
|
|
247
|
-
|
|
257
|
+
return_types = requested_types;
|
|
248
258
|
ResetBuffer();
|
|
249
259
|
SkipRowsAndReadHeader(options.skip_rows, options.header);
|
|
250
260
|
}
|
|
251
|
-
InitParseChunk(
|
|
252
|
-
InitInsertChunkIdx(
|
|
261
|
+
InitParseChunk(return_types.size());
|
|
262
|
+
InitInsertChunkIdx(return_types.size());
|
|
253
263
|
// we only need reset support during the automatic CSV type detection
|
|
254
264
|
// since reset support might require caching (in the case of streams), we disable it for the remainder
|
|
255
265
|
file_handle->DisableReset();
|
|
@@ -297,7 +307,7 @@ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header)
|
|
|
297
307
|
|
|
298
308
|
if (skip_header) {
|
|
299
309
|
// ignore the first line as a header line
|
|
300
|
-
InitParseChunk(
|
|
310
|
+
InitParseChunk(return_types.size());
|
|
301
311
|
ParseCSV(ParserMode::PARSING_HEADER);
|
|
302
312
|
}
|
|
303
313
|
}
|
|
@@ -520,14 +530,14 @@ void BufferedCSVReader::DetectCandidateTypes(const vector<LogicalType> &type_can
|
|
|
520
530
|
format_candidates[t.first].clear();
|
|
521
531
|
}
|
|
522
532
|
|
|
523
|
-
// set all
|
|
524
|
-
|
|
525
|
-
|
|
533
|
+
// set all return_types to VARCHAR so we can do datatype detection based on VARCHAR values
|
|
534
|
+
return_types.clear();
|
|
535
|
+
return_types.assign(options.num_cols, LogicalType::VARCHAR);
|
|
526
536
|
|
|
527
537
|
// jump to beginning and skip potential header
|
|
528
538
|
JumpToBeginning(options.skip_rows, true);
|
|
529
539
|
DataChunk header_row;
|
|
530
|
-
header_row.Initialize(allocator,
|
|
540
|
+
header_row.Initialize(allocator, return_types);
|
|
531
541
|
parse_chunk.Copy(header_row);
|
|
532
542
|
|
|
533
543
|
if (header_row.size() == 0) {
|
|
@@ -535,7 +545,7 @@ void BufferedCSVReader::DetectCandidateTypes(const vector<LogicalType> &type_can
|
|
|
535
545
|
}
|
|
536
546
|
|
|
537
547
|
// init parse chunk and read csv with info candidate
|
|
538
|
-
InitParseChunk(
|
|
548
|
+
InitParseChunk(return_types.size());
|
|
539
549
|
if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
|
|
540
550
|
continue;
|
|
541
551
|
}
|
|
@@ -713,7 +723,7 @@ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql
|
|
|
713
723
|
col_name = col_name + "_" + to_string(name_collision_count[col_name]);
|
|
714
724
|
}
|
|
715
725
|
|
|
716
|
-
|
|
726
|
+
names.push_back(col_name);
|
|
717
727
|
name_collision_count[col_name] = 0;
|
|
718
728
|
}
|
|
719
729
|
|
|
@@ -721,7 +731,7 @@ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql
|
|
|
721
731
|
options.header = false;
|
|
722
732
|
for (idx_t col = 0; col < options.num_cols; col++) {
|
|
723
733
|
string column_name = GenerateColumnName(options.num_cols, col);
|
|
724
|
-
|
|
734
|
+
names.push_back(column_name);
|
|
725
735
|
}
|
|
726
736
|
}
|
|
727
737
|
}
|
|
@@ -731,8 +741,8 @@ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalT
|
|
|
731
741
|
vector<vector<LogicalType>> &best_sql_types_candidates,
|
|
732
742
|
map<LogicalTypeId, vector<string>> &best_format_candidates) {
|
|
733
743
|
// for the type refine we set the SQL types to VARCHAR for all columns
|
|
734
|
-
|
|
735
|
-
|
|
744
|
+
return_types.clear();
|
|
745
|
+
return_types.assign(options.num_cols, LogicalType::VARCHAR);
|
|
736
746
|
|
|
737
747
|
vector<LogicalType> detected_types;
|
|
738
748
|
|
|
@@ -747,11 +757,11 @@ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalT
|
|
|
747
757
|
}
|
|
748
758
|
} else if (options.all_varchar) {
|
|
749
759
|
// return all types varchar
|
|
750
|
-
detected_types =
|
|
760
|
+
detected_types = return_types;
|
|
751
761
|
} else {
|
|
752
762
|
// jump through the rest of the file and continue to refine the sql type guess
|
|
753
763
|
while (JumpToNextSample()) {
|
|
754
|
-
InitParseChunk(
|
|
764
|
+
InitParseChunk(return_types.size());
|
|
755
765
|
// if jump ends up a bad line, we just skip this chunk
|
|
756
766
|
if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
|
|
757
767
|
continue;
|
|
@@ -878,11 +888,11 @@ vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &reque
|
|
|
878
888
|
options.num_cols = best_num_cols;
|
|
879
889
|
DetectHeader(best_sql_types_candidates, best_header_row);
|
|
880
890
|
auto sql_types_per_column = options.sql_types_per_column;
|
|
881
|
-
for (idx_t i = 0; i <
|
|
882
|
-
auto it = sql_types_per_column.find(
|
|
891
|
+
for (idx_t i = 0; i < names.size(); i++) {
|
|
892
|
+
auto it = sql_types_per_column.find(names[i]);
|
|
883
893
|
if (it != sql_types_per_column.end()) {
|
|
884
894
|
best_sql_types_candidates[i] = {it->second};
|
|
885
|
-
sql_types_per_column.erase(
|
|
895
|
+
sql_types_per_column.erase(names[i]);
|
|
886
896
|
}
|
|
887
897
|
}
|
|
888
898
|
if (!sql_types_per_column.empty()) {
|
|
@@ -38,9 +38,9 @@ ParallelCSVReader::~ParallelCSVReader() {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
41
|
-
|
|
42
|
-
InitParseChunk(
|
|
43
|
-
InitInsertChunkIdx(
|
|
41
|
+
return_types = requested_types;
|
|
42
|
+
InitParseChunk(return_types.size());
|
|
43
|
+
InitInsertChunkIdx(return_types.size());
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
|
|
@@ -360,7 +360,7 @@ final_state : {
|
|
|
360
360
|
// remaining values to be added to the chunk
|
|
361
361
|
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
|
|
362
362
|
if (try_add_line) {
|
|
363
|
-
bool success = column ==
|
|
363
|
+
bool success = column == return_types.size();
|
|
364
364
|
if (success) {
|
|
365
365
|
AddRow(insert_chunk, column, error_message);
|
|
366
366
|
success = Flush(insert_chunk);
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#include "duckdb/main/database.hpp"
|
|
5
5
|
#include "duckdb/common/string_util.hpp"
|
|
6
6
|
#include "duckdb/common/hive_partitioning.hpp"
|
|
7
|
+
#include "duckdb/common/union_by_name.hpp"
|
|
7
8
|
#include "duckdb/main/config.hpp"
|
|
8
9
|
#include "duckdb/parser/expression/constant_expression.hpp"
|
|
9
10
|
#include "duckdb/parser/expression/function_expression.hpp"
|
|
@@ -127,9 +128,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
127
128
|
if (options.auto_detect) {
|
|
128
129
|
options.file_path = result->files[0];
|
|
129
130
|
auto initial_reader = make_unique<BufferedCSVReader>(context, options);
|
|
130
|
-
return_types.assign(initial_reader->
|
|
131
|
+
return_types.assign(initial_reader->return_types.begin(), initial_reader->return_types.end());
|
|
131
132
|
if (names.empty()) {
|
|
132
|
-
names.assign(initial_reader->
|
|
133
|
+
names.assign(initial_reader->names.begin(), initial_reader->names.end());
|
|
133
134
|
} else {
|
|
134
135
|
if (explicitly_set_columns) {
|
|
135
136
|
// The user has influenced the names, can't assume they are valid anymore
|
|
@@ -143,7 +144,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
143
144
|
}
|
|
144
145
|
}
|
|
145
146
|
options = initial_reader->options;
|
|
146
|
-
result->sql_types = initial_reader->
|
|
147
|
+
result->sql_types = initial_reader->return_types;
|
|
147
148
|
result->initial_reader = std::move(initial_reader);
|
|
148
149
|
} else {
|
|
149
150
|
result->sql_types = return_types;
|
|
@@ -152,58 +153,25 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
152
153
|
|
|
153
154
|
// union_col_names will exclude filename and hivepartition
|
|
154
155
|
if (options.union_by_name) {
|
|
155
|
-
idx_t union_names_index = 0;
|
|
156
156
|
case_insensitive_map_t<idx_t> union_names_map;
|
|
157
157
|
vector<string> union_col_names;
|
|
158
158
|
vector<LogicalType> union_col_types;
|
|
159
159
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
auto reader = make_unique<BufferedCSVReader>(context, options);
|
|
163
|
-
auto &col_names = reader->col_names;
|
|
164
|
-
auto &sql_types = reader->sql_types;
|
|
165
|
-
D_ASSERT(col_names.size() == sql_types.size());
|
|
166
|
-
|
|
167
|
-
for (idx_t col = 0; col < col_names.size(); ++col) {
|
|
168
|
-
auto union_find = union_names_map.find(col_names[col]);
|
|
169
|
-
|
|
170
|
-
if (union_find != union_names_map.end()) {
|
|
171
|
-
// given same name , union_col's type must compatible with col's type
|
|
172
|
-
LogicalType compatible_type;
|
|
173
|
-
compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
|
|
174
|
-
union_col_types[union_find->second] = compatible_type;
|
|
175
|
-
} else {
|
|
176
|
-
union_names_map[col_names[col]] = union_names_index;
|
|
177
|
-
union_names_index++;
|
|
178
|
-
|
|
179
|
-
union_col_names.emplace_back(col_names[col]);
|
|
180
|
-
union_col_types.emplace_back(sql_types[col]);
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
result->union_readers.push_back(std::move(reader));
|
|
184
|
-
}
|
|
160
|
+
auto dummy_readers = UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::UnionCols(
|
|
161
|
+
context, result->files, union_col_types, union_col_names, union_names_map, options);
|
|
185
162
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
vector<bool> is_null_cols(union_col_names.size(), true);
|
|
163
|
+
dummy_readers = UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::CreateUnionMap(
|
|
164
|
+
std::move(dummy_readers), union_col_types, union_col_names, union_names_map);
|
|
189
165
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
is_null_cols[remap_col] = false;
|
|
194
|
-
}
|
|
195
|
-
for (idx_t col = 0; col < union_col_names.size(); ++col) {
|
|
196
|
-
if (is_null_cols[col]) {
|
|
197
|
-
reader->insert_nulls_idx.push_back(col);
|
|
198
|
-
}
|
|
199
|
-
}
|
|
166
|
+
std::move(dummy_readers.begin(), dummy_readers.end(), std::back_inserter(result->union_readers));
|
|
167
|
+
for (auto &reader : result->union_readers) {
|
|
168
|
+
reader->insert_cols_idx = reader->union_idx_map;
|
|
200
169
|
}
|
|
201
170
|
|
|
202
|
-
const idx_t first_file_index = 0;
|
|
203
|
-
result->initial_reader = std::move(result->union_readers[first_file_index]);
|
|
204
|
-
|
|
205
171
|
names.assign(union_col_names.begin(), union_col_names.end());
|
|
206
172
|
return_types.assign(union_col_types.begin(), union_col_types.end());
|
|
173
|
+
const idx_t first_file_index = 0;
|
|
174
|
+
result->initial_reader = std::move(result->union_readers[first_file_index]);
|
|
207
175
|
D_ASSERT(names.size() == return_types.size());
|
|
208
176
|
}
|
|
209
177
|
|
|
@@ -544,7 +512,7 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
|
|
|
544
512
|
}
|
|
545
513
|
result->next_file = 1;
|
|
546
514
|
if (result->initial_reader) {
|
|
547
|
-
result->sql_types = result->initial_reader->
|
|
515
|
+
result->sql_types = result->initial_reader->return_types;
|
|
548
516
|
}
|
|
549
517
|
return std::move(result);
|
|
550
518
|
}
|
|
@@ -603,7 +571,8 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
|
|
|
603
571
|
} while (true);
|
|
604
572
|
|
|
605
573
|
if (bind_data.options.union_by_name) {
|
|
606
|
-
|
|
574
|
+
UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::SetNullUnionCols(output,
|
|
575
|
+
lstate.csv_reader->union_null_cols);
|
|
607
576
|
}
|
|
608
577
|
if (bind_data.options.include_file_name) {
|
|
609
578
|
auto &col = output.data[bind_data.filename_col_idx];
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
|
2
|
-
#define DUCKDB_VERSION "0.6.2-
|
|
2
|
+
#define DUCKDB_VERSION "0.6.2-dev1160"
|
|
3
3
|
#endif
|
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
|
5
|
+
#define DUCKDB_SOURCE_ID "351d01503a"
|
|
6
6
|
#endif
|
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
|
8
8
|
#include "duckdb/main/database.hpp"
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
// DuckDB
|
|
3
|
+
//
|
|
4
|
+
// duckdb/common/union_by_name.hpp
|
|
5
|
+
//
|
|
6
|
+
//
|
|
7
|
+
//===----------------------------------------------------------------------===//
|
|
8
|
+
|
|
9
|
+
#pragma once
|
|
10
|
+
#include <vector>
|
|
11
|
+
#include <string>
|
|
12
|
+
#include "duckdb/common/types.hpp"
|
|
13
|
+
|
|
14
|
+
using std::string;
|
|
15
|
+
using std::vector;
|
|
16
|
+
|
|
17
|
+
namespace duckdb {
|
|
18
|
+
|
|
19
|
+
template <class READER_TYPE, class OPTION_TYPE>
|
|
20
|
+
class UnionByName {
|
|
21
|
+
|
|
22
|
+
public:
|
|
23
|
+
//! Union all files(readers) by their col names
|
|
24
|
+
static vector<unique_ptr<READER_TYPE>>
|
|
25
|
+
UnionCols(ClientContext &context, const vector<string> &files, vector<LogicalType> &union_col_types,
|
|
26
|
+
vector<string> &union_col_names, case_insensitive_map_t<idx_t> &union_names_map, OPTION_TYPE options) {
|
|
27
|
+
idx_t union_names_index = 0;
|
|
28
|
+
vector<unique_ptr<READER_TYPE>> union_readers;
|
|
29
|
+
|
|
30
|
+
for (idx_t file_idx = 0; file_idx < files.size(); ++file_idx) {
|
|
31
|
+
const auto file_name = files[file_idx];
|
|
32
|
+
auto reader = make_unique<READER_TYPE>(context, file_name, options);
|
|
33
|
+
|
|
34
|
+
auto &col_names = reader->names;
|
|
35
|
+
auto &sql_types = reader->return_types;
|
|
36
|
+
D_ASSERT(col_names.size() == sql_types.size());
|
|
37
|
+
|
|
38
|
+
for (idx_t col = 0; col < col_names.size(); ++col) {
|
|
39
|
+
auto union_find = union_names_map.find(col_names[col]);
|
|
40
|
+
|
|
41
|
+
if (union_find != union_names_map.end()) {
|
|
42
|
+
// given same name , union_col's type must compatible with col's type
|
|
43
|
+
LogicalType compatible_type;
|
|
44
|
+
compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
|
|
45
|
+
union_col_types[union_find->second] = compatible_type;
|
|
46
|
+
} else {
|
|
47
|
+
union_names_map[col_names[col]] = union_names_index;
|
|
48
|
+
union_names_index++;
|
|
49
|
+
|
|
50
|
+
union_col_names.emplace_back(col_names[col]);
|
|
51
|
+
union_col_types.emplace_back(sql_types[col]);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
union_readers.push_back(move(reader));
|
|
55
|
+
}
|
|
56
|
+
return union_readers;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
//! Create information for reader's col mapping to union cols
|
|
60
|
+
static vector<unique_ptr<READER_TYPE>> CreateUnionMap(vector<unique_ptr<READER_TYPE>> union_readers,
|
|
61
|
+
vector<LogicalType> &union_col_types,
|
|
62
|
+
vector<string> &union_col_names,
|
|
63
|
+
case_insensitive_map_t<idx_t> &union_names_map) {
|
|
64
|
+
for (auto &reader : union_readers) {
|
|
65
|
+
auto &col_names = reader->names;
|
|
66
|
+
vector<bool> union_null_cols(union_col_names.size(), true);
|
|
67
|
+
vector<idx_t> union_idx_map(col_names.size(), 0);
|
|
68
|
+
|
|
69
|
+
for (idx_t col = 0; col < col_names.size(); ++col) {
|
|
70
|
+
idx_t union_idx = union_names_map[col_names[col]];
|
|
71
|
+
union_idx_map[col] = union_idx;
|
|
72
|
+
union_null_cols[union_idx] = false;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
reader->union_col_types = union_col_types;
|
|
76
|
+
reader->union_idx_map = move(union_idx_map);
|
|
77
|
+
reader->union_null_cols = move(union_null_cols);
|
|
78
|
+
}
|
|
79
|
+
return union_readers;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
//! Set nulls into the cols that mismtach union names
|
|
83
|
+
static void SetNullUnionCols(DataChunk &result, const vector<bool> &union_null_cols) {
|
|
84
|
+
for (idx_t col = 0; col < union_null_cols.size(); ++col) {
|
|
85
|
+
if (union_null_cols[col]) {
|
|
86
|
+
result.data[col].SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
87
|
+
ConstantVector::SetNull(result.data[col], true);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
};
|
|
92
|
+
|
|
93
|
+
} // namespace duckdb
|
|
@@ -44,13 +44,15 @@ public:
|
|
|
44
44
|
Allocator &allocator;
|
|
45
45
|
FileOpener *opener;
|
|
46
46
|
BufferedCSVReaderOptions options;
|
|
47
|
-
vector<LogicalType>
|
|
48
|
-
vector<string>
|
|
47
|
+
vector<LogicalType> return_types;
|
|
48
|
+
vector<string> names;
|
|
49
49
|
|
|
50
50
|
//! remap parse_chunk col to insert_chunk col, because when
|
|
51
51
|
//! union_by_name option on insert_chunk may have more cols
|
|
52
52
|
vector<idx_t> insert_cols_idx;
|
|
53
|
-
vector<idx_t>
|
|
53
|
+
vector<idx_t> union_idx_map;
|
|
54
|
+
vector<bool> union_null_cols;
|
|
55
|
+
vector<LogicalType> union_col_types;
|
|
54
56
|
|
|
55
57
|
idx_t linenr = 0;
|
|
56
58
|
bool linenr_estimated = false;
|
|
@@ -70,10 +72,6 @@ public:
|
|
|
70
72
|
|
|
71
73
|
ParserMode mode;
|
|
72
74
|
|
|
73
|
-
public:
|
|
74
|
-
//! Fill nulls into the cols that mismtach union names
|
|
75
|
-
void SetNullUnionCols(DataChunk &insert_chunk);
|
|
76
|
-
|
|
77
75
|
protected:
|
|
78
76
|
//! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
|
|
79
77
|
void InitParseChunk(idx_t num_cols);
|
|
@@ -100,7 +98,7 @@ protected:
|
|
|
100
98
|
static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
|
|
101
99
|
|
|
102
100
|
protected:
|
|
103
|
-
//! Whether or not the current row's columns have overflown
|
|
101
|
+
//! Whether or not the current row's columns have overflown return_types.size()
|
|
104
102
|
bool error_column_overflow = false;
|
|
105
103
|
//! Number of sniffed columns - only used when auto-detecting
|
|
106
104
|
vector<idx_t> sniffed_column_counts;
|
|
@@ -57,6 +57,8 @@ public:
|
|
|
57
57
|
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
58
58
|
BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
|
|
59
59
|
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
60
|
+
BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
|
|
61
|
+
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
60
62
|
~BufferedCSVReader();
|
|
61
63
|
|
|
62
64
|
unique_ptr<char[]> buffer;
|
|
@@ -19,9 +19,12 @@ public:
|
|
|
19
19
|
}
|
|
20
20
|
virtual ~SegmentBase() {
|
|
21
21
|
}
|
|
22
|
-
|
|
23
22
|
SegmentBase *Next() {
|
|
23
|
+
#ifndef DUCKDB_R_BUILD
|
|
24
24
|
return next.load();
|
|
25
|
+
#else
|
|
26
|
+
return next;
|
|
27
|
+
#endif
|
|
25
28
|
}
|
|
26
29
|
|
|
27
30
|
//! The start row id of this chunk
|
|
@@ -29,7 +32,12 @@ public:
|
|
|
29
32
|
//! The amount of entries in this storage chunk
|
|
30
33
|
atomic<idx_t> count;
|
|
31
34
|
//! The next segment after this one
|
|
35
|
+
|
|
36
|
+
#ifndef DUCKDB_R_BUILD
|
|
32
37
|
atomic<SegmentBase *> next;
|
|
38
|
+
#else
|
|
39
|
+
SegmentBase *next;
|
|
40
|
+
#endif
|
|
33
41
|
};
|
|
34
42
|
|
|
35
43
|
} // namespace duckdb
|
|
@@ -224,8 +224,8 @@ shared_ptr<Relation> Connection::ReadCSV(const string &csv_file) {
|
|
|
224
224
|
options.auto_detect = true;
|
|
225
225
|
BufferedCSVReader reader(*context, options);
|
|
226
226
|
vector<ColumnDefinition> column_list;
|
|
227
|
-
for (idx_t i = 0; i < reader.
|
|
228
|
-
column_list.emplace_back(reader.
|
|
227
|
+
for (idx_t i = 0; i < reader.return_types.size(); i++) {
|
|
228
|
+
column_list.emplace_back(reader.names[i], reader.return_types[i]);
|
|
229
229
|
}
|
|
230
230
|
return make_shared<ReadCSVRelation>(context, csv_file, std::move(column_list), true);
|
|
231
231
|
}
|
|
@@ -8,14 +8,14 @@ namespace duckdb {
|
|
|
8
8
|
|
|
9
9
|
BoundLambdaRefExpression::BoundLambdaRefExpression(string alias_p, LogicalType type, ColumnBinding binding,
|
|
10
10
|
idx_t lambda_index, idx_t depth)
|
|
11
|
-
: Expression(ExpressionType::BOUND_LAMBDA_REF, ExpressionClass::BOUND_LAMBDA_REF, move(type)),
|
|
12
|
-
lambda_index(lambda_index), depth(depth) {
|
|
13
|
-
this->alias = move(alias_p);
|
|
11
|
+
: Expression(ExpressionType::BOUND_LAMBDA_REF, ExpressionClass::BOUND_LAMBDA_REF, std::move(type)),
|
|
12
|
+
binding(binding), lambda_index(lambda_index), depth(depth) {
|
|
13
|
+
this->alias = std::move(alias_p);
|
|
14
14
|
}
|
|
15
15
|
|
|
16
16
|
BoundLambdaRefExpression::BoundLambdaRefExpression(LogicalType type, ColumnBinding binding, idx_t lambda_index,
|
|
17
17
|
idx_t depth)
|
|
18
|
-
: BoundLambdaRefExpression(string(), move(type), binding, lambda_index, depth) {
|
|
18
|
+
: BoundLambdaRefExpression(string(), std::move(type), binding, lambda_index, depth) {
|
|
19
19
|
}
|
|
20
20
|
|
|
21
21
|
unique_ptr<Expression> BoundLambdaRefExpression::Copy() {
|
|
@@ -616,7 +616,7 @@ struct UpdateSelectElement {
|
|
|
616
616
|
|
|
617
617
|
template <>
|
|
618
618
|
string_t UpdateSelectElement::Operation(UpdateSegment *segment, string_t element) {
|
|
619
|
-
return element.IsInlined() ? element : segment->GetStringHeap().
|
|
619
|
+
return element.IsInlined() ? element : segment->GetStringHeap().AddBlob(element);
|
|
620
620
|
}
|
|
621
621
|
|
|
622
622
|
template <class T>
|
|
@@ -942,7 +942,7 @@ idx_t UpdateStringStatistics(UpdateSegment *segment, SegmentStatistics &stats, V
|
|
|
942
942
|
for (idx_t i = 0; i < count; i++) {
|
|
943
943
|
((StringStatistics &)*stats.statistics).Update(update_data[i]);
|
|
944
944
|
if (!update_data[i].IsInlined()) {
|
|
945
|
-
update_data[i] = segment->GetStringHeap().
|
|
945
|
+
update_data[i] = segment->GetStringHeap().AddBlob(update_data[i]);
|
|
946
946
|
}
|
|
947
947
|
}
|
|
948
948
|
sel.Initialize(nullptr);
|
|
@@ -955,7 +955,7 @@ idx_t UpdateStringStatistics(UpdateSegment *segment, SegmentStatistics &stats, V
|
|
|
955
955
|
sel.set_index(not_null_count++, i);
|
|
956
956
|
((StringStatistics &)*stats.statistics).Update(update_data[i]);
|
|
957
957
|
if (!update_data[i].IsInlined()) {
|
|
958
|
-
update_data[i] = segment->GetStringHeap().
|
|
958
|
+
update_data[i] = segment->GetStringHeap().AddBlob(update_data[i]);
|
|
959
959
|
}
|
|
960
960
|
}
|
|
961
961
|
}
|