duckdb 0.5.2-dev523.0 → 0.5.2-dev547.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +97 -7
- package/src/duckdb.hpp +15 -2
- package/src/parquet-amalgamation.cpp +36939 -36939
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -75716,6 +75716,8 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
|
|
|
75716
75716
|
has_escape = true;
|
|
75717
75717
|
} else if (loption == "ignore_errors") {
|
|
75718
75718
|
ignore_errors = ParseBoolean(value, loption);
|
|
75719
|
+
} else if (loption == "union_by_name") {
|
|
75720
|
+
union_by_name = ParseBoolean(value, loption);
|
|
75719
75721
|
} else {
|
|
75720
75722
|
throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
|
|
75721
75723
|
}
|
|
@@ -75927,6 +75929,7 @@ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
|
75927
75929
|
SkipRowsAndReadHeader(options.skip_rows, options.header);
|
|
75928
75930
|
}
|
|
75929
75931
|
InitParseChunk(sql_types.size());
|
|
75932
|
+
InitInsertChunkIdx(sql_types.size());
|
|
75930
75933
|
// we only need reset support during the automatic CSV type detection
|
|
75931
75934
|
// since reset support might require caching (in the case of streams), we disable it for the remainder
|
|
75932
75935
|
file_handle->DisableReset();
|
|
@@ -76072,6 +76075,12 @@ void BufferedCSVReader::InitParseChunk(idx_t num_cols) {
|
|
|
76072
76075
|
}
|
|
76073
76076
|
}
|
|
76074
76077
|
|
|
76078
|
+
void BufferedCSVReader::InitInsertChunkIdx(idx_t num_cols) {
|
|
76079
|
+
for (idx_t col = 0; col < num_cols; ++col) {
|
|
76080
|
+
insert_cols_idx.push_back(col);
|
|
76081
|
+
}
|
|
76082
|
+
}
|
|
76083
|
+
|
|
76075
76084
|
void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
|
|
76076
76085
|
ResetBuffer();
|
|
76077
76086
|
ResetStream();
|
|
@@ -77371,6 +77380,13 @@ bool BufferedCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
|
|
|
77371
77380
|
return false;
|
|
77372
77381
|
}
|
|
77373
77382
|
|
|
77383
|
+
void BufferedCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
|
|
77384
|
+
for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
|
|
77385
|
+
insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
77386
|
+
ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
|
|
77387
|
+
}
|
|
77388
|
+
}
|
|
77389
|
+
|
|
77374
77390
|
void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
|
|
77375
77391
|
if (parse_chunk.size() == 0) {
|
|
77376
77392
|
return;
|
|
@@ -77401,22 +77417,25 @@ void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
|
|
|
77401
77417
|
}
|
|
77402
77418
|
}
|
|
77403
77419
|
}
|
|
77404
|
-
insert_chunk.data[col_idx].Reference(parse_chunk.data[col_idx]);
|
|
77420
|
+
insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
|
|
77405
77421
|
} else {
|
|
77406
77422
|
string error_message;
|
|
77407
77423
|
bool success;
|
|
77408
77424
|
if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
|
|
77409
77425
|
// use the date format to cast the chunk
|
|
77410
|
-
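success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
|
|
77411
|
-
parse_chunk.size(), error_message);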
|
77426
|
+
success =
|
|
77427
|
+
TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
|
|
77428
|
+
parse_chunk.size(), error_message);
|
|
77412
77429
|
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
|
77413
77430
|
sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
|
|
77414
77431
|
// use the date format to cast the chunk
|
|
77415
|
-
success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
|
|
77416
|
-
parse_chunk.size(), error_message);
|
|
77432
|
+
success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
|
|
77433
|
+
insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
|
|
77434
|
+
error_message);
|
|
77417
77435
|
} else {
|
|
77418
77436
|
// target type is not varchar: perform a cast
|
|
77419
|
-
success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx], insert_chunk.data[col_idx],
|
|
77437
|
+
success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
|
|
77438
|
+
insert_chunk.data[insert_cols_idx[col_idx]],
|
|
77420
77439
|
parse_chunk.size(), &error_message);
|
|
77421
77440
|
}
|
|
77422
77441
|
if (success) {
|
|
@@ -118249,6 +118268,9 @@ struct ReadCSVData : public BaseCSVData {
|
|
|
118249
118268
|
//! The initial reader (if any): this is used when automatic detection is used during binding.
|
|
118250
118269
|
//! In this case, the CSV reader is already created and might as well be re-used.
|
|
118251
118270
|
unique_ptr<BufferedCSVReader> initial_reader;
|
|
118271
|
+
//! The union readers are created during binding (when the CSV union_by_name option is on)
|
|
118272
|
+
//! These readers can be re-used during ReadCSVFunction
|
|
118273
|
+
vector<unique_ptr<BufferedCSVReader>> union_readers;
|
|
118252
118274
|
};
|
|
118253
118275
|
|
|
118254
118276
|
struct CSVCopyFunction {
|
|
@@ -119506,6 +119528,64 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
119506
119528
|
result->sql_types = return_types;
|
|
119507
119529
|
D_ASSERT(return_types.size() == names.size());
|
|
119508
119530
|
}
|
|
119531
|
+
|
|
119532
|
+
// union_col_names excludes the filename and hive partition columns
|
|
119533
|
+
if (options.union_by_name) {
|
|
119534
|
+
idx_t union_names_index = 0;
|
|
119535
|
+
case_insensitive_map_t<idx_t> union_names_map;
|
|
119536
|
+
vector<string> union_col_names;
|
|
119537
|
+
vector<LogicalType> union_col_types;
|
|
119538
|
+
|
|
119539
|
+
for (idx_t file_idx = 0; file_idx < result->files.size(); ++file_idx) {
|
|
119540
|
+
options.file_path = result->files[file_idx];
|
|
119541
|
+
auto reader = make_unique<BufferedCSVReader>(context, options);
|
|
119542
|
+
auto &col_names = reader->col_names;
|
|
119543
|
+
auto &sql_types = reader->sql_types;
|
|
119544
|
+
D_ASSERT(col_names.size() == sql_types.size());
|
|
119545
|
+
|
|
119546
|
+
for (idx_t col = 0; col < col_names.size(); ++col) {
|
|
119547
|
+
auto union_find = union_names_map.find(col_names[col]);
|
|
119548
|
+
|
|
119549
|
+
if (union_find != union_names_map.end()) {
|
|
119550
|
+
// given the same name, union_col's type must be compatible with col's type
|
|
119551
|
+
LogicalType compatible_type;
|
|
119552
|
+
compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
|
|
119553
|
+
union_col_types[union_find->second] = compatible_type;
|
|
119554
|
+
} else {
|
|
119555
|
+
union_names_map[col_names[col]] = union_names_index;
|
|
119556
|
+
union_names_index++;
|
|
119557
|
+
|
|
119558
|
+
union_col_names.emplace_back(col_names[col]);
|
|
119559
|
+
union_col_types.emplace_back(sql_types[col]);
|
|
119560
|
+
}
|
|
119561
|
+
}
|
|
119562
|
+
result->union_readers.push_back(move(reader));
|
|
119563
|
+
}
|
|
119564
|
+
|
|
119565
|
+
for (auto &reader : result->union_readers) {
|
|
119566
|
+
auto &col_names = reader->col_names;
|
|
119567
|
+
vector<bool> is_null_cols(union_col_names.size(), true);
|
|
119568
|
+
|
|
119569
|
+
for (idx_t col = 0; col < col_names.size(); ++col) {
|
|
119570
|
+
idx_t remap_col = union_names_map[col_names[col]];
|
|
119571
|
+
reader->insert_cols_idx[col] = remap_col;
|
|
119572
|
+
is_null_cols[remap_col] = false;
|
|
119573
|
+
}
|
|
119574
|
+
for (idx_t col = 0; col < union_col_names.size(); ++col) {
|
|
119575
|
+
if (is_null_cols[col]) {
|
|
119576
|
+
reader->insert_nulls_idx.push_back(col);
|
|
119577
|
+
}
|
|
119578
|
+
}
|
|
119579
|
+
}
|
|
119580
|
+
|
|
119581
|
+
const idx_t first_file_index = 0;
|
|
119582
|
+
result->initial_reader = move(result->union_readers[first_file_index]);
|
|
119583
|
+
|
|
119584
|
+
names.assign(union_col_names.begin(), union_col_names.end());
|
|
119585
|
+
return_types.assign(union_col_types.begin(), union_col_types.end());
|
|
119586
|
+
D_ASSERT(names.size() == return_types.size());
|
|
119587
|
+
}
|
|
119588
|
+
|
|
119509
119589
|
if (result->options.include_file_name) {
|
|
119510
119590
|
result->filename_col_idx = names.size();
|
|
119511
119591
|
return_types.emplace_back(LogicalType::VARCHAR);
|
|
@@ -119574,13 +119654,22 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
|
|
|
119574
119654
|
// exhausted this file, but we have more files we can read
|
|
119575
119655
|
// open the next file and increment the counter
|
|
119576
119656
|
bind_data.options.file_path = bind_data.files[data.file_index];
|
|
119577
|
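-
data.csv_reader = make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);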
|
119657
|
+
// reuse the csv_readers that were created during binding
|
|
119658
|
+
if (bind_data.options.union_by_name) {
|
|
119659
|
+
data.csv_reader = move(bind_data.union_readers[data.file_index]);
|
|
119660
|
+
} else {
|
|
119661
|
+
data.csv_reader =
|
|
119662
|
+
make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);
|
|
119663
|
+
}
|
|
119578
119664
|
data.file_index++;
|
|
119579
119665
|
} else {
|
|
119580
119666
|
break;
|
|
119581
119667
|
}
|
|
119582
119668
|
} while (true);
|
|
119583
119669
|
|
|
119670
|
+
if (bind_data.options.union_by_name) {
|
|
119671
|
+
data.csv_reader->SetNullUnionCols(output);
|
|
119672
|
+
}
|
|
119584
119673
|
if (bind_data.options.include_file_name) {
|
|
119585
119674
|
auto &col = output.data[bind_data.filename_col_idx];
|
|
119586
119675
|
col.SetValue(0, Value(data.csv_reader->options.file_path));
|
|
@@ -119633,6 +119722,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
|
|
|
119633
119722
|
table_function.named_parameters["max_line_size"] = LogicalType::VARCHAR;
|
|
119634
119723
|
table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR;
|
|
119635
119724
|
table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
|
|
119725
|
+
table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
|
|
119636
119726
|
}
|
|
119637
119727
|
|
|
119638
119728
|
double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
|
package/src/duckdb.hpp
CHANGED
|
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
|
|
|
11
11
|
#pragma once
|
|
12
12
|
#define DUCKDB_AMALGAMATION 1
|
|
13
13
|
#define DUCKDB_AMALGAMATION_EXTENDED 1
|
|
14
|
-
#define DUCKDB_SOURCE_ID "
|
|
15
|
-
#define DUCKDB_VERSION "v0.5.2-dev523"
|
|
14
|
+
#define DUCKDB_SOURCE_ID "83aff54f1"
|
|
15
|
+
#define DUCKDB_VERSION "v0.5.2-dev547"
|
|
16
16
|
//===----------------------------------------------------------------------===//
|
|
17
17
|
// DuckDB
|
|
18
18
|
//
|
|
@@ -27165,6 +27165,8 @@ struct BufferedCSVReaderOptions {
|
|
|
27165
27165
|
bool include_file_name = false;
|
|
27166
27166
|
//! Whether or not to include parsed hive partition columns
|
|
27167
27167
|
bool include_parsed_hive_partitions = false;
|
|
27168
|
+
//! Whether or not to union files with different (but compatible) columns
|
|
27169
|
+
bool union_by_name = false;
|
|
27168
27170
|
|
|
27169
27171
|
//===--------------------------------------------------------------------===//
|
|
27170
27172
|
// WriteCSVOptions
|
|
@@ -27224,6 +27226,12 @@ public:
|
|
|
27224
27226
|
BufferedCSVReaderOptions options;
|
|
27225
27227
|
vector<LogicalType> sql_types;
|
|
27226
27228
|
vector<string> col_names;
|
|
27229
|
+
|
|
27230
|
+
//! Remaps each parse_chunk col to its insert_chunk col; this is needed because
|
|
27231
|
+
//! insert_chunk may have more cols when the union_by_name option is on
|
|
27232
|
+
vector<idx_t> insert_cols_idx;
|
|
27233
|
+
vector<idx_t> insert_nulls_idx;
|
|
27234
|
+
|
|
27227
27235
|
unique_ptr<CSVFileHandle> file_handle;
|
|
27228
27236
|
|
|
27229
27237
|
unique_ptr<char[]> buffer;
|
|
@@ -27258,11 +27266,16 @@ public:
|
|
|
27258
27266
|
|
|
27259
27267
|
idx_t GetFileSize();
|
|
27260
27268
|
|
|
27269
|
+
//! Fill nulls into the cols that do not match the union names
|
|
27270
|
+
void SetNullUnionCols(DataChunk &insert_chunk);
|
|
27271
|
+
|
|
27261
27272
|
private:
|
|
27262
27273
|
//! Initialize Parser
|
|
27263
27274
|
void Initialize(const vector<LogicalType> &requested_types);
|
|
27264
27275
|
//! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
|
|
27265
27276
|
void InitParseChunk(idx_t num_cols);
|
|
27277
|
+
//! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
|
|
27278
|
+
void InitInsertChunkIdx(idx_t num_cols);
|
|
27266
27279
|
//! Initializes the TextSearchShiftArrays for complex parser
|
|
27267
27280
|
void PrepareComplexParser();
|
|
27268
27281
|
//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
|