duckdb 0.5.2-dev2006.0 → 0.5.2-dev2076.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +1649 -786
- package/src/duckdb.hpp +373 -93
- package/src/parquet-amalgamation.cpp +37721 -37721
package/src/duckdb.cpp
CHANGED
@@ -28769,9 +28769,10 @@ bool TryCast::Operation(string_t input, hugeint_t &result, bool strict) {
 //===--------------------------------------------------------------------===//
 // Decimal String Cast
 //===--------------------------------------------------------------------===//
-template <class
+template <class TYPE>
 struct DecimalCastData {
-
+    typedef TYPE type_t;
+    TYPE result;
     uint8_t width;
     uint8_t scale;
     uint8_t digit_count;
@@ -28794,8 +28795,14 @@ struct DecimalCastOperation {
         }
         state.digit_count++;
         if (NEGATIVE) {
+            if (state.result < (NumericLimits<typename T::type_t>::Minimum() / 10)) {
+                return false;
+            }
             state.result = state.result * 10 - digit;
         } else {
+            if (state.result > (NumericLimits<typename T::type_t>::Maximum() / 10)) {
+                return false;
+            }
             state.result = state.result * 10 + digit;
         }
         return true;
@@ -42650,6 +42657,19 @@ static int8_t TemplatedCompareValue(Vector &left_vec, Vector &right_vec, idx_t l
     return 1;
 }

+template <>
+int8_t TemplatedCompareValue<Value>(Vector &left_vec, Vector &right_vec, idx_t left_idx, idx_t right_idx) {
+    auto left_val = left_vec.GetValue(left_idx);
+    auto right_val = right_vec.GetValue(right_idx);
+    if (ValueOperations::Equals(left_val, right_val)) {
+        return 0;
+    }
+    if (ValueOperations::LessThan(left_val, right_val)) {
+        return -1;
+    }
+    return 1;
+}
+
 // return type here is int32 because strcmp() on some platforms returns rather large values
 static int32_t CompareValue(Vector &left_vec, Vector &right_vec, idx_t vector_idx_left, idx_t vector_idx_right,
                             OrderByNullType null_order) {
@@ -42693,7 +42713,7 @@ static int32_t CompareValue(Vector &left_vec, Vector &right_vec, idx_t vector_id
     case PhysicalType::INTERVAL:
         return TemplatedCompareValue<interval_t>(left_vec, right_vec, vector_idx_left, vector_idx_right);
     default:
-
+        return TemplatedCompareValue<Value>(left_vec, right_vec, vector_idx_left, vector_idx_right);
     }
 }

@@ -79319,398 +79339,446 @@ string PhysicalTopN::ParamsToString() const {
|
|
|
79319
79339
|
|
|
79320
79340
|
namespace duckdb {
|
|
79321
79341
|
|
|
79322
|
-
|
|
79342
|
+
string BaseCSVReader::GetLineNumberStr(idx_t linenr, bool linenr_estimated) {
|
|
79343
|
+
string estimated = (linenr_estimated ? string(" (estimated)") : string(""));
|
|
79344
|
+
return to_string(linenr + 1) + estimated;
|
|
79345
|
+
}
|
|
79323
79346
|
|
|
79324
|
-
|
|
79325
|
-
|
|
79326
|
-
|
|
79327
|
-
return true;
|
|
79328
|
-
}
|
|
79329
|
-
if (set.size() > 1) {
|
|
79330
|
-
throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption);
|
|
79331
|
-
}
|
|
79332
|
-
return ParseBoolean(set[0], loption);
|
|
79347
|
+
BaseCSVReader::BaseCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
|
|
79348
|
+
BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
|
|
79349
|
+
: fs(fs_p), allocator(allocator), opener(opener_p), options(move(options_p)) {
|
|
79333
79350
|
}
|
|
79334
79351
|
|
|
79335
|
-
|
|
79352
|
+
BaseCSVReader::BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
|
|
79353
|
+
const vector<LogicalType> &requested_types)
|
|
79354
|
+
: BaseCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
|
|
79355
|
+
move(options_p), requested_types) {
|
|
79356
|
+
}
|
|
79336
79357
|
|
|
79337
|
-
|
|
79338
|
-
auto &children = ListValue::GetChildren(value);
|
|
79339
|
-
return ParseBoolean(children, loption);
|
|
79340
|
-
}
|
|
79341
|
-
if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE ||
|
|
79342
|
-
value.type().id() == LogicalTypeId::DECIMAL) {
|
|
79343
|
-
throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption);
|
|
79344
|
-
}
|
|
79345
|
-
return BooleanValue::Get(value.DefaultCastAs(LogicalType::BOOLEAN));
|
|
79358
|
+
BaseCSVReader::~BaseCSVReader() {
|
|
79346
79359
|
}
|
|
79347
79360
|
|
|
79348
|
-
|
|
79349
|
-
|
|
79350
|
-
|
|
79351
|
-
|
|
79352
|
-
|
|
79353
|
-
|
|
79354
|
-
|
|
79361
|
+
unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
|
|
79362
|
+
auto file_handle = fs.OpenFile(options_p.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
|
|
79363
|
+
options_p.compression, this->opener);
|
|
79364
|
+
return make_unique<CSVFileHandle>(move(file_handle));
|
|
79365
|
+
}
|
|
79366
|
+
|
|
79367
|
+
void BaseCSVReader::InitParseChunk(idx_t num_cols) {
|
|
79368
|
+
// adapt not null info
|
|
79369
|
+
if (options.force_not_null.size() != num_cols) {
|
|
79370
|
+
options.force_not_null.resize(num_cols, false);
|
|
79355
79371
|
}
|
|
79356
|
-
if (
|
|
79357
|
-
|
|
79372
|
+
if (num_cols == parse_chunk.ColumnCount()) {
|
|
79373
|
+
parse_chunk.Reset();
|
|
79374
|
+
} else {
|
|
79375
|
+
parse_chunk.Destroy();
|
|
79376
|
+
|
|
79377
|
+
// initialize the parse_chunk with a set of VARCHAR types
|
|
79378
|
+
vector<LogicalType> varchar_types(num_cols, LogicalType::VARCHAR);
|
|
79379
|
+
parse_chunk.Initialize(allocator, varchar_types);
|
|
79358
79380
|
}
|
|
79359
|
-
return value.GetValue<string>();
|
|
79360
79381
|
}
|
|
79361
79382
|
|
|
79362
|
-
|
|
79363
|
-
|
|
79364
|
-
|
|
79365
|
-
if (children.size() != 1) {
|
|
79366
|
-
// no option specified or multiple options specified
|
|
79367
|
-
throw BinderException("\"%s\" expects a single argument as an integer value", loption);
|
|
79368
|
-
}
|
|
79369
|
-
return ParseInteger(children[0], loption);
|
|
79383
|
+
void BaseCSVReader::InitInsertChunkIdx(idx_t num_cols) {
|
|
79384
|
+
for (idx_t col = 0; col < num_cols; ++col) {
|
|
79385
|
+
insert_cols_idx.push_back(col);
|
|
79370
79386
|
}
|
|
79371
|
-
return value.GetValue<int64_t>();
|
|
79372
79387
|
}
|
|
79373
79388
|
|
|
79374
|
-
|
|
79375
|
-
|
|
79389
|
+
void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
|
|
79390
|
+
options.has_format[sql_type] = true;
|
|
79391
|
+
auto &date_format = options.date_format[sql_type];
|
|
79392
|
+
date_format.format_specifier = format_specifier;
|
|
79393
|
+
StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
|
|
79394
|
+
}
|
|
79376
79395
|
|
|
79377
|
-
|
|
79378
|
-
|
|
79396
|
+
bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
|
|
79397
|
+
if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
|
|
79398
|
+
date_t result;
|
|
79399
|
+
string error_message;
|
|
79400
|
+
return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
|
|
79401
|
+
error_message);
|
|
79402
|
+
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
|
|
79403
|
+
timestamp_t result;
|
|
79404
|
+
string error_message;
|
|
79405
|
+
return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
|
|
79406
|
+
result, error_message);
|
|
79407
|
+
} else {
|
|
79408
|
+
Value new_value;
|
|
79409
|
+
string error_message;
|
|
79410
|
+
return value.DefaultTryCastAs(sql_type, new_value, &error_message, true);
|
|
79379
79411
|
}
|
|
79380
|
-
|
|
79381
|
-
|
|
79382
|
-
|
|
79383
|
-
|
|
79412
|
+
}
|
|
79413
|
+
|
|
79414
|
+
struct TryCastDateOperator {
|
|
79415
|
+
static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
|
|
79416
|
+
return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
|
|
79384
79417
|
}
|
|
79385
|
-
|
|
79386
|
-
|
|
79387
|
-
|
|
79388
|
-
|
|
79389
|
-
|
|
79390
|
-
|
|
79391
|
-
}
|
|
79418
|
+
};
|
|
79419
|
+
|
|
79420
|
+
struct TryCastTimestampOperator {
|
|
79421
|
+
static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
|
|
79422
|
+
string &error_message) {
|
|
79423
|
+
return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
|
|
79392
79424
|
}
|
|
79393
|
-
|
|
79394
|
-
|
|
79395
|
-
|
|
79396
|
-
|
|
79425
|
+
};
|
|
79426
|
+
|
|
79427
|
+
template <class OP, class T>
|
|
79428
|
+
static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
|
79429
|
+
idx_t count, string &error_message) {
|
|
79430
|
+
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
|
79431
|
+
bool all_converted = true;
|
|
79432
|
+
UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
|
|
79433
|
+
T result;
|
|
79434
|
+
if (!OP::Operation(options, input, result, error_message)) {
|
|
79435
|
+
all_converted = false;
|
|
79397
79436
|
}
|
|
79398
|
-
|
|
79399
|
-
|
|
79437
|
+
return result;
|
|
79438
|
+
});
|
|
79439
|
+
return all_converted;
|
|
79400
79440
|
}
|
|
79401
79441
|
|
|
79402
|
-
|
|
79403
|
-
|
|
79442
|
+
bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
|
79443
|
+
string &error_message) {
|
|
79444
|
+
return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
|
|
79445
|
+
error_message);
|
|
79446
|
+
}
|
|
79404
79447
|
|
|
79405
|
-
|
|
79406
|
-
|
|
79407
|
-
|
|
79408
|
-
|
|
79409
|
-
|
|
79410
|
-
|
|
79411
|
-
|
|
79412
|
-
|
|
79413
|
-
|
|
79414
|
-
|
|
79415
|
-
|
|
79416
|
-
|
|
79417
|
-
|
|
79418
|
-
|
|
79419
|
-
|
|
79448
|
+
bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
|
79449
|
+
string &error_message) {
|
|
79450
|
+
return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
|
|
79451
|
+
count, error_message);
|
|
79452
|
+
}
|
|
79453
|
+
|
|
79454
|
+
bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
|
|
79455
|
+
// try vector-cast from string to sql_type
|
|
79456
|
+
Vector dummy_result(sql_type);
|
|
79457
|
+
if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
|
|
79458
|
+
// use the date format to cast the chunk
|
|
79459
|
+
string error_message;
|
|
79460
|
+
return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message);
|
|
79461
|
+
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
|
|
79462
|
+
// use the timestamp format to cast the chunk
|
|
79463
|
+
string error_message;
|
|
79464
|
+
return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
|
|
79465
|
+
} else {
|
|
79466
|
+
// target type is not varchar: perform a cast
|
|
79467
|
+
string error_message;
|
|
79468
|
+
return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
|
|
79420
79469
|
}
|
|
79421
|
-
return ParseColumnList(children, names, loption);
|
|
79422
79470
|
}
|
|
79423
79471
|
|
|
79424
|
-
|
|
79425
|
-
|
|
79426
|
-
|
|
79427
|
-
|
|
79428
|
-
|
|
79429
|
-
|
|
79472
|
+
void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes) {
|
|
79473
|
+
auto length = str_val.GetSize();
|
|
79474
|
+
if (length == 0 && column == 0) {
|
|
79475
|
+
row_empty = true;
|
|
79476
|
+
} else {
|
|
79477
|
+
row_empty = false;
|
|
79430
79478
|
}
|
|
79431
79479
|
|
|
79432
|
-
|
|
79433
|
-
|
|
79480
|
+
if (!sql_types.empty() && column == sql_types.size() && length == 0) {
|
|
79481
|
+
// skip a single trailing delimiter in last column
|
|
79482
|
+
return;
|
|
79434
79483
|
}
|
|
79435
|
-
|
|
79436
|
-
|
|
79437
|
-
|
|
79438
|
-
}
|
|
79439
|
-
file_handle->Seek(position);
|
|
79484
|
+
if (mode == ParserMode::SNIFFING_DIALECT) {
|
|
79485
|
+
column++;
|
|
79486
|
+
return;
|
|
79440
79487
|
}
|
|
79441
|
-
|
|
79442
|
-
if (
|
|
79443
|
-
|
|
79488
|
+
if (column >= sql_types.size()) {
|
|
79489
|
+
if (options.ignore_errors) {
|
|
79490
|
+
error_column_overflow = true;
|
|
79491
|
+
return;
|
|
79492
|
+
} else {
|
|
79493
|
+
throw InvalidInputException(
|
|
79494
|
+
"Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
|
|
79495
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), options.ToString());
|
|
79444
79496
|
}
|
|
79445
|
-
return file_handle->SeekPosition();
|
|
79446
79497
|
}
|
|
79447
|
-
|
|
79448
|
-
|
|
79449
|
-
|
|
79450
|
-
|
|
79451
|
-
|
|
79452
|
-
|
|
79498
|
+
|
|
79499
|
+
// insert the line number into the chunk
|
|
79500
|
+
idx_t row_entry = parse_chunk.size();
|
|
79501
|
+
|
|
79502
|
+
// test against null string, but only if the value was not quoted
|
|
79503
|
+
if ((!has_quotes || sql_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
|
|
79504
|
+
Equals::Operation(str_val, string_t(options.null_str))) {
|
|
79505
|
+
FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
|
|
79506
|
+
} else {
|
|
79507
|
+
auto &v = parse_chunk.data[column];
|
|
79508
|
+
auto parse_data = FlatVector::GetData<string_t>(v);
|
|
79509
|
+
if (!escape_positions.empty()) {
|
|
79510
|
+
// remove escape characters (if any)
|
|
79511
|
+
string old_val = str_val.GetString();
|
|
79512
|
+
string new_val = "";
|
|
79513
|
+
idx_t prev_pos = 0;
|
|
79514
|
+
for (idx_t i = 0; i < escape_positions.size(); i++) {
|
|
79515
|
+
idx_t next_pos = escape_positions[i];
|
|
79516
|
+
new_val += old_val.substr(prev_pos, next_pos - prev_pos);
|
|
79517
|
+
|
|
79518
|
+
if (options.escape.empty() || options.escape == options.quote) {
|
|
79519
|
+
prev_pos = next_pos + options.quote.size();
|
|
79520
|
+
} else {
|
|
79521
|
+
prev_pos = next_pos + options.escape.size();
|
|
79522
|
+
}
|
|
79453
79523
|
}
|
|
79454
|
-
|
|
79524
|
+
new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
|
|
79525
|
+
escape_positions.clear();
|
|
79526
|
+
parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val));
|
|
79527
|
+
} else {
|
|
79528
|
+
parse_data[row_entry] = str_val;
|
|
79455
79529
|
}
|
|
79456
79530
|
}
|
|
79457
|
-
bool PlainFileSource() {
|
|
79458
|
-
return plain_file_source;
|
|
79459
|
-
}
|
|
79460
79531
|
|
|
79461
|
-
|
|
79462
|
-
|
|
79463
|
-
|
|
79532
|
+
// move to the next column
|
|
79533
|
+
column++;
|
|
79534
|
+
}
|
|
79464
79535
|
|
|
79465
|
-
|
|
79466
|
-
|
|
79467
|
-
}
|
|
79536
|
+
bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
|
|
79537
|
+
linenr++;
|
|
79468
79538
|
|
|
79469
|
-
|
|
79470
|
-
|
|
79471
|
-
|
|
79472
|
-
|
|
79473
|
-
|
|
79474
|
-
// we need to read from our cached buffer
|
|
79475
|
-
auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
|
|
79476
|
-
memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
|
|
79477
|
-
result_offset += buffer_read_count;
|
|
79478
|
-
read_position += buffer_read_count;
|
|
79479
|
-
if (result_offset == nr_bytes) {
|
|
79480
|
-
return nr_bytes;
|
|
79481
|
-
}
|
|
79482
|
-
} else if (!reset_enabled && cached_buffer) {
|
|
79483
|
-
// reset is disabled but we still have cached data
|
|
79484
|
-
// we can remove any cached data
|
|
79485
|
-
cached_buffer.reset();
|
|
79486
|
-
buffer_size = 0;
|
|
79487
|
-
buffer_capacity = 0;
|
|
79488
|
-
read_position = 0;
|
|
79489
|
-
}
|
|
79490
|
-
// we have data left to read from the file
|
|
79491
|
-
// read directly into the buffer
|
|
79492
|
-
auto bytes_read = file_handle->Read((char *)buffer + result_offset, nr_bytes - result_offset);
|
|
79493
|
-
read_position += bytes_read;
|
|
79494
|
-
if (reset_enabled) {
|
|
79495
|
-
// if reset caching is enabled, we need to cache the bytes that we have read
|
|
79496
|
-
if (buffer_size + bytes_read >= buffer_capacity) {
|
|
79497
|
-
// no space; first enlarge the buffer
|
|
79498
|
-
buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
|
|
79499
|
-
|
|
79500
|
-
auto new_buffer = unique_ptr<data_t[]>(new data_t[buffer_capacity]);
|
|
79501
|
-
if (buffer_size > 0) {
|
|
79502
|
-
memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
|
|
79503
|
-
}
|
|
79504
|
-
cached_buffer = move(new_buffer);
|
|
79505
|
-
}
|
|
79506
|
-
memcpy(cached_buffer.get() + buffer_size, (char *)buffer + result_offset, bytes_read);
|
|
79507
|
-
buffer_size += bytes_read;
|
|
79539
|
+
if (row_empty) {
|
|
79540
|
+
row_empty = false;
|
|
79541
|
+
if (sql_types.size() != 1) {
|
|
79542
|
+
if (mode == ParserMode::PARSING) {
|
|
79543
|
+
FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
|
|
79508
79544
|
}
|
|
79545
|
+
column = 0;
|
|
79546
|
+
return false;
|
|
79547
|
+
}
|
|
79548
|
+
}
|
|
79549
|
+
|
|
79550
|
+
// Error forwarded by 'ignore_errors' - originally encountered in 'AddValue'
|
|
79551
|
+
if (error_column_overflow) {
|
|
79552
|
+
D_ASSERT(options.ignore_errors);
|
|
79553
|
+
error_column_overflow = false;
|
|
79554
|
+
column = 0;
|
|
79555
|
+
return false;
|
|
79556
|
+
}
|
|
79509
79557
|
|
|
79510
|
-
|
|
79558
|
+
if (column < sql_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
|
|
79559
|
+
if (options.ignore_errors) {
|
|
79560
|
+
column = 0;
|
|
79561
|
+
return false;
|
|
79511
79562
|
} else {
|
|
79512
|
-
|
|
79563
|
+
throw InvalidInputException(
|
|
79564
|
+
"Error in file \"%s\" on line %s: expected %lld values per row, but got %d. (%s)", options.file_path,
|
|
79565
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), column, options.ToString());
|
|
79513
79566
|
}
|
|
79514
79567
|
}
|
|
79515
79568
|
|
|
79516
|
-
|
|
79517
|
-
|
|
79518
|
-
|
|
79519
|
-
|
|
79520
|
-
|
|
79521
|
-
idx_t bytes_read = Read(buffer, 1);
|
|
79522
|
-
if (bytes_read == 0) {
|
|
79523
|
-
return result;
|
|
79524
|
-
}
|
|
79525
|
-
if (carriage_return) {
|
|
79526
|
-
if (buffer[0] != '\n') {
|
|
79527
|
-
if (!file_handle->CanSeek()) {
|
|
79528
|
-
throw BinderException(
|
|
79529
|
-
"Carriage return newlines not supported when reading CSV files in which we cannot seek");
|
|
79530
|
-
}
|
|
79531
|
-
file_handle->Seek(file_handle->SeekPosition() - 1);
|
|
79532
|
-
return result;
|
|
79533
|
-
}
|
|
79534
|
-
}
|
|
79535
|
-
if (buffer[0] == '\n') {
|
|
79536
|
-
return result;
|
|
79537
|
-
}
|
|
79538
|
-
if (buffer[0] != '\r') {
|
|
79539
|
-
result += buffer[0];
|
|
79540
|
-
} else {
|
|
79541
|
-
carriage_return = true;
|
|
79542
|
-
}
|
|
79569
|
+
if (mode == ParserMode::SNIFFING_DIALECT) {
|
|
79570
|
+
sniffed_column_counts.push_back(column);
|
|
79571
|
+
|
|
79572
|
+
if (sniffed_column_counts.size() == options.sample_chunk_size) {
|
|
79573
|
+
return true;
|
|
79543
79574
|
}
|
|
79575
|
+
} else {
|
|
79576
|
+
parse_chunk.SetCardinality(parse_chunk.size() + 1);
|
|
79544
79577
|
}
|
|
79545
79578
|
|
|
79546
|
-
|
|
79547
|
-
|
|
79579
|
+
if (mode == ParserMode::PARSING_HEADER) {
|
|
79580
|
+
return true;
|
|
79548
79581
|
}
|
|
79549
79582
|
|
|
79550
|
-
|
|
79551
|
-
|
|
79552
|
-
|
|
79553
|
-
bool can_seek = false;
|
|
79554
|
-
bool plain_file_source = false;
|
|
79555
|
-
idx_t file_size = 0;
|
|
79556
|
-
// reset support
|
|
79557
|
-
unique_ptr<data_t[]> cached_buffer;
|
|
79558
|
-
idx_t read_position = 0;
|
|
79559
|
-
idx_t buffer_size = 0;
|
|
79560
|
-
idx_t buffer_capacity = 0;
|
|
79561
|
-
};
|
|
79583
|
+
if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
|
|
79584
|
+
return true;
|
|
79585
|
+
}
|
|
79562
79586
|
|
|
79563
|
-
|
|
79564
|
-
|
|
79565
|
-
|
|
79566
|
-
if (input.empty()) {
|
|
79567
|
-
this->delimiter = string("\0", 1);
|
|
79587
|
+
if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
|
|
79588
|
+
Flush(insert_chunk);
|
|
79589
|
+
return true;
|
|
79568
79590
|
}
|
|
79591
|
+
|
|
79592
|
+
column = 0;
|
|
79593
|
+
return false;
|
|
79569
79594
|
}
|
|
79570
79595
|
|
|
79571
|
-
void
|
|
79572
|
-
|
|
79573
|
-
|
|
79574
|
-
|
|
79575
|
-
error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
|
|
79576
|
-
date_format.format_specifier = format;
|
|
79577
|
-
} else {
|
|
79578
|
-
auto &date_format = this->write_date_format[type];
|
|
79579
|
-
error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
|
|
79580
|
-
}
|
|
79581
|
-
if (!error.empty()) {
|
|
79582
|
-
throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
|
|
79596
|
+
void BaseCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
|
|
79597
|
+
for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
|
|
79598
|
+
insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
79599
|
+
ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
|
|
79583
79600
|
}
|
|
79584
|
-
has_format[type] = true;
|
|
79585
79601
|
}
|
|
79586
79602
|
|
|
79587
|
-
void
|
|
79588
|
-
|
|
79589
|
-
|
|
79603
|
+
void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
|
|
79604
|
+
D_ASSERT(col_idx < chunk.data.size());
|
|
79605
|
+
D_ASSERT(row_idx < chunk.size());
|
|
79606
|
+
auto &v = chunk.data[col_idx];
|
|
79607
|
+
if (FlatVector::IsNull(v, row_idx)) {
|
|
79590
79608
|
return;
|
|
79591
79609
|
}
|
|
79592
|
-
|
|
79593
|
-
|
|
79594
|
-
|
|
79595
|
-
|
|
79596
|
-
|
|
79597
|
-
|
|
79598
|
-
|
|
79599
|
-
|
|
79600
|
-
sample_chunks = std::numeric_limits<uint64_t>::max();
|
|
79601
|
-
sample_chunk_size = STANDARD_VECTOR_SIZE;
|
|
79602
|
-
} else if (sample_size <= STANDARD_VECTOR_SIZE) {
|
|
79603
|
-
sample_chunk_size = sample_size;
|
|
79604
|
-
sample_chunks = 1;
|
|
79605
|
-
} else {
|
|
79606
|
-
sample_chunk_size = STANDARD_VECTOR_SIZE;
|
|
79607
|
-
sample_chunks = sample_size / STANDARD_VECTOR_SIZE;
|
|
79608
|
-
}
|
|
79609
|
-
} else if (loption == "skip") {
|
|
79610
|
-
skip_rows = ParseInteger(value, loption);
|
|
79611
|
-
} else if (loption == "max_line_size" || loption == "maximum_line_size") {
|
|
79612
|
-
maximum_line_size = ParseInteger(value, loption);
|
|
79613
|
-
} else if (loption == "sample_chunk_size") {
|
|
79614
|
-
sample_chunk_size = ParseInteger(value, loption);
|
|
79615
|
-
if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
|
|
79616
|
-
throw BinderException(
|
|
79617
|
-
"Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
|
|
79618
|
-
STANDARD_VECTOR_SIZE);
|
|
79619
|
-
} else if (sample_chunk_size < 1) {
|
|
79620
|
-
throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
|
|
79621
|
-
}
|
|
79622
|
-
} else if (loption == "sample_chunks") {
|
|
79623
|
-
sample_chunks = ParseInteger(value, loption);
|
|
79624
|
-
if (sample_chunks < 1) {
|
|
79625
|
-
throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
|
|
79610
|
+
|
|
79611
|
+
auto parse_data = FlatVector::GetData<string_t>(chunk.data[col_idx]);
|
|
79612
|
+
auto s = parse_data[row_idx];
|
|
79613
|
+
auto utf_type = Utf8Proc::Analyze(s.GetDataUnsafe(), s.GetSize());
|
|
79614
|
+
if (utf_type == UnicodeType::INVALID) {
|
|
79615
|
+
string col_name = to_string(col_idx);
|
|
79616
|
+
if (col_idx < col_names.size()) {
|
|
79617
|
+
col_name = "\"" + col_names[col_idx] + "\"";
|
|
79626
79618
|
}
|
|
79627
|
-
|
|
79628
|
-
|
|
79629
|
-
|
|
79630
|
-
|
|
79631
|
-
|
|
79632
|
-
|
|
79633
|
-
string format = ParseString(value, loption);
|
|
79634
|
-
SetDateFormat(LogicalTypeId::TIMESTAMP, format, true);
|
|
79635
|
-
} else if (loption == "escape") {
|
|
79636
|
-
escape = ParseString(value, loption);
|
|
79637
|
-
has_escape = true;
|
|
79638
|
-
} else if (loption == "ignore_errors") {
|
|
79639
|
-
ignore_errors = ParseBoolean(value, loption);
|
|
79640
|
-
} else if (loption == "union_by_name") {
|
|
79641
|
-
union_by_name = ParseBoolean(value, loption);
|
|
79642
|
-
} else {
|
|
79643
|
-
throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
|
|
79619
|
+
int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
|
|
79620
|
+
D_ASSERT(error_line >= 0);
|
|
79621
|
+
throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
|
|
79622
|
+
"%s. Parser options: %s",
|
|
79623
|
+
options.file_path, error_line, col_name,
|
|
79624
|
+
ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
|
|
79644
79625
|
}
|
|
79645
79626
|
}
|
|
79646
79627
|
|
|
79647
|
-
void
|
|
79648
|
-
|
|
79649
|
-
|
|
79628
|
+
void BaseCSVReader::VerifyUTF8(idx_t col_idx) {
|
|
79629
|
+
D_ASSERT(col_idx < parse_chunk.data.size());
|
|
79630
|
+
for (idx_t i = 0; i < parse_chunk.size(); i++) {
|
|
79631
|
+
VerifyUTF8(col_idx, i, parse_chunk);
|
|
79650
79632
|
}
|
|
79633
|
+
}
|
|
79651
79634
|
|
|
79652
|
-
|
|
79653
|
-
|
|
79654
|
-
|
|
79655
|
-
|
|
79656
|
-
|
|
79657
|
-
|
|
79658
|
-
|
|
79659
|
-
|
|
79660
|
-
|
|
79635
|
+
bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
79636
|
+
if (parse_chunk.size() == 0) {
|
|
79637
|
+
return true;
|
|
79638
|
+
}
|
|
79639
|
+
|
|
79640
|
+
bool conversion_error_ignored = false;
|
|
79641
|
+
|
|
79642
|
+
// convert the columns in the parsed chunk to the types of the table
|
|
79643
|
+
insert_chunk.SetCardinality(parse_chunk);
|
|
79644
|
+
for (idx_t col_idx = 0; col_idx < sql_types.size(); col_idx++) {
|
|
79645
|
+
if (sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
|
|
79646
|
+
// target type is varchar: no need to convert
|
|
79647
|
+
// just test that all strings are valid utf-8 strings
|
|
79648
|
+
VerifyUTF8(col_idx);
|
|
79649
|
+
insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
|
|
79650
|
+
} else {
|
|
79651
|
+
string error_message;
|
|
79652
|
+
bool success;
|
|
79653
|
+
if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
|
|
79654
|
+
// use the date format to cast the chunk
|
|
79655
|
+
success =
|
|
79656
|
+
TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
|
|
79657
|
+
parse_chunk.size(), error_message);
|
|
79658
|
+
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
|
79659
|
+
sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
|
|
79660
|
+
// use the date format to cast the chunk
|
|
79661
|
+
success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
|
|
79662
|
+
insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
|
|
79663
|
+
error_message);
|
|
79664
|
+
} else {
|
|
79665
|
+
// target type is not varchar: perform a cast
|
|
79666
|
+
success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
|
|
79667
|
+
insert_chunk.data[insert_cols_idx[col_idx]],
|
|
79668
|
+
parse_chunk.size(), &error_message);
|
|
79669
|
+
}
|
|
79670
|
+
if (success) {
|
|
79671
|
+
continue;
|
|
79672
|
+
}
|
|
79673
|
+
if (try_add_line) {
|
|
79674
|
+
return false;
|
|
79675
|
+
}
|
|
79676
|
+
if (options.ignore_errors) {
|
|
79677
|
+
conversion_error_ignored = true;
|
|
79678
|
+
continue;
|
|
79679
|
+
}
|
|
79680
|
+
string col_name = to_string(col_idx);
|
|
79681
|
+
if (col_idx < col_names.size()) {
|
|
79682
|
+
col_name = "\"" + col_names[col_idx] + "\"";
|
|
79683
|
+
}
|
|
79684
|
+
|
|
79685
|
+
// figure out the exact line number
|
|
79686
|
+
idx_t row_idx;
|
|
79687
|
+
for (row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
|
|
79688
|
+
auto &inserted_column = insert_chunk.data[col_idx];
|
|
79689
|
+
auto &parsed_column = parse_chunk.data[col_idx];
|
|
79690
|
+
|
|
79691
|
+
if (FlatVector::IsNull(inserted_column, row_idx) && !FlatVector::IsNull(parsed_column, row_idx)) {
|
|
79692
|
+
break;
|
|
79693
|
+
}
|
|
79694
|
+
}
|
|
79695
|
+
auto error_line = linenr - (parse_chunk.size() - row_idx) + 1;
|
|
79696
|
+
|
|
79697
|
+
if (options.auto_detect) {
|
|
79698
|
+
throw InvalidInputException("%s in column %s, at line %llu. Parser "
|
|
79699
|
+
"options: %s. Consider either increasing the sample size "
|
|
79700
|
+
"(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), "
|
|
79701
|
+
"or skipping column conversion (ALL_VARCHAR=1)",
|
|
79702
|
+
error_message, col_name, error_line, options.ToString());
|
|
79703
|
+
} else {
|
|
79704
|
+
throw InvalidInputException("%s at line %llu in column %s. Parser options: %s ", error_message,
|
|
79705
|
+
error_line, col_name, options.ToString());
|
|
79706
|
+
}
|
|
79661
79707
|
}
|
|
79662
|
-
SetDateFormat(LogicalTypeId::TIMESTAMP, format, false);
|
|
79663
|
-
} else {
|
|
79664
|
-
throw BinderException("Unrecognized option CSV writer \"%s\"", loption);
|
|
79665
79708
|
}
|
|
79666
|
-
|
|
79709
|
+
if (conversion_error_ignored) {
|
|
79710
|
+
D_ASSERT(options.ignore_errors);
|
|
79711
|
+
SelectionVector succesful_rows;
|
|
79712
|
+
succesful_rows.Initialize(parse_chunk.size());
|
|
79713
|
+
idx_t sel_size = 0;
|
|
79667
79714
|
|
|
79668
|
-
|
|
79669
|
-
|
|
79670
|
-
|
|
79715
|
+
for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
|
|
79716
|
+
bool failed = false;
|
|
79717
|
+
for (idx_t column_idx = 0; column_idx < sql_types.size(); column_idx++) {
|
|
79671
79718
|
|
|
79672
|
-
|
|
79673
|
-
|
|
79674
|
-
|
|
79675
|
-
|
|
79676
|
-
|
|
79677
|
-
|
|
79678
|
-
|
|
79679
|
-
|
|
79680
|
-
|
|
79681
|
-
|
|
79682
|
-
|
|
79683
|
-
|
|
79684
|
-
null_str = ParseString(value, loption);
|
|
79685
|
-
} else if (loption == "encoding") {
|
|
79686
|
-
auto encoding = StringUtil::Lower(ParseString(value, loption));
|
|
79687
|
-
if (encoding != "utf8" && encoding != "utf-8") {
|
|
79688
|
-
throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'");
|
|
79719
|
+
auto &inserted_column = insert_chunk.data[column_idx];
|
|
79720
|
+
auto &parsed_column = parse_chunk.data[column_idx];
|
|
79721
|
+
|
|
79722
|
+
bool was_already_null = FlatVector::IsNull(parsed_column, row_idx);
|
|
79723
|
+
if (!was_already_null && FlatVector::IsNull(inserted_column, row_idx)) {
|
|
79724
|
+
failed = true;
|
|
79725
|
+
break;
|
|
79726
|
+
}
|
|
79727
|
+
}
|
|
79728
|
+
if (!failed) {
|
|
79729
|
+
succesful_rows.set_index(sel_size++, row_idx);
|
|
79730
|
+
}
|
|
79689
79731
|
}
|
|
79690
|
-
|
|
79691
|
-
compression = FileCompressionTypeFromString(ParseString(value, loption));
|
|
79692
|
-
} else {
|
|
79693
|
-
// unrecognized option in base CSV
|
|
79694
|
-
return false;
|
|
79732
|
+
insert_chunk.Slice(succesful_rows, sel_size);
|
|
79695
79733
|
}
|
|
79734
|
+
parse_chunk.Reset();
|
|
79696
79735
|
return true;
|
|
79697
79736
|
}
|
|
79737
|
+
} // namespace duckdb
|
|
79698
79738
|
|
|
79699
|
-
|
|
79700
|
-
|
|
79701
|
-
|
|
79702
|
-
|
|
79703
|
-
|
|
79704
|
-
|
|
79705
|
-
|
|
79706
|
-
|
|
79739
|
+
|
|
79740
|
+
|
|
79741
|
+
|
|
79742
|
+
|
|
79743
|
+
|
|
79744
|
+
|
|
79745
|
+
|
|
79746
|
+
|
|
79747
|
+
|
|
79748
|
+
|
|
79749
|
+
|
|
79750
|
+
|
|
79751
|
+
|
|
79752
|
+
|
|
79753
|
+
|
|
79754
|
+
|
|
79755
|
+
|
|
79756
|
+
#include <algorithm>
|
|
79757
|
+
#include <cctype>
|
|
79758
|
+
#include <cstring>
|
|
79759
|
+
#include <fstream>
|
|
79760
|
+
|
|
79761
|
+
namespace duckdb {
|
|
79762
|
+
|
|
79763
|
+
BufferedCSVReader::BufferedCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
|
|
79764
|
+
BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
|
|
79765
|
+
: BaseCSVReader(fs_p, allocator, opener_p, move(options_p), requested_types), buffer_size(0), position(0),
|
|
79766
|
+
start(0) {
|
|
79767
|
+
file_handle = OpenCSV(options);
|
|
79768
|
+
Initialize(requested_types);
|
|
79707
79769
|
}
|
|
79708
79770
|
|
|
79709
|
-
|
|
79710
|
-
|
|
79711
|
-
|
|
79771
|
+
BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
|
|
79772
|
+
const vector<LogicalType> &requested_types)
|
|
79773
|
+
: BufferedCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
|
|
79774
|
+
move(options_p), requested_types) {
|
|
79712
79775
|
}
|
|
79713
79776
|
|
|
79777
|
+
BufferedCSVReader::~BufferedCSVReader() {
|
|
79778
|
+
}
|
|
79779
|
+
|
|
79780
|
+
enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
|
|
79781
|
+
|
|
79714
79782
|
static bool StartsWithNumericDate(string &separator, const string &value) {
|
|
79715
79783
|
auto begin = value.c_str();
|
|
79716
79784
|
auto end = begin + value.size();
|
|
@@ -79813,61 +79881,6 @@ TextSearchShiftArray::TextSearchShiftArray(string search_term) : length(search_t
     }
 }

-BufferedCSVReader::BufferedCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
-                                     BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
-    : fs(fs_p), allocator(allocator), opener(opener_p), options(move(options_p)), buffer_size(0), position(0),
-      start(0) {
-    file_handle = OpenCSV(options);
-    Initialize(requested_types);
-}
-
-BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
-                                     const vector<LogicalType> &requested_types)
-    : BufferedCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
-                        move(options_p), requested_types) {
-}
-
-BufferedCSVReader::~BufferedCSVReader() {
-}
-
-idx_t BufferedCSVReader::GetFileSize() {
-    return file_handle ? file_handle->FileSize() : 0;
-}
-
-void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
-    PrepareComplexParser();
-    if (options.auto_detect) {
-        sql_types = SniffCSV(requested_types);
-        if (sql_types.empty()) {
-            throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
-        }
-        if (cached_chunks.empty()) {
-            JumpToBeginning(options.skip_rows, options.header);
-        }
-    } else {
-        sql_types = requested_types;
-        ResetBuffer();
-        SkipRowsAndReadHeader(options.skip_rows, options.header);
-    }
-    InitParseChunk(sql_types.size());
-    InitInsertChunkIdx(sql_types.size());
-    // we only need reset support during the automatic CSV type detection
-    // since reset support might require caching (in the case of streams), we disable it for the remainder
-    file_handle->DisableReset();
-}
-
-void BufferedCSVReader::PrepareComplexParser() {
-    delimiter_search = TextSearchShiftArray(options.delimiter);
-    escape_search = TextSearchShiftArray(options.escape);
-    quote_search = TextSearchShiftArray(options.quote);
-}
-
-unique_ptr<CSVFileHandle> BufferedCSVReader::OpenCSV(const BufferedCSVReaderOptions &options) {
-    auto file_handle = fs.OpenFile(options.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
-                                   options.compression, this->opener);
-    return make_unique<CSVFileHandle>(move(file_handle));
-}
-
 // Helper function to generate column names
 static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, const string &prefix = "column") {
     int max_digits = NumericHelper::UnsignedLength(total_cols - 1);
@@ -79957,6 +79970,28 @@ static string NormalizeColumnName(const string &col_name) {
     return col_name_cleaned;
 }

+void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
+    PrepareComplexParser();
+    if (options.auto_detect) {
+        sql_types = SniffCSV(requested_types);
+        if (sql_types.empty()) {
+            throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
+        }
+        if (cached_chunks.empty()) {
+            JumpToBeginning(options.skip_rows, options.header);
+        }
+    } else {
+        sql_types = requested_types;
+        ResetBuffer();
+        SkipRowsAndReadHeader(options.skip_rows, options.header);
+    }
+    InitParseChunk(sql_types.size());
+    InitInsertChunkIdx(sql_types.size());
+    // we only need reset support during the automatic CSV type detection
+    // since reset support might require caching (in the case of streams), we disable it for the remainder
+    file_handle->DisableReset();
+}
+
 void BufferedCSVReader::ResetBuffer() {
     buffer.reset();
     buffer_size = 0;
@@ -79980,28 +80015,6 @@ void BufferedCSVReader::ResetStream() {
     jumping_samples = false;
 }

-void BufferedCSVReader::InitParseChunk(idx_t num_cols) {
-    // adapt not null info
-    if (options.force_not_null.size() != num_cols) {
-        options.force_not_null.resize(num_cols, false);
-    }
-    if (num_cols == parse_chunk.ColumnCount()) {
-        parse_chunk.Reset();
-    } else {
-        parse_chunk.Destroy();
-
-        // initialize the parse_chunk with a set of VARCHAR types
-        vector<LogicalType> varchar_types(num_cols, LogicalType::VARCHAR);
-        parse_chunk.Initialize(allocator, varchar_types);
-    }
-}
-
-void BufferedCSVReader::InitInsertChunkIdx(idx_t num_cols) {
-    for (idx_t col = 0; col < num_cols; ++col) {
-        insert_cols_idx.push_back(col);
-    }
-}
-
 void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
     ResetBuffer();
     ResetStream();
@@ -80026,6 +80039,12 @@ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header)
     }
 }

+void BufferedCSVReader::PrepareComplexParser() {
+    delimiter_search = TextSearchShiftArray(options.delimiter);
+    escape_search = TextSearchShiftArray(options.escape);
+    quote_search = TextSearchShiftArray(options.quote);
+}
+
 bool BufferedCSVReader::JumpToNextSample() {
     // get bytes contained in the previously read chunk
     idx_t remaining_bytes_in_buffer = buffer_size - start;
@@ -80099,91 +80118,6 @@ bool BufferedCSVReader::JumpToNextSample() {
     return true;
 }

-void BufferedCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
-    options.has_format[sql_type] = true;
-    auto &date_format = options.date_format[sql_type];
-    date_format.format_specifier = format_specifier;
-    StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
-}
-
-bool BufferedCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
-    if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
-        date_t result;
-        string error_message;
-        return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
-                                                                     error_message);
-    } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
-        timestamp_t result;
-        string error_message;
-        return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
-                                                                               result, error_message);
-    } else {
-        Value new_value;
-        string error_message;
-        return value.DefaultTryCastAs(sql_type, new_value, &error_message, true);
-    }
-}
-
-struct TryCastDateOperator {
-    static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
-        return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
-    }
-};
-
-struct TryCastTimestampOperator {
-    static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
-                          string &error_message) {
-        return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
-    }
-};
-
-template <class OP, class T>
-static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
-                                       idx_t count, string &error_message) {
-    D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
-    bool all_converted = true;
-    UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
-        T result;
-        if (!OP::Operation(options, input, result, error_message)) {
-            all_converted = false;
-        }
-        return result;
-    });
-    return all_converted;
-}
-
-bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
-                       string &error_message) {
-    return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
-                                                                   error_message);
-}
-
-bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
-                            string &error_message) {
-    return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
-                                                                             count, error_message);
-}
-
-bool BufferedCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
-    // try vector-cast from string to sql_type
-    Vector dummy_result(sql_type);
-    if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
-        // use the date format to cast the chunk
-        string error_message;
-        return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message);
-    } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
-        // use the timestamp format to cast the chunk
-        string error_message;
-        return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
-    } else {
-        // target type is not varchar: perform a cast
-        string error_message;
-        return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
-    }
-}
-
-enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
-
 void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types,
                                       BufferedCSVReaderOptions &original_options,
                                       vector<BufferedCSVReaderOptions> &info_candidates, idx_t &best_num_cols) {
@@ -81181,267 +81115,926 @@ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_ch
|
|
|
81181
81115
|
}
|
|
81182
81116
|
}
|
|
81183
81117
|
|
|
81184
|
-
|
|
81185
|
-
|
|
81186
|
-
|
|
81187
|
-
|
|
81188
|
-
|
|
81189
|
-
|
|
81118
|
+
} // namespace duckdb
|
|
81119
|
+
|
|
81120
|
+
|
|
81121
|
+
|
|
81122
|
+
namespace duckdb {
|
|
81123
|
+
|
|
81124
|
+
CSVBuffer::CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle) : first_buffer(true) {
|
|
81125
|
+
buffer = unique_ptr<char[]>(new char[buffer_size_p]);
|
|
81126
|
+
actual_size = file_handle.Read(buffer.get(), buffer_size_p);
|
|
81127
|
+
if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
|
|
81128
|
+
start_position += 3;
|
|
81190
81129
|
}
|
|
81130
|
+
last_buffer = file_handle.FinishedReading();
|
|
81131
|
+
}
|
|
81191
81132
|
|
|
81192
|
-
|
|
81193
|
-
|
|
81194
|
-
|
|
81133
|
+
CSVBuffer::CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer)
|
|
81134
|
+
: buffer(move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
|
|
81135
|
+
}
|
|
81136
|
+
|
|
81137
|
+
unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
|
|
81138
|
+
if (file_handle.FinishedReading()) {
|
|
81139
|
+
// this was the last buffer
|
|
81140
|
+
return nullptr;
|
|
81195
81141
|
}
|
|
81196
|
-
|
|
81197
|
-
|
|
81142
|
+
|
|
81143
|
+
auto next_buffer = unique_ptr<char[]>(new char[set_buffer_size]);
|
|
81144
|
+
|
|
81145
|
+
idx_t next_buffer_actual_size = file_handle.Read(next_buffer.get(), set_buffer_size);
|
|
81146
|
+
|
|
81147
|
+
return make_unique<CSVBuffer>(move(next_buffer), set_buffer_size, next_buffer_actual_size,
|
|
81148
|
+
file_handle.FinishedReading());
|
|
81149
|
+
}
|
|
81150
|
+
|
|
81151
|
+
idx_t CSVBuffer::GetBufferSize() {
|
|
81152
|
+
return actual_size;
|
|
81153
|
+
}
|
|
81154
|
+
|
|
81155
|
+
idx_t CSVBuffer::GetStart() {
|
|
81156
|
+
return start_position;
|
|
81157
|
+
}
|
|
81158
|
+
|
|
81159
|
+
bool CSVBuffer::IsCSVFileLastBuffer() {
|
|
81160
|
+
return last_buffer;
|
|
81161
|
+
}
|
|
81162
|
+
|
|
81163
|
+
bool CSVBuffer::IsCSVFileFirstBuffer() {
|
|
81164
|
+
return first_buffer;
|
|
81165
|
+
}
|
|
81166
|
+
|
|
81167
|
+
} // namespace duckdb
|
|
81168
|
+
|
|
81169
|
+
|
|
81170
|
+
|
|
81171
|
+
|
|
81172
|
+
namespace duckdb {
|
|
81173
|
+
|
|
81174
|
+
static bool ParseBoolean(const Value &value, const string &loption);
|
|
81175
|
+
|
|
81176
|
+
static bool ParseBoolean(const vector<Value> &set, const string &loption) {
|
|
81177
|
+
if (set.empty()) {
|
|
81178
|
+
// no option specified: default to true
|
|
81179
|
+
return true;
|
|
81180
|
+
}
|
|
81181
|
+
if (set.size() > 1) {
|
|
81182
|
+
throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption);
|
|
81183
|
+
}
|
|
81184
|
+
return ParseBoolean(set[0], loption);
|
|
81185
|
+
}
|
|
81186
|
+
|
|
81187
|
+
static bool ParseBoolean(const Value &value, const string &loption) {
|
|
81188
|
+
|
|
81189
|
+
if (value.type().id() == LogicalTypeId::LIST) {
|
|
81190
|
+
auto &children = ListValue::GetChildren(value);
|
|
81191
|
+
return ParseBoolean(children, loption);
|
|
81192
|
+
}
|
|
81193
|
+
if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE ||
|
|
81194
|
+
value.type().id() == LogicalTypeId::DECIMAL) {
|
|
81195
|
+
throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption);
|
|
81196
|
+
}
|
|
81197
|
+
return BooleanValue::Get(value.DefaultCastAs(LogicalType::BOOLEAN));
|
|
81198
|
+
}
|
|
81199
|
+
|
|
81200
|
+
static string ParseString(const Value &value, const string &loption) {
|
|
81201
|
+
if (value.type().id() == LogicalTypeId::LIST) {
|
|
81202
|
+
auto &children = ListValue::GetChildren(value);
|
|
81203
|
+
if (children.size() != 1) {
|
|
81204
|
+
throw BinderException("\"%s\" expects a single argument as a string value", loption);
|
|
81205
|
+
}
|
|
81206
|
+
return ParseString(children[0], loption);
|
|
81207
|
+
}
|
|
81208
|
+
if (value.type().id() != LogicalTypeId::VARCHAR) {
|
|
81209
|
+
throw BinderException("\"%s\" expects a string argument!", loption);
|
|
81210
|
+
}
|
|
81211
|
+
return value.GetValue<string>();
|
|
81212
|
+
}
|
|
81213
|
+
|
|
81214
|
+
static int64_t ParseInteger(const Value &value, const string &loption) {
|
|
81215
|
+
if (value.type().id() == LogicalTypeId::LIST) {
|
|
81216
|
+
auto &children = ListValue::GetChildren(value);
|
|
81217
|
+
if (children.size() != 1) {
|
|
81218
|
+
// no option specified or multiple options specified
|
|
81219
|
+
throw BinderException("\"%s\" expects a single argument as an integer value", loption);
|
|
81220
|
+
}
|
|
81221
|
+
return ParseInteger(children[0], loption);
|
|
81222
|
+
}
|
|
81223
|
+
return value.GetValue<int64_t>();
|
|
81224
|
+
}
|
|
81225
|
+
|
|
81226
|
+
static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
|
|
81227
|
+
vector<bool> result;
|
|
81228
|
+
|
|
81229
|
+
if (set.empty()) {
|
|
81230
|
+
throw BinderException("\"%s\" expects a column list or * as parameter", loption);
|
|
81231
|
+
}
|
|
81232
|
+
// list of options: parse the list
|
|
81233
|
+
unordered_map<string, bool> option_map;
|
|
81234
|
+
for (idx_t i = 0; i < set.size(); i++) {
|
|
81235
|
+
option_map[set[i].ToString()] = false;
|
|
81236
|
+
}
|
|
81237
|
+
result.resize(names.size(), false);
|
|
81238
|
+
for (idx_t i = 0; i < names.size(); i++) {
|
|
81239
|
+
auto entry = option_map.find(names[i]);
|
|
81240
|
+
if (entry != option_map.end()) {
|
|
81241
|
+
result[i] = true;
|
|
81242
|
+
entry->second = true;
|
|
81243
|
+
}
|
|
81244
|
+
}
|
|
81245
|
+
for (auto &entry : option_map) {
|
|
81246
|
+
if (!entry.second) {
|
|
81247
|
+
throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
|
|
81248
|
+
entry.first.c_str());
|
|
81249
|
+
}
|
|
81250
|
+
}
|
|
81251
|
+
return result;
|
|
81252
|
+
}
|
|
81253
|
+
|
|
81254
|
+
static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
|
|
81255
|
+
vector<bool> result;
|
|
81256
|
+
|
|
81257
|
+
// Only accept a list of arguments
|
|
81258
|
+
if (value.type().id() != LogicalTypeId::LIST) {
|
|
81259
|
+
// Support a single argument if it's '*'
|
|
81260
|
+
if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
|
|
81261
|
+
result.resize(names.size(), true);
|
|
81262
|
+
return result;
|
|
81263
|
+
}
|
|
81264
|
+
throw BinderException("\"%s\" expects a column list or * as parameter", loption);
|
|
81265
|
+
}
|
|
81266
|
+
auto &children = ListValue::GetChildren(value);
|
|
81267
|
+
// accept '*' as single argument
|
|
81268
|
+
if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
|
|
81269
|
+
children[0].GetValue<string>() == "*") {
|
|
81270
|
+
result.resize(names.size(), true);
|
|
81271
|
+
return result;
|
|
81272
|
+
}
|
|
81273
|
+
return ParseColumnList(children, names, loption);
|
|
81274
|
+
}
|
|
81275
|
+
|
|
81276
|
+
void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
|
|
81277
|
+
this->delimiter = StringUtil::Replace(input, "\\t", "\t");
|
|
81278
|
+
this->has_delimiter = true;
|
|
81279
|
+
if (input.empty()) {
|
|
81280
|
+
this->delimiter = string("\0", 1);
|
|
81281
|
+
}
|
|
81282
|
+
}
|
|
81283
|
+
|
|
81284
|
+
void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
|
|
81285
|
+
string error;
|
|
81286
|
+
if (read_format) {
|
|
81287
|
+
auto &date_format = this->date_format[type];
|
|
81288
|
+
error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
|
|
81289
|
+
date_format.format_specifier = format;
|
|
81290
|
+
} else {
|
|
81291
|
+
auto &date_format = this->write_date_format[type];
|
|
81292
|
+
error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
|
|
81293
|
+
}
|
|
81294
|
+
if (!error.empty()) {
|
|
81295
|
+
throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
|
|
81296
|
+
}
|
|
81297
|
+
has_format[type] = true;
|
|
81298
|
+
}
|
|
81299
|
+
|
|
81300
|
+
void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
|
|
81301
|
+
vector<string> &expected_names) {
|
|
81302
|
+
if (SetBaseOption(loption, value)) {
|
|
81198
81303
|
return;
|
|
81199
81304
|
}
|
|
81200
|
-
if (
|
|
81201
|
-
|
|
81202
|
-
|
|
81203
|
-
|
|
81305
|
+
if (loption == "auto_detect") {
|
|
81306
|
+
auto_detect = ParseBoolean(value, loption);
|
|
81307
|
+
} else if (loption == "sample_size") {
|
|
81308
|
+
int64_t sample_size = ParseInteger(value, loption);
|
|
81309
|
+
if (sample_size < 1 && sample_size != -1) {
|
|
81310
|
+
throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
|
|
81311
|
+
}
|
|
81312
|
+
if (sample_size == -1) {
|
|
81313
|
+
sample_chunks = std::numeric_limits<uint64_t>::max();
|
|
81314
|
+
sample_chunk_size = STANDARD_VECTOR_SIZE;
|
|
81315
|
+
} else if (sample_size <= STANDARD_VECTOR_SIZE) {
|
|
81316
|
+
sample_chunk_size = sample_size;
|
|
81317
|
+
sample_chunks = 1;
|
|
81204
81318
|
} else {
|
|
81205
|
-
|
|
81206
|
-
|
|
81207
|
-
|
|
81319
|
+
sample_chunk_size = STANDARD_VECTOR_SIZE;
|
|
81320
|
+
sample_chunks = sample_size / STANDARD_VECTOR_SIZE;
|
|
81321
|
+
}
|
|
81322
|
+
} else if (loption == "skip") {
|
|
81323
|
+
skip_rows = ParseInteger(value, loption);
|
|
81324
|
+
} else if (loption == "max_line_size" || loption == "maximum_line_size") {
|
|
81325
|
+
maximum_line_size = ParseInteger(value, loption);
|
|
81326
|
+
} else if (loption == "sample_chunk_size") {
|
|
81327
|
+
sample_chunk_size = ParseInteger(value, loption);
|
|
81328
|
+
if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
|
|
81329
|
+
throw BinderException(
|
|
81330
|
+
"Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
|
|
81331
|
+
STANDARD_VECTOR_SIZE);
|
|
81332
|
+
} else if (sample_chunk_size < 1) {
|
|
81333
|
+
throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
|
|
81334
|
+
}
|
|
81335
|
+
} else if (loption == "sample_chunks") {
|
|
81336
|
+
sample_chunks = ParseInteger(value, loption);
|
|
81337
|
+
if (sample_chunks < 1) {
|
|
81338
|
+
throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
|
|
81208
81339
|
}
|
|
81340
|
+
} else if (loption == "force_not_null") {
|
|
81341
|
+
force_not_null = ParseColumnList(value, expected_names, loption);
|
|
81342
|
+
} else if (loption == "date_format" || loption == "dateformat") {
|
|
81343
|
+
string format = ParseString(value, loption);
|
|
81344
|
+
SetDateFormat(LogicalTypeId::DATE, format, true);
|
|
81345
|
+
} else if (loption == "timestamp_format" || loption == "timestampformat") {
|
|
81346
|
+
string format = ParseString(value, loption);
|
|
81347
|
+
SetDateFormat(LogicalTypeId::TIMESTAMP, format, true);
|
|
81348
|
+
} else if (loption == "escape") {
|
|
81349
|
+
escape = ParseString(value, loption);
|
|
81350
|
+
has_escape = true;
|
|
81351
|
+
} else if (loption == "ignore_errors") {
|
|
81352
|
+
ignore_errors = ParseBoolean(value, loption);
|
|
81353
|
+
} else if (loption == "union_by_name") {
|
|
81354
|
+
union_by_name = ParseBoolean(value, loption);
|
|
81355
|
+
} else {
|
|
81356
|
+
throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
|
|
81209
81357
|
}
|
|
81358
|
+
}
|
|
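The new SetReadOption code above maps the SAMPLE_SIZE setting onto the sample_chunk_size / sample_chunks pair. A minimal standalone sketch of that mapping, assuming STANDARD_VECTOR_SIZE = 2048 purely for illustration (the real value is a DuckDB compile-time constant); the helper name is hypothetical and not part of the package:

#include <cstdint>
#include <limits>
#include <stdexcept>
#include <utility>

static constexpr int64_t kStandardVectorSize = 2048; // assumed stand-in for STANDARD_VECTOR_SIZE

// Returns {sample_chunk_size, sample_chunks} for a given SAMPLE_SIZE value.
std::pair<uint64_t, uint64_t> MapSampleSize(int64_t sample_size) {
	if (sample_size < 1 && sample_size != -1) {
		throw std::invalid_argument("SAMPLE_SIZE cannot be smaller than 1");
	}
	if (sample_size == -1) {
		// -1 means: sample everything
		return {static_cast<uint64_t>(kStandardVectorSize), std::numeric_limits<uint64_t>::max()};
	}
	if (sample_size <= kStandardVectorSize) {
		// small sample: a single chunk of exactly sample_size rows
		return {static_cast<uint64_t>(sample_size), static_cast<uint64_t>(1)};
	}
	// large sample: full-sized chunks, sample_size / vector-size of them
	return {static_cast<uint64_t>(kStandardVectorSize), static_cast<uint64_t>(sample_size / kStandardVectorSize)};
}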
81210
81359
|
|
|
81211
|
-
|
|
81212
|
-
|
|
81360
|
+
void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
|
|
81361
|
+
if (SetBaseOption(loption, value)) {
|
|
81362
|
+
return;
|
|
81363
|
+
}
|
|
81213
81364
|
|
|
81214
|
-
|
|
81215
|
-
|
|
81216
|
-
|
|
81217
|
-
|
|
81365
|
+
if (loption == "force_quote") {
|
|
81366
|
+
force_quote = ParseColumnList(value, names, loption);
|
|
81367
|
+
} else if (loption == "date_format" || loption == "dateformat") {
|
|
81368
|
+
string format = ParseString(value, loption);
|
|
81369
|
+
SetDateFormat(LogicalTypeId::DATE, format, false);
|
|
81370
|
+
} else if (loption == "timestamp_format" || loption == "timestampformat") {
|
|
81371
|
+
string format = ParseString(value, loption);
|
|
81372
|
+
if (StringUtil::Lower(format) == "iso") {
|
|
81373
|
+
format = "%Y-%m-%dT%H:%M:%S.%fZ";
|
|
81374
|
+
}
|
|
81375
|
+
SetDateFormat(LogicalTypeId::TIMESTAMP, format, false);
|
|
81218
81376
|
} else {
|
|
81219
|
-
|
|
81220
|
-
|
|
81221
|
-
|
|
81222
|
-
// remove escape characters (if any)
|
|
81223
|
-
string old_val = str_val.GetString();
|
|
81224
|
-
string new_val = "";
|
|
81225
|
-
idx_t prev_pos = 0;
|
|
81226
|
-
for (idx_t i = 0; i < escape_positions.size(); i++) {
|
|
81227
|
-
idx_t next_pos = escape_positions[i];
|
|
81228
|
-
new_val += old_val.substr(prev_pos, next_pos - prev_pos);
|
|
81377
|
+
throw BinderException("Unrecognized option CSV writer \"%s\"", loption);
|
|
81378
|
+
}
|
|
81379
|
+
}
|
|
81229
81380
|
|
|
81230
|
-
|
|
81231
|
-
|
|
81232
|
-
|
|
81233
|
-
|
|
81234
|
-
|
|
81235
|
-
|
|
81236
|
-
|
|
81237
|
-
|
|
81238
|
-
|
|
81239
|
-
|
|
81240
|
-
|
|
81381
|
+
bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
|
|
81382
|
+
// Make sure this function was only called after the option was turned into lowercase
|
|
81383
|
+
D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
|
|
81384
|
+
|
|
81385
|
+
if (StringUtil::StartsWith(loption, "delim") || StringUtil::StartsWith(loption, "sep")) {
|
|
81386
|
+
SetDelimiter(ParseString(value, loption));
|
|
81387
|
+
} else if (loption == "quote") {
|
|
81388
|
+
quote = ParseString(value, loption);
|
|
81389
|
+
has_quote = true;
|
|
81390
|
+
} else if (loption == "escape") {
|
|
81391
|
+
escape = ParseString(value, loption);
|
|
81392
|
+
has_escape = true;
|
|
81393
|
+
} else if (loption == "header") {
|
|
81394
|
+
header = ParseBoolean(value, loption);
|
|
81395
|
+
has_header = true;
|
|
81396
|
+
} else if (loption == "null" || loption == "nullstr") {
|
|
81397
|
+
null_str = ParseString(value, loption);
|
|
81398
|
+
} else if (loption == "encoding") {
|
|
81399
|
+
auto encoding = StringUtil::Lower(ParseString(value, loption));
|
|
81400
|
+
if (encoding != "utf8" && encoding != "utf-8") {
|
|
81401
|
+
throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'");
|
|
81241
81402
|
}
|
|
81403
|
+
} else if (loption == "compression") {
|
|
81404
|
+
compression = FileCompressionTypeFromString(ParseString(value, loption));
|
|
81405
|
+
} else {
|
|
81406
|
+
// unrecognized option in base CSV
|
|
81407
|
+
return false;
|
|
81242
81408
|
}
|
|
81409
|
+
return true;
|
|
81410
|
+
}
|
|
81243
81411
|
|
|
81244
|
-
|
|
81245
|
-
|
|
81412
|
+
std::string BufferedCSVReaderOptions::ToString() const {
|
|
81413
|
+
return "DELIMITER='" + delimiter + (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
|
|
81414
|
+
", QUOTE='" + quote + (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
|
|
81415
|
+
", ESCAPE='" + escape + (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
|
|
81416
|
+
", HEADER=" + std::to_string(header) +
|
|
81417
|
+
(has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
|
|
81418
|
+
", SAMPLE_SIZE=" + std::to_string(sample_chunk_size * sample_chunks) +
|
|
81419
|
+
", IGNORE_ERRORS=" + std::to_string(ignore_errors) + ", ALL_VARCHAR=" + std::to_string(all_varchar);
|
|
81246
81420
|
}
|
|
81247
81421
|
|
|
81248
|
-
|
|
81249
|
-
|
|
81422
|
+
} // namespace duckdb
|
|
81423
|
+
//===----------------------------------------------------------------------===//
|
|
81424
|
+
// DuckDB
|
|
81425
|
+
//
|
|
81426
|
+
// duckdb/execution/operator/persistent/buffered_csv_reader.hpp
|
|
81427
|
+
//
|
|
81428
|
+
//
|
|
81429
|
+
//===----------------------------------------------------------------------===//
|
|
81250
81430
|
|
|
81251
|
-
|
|
81252
|
-
|
|
81253
|
-
|
|
81254
|
-
|
|
81255
|
-
|
|
81431
|
+
|
|
81432
|
+
|
|
81433
|
+
|
|
81434
|
+
|
|
81435
|
+
|
|
81436
|
+
|
|
81437
|
+
|
|
81438
|
+
#include <sstream>
|
|
81439
|
+
#include <utility>
|
|
81440
|
+
|
|
81441
|
+
namespace duckdb {
|
|
81442
|
+
|
|
81443
|
+
struct CSVBufferRead {
|
|
81444
|
+
CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
|
|
81445
|
+
idx_t estimated_linenr)
|
|
81446
|
+
: buffer(move(buffer_p)), buffer_start(buffer_start_p), buffer_end(buffer_end_p), batch_index(batch_index),
|
|
81447
|
+
estimated_linenr(estimated_linenr) {
|
|
81448
|
+
if (buffer) {
|
|
81449
|
+
if (buffer_end > buffer->GetBufferSize()) {
|
|
81450
|
+
buffer_end = buffer->GetBufferSize();
|
|
81256
81451
|
}
|
|
81257
|
-
|
|
81258
|
-
|
|
81452
|
+
} else {
|
|
81453
|
+
buffer_start = 0;
|
|
81454
|
+
buffer_end = 0;
|
|
81259
81455
|
}
|
|
81260
81456
|
}
|
|
81261
81457
|
|
|
81262
|
-
|
|
81263
|
-
|
|
81264
|
-
|
|
81265
|
-
|
|
81266
|
-
column = 0;
|
|
81267
|
-
return false;
|
|
81458
|
+
CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
|
|
81459
|
+
idx_t buffer_end_p, idx_t batch_index, idx_t estimated_linenr)
|
|
81460
|
+
: CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, estimated_linenr) {
|
|
81461
|
+
next_buffer = std::move(nxt_buffer_p);
|
|
81268
81462
|
}
|
|
81269
81463
|
|
|
81270
|
-
|
|
81271
|
-
|
|
81272
|
-
|
|
81273
|
-
|
|
81464
|
+
CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits<idx_t>::Maximum()) {};
|
|
81465
|
+
|
|
81466
|
+
const char &operator[](size_t i) const {
|
|
81467
|
+
if (i < buffer->GetBufferSize()) {
|
|
81468
|
+
return buffer->buffer[i];
|
|
81469
|
+
}
|
|
81470
|
+
return next_buffer->buffer[i - buffer->GetBufferSize()];
|
|
81471
|
+
}
|
|
81472
|
+
|
|
81473
|
+
string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
|
|
81474
|
+
idx_t length = position_buffer - start_buffer - offset;
|
|
81475
|
+
// 1) It's all in the current buffer
|
|
81476
|
+
if (start_buffer + length <= buffer->GetBufferSize()) {
|
|
81477
|
+
auto buffer_ptr = buffer->buffer.get();
|
|
81478
|
+
return string_t(buffer_ptr + start_buffer, length);
|
|
81479
|
+
} else if (start_buffer >= buffer->GetBufferSize()) {
|
|
81480
|
+
// 2) It's all in the next buffer
|
|
81481
|
+
D_ASSERT(next_buffer);
|
|
81482
|
+
D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
|
|
81483
|
+
auto buffer_ptr = next_buffer->buffer.get();
|
|
81484
|
+
return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
|
|
81274
81485
|
} else {
|
|
81275
|
-
|
|
81276
|
-
|
|
81277
|
-
|
|
81486
|
+
// 3) It starts in the current buffer and ends in the next buffer
|
|
81487
|
+
D_ASSERT(next_buffer);
|
|
81488
|
+
auto intersection = unique_ptr<char[]>(new char[length]);
|
|
81489
|
+
idx_t cur_pos = 0;
|
|
81490
|
+
for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
|
|
81491
|
+
intersection[cur_pos++] = buffer->buffer[i];
|
|
81492
|
+
}
|
|
81493
|
+
idx_t nxt_buffer_pos = 0;
|
|
81494
|
+
for (; cur_pos < length; cur_pos++) {
|
|
81495
|
+
intersection[cur_pos] = next_buffer->buffer[nxt_buffer_pos++];
|
|
81496
|
+
}
|
|
81497
|
+
intersections.emplace_back(move(intersection));
|
|
81498
|
+
return string_t(intersections.back().get(), length);
|
|
81278
81499
|
}
|
|
81279
81500
|
}
|
|
81280
81501
|
|
|
81281
|
-
|
|
81282
|
-
|
|
81502
|
+
shared_ptr<CSVBuffer> buffer;
|
|
81503
|
+
shared_ptr<CSVBuffer> next_buffer;
|
|
81504
|
+
vector<unique_ptr<char[]>> intersections;
|
|
81283
81505
|
|
|
81284
|
-
|
|
81285
|
-
|
|
81286
|
-
|
|
81287
|
-
|
|
81288
|
-
|
|
81506
|
+
idx_t buffer_start;
|
|
81507
|
+
idx_t buffer_end;
|
|
81508
|
+
idx_t batch_index;
|
|
81509
|
+
idx_t estimated_linenr;
|
|
81510
|
+
};
|
|
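The CSVBufferRead struct above has to hand back values that may cross the boundary between two CSV buffers. A standalone sketch of the three cases its GetValue() distinguishes, using plain std::string instead of the string_t / CSVBuffer types; the helper name is hypothetical:

#include <cstddef>
#include <string>

std::string GetValueAcrossBuffers(const char *current, size_t current_size, const char *next, size_t start,
                                  size_t length) {
	if (start + length <= current_size) {
		// 1) it's all in the current buffer
		return std::string(current + start, length);
	}
	if (start >= current_size) {
		// 2) it's all in the next buffer
		return std::string(next + (start - current_size), length);
	}
	// 3) it starts in the current buffer and ends in the next one: stitch the two pieces together
	std::string value(current + start, current_size - start);
	value.append(next, length - (current_size - start));
	return value;
}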
81511
|
+
|
|
81512
|
+
//! Parallel CSV reader is a class that reads values from a stream and parses them as a CSV file
|
|
81513
|
+
class ParallelCSVReader : public BaseCSVReader {
|
|
81514
|
+
public:
|
|
81515
|
+
ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
|
|
81516
|
+
const vector<LogicalType> &requested_types);
|
|
81517
|
+
~ParallelCSVReader();
|
|
81518
|
+
|
|
81519
|
+
//! Current Position (Relative to the Buffer)
|
|
81520
|
+
idx_t position_buffer = 0;
|
|
81521
|
+
|
|
81522
|
+
//! Start of the piece of the buffer this thread should read
|
|
81523
|
+
idx_t start_buffer = 0;
|
|
81524
|
+
//! End of the piece of this buffer this thread should read
|
|
81525
|
+
idx_t end_buffer = NumericLimits<idx_t>::Maximum();
|
|
81526
|
+
//! The actual buffer size
|
|
81527
|
+
idx_t buffer_size = 0;
|
|
81528
|
+
|
|
81529
|
+
//! If this flag is set, it means we are about to try to read our last row.
|
|
81530
|
+
bool reached_remainder_state = false;
|
|
81531
|
+
|
|
81532
|
+
unique_ptr<CSVBufferRead> buffer;
|
|
81533
|
+
|
|
81534
|
+
public:
|
|
81535
|
+
void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
|
|
81536
|
+
//! Extract a single DataChunk from the CSV file and store it in insert_chunk
|
|
81537
|
+
void ParseCSV(DataChunk &insert_chunk);
|
|
81538
|
+
|
|
81539
|
+
private:
|
|
81540
|
+
//! Initialize Parser
|
|
81541
|
+
void Initialize(const vector<LogicalType> &requested_types);
|
|
81542
|
+
//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
|
|
81543
|
+
void ParseCSV(ParserMode mode);
|
|
81544
|
+
//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
|
|
81545
|
+
bool TryParseCSV(ParserMode mode);
|
|
81546
|
+
//! Extract a single DataChunk from the CSV file and store it in insert_chunk
|
|
81547
|
+
bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
|
|
81548
|
+
//! Sets Position depending on the byte_start of this thread
|
|
81549
|
+
bool SetPosition(DataChunk &insert_chunk);
|
|
81550
|
+
//! When a thread finishes reading its piece of the buffer, it can still try to scan up to the real end of the buffer
|
|
81551
|
+
//! or until it finds a new line. This function sets the buffer_end and marks a boolean variable
|
|
81552
|
+
//! the first time the buffer end is changed.
|
|
81553
|
+
//! It returns FALSE if the parser should jump to the final state of parsing
|
|
81554
|
+
bool BufferRemainder();
|
|
81555
|
+
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
|
81556
|
+
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
|
|
81557
|
+
};
|
|
81558
|
+
|
|
81559
|
+
} // namespace duckdb
|
|
81560
|
+
|
|
81561
|
+
|
|
81562
|
+
|
|
81563
|
+
|
|
81564
|
+
|
|
81565
|
+
|
|
81566
|
+
|
|
81567
|
+
|
|
81568
|
+
|
|
81569
|
+
|
|
81570
|
+
|
|
81571
|
+
|
|
81572
|
+
|
|
81573
|
+
|
|
81574
|
+
|
|
81575
|
+
|
|
81576
|
+
//===----------------------------------------------------------------------===//
|
|
81577
|
+
// DuckDB
|
|
81578
|
+
//
|
|
81579
|
+
// duckdb/function/table/read_csv.hpp
|
|
81580
|
+
//
|
|
81581
|
+
//
|
|
81582
|
+
//===----------------------------------------------------------------------===//
|
|
81583
|
+
|
|
81584
|
+
|
|
81585
|
+
|
|
81586
|
+
|
|
81587
|
+
|
|
81588
|
+
|
|
81589
|
+
|
|
81590
|
+
|
|
81591
|
+
|
|
81592
|
+
|
|
81593
|
+
|
|
81594
|
+
namespace duckdb {
|
|
81595
|
+
|
|
81596
|
+
class ReadCSV {
|
|
81597
|
+
public:
|
|
81598
|
+
static unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options, ClientContext &context);
|
|
81599
|
+
};
|
|
81600
|
+
|
|
81601
|
+
struct BaseCSVData : public TableFunctionData {
|
|
81602
|
+
virtual ~BaseCSVData() {
|
|
81289
81603
|
}
|
|
81604
|
+
//! The file path of the CSV file to read or write
|
|
81605
|
+
vector<string> files;
|
|
81606
|
+
//! The CSV reader options
|
|
81607
|
+
BufferedCSVReaderOptions options;
|
|
81608
|
+
//! Offsets for generated columns
|
|
81609
|
+
idx_t filename_col_idx;
|
|
81610
|
+
idx_t hive_partition_col_idx;
|
|
81290
81611
|
|
|
81291
|
-
|
|
81292
|
-
|
|
81612
|
+
void Finalize();
|
|
81613
|
+
};
|
|
81614
|
+
|
|
81615
|
+
struct WriteCSVData : public BaseCSVData {
|
|
81616
|
+
WriteCSVData(string file_path, vector<LogicalType> sql_types, vector<string> names) : sql_types(move(sql_types)) {
|
|
81617
|
+
files.push_back(move(file_path));
|
|
81618
|
+
options.names = move(names);
|
|
81293
81619
|
}
|
|
81294
81620
|
|
|
81295
|
-
|
|
81296
|
-
|
|
81621
|
+
//! The SQL types to write
|
|
81622
|
+
vector<LogicalType> sql_types;
|
|
81623
|
+
//! The newline string to write
|
|
81624
|
+
string newline = "\n";
|
|
81625
|
+
//! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
|
|
81626
|
+
bool is_simple;
|
|
81627
|
+
//! The size of the CSV file (in bytes) that we buffer before we flush it to disk
|
|
81628
|
+
idx_t flush_size = 4096 * 8;
|
|
81629
|
+
};
|
|
81630
|
+
|
|
81631
|
+
struct ReadCSVData : public BaseCSVData {
|
|
81632
|
+
//! The expected SQL types to read
|
|
81633
|
+
vector<LogicalType> sql_types;
|
|
81634
|
+
//! The initial reader (if any): this is used when automatic detection is used during binding.
|
|
81635
|
+
//! In this case, the CSV reader is already created and might as well be re-used.
|
|
81636
|
+
unique_ptr<BufferedCSVReader> initial_reader;
|
|
81637
|
+
//! The union readers are created (when csv union_by_name option is on) during binding
|
|
81638
|
+
//! Those readers can be re-used during ReadCSVFunction
|
|
81639
|
+
vector<unique_ptr<BufferedCSVReader>> union_readers;
|
|
81640
|
+
//! Whether or not the single-threaded reader should be used
|
|
81641
|
+
bool single_threaded = false;
|
|
81642
|
+
|
|
81643
|
+
void InitializeFiles(ClientContext &context, const vector<string> &patterns);
|
|
81644
|
+
void FinalizeRead(ClientContext &context);
|
|
81645
|
+
};
|
|
81646
|
+
|
|
81647
|
+
struct CSVCopyFunction {
|
|
81648
|
+
static void RegisterFunction(BuiltinFunctions &set);
|
|
81649
|
+
};
|
|
81650
|
+
|
|
81651
|
+
struct ReadCSVTableFunction {
|
|
81652
|
+
static TableFunction GetFunction(bool list_parameter = false);
|
|
81653
|
+
static TableFunction GetAutoFunction(bool list_parameter = false);
|
|
81654
|
+
static void RegisterFunction(BuiltinFunctions &set);
|
|
81655
|
+
};
|
|
81656
|
+
|
|
81657
|
+
} // namespace duckdb
|
|
81658
|
+
|
|
81659
|
+
|
|
81660
|
+
#include <algorithm>
|
|
81661
|
+
#include <cctype>
|
|
81662
|
+
#include <cstring>
|
|
81663
|
+
#include <fstream>
|
|
81664
|
+
#include <utility>
|
|
81665
|
+
|
|
81666
|
+
namespace duckdb {
|
|
81667
|
+
|
|
81668
|
+
ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
|
|
81669
|
+
unique_ptr<CSVBufferRead> buffer_p, const vector<LogicalType> &requested_types)
|
|
81670
|
+
: BaseCSVReader(context, move(options_p), requested_types) {
|
|
81671
|
+
Initialize(requested_types);
|
|
81672
|
+
SetBufferRead(move(buffer_p));
|
|
81673
|
+
if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
|
|
81674
|
+
throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes");
|
|
81297
81675
|
}
|
|
81676
|
+
}
|
|
81298
81677
|
|
|
81299
|
-
|
|
81300
|
-
|
|
81678
|
+
ParallelCSVReader::~ParallelCSVReader() {
|
|
81679
|
+
}
|
|
81680
|
+
|
|
81681
|
+
void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
81682
|
+
sql_types = requested_types;
|
|
81683
|
+
InitParseChunk(sql_types.size());
|
|
81684
|
+
InitInsertChunkIdx(sql_types.size());
|
|
81685
|
+
}
|
|
81686
|
+
|
|
81687
|
+
bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
|
|
81688
|
+
if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
|
|
81689
|
+
start_buffer == buffer->buffer->GetStart()) {
|
|
81690
|
+
// First buffer doesn't need any setting
|
|
81301
81691
|
return true;
|
|
81302
81692
|
}
|
|
81303
81693
|
|
|
81304
|
-
|
|
81305
|
-
|
|
81694
|
+
// We have to move position up to next new line
|
|
81695
|
+
idx_t end_buffer_real = end_buffer;
|
|
81696
|
+
// Check if we already start in a valid line
|
|
81697
|
+
string error_message;
|
|
81698
|
+
bool successfully_read_first_line = false;
|
|
81699
|
+
while (!successfully_read_first_line) {
|
|
81700
|
+
DataChunk first_line_chunk;
|
|
81701
|
+
first_line_chunk.Initialize(allocator, insert_chunk.GetTypes());
|
|
81702
|
+
for (; position_buffer < end_buffer; position_buffer++) {
|
|
81703
|
+
if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
|
81704
|
+
position_buffer++;
|
|
81705
|
+
break;
|
|
81706
|
+
}
|
|
81707
|
+
}
|
|
81708
|
+
D_ASSERT(position_buffer <= end_buffer);
|
|
81709
|
+
if (position_buffer == end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
|
|
81710
|
+
break;
|
|
81711
|
+
}
|
|
81712
|
+
idx_t position_set = position_buffer;
|
|
81713
|
+
start_buffer = position_buffer;
|
|
81714
|
+
// We check if we can add this line
|
|
81715
|
+
successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
|
|
81716
|
+
start_buffer = position_set;
|
|
81717
|
+
end_buffer = end_buffer_real;
|
|
81718
|
+
position_buffer = position_set;
|
|
81719
|
+
if (end_buffer == position_buffer) {
|
|
81720
|
+
break;
|
|
81721
|
+
}
|
|
81722
|
+
}
|
|
81723
|
+
|
|
81724
|
+
return successfully_read_first_line;
|
|
81306
81725
|
}
|
|
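SetPosition() above first advances a thread that starts mid-buffer past the next newline, so it never begins parsing inside a row that belongs to the previous piece, and then validates the position by test-parsing one line. A minimal sketch of that first step, assuming the buffer is a plain char array; the helper name is hypothetical:

#include <cstddef>

size_t AdvancePastNewline(const char *buf, size_t position, size_t end) {
	for (; position < end; position++) {
		if (buf[position] == '\n' || buf[position] == '\r') {
			return position + 1; // start of the first complete row in this piece
		}
	}
	return end; // no newline inside this piece
}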
81307
81726
|
|
|
81308
|
-
void
|
|
81309
|
-
|
|
81310
|
-
|
|
81311
|
-
ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
|
|
81727
|
+
void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
|
|
81728
|
+
if (!buffer_read_p->buffer) {
|
|
81729
|
+
throw InternalException("ParallelCSVReader::SetBufferRead - CSVBufferRead does not have a buffer to read");
|
|
81312
81730
|
}
|
|
81731
|
+
position_buffer = buffer_read_p->buffer_start;
|
|
81732
|
+
start_buffer = buffer_read_p->buffer_start;
|
|
81733
|
+
end_buffer = buffer_read_p->buffer_end;
|
|
81734
|
+
if (buffer_read_p->next_buffer) {
|
|
81735
|
+
buffer_size = buffer_read_p->buffer->GetBufferSize() + buffer_read_p->next_buffer->GetBufferSize();
|
|
81736
|
+
} else {
|
|
81737
|
+
buffer_size = buffer_read_p->buffer->GetBufferSize();
|
|
81738
|
+
}
|
|
81739
|
+
linenr = buffer_read_p->estimated_linenr;
|
|
81740
|
+
buffer = move(buffer_read_p);
|
|
81741
|
+
|
|
81742
|
+
linenr_estimated = true;
|
|
81743
|
+
reached_remainder_state = false;
|
|
81744
|
+
D_ASSERT(end_buffer <= buffer_size);
|
|
81313
81745
|
}
|
|
81314
81746
|
|
|
81315
|
-
|
|
81316
|
-
|
|
81317
|
-
|
|
81318
|
-
|
|
81319
|
-
|
|
81320
|
-
|
|
81747
|
+
// If BufferRemainder returns false, it means we are done scanning this buffer and should go to the final state
|
|
81748
|
+
bool ParallelCSVReader::BufferRemainder() {
|
|
81749
|
+
if (position_buffer >= end_buffer && !reached_remainder_state) {
|
|
81750
|
+
// First time we finish the buffer piece we should scan here, we set the variables
|
|
81751
|
+
// to allow this piece to be scanned up to the end of the buffer or the next new line
|
|
81752
|
+
reached_remainder_state = true;
|
|
81753
|
+
// end_buffer is allowed to go to buffer size to finish its last line
|
|
81754
|
+
end_buffer = buffer_size;
|
|
81755
|
+
}
|
|
81756
|
+
if (position_buffer >= end_buffer) {
|
|
81757
|
+
// buffer ends, return false
|
|
81758
|
+
return false;
|
|
81321
81759
|
}
|
|
81760
|
+
// we can still scan stuff, return true
|
|
81761
|
+
return true;
|
|
81762
|
+
}
|
|
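BufferRemainder() above implements a small "remainder" rule: a thread scans its assigned [position, end) slice and, the first time it runs past end, it may continue up to the physical buffer end so it can finish the row it started. A standalone sketch of that rule with hypothetical names:

#include <cstddef>

struct SliceScanner {
	size_t position = 0;            // current read position
	size_t end = 0;                 // end of the assigned piece
	size_t buffer_size = 0;         // physical end of the buffer
	bool reached_remainder_state = false;

	// Returns false once there is truly nothing left to scan.
	bool Remainder() {
		if (position >= end && !reached_remainder_state) {
			reached_remainder_state = true;
			end = buffer_size; // extend the piece once, up to the physical buffer end
		}
		return position < end;
	}
};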
81322
81763
|
|
|
81323
|
-
|
|
81324
|
-
|
|
81325
|
-
|
|
81326
|
-
|
|
81327
|
-
|
|
81328
|
-
|
|
81329
|
-
|
|
81764
|
+
bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
|
|
81765
|
+
|
|
81766
|
+
// used for parsing algorithm
|
|
81767
|
+
D_ASSERT(end_buffer <= buffer_size);
|
|
81768
|
+
bool finished_chunk = false;
|
|
81769
|
+
idx_t column = 0;
|
|
81770
|
+
idx_t offset = 0;
|
|
81771
|
+
bool has_quotes = false;
|
|
81772
|
+
vector<idx_t> escape_positions;
|
|
81773
|
+
if (start_buffer == buffer->buffer_start && !try_add_line) {
|
|
81774
|
+
// First time reading this buffer piece
|
|
81775
|
+
if (!SetPosition(insert_chunk)) {
|
|
81776
|
+
// This means the buffer size does not contain a new line
|
|
81777
|
+
return true;
|
|
81330
81778
|
}
|
|
81331
|
-
int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
|
|
81332
|
-
D_ASSERT(error_line >= 0);
|
|
81333
|
-
throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
|
|
81334
|
-
"%s. Parser options: %s",
|
|
81335
|
-
options.file_path, error_line, col_name,
|
|
81336
|
-
ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
|
|
81337
81779
|
}
|
|
81338
|
-
}
|
|
81339
81780
|
|
|
81340
|
-
|
|
81341
|
-
|
|
81342
|
-
|
|
81343
|
-
|
|
81781
|
+
// start parsing the first value
|
|
81782
|
+
goto value_start;
|
|
81783
|
+
|
|
81784
|
+
value_start : {
|
|
81785
|
+
/* state: value_start */
|
|
81786
|
+
if (!BufferRemainder()) {
|
|
81787
|
+
goto final_state;
|
|
81344
81788
|
}
|
|
81345
|
-
|
|
81789
|
+
offset = 0;
|
|
81346
81790
|
|
|
81347
|
-
|
|
81348
|
-
if (
|
|
81349
|
-
|
|
81791
|
+
// this state parses the first character of a value
|
|
81792
|
+
if ((*buffer)[position_buffer] == options.quote[0]) {
|
|
81793
|
+
// quote: actual value starts in the next position
|
|
81794
|
+
// move to in_quotes state
|
|
81795
|
+
start_buffer = position_buffer + 1;
|
|
81796
|
+
goto in_quotes;
|
|
81797
|
+
} else {
|
|
81798
|
+
// no quote, move to normal parsing state
|
|
81799
|
+
start_buffer = position_buffer;
|
|
81800
|
+
goto normal;
|
|
81350
81801
|
}
|
|
81802
|
+
};
|
|
81351
81803
|
|
|
81352
|
-
|
|
81804
|
+
normal : {
|
|
81805
|
+
/* state: normal parsing state */
|
|
81806
|
+
// this state parses the remainder of a non-quoted value until we reach a delimiter or newline
|
|
81807
|
+
for (; position_buffer < end_buffer; position_buffer++) {
|
|
81808
|
+
if ((*buffer)[position_buffer] == options.delimiter[0]) {
|
|
81809
|
+
// delimiter: end the value and add it to the chunk
|
|
81810
|
+
goto add_value;
|
|
81811
|
+
} else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
|
81812
|
+
// newline: add row
|
|
81813
|
+
D_ASSERT(try_add_line || column == insert_chunk.ColumnCount() - 1);
|
|
81814
|
+
goto add_row;
|
|
81815
|
+
}
|
|
81816
|
+
}
|
|
81817
|
+
if (!BufferRemainder()) {
|
|
81818
|
+
goto final_state;
|
|
81819
|
+
} else {
|
|
81820
|
+
goto normal;
|
|
81821
|
+
}
|
|
81822
|
+
};
|
|
81353
81823
|
|
|
81354
|
-
|
|
81355
|
-
|
|
81356
|
-
|
|
81357
|
-
|
|
81358
|
-
|
|
81359
|
-
|
|
81360
|
-
|
|
81361
|
-
|
|
81362
|
-
|
|
81363
|
-
|
|
81364
|
-
|
|
81365
|
-
|
|
81366
|
-
// use the date format to cast the chunk
|
|
81367
|
-
success =
|
|
81368
|
-
TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
|
|
81369
|
-
parse_chunk.size(), error_message);
|
|
81370
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
|
81371
|
-
sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
|
|
81372
|
-
// use the date format to cast the chunk
|
|
81373
|
-
success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
|
|
81374
|
-
insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
|
|
81375
|
-
error_message);
|
|
81376
|
-
} else {
|
|
81377
|
-
// target type is not varchar: perform a cast
|
|
81378
|
-
success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
|
|
81379
|
-
insert_chunk.data[insert_cols_idx[col_idx]],
|
|
81380
|
-
parse_chunk.size(), &error_message);
|
|
81381
|
-
}
|
|
81382
|
-
if (success) {
|
|
81383
|
-
continue;
|
|
81384
|
-
}
|
|
81385
|
-
if (options.ignore_errors) {
|
|
81386
|
-
conversion_error_ignored = true;
|
|
81387
|
-
continue;
|
|
81388
|
-
}
|
|
81389
|
-
string col_name = to_string(col_idx);
|
|
81390
|
-
if (col_idx < col_names.size()) {
|
|
81391
|
-
col_name = "\"" + col_names[col_idx] + "\"";
|
|
81392
|
-
}
|
|
81824
|
+
add_value : {
|
|
81825
|
+
/* state: Add value to string vector */
|
|
81826
|
+
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
|
|
81827
|
+
// increase position by 1 and move start to the new position
|
|
81828
|
+
offset = 0;
|
|
81829
|
+
has_quotes = false;
|
|
81830
|
+
start_buffer = ++position_buffer;
|
|
81831
|
+
if (!BufferRemainder()) {
|
|
81832
|
+
goto final_state;
|
|
81833
|
+
}
|
|
81834
|
+
goto value_start;
|
|
81835
|
+
};
|
|
81393
81836
|
|
|
81394
|
-
|
|
81395
|
-
|
|
81396
|
-
|
|
81397
|
-
|
|
81398
|
-
auto &parsed_column = parse_chunk.data[col_idx];
|
|
81837
|
+
add_row : {
|
|
81838
|
+
/* state: Add Row to Parse chunk */
|
|
81839
|
+
// check type of newline (\r or \n)
|
|
81840
|
+
bool carriage_return = (*buffer)[position_buffer] == '\r';
|
|
81399
81841
|
|
|
81400
|
-
|
|
81401
|
-
|
|
81402
|
-
|
|
81842
|
+
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
|
|
81843
|
+
if (try_add_line) {
|
|
81844
|
+
bool success = column == insert_chunk.ColumnCount();
|
|
81845
|
+
if (success) {
|
|
81846
|
+
AddRow(insert_chunk, column);
|
|
81847
|
+
success = Flush(insert_chunk);
|
|
81848
|
+
}
|
|
81849
|
+
reached_remainder_state = false;
|
|
81850
|
+
parse_chunk.Reset();
|
|
81851
|
+
return success;
|
|
81852
|
+
} else {
|
|
81853
|
+
finished_chunk = AddRow(insert_chunk, column);
|
|
81854
|
+
}
|
|
81855
|
+
// increase position by 1 and move start to the new position
|
|
81856
|
+
offset = 0;
|
|
81857
|
+
has_quotes = false;
|
|
81858
|
+
start_buffer = ++position_buffer;
|
|
81859
|
+
if (reached_remainder_state || finished_chunk) {
|
|
81860
|
+
goto final_state;
|
|
81861
|
+
}
|
|
81862
|
+
if (!BufferRemainder()) {
|
|
81863
|
+
goto final_state;
|
|
81864
|
+
}
|
|
81865
|
+
if (carriage_return) {
|
|
81866
|
+
// \r newline, go to special state that parses an optional \n afterwards
|
|
81867
|
+
goto carriage_return;
|
|
81868
|
+
} else {
|
|
81869
|
+
// \n newline, move to value start
|
|
81870
|
+
if (finished_chunk) {
|
|
81871
|
+
goto final_state;
|
|
81872
|
+
}
|
|
81873
|
+
goto value_start;
|
|
81874
|
+
}
|
|
81875
|
+
}
|
|
81876
|
+
in_quotes:
|
|
81877
|
+
/* state: in_quotes this state parses the remainder of a quoted value*/
|
|
81878
|
+
has_quotes = true;
|
|
81879
|
+
position_buffer++;
|
|
81880
|
+
for (; position_buffer < end_buffer; position_buffer++) {
|
|
81881
|
+
if ((*buffer)[position_buffer] == options.quote[0]) {
|
|
81882
|
+
// quote: move to unquoted state
|
|
81883
|
+
goto unquote;
|
|
81884
|
+
} else if ((*buffer)[position_buffer] == options.escape[0]) {
|
|
81885
|
+
// escape: store the escaped position and move to handle_escape state
|
|
81886
|
+
escape_positions.push_back(position_buffer - start_buffer);
|
|
81887
|
+
goto handle_escape;
|
|
81888
|
+
}
|
|
81889
|
+
}
|
|
81890
|
+
if (!BufferRemainder()) {
|
|
81891
|
+
if (buffer->buffer->IsCSVFileLastBuffer()) {
|
|
81892
|
+
if (try_add_line) {
|
|
81893
|
+
return false;
|
|
81403
81894
|
}
|
|
81404
|
-
|
|
81895
|
+
// still in quoted state at the end of the file or at the end of a buffer when running multithreaded, error:
|
|
81896
|
+
throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
|
|
81897
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
|
81898
|
+
} else {
|
|
81899
|
+
goto final_state;
|
|
81900
|
+
}
|
|
81901
|
+
} else {
|
|
81902
|
+
position_buffer--;
|
|
81903
|
+
goto in_quotes;
|
|
81904
|
+
}
|
|
81405
81905
|
|
|
81406
|
-
|
|
81407
|
-
|
|
81408
|
-
|
|
81409
|
-
|
|
81410
|
-
|
|
81411
|
-
|
|
81906
|
+
unquote:
|
|
81907
|
+
/* state: unquote: this state handles what comes directly after we unquote */
|
|
81908
|
+
//
|
|
81909
|
+
// in this state we expect either another quote (entering the quoted state again, and escaping the quote)
|
|
81910
|
+
// or a delimiter/newline, ending the current value and moving on to the next value
|
|
81911
|
+
position_buffer++;
|
|
81912
|
+
if (!BufferRemainder()) {
|
|
81913
|
+
offset = 1;
|
|
81914
|
+
goto final_state;
|
|
81915
|
+
}
|
|
81916
|
+
if ((*buffer)[position_buffer] == options.quote[0] &&
|
|
81917
|
+
(options.escape.empty() || options.escape[0] == options.quote[0])) {
|
|
81918
|
+
// escaped quote, return to quoted state and store escape position
|
|
81919
|
+
escape_positions.push_back(position_buffer - start_buffer);
|
|
81920
|
+
goto in_quotes;
|
|
81921
|
+
} else if ((*buffer)[position_buffer] == options.delimiter[0]) {
|
|
81922
|
+
// delimiter, add value
|
|
81923
|
+
offset = 1;
|
|
81924
|
+
goto add_value;
|
|
81925
|
+
} else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
|
81926
|
+
offset = 1;
|
|
81927
|
+
D_ASSERT(column == insert_chunk.ColumnCount() - 1);
|
|
81928
|
+
goto add_row;
|
|
81929
|
+
} else if (position_buffer >= end_buffer) {
|
|
81930
|
+
// reached end of buffer
|
|
81931
|
+
offset = 1;
|
|
81932
|
+
goto final_state;
|
|
81933
|
+
} else {
|
|
81934
|
+
error_message = StringUtil::Format(
|
|
81935
|
+
"Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
|
|
81936
|
+
"row or another quote. (%s). ",
|
|
81937
|
+
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
|
81938
|
+
return false;
|
|
81939
|
+
}
|
|
81940
|
+
handle_escape : {
|
|
81941
|
+
/* state: handle_escape */
|
|
81942
|
+
// escape should be followed by a quote or another escape character
|
|
81943
|
+
position_buffer++;
|
|
81944
|
+
if (!BufferRemainder()) {
|
|
81945
|
+
goto final_state;
|
|
81946
|
+
}
|
|
81947
|
+
if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
|
|
81948
|
+
error_message = StringUtil::Format(
|
|
81949
|
+
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
|
81950
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
|
81951
|
+
return false;
|
|
81952
|
+
}
|
|
81953
|
+
if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
|
|
81954
|
+
error_message = StringUtil::Format(
|
|
81955
|
+
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
|
81956
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
|
81957
|
+
return false;
|
|
81958
|
+
}
|
|
81959
|
+
// escape was followed by quote or escape, go back to quoted state
|
|
81960
|
+
goto in_quotes;
|
|
81961
|
+
}
|
|
81962
|
+
|
|
81963
|
+
carriage_return : {
|
|
81964
|
+
/* state: carriage_return */
|
|
81965
|
+
// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
|
81966
|
+
if ((*buffer)[position_buffer] == '\n') {
|
|
81967
|
+
// newline after carriage return: skip
|
|
81968
|
+
// increase position by 1 and move start to the new position
|
|
81969
|
+
start_buffer = ++position_buffer;
|
|
81970
|
+
if (position_buffer >= buffer_size) {
|
|
81971
|
+
// file ends right after the newline, go to final state
|
|
81972
|
+
goto final_state;
|
|
81973
|
+
}
|
|
81974
|
+
}
|
|
81975
|
+
goto value_start;
|
|
81976
|
+
}
|
|
81977
|
+
final_state : {
|
|
81978
|
+
/* state: final_state, reached after we finish reading the end_buffer of the csv buffer */
|
|
81979
|
+
// reset end buffer
|
|
81980
|
+
end_buffer = buffer->buffer_end;
|
|
81981
|
+
if (finished_chunk) {
|
|
81982
|
+
return true;
|
|
81983
|
+
}
|
|
81984
|
+
// If this is the last buffer, we have to read the last value
|
|
81985
|
+
if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer->IsCSVFileLastBuffer())) {
|
|
81986
|
+
if (column > 0 || position_buffer > start_buffer) {
|
|
81987
|
+
// remaining values to be added to the chunk
|
|
81988
|
+
D_ASSERT(column == insert_chunk.ColumnCount() - 1);
|
|
81989
|
+
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
|
|
81990
|
+
if (try_add_line) {
|
|
81991
|
+
bool success = column == sql_types.size();
|
|
81992
|
+
if (success) {
|
|
81993
|
+
AddRow(insert_chunk, column);
|
|
81994
|
+
success = Flush(insert_chunk);
|
|
81995
|
+
}
|
|
81996
|
+
parse_chunk.Reset();
|
|
81997
|
+
reached_remainder_state = false;
|
|
81998
|
+
return success;
|
|
81412
81999
|
} else {
|
|
81413
|
-
|
|
81414
|
-
error_line, col_name, options.ToString());
|
|
82000
|
+
AddRow(insert_chunk, column);
|
|
81415
82001
|
}
|
|
81416
82002
|
}
|
|
81417
82003
|
}
|
|
81418
|
-
|
|
81419
|
-
|
|
81420
|
-
|
|
81421
|
-
|
|
81422
|
-
|
|
82004
|
+
// flush the parsed chunk and finalize parsing
|
|
82005
|
+
if (mode == ParserMode::PARSING) {
|
|
82006
|
+
Flush(insert_chunk);
|
|
82007
|
+
}
|
|
82008
|
+
return true;
|
|
82009
|
+
};
|
|
82010
|
+
}
|
|
81423
82011
|
|
|
81424
|
-
|
|
81425
|
-
|
|
81426
|
-
|
|
82012
|
+
void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
|
|
82013
|
+
string error_message;
|
|
82014
|
+
if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) {
|
|
82015
|
+
throw InvalidInputException(error_message);
|
|
82016
|
+
}
|
|
82017
|
+
}
|
|
81427
82018
|
|
|
81428
|
-
|
|
81429
|
-
|
|
82019
|
+
bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
|
|
82020
|
+
DataChunk dummy_chunk;
|
|
82021
|
+
string error_message;
|
|
82022
|
+
return TryParseCSV(mode, dummy_chunk, error_message);
|
|
82023
|
+
}
|
|
81430
82024
|
|
|
81431
|
-
|
|
81432
|
-
|
|
81433
|
-
|
|
81434
|
-
|
|
81435
|
-
|
|
81436
|
-
}
|
|
81437
|
-
if (!failed) {
|
|
81438
|
-
succesful_rows.set_index(sel_size++, row_idx);
|
|
81439
|
-
}
|
|
81440
|
-
}
|
|
81441
|
-
insert_chunk.Slice(succesful_rows, sel_size);
|
|
82025
|
+
void ParallelCSVReader::ParseCSV(ParserMode mode) {
|
|
82026
|
+
DataChunk dummy_chunk;
|
|
82027
|
+
string error_message;
|
|
82028
|
+
if (!TryParseCSV(mode, dummy_chunk, error_message)) {
|
|
82029
|
+
throw InvalidInputException(error_message);
|
|
81442
82030
|
}
|
|
81443
|
-
parse_chunk.Reset();
|
|
81444
82031
|
}
|
|
82032
|
+
|
|
82033
|
+
bool ParallelCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) {
|
|
82034
|
+
mode = parser_mode;
|
|
82035
|
+
return TryParseSimpleCSV(insert_chunk, error_message);
|
|
82036
|
+
}
|
|
82037
|
+
|
|
81445
82038
|
} // namespace duckdb
|
|
81446
82039
|
//===----------------------------------------------------------------------===//
|
|
81447
82040
|
// DuckDB
|
|
@@ -120938,6 +121531,28 @@ void StripAccentsFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
120938
121531
|
|
|
120939
121532
|
namespace duckdb {
|
|
120940
121533
|
|
|
121534
|
+
static const int64_t SUPPORTED_UPPER_BOUND = NumericLimits<uint32_t>::Maximum();
|
|
121535
|
+
static const int64_t SUPPORTED_LOWER_BOUND = -SUPPORTED_UPPER_BOUND - 1;
|
|
121536
|
+
|
|
121537
|
+
static inline void AssertInSupportedRange(idx_t input_size, int64_t offset, int64_t length) {
|
|
121538
|
+
|
|
121539
|
+
if (input_size > (uint64_t)SUPPORTED_UPPER_BOUND) {
|
|
121540
|
+
throw OutOfRangeException("Substring input size is too large (> %d)", SUPPORTED_UPPER_BOUND);
|
|
121541
|
+
}
|
|
121542
|
+
if (offset < SUPPORTED_LOWER_BOUND) {
|
|
121543
|
+
throw OutOfRangeException("Substring offset outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
|
|
121544
|
+
}
|
|
121545
|
+
if (offset > SUPPORTED_UPPER_BOUND) {
|
|
121546
|
+
throw OutOfRangeException("Substring offset outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
|
|
121547
|
+
}
|
|
121548
|
+
if (length < SUPPORTED_LOWER_BOUND) {
|
|
121549
|
+
throw OutOfRangeException("Substring length outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
|
|
121550
|
+
}
|
|
121551
|
+
if (length > SUPPORTED_UPPER_BOUND) {
|
|
121552
|
+
throw OutOfRangeException("Substring length outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
|
|
121553
|
+
}
|
|
121554
|
+
}
|
|
121555
|
+
|
|
120941
121556
|
string_t SubstringEmptyString(Vector &result) {
|
|
120942
121557
|
auto result_string = StringVector::EmptyString(result, 0);
|
|
120943
121558
|
result_string.Finalize();
|
|
@@ -120977,7 +121592,7 @@ bool SubstringStartEnd(int64_t input_size, int64_t offset, int64_t length, int64
|
|
|
120977
121592
|
} else {
|
|
120978
121593
|
// negative length: go backwards (i.e. end = start, start = start + length)
|
|
120979
121594
|
end = start;
|
|
120980
|
-
start = MaxValue<int64_t>(0,
|
|
121595
|
+
start = MaxValue<int64_t>(0, start + length);
|
|
120981
121596
|
}
|
|
120982
121597
|
if (start == end) {
|
|
120983
121598
|
return false;
|
|
@@ -120990,6 +121605,8 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
|
|
|
120990
121605
|
auto input_data = input.GetDataUnsafe();
|
|
120991
121606
|
auto input_size = input.GetSize();
|
|
120992
121607
|
|
|
121608
|
+
AssertInSupportedRange(input_size, offset, length);
|
|
121609
|
+
|
|
120993
121610
|
int64_t start, end;
|
|
120994
121611
|
if (!SubstringStartEnd(input_size, offset, length, start, end)) {
|
|
120995
121612
|
return SubstringEmptyString(result);
|
|
@@ -121001,6 +121618,8 @@ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t
|
|
|
121001
121618
|
auto input_data = input.GetDataUnsafe();
|
|
121002
121619
|
auto input_size = input.GetSize();
|
|
121003
121620
|
|
|
121621
|
+
AssertInSupportedRange(input_size, offset, length);
|
|
121622
|
+
|
|
121004
121623
|
if (length == 0) {
|
|
121005
121624
|
return SubstringEmptyString(result);
|
|
121006
121625
|
}
|
|
@@ -121051,14 +121670,15 @@ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t
|
|
|
121051
121670
|
int64_t start, end;
|
|
121052
121671
|
|
|
121053
121672
|
// we express start and end as unicode codepoints from the front
|
|
121673
|
+
offset--;
|
|
121054
121674
|
if (length < 0) {
|
|
121055
121675
|
// negative length
|
|
121056
|
-
start = MaxValue<int64_t>(0, offset + length
|
|
121057
|
-
end = offset
|
|
121676
|
+
start = MaxValue<int64_t>(0, offset + length);
|
|
121677
|
+
end = offset;
|
|
121058
121678
|
} else {
|
|
121059
121679
|
// positive length
|
|
121060
|
-
start = MaxValue<int64_t>(0, offset
|
|
121061
|
-
end = offset + length
|
|
121680
|
+
start = MaxValue<int64_t>(0, offset);
|
|
121681
|
+
end = offset + length;
|
|
121062
121682
|
}
|
|
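The unicode substring code above converts the 1-based SQL offset into a 0-based one and lets a negative length select the codepoints that end at the offset rather than start there. A minimal sketch of that normalisation as a hypothetical helper, returning the [start, end) codepoint range:

#include <algorithm>
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> CodepointRange(int64_t offset, int64_t length) {
	offset--; // SQL substring offsets are 1-based
	if (length < 0) {
		// negative length: go backwards from the offset
		return {std::max<int64_t>(0, offset + length), offset};
	}
	// positive length: go forwards from the offset
	return {std::max<int64_t>(0, offset), offset + length};
}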
121063
121683
|
|
|
121064
121684
|
int64_t current_character = 0;
|
|
@@ -121086,6 +121706,8 @@ string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t
|
|
|
121086
121706
|
auto input_data = input.GetDataUnsafe();
|
|
121087
121707
|
auto input_size = input.GetSize();
|
|
121088
121708
|
|
|
121709
|
+
AssertInSupportedRange(input_size, offset, length);
|
|
121710
|
+
|
|
121089
121711
|
// we don't know yet if the substring is ascii, but we assume it is (for now)
|
|
121090
121712
|
// first get the start and end as if this was an ascii string
|
|
121091
121713
|
int64_t start, end;
|
|
@@ -121170,7 +121792,7 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
|
|
|
121170
121792
|
} else {
|
|
121171
121793
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
121172
121794
|
input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
|
|
121173
|
-
return OP::Substring(result, input_string, offset, NumericLimits<
|
|
121795
|
+
return OP::Substring(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
|
|
121174
121796
|
});
|
|
121175
121797
|
}
|
|
121176
121798
|
}
|
|
@@ -121189,7 +121811,7 @@ static void SubstringFunctionASCII(DataChunk &args, ExpressionState &state, Vect
|
|
|
121189
121811
|
} else {
|
|
121190
121812
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
121191
121813
|
input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
|
|
121192
|
-
return SubstringASCII(result, input_string, offset, NumericLimits<
|
|
121814
|
+
return SubstringASCII(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
|
|
121193
121815
|
});
|
|
121194
121816
|
}
|
|
121195
121817
|
}
|
|
@@ -124179,72 +124801,6 @@ void CheckpointFunction::RegisterFunction(BuiltinFunctions &set) {
|
|
|
124179
124801
|
}
|
|
124180
124802
|
|
|
124181
124803
|
} // namespace duckdb
|
|
124182
|
-
//===----------------------------------------------------------------------===//
|
|
124183
|
-
// DuckDB
|
|
124184
|
-
//
|
|
124185
|
-
// duckdb/function/table/read_csv.hpp
|
|
124186
|
-
//
|
|
124187
|
-
//
|
|
124188
|
-
//===----------------------------------------------------------------------===//
|
|
124189
|
-
|
|
124190
|
-
|
|
124191
|
-
|
|
124192
|
-
|
|
124193
|
-
|
|
124194
|
-
|
|
124195
|
-
|
|
124196
|
-
namespace duckdb {
|
|
124197
|
-
|
|
124198
|
-
struct BaseCSVData : public TableFunctionData {
|
|
124199
|
-
//! The file path of the CSV file to read or write
|
|
124200
|
-
vector<string> files;
|
|
124201
|
-
//! The CSV reader options
|
|
124202
|
-
BufferedCSVReaderOptions options;
|
|
124203
|
-
//! Offsets for generated columns
|
|
124204
|
-
idx_t filename_col_idx;
|
|
124205
|
-
idx_t hive_partition_col_idx;
|
|
124206
|
-
|
|
124207
|
-
void Finalize();
|
|
124208
|
-
};
|
|
124209
|
-
|
|
124210
|
-
struct WriteCSVData : public BaseCSVData {
|
|
124211
|
-
WriteCSVData(string file_path, vector<LogicalType> sql_types, vector<string> names) : sql_types(move(sql_types)) {
|
|
124212
|
-
files.push_back(move(file_path));
|
|
124213
|
-
options.names = move(names);
|
|
124214
|
-
}
|
|
124215
|
-
|
|
124216
|
-
//! The SQL types to write
|
|
124217
|
-
vector<LogicalType> sql_types;
|
|
124218
|
-
//! The newline string to write
|
|
124219
|
-
string newline = "\n";
|
|
124220
|
-
//! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
|
|
124221
|
-
bool is_simple;
|
|
124222
|
-
//! The size of the CSV file (in bytes) that we buffer before we flush it to disk
|
|
124223
|
-
idx_t flush_size = 4096 * 8;
|
|
124224
|
-
};
|
|
124225
|
-
|
|
124226
|
-
struct ReadCSVData : public BaseCSVData {
|
|
124227
|
-
//! The expected SQL types to read
|
|
124228
|
-
vector<LogicalType> sql_types;
|
|
124229
|
-
//! The initial reader (if any): this is used when automatic detection is used during binding.
|
|
124230
|
-
//! In this case, the CSV reader is already created and might as well be re-used.
|
|
124231
|
-
unique_ptr<BufferedCSVReader> initial_reader;
|
|
124232
|
-
//! The union readers is created(when csv union_by_name option is on) during binding
|
|
124233
|
-
//! Those reader can be re-used during ReadCSVFunction
|
|
124234
|
-
vector<unique_ptr<BufferedCSVReader>> union_readers;
|
|
124235
|
-
};
|
|
124236
|
-
|
|
124237
|
-
struct CSVCopyFunction {
|
|
124238
|
-
static void RegisterFunction(BuiltinFunctions &set);
|
|
124239
|
-
};
|
|
124240
|
-
|
|
124241
|
-
struct ReadCSVTableFunction {
|
|
124242
|
-
static TableFunction GetFunction(bool list_parameter = false);
|
|
124243
|
-
static TableFunction GetAutoFunction(bool list_parameter = false);
|
|
124244
|
-
static void RegisterFunction(BuiltinFunctions &set);
|
|
124245
|
-
};
|
|
124246
|
-
|
|
124247
|
-
} // namespace duckdb
|
|
124248
124804
|
|
|
124249
124805
|
|
|
124250
124806
|
|
|
@@ -124263,7 +124819,7 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
|
|
|
124263
124819
|
if (str_1.empty() || str_2.empty()) {
|
|
124264
124820
|
return;
|
|
124265
124821
|
}
|
|
124266
|
-
if (str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos) {
|
|
124822
|
+
if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos) && str_1 != "NULL") {
|
|
124267
124823
|
throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
|
|
124268
124824
|
}
|
|
124269
124825
|
}
|
|
@@ -124338,12 +124894,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
|
|
|
124338
124894
|
bind_data->sql_types = expected_types;
|
|
124339
124895
|
|
|
124340
124896
|
string file_pattern = info.file_path;
|
|
124897
|
+
vector<string> patterns {file_pattern};
|
|
124341
124898
|
|
|
124342
|
-
|
|
124343
|
-
bind_data->files = fs.Glob(file_pattern, context);
|
|
124344
|
-
if (bind_data->files.empty()) {
|
|
124345
|
-
throw IOException("No files found that match the pattern \"%s\"", file_pattern);
|
|
124346
|
-
}
|
|
124899
|
+
bind_data->InitializeFiles(context, patterns);
|
|
124347
124900
|
|
|
124348
124901
|
auto &options = bind_data->options;
|
|
124349
124902
|
|
|
@@ -124358,7 +124911,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
|
|
|
124358
124911
|
// no FORCE_QUOTE specified: initialize to false
|
|
124359
124912
|
options.force_not_null.resize(expected_types.size(), false);
|
|
124360
124913
|
}
|
|
124361
|
-
bind_data->
|
|
124914
|
+
bind_data->FinalizeRead(context);
|
|
124362
124915
|
return move(bind_data);
|
|
124363
124916
|
}
|
|
124364
124917
|
|
|
@@ -125417,11 +125970,39 @@ void BuiltinFunctions::RegisterTableFunctions() {
|
|
|
125417
125970
|
|
|
125418
125971
|
|
|
125419
125972
|
|
|
125420
|
-
|
|
125421
125973
|
#include <limits>
|
|
125422
125974
|
|
|
125423
125975
|
namespace duckdb {
|
|
125424
125976
|
|
|
125977
|
+
unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const BufferedCSVReaderOptions &options, ClientContext &context) {
|
|
125978
|
+
auto &fs = FileSystem::GetFileSystem(context);
|
|
125979
|
+
auto opener = FileSystem::GetFileOpener(context);
|
|
125980
|
+
auto file_handle = fs.OpenFile(options.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
|
|
125981
|
+
options.compression, opener);
|
|
125982
|
+
return make_unique<CSVFileHandle>(move(file_handle));
|
|
125983
|
+
}
|
|
125984
|
+
|
|
125985
|
+
void ReadCSVData::InitializeFiles(ClientContext &context, const vector<string> &patterns) {
|
|
125986
|
+
auto &fs = FileSystem::GetFileSystem(context);
|
|
125987
|
+
for (auto &file_pattern : patterns) {
|
|
125988
|
+
auto found_files = fs.Glob(file_pattern, context);
|
|
125989
|
+
if (found_files.empty()) {
|
|
125990
|
+
throw IOException("No files found that match the pattern \"%s\"", file_pattern);
|
|
125991
|
+
}
|
|
125992
|
+
files.insert(files.end(), found_files.begin(), found_files.end());
|
|
125993
|
+
}
|
|
125994
|
+
}
|
|
125995
|
+
|
|
125996
|
+
void ReadCSVData::FinalizeRead(ClientContext &context) {
|
|
125997
|
+
BaseCSVData::Finalize();
|
|
125998
|
+
auto &config = DBConfig::GetConfig(context);
|
|
125999
|
+
single_threaded = !config.options.experimental_parallel_csv_reader;
|
|
126000
|
+
if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
|
|
126001
|
+
// not supported for parallel CSV reading
|
|
126002
|
+
single_threaded = true;
|
|
126003
|
+
}
|
|
126004
|
+
}
|
|
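FinalizeRead() above decides whether the single-threaded reader must be used: the parallel reader is taken only when it is enabled in the config and the delimiter, quote and escape are all single-byte. A tiny standalone restatement of that decision as a hypothetical helper:

#include <cstddef>

bool UseSingleThreadedReader(bool experimental_parallel_csv_reader, size_t delimiter_len, size_t quote_len,
                             size_t escape_len) {
	bool single_threaded = !experimental_parallel_csv_reader;
	if (delimiter_len > 1 || quote_len > 1 || escape_len > 1) {
		// multi-byte delimiters/quotes/escapes are not supported by the parallel reader
		single_threaded = true;
	}
	return single_threaded;
}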
126005
|
+
|
|
125425
126006
|
static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctionBindInput &input,
|
|
125426
126007
|
vector<LogicalType> &return_types, vector<string> &names) {
|
|
125427
126008
|
auto &config = DBConfig::GetConfig(context);
|
|
@@ -125442,14 +126023,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
125442
126023
|
patterns.push_back(StringValue::Get(input.inputs[0]));
|
|
125443
126024
|
}
|
|
125444
126025
|
|
|
125445
|
-
|
|
125446
|
-
for (auto &file_pattern : patterns) {
|
|
125447
|
-
auto files = fs.Glob(file_pattern, context);
|
|
125448
|
-
if (files.empty()) {
|
|
125449
|
-
throw IOException("No files found that match the pattern \"%s\"", file_pattern);
|
|
125450
|
-
}
|
|
125451
|
-
result->files.insert(result->files.end(), files.begin(), files.end());
|
|
125452
|
-
}
|
|
126026
|
+
result->InitializeFiles(context, patterns);
|
|
125453
126027
|
|
|
125454
126028
|
for (auto &kv : input.named_parameters) {
|
|
125455
126029
|
auto loption = StringUtil::Lower(kv.first);
|
|
@@ -125480,6 +126054,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
125480
126054
|
options.include_file_name = BooleanValue::Get(kv.second);
|
|
125481
126055
|
} else if (loption == "hive_partitioning") {
|
|
125482
126056
|
options.include_parsed_hive_partitions = BooleanValue::Get(kv.second);
|
|
126057
|
+
} else if (loption == "buffer_size") {
|
|
126058
|
+
options.buffer_size = kv.second.GetValue<uint64_t>();
|
|
126059
|
+
if (options.buffer_size == 0) {
|
|
126060
|
+
throw InvalidInputException("Buffer Size option must be higher than 0");
|
|
126061
|
+
}
|
|
125483
126062
|
} else {
|
|
125484
126063
|
options.SetReadOption(loption, kv.second, names);
|
|
125485
126064
|
}
|
|
@@ -125492,13 +126071,14 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
125492
126071
|
if (options.auto_detect) {
|
|
125493
126072
|
options.file_path = result->files[0];
|
|
125494
126073
|
auto initial_reader = make_unique<BufferedCSVReader>(context, options);
|
|
125495
|
-
|
|
125496
126074
|
return_types.assign(initial_reader->sql_types.begin(), initial_reader->sql_types.end());
|
|
125497
126075
|
if (names.empty()) {
|
|
125498
126076
|
names.assign(initial_reader->col_names.begin(), initial_reader->col_names.end());
|
|
125499
126077
|
} else {
|
|
125500
126078
|
D_ASSERT(return_types.size() == names.size());
|
|
125501
126079
|
}
|
|
126080
|
+
options = result->options;
|
|
126081
|
+
result->sql_types = initial_reader->sql_types;
|
|
125502
126082
|
result->initial_reader = move(initial_reader);
|
|
125503
126083
|
} else {
|
|
125504
126084
|
result->sql_types = return_types;
|
|
@@ -125577,10 +126157,233 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
125577
126157
|
}
|
|
125578
126158
|
}
|
|
125579
126159
|
result->options.names = names;
|
|
126160
|
+
result->FinalizeRead(context);
|
|
125580
126161
|
return move(result);
|
|
125581
126162
|
}
|
|
125582
126163
|
|
|
125583
|
-
|
|
126164
|
+
static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFunctionBindInput &input,
|
|
126165
|
+
vector<LogicalType> &return_types, vector<string> &names) {
|
|
126166
|
+
input.named_parameters["auto_detect"] = Value::BOOLEAN(true);
|
|
126167
|
+
return ReadCSVBind(context, input, return_types, names);
|
|
126168
|
+
}
|
|
126169
|
+
|
|
126170
|
+
//===--------------------------------------------------------------------===//
|
|
126171
|
+
// Parallel CSV Reader CSV Global State
|
|
126172
|
+
//===--------------------------------------------------------------------===//
|
|
126173
|
+
//===--------------------------------------------------------------------===//
|
|
126174
|
+
// Read CSV Global State
|
|
126175
|
+
//===--------------------------------------------------------------------===//
|
|
126176
|
+
struct ParallelCSVGlobalState : public GlobalTableFunctionState {
|
|
126177
|
+
public:
|
|
126178
|
+
ParallelCSVGlobalState(unique_ptr<CSVFileHandle> file_handle_p, vector<string> &files_path_p,
|
|
126179
|
+
idx_t system_threads_p, idx_t buffer_size_p, idx_t rows_to_skip)
|
|
126180
|
+
: file_handle(move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
|
|
126181
|
+
for (idx_t i = 0; i < rows_to_skip; i++) {
|
|
126182
|
+
file_handle->ReadLine();
|
|
126183
|
+
}
|
|
126184
|
+
estimated_linenr = rows_to_skip;
|
|
126185
|
+
file_size = file_handle->FileSize();
|
|
126186
|
+
first_file_size = file_size;
|
|
126187
|
+
bytes_read = 0;
|
|
126188
|
+
if (buffer_size < file_size) {
|
|
126189
|
+
bytes_per_local_state = buffer_size / MaxThreads();
|
|
126190
|
+
} else {
|
|
126191
|
+
bytes_per_local_state = file_size / MaxThreads();
|
|
126192
|
+
}
|
|
126193
|
+
current_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
|
|
126194
|
+
next_buffer = current_buffer->Next(*file_handle, buffer_size);
|
|
126195
|
+
}
|
|
126196
|
+
ParallelCSVGlobalState() {
|
|
126197
|
+
}
|
|
126198
|
+
|
|
126199
|
+
idx_t MaxThreads() const override;
|
|
126200
|
+
//! Returns buffer and index that caller thread should read.
|
|
126201
|
+
unique_ptr<CSVBufferRead> Next(ClientContext &context, ReadCSVData &bind_data);
|
|
126202
|
+
//! If we finished reading all the CSV Files
|
|
126203
|
+
bool Finished();
|
|
126204
|
+
//! How many bytes were read up to this point
|
|
126205
|
+
atomic<idx_t> bytes_read;
|
|
126206
|
+
//! Size of current file
|
|
126207
|
+
idx_t file_size;
|
|
126208
|
+
|
|
126209
|
+
private:
|
|
126210
|
+
//! File Handle for current file
|
|
126211
|
+
unique_ptr<CSVFileHandle> file_handle;
|
|
126212
|
+
|
|
126213
|
+
shared_ptr<CSVBuffer> current_buffer;
|
|
126214
|
+
shared_ptr<CSVBuffer> next_buffer;
|
|
126215
|
+
//! The index of the next file to read (i.e. current file + 1)
|
|
126216
|
+
idx_t file_index = 1;
|
|
126217
|
+
|
|
126218
|
+
//! Mutex to lock when getting next batch of bytes (Parallel Only)
|
|
126219
|
+
mutex main_mutex;
|
|
126220
|
+
//! Byte position at which the next thread should start reading
|
|
126221
|
+
idx_t next_byte = 0;
|
|
126222
|
+
|
|
126223
|
+
//! The current estimated line number
|
|
126224
|
+
idx_t estimated_linenr;
|
|
126225
|
+
|
|
126226
|
+
//! How many bytes we should execute per local state
|
|
126227
|
+
idx_t bytes_per_local_state;
|
|
126228
|
+
|
|
126229
|
+
//! Size of first file
|
|
126230
|
+
idx_t first_file_size;
|
|
126231
|
+
//! Basically max number of threads in DuckDB
|
|
126232
|
+
idx_t system_threads;
|
|
126233
|
+
//! Size of the buffers
|
|
126234
|
+
idx_t buffer_size;
|
|
126235
|
+
//! Current batch index
|
|
126236
|
+
idx_t batch_index = 0;
|
|
126237
|
+
};
|
|
126238
|
+
|
|
126239
|
+
idx_t ParallelCSVGlobalState::MaxThreads() const {
|
|
126240
|
+
// idx_t one_mb = 1000000;
|
|
126241
|
+
// idx_t threads_per_mb = first_file_size / one_mb + 1;
|
|
126242
|
+
// if (threads_per_mb < system_threads) {
|
|
126243
|
+
// return threads_per_mb;
|
|
126244
|
+
// }
|
|
126245
|
+
return system_threads;
|
|
126246
|
+
}
|
|
126247
|
+
|
|
126248
|
+
bool ParallelCSVGlobalState::Finished() {
|
|
126249
|
+
lock_guard<mutex> parallel_lock(main_mutex);
|
|
126250
|
+
return !current_buffer;
|
|
126251
|
+
}
|
|
126252
|
+
|
|
126253
|
+
unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, ReadCSVData &bind_data) {
|
|
126254
|
+
lock_guard<mutex> parallel_lock(main_mutex);
|
|
126255
|
+
if (!current_buffer) {
|
|
126256
|
+
// We are done scanning.
|
|
126257
|
+
return nullptr;
|
|
126258
|
+
}
|
|
126259
|
+
// set up the current buffer
|
|
126260
|
+
auto result = make_unique<CSVBufferRead>(current_buffer, next_buffer, next_byte, next_byte + bytes_per_local_state,
|
|
126261
|
+
batch_index++, estimated_linenr);
|
|
126262
|
+
// move the byte index of the CSV reader to the next buffer
|
|
126263
|
+
next_byte += bytes_per_local_state;
|
|
126264
|
+
estimated_linenr += bytes_per_local_state / (bind_data.sql_types.size() * 5); // estimate 5 bytes per column
|
|
126265
|
+
if (next_byte >= current_buffer->GetBufferSize()) {
|
|
126266
|
+
// We replace the current buffer with the next buffer
|
|
126267
|
+
next_byte = 0;
|
|
126268
|
+
bytes_read += current_buffer->GetBufferSize();
|
|
126269
|
+
current_buffer = next_buffer;
|
|
126270
|
+
if (next_buffer) {
|
|
126271
|
+
// Next buffer gets the next-next buffer
|
|
126272
|
+
next_buffer = next_buffer->Next(*file_handle, buffer_size);
|
|
126273
|
+
}
|
|
126274
|
+
}
|
|
126275
|
+
if (current_buffer && !next_buffer) {
|
|
126276
|
+
// We are done with the current file; move on to the next one (if it exists).
|
|
126277
|
+
if (file_index < bind_data.files.size()) {
|
|
126278
|
+
bind_data.options.file_path = bind_data.files[file_index++];
|
|
126279
|
+
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
126280
|
+
next_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
|
|
126281
|
+
}
|
|
126282
|
+
}
|
|
126283
|
+
return result;
|
|
126284
|
+
}
|
|
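
Next() above both hands out the next byte range and rotates the double buffer once the current one is exhausted. Below is a simplified, self-contained model of that rotation under the mutex; Buffer, GlobalScanState and LoadNext() are stand-ins for illustration, not DuckDB classes:

#include <cstdint>
#include <memory>
#include <mutex>

struct Buffer {
	uint64_t size = 0;
};

// Simplified model of ParallelCSVGlobalState::Next(): hand out
// [next_byte, next_byte + bytes_per_local_state) of the current buffer and,
// once it is exhausted, promote the prefetched next buffer.
struct GlobalScanState {
	std::mutex lock;
	std::shared_ptr<Buffer> current_buffer;
	std::shared_ptr<Buffer> next_buffer;
	uint64_t next_byte = 0;
	uint64_t bytes_per_local_state = 0;
	uint64_t bytes_read = 0;

	// Returns false once every buffer has been handed out.
	bool NextRange(uint64_t &start, uint64_t &end) {
		std::lock_guard<std::mutex> guard(lock);
		if (!current_buffer) {
			return false; // done scanning
		}
		start = next_byte;
		end = next_byte + bytes_per_local_state;
		next_byte += bytes_per_local_state;
		if (next_byte >= current_buffer->size) {
			// current buffer exhausted: promote the next buffer, prefetch another
			bytes_read += current_buffer->size;
			next_byte = 0;
			current_buffer = next_buffer;
			if (next_buffer) {
				next_buffer = LoadNext();
			}
		}
		return true;
	}

	std::shared_ptr<Buffer> LoadNext() {
		return nullptr; // stand-in: the real code reads the next chunk of the file
	}
};

int main() {
	GlobalScanState state;
	state.current_buffer = std::make_shared<Buffer>();
	state.current_buffer->size = 1024;
	state.bytes_per_local_state = 256;
	uint64_t start, end;
	while (state.NextRange(start, end)) {
		// a worker would parse CSV lines in [start, end) here
	}
}
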
126285
|
+
static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
|
|
126286
|
+
TableFunctionInitInput &input) {
|
|
126287
|
+
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
|
126288
|
+
if (bind_data.files.empty()) {
|
|
126289
|
+
// This can happen when a filename-based filter pushdown has eliminated all possible files for this scan.
|
|
126290
|
+
return make_unique<ParallelCSVGlobalState>();
|
|
126291
|
+
}
|
|
126292
|
+
unique_ptr<CSVFileHandle> file_handle;
|
|
126293
|
+
if (bind_data.initial_reader) {
|
|
126294
|
+
file_handle = move(bind_data.initial_reader->file_handle);
|
|
126295
|
+
bind_data.initial_reader.reset();
|
|
126296
|
+
} else {
|
|
126297
|
+
bind_data.options.file_path = bind_data.files[0];
|
|
126298
|
+
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
126299
|
+
}
|
|
126300
|
+
idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
|
|
126301
|
+
return make_unique<ParallelCSVGlobalState>(move(file_handle), bind_data.files, context.db->NumberOfThreads(),
|
|
126302
|
+
bind_data.options.buffer_size, rows_to_skip);
|
|
126303
|
+
}
|
|
126304
|
+
|
|
126305
|
+
//===--------------------------------------------------------------------===//
|
|
126306
|
+
// Read CSV Local State
|
|
126307
|
+
//===--------------------------------------------------------------------===//
|
|
126308
|
+
struct ParallelCSVLocalState : public LocalTableFunctionState {
|
|
126309
|
+
public:
|
|
126310
|
+
explicit ParallelCSVLocalState(unique_ptr<ParallelCSVReader> csv_reader_p) : csv_reader(move(csv_reader_p)) {
|
|
126311
|
+
}
|
|
126312
|
+
|
|
126313
|
+
//! The CSV reader
|
|
126314
|
+
unique_ptr<ParallelCSVReader> csv_reader;
|
|
126315
|
+
CSVBufferRead previous_buffer;
|
|
126316
|
+
};
|
|
126317
|
+
|
|
126318
|
+
unique_ptr<LocalTableFunctionState> ReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
|
|
126319
|
+
GlobalTableFunctionState *global_state_p) {
|
|
126320
|
+
auto &csv_data = (ReadCSVData &)*input.bind_data;
|
|
126321
|
+
if (csv_data.single_threaded) {
|
|
126322
|
+
return nullptr;
|
|
126323
|
+
}
|
|
126324
|
+
auto &global_state = (ParallelCSVGlobalState &)*global_state_p;
|
|
126325
|
+
auto next_local_buffer = global_state.Next(context.client, csv_data);
|
|
126326
|
+
unique_ptr<ParallelCSVReader> csv_reader;
|
|
126327
|
+
if (next_local_buffer) {
|
|
126328
|
+
csv_reader = make_unique<ParallelCSVReader>(context.client, csv_data.options, move(next_local_buffer),
|
|
126329
|
+
csv_data.sql_types);
|
|
126330
|
+
}
|
|
126331
|
+
auto new_local_state = make_unique<ParallelCSVLocalState>(move(csv_reader));
|
|
126332
|
+
return move(new_local_state);
|
|
126333
|
+
}
|
|
126334
|
+
|
|
126335
|
+
static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
126336
|
+
auto &bind_data = (ReadCSVData &)*data_p.bind_data;
|
|
126337
|
+
auto &csv_global_state = (ParallelCSVGlobalState &)*data_p.global_state;
|
|
126338
|
+
auto &csv_local_state = (ParallelCSVLocalState &)*data_p.local_state;
|
|
126339
|
+
|
|
126340
|
+
if (!csv_local_state.csv_reader) {
|
|
126341
|
+
// no csv_reader was set; this can happen when a filename-based filter has filtered out all possible files
|
|
126342
|
+
return;
|
|
126343
|
+
}
|
|
126344
|
+
|
|
126345
|
+
do {
|
|
126346
|
+
if (output.size() != 0 || (csv_global_state.Finished() && csv_local_state.csv_reader->position_buffer >=
|
|
126347
|
+
csv_local_state.csv_reader->end_buffer)) {
|
|
126348
|
+
break;
|
|
126349
|
+
}
|
|
126350
|
+
if (csv_local_state.csv_reader->position_buffer >= csv_local_state.csv_reader->end_buffer) {
|
|
126351
|
+
auto next_chunk = csv_global_state.Next(context, bind_data);
|
|
126352
|
+
if (!next_chunk) {
|
|
126353
|
+
break;
|
|
126354
|
+
}
|
|
126355
|
+
// csv_local_state.previous_buffer = csv_local_state.csv_reader->buffer;
|
|
126356
|
+
csv_local_state.csv_reader->SetBufferRead(move(next_chunk));
|
|
126357
|
+
}
|
|
126358
|
+
csv_local_state.csv_reader->ParseCSV(output);
|
|
126359
|
+
|
|
126360
|
+
} while (true);
|
|
126361
|
+
|
|
126362
|
+
if (bind_data.options.union_by_name) {
|
|
126363
|
+
throw InternalException("FIXME: union by name");
|
|
126364
|
+
}
|
|
126365
|
+
if (bind_data.options.include_file_name) {
|
|
126366
|
+
throw InternalException("FIXME: output file name");
|
|
126367
|
+
}
|
|
126368
|
+
if (bind_data.options.include_parsed_hive_partitions) {
|
|
126369
|
+
throw InternalException("FIXME: hive partitions");
|
|
126370
|
+
}
|
|
126371
|
+
}
|
|
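
ParallelReadCSVFunction above lets every worker thread keep pulling byte ranges from the shared global state until the scan is finished. The sketch below mimics that drain loop with plain std::thread workers and a stubbed-out parser; SharedScan and the rows-per-byte estimate are invented for the example:

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Several workers drain a shared scan state, mirroring the shape of
// ParallelReadCSVFunction: each thread repeatedly asks for the next byte
// range and "parses" it.
struct SharedScan {
	std::mutex lock;
	uint64_t next_byte = 0;
	uint64_t file_size = 0;
	uint64_t bytes_per_local_state = 0;

	bool Next(uint64_t &start, uint64_t &end) {
		std::lock_guard<std::mutex> guard(lock);
		if (next_byte >= file_size) {
			return false; // finished
		}
		start = next_byte;
		end = std::min(next_byte + bytes_per_local_state, file_size);
		next_byte = end;
		return true;
	}
};

int main() {
	SharedScan scan;
	scan.file_size = 1 << 20;
	scan.bytes_per_local_state = 64 * 1024;
	std::atomic<uint64_t> rows_parsed{0};

	std::vector<std::thread> workers;
	for (int t = 0; t < 4; t++) {
		workers.emplace_back([&]() {
			uint64_t start, end;
			while (scan.Next(start, end)) {
				rows_parsed += (end - start) / 100; // stub: pretend ~100 bytes per row
			}
		});
	}
	for (auto &w : workers) {
		w.join();
	}
	std::printf("parsed ~%llu rows\n", (unsigned long long)rows_parsed);
}
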
126372
|
+
|
|
126373
|
+
static idx_t CSVReaderGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
|
|
126374
|
+
LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
|
|
126375
|
+
auto &bind_data = (ReadCSVData &)*bind_data_p;
|
|
126376
|
+
if (bind_data.single_threaded) {
|
|
126377
|
+
return 0;
|
|
126378
|
+
}
|
|
126379
|
+
auto &data = (ParallelCSVLocalState &)*local_state;
|
|
126380
|
+
return data.csv_reader->buffer->batch_index;
|
|
126381
|
+
}
|
|
126382
|
+
|
|
126383
|
+
//===--------------------------------------------------------------------===//
|
|
126384
|
+
// Single-Threaded CSV Reader
|
|
126385
|
+
//===--------------------------------------------------------------------===//
|
|
126386
|
+
struct SingleThreadedCSVState : public GlobalTableFunctionState {
|
|
125584
126387
|
//! The CSV reader
|
|
125585
126388
|
unique_ptr<BufferedCSVReader> csv_reader;
|
|
125586
126389
|
//! The index of the next file to read (i.e. current file + 1)
|
|
@@ -125589,11 +126392,16 @@ struct ReadCSVOperatorData : public GlobalTableFunctionState {
|
|
|
125589
126392
|
idx_t file_size;
|
|
125590
126393
|
//! How many bytes were read up to this point
|
|
125591
126394
|
atomic<idx_t> bytes_read;
|
|
126395
|
+
|
|
126396
|
+
idx_t MaxThreads() const override {
|
|
126397
|
+
return 1;
|
|
126398
|
+
}
|
|
125592
126399
|
};
|
|
125593
126400
|
|
|
125594
|
-
static unique_ptr<GlobalTableFunctionState>
|
|
126401
|
+
static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext &context,
|
|
126402
|
+
TableFunctionInitInput &input) {
|
|
125595
126403
|
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
|
125596
|
-
auto result = make_unique<
|
|
126404
|
+
auto result = make_unique<SingleThreadedCSVState>();
|
|
125597
126405
|
if (bind_data.initial_reader) {
|
|
125598
126406
|
result->csv_reader = move(bind_data.initial_reader);
|
|
125599
126407
|
} else if (bind_data.files.empty()) {
|
|
@@ -125603,20 +126411,14 @@ static unique_ptr<GlobalTableFunctionState> ReadCSVInit(ClientContext &context,
|
|
|
125603
126411
|
bind_data.options.file_path = bind_data.files[0];
|
|
125604
126412
|
result->csv_reader = make_unique<BufferedCSVReader>(context, bind_data.options, bind_data.sql_types);
|
|
125605
126413
|
}
|
|
125606
|
-
result->file_size = result->csv_reader->
|
|
126414
|
+
result->file_size = result->csv_reader->file_handle->FileSize();
|
|
125607
126415
|
result->file_index = 1;
|
|
125608
126416
|
return move(result);
|
|
125609
126417
|
}
|
|
125610
126418
|
|
|
125611
|
-
static
|
|
125612
|
-
vector<LogicalType> &return_types, vector<string> &names) {
|
|
125613
|
-
input.named_parameters["auto_detect"] = Value::BOOLEAN(true);
|
|
125614
|
-
return ReadCSVBind(context, input, return_types, names);
|
|
125615
|
-
}
|
|
125616
|
-
|
|
125617
|
-
static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
126419
|
+
static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
125618
126420
|
auto &bind_data = (ReadCSVData &)*data_p.bind_data;
|
|
125619
|
-
auto &data = (
|
|
126421
|
+
auto &data = (SingleThreadedCSVState &)*data_p.global_state;
|
|
125620
126422
|
|
|
125621
126423
|
if (!data.csv_reader) {
|
|
125622
126424
|
// no csv_reader was set, this can happen when a filename-based filter has filtered out all possible files
|
|
@@ -125675,6 +126477,27 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
|
|
|
125675
126477
|
}
|
|
125676
126478
|
}
|
|
125677
126479
|
|
|
126480
|
+
//===--------------------------------------------------------------------===//
|
|
126481
|
+
// Read CSV Functions
|
|
126482
|
+
//===--------------------------------------------------------------------===//
|
|
126483
|
+
static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
|
|
126484
|
+
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
|
126485
|
+
if (bind_data.single_threaded) {
|
|
126486
|
+
return SingleThreadedCSVInit(context, input);
|
|
126487
|
+
} else {
|
|
126488
|
+
return ParallelCSVInitGlobal(context, input);
|
|
126489
|
+
}
|
|
126490
|
+
}
|
|
126491
|
+
|
|
126492
|
+
static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
126493
|
+
auto &bind_data = (ReadCSVData &)*data_p.bind_data;
|
|
126494
|
+
if (bind_data.single_threaded) {
|
|
126495
|
+
SingleThreadedCSVFunction(context, data_p, output);
|
|
126496
|
+
} else {
|
|
126497
|
+
ParallelReadCSVFunction(context, data_p, output);
|
|
126498
|
+
}
|
|
126499
|
+
}
|
|
126500
|
+
|
|
125678
126501
|
static void ReadCSVAddNamedParameters(TableFunction &table_function) {
|
|
125679
126502
|
table_function.named_parameters["sep"] = LogicalType::VARCHAR;
|
|
125680
126503
|
table_function.named_parameters["delim"] = LogicalType::VARCHAR;
|
|
@@ -125699,15 +126522,26 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
|
|
|
125699
126522
|
table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR;
|
|
125700
126523
|
table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
|
|
125701
126524
|
table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
|
|
126525
|
+
table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT;
|
|
125702
126526
|
}
|
|
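
The new buffer_size named parameter (UBIGINT) exposes the CSV read buffer size to callers. A usage sketch through the public C++ API of this package version; the file name and the 16 MiB value are only examples:

#include "duckdb.hpp"

using namespace duckdb;

int main() {
	DuckDB db(nullptr);
	Connection con(db);
	// Hypothetical file; request 16 MiB read buffers instead of the default.
	auto result = con.Query("SELECT count(*) FROM read_csv_auto('data.csv', buffer_size=16777216)");
	result->Print();
	return 0;
}
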
125703
126527
|
|
|
125704
126528
|
double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
|
|
125705
126529
|
const GlobalTableFunctionState *global_state) {
|
|
125706
|
-
auto &
|
|
125707
|
-
|
|
126530
|
+
auto &bind_data = (ReadCSVData &)*bind_data_p;
|
|
126531
|
+
idx_t file_size, bytes_read;
|
|
126532
|
+
if (bind_data.single_threaded) {
|
|
126533
|
+
auto &data = (const SingleThreadedCSVState &)*global_state;
|
|
126534
|
+
file_size = data.file_size;
|
|
126535
|
+
bytes_read = data.bytes_read;
|
|
126536
|
+
} else {
|
|
126537
|
+
auto &data = (const ParallelCSVGlobalState &)*global_state;
|
|
126538
|
+
file_size = data.file_size;
|
|
126539
|
+
bytes_read = data.bytes_read;
|
|
126540
|
+
}
|
|
126541
|
+
if (file_size == 0) {
|
|
125708
126542
|
return 100;
|
|
125709
126543
|
}
|
|
125710
|
-
auto percentage = (
|
|
126544
|
+
auto percentage = (bytes_read * 100.0) / file_size;
|
|
125711
126545
|
return percentage;
|
|
125712
126546
|
}
|
|
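
CSVReaderProgress now pulls file_size and bytes_read from whichever global state is active (single-threaded or parallel) and reports plain percent-of-bytes progress. The same arithmetic in isolation, including the empty-file guard:

#include <cstdint>
#include <cstdio>

// Same arithmetic as CSVReaderProgress: percentage of the file consumed,
// with empty files reported as 100% to avoid dividing by zero.
static double ScanProgress(uint64_t bytes_read, uint64_t file_size) {
	if (file_size == 0) {
		return 100.0;
	}
	return (bytes_read * 100.0) / file_size;
}

int main() {
	// e.g. 256 MiB read out of a 1 GiB file -> 25.0%
	std::printf("%.1f%%\n", ScanProgress(256ULL << 20, 1ULL << 30));
}
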
125713
126547
|
|
|
@@ -125745,7 +126579,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
|
|
|
125745
126579
|
writer.WriteField<bool>(header);
|
|
125746
126580
|
writer.WriteField<bool>(ignore_errors);
|
|
125747
126581
|
writer.WriteField<idx_t>(num_cols);
|
|
125748
|
-
writer.WriteField<idx_t>(
|
|
126582
|
+
writer.WriteField<idx_t>(buffer_sample_size);
|
|
125749
126583
|
writer.WriteString(null_str);
|
|
125750
126584
|
writer.WriteField<FileCompressionType>(compression);
|
|
125751
126585
|
// read options
|
|
@@ -125777,7 +126611,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
|
|
|
125777
126611
|
header = reader.ReadRequired<bool>();
|
|
125778
126612
|
ignore_errors = reader.ReadRequired<bool>();
|
|
125779
126613
|
num_cols = reader.ReadRequired<idx_t>();
|
|
125780
|
-
|
|
126614
|
+
buffer_sample_size = reader.ReadRequired<idx_t>();
|
|
125781
126615
|
null_str = reader.ReadRequired<string>();
|
|
125782
126616
|
compression = reader.ReadRequired<FileCompressionType>();
|
|
125783
126617
|
// read options
|
|
@@ -125804,6 +126638,7 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
|
|
|
125804
126638
|
writer.WriteField<idx_t>(bind_data.filename_col_idx);
|
|
125805
126639
|
writer.WriteField<idx_t>(bind_data.hive_partition_col_idx);
|
|
125806
126640
|
bind_data.options.Serialize(writer);
|
|
126641
|
+
writer.WriteField<bool>(bind_data.single_threaded);
|
|
125807
126642
|
}
|
|
125808
126643
|
|
|
125809
126644
|
static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,
|
|
@@ -125814,27 +126649,31 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
|
|
|
125814
126649
|
result_data->filename_col_idx = reader.ReadRequired<idx_t>();
|
|
125815
126650
|
result_data->hive_partition_col_idx = reader.ReadRequired<idx_t>();
|
|
125816
126651
|
result_data->options.Deserialize(reader);
|
|
126652
|
+
result_data->single_threaded = reader.ReadField<bool>(true);
|
|
125817
126653
|
return move(result_data);
|
|
125818
126654
|
}
|
|
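
CSVReaderDeserialize reads the new single_threaded flag with ReadField<bool>(true), presumably so payloads written before the flag existed still deserialize and default to the single-threaded path. A small stand-in illustrating that optional-trailing-field pattern; ByteReader is not DuckDB's FieldReader:

#include <cstddef>
#include <cstdio>
#include <vector>

// Older payloads simply end earlier, so a missing trailing field falls back
// to its default value instead of failing to deserialize.
struct ByteReader {
	std::vector<unsigned char> data;
	size_t pos;

	bool ReadBoolOr(bool default_value) {
		if (pos >= data.size()) {
			return default_value; // field not written by the older version
		}
		return data[pos++] != 0;
	}
};

int main() {
	ByteReader old_payload{{}, 0};  // serialized before single_threaded existed
	ByteReader new_payload{{1}, 0}; // serialized with single_threaded = true
	std::printf("old: %d, new: %d\n", old_payload.ReadBoolOr(true) ? 1 : 0,
	            new_payload.ReadBoolOr(true) ? 1 : 0);
}
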
125819
126655
|
|
|
125820
126656
|
TableFunction ReadCSVTableFunction::GetFunction(bool list_parameter) {
|
|
125821
126657
|
auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
|
|
125822
|
-
TableFunction read_csv("read_csv", {parameter}, ReadCSVFunction, ReadCSVBind,
|
|
126658
|
+
TableFunction read_csv("read_csv", {parameter}, ReadCSVFunction, ReadCSVBind, ReadCSVInitGlobal, ReadCSVInitLocal);
|
|
125823
126659
|
read_csv.table_scan_progress = CSVReaderProgress;
|
|
125824
126660
|
read_csv.pushdown_complex_filter = CSVComplexFilterPushdown;
|
|
125825
126661
|
read_csv.serialize = CSVReaderSerialize;
|
|
125826
126662
|
read_csv.deserialize = CSVReaderDeserialize;
|
|
126663
|
+
read_csv.get_batch_index = CSVReaderGetBatchIndex;
|
|
125827
126664
|
ReadCSVAddNamedParameters(read_csv);
|
|
125828
126665
|
return read_csv;
|
|
125829
126666
|
}
|
|
125830
126667
|
|
|
125831
126668
|
TableFunction ReadCSVTableFunction::GetAutoFunction(bool list_parameter) {
|
|
125832
126669
|
auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
|
|
125833
|
-
TableFunction read_csv_auto("read_csv_auto", {parameter}, ReadCSVFunction, ReadCSVAutoBind,
|
|
126670
|
+
TableFunction read_csv_auto("read_csv_auto", {parameter}, ReadCSVFunction, ReadCSVAutoBind, ReadCSVInitGlobal,
|
|
126671
|
+
ReadCSVInitLocal);
|
|
125834
126672
|
read_csv_auto.table_scan_progress = CSVReaderProgress;
|
|
125835
126673
|
read_csv_auto.pushdown_complex_filter = CSVComplexFilterPushdown;
|
|
125836
126674
|
read_csv_auto.serialize = CSVReaderSerialize;
|
|
125837
126675
|
read_csv_auto.deserialize = CSVReaderDeserialize;
|
|
126676
|
+
read_csv_auto.get_batch_index = CSVReaderGetBatchIndex;
|
|
125838
126677
|
ReadCSVAddNamedParameters(read_csv_auto);
|
|
125839
126678
|
return read_csv_auto;
|
|
125840
126679
|
}
|
|
@@ -136024,6 +136863,14 @@ struct EnableProgressBarSetting {
|
|
|
136024
136863
|
static Value GetSetting(ClientContext &context);
|
|
136025
136864
|
};
|
|
136026
136865
|
|
|
136866
|
+
struct ExperimentalParallelCSVSetting {
|
|
136867
|
+
static constexpr const char *Name = "experimental_parallel_csv";
|
|
136868
|
+
static constexpr const char *Description = "Whether or not to use the experimental parallel CSV reader";
|
|
136869
|
+
static constexpr const LogicalTypeId InputType = LogicalTypeId::BOOLEAN;
|
|
136870
|
+
static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value ¶meter);
|
|
136871
|
+
static Value GetSetting(ClientContext &context);
|
|
136872
|
+
};
|
|
136873
|
+
|
|
136027
136874
|
struct ExplainOutputSetting {
|
|
136028
136875
|
static constexpr const char *Name = "explain_output";
|
|
136029
136876
|
static constexpr const char *Description = "Output of EXPLAIN statements (ALL, OPTIMIZED_ONLY, PHYSICAL_ONLY)";
|
|
@@ -136224,6 +137071,7 @@ static ConfigurationOption internal_options[] = {DUCKDB_GLOBAL(AccessModeSetting
|
|
|
136224
137071
|
DUCKDB_GLOBAL(EnableObjectCacheSetting),
|
|
136225
137072
|
DUCKDB_LOCAL(EnableProfilingSetting),
|
|
136226
137073
|
DUCKDB_LOCAL(EnableProgressBarSetting),
|
|
137074
|
+
DUCKDB_GLOBAL(ExperimentalParallelCSVSetting),
|
|
136227
137075
|
DUCKDB_LOCAL(ExplainOutputSetting),
|
|
136228
137076
|
DUCKDB_GLOBAL(ExternalThreadsSetting),
|
|
136229
137077
|
DUCKDB_LOCAL(FileSearchPathSetting),
|
|
@@ -136668,6 +137516,7 @@ public:
|
|
|
136668
137516
|
|
|
136669
137517
|
|
|
136670
137518
|
|
|
137519
|
+
|
|
136671
137520
|
namespace duckdb {
|
|
136672
137521
|
|
|
136673
137522
|
Connection::Connection(DatabaseInstance &database) : context(make_shared<ClientContext>(database.shared_from_this())) {
|
|
@@ -150518,6 +151367,18 @@ Value EnableProgressBarSetting::GetSetting(ClientContext &context) {
|
|
|
150518
151367
|
return Value::BOOLEAN(ClientConfig::GetConfig(context).enable_progress_bar);
|
|
150519
151368
|
}
|
|
150520
151369
|
|
|
151370
|
+
//===--------------------------------------------------------------------===//
|
|
151371
|
+
// Experimental Parallel CSV
|
|
151372
|
+
//===--------------------------------------------------------------------===//
|
|
151373
|
+
void ExperimentalParallelCSVSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
|
|
151374
|
+
config.options.experimental_parallel_csv_reader = input.GetValue<bool>();
|
|
151375
|
+
}
|
|
151376
|
+
|
|
151377
|
+
Value ExperimentalParallelCSVSetting::GetSetting(ClientContext &context) {
|
|
151378
|
+
auto &config = DBConfig::GetConfig(context);
|
|
151379
|
+
return Value::BOOLEAN(config.options.experimental_parallel_csv_reader);
|
|
151380
|
+
}
|
|
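
With the setting wired up above, the parallel reader can be toggled at runtime. A usage sketch via the C++ API; the CSV file is hypothetical, and this assumes the read_csv bind step consults the setting when deciding between the parallel and single-threaded paths:

#include "duckdb.hpp"

using namespace duckdb;

int main() {
	DuckDB db(nullptr);
	Connection con(db);
	// Opt in to the experimental parallel CSV reader; when the setting is off,
	// the scan presumably stays on the single-threaded path.
	con.Query("SET experimental_parallel_csv=true;");
	auto result = con.Query("SELECT * FROM read_csv_auto('lineitem.csv') LIMIT 5");
	result->Print();
	return 0;
}
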
151381
|
+
|
|
150521
151382
|
//===--------------------------------------------------------------------===//
|
|
150522
151383
|
// Explain Output
|
|
150523
151384
|
//===--------------------------------------------------------------------===//
|
|
@@ -185277,6 +186138,8 @@ BindResult ExpressionBinder::BindExpression(CollateExpression &expr, idx_t depth
|
|
|
185277
186138
|
if (child.expr->return_type.id() != LogicalTypeId::VARCHAR) {
|
|
185278
186139
|
throw BinderException("collations are only supported for type varchar");
|
|
185279
186140
|
}
|
|
186141
|
+
// Validate the collation, but don't use it
|
|
186142
|
+
PushCollation(context, child.expr->Copy(), expr.collation, false);
|
|
185280
186143
|
child.expr->return_type = LogicalType::VARCHAR_COLLATION(expr.collation);
|
|
185281
186144
|
return BindResult(move(child.expr));
|
|
185282
186145
|
}
|