duckdb 0.5.2-dev2006.0 → 0.5.2-dev2076.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.cpp CHANGED
@@ -28769,9 +28769,10 @@ bool TryCast::Operation(string_t input, hugeint_t &result, bool strict) {
28769
28769
  //===--------------------------------------------------------------------===//
28770
28770
  // Decimal String Cast
28771
28771
  //===--------------------------------------------------------------------===//
28772
- template <class T>
28772
+ template <class TYPE>
28773
28773
  struct DecimalCastData {
28774
- T result;
28774
+ typedef TYPE type_t;
28775
+ TYPE result;
28775
28776
  uint8_t width;
28776
28777
  uint8_t scale;
28777
28778
  uint8_t digit_count;
@@ -28794,8 +28795,14 @@ struct DecimalCastOperation {
28794
28795
  }
28795
28796
  state.digit_count++;
28796
28797
  if (NEGATIVE) {
28798
+ if (state.result < (NumericLimits<typename T::type_t>::Minimum() / 10)) {
28799
+ return false;
28800
+ }
28797
28801
  state.result = state.result * 10 - digit;
28798
28802
  } else {
28803
+ if (state.result > (NumericLimits<typename T::type_t>::Maximum() / 10)) {
28804
+ return false;
28805
+ }
28799
28806
  state.result = state.result * 10 + digit;
28800
28807
  }
28801
28808
  return true;
@@ -42650,6 +42657,19 @@ static int8_t TemplatedCompareValue(Vector &left_vec, Vector &right_vec, idx_t l
42650
42657
  return 1;
42651
42658
  }
42652
42659
 
42660
+ template <>
42661
+ int8_t TemplatedCompareValue<Value>(Vector &left_vec, Vector &right_vec, idx_t left_idx, idx_t right_idx) {
42662
+ auto left_val = left_vec.GetValue(left_idx);
42663
+ auto right_val = right_vec.GetValue(right_idx);
42664
+ if (ValueOperations::Equals(left_val, right_val)) {
42665
+ return 0;
42666
+ }
42667
+ if (ValueOperations::LessThan(left_val, right_val)) {
42668
+ return -1;
42669
+ }
42670
+ return 1;
42671
+ }
42672
+
42653
42673
  // return type here is int32 because strcmp() on some platforms returns rather large values
42654
42674
  static int32_t CompareValue(Vector &left_vec, Vector &right_vec, idx_t vector_idx_left, idx_t vector_idx_right,
42655
42675
  OrderByNullType null_order) {
@@ -42693,7 +42713,7 @@ static int32_t CompareValue(Vector &left_vec, Vector &right_vec, idx_t vector_id
42693
42713
  case PhysicalType::INTERVAL:
42694
42714
  return TemplatedCompareValue<interval_t>(left_vec, right_vec, vector_idx_left, vector_idx_right);
42695
42715
  default:
42696
- throw NotImplementedException("Type for comparison");
42716
+ return TemplatedCompareValue<Value>(left_vec, right_vec, vector_idx_left, vector_idx_right);
42697
42717
  }
42698
42718
  }
42699
42719
 
@@ -79319,398 +79339,446 @@ string PhysicalTopN::ParamsToString() const {
79319
79339
 
79320
79340
  namespace duckdb {
79321
79341
 
79322
- static bool ParseBoolean(const Value &value, const string &loption);
79342
+ string BaseCSVReader::GetLineNumberStr(idx_t linenr, bool linenr_estimated) {
79343
+ string estimated = (linenr_estimated ? string(" (estimated)") : string(""));
79344
+ return to_string(linenr + 1) + estimated;
79345
+ }
79323
79346
 
79324
- static bool ParseBoolean(const vector<Value> &set, const string &loption) {
79325
- if (set.empty()) {
79326
- // no option specified: default to true
79327
- return true;
79328
- }
79329
- if (set.size() > 1) {
79330
- throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption);
79331
- }
79332
- return ParseBoolean(set[0], loption);
79347
+ BaseCSVReader::BaseCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
79348
+ BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
79349
+ : fs(fs_p), allocator(allocator), opener(opener_p), options(move(options_p)) {
79333
79350
  }
79334
79351
 
79335
- static bool ParseBoolean(const Value &value, const string &loption) {
79352
+ BaseCSVReader::BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
79353
+ const vector<LogicalType> &requested_types)
79354
+ : BaseCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
79355
+ move(options_p), requested_types) {
79356
+ }
79336
79357
 
79337
- if (value.type().id() == LogicalTypeId::LIST) {
79338
- auto &children = ListValue::GetChildren(value);
79339
- return ParseBoolean(children, loption);
79340
- }
79341
- if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE ||
79342
- value.type().id() == LogicalTypeId::DECIMAL) {
79343
- throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption);
79344
- }
79345
- return BooleanValue::Get(value.DefaultCastAs(LogicalType::BOOLEAN));
79358
+ BaseCSVReader::~BaseCSVReader() {
79346
79359
  }
79347
79360
 
79348
- static string ParseString(const Value &value, const string &loption) {
79349
- if (value.type().id() == LogicalTypeId::LIST) {
79350
- auto &children = ListValue::GetChildren(value);
79351
- if (children.size() != 1) {
79352
- throw BinderException("\"%s\" expects a single argument as a string value", loption);
79353
- }
79354
- return ParseString(children[0], loption);
79361
+ unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
79362
+ auto file_handle = fs.OpenFile(options_p.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
79363
+ options_p.compression, this->opener);
79364
+ return make_unique<CSVFileHandle>(move(file_handle));
79365
+ }
79366
+
79367
+ void BaseCSVReader::InitParseChunk(idx_t num_cols) {
79368
+ // adapt not null info
79369
+ if (options.force_not_null.size() != num_cols) {
79370
+ options.force_not_null.resize(num_cols, false);
79355
79371
  }
79356
- if (value.type().id() != LogicalTypeId::VARCHAR) {
79357
- throw BinderException("\"%s\" expects a string argument!", loption);
79372
+ if (num_cols == parse_chunk.ColumnCount()) {
79373
+ parse_chunk.Reset();
79374
+ } else {
79375
+ parse_chunk.Destroy();
79376
+
79377
+ // initialize the parse_chunk with a set of VARCHAR types
79378
+ vector<LogicalType> varchar_types(num_cols, LogicalType::VARCHAR);
79379
+ parse_chunk.Initialize(allocator, varchar_types);
79358
79380
  }
79359
- return value.GetValue<string>();
79360
79381
  }
79361
79382
 
79362
- static int64_t ParseInteger(const Value &value, const string &loption) {
79363
- if (value.type().id() == LogicalTypeId::LIST) {
79364
- auto &children = ListValue::GetChildren(value);
79365
- if (children.size() != 1) {
79366
- // no option specified or multiple options specified
79367
- throw BinderException("\"%s\" expects a single argument as an integer value", loption);
79368
- }
79369
- return ParseInteger(children[0], loption);
79383
+ void BaseCSVReader::InitInsertChunkIdx(idx_t num_cols) {
79384
+ for (idx_t col = 0; col < num_cols; ++col) {
79385
+ insert_cols_idx.push_back(col);
79370
79386
  }
79371
- return value.GetValue<int64_t>();
79372
79387
  }
79373
79388
 
79374
- static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
79375
- vector<bool> result;
79389
+ void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
79390
+ options.has_format[sql_type] = true;
79391
+ auto &date_format = options.date_format[sql_type];
79392
+ date_format.format_specifier = format_specifier;
79393
+ StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
79394
+ }
79376
79395
 
79377
- if (set.empty()) {
79378
- throw BinderException("\"%s\" expects a column list or * as parameter", loption);
79396
+ bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
79397
+ if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
79398
+ date_t result;
79399
+ string error_message;
79400
+ return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
79401
+ error_message);
79402
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
79403
+ timestamp_t result;
79404
+ string error_message;
79405
+ return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
79406
+ result, error_message);
79407
+ } else {
79408
+ Value new_value;
79409
+ string error_message;
79410
+ return value.DefaultTryCastAs(sql_type, new_value, &error_message, true);
79379
79411
  }
79380
- // list of options: parse the list
79381
- unordered_map<string, bool> option_map;
79382
- for (idx_t i = 0; i < set.size(); i++) {
79383
- option_map[set[i].ToString()] = false;
79412
+ }
79413
+
79414
+ struct TryCastDateOperator {
79415
+ static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
79416
+ return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
79384
79417
  }
79385
- result.resize(names.size(), false);
79386
- for (idx_t i = 0; i < names.size(); i++) {
79387
- auto entry = option_map.find(names[i]);
79388
- if (entry != option_map.end()) {
79389
- result[i] = true;
79390
- entry->second = true;
79391
- }
79418
+ };
79419
+
79420
+ struct TryCastTimestampOperator {
79421
+ static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
79422
+ string &error_message) {
79423
+ return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
79392
79424
  }
79393
- for (auto &entry : option_map) {
79394
- if (!entry.second) {
79395
- throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
79396
- entry.first.c_str());
79425
+ };
79426
+
79427
+ template <class OP, class T>
79428
+ static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
79429
+ idx_t count, string &error_message) {
79430
+ D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
79431
+ bool all_converted = true;
79432
+ UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
79433
+ T result;
79434
+ if (!OP::Operation(options, input, result, error_message)) {
79435
+ all_converted = false;
79397
79436
  }
79398
- }
79399
- return result;
79437
+ return result;
79438
+ });
79439
+ return all_converted;
79400
79440
  }
79401
79441
 
79402
- static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
79403
- vector<bool> result;
79442
+ bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
79443
+ string &error_message) {
79444
+ return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
79445
+ error_message);
79446
+ }
79404
79447
 
79405
- // Only accept a list of arguments
79406
- if (value.type().id() != LogicalTypeId::LIST) {
79407
- // Support a single argument if it's '*'
79408
- if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
79409
- result.resize(names.size(), true);
79410
- return result;
79411
- }
79412
- throw BinderException("\"%s\" expects a column list or * as parameter", loption);
79413
- }
79414
- auto &children = ListValue::GetChildren(value);
79415
- // accept '*' as single argument
79416
- if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
79417
- children[0].GetValue<string>() == "*") {
79418
- result.resize(names.size(), true);
79419
- return result;
79448
+ bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
79449
+ string &error_message) {
79450
+ return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
79451
+ count, error_message);
79452
+ }
79453
+
79454
+ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
79455
+ // try vector-cast from string to sql_type
79456
+ Vector dummy_result(sql_type);
79457
+ if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
79458
+ // use the date format to cast the chunk
79459
+ string error_message;
79460
+ return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message);
79461
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
79462
+ // use the timestamp format to cast the chunk
79463
+ string error_message;
79464
+ return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
79465
+ } else {
79466
+ // target type is not varchar: perform a cast
79467
+ string error_message;
79468
+ return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
79420
79469
  }
79421
- return ParseColumnList(children, names, loption);
79422
79470
  }
79423
79471
 
79424
- struct CSVFileHandle {
79425
- public:
79426
- explicit CSVFileHandle(unique_ptr<FileHandle> file_handle_p) : file_handle(move(file_handle_p)) {
79427
- can_seek = file_handle->CanSeek();
79428
- plain_file_source = file_handle->OnDiskFile() && can_seek;
79429
- file_size = file_handle->GetFileSize();
79472
+ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes) {
79473
+ auto length = str_val.GetSize();
79474
+ if (length == 0 && column == 0) {
79475
+ row_empty = true;
79476
+ } else {
79477
+ row_empty = false;
79430
79478
  }
79431
79479
 
79432
- bool CanSeek() {
79433
- return can_seek;
79480
+ if (!sql_types.empty() && column == sql_types.size() && length == 0) {
79481
+ // skip a single trailing delimiter in last column
79482
+ return;
79434
79483
  }
79435
- void Seek(idx_t position) {
79436
- if (!can_seek) {
79437
- throw InternalException("Cannot seek in this file");
79438
- }
79439
- file_handle->Seek(position);
79484
+ if (mode == ParserMode::SNIFFING_DIALECT) {
79485
+ column++;
79486
+ return;
79440
79487
  }
79441
- idx_t SeekPosition() {
79442
- if (!can_seek) {
79443
- throw InternalException("Cannot seek in this file");
79488
+ if (column >= sql_types.size()) {
79489
+ if (options.ignore_errors) {
79490
+ error_column_overflow = true;
79491
+ return;
79492
+ } else {
79493
+ throw InvalidInputException(
79494
+ "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
79495
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), options.ToString());
79444
79496
  }
79445
- return file_handle->SeekPosition();
79446
79497
  }
79447
- void Reset() {
79448
- if (plain_file_source) {
79449
- file_handle->Reset();
79450
- } else {
79451
- if (!reset_enabled) {
79452
- throw InternalException("Reset called but reset is not enabled for this CSV Handle");
79498
+
79499
+ // insert the line number into the chunk
79500
+ idx_t row_entry = parse_chunk.size();
79501
+
79502
+ // test against null string, but only if the value was not quoted
79503
+ if ((!has_quotes || sql_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
79504
+ Equals::Operation(str_val, string_t(options.null_str))) {
79505
+ FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
79506
+ } else {
79507
+ auto &v = parse_chunk.data[column];
79508
+ auto parse_data = FlatVector::GetData<string_t>(v);
79509
+ if (!escape_positions.empty()) {
79510
+ // remove escape characters (if any)
79511
+ string old_val = str_val.GetString();
79512
+ string new_val = "";
79513
+ idx_t prev_pos = 0;
79514
+ for (idx_t i = 0; i < escape_positions.size(); i++) {
79515
+ idx_t next_pos = escape_positions[i];
79516
+ new_val += old_val.substr(prev_pos, next_pos - prev_pos);
79517
+
79518
+ if (options.escape.empty() || options.escape == options.quote) {
79519
+ prev_pos = next_pos + options.quote.size();
79520
+ } else {
79521
+ prev_pos = next_pos + options.escape.size();
79522
+ }
79453
79523
  }
79454
- read_position = 0;
79524
+ new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
79525
+ escape_positions.clear();
79526
+ parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val));
79527
+ } else {
79528
+ parse_data[row_entry] = str_val;
79455
79529
  }
79456
79530
  }
79457
- bool PlainFileSource() {
79458
- return plain_file_source;
79459
- }
79460
79531
 
79461
- bool OnDiskFile() {
79462
- return file_handle->OnDiskFile();
79463
- }
79532
+ // move to the next column
79533
+ column++;
79534
+ }
79464
79535
 
79465
- idx_t FileSize() {
79466
- return file_size;
79467
- }
79536
+ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
79537
+ linenr++;
79468
79538
 
79469
- idx_t Read(void *buffer, idx_t nr_bytes) {
79470
- if (!plain_file_source) {
79471
- // not a plain file source: we need to do some bookkeeping around the reset functionality
79472
- idx_t result_offset = 0;
79473
- if (read_position < buffer_size) {
79474
- // we need to read from our cached buffer
79475
- auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
79476
- memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
79477
- result_offset += buffer_read_count;
79478
- read_position += buffer_read_count;
79479
- if (result_offset == nr_bytes) {
79480
- return nr_bytes;
79481
- }
79482
- } else if (!reset_enabled && cached_buffer) {
79483
- // reset is disabled but we still have cached data
79484
- // we can remove any cached data
79485
- cached_buffer.reset();
79486
- buffer_size = 0;
79487
- buffer_capacity = 0;
79488
- read_position = 0;
79489
- }
79490
- // we have data left to read from the file
79491
- // read directly into the buffer
79492
- auto bytes_read = file_handle->Read((char *)buffer + result_offset, nr_bytes - result_offset);
79493
- read_position += bytes_read;
79494
- if (reset_enabled) {
79495
- // if reset caching is enabled, we need to cache the bytes that we have read
79496
- if (buffer_size + bytes_read >= buffer_capacity) {
79497
- // no space; first enlarge the buffer
79498
- buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
79499
-
79500
- auto new_buffer = unique_ptr<data_t[]>(new data_t[buffer_capacity]);
79501
- if (buffer_size > 0) {
79502
- memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
79503
- }
79504
- cached_buffer = move(new_buffer);
79505
- }
79506
- memcpy(cached_buffer.get() + buffer_size, (char *)buffer + result_offset, bytes_read);
79507
- buffer_size += bytes_read;
79539
+ if (row_empty) {
79540
+ row_empty = false;
79541
+ if (sql_types.size() != 1) {
79542
+ if (mode == ParserMode::PARSING) {
79543
+ FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
79508
79544
  }
79545
+ column = 0;
79546
+ return false;
79547
+ }
79548
+ }
79549
+
79550
+ // Error forwarded by 'ignore_errors' - originally encountered in 'AddValue'
79551
+ if (error_column_overflow) {
79552
+ D_ASSERT(options.ignore_errors);
79553
+ error_column_overflow = false;
79554
+ column = 0;
79555
+ return false;
79556
+ }
79509
79557
 
79510
- return result_offset + bytes_read;
79558
+ if (column < sql_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
79559
+ if (options.ignore_errors) {
79560
+ column = 0;
79561
+ return false;
79511
79562
  } else {
79512
- return file_handle->Read(buffer, nr_bytes);
79563
+ throw InvalidInputException(
79564
+ "Error in file \"%s\" on line %s: expected %lld values per row, but got %d. (%s)", options.file_path,
79565
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), column, options.ToString());
79513
79566
  }
79514
79567
  }
79515
79568
 
79516
- string ReadLine() {
79517
- bool carriage_return = false;
79518
- string result;
79519
- char buffer[1];
79520
- while (true) {
79521
- idx_t bytes_read = Read(buffer, 1);
79522
- if (bytes_read == 0) {
79523
- return result;
79524
- }
79525
- if (carriage_return) {
79526
- if (buffer[0] != '\n') {
79527
- if (!file_handle->CanSeek()) {
79528
- throw BinderException(
79529
- "Carriage return newlines not supported when reading CSV files in which we cannot seek");
79530
- }
79531
- file_handle->Seek(file_handle->SeekPosition() - 1);
79532
- return result;
79533
- }
79534
- }
79535
- if (buffer[0] == '\n') {
79536
- return result;
79537
- }
79538
- if (buffer[0] != '\r') {
79539
- result += buffer[0];
79540
- } else {
79541
- carriage_return = true;
79542
- }
79569
+ if (mode == ParserMode::SNIFFING_DIALECT) {
79570
+ sniffed_column_counts.push_back(column);
79571
+
79572
+ if (sniffed_column_counts.size() == options.sample_chunk_size) {
79573
+ return true;
79543
79574
  }
79575
+ } else {
79576
+ parse_chunk.SetCardinality(parse_chunk.size() + 1);
79544
79577
  }
79545
79578
 
79546
- void DisableReset() {
79547
- this->reset_enabled = false;
79579
+ if (mode == ParserMode::PARSING_HEADER) {
79580
+ return true;
79548
79581
  }
79549
79582
 
79550
- private:
79551
- unique_ptr<FileHandle> file_handle;
79552
- bool reset_enabled = true;
79553
- bool can_seek = false;
79554
- bool plain_file_source = false;
79555
- idx_t file_size = 0;
79556
- // reset support
79557
- unique_ptr<data_t[]> cached_buffer;
79558
- idx_t read_position = 0;
79559
- idx_t buffer_size = 0;
79560
- idx_t buffer_capacity = 0;
79561
- };
79583
+ if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
79584
+ return true;
79585
+ }
79562
79586
 
79563
- void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
79564
- this->delimiter = StringUtil::Replace(input, "\\t", "\t");
79565
- this->has_delimiter = true;
79566
- if (input.empty()) {
79567
- this->delimiter = string("\0", 1);
79587
+ if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
79588
+ Flush(insert_chunk);
79589
+ return true;
79568
79590
  }
79591
+
79592
+ column = 0;
79593
+ return false;
79569
79594
  }
79570
79595
 
79571
- void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
79572
- string error;
79573
- if (read_format) {
79574
- auto &date_format = this->date_format[type];
79575
- error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
79576
- date_format.format_specifier = format;
79577
- } else {
79578
- auto &date_format = this->write_date_format[type];
79579
- error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
79580
- }
79581
- if (!error.empty()) {
79582
- throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
79596
+ void BaseCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
79597
+ for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
79598
+ insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
79599
+ ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
79583
79600
  }
79584
- has_format[type] = true;
79585
79601
  }
79586
79602
 
79587
- void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
79588
- vector<string> &expected_names) {
79589
- if (SetBaseOption(loption, value)) {
79603
+ void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
79604
+ D_ASSERT(col_idx < chunk.data.size());
79605
+ D_ASSERT(row_idx < chunk.size());
79606
+ auto &v = chunk.data[col_idx];
79607
+ if (FlatVector::IsNull(v, row_idx)) {
79590
79608
  return;
79591
79609
  }
79592
- if (loption == "auto_detect") {
79593
- auto_detect = ParseBoolean(value, loption);
79594
- } else if (loption == "sample_size") {
79595
- int64_t sample_size = ParseInteger(value, loption);
79596
- if (sample_size < 1 && sample_size != -1) {
79597
- throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
79598
- }
79599
- if (sample_size == -1) {
79600
- sample_chunks = std::numeric_limits<uint64_t>::max();
79601
- sample_chunk_size = STANDARD_VECTOR_SIZE;
79602
- } else if (sample_size <= STANDARD_VECTOR_SIZE) {
79603
- sample_chunk_size = sample_size;
79604
- sample_chunks = 1;
79605
- } else {
79606
- sample_chunk_size = STANDARD_VECTOR_SIZE;
79607
- sample_chunks = sample_size / STANDARD_VECTOR_SIZE;
79608
- }
79609
- } else if (loption == "skip") {
79610
- skip_rows = ParseInteger(value, loption);
79611
- } else if (loption == "max_line_size" || loption == "maximum_line_size") {
79612
- maximum_line_size = ParseInteger(value, loption);
79613
- } else if (loption == "sample_chunk_size") {
79614
- sample_chunk_size = ParseInteger(value, loption);
79615
- if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
79616
- throw BinderException(
79617
- "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
79618
- STANDARD_VECTOR_SIZE);
79619
- } else if (sample_chunk_size < 1) {
79620
- throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
79621
- }
79622
- } else if (loption == "sample_chunks") {
79623
- sample_chunks = ParseInteger(value, loption);
79624
- if (sample_chunks < 1) {
79625
- throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
79610
+
79611
+ auto parse_data = FlatVector::GetData<string_t>(chunk.data[col_idx]);
79612
+ auto s = parse_data[row_idx];
79613
+ auto utf_type = Utf8Proc::Analyze(s.GetDataUnsafe(), s.GetSize());
79614
+ if (utf_type == UnicodeType::INVALID) {
79615
+ string col_name = to_string(col_idx);
79616
+ if (col_idx < col_names.size()) {
79617
+ col_name = "\"" + col_names[col_idx] + "\"";
79626
79618
  }
79627
- } else if (loption == "force_not_null") {
79628
- force_not_null = ParseColumnList(value, expected_names, loption);
79629
- } else if (loption == "date_format" || loption == "dateformat") {
79630
- string format = ParseString(value, loption);
79631
- SetDateFormat(LogicalTypeId::DATE, format, true);
79632
- } else if (loption == "timestamp_format" || loption == "timestampformat") {
79633
- string format = ParseString(value, loption);
79634
- SetDateFormat(LogicalTypeId::TIMESTAMP, format, true);
79635
- } else if (loption == "escape") {
79636
- escape = ParseString(value, loption);
79637
- has_escape = true;
79638
- } else if (loption == "ignore_errors") {
79639
- ignore_errors = ParseBoolean(value, loption);
79640
- } else if (loption == "union_by_name") {
79641
- union_by_name = ParseBoolean(value, loption);
79642
- } else {
79643
- throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
79619
+ int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
79620
+ D_ASSERT(error_line >= 0);
79621
+ throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
79622
+ "%s. Parser options: %s",
79623
+ options.file_path, error_line, col_name,
79624
+ ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
79644
79625
  }
79645
79626
  }
79646
79627
 
79647
- void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
79648
- if (SetBaseOption(loption, value)) {
79649
- return;
79628
+ void BaseCSVReader::VerifyUTF8(idx_t col_idx) {
79629
+ D_ASSERT(col_idx < parse_chunk.data.size());
79630
+ for (idx_t i = 0; i < parse_chunk.size(); i++) {
79631
+ VerifyUTF8(col_idx, i, parse_chunk);
79650
79632
  }
79633
+ }
79651
79634
 
79652
- if (loption == "force_quote") {
79653
- force_quote = ParseColumnList(value, names, loption);
79654
- } else if (loption == "date_format" || loption == "dateformat") {
79655
- string format = ParseString(value, loption);
79656
- SetDateFormat(LogicalTypeId::DATE, format, false);
79657
- } else if (loption == "timestamp_format" || loption == "timestampformat") {
79658
- string format = ParseString(value, loption);
79659
- if (StringUtil::Lower(format) == "iso") {
79660
- format = "%Y-%m-%dT%H:%M:%S.%fZ";
79635
+ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
79636
+ if (parse_chunk.size() == 0) {
79637
+ return true;
79638
+ }
79639
+
79640
+ bool conversion_error_ignored = false;
79641
+
79642
+ // convert the columns in the parsed chunk to the types of the table
79643
+ insert_chunk.SetCardinality(parse_chunk);
79644
+ for (idx_t col_idx = 0; col_idx < sql_types.size(); col_idx++) {
79645
+ if (sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
79646
+ // target type is varchar: no need to convert
79647
+ // just test that all strings are valid utf-8 strings
79648
+ VerifyUTF8(col_idx);
79649
+ insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
79650
+ } else {
79651
+ string error_message;
79652
+ bool success;
79653
+ if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
79654
+ // use the date format to cast the chunk
79655
+ success =
79656
+ TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
79657
+ parse_chunk.size(), error_message);
79658
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
79659
+ sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
79660
+ // use the date format to cast the chunk
79661
+ success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
79662
+ insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
79663
+ error_message);
79664
+ } else {
79665
+ // target type is not varchar: perform a cast
79666
+ success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
79667
+ insert_chunk.data[insert_cols_idx[col_idx]],
79668
+ parse_chunk.size(), &error_message);
79669
+ }
79670
+ if (success) {
79671
+ continue;
79672
+ }
79673
+ if (try_add_line) {
79674
+ return false;
79675
+ }
79676
+ if (options.ignore_errors) {
79677
+ conversion_error_ignored = true;
79678
+ continue;
79679
+ }
79680
+ string col_name = to_string(col_idx);
79681
+ if (col_idx < col_names.size()) {
79682
+ col_name = "\"" + col_names[col_idx] + "\"";
79683
+ }
79684
+
79685
+ // figure out the exact line number
79686
+ idx_t row_idx;
79687
+ for (row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
79688
+ auto &inserted_column = insert_chunk.data[col_idx];
79689
+ auto &parsed_column = parse_chunk.data[col_idx];
79690
+
79691
+ if (FlatVector::IsNull(inserted_column, row_idx) && !FlatVector::IsNull(parsed_column, row_idx)) {
79692
+ break;
79693
+ }
79694
+ }
79695
+ auto error_line = linenr - (parse_chunk.size() - row_idx) + 1;
79696
+
79697
+ if (options.auto_detect) {
79698
+ throw InvalidInputException("%s in column %s, at line %llu. Parser "
79699
+ "options: %s. Consider either increasing the sample size "
79700
+ "(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), "
79701
+ "or skipping column conversion (ALL_VARCHAR=1)",
79702
+ error_message, col_name, error_line, options.ToString());
79703
+ } else {
79704
+ throw InvalidInputException("%s at line %llu in column %s. Parser options: %s ", error_message,
79705
+ error_line, col_name, options.ToString());
79706
+ }
79661
79707
  }
79662
- SetDateFormat(LogicalTypeId::TIMESTAMP, format, false);
79663
- } else {
79664
- throw BinderException("Unrecognized option CSV writer \"%s\"", loption);
79665
79708
  }
79666
- }
79709
+ if (conversion_error_ignored) {
79710
+ D_ASSERT(options.ignore_errors);
79711
+ SelectionVector succesful_rows;
79712
+ succesful_rows.Initialize(parse_chunk.size());
79713
+ idx_t sel_size = 0;
79667
79714
 
79668
- bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
79669
- // Make sure this function was only called after the option was turned into lowercase
79670
- D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
79715
+ for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
79716
+ bool failed = false;
79717
+ for (idx_t column_idx = 0; column_idx < sql_types.size(); column_idx++) {
79671
79718
 
79672
- if (StringUtil::StartsWith(loption, "delim") || StringUtil::StartsWith(loption, "sep")) {
79673
- SetDelimiter(ParseString(value, loption));
79674
- } else if (loption == "quote") {
79675
- quote = ParseString(value, loption);
79676
- has_quote = true;
79677
- } else if (loption == "escape") {
79678
- escape = ParseString(value, loption);
79679
- has_escape = true;
79680
- } else if (loption == "header") {
79681
- header = ParseBoolean(value, loption);
79682
- has_header = true;
79683
- } else if (loption == "null" || loption == "nullstr") {
79684
- null_str = ParseString(value, loption);
79685
- } else if (loption == "encoding") {
79686
- auto encoding = StringUtil::Lower(ParseString(value, loption));
79687
- if (encoding != "utf8" && encoding != "utf-8") {
79688
- throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'");
79719
+ auto &inserted_column = insert_chunk.data[column_idx];
79720
+ auto &parsed_column = parse_chunk.data[column_idx];
79721
+
79722
+ bool was_already_null = FlatVector::IsNull(parsed_column, row_idx);
79723
+ if (!was_already_null && FlatVector::IsNull(inserted_column, row_idx)) {
79724
+ failed = true;
79725
+ break;
79726
+ }
79727
+ }
79728
+ if (!failed) {
79729
+ succesful_rows.set_index(sel_size++, row_idx);
79730
+ }
79689
79731
  }
79690
- } else if (loption == "compression") {
79691
- compression = FileCompressionTypeFromString(ParseString(value, loption));
79692
- } else {
79693
- // unrecognized option in base CSV
79694
- return false;
79732
+ insert_chunk.Slice(succesful_rows, sel_size);
79695
79733
  }
79734
+ parse_chunk.Reset();
79696
79735
  return true;
79697
79736
  }
79737
+ } // namespace duckdb
79698
79738
 
79699
- std::string BufferedCSVReaderOptions::ToString() const {
79700
- return "DELIMITER='" + delimiter + (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
79701
- ", QUOTE='" + quote + (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
79702
- ", ESCAPE='" + escape + (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
79703
- ", HEADER=" + std::to_string(header) +
79704
- (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
79705
- ", SAMPLE_SIZE=" + std::to_string(sample_chunk_size * sample_chunks) +
79706
- ", IGNORE_ERRORS=" + std::to_string(ignore_errors) + ", ALL_VARCHAR=" + std::to_string(all_varchar);
79739
+
79740
+
79741
+
79742
+
79743
+
79744
+
79745
+
79746
+
79747
+
79748
+
79749
+
79750
+
79751
+
79752
+
79753
+
79754
+
79755
+
79756
+ #include <algorithm>
79757
+ #include <cctype>
79758
+ #include <cstring>
79759
+ #include <fstream>
79760
+
79761
+ namespace duckdb {
79762
+
79763
+ BufferedCSVReader::BufferedCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
79764
+ BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
79765
+ : BaseCSVReader(fs_p, allocator, opener_p, move(options_p), requested_types), buffer_size(0), position(0),
79766
+ start(0) {
79767
+ file_handle = OpenCSV(options);
79768
+ Initialize(requested_types);
79707
79769
  }
79708
79770
 
79709
- static string GetLineNumberStr(idx_t linenr, bool linenr_estimated) {
79710
- string estimated = (linenr_estimated ? string(" (estimated)") : string(""));
79711
- return to_string(linenr + 1) + estimated;
79771
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
79772
+ const vector<LogicalType> &requested_types)
79773
+ : BufferedCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
79774
+ move(options_p), requested_types) {
79712
79775
  }
79713
79776
 
79777
+ BufferedCSVReader::~BufferedCSVReader() {
79778
+ }
79779
+
79780
+ enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
79781
+
79714
79782
  static bool StartsWithNumericDate(string &separator, const string &value) {
79715
79783
  auto begin = value.c_str();
79716
79784
  auto end = begin + value.size();
@@ -79813,61 +79881,6 @@ TextSearchShiftArray::TextSearchShiftArray(string search_term) : length(search_t
79813
79881
  }
79814
79882
  }
79815
79883
 
79816
- BufferedCSVReader::BufferedCSVReader(FileSystem &fs_p, Allocator &allocator, FileOpener *opener_p,
79817
- BufferedCSVReaderOptions options_p, const vector<LogicalType> &requested_types)
79818
- : fs(fs_p), allocator(allocator), opener(opener_p), options(move(options_p)), buffer_size(0), position(0),
79819
- start(0) {
79820
- file_handle = OpenCSV(options);
79821
- Initialize(requested_types);
79822
- }
79823
-
79824
- BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
79825
- const vector<LogicalType> &requested_types)
79826
- : BufferedCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
79827
- move(options_p), requested_types) {
79828
- }
79829
-
79830
- BufferedCSVReader::~BufferedCSVReader() {
79831
- }
79832
-
79833
- idx_t BufferedCSVReader::GetFileSize() {
79834
- return file_handle ? file_handle->FileSize() : 0;
79835
- }
79836
-
79837
- void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
79838
- PrepareComplexParser();
79839
- if (options.auto_detect) {
79840
- sql_types = SniffCSV(requested_types);
79841
- if (sql_types.empty()) {
79842
- throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
79843
- }
79844
- if (cached_chunks.empty()) {
79845
- JumpToBeginning(options.skip_rows, options.header);
79846
- }
79847
- } else {
79848
- sql_types = requested_types;
79849
- ResetBuffer();
79850
- SkipRowsAndReadHeader(options.skip_rows, options.header);
79851
- }
79852
- InitParseChunk(sql_types.size());
79853
- InitInsertChunkIdx(sql_types.size());
79854
- // we only need reset support during the automatic CSV type detection
79855
- // since reset support might require caching (in the case of streams), we disable it for the remainder
79856
- file_handle->DisableReset();
79857
- }
79858
-
79859
- void BufferedCSVReader::PrepareComplexParser() {
79860
- delimiter_search = TextSearchShiftArray(options.delimiter);
79861
- escape_search = TextSearchShiftArray(options.escape);
79862
- quote_search = TextSearchShiftArray(options.quote);
79863
- }
79864
-
79865
- unique_ptr<CSVFileHandle> BufferedCSVReader::OpenCSV(const BufferedCSVReaderOptions &options) {
79866
- auto file_handle = fs.OpenFile(options.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
79867
- options.compression, this->opener);
79868
- return make_unique<CSVFileHandle>(move(file_handle));
79869
- }
79870
-
79871
79884
  // Helper function to generate column names
79872
79885
  static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, const string &prefix = "column") {
79873
79886
  int max_digits = NumericHelper::UnsignedLength(total_cols - 1);
@@ -79957,6 +79970,28 @@ static string NormalizeColumnName(const string &col_name) {
79957
79970
  return col_name_cleaned;
79958
79971
  }
79959
79972
 
79973
+ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
79974
+ PrepareComplexParser();
79975
+ if (options.auto_detect) {
79976
+ sql_types = SniffCSV(requested_types);
79977
+ if (sql_types.empty()) {
79978
+ throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
79979
+ }
79980
+ if (cached_chunks.empty()) {
79981
+ JumpToBeginning(options.skip_rows, options.header);
79982
+ }
79983
+ } else {
79984
+ sql_types = requested_types;
79985
+ ResetBuffer();
79986
+ SkipRowsAndReadHeader(options.skip_rows, options.header);
79987
+ }
79988
+ InitParseChunk(sql_types.size());
79989
+ InitInsertChunkIdx(sql_types.size());
79990
+ // we only need reset support during the automatic CSV type detection
79991
+ // since reset support might require caching (in the case of streams), we disable it for the remainder
79992
+ file_handle->DisableReset();
79993
+ }
79994
+
79960
79995
  void BufferedCSVReader::ResetBuffer() {
79961
79996
  buffer.reset();
79962
79997
  buffer_size = 0;
@@ -79980,28 +80015,6 @@ void BufferedCSVReader::ResetStream() {
79980
80015
  jumping_samples = false;
79981
80016
  }
79982
80017
 
79983
- void BufferedCSVReader::InitParseChunk(idx_t num_cols) {
79984
- // adapt not null info
79985
- if (options.force_not_null.size() != num_cols) {
79986
- options.force_not_null.resize(num_cols, false);
79987
- }
79988
- if (num_cols == parse_chunk.ColumnCount()) {
79989
- parse_chunk.Reset();
79990
- } else {
79991
- parse_chunk.Destroy();
79992
-
79993
- // initialize the parse_chunk with a set of VARCHAR types
79994
- vector<LogicalType> varchar_types(num_cols, LogicalType::VARCHAR);
79995
- parse_chunk.Initialize(allocator, varchar_types);
79996
- }
79997
- }
79998
-
79999
- void BufferedCSVReader::InitInsertChunkIdx(idx_t num_cols) {
80000
- for (idx_t col = 0; col < num_cols; ++col) {
80001
- insert_cols_idx.push_back(col);
80002
- }
80003
- }
80004
-
80005
80018
  void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
80006
80019
  ResetBuffer();
80007
80020
  ResetStream();
@@ -80026,6 +80039,12 @@ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header)
80026
80039
  }
80027
80040
  }
80028
80041
 
80042
+ void BufferedCSVReader::PrepareComplexParser() {
80043
+ delimiter_search = TextSearchShiftArray(options.delimiter);
80044
+ escape_search = TextSearchShiftArray(options.escape);
80045
+ quote_search = TextSearchShiftArray(options.quote);
80046
+ }
80047
+
80029
80048
  bool BufferedCSVReader::JumpToNextSample() {
80030
80049
  // get bytes contained in the previously read chunk
80031
80050
  idx_t remaining_bytes_in_buffer = buffer_size - start;
@@ -80099,91 +80118,6 @@ bool BufferedCSVReader::JumpToNextSample() {
80099
80118
  return true;
80100
80119
  }
80101
80120
 
80102
- void BufferedCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
80103
- options.has_format[sql_type] = true;
80104
- auto &date_format = options.date_format[sql_type];
80105
- date_format.format_specifier = format_specifier;
80106
- StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
80107
- }
80108
-
80109
- bool BufferedCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
80110
- if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
80111
- date_t result;
80112
- string error_message;
80113
- return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
80114
- error_message);
80115
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
80116
- timestamp_t result;
80117
- string error_message;
80118
- return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
80119
- result, error_message);
80120
- } else {
80121
- Value new_value;
80122
- string error_message;
80123
- return value.DefaultTryCastAs(sql_type, new_value, &error_message, true);
80124
- }
80125
- }
80126
-
80127
- struct TryCastDateOperator {
80128
- static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
80129
- return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
80130
- }
80131
- };
80132
-
80133
- struct TryCastTimestampOperator {
80134
- static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
80135
- string &error_message) {
80136
- return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
80137
- }
80138
- };
80139
-
80140
- template <class OP, class T>
80141
- static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
80142
- idx_t count, string &error_message) {
80143
- D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
80144
- bool all_converted = true;
80145
- UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
80146
- T result;
80147
- if (!OP::Operation(options, input, result, error_message)) {
80148
- all_converted = false;
80149
- }
80150
- return result;
80151
- });
80152
- return all_converted;
80153
- }
80154
-
80155
- bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
80156
- string &error_message) {
80157
- return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
80158
- error_message);
80159
- }
80160
-
80161
- bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
80162
- string &error_message) {
80163
- return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
80164
- count, error_message);
80165
- }
80166
-
80167
- bool BufferedCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
80168
- // try vector-cast from string to sql_type
80169
- Vector dummy_result(sql_type);
80170
- if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
80171
- // use the date format to cast the chunk
80172
- string error_message;
80173
- return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message);
80174
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
80175
- // use the timestamp format to cast the chunk
80176
- string error_message;
80177
- return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
80178
- } else {
80179
- // target type is not varchar: perform a cast
80180
- string error_message;
80181
- return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
80182
- }
80183
- }
80184
-
80185
- enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
80186
-
80187
80121
  void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types,
80188
80122
  BufferedCSVReaderOptions &original_options,
80189
80123
  vector<BufferedCSVReaderOptions> &info_candidates, idx_t &best_num_cols) {
@@ -81181,267 +81115,926 @@ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_ch
81181
81115
  }
81182
81116
  }
81183
81117
 
81184
- void BufferedCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes) {
81185
- auto length = str_val.GetSize();
81186
- if (length == 0 && column == 0) {
81187
- row_empty = true;
81188
- } else {
81189
- row_empty = false;
81118
+ } // namespace duckdb
81119
+
81120
+
81121
+
81122
+ namespace duckdb {
81123
+
81124
+ CSVBuffer::CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle) : first_buffer(true) {
81125
+ buffer = unique_ptr<char[]>(new char[buffer_size_p]);
81126
+ actual_size = file_handle.Read(buffer.get(), buffer_size_p);
81127
+ if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
81128
+ start_position += 3;
81190
81129
  }
81130
+ last_buffer = file_handle.FinishedReading();
81131
+ }
81191
81132
 
81192
- if (!sql_types.empty() && column == sql_types.size() && length == 0) {
81193
- // skip a single trailing delimiter in last column
81194
- return;
81133
+ CSVBuffer::CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer)
81134
+ : buffer(move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
81135
+ }
81136
+
81137
+ unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
81138
+ if (file_handle.FinishedReading()) {
81139
+ // this was the last buffer
81140
+ return nullptr;
81195
81141
  }
81196
- if (mode == ParserMode::SNIFFING_DIALECT) {
81197
- column++;
81142
+
81143
+ auto next_buffer = unique_ptr<char[]>(new char[set_buffer_size]);
81144
+
81145
+ idx_t next_buffer_actual_size = file_handle.Read(next_buffer.get(), set_buffer_size);
81146
+
81147
+ return make_unique<CSVBuffer>(move(next_buffer), set_buffer_size, next_buffer_actual_size,
81148
+ file_handle.FinishedReading());
81149
+ }
81150
+
81151
+ idx_t CSVBuffer::GetBufferSize() {
81152
+ return actual_size;
81153
+ }
81154
+
81155
+ idx_t CSVBuffer::GetStart() {
81156
+ return start_position;
81157
+ }
81158
+
81159
+ bool CSVBuffer::IsCSVFileLastBuffer() {
81160
+ return last_buffer;
81161
+ }
81162
+
81163
+ bool CSVBuffer::IsCSVFileFirstBuffer() {
81164
+ return first_buffer;
81165
+ }
81166
+
81167
+ } // namespace duckdb
81168
+
81169
+
81170
+
81171
+
81172
+ namespace duckdb {
81173
+
81174
+ static bool ParseBoolean(const Value &value, const string &loption);
81175
+
81176
+ static bool ParseBoolean(const vector<Value> &set, const string &loption) {
81177
+ if (set.empty()) {
81178
+ // no option specified: default to true
81179
+ return true;
81180
+ }
81181
+ if (set.size() > 1) {
81182
+ throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption);
81183
+ }
81184
+ return ParseBoolean(set[0], loption);
81185
+ }
81186
+
81187
+ static bool ParseBoolean(const Value &value, const string &loption) {
81188
+
81189
+ if (value.type().id() == LogicalTypeId::LIST) {
81190
+ auto &children = ListValue::GetChildren(value);
81191
+ return ParseBoolean(children, loption);
81192
+ }
81193
+ if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE ||
81194
+ value.type().id() == LogicalTypeId::DECIMAL) {
81195
+ throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption);
81196
+ }
81197
+ return BooleanValue::Get(value.DefaultCastAs(LogicalType::BOOLEAN));
81198
+ }
81199
+
81200
+ static string ParseString(const Value &value, const string &loption) {
81201
+ if (value.type().id() == LogicalTypeId::LIST) {
81202
+ auto &children = ListValue::GetChildren(value);
81203
+ if (children.size() != 1) {
81204
+ throw BinderException("\"%s\" expects a single argument as a string value", loption);
81205
+ }
81206
+ return ParseString(children[0], loption);
81207
+ }
81208
+ if (value.type().id() != LogicalTypeId::VARCHAR) {
81209
+ throw BinderException("\"%s\" expects a string argument!", loption);
81210
+ }
81211
+ return value.GetValue<string>();
81212
+ }
81213
+
81214
+ static int64_t ParseInteger(const Value &value, const string &loption) {
81215
+ if (value.type().id() == LogicalTypeId::LIST) {
81216
+ auto &children = ListValue::GetChildren(value);
81217
+ if (children.size() != 1) {
81218
+ // no option specified or multiple options specified
81219
+ throw BinderException("\"%s\" expects a single argument as an integer value", loption);
81220
+ }
81221
+ return ParseInteger(children[0], loption);
81222
+ }
81223
+ return value.GetValue<int64_t>();
81224
+ }
81225
+
81226
+ static vector<bool> ParseColumnList(const vector<Value> &set, vector<string> &names, const string &loption) {
81227
+ vector<bool> result;
81228
+
81229
+ if (set.empty()) {
81230
+ throw BinderException("\"%s\" expects a column list or * as parameter", loption);
81231
+ }
81232
+ // list of options: parse the list
81233
+ unordered_map<string, bool> option_map;
81234
+ for (idx_t i = 0; i < set.size(); i++) {
81235
+ option_map[set[i].ToString()] = false;
81236
+ }
81237
+ result.resize(names.size(), false);
81238
+ for (idx_t i = 0; i < names.size(); i++) {
81239
+ auto entry = option_map.find(names[i]);
81240
+ if (entry != option_map.end()) {
81241
+ result[i] = true;
81242
+ entry->second = true;
81243
+ }
81244
+ }
81245
+ for (auto &entry : option_map) {
81246
+ if (!entry.second) {
81247
+ throw BinderException("\"%s\" expected to find %s, but it was not found in the table", loption,
81248
+ entry.first.c_str());
81249
+ }
81250
+ }
81251
+ return result;
81252
+ }
81253
+
81254
+ static vector<bool> ParseColumnList(const Value &value, vector<string> &names, const string &loption) {
81255
+ vector<bool> result;
81256
+
81257
+ // Only accept a list of arguments
81258
+ if (value.type().id() != LogicalTypeId::LIST) {
81259
+ // Support a single argument if it's '*'
81260
+ if (value.type().id() == LogicalTypeId::VARCHAR && value.GetValue<string>() == "*") {
81261
+ result.resize(names.size(), true);
81262
+ return result;
81263
+ }
81264
+ throw BinderException("\"%s\" expects a column list or * as parameter", loption);
81265
+ }
81266
+ auto &children = ListValue::GetChildren(value);
81267
+ // accept '*' as single argument
81268
+ if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
81269
+ children[0].GetValue<string>() == "*") {
81270
+ result.resize(names.size(), true);
81271
+ return result;
81272
+ }
81273
+ return ParseColumnList(children, names, loption);
81274
+ }
81275
+
81276
+ void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
81277
+ this->delimiter = StringUtil::Replace(input, "\\t", "\t");
81278
+ this->has_delimiter = true;
81279
+ if (input.empty()) {
81280
+ this->delimiter = string("\0", 1);
81281
+ }
81282
+ }
+
+ void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
+     string error;
+     if (read_format) {
+         auto &date_format = this->date_format[type];
+         error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
+         date_format.format_specifier = format;
+     } else {
+         auto &date_format = this->write_date_format[type];
+         error = StrTimeFormat::ParseFormatSpecifier(format, date_format);
+     }
+     if (!error.empty()) {
+         throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
+     }
+     has_format[type] = true;
+ }
+
+ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
+                                              vector<string> &expected_names) {
+     if (SetBaseOption(loption, value)) {
          return;
      }
-     if (column >= sql_types.size()) {
-         if (options.ignore_errors) {
-             error_column_overflow = true;
-             return;
+     if (loption == "auto_detect") {
+         auto_detect = ParseBoolean(value, loption);
+     } else if (loption == "sample_size") {
+         int64_t sample_size = ParseInteger(value, loption);
+         if (sample_size < 1 && sample_size != -1) {
+             throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
+         }
+         if (sample_size == -1) {
+             sample_chunks = std::numeric_limits<uint64_t>::max();
+             sample_chunk_size = STANDARD_VECTOR_SIZE;
+         } else if (sample_size <= STANDARD_VECTOR_SIZE) {
+             sample_chunk_size = sample_size;
+             sample_chunks = 1;
          } else {
-             throw InvalidInputException(
-                 "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
-                 GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), options.ToString());
+             sample_chunk_size = STANDARD_VECTOR_SIZE;
+             sample_chunks = sample_size / STANDARD_VECTOR_SIZE;
+         }
+     } else if (loption == "skip") {
+         skip_rows = ParseInteger(value, loption);
+     } else if (loption == "max_line_size" || loption == "maximum_line_size") {
+         maximum_line_size = ParseInteger(value, loption);
+     } else if (loption == "sample_chunk_size") {
+         sample_chunk_size = ParseInteger(value, loption);
+         if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
+             throw BinderException(
+                 "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
+                 STANDARD_VECTOR_SIZE);
+         } else if (sample_chunk_size < 1) {
+             throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
+         }
+     } else if (loption == "sample_chunks") {
+         sample_chunks = ParseInteger(value, loption);
+         if (sample_chunks < 1) {
+             throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
          }
+     } else if (loption == "force_not_null") {
+         force_not_null = ParseColumnList(value, expected_names, loption);
+     } else if (loption == "date_format" || loption == "dateformat") {
+         string format = ParseString(value, loption);
+         SetDateFormat(LogicalTypeId::DATE, format, true);
+     } else if (loption == "timestamp_format" || loption == "timestampformat") {
+         string format = ParseString(value, loption);
+         SetDateFormat(LogicalTypeId::TIMESTAMP, format, true);
+     } else if (loption == "escape") {
+         escape = ParseString(value, loption);
+         has_escape = true;
+     } else if (loption == "ignore_errors") {
+         ignore_errors = ParseBoolean(value, loption);
+     } else if (loption == "union_by_name") {
+         union_by_name = ParseBoolean(value, loption);
+     } else {
+         throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
      }
+ }
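The SAMPLE_SIZE branch above distributes the requested row budget over fixed-size chunks. A standalone sketch of the same arithmetic, assuming STANDARD_VECTOR_SIZE is 2048 (DuckDB's usual default); all names here are illustrative only:

#include <cassert>
#include <cstdint>
#include <limits>

static const int64_t kVectorSize = 2048; // stand-in for STANDARD_VECTOR_SIZE

struct SampleConfig {
    uint64_t chunk_size;
    uint64_t chunks;
};

static SampleConfig ConfigureSampleSize(int64_t sample_size) {
    if (sample_size == -1) {
        // -1 means "sample everything"
        return {(uint64_t)kVectorSize, std::numeric_limits<uint64_t>::max()};
    }
    if (sample_size <= kVectorSize) {
        return {(uint64_t)sample_size, 1};
    }
    return {(uint64_t)kVectorSize, (uint64_t)(sample_size / kVectorSize)};
}

int main() {
    assert(ConfigureSampleSize(100).chunks == 1);
    assert(ConfigureSampleSize(8192).chunks == 4); // 8192 / 2048
}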
 
-     // insert the line number into the chunk
-     idx_t row_entry = parse_chunk.size();
+ void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
+     if (SetBaseOption(loption, value)) {
+         return;
+     }
 
-     // test against null string, but only if the value was not quoted
-     if ((!has_quotes || sql_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
-         Equals::Operation(str_val, string_t(options.null_str))) {
-         FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
+     if (loption == "force_quote") {
+         force_quote = ParseColumnList(value, names, loption);
+     } else if (loption == "date_format" || loption == "dateformat") {
+         string format = ParseString(value, loption);
+         SetDateFormat(LogicalTypeId::DATE, format, false);
+     } else if (loption == "timestamp_format" || loption == "timestampformat") {
+         string format = ParseString(value, loption);
+         if (StringUtil::Lower(format) == "iso") {
+             format = "%Y-%m-%dT%H:%M:%S.%fZ";
+         }
+         SetDateFormat(LogicalTypeId::TIMESTAMP, format, false);
      } else {
-         auto &v = parse_chunk.data[column];
-         auto parse_data = FlatVector::GetData<string_t>(v);
-         if (!escape_positions.empty()) {
-             // remove escape characters (if any)
-             string old_val = str_val.GetString();
-             string new_val = "";
-             idx_t prev_pos = 0;
-             for (idx_t i = 0; i < escape_positions.size(); i++) {
-                 idx_t next_pos = escape_positions[i];
-                 new_val += old_val.substr(prev_pos, next_pos - prev_pos);
+         throw BinderException("Unrecognized option CSV writer \"%s\"", loption);
+     }
+ }
 
-             if (options.escape.empty() || options.escape == options.quote) {
-                 prev_pos = next_pos + options.quote.size();
-             } else {
-                 prev_pos = next_pos + options.escape.size();
-             }
-         }
-         new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
-         escape_positions.clear();
-         parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val));
-     } else {
-         parse_data[row_entry] = str_val;
+ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
+     // Make sure this function was only called after the option was turned into lowercase
+     D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
+
+     if (StringUtil::StartsWith(loption, "delim") || StringUtil::StartsWith(loption, "sep")) {
+         SetDelimiter(ParseString(value, loption));
+     } else if (loption == "quote") {
+         quote = ParseString(value, loption);
+         has_quote = true;
+     } else if (loption == "escape") {
+         escape = ParseString(value, loption);
+         has_escape = true;
+     } else if (loption == "header") {
+         header = ParseBoolean(value, loption);
+         has_header = true;
+     } else if (loption == "null" || loption == "nullstr") {
+         null_str = ParseString(value, loption);
+     } else if (loption == "encoding") {
+         auto encoding = StringUtil::Lower(ParseString(value, loption));
+         if (encoding != "utf8" && encoding != "utf-8") {
+             throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'");
          }
+     } else if (loption == "compression") {
+         compression = FileCompressionTypeFromString(ParseString(value, loption));
+     } else {
+         // unrecognized option in base CSV
+         return false;
      }
+     return true;
+ }
 
-     // move to the next column
-     column++;
+ std::string BufferedCSVReaderOptions::ToString() const {
+     return "DELIMITER='" + delimiter + (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
+            ", QUOTE='" + quote + (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
+            ", ESCAPE='" + escape + (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
+            ", HEADER=" + std::to_string(header) +
+            (has_header ? "" : (auto_detect ? " (auto detected)" : " (default)")) +
+            ", SAMPLE_SIZE=" + std::to_string(sample_chunk_size * sample_chunks) +
+            ", IGNORE_ERRORS=" + std::to_string(ignore_errors) + ", ALL_VARCHAR=" + std::to_string(all_varchar);
  }
 
- bool BufferedCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
-     linenr++;
+ } // namespace duckdb
+ //===----------------------------------------------------------------------===//
+ // DuckDB
+ //
+ // duckdb/execution/operator/persistent/buffered_csv_reader.hpp
+ //
+ //
+ //===----------------------------------------------------------------------===//
 
-     if (row_empty) {
-         row_empty = false;
-         if (sql_types.size() != 1) {
-             if (mode == ParserMode::PARSING) {
-                 FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
+
+
+
+
+
+
+
+ #include <sstream>
+ #include <utility>
+
+ namespace duckdb {
+
+ struct CSVBufferRead {
+     CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
+                   idx_t estimated_linenr)
+         : buffer(move(buffer_p)), buffer_start(buffer_start_p), buffer_end(buffer_end_p), batch_index(batch_index),
+           estimated_linenr(estimated_linenr) {
+         if (buffer) {
+             if (buffer_end > buffer->GetBufferSize()) {
+                 buffer_end = buffer->GetBufferSize();
              }
-         column = 0;
-         return false;
+         } else {
+             buffer_start = 0;
+             buffer_end = 0;
          }
      }
 
-     // Error forwarded by 'ignore_errors' - originally encountered in 'AddValue'
-     if (error_column_overflow) {
-         D_ASSERT(options.ignore_errors);
-         error_column_overflow = false;
-         column = 0;
-         return false;
+     CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
+                   idx_t buffer_end_p, idx_t batch_index, idx_t estimated_linenr)
+         : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, estimated_linenr) {
+         next_buffer = std::move(nxt_buffer_p);
      }
 
-     if (column < sql_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
-         if (options.ignore_errors) {
-             column = 0;
-             return false;
+     CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits<idx_t>::Maximum()) {};
+
+     const char &operator[](size_t i) const {
+         if (i < buffer->GetBufferSize()) {
+             return buffer->buffer[i];
+         }
+         return next_buffer->buffer[i - buffer->GetBufferSize()];
+     }
+
+     string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
+         idx_t length = position_buffer - start_buffer - offset;
+         // 1) It's all in the current buffer
+         if (start_buffer + length <= buffer->GetBufferSize()) {
+             auto buffer_ptr = buffer->buffer.get();
+             return string_t(buffer_ptr + start_buffer, length);
+         } else if (start_buffer >= buffer->GetBufferSize()) {
+             // 2) It's all in the next buffer
+             D_ASSERT(next_buffer);
+             D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
+             auto buffer_ptr = next_buffer->buffer.get();
+             return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
          } else {
-             throw InvalidInputException(
-                 "Error in file \"%s\" on line %s: expected %lld values per row, but got %d. (%s)", options.file_path,
-                 GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), column, options.ToString());
+             // 3) It starts in the current buffer and ends in the next buffer
+             D_ASSERT(next_buffer);
+             auto intersection = unique_ptr<char[]>(new char[length]);
+             idx_t cur_pos = 0;
+             for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
+                 intersection[cur_pos++] = buffer->buffer[i];
+             }
+             idx_t nxt_buffer_pos = 0;
+             for (; cur_pos < length; cur_pos++) {
+                 intersection[cur_pos] = next_buffer->buffer[nxt_buffer_pos++];
+             }
+             intersections.emplace_back(move(intersection));
+             return string_t(intersections.back().get(), length);
          }
      }
 
-     if (mode == ParserMode::SNIFFING_DIALECT) {
-         sniffed_column_counts.push_back(column);
+     shared_ptr<CSVBuffer> buffer;
+     shared_ptr<CSVBuffer> next_buffer;
+     vector<unique_ptr<char[]>> intersections;
 
-         if (sniffed_column_counts.size() == options.sample_chunk_size) {
-             return true;
-         }
-     } else {
-         parse_chunk.SetCardinality(parse_chunk.size() + 1);
+     idx_t buffer_start;
+     idx_t buffer_end;
+     idx_t batch_index;
+     idx_t estimated_linenr;
+ };
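GetValue above returns a string_t that points into the current buffer, into the next buffer, or into a freshly allocated "intersection" when the value straddles the boundary. A simplified standalone model of that three-way split, using std::string in place of string_t and CSVBuffer:

#include <cassert>
#include <string>

// cur and nxt emulate the current and next CSV buffer contents
static std::string GetValueAcrossBuffers(const std::string &cur, const std::string &nxt, size_t start, size_t len) {
    if (start + len <= cur.size()) {
        // 1) entirely in the current buffer
        return cur.substr(start, len);
    }
    if (start >= cur.size()) {
        // 2) entirely in the next buffer
        return nxt.substr(start - cur.size(), len);
    }
    // 3) starts in the current buffer, ends in the next: stitch the two pieces
    std::string result = cur.substr(start);
    result += nxt.substr(0, len - result.size());
    return result;
}

int main() {
    std::string cur = "aaa,bb";
    std::string nxt = "b,ccc";
    assert(GetValueAcrossBuffers(cur, nxt, 0, 3) == "aaa"); // case 1
    assert(GetValueAcrossBuffers(cur, nxt, 4, 3) == "bbb"); // case 3 spans the boundary
}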
+
+ //! Parallel CSV reader: reads values from a piece of a CSV buffer and parses them as a CSV file
+ class ParallelCSVReader : public BaseCSVReader {
+ public:
+     ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
+                       const vector<LogicalType> &requested_types);
+     ~ParallelCSVReader();
+
+     //! Current Position (Relative to the Buffer)
+     idx_t position_buffer = 0;
+
+     //! Start of the piece of the buffer this thread should read
+     idx_t start_buffer = 0;
+     //! End of the piece of this buffer this thread should read
+     idx_t end_buffer = NumericLimits<idx_t>::Maximum();
+     //! The actual buffer size
+     idx_t buffer_size = 0;
+
+     //! If this flag is set, it means we are about to try to read our last row.
+     bool reached_remainder_state = false;
+
+     unique_ptr<CSVBufferRead> buffer;
+
+ public:
+     void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
+     //! Extracts a single DataChunk from the CSV file and stores it in insert_chunk
+     void ParseCSV(DataChunk &insert_chunk);
+
+ private:
+     //! Initialize Parser
+     void Initialize(const vector<LogicalType> &requested_types);
+     //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
+     void ParseCSV(ParserMode mode);
+     //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
+     bool TryParseCSV(ParserMode mode);
+     //! Extracts a single DataChunk from the CSV file and stores it in insert_chunk
+     bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
+     //! Sets Position depending on the byte_start of this thread
+     bool SetPosition(DataChunk &insert_chunk);
+     //! When a thread finishes reading its piece of the buffer, it can still scan up to the real end of the buffer
+     //! until it finds a newline. This function extends buffer_end accordingly, marking a boolean variable
+     //! when changing the buffer end the first time.
+     //! It returns FALSE if the parser should jump to the final state of parsing.
+     bool BufferRemainder();
+     //! Parses a CSV file with a one-byte delimiter, escape and quote character
+     bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
+ };
+
+ } // namespace duckdb
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ //===----------------------------------------------------------------------===//
+ // DuckDB
+ //
+ // duckdb/function/table/read_csv.hpp
+ //
+ //
+ //===----------------------------------------------------------------------===//
+
+
+
+
+
+
+
+
+
+
+
+ namespace duckdb {
+
+ class ReadCSV {
+ public:
+     static unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options, ClientContext &context);
+ };
+
+ struct BaseCSVData : public TableFunctionData {
+     virtual ~BaseCSVData() {
      }
+     //! The file path of the CSV file to read or write
+     vector<string> files;
+     //! The CSV reader options
+     BufferedCSVReaderOptions options;
+     //! Offsets for generated columns
+     idx_t filename_col_idx;
+     idx_t hive_partition_col_idx;
 
-     if (mode == ParserMode::PARSING_HEADER) {
-         return true;
+     void Finalize();
+ };
+
+ struct WriteCSVData : public BaseCSVData {
+     WriteCSVData(string file_path, vector<LogicalType> sql_types, vector<string> names) : sql_types(move(sql_types)) {
+         files.push_back(move(file_path));
+         options.names = move(names);
      }
 
-     if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
-         return true;
+     //! The SQL types to write
+     vector<LogicalType> sql_types;
+     //! The newline string to write
+     string newline = "\n";
+     //! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
+     bool is_simple;
+     //! The size of the CSV file (in bytes) that we buffer before we flush it to disk
+     idx_t flush_size = 4096 * 8;
+ };
+
+ struct ReadCSVData : public BaseCSVData {
+     //! The expected SQL types to read
+     vector<LogicalType> sql_types;
+     //! The initial reader (if any): this is used when automatic detection is used during binding.
+     //! In this case, the CSV reader is already created and might as well be re-used.
+     unique_ptr<BufferedCSVReader> initial_reader;
+     //! The union readers are created (when csv union_by_name option is on) during binding
+     //! Those readers can be re-used during ReadCSVFunction
+     vector<unique_ptr<BufferedCSVReader>> union_readers;
+     //! Whether or not the single-threaded reader should be used
+     bool single_threaded = false;
+
+     void InitializeFiles(ClientContext &context, const vector<string> &patterns);
+     void FinalizeRead(ClientContext &context);
+ };
+
+ struct CSVCopyFunction {
+     static void RegisterFunction(BuiltinFunctions &set);
+ };
+
+ struct ReadCSVTableFunction {
+     static TableFunction GetFunction(bool list_parameter = false);
+     static TableFunction GetAutoFunction(bool list_parameter = false);
+     static void RegisterFunction(BuiltinFunctions &set);
+ };
+
+ } // namespace duckdb
+
+
+ #include <algorithm>
+ #include <cctype>
+ #include <cstring>
+ #include <fstream>
+ #include <utility>
+
+ namespace duckdb {
+
+ ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
+                                      unique_ptr<CSVBufferRead> buffer_p, const vector<LogicalType> &requested_types)
+     : BaseCSVReader(context, move(options_p), requested_types) {
+     Initialize(requested_types);
+     SetBufferRead(move(buffer_p));
+     if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
+         throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes");
      }
+ }
 
-     if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
-         Flush(insert_chunk);
+ ParallelCSVReader::~ParallelCSVReader() {
+ }
+
+ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
+     sql_types = requested_types;
+     InitParseChunk(sql_types.size());
+     InitInsertChunkIdx(sql_types.size());
+ }
+
+ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
+     if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
+         start_buffer == buffer->buffer->GetStart()) {
+         // First buffer doesn't need any setting
          return true;
      }
 
-     column = 0;
-     return false;
+     // We have to move position up to next new line
+     idx_t end_buffer_real = end_buffer;
+     // Check if we already start in a valid line
+     string error_message;
+     bool successfully_read_first_line = false;
+     while (!successfully_read_first_line) {
+         DataChunk first_line_chunk;
+         first_line_chunk.Initialize(allocator, insert_chunk.GetTypes());
+         for (; position_buffer < end_buffer; position_buffer++) {
+             if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
+                 position_buffer++;
+                 break;
+             }
+         }
+         D_ASSERT(position_buffer <= end_buffer);
+         if (position_buffer == end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+             break;
+         }
+         idx_t position_set = position_buffer;
+         start_buffer = position_buffer;
+         // We check if we can add this line
+         successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+         start_buffer = position_set;
+         end_buffer = end_buffer_real;
+         position_buffer = position_set;
+         if (end_buffer == position_buffer) {
+             break;
+         }
+     }
+
+     return successfully_read_first_line;
  }
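SetPosition above makes each thread skip forward to the first newline at or after its assigned byte offset, then test-parses one line (try_add_line) to confirm it landed on a real row boundary and not, say, a newline inside a quoted field. A standalone sketch of the skip-to-newline step:

#include <cassert>
#include <cstddef>
#include <string>

// returns the position just after the first newline character in [start, end), or end if there is none
static size_t AlignToNextLine(const std::string &buf, size_t start, size_t end) {
    for (size_t pos = start; pos < end; pos++) {
        if (buf[pos] == '\n' || buf[pos] == '\r') {
            return pos + 1;
        }
    }
    return end;
}

int main() {
    std::string buf = "1,hello\n2,world\n3,!";
    // a thread assigned byte 3 must not start mid-row; it starts at byte 8 instead
    assert(AlignToNextLine(buf, 3, buf.size()) == 8);
}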
 
- void BufferedCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
-     for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
-         insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
-         ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
+ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
+     if (!buffer_read_p->buffer) {
+         throw InternalException("ParallelCSVReader::SetBufferRead - CSVBufferRead does not have a buffer to read");
      }
+     position_buffer = buffer_read_p->buffer_start;
+     start_buffer = buffer_read_p->buffer_start;
+     end_buffer = buffer_read_p->buffer_end;
+     if (buffer_read_p->next_buffer) {
+         buffer_size = buffer_read_p->buffer->GetBufferSize() + buffer_read_p->next_buffer->GetBufferSize();
+     } else {
+         buffer_size = buffer_read_p->buffer->GetBufferSize();
+     }
+     linenr = buffer_read_p->estimated_linenr;
+     buffer = move(buffer_read_p);
+
+     linenr_estimated = true;
+     reached_remainder_state = false;
+     D_ASSERT(end_buffer <= buffer_size);
  }
 
- void BufferedCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
-     D_ASSERT(col_idx < chunk.data.size());
-     D_ASSERT(row_idx < chunk.size());
-     auto &v = chunk.data[col_idx];
-     if (FlatVector::IsNull(v, row_idx)) {
-         return;
+ // If BufferRemainder returns false, it means we are done scanning this buffer and should go to the end_state
+ bool ParallelCSVReader::BufferRemainder() {
+     if (position_buffer >= end_buffer && !reached_remainder_state) {
+         // First time we finish the buffer piece we should scan here, we set the variables
+         // to allow this piece to be scanned up to the end of the buffer or the next new line
+         reached_remainder_state = true;
+         // end_buffer is allowed to go to buffer size to finish its last line
+         end_buffer = buffer_size;
+     }
+     if (position_buffer >= end_buffer) {
+         // buffer ends, return false
+         return false;
      }
+     // we can still scan stuff, return true
+     return true;
+ }
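BufferRemainder widens end_buffer to the full buffer exactly once, so a thread that hits the end of its assigned piece mid-row can still finish that row. A condensed standalone model of the same two-step check:

#include <cassert>
#include <cstddef>

struct RemainderState {
    size_t position = 0;
    size_t end = 0;          // end of the assigned piece
    size_t buffer_size = 0;  // real end of the buffer
    bool reached_remainder = false;

    // returns false once the scan must stop for good
    bool MayContinue() {
        if (position >= end && !reached_remainder) {
            reached_remainder = true; // extend once, to finish the current row
            end = buffer_size;
        }
        return position < end;
    }
};

int main() {
    RemainderState s{10, 10, 16};
    assert(s.MayContinue());  // first overrun: extended to the buffer end
    s.position = 16;
    assert(!s.MayContinue()); // second overrun: really done
}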
 
-     auto parse_data = FlatVector::GetData<string_t>(chunk.data[col_idx]);
-     auto s = parse_data[row_idx];
-     auto utf_type = Utf8Proc::Analyze(s.GetDataUnsafe(), s.GetSize());
-     if (utf_type == UnicodeType::INVALID) {
-         string col_name = to_string(col_idx);
-         if (col_idx < col_names.size()) {
-             col_name = "\"" + col_names[col_idx] + "\"";
+ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
+
+     // used for parsing algorithm
+     D_ASSERT(end_buffer <= buffer_size);
+     bool finished_chunk = false;
+     idx_t column = 0;
+     idx_t offset = 0;
+     bool has_quotes = false;
+     vector<idx_t> escape_positions;
+     if (start_buffer == buffer->buffer_start && !try_add_line) {
+         // First time reading this buffer piece
+         if (!SetPosition(insert_chunk)) {
+             // This means the buffer size does not contain a new line
+             return true;
          }
-         int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
-         D_ASSERT(error_line >= 0);
-         throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
-                                     "%s. Parser options: %s",
-                                     options.file_path, error_line, col_name,
-                                     ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
      }
- }
 
- void BufferedCSVReader::VerifyUTF8(idx_t col_idx) {
-     D_ASSERT(col_idx < parse_chunk.data.size());
-     for (idx_t i = 0; i < parse_chunk.size(); i++) {
-         VerifyUTF8(col_idx, i, parse_chunk);
+     // start parsing the first value
+     goto value_start;
+
+ value_start : {
+     /* state: value_start */
+     if (!BufferRemainder()) {
+         goto final_state;
      }
- }
+     offset = 0;
 
- void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
-     if (parse_chunk.size() == 0) {
-         return;
+     // this state parses the first character of a value
+     if ((*buffer)[position_buffer] == options.quote[0]) {
+         // quote: actual value starts in the next position
+         // move to in_quotes state
+         start_buffer = position_buffer + 1;
+         goto in_quotes;
+     } else {
+         // no quote, move to normal parsing state
+         start_buffer = position_buffer;
+         goto normal;
      }
+ };
 
-     bool conversion_error_ignored = false;
+ normal : {
+     /* state: normal parsing state */
+     // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
+     for (; position_buffer < end_buffer; position_buffer++) {
+         if ((*buffer)[position_buffer] == options.delimiter[0]) {
+             // delimiter: end the value and add it to the chunk
+             goto add_value;
+         } else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
+             // newline: add row
+             D_ASSERT(try_add_line || column == insert_chunk.ColumnCount() - 1);
+             goto add_row;
+         }
+     }
+     if (!BufferRemainder()) {
+         goto final_state;
+     } else {
+         goto normal;
+     }
+ };
 
-     // convert the columns in the parsed chunk to the types of the table
-     insert_chunk.SetCardinality(parse_chunk);
-     for (idx_t col_idx = 0; col_idx < sql_types.size(); col_idx++) {
-         if (sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
-             // target type is varchar: no need to convert
-             // just test that all strings are valid utf-8 strings
-             VerifyUTF8(col_idx);
-             insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
-         } else {
-             string error_message;
-             bool success;
-             if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
-                 // use the date format to cast the chunk
-                 success =
-                     TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
-                                       parse_chunk.size(), error_message);
-             } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
-                        sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
-                 // use the date format to cast the chunk
-                 success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
-                                                  insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
-                                                  error_message);
-             } else {
-                 // target type is not varchar: perform a cast
-                 success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
-                                                            insert_chunk.data[insert_cols_idx[col_idx]],
-                                                            parse_chunk.size(), &error_message);
-             }
-             if (success) {
-                 continue;
-             }
-             if (options.ignore_errors) {
-                 conversion_error_ignored = true;
-                 continue;
-             }
-             string col_name = to_string(col_idx);
-             if (col_idx < col_names.size()) {
-                 col_name = "\"" + col_names[col_idx] + "\"";
-             }
+ add_value : {
+     /* state: Add value to string vector */
+     AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
+     // increase position by 1 and move start to the new position
+     offset = 0;
+     has_quotes = false;
+     start_buffer = ++position_buffer;
+     if (!BufferRemainder()) {
+         goto final_state;
+     }
+     goto value_start;
+ };
 
-             // figure out the exact line number
-             idx_t row_idx;
-             for (row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
-                 auto &inserted_column = insert_chunk.data[col_idx];
-                 auto &parsed_column = parse_chunk.data[col_idx];
+ add_row : {
+     /* state: Add Row to Parse chunk */
+     // check type of newline (\r or \n)
+     bool carriage_return = (*buffer)[position_buffer] == '\r';
 
-                 if (FlatVector::IsNull(inserted_column, row_idx) && !FlatVector::IsNull(parsed_column, row_idx)) {
-                     break;
-                 }
+     AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
+     if (try_add_line) {
+         bool success = column == insert_chunk.ColumnCount();
+         if (success) {
+             AddRow(insert_chunk, column);
+             success = Flush(insert_chunk);
+         }
+         reached_remainder_state = false;
+         parse_chunk.Reset();
+         return success;
+     } else {
+         finished_chunk = AddRow(insert_chunk, column);
+     }
+     // increase position by 1 and move start to the new position
+     offset = 0;
+     has_quotes = false;
+     start_buffer = ++position_buffer;
+     if (reached_remainder_state || finished_chunk) {
+         goto final_state;
+     }
+     if (!BufferRemainder()) {
+         goto final_state;
+     }
+     if (carriage_return) {
+         // \r newline, go to special state that parses an optional \n afterwards
+         goto carriage_return;
+     } else {
+         // \n newline, move to value start
+         if (finished_chunk) {
+             goto final_state;
+         }
+         goto value_start;
+     }
+ }
+ in_quotes:
+     /* state: in_quotes; parses the remainder of a quoted value */
+     has_quotes = true;
+     position_buffer++;
+     for (; position_buffer < end_buffer; position_buffer++) {
+         if ((*buffer)[position_buffer] == options.quote[0]) {
+             // quote: move to unquoted state
+             goto unquote;
+         } else if ((*buffer)[position_buffer] == options.escape[0]) {
+             // escape: store the escaped position and move to handle_escape state
+             escape_positions.push_back(position_buffer - start_buffer);
+             goto handle_escape;
+         }
+     }
+     if (!BufferRemainder()) {
+         if (buffer->buffer->IsCSVFileLastBuffer()) {
+             if (try_add_line) {
+                 return false;
              }
-             auto error_line = linenr - (parse_chunk.size() - row_idx) + 1;
+             // still in quoted state at the end of the file or at the end of a buffer when running multithreaded, error:
+             throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
+                                         GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
+         } else {
+             goto final_state;
+         }
+     } else {
+         position_buffer--;
+         goto in_quotes;
+     }
 
-             if (options.auto_detect) {
-                 throw InvalidInputException("%s in column %s, at line %llu. Parser "
-                                             "options: %s. Consider either increasing the sample size "
-                                             "(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), "
-                                             "or skipping column conversion (ALL_VARCHAR=1)",
-                                             error_message, col_name, error_line, options.ToString());
+ unquote:
+     /* state: unquote; handles the character directly after a closing quote */
+     //
+     // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
+     // or a delimiter/newline, ending the current value and moving on to the next value
+     position_buffer++;
+     if (!BufferRemainder()) {
+         offset = 1;
+         goto final_state;
+     }
+     if ((*buffer)[position_buffer] == options.quote[0] &&
+         (options.escape.empty() || options.escape[0] == options.quote[0])) {
+         // escaped quote, return to quoted state and store escape position
+         escape_positions.push_back(position_buffer - start_buffer);
+         goto in_quotes;
+     } else if ((*buffer)[position_buffer] == options.delimiter[0]) {
+         // delimiter, add value
+         offset = 1;
+         goto add_value;
+     } else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
+         offset = 1;
+         D_ASSERT(column == insert_chunk.ColumnCount() - 1);
+         goto add_row;
+     } else if (position_buffer >= end_buffer) {
+         // reached end of buffer
+         offset = 1;
+         goto final_state;
+     } else {
+         error_message = StringUtil::Format(
+             "Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
+             "row or another quote. (%s). ",
+             options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
+         return false;
+     }
+ handle_escape : {
+     /* state: handle_escape */
+     // escape should be followed by a quote or another escape character
+     position_buffer++;
+     if (!BufferRemainder()) {
+         goto final_state;
+     }
+     if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
+         error_message = StringUtil::Format(
+             "Error in file \"%s\" on line %s: ESCAPE is not followed by QUOTE or ESCAPE. (%s)", options.file_path,
+             GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
+         return false;
+     }
+     if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
+         error_message = StringUtil::Format(
+             "Error in file \"%s\" on line %s: ESCAPE is not followed by QUOTE or ESCAPE. (%s)", options.file_path,
+             GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
+         return false;
+     }
+     // escape was followed by quote or escape, go back to quoted state
+     goto in_quotes;
+ }
+
+ carriage_return : {
+     /* state: carriage_return */
+     // this state optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
+     if ((*buffer)[position_buffer] == '\n') {
+         // newline after carriage return: skip
+         // increase position by 1 and move start to the new position
+         start_buffer = ++position_buffer;
+         if (position_buffer >= buffer_size) {
+             // file ends right after delimiter, go to final state
+             goto final_state;
+         }
+     }
+     goto value_start;
+ }
+ final_state : {
+     /* state: final_state; reached after we finished reading the end_buffer of the csv buffer */
+     // reset end buffer
+     end_buffer = buffer->buffer_end;
+     if (finished_chunk) {
+         return true;
+     }
+     // If this is the last buffer, we have to read the last value
+     if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
+         if (column > 0 || position_buffer > start_buffer) {
+             // remaining values to be added to the chunk
+             D_ASSERT(column == insert_chunk.ColumnCount() - 1);
+             AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
+             if (try_add_line) {
+                 bool success = column == sql_types.size();
+                 if (success) {
+                     AddRow(insert_chunk, column);
+                     success = Flush(insert_chunk);
+                 }
+                 parse_chunk.Reset();
+                 reached_remainder_state = false;
+                 return success;
              } else {
-                 throw InvalidInputException("%s at line %llu in column %s. Parser options: %s ", error_message,
-                                             error_line, col_name, options.ToString());
+                 AddRow(insert_chunk, column);
              }
          }
      }
-     if (conversion_error_ignored) {
-         D_ASSERT(options.ignore_errors);
-         SelectionVector succesful_rows;
-         succesful_rows.Initialize(parse_chunk.size());
-         idx_t sel_size = 0;
+     // flush the parsed chunk and finalize parsing
+     if (mode == ParserMode::PARSING) {
+         Flush(insert_chunk);
+     }
+     return true;
+ };
+ }
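TryParseSimpleCSV is a goto-based state machine: value_start branches on a quote, normal scans for a delimiter or newline, and the quote states handle escapes. A compact sketch of the unquoted subset of that cycle (value_start, normal, add_value, add_row, final_state), purely for illustration:

#include <cassert>
#include <string>
#include <vector>

// parses unquoted CSV rows; each inner vector is one row of fields
static std::vector<std::vector<std::string>> ParseUnquoted(const std::string &buf, char delim) {
    std::vector<std::vector<std::string>> rows(1);
    std::string field;
    for (char c : buf) {
        if (c == delim) {
            rows.back().push_back(field); // add_value
            field.clear();
        } else if (c == '\n') {
            rows.back().push_back(field); // add_row
            field.clear();
            rows.emplace_back();
        } else {
            field += c; // normal
        }
    }
    rows.back().push_back(field); // final_state: flush the trailing value
    return rows;
}

int main() {
    auto rows = ParseUnquoted("a,b\nc,d", ',');
    assert(rows.size() == 2 && rows[1][1] == "d");
}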
 
-     for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
-         bool failed = false;
-         for (idx_t column_idx = 0; column_idx < sql_types.size(); column_idx++) {
+ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
+     string error_message;
+     if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) {
+         throw InvalidInputException(error_message);
+     }
+ }
 
-             auto &inserted_column = insert_chunk.data[column_idx];
-             auto &parsed_column = parse_chunk.data[column_idx];
+ bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
+     DataChunk dummy_chunk;
+     string error_message;
+     return TryParseCSV(mode, dummy_chunk, error_message);
+ }
 
-             bool was_already_null = FlatVector::IsNull(parsed_column, row_idx);
-             if (!was_already_null && FlatVector::IsNull(inserted_column, row_idx)) {
-                 failed = true;
-                 break;
-             }
-         }
-         if (!failed) {
-             succesful_rows.set_index(sel_size++, row_idx);
-         }
-     }
-     insert_chunk.Slice(succesful_rows, sel_size);
+ void ParallelCSVReader::ParseCSV(ParserMode mode) {
+     DataChunk dummy_chunk;
+     string error_message;
+     if (!TryParseCSV(mode, dummy_chunk, error_message)) {
+         throw InvalidInputException(error_message);
      }
-     parse_chunk.Reset();
  }
+
+ bool ParallelCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) {
+     mode = parser_mode;
+     return TryParseSimpleCSV(insert_chunk, error_message);
+ }
+
  } // namespace duckdb
  //===----------------------------------------------------------------------===//
  // DuckDB
@@ -120938,6 +121531,28 @@ void StripAccentsFun::RegisterFunction(BuiltinFunctions &set) {
 
  namespace duckdb {
 
+ static const int64_t SUPPORTED_UPPER_BOUND = NumericLimits<uint32_t>::Maximum();
+ static const int64_t SUPPORTED_LOWER_BOUND = -SUPPORTED_UPPER_BOUND - 1;
+
+ static inline void AssertInSupportedRange(idx_t input_size, int64_t offset, int64_t length) {
+
+     if (input_size > (uint64_t)SUPPORTED_UPPER_BOUND) {
+         throw OutOfRangeException("Substring input size is too large (> %d)", SUPPORTED_UPPER_BOUND);
+     }
+     if (offset < SUPPORTED_LOWER_BOUND) {
+         throw OutOfRangeException("Substring offset outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
+     }
+     if (offset > SUPPORTED_UPPER_BOUND) {
+         throw OutOfRangeException("Substring offset outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
+     }
+     if (length < SUPPORTED_LOWER_BOUND) {
+         throw OutOfRangeException("Substring length outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
+     }
+     if (length > SUPPORTED_UPPER_BOUND) {
+         throw OutOfRangeException("Substring length outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
+     }
+ }
+
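AssertInSupportedRange restricts substring arguments to a window of roughly plus or minus 2^32. A quick standalone check of the same guard logic (std::out_of_range stands in for OutOfRangeException):

#include <cstdint>
#include <stdexcept>

static const int64_t kUpper = 4294967295LL; // NumericLimits<uint32_t>::Maximum()
static const int64_t kLower = -kUpper - 1;

static void CheckRange(uint64_t input_size, int64_t offset, int64_t length) {
    if (input_size > (uint64_t)kUpper || offset < kLower || offset > kUpper || length < kLower || length > kUpper) {
        throw std::out_of_range("substring argument outside of supported range");
    }
}

int main() {
    CheckRange(11, 1, 5); // fine
    try {
        CheckRange(11, 5000000000LL, 10); // offset > 2^32 - 1: rejected
        return 1;
    } catch (const std::out_of_range &) {
        return 0;
    }
}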
  string_t SubstringEmptyString(Vector &result) {
      auto result_string = StringVector::EmptyString(result, 0);
      result_string.Finalize();
@@ -120977,7 +121592,7 @@ bool SubstringStartEnd(int64_t input_size, int64_t offset, int64_t length, int64
      } else {
          // negative length: go backwards (i.e. end = start, start = start + length)
          end = start;
-         start = MaxValue<int64_t>(0, end + length);
+         start = MaxValue<int64_t>(0, start + length);
      }
      if (start == end) {
          return false;
@@ -120990,6 +121605,8 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
      auto input_data = input.GetDataUnsafe();
      auto input_size = input.GetSize();
 
+     AssertInSupportedRange(input_size, offset, length);
+
      int64_t start, end;
      if (!SubstringStartEnd(input_size, offset, length, start, end)) {
          return SubstringEmptyString(result);
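In the negative-length branch above, end has just been set to start, so the old and new expressions compute the same value; the rewrite simply matches the comment. The behavior being implemented: with SQL's 1-based offsets, a negative length reaches backwards from the offset. A simplified standalone model (not the exact DuckDB routine):

#include <cassert>
#include <algorithm>
#include <cstdint>
#include <string>

// 1-based offset, possibly negative length
static std::string Sub(const std::string &s, int64_t offset, int64_t length) {
    int64_t start, end;
    if (length >= 0) {
        start = std::max<int64_t>(0, offset - 1);
        end = std::min<int64_t>((int64_t)s.size(), start + length);
    } else {
        end = std::max<int64_t>(0, offset - 1);
        start = std::max<int64_t>(0, end + length); // negative length: go backwards
    }
    if (start >= end) {
        return "";
    }
    return s.substr(start, end - start);
}

int main() {
    assert(Sub("hello", 2, 3) == "ell");
    assert(Sub("hello", 3, -2) == "he"); // backwards from position 3
}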
@@ -121001,6 +121618,8 @@ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t
      auto input_data = input.GetDataUnsafe();
      auto input_size = input.GetSize();
 
+     AssertInSupportedRange(input_size, offset, length);
+
      if (length == 0) {
          return SubstringEmptyString(result);
      }
@@ -121051,14 +121670,15 @@ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t
      int64_t start, end;
 
      // we express start and end as unicode codepoints from the front
+     offset--;
      if (length < 0) {
          // negative length
-         start = MaxValue<int64_t>(0, offset + length - 1);
-         end = offset - 1;
+         start = MaxValue<int64_t>(0, offset + length);
+         end = offset;
      } else {
          // positive length
-         start = MaxValue<int64_t>(0, offset - 1);
-         end = offset + length - 1;
+         start = MaxValue<int64_t>(0, offset);
+         end = offset + length;
      }
 
      int64_t current_character = 0;
@@ -121086,6 +121706,8 @@ string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t
      auto input_data = input.GetDataUnsafe();
      auto input_size = input.GetSize();
 
+     AssertInSupportedRange(input_size, offset, length);
+
      // we don't know yet if the substring is ascii, but we assume it is (for now)
      // first get the start and end as if this was an ascii string
      int64_t start, end;
@@ -121170,7 +121792,7 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
      } else {
          BinaryExecutor::Execute<string_t, int64_t, string_t>(
              input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
-                 return OP::Substring(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
+                 return OP::Substring(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
              });
      }
  }
@@ -121189,7 +121811,7 @@ static void SubstringFunctionASCII(DataChunk &args, ExpressionState &state, Vect
      } else {
          BinaryExecutor::Execute<string_t, int64_t, string_t>(
              input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
-                 return SubstringASCII(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
+                 return SubstringASCII(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
              });
      }
  }
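When substring is called without an explicit length, the executor substitutes a cap meaning "the rest of the string". The old cap, int64 max minus offset, is ill-defined for negative offsets (signed overflow); the new uint32 cap is always sufficient because inputs longer than 2^32 - 1 bytes are rejected by AssertInSupportedRange. A two-line illustration:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
    int64_t offset = -5; // negative offsets are legal in SQL substring
    // old cap: undefined behavior for negative offsets
    // int64_t old_cap = std::numeric_limits<int64_t>::max() - offset; // UB: signed overflow
    int64_t new_cap = std::numeric_limits<uint32_t>::max(); // always a valid "rest of string"
    assert(new_cap == 4294967295LL);
}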
@@ -124179,72 +124801,6 @@ void CheckpointFunction::RegisterFunction(BuiltinFunctions &set) {
  }
 
  } // namespace duckdb
- //===----------------------------------------------------------------------===//
- // DuckDB
- //
- // duckdb/function/table/read_csv.hpp
- //
- //
- //===----------------------------------------------------------------------===//
-
-
-
-
-
-
-
- namespace duckdb {
-
- struct BaseCSVData : public TableFunctionData {
-     //! The file path of the CSV file to read or write
-     vector<string> files;
-     //! The CSV reader options
-     BufferedCSVReaderOptions options;
-     //! Offsets for generated columns
-     idx_t filename_col_idx;
-     idx_t hive_partition_col_idx;
-
-     void Finalize();
- };
-
- struct WriteCSVData : public BaseCSVData {
-     WriteCSVData(string file_path, vector<LogicalType> sql_types, vector<string> names) : sql_types(move(sql_types)) {
-         files.push_back(move(file_path));
-         options.names = move(names);
-     }
-
-     //! The SQL types to write
-     vector<LogicalType> sql_types;
-     //! The newline string to write
-     string newline = "\n";
-     //! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
-     bool is_simple;
-     //! The size of the CSV file (in bytes) that we buffer before we flush it to disk
-     idx_t flush_size = 4096 * 8;
- };
-
- struct ReadCSVData : public BaseCSVData {
-     //! The expected SQL types to read
-     vector<LogicalType> sql_types;
-     //! The initial reader (if any): this is used when automatic detection is used during binding.
-     //! In this case, the CSV reader is already created and might as well be re-used.
-     unique_ptr<BufferedCSVReader> initial_reader;
-     //! The union readers is created(when csv union_by_name option is on) during binding
-     //! Those reader can be re-used during ReadCSVFunction
-     vector<unique_ptr<BufferedCSVReader>> union_readers;
- };
-
- struct CSVCopyFunction {
-     static void RegisterFunction(BuiltinFunctions &set);
- };
-
- struct ReadCSVTableFunction {
-     static TableFunction GetFunction(bool list_parameter = false);
-     static TableFunction GetAutoFunction(bool list_parameter = false);
-     static void RegisterFunction(BuiltinFunctions &set);
- };
-
- } // namespace duckdb
 
 
 
@@ -124263,7 +124819,7 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
      if (str_1.empty() || str_2.empty()) {
          return;
      }
-     if (str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos) {
+     if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos) && str_1 != "NULL") {
          throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
      }
  }
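The added str_1 != "NULL" clause exempts the default null string from the mutual-substring check, so an option value that happens to contain or be contained in "NULL" no longer fails the COPY binding. A simplified standalone model of the check:

#include <stdexcept>
#include <string>

// option values must not contain one another, except the default null string "NULL"
static void SubstringDetection(const std::string &a, const std::string &b) {
    if (a.empty() || b.empty()) {
        return;
    }
    if ((a.find(b) != std::string::npos || b.find(a) != std::string::npos) && a != "NULL") {
        throw std::invalid_argument("options must not contain one another");
    }
}

int main() {
    SubstringDetection("NULL", "L"); // previously threw; now accepted
    try {
        SubstringDetection(";", ";;"); // still rejected
        return 1;
    } catch (const std::invalid_argument &) {
        return 0;
    }
}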
@@ -124338,12 +124894,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
      bind_data->sql_types = expected_types;
 
      string file_pattern = info.file_path;
+     vector<string> patterns {file_pattern};
 
-     auto &fs = FileSystem::GetFileSystem(context);
-     bind_data->files = fs.Glob(file_pattern, context);
-     if (bind_data->files.empty()) {
-         throw IOException("No files found that match the pattern \"%s\"", file_pattern);
-     }
+     bind_data->InitializeFiles(context, patterns);
 
      auto &options = bind_data->options;
 
@@ -124358,7 +124911,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
      // no FORCE_QUOTE specified: initialize to false
      options.force_not_null.resize(expected_types.size(), false);
      }
-     bind_data->Finalize();
+     bind_data->FinalizeRead(context);
      return move(bind_data);
  }
@@ -125417,11 +125970,39 @@ void BuiltinFunctions::RegisterTableFunctions() {
 
 
 
-
  #include <limits>
 
  namespace duckdb {
 
+ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const BufferedCSVReaderOptions &options, ClientContext &context) {
+     auto &fs = FileSystem::GetFileSystem(context);
+     auto opener = FileSystem::GetFileOpener(context);
+     auto file_handle = fs.OpenFile(options.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
+                                    options.compression, opener);
+     return make_unique<CSVFileHandle>(move(file_handle));
+ }
+
+ void ReadCSVData::InitializeFiles(ClientContext &context, const vector<string> &patterns) {
+     auto &fs = FileSystem::GetFileSystem(context);
+     for (auto &file_pattern : patterns) {
+         auto found_files = fs.Glob(file_pattern, context);
+         if (found_files.empty()) {
+             throw IOException("No files found that match the pattern \"%s\"", file_pattern);
+         }
+         files.insert(files.end(), found_files.begin(), found_files.end());
+     }
+ }
+
+ void ReadCSVData::FinalizeRead(ClientContext &context) {
+     BaseCSVData::Finalize();
+     auto &config = DBConfig::GetConfig(context);
+     single_threaded = !config.options.experimental_parallel_csv_reader;
+     if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
+         // not supported for parallel CSV reading
+         single_threaded = true;
+     }
+ }
+
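FinalizeRead is where the scan chooses between the parallel and single-threaded readers: the parallel path is opt-in via the experimental_parallel_csv_reader setting and is bypassed for multi-byte delimiters, quotes, or escapes (the same condition the ParallelCSVReader constructor enforces). A condensed standalone model of that decision:

#include <cassert>
#include <string>

struct CsvOptions {
    std::string delimiter = ",";
    std::string quote = "\"";
    std::string escape = "\"";
};

static bool UseSingleThreaded(const CsvOptions &opt, bool experimental_parallel_enabled) {
    if (!experimental_parallel_enabled) {
        return true;
    }
    // multi-byte delimiter/quote/escape are not supported by the parallel state machine
    return opt.delimiter.size() > 1 || opt.quote.size() > 1 || opt.escape.size() > 1;
}

int main() {
    CsvOptions simple;
    assert(!UseSingleThreaded(simple, true)); // parallel path
    CsvOptions fancy;
    fancy.delimiter = "||";
    assert(UseSingleThreaded(fancy, true));   // falls back
    assert(UseSingleThreaded(simple, false)); // flag off: always single-threaded
}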
  static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctionBindInput &input,
                                              vector<LogicalType> &return_types, vector<string> &names) {
      auto &config = DBConfig::GetConfig(context);
@@ -125442,14 +126023,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
          patterns.push_back(StringValue::Get(input.inputs[0]));
      }
 
-     auto &fs = FileSystem::GetFileSystem(context);
-     for (auto &file_pattern : patterns) {
-         auto files = fs.Glob(file_pattern, context);
-         if (files.empty()) {
-             throw IOException("No files found that match the pattern \"%s\"", file_pattern);
-         }
-         result->files.insert(result->files.end(), files.begin(), files.end());
-     }
+     result->InitializeFiles(context, patterns);
 
      for (auto &kv : input.named_parameters) {
          auto loption = StringUtil::Lower(kv.first);
@@ -125480,6 +126054,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
              options.include_file_name = BooleanValue::Get(kv.second);
          } else if (loption == "hive_partitioning") {
              options.include_parsed_hive_partitions = BooleanValue::Get(kv.second);
+         } else if (loption == "buffer_size") {
+             options.buffer_size = kv.second.GetValue<uint64_t>();
+             if (options.buffer_size == 0) {
+                 throw InvalidInputException("Buffer Size option must be higher than 0");
+             }
          } else {
              options.SetReadOption(loption, kv.second, names);
          }
@@ -125492,13 +126071,14 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
      if (options.auto_detect) {
          options.file_path = result->files[0];
          auto initial_reader = make_unique<BufferedCSVReader>(context, options);
-
          return_types.assign(initial_reader->sql_types.begin(), initial_reader->sql_types.end());
          if (names.empty()) {
              names.assign(initial_reader->col_names.begin(), initial_reader->col_names.end());
          } else {
              D_ASSERT(return_types.size() == names.size());
          }
+         options = result->options;
+         result->sql_types = initial_reader->sql_types;
          result->initial_reader = move(initial_reader);
      } else {
          result->sql_types = return_types;
@@ -125577,10 +126157,233 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
          }
      }
      result->options.names = names;
+     result->FinalizeRead(context);
      return move(result);
  }
 
- struct ReadCSVOperatorData : public GlobalTableFunctionState {
+ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFunctionBindInput &input,
+                                                 vector<LogicalType> &return_types, vector<string> &names) {
+     input.named_parameters["auto_detect"] = Value::BOOLEAN(true);
+     return ReadCSVBind(context, input, return_types, names);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Parallel CSV Reader CSV Global State
+ //===--------------------------------------------------------------------===//
+ //===--------------------------------------------------------------------===//
+ // Read CSV Global State
+ //===--------------------------------------------------------------------===//
+ struct ParallelCSVGlobalState : public GlobalTableFunctionState {
+ public:
+     ParallelCSVGlobalState(unique_ptr<CSVFileHandle> file_handle_p, vector<string> &files_path_p,
+                            idx_t system_threads_p, idx_t buffer_size_p, idx_t rows_to_skip)
+         : file_handle(move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
+         for (idx_t i = 0; i < rows_to_skip; i++) {
+             file_handle->ReadLine();
+         }
+         estimated_linenr = rows_to_skip;
+         file_size = file_handle->FileSize();
+         first_file_size = file_size;
+         bytes_read = 0;
+         if (buffer_size < file_size) {
+             bytes_per_local_state = buffer_size / MaxThreads();
+         } else {
+             bytes_per_local_state = file_size / MaxThreads();
+         }
+         current_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
+         next_buffer = current_buffer->Next(*file_handle, buffer_size);
+     }
+     ParallelCSVGlobalState() {
+     }
+
+     idx_t MaxThreads() const override;
+     //! Returns buffer and index that caller thread should read.
+     unique_ptr<CSVBufferRead> Next(ClientContext &context, ReadCSVData &bind_data);
+     //! If we finished reading all the CSV Files
+     bool Finished();
+     //! How many bytes were read up to this point
+     atomic<idx_t> bytes_read;
+     //! Size of current file
+     idx_t file_size;
+
+ private:
+     //! File Handle for current file
+     unique_ptr<CSVFileHandle> file_handle;
+
+     shared_ptr<CSVBuffer> current_buffer;
+     shared_ptr<CSVBuffer> next_buffer;
+     //! The index of the next file to read (i.e. current file + 1)
+     idx_t file_index = 1;
+
+     //! Mutex to lock when getting next batch of bytes (Parallel Only)
+     mutex main_mutex;
+     //! Next byte offset to hand to a reader thread
+     idx_t next_byte = 0;
+
+     //! The current estimated line number
+     idx_t estimated_linenr;
+
+     //! How many bytes we should execute per local state
+     idx_t bytes_per_local_state;
+
+     //! Size of first file
+     idx_t first_file_size;
+     //! Basically max number of threads in DuckDB
+     idx_t system_threads;
+     //! Size of the buffers
+     idx_t buffer_size;
+     //! Current batch index
+     idx_t batch_index = 0;
+ };
+
+ idx_t ParallelCSVGlobalState::MaxThreads() const {
+     // idx_t one_mb = 1000000;
+     // idx_t threads_per_mb = first_file_size / one_mb + 1;
+     // if (threads_per_mb < system_threads) {
+     //     return threads_per_mb;
+     // }
+     return system_threads;
+ }
+
+ bool ParallelCSVGlobalState::Finished() {
+     lock_guard<mutex> parallel_lock(main_mutex);
+     return !current_buffer;
+ }
+
+ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, ReadCSVData &bind_data) {
+     lock_guard<mutex> parallel_lock(main_mutex);
+     if (!current_buffer) {
+         // We are done scanning.
+         return nullptr;
+     }
+     // set up the current buffer
+     auto result = make_unique<CSVBufferRead>(current_buffer, next_buffer, next_byte, next_byte + bytes_per_local_state,
+                                              batch_index++, estimated_linenr);
+     // move the byte index of the CSV reader to the next buffer
+     next_byte += bytes_per_local_state;
+     estimated_linenr += bytes_per_local_state / (bind_data.sql_types.size() * 5); // estimate 5 bytes per column
+     if (next_byte >= current_buffer->GetBufferSize()) {
+         // We replace the current buffer with the next buffer
+         next_byte = 0;
+         bytes_read += current_buffer->GetBufferSize();
+         current_buffer = next_buffer;
+         if (next_buffer) {
+             // Next buffer gets the next-next buffer
+             next_buffer = next_buffer->Next(*file_handle, buffer_size);
+         }
+     }
+     if (current_buffer && !next_buffer) {
+         // This means we are done with the current file, we need to go to the next one (if it exists).
+         if (file_index < bind_data.files.size()) {
+             bind_data.options.file_path = bind_data.files[file_index++];
+             file_handle = ReadCSV::OpenCSV(bind_data.options, context);
+             next_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
+         }
+     }
+     return result;
+ }
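Next hands each caller a half-open byte range of size bytes_per_local_state under a lock, rolling over to the next buffer (and, when a file is exhausted, the next file) as the ranges walk off the end. A minimal single-file model of the handout loop (no locking, illustrative names):

#include <cassert>
#include <cstddef>
#include <utility>

struct ByteRangeDispenser {
    size_t buffer_size;
    size_t chunk;
    size_t next_byte = 0;

    // returns {start, end} within the buffer, or {0, 0} once exhausted
    std::pair<size_t, size_t> Next() {
        if (next_byte >= buffer_size) {
            return {0, 0};
        }
        size_t start = next_byte;
        next_byte += chunk;
        return {start, start + chunk};
    }
};

int main() {
    ByteRangeDispenser d{32, 16};
    auto first = d.Next();
    auto second = d.Next();
    auto done = d.Next();
    assert(first.first == 0 && first.second == 16);
    assert(second.first == 16 && second.second == 32);
    assert(done.first == 0 && done.second == 0); // exhausted
}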
126285
+ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
126286
+ TableFunctionInitInput &input) {
126287
+ auto &bind_data = (ReadCSVData &)*input.bind_data;
126288
+ if (bind_data.files.empty()) {
126289
+ // This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
126290
+ return make_unique<ParallelCSVGlobalState>();
126291
+ }
126292
+ unique_ptr<CSVFileHandle> file_handle;
126293
+ if (bind_data.initial_reader) {
126294
+ file_handle = move(bind_data.initial_reader->file_handle);
126295
+ bind_data.initial_reader.reset();
126296
+ } else {
126297
+ bind_data.options.file_path = bind_data.files[0];
126298
+ file_handle = ReadCSV::OpenCSV(bind_data.options, context);
126299
+ }
126300
+ idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
126301
+ return make_unique<ParallelCSVGlobalState>(move(file_handle), bind_data.files, context.db->NumberOfThreads(),
126302
+ bind_data.options.buffer_size, rows_to_skip);
126303
+ }
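Note how the header folds into the skip count here: for example, skip_rows = 2 on a file with a header gives rows_to_skip = 3, the header line plus the two explicitly skipped rows.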
126304
+
126305
+ //===--------------------------------------------------------------------===//
126306
+ // Read CSV Local State
126307
+ //===--------------------------------------------------------------------===//
126308
+ struct ParallelCSVLocalState : public LocalTableFunctionState {
126309
+ public:
126310
+ explicit ParallelCSVLocalState(unique_ptr<ParallelCSVReader> csv_reader_p) : csv_reader(move(csv_reader_p)) {
126311
+ }
126312
+
126313
+ //! The CSV reader
126314
+ unique_ptr<ParallelCSVReader> csv_reader;
126315
+ CSVBufferRead previous_buffer;
126316
+ };
126317
+
126318
+ unique_ptr<LocalTableFunctionState> ReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
126319
+ GlobalTableFunctionState *global_state_p) {
126320
+ auto &csv_data = (ReadCSVData &)*input.bind_data;
126321
+ if (csv_data.single_threaded) {
126322
+ return nullptr;
126323
+ }
126324
+ auto &global_state = (ParallelCSVGlobalState &)*global_state_p;
126325
+ auto next_local_buffer = global_state.Next(context.client, csv_data);
126326
+ unique_ptr<ParallelCSVReader> csv_reader;
126327
+ if (next_local_buffer) {
126328
+ csv_reader = make_unique<ParallelCSVReader>(context.client, csv_data.options, move(next_local_buffer),
126329
+ csv_data.sql_types);
126330
+ }
126331
+ auto new_local_state = make_unique<ParallelCSVLocalState>(move(csv_reader));
126332
+ return move(new_local_state);
126333
+ }
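Each worker claims its first byte range already at local-state construction; if the global cursor is exhausted by then, csv_reader stays null and ParallelReadCSVFunction below returns immediately with an empty chunk.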
126334
+
126335
+ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
126336
+ auto &bind_data = (ReadCSVData &)*data_p.bind_data;
126337
+ auto &csv_global_state = (ParallelCSVGlobalState &)*data_p.global_state;
126338
+ auto &csv_local_state = (ParallelCSVLocalState &)*data_p.local_state;
126339
+
126340
+ if (!csv_local_state.csv_reader) {
126341
+ // no csv_reader was set; this can happen when a filename-based filter has filtered out all possible files
126342
+ return;
126343
+ }
126344
+
126345
+ do {
126346
+ if (output.size() != 0 || (csv_global_state.Finished() && csv_local_state.csv_reader->position_buffer >=
126347
+ csv_local_state.csv_reader->end_buffer)) {
126348
+ break;
126349
+ }
126350
+ if (csv_local_state.csv_reader->position_buffer >= csv_local_state.csv_reader->end_buffer) {
126351
+ auto next_chunk = csv_global_state.Next(context, bind_data);
126352
+ if (!next_chunk) {
126353
+ break;
126354
+ }
126355
+ // csv_local_state.previous_buffer = csv_local_state.csv_reader->buffer;
126356
+ csv_local_state.csv_reader->SetBufferRead(move(next_chunk));
126357
+ }
126358
+ csv_local_state.csv_reader->ParseCSV(output);
126359
+
126360
+ } while (true);
126361
+
126362
+ if (bind_data.options.union_by_name) {
126363
+ throw InternalException("FIXME: union by name");
126364
+ }
126365
+ if (bind_data.options.include_file_name) {
126366
+ throw InternalException("FIXME: output file name");
126367
+ }
126368
+ if (bind_data.options.include_parsed_hive_partitions) {
126369
+ throw InternalException("FIXME: hive partitions");
126370
+ }
126371
+ }
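The do/while above deliberately tolerates empty parses: ParseCSV may produce zero rows for a byte range (presumably one holding only a fragment of a line), so the loop keeps fetching ranges until a chunk contains rows or the global state has no further buffer reads to hand out.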
126372
+
126373
+ static idx_t CSVReaderGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
126374
+ LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
126375
+ auto &bind_data = (ReadCSVData &)*bind_data_p;
126376
+ if (bind_data.single_threaded) {
126377
+ return 0;
126378
+ }
126379
+ auto &data = (ParallelCSVLocalState &)*local_state;
126380
+ return data.csv_reader->buffer->batch_index;
126381
+ }
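Reporting the per-buffer batch index gives downstream operators a stable key for ordering chunks produced by concurrent readers; the single-threaded path can always answer 0 because it emits a single ordered stream.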
126382
+
126383
+ //===--------------------------------------------------------------------===//
126384
+ // Single-Threaded CSV Reader
126385
+ //===--------------------------------------------------------------------===//
126386
+ struct SingleThreadedCSVState : public GlobalTableFunctionState {
125584
126387
  //! The CSV reader
125585
126388
  unique_ptr<BufferedCSVReader> csv_reader;
125586
126389
  //! The index of the next file to read (i.e. current file + 1)
@@ -125589,11 +126392,16 @@ struct ReadCSVOperatorData : public GlobalTableFunctionState {
125589
126392
  idx_t file_size;
125590
126393
  //! How many bytes were read up to this point
125591
126394
  atomic<idx_t> bytes_read;
126395
+
126396
+ idx_t MaxThreads() const override {
126397
+ return 1;
126398
+ }
125592
126399
  };
125593
126400
 
125594
- static unique_ptr<GlobalTableFunctionState> ReadCSVInit(ClientContext &context, TableFunctionInitInput &input) {
126401
+ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext &context,
126402
+ TableFunctionInitInput &input) {
125595
126403
  auto &bind_data = (ReadCSVData &)*input.bind_data;
125596
- auto result = make_unique<ReadCSVOperatorData>();
126404
+ auto result = make_unique<SingleThreadedCSVState>();
125597
126405
  if (bind_data.initial_reader) {
125598
126406
  result->csv_reader = move(bind_data.initial_reader);
125599
126407
  } else if (bind_data.files.empty()) {
@@ -125603,20 +126411,14 @@ static unique_ptr<GlobalTableFunctionState> ReadCSVInit(ClientContext &context,
125603
126411
  bind_data.options.file_path = bind_data.files[0];
125604
126412
  result->csv_reader = make_unique<BufferedCSVReader>(context, bind_data.options, bind_data.sql_types);
125605
126413
  }
125606
- result->file_size = result->csv_reader->GetFileSize();
126414
+ result->file_size = result->csv_reader->file_handle->FileSize();
125607
126415
  result->file_index = 1;
125608
126416
  return move(result);
125609
126417
  }
125610
126418
 
125611
- static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFunctionBindInput &input,
125612
- vector<LogicalType> &return_types, vector<string> &names) {
125613
- input.named_parameters["auto_detect"] = Value::BOOLEAN(true);
125614
- return ReadCSVBind(context, input, return_types, names);
125615
- }
125616
-
125617
- static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
126419
+ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
125618
126420
  auto &bind_data = (ReadCSVData &)*data_p.bind_data;
125619
- auto &data = (ReadCSVOperatorData &)*data_p.global_state;
126421
+ auto &data = (SingleThreadedCSVState &)*data_p.global_state;
125620
126422
 
125621
126423
  if (!data.csv_reader) {
125622
126424
  // no csv_reader was set, this can happen when a filename-based filter has filtered out all possible files
@@ -125675,6 +126477,27 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
125675
126477
  }
125676
126478
  }
125677
126479
 
126480
+ //===--------------------------------------------------------------------===//
126481
+ // Read CSV Functions
126482
+ //===--------------------------------------------------------------------===//
126483
+ static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
126484
+ auto &bind_data = (ReadCSVData &)*input.bind_data;
126485
+ if (bind_data.single_threaded) {
126486
+ return SingleThreadedCSVInit(context, input);
126487
+ } else {
126488
+ return ParallelCSVInitGlobal(context, input);
126489
+ }
126490
+ }
126491
+
126492
+ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
126493
+ auto &bind_data = (ReadCSVData &)*data_p.bind_data;
126494
+ if (bind_data.single_threaded) {
126495
+ SingleThreadedCSVFunction(context, data_p, output);
126496
+ } else {
126497
+ ParallelReadCSVFunction(context, data_p, output);
126498
+ }
126499
+ }
126500
+
125678
126501
  static void ReadCSVAddNamedParameters(TableFunction &table_function) {
125679
126502
  table_function.named_parameters["sep"] = LogicalType::VARCHAR;
125680
126503
  table_function.named_parameters["delim"] = LogicalType::VARCHAR;
@@ -125699,15 +126522,26 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
125699
126522
  table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR;
125700
126523
  table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
125701
126524
  table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
126525
+ table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT;
125702
126526
  }
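The new buffer_size argument is an unsigned integer byte count and is passed like any other read_csv named parameter, e.g. read_csv('data.csv', buffer_size=16777216) (an assumed usage of the standard named-parameter syntax; the file name is illustrative).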
125703
126527
 
125704
126528
  double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
125705
126529
  const GlobalTableFunctionState *global_state) {
125706
- auto &data = (const ReadCSVOperatorData &)*global_state;
125707
- if (data.file_size == 0) {
126530
+ auto &bind_data = (ReadCSVData &)*bind_data_p;
126531
+ idx_t file_size, bytes_read;
126532
+ if (bind_data.single_threaded) {
126533
+ auto &data = (const SingleThreadedCSVState &)*global_state;
126534
+ file_size = data.file_size;
126535
+ bytes_read = data.bytes_read;
126536
+ } else {
126537
+ auto &data = (const ParallelCSVGlobalState &)*global_state;
126538
+ file_size = data.file_size;
126539
+ bytes_read = data.bytes_read;
126540
+ }
126541
+ if (file_size == 0) {
125708
126542
  return 100;
125709
126543
  }
125710
- auto percentage = (data.bytes_read * 100.0) / data.file_size;
126544
+ auto percentage = (bytes_read * 100.0) / file_size;
125711
126545
  return percentage;
125712
126546
  }
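As a worked instance of the percentage logic above: 25 of 100 bytes read reports 25.0, while a zero-byte file short-circuits to 100 instead of dividing by zero. A minimal sketch with a hypothetical free function (not DuckDB's API):

#include <cstddef>

// Mirrors the guard-then-percentage shape of CSVReaderProgress above.
double ScanProgress(std::size_t bytes_read, std::size_t file_size) {
	if (file_size == 0) {
		return 100.0; // empty input: report done rather than divide by zero
	}
	return (bytes_read * 100.0) / file_size;
}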
125713
126547
 
@@ -125745,7 +126579,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
125745
126579
  writer.WriteField<bool>(header);
125746
126580
  writer.WriteField<bool>(ignore_errors);
125747
126581
  writer.WriteField<idx_t>(num_cols);
125748
- writer.WriteField<idx_t>(buffer_size);
126582
+ writer.WriteField<idx_t>(buffer_sample_size);
125749
126583
  writer.WriteString(null_str);
125750
126584
  writer.WriteField<FileCompressionType>(compression);
125751
126585
  // read options
@@ -125777,7 +126611,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
125777
126611
  header = reader.ReadRequired<bool>();
125778
126612
  ignore_errors = reader.ReadRequired<bool>();
125779
126613
  num_cols = reader.ReadRequired<idx_t>();
125780
- buffer_size = reader.ReadRequired<idx_t>();
126614
+ buffer_sample_size = reader.ReadRequired<idx_t>();
125781
126615
  null_str = reader.ReadRequired<string>();
125782
126616
  compression = reader.ReadRequired<FileCompressionType>();
125783
126617
  // read options
@@ -125804,6 +126638,7 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
125804
126638
  writer.WriteField<idx_t>(bind_data.filename_col_idx);
125805
126639
  writer.WriteField<idx_t>(bind_data.hive_partition_col_idx);
125806
126640
  bind_data.options.Serialize(writer);
126641
+ writer.WriteField<bool>(bind_data.single_threaded);
125807
126642
  }
125808
126643
 
125809
126644
  static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,
@@ -125814,27 +126649,31 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
125814
126649
  result_data->filename_col_idx = reader.ReadRequired<idx_t>();
125815
126650
  result_data->hive_partition_col_idx = reader.ReadRequired<idx_t>();
125816
126651
  result_data->options.Deserialize(reader);
126652
+ result_data->single_threaded = reader.ReadField<bool>(true);
125817
126653
  return move(result_data);
125818
126654
  }
125819
126655
 
125820
126656
  TableFunction ReadCSVTableFunction::GetFunction(bool list_parameter) {
125821
126657
  auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
125822
- TableFunction read_csv("read_csv", {parameter}, ReadCSVFunction, ReadCSVBind, ReadCSVInit);
126658
+ TableFunction read_csv("read_csv", {parameter}, ReadCSVFunction, ReadCSVBind, ReadCSVInitGlobal, ReadCSVInitLocal);
125823
126659
  read_csv.table_scan_progress = CSVReaderProgress;
125824
126660
  read_csv.pushdown_complex_filter = CSVComplexFilterPushdown;
125825
126661
  read_csv.serialize = CSVReaderSerialize;
125826
126662
  read_csv.deserialize = CSVReaderDeserialize;
126663
+ read_csv.get_batch_index = CSVReaderGetBatchIndex;
125827
126664
  ReadCSVAddNamedParameters(read_csv);
125828
126665
  return read_csv;
125829
126666
  }
125830
126667
 
125831
126668
  TableFunction ReadCSVTableFunction::GetAutoFunction(bool list_parameter) {
125832
126669
  auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
125833
- TableFunction read_csv_auto("read_csv_auto", {parameter}, ReadCSVFunction, ReadCSVAutoBind, ReadCSVInit);
126670
+ TableFunction read_csv_auto("read_csv_auto", {parameter}, ReadCSVFunction, ReadCSVAutoBind, ReadCSVInitGlobal,
126671
+ ReadCSVInitLocal);
125834
126672
  read_csv_auto.table_scan_progress = CSVReaderProgress;
125835
126673
  read_csv_auto.pushdown_complex_filter = CSVComplexFilterPushdown;
125836
126674
  read_csv_auto.serialize = CSVReaderSerialize;
125837
126675
  read_csv_auto.deserialize = CSVReaderDeserialize;
126676
+ read_csv_auto.get_batch_index = CSVReaderGetBatchIndex;
125838
126677
  ReadCSVAddNamedParameters(read_csv_auto);
125839
126678
  return read_csv_auto;
125840
126679
  }
@@ -136024,6 +136863,14 @@ struct EnableProgressBarSetting {
136024
136863
  static Value GetSetting(ClientContext &context);
136025
136864
  };
136026
136865
 
136866
+ struct ExperimentalParallelCSVSetting {
136867
+ static constexpr const char *Name = "experimental_parallel_csv";
136868
+ static constexpr const char *Description = "Whether or not to use the experimental parallel CSV reader";
136869
+ static constexpr const LogicalTypeId InputType = LogicalTypeId::BOOLEAN;
136870
+ static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
136871
+ static Value GetSetting(ClientContext &context);
136872
+ };
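Once this struct is registered in internal_options below, the reader is toggled like any other global option, e.g. SET experimental_parallel_csv=true; (assumed usage of DuckDB's standard SET syntax).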
136873
+
136027
136874
  struct ExplainOutputSetting {
136028
136875
  static constexpr const char *Name = "explain_output";
136029
136876
  static constexpr const char *Description = "Output of EXPLAIN statements (ALL, OPTIMIZED_ONLY, PHYSICAL_ONLY)";
@@ -136224,6 +137071,7 @@ static ConfigurationOption internal_options[] = {DUCKDB_GLOBAL(AccessModeSetting
136224
137071
  DUCKDB_GLOBAL(EnableObjectCacheSetting),
136225
137072
  DUCKDB_LOCAL(EnableProfilingSetting),
136226
137073
  DUCKDB_LOCAL(EnableProgressBarSetting),
137074
+ DUCKDB_GLOBAL(ExperimentalParallelCSVSetting),
136227
137075
  DUCKDB_LOCAL(ExplainOutputSetting),
136228
137076
  DUCKDB_GLOBAL(ExternalThreadsSetting),
136229
137077
  DUCKDB_LOCAL(FileSearchPathSetting),
@@ -136668,6 +137516,7 @@ public:
136668
137516
 
136669
137517
 
136670
137518
 
137519
+
136671
137520
  namespace duckdb {
136672
137521
 
136673
137522
  Connection::Connection(DatabaseInstance &database) : context(make_shared<ClientContext>(database.shared_from_this())) {
@@ -150518,6 +151367,18 @@ Value EnableProgressBarSetting::GetSetting(ClientContext &context) {
150518
151367
  return Value::BOOLEAN(ClientConfig::GetConfig(context).enable_progress_bar);
150519
151368
  }
150520
151369
 
151370
+ //===--------------------------------------------------------------------===//
151371
+ // Experimental Parallel CSV
151372
+ //===--------------------------------------------------------------------===//
151373
+ void ExperimentalParallelCSVSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
151374
+ config.options.experimental_parallel_csv_reader = input.GetValue<bool>();
151375
+ }
151376
+
151377
+ Value ExperimentalParallelCSVSetting::GetSetting(ClientContext &context) {
151378
+ auto &config = DBConfig::GetConfig(context);
151379
+ return Value::BOOLEAN(config.options.experimental_parallel_csv_reader);
151380
+ }
151381
+
150521
151382
  //===--------------------------------------------------------------------===//
150522
151383
  // Explain Output
150523
151384
  //===--------------------------------------------------------------------===//
@@ -185277,6 +186138,8 @@ BindResult ExpressionBinder::BindExpression(CollateExpression &expr, idx_t depth
185277
186138
  if (child.expr->return_type.id() != LogicalTypeId::VARCHAR) {
185278
186139
  throw BinderException("collations are only supported for type varchar");
185279
186140
  }
186141
+ // Validate the collation, but don't use it
186142
+ PushCollation(context, child.expr->Copy(), expr.collation, false);
185280
186143
  child.expr->return_type = LogicalType::VARCHAR_COLLATION(expr.collation);
185281
186144
  return BindResult(move(child.expr));
185282
186145
  }
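Binding a Copy() of the child through PushCollation exercises the collation lookup purely for its error side effects, so an unknown collation fails at bind time while the expression itself is returned unmodified, carrying the collation only in its VARCHAR_COLLATION type.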