duckdb 0.7.2-dev3294.0 → 0.7.2-dev3353.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@
14
14
  #include "duckdb/main/extension_helper.hpp"
15
15
  #include "duckdb/common/multi_file_reader.hpp"
16
16
  #include "duckdb/main/client_data.hpp"
17
+ #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
17
18
 
18
19
  #include <limits>
19
20
 
@@ -262,19 +263,21 @@ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFun
262
263
  //===--------------------------------------------------------------------===//
263
264
  // Parallel CSV Reader CSV Global State
264
265
  //===--------------------------------------------------------------------===//
265
- //===--------------------------------------------------------------------===//
266
- // Read CSV Global State
267
- //===--------------------------------------------------------------------===//
266
+
268
267
  struct ParallelCSVGlobalState : public GlobalTableFunctionState {
269
268
  public:
270
269
  ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
271
270
  vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
272
- idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p)
271
+ idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p, bool has_header)
273
272
  : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
274
- force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)) {
273
+ force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
274
+ line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
275
275
  file_handle->DisableReset();
276
276
  current_file_path = files_path_p[0];
277
- estimated_linenr = rows_to_skip;
277
+ line_info.lines_read[0] = rows_to_skip;
278
+ if (has_header) {
279
+ line_info.lines_read[0]++;
280
+ }
278
281
  file_size = file_handle->FileSize();
279
282
  first_file_size = file_size;
280
283
  bytes_read = 0;
@@ -296,8 +299,16 @@ public:
296
299
  next_buffer = shared_ptr<CSVBuffer>(
297
300
  current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
298
301
  running_threads = MaxThreads();
302
+
303
+ // Initialize all the book-keeping variables
304
+ auto file_count = files_path_p.size();
305
+ line_info.current_batches.resize(file_count);
306
+ tuple_start.resize(file_count);
307
+ tuple_end.resize(file_count);
308
+ tuple_end_to_batch.resize(file_count);
309
+ batch_to_tuple_end.resize(file_count);
299
310
  }
300
- ParallelCSVGlobalState() {
311
+ ParallelCSVGlobalState() : line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
301
312
  running_threads = MaxThreads();
302
313
  }
303
314
 
@@ -316,7 +327,9 @@ public:
316
327
  //! Verify if the CSV File was read correctly
317
328
  void Verify();
318
329
 
319
- void UpdateVerification(VerificationPositions positions, idx_t file_number);
330
+ void UpdateVerification(VerificationPositions positions, idx_t file_number, idx_t batch_idx);
331
+
332
+ void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
320
333
 
321
334
  void IncrementThread();
322
335
 
@@ -354,8 +367,6 @@ private:
354
367
  mutex main_mutex;
355
368
  //! Byte set from for last thread
356
369
  idx_t next_byte = 0;
357
- //! The current estimated line number
358
- idx_t estimated_linenr;
359
370
  //! How many bytes we should execute per local state
360
371
  idx_t bytes_per_local_state;
361
372
  //! Size of first file
@@ -366,6 +377,8 @@ private:
366
377
  idx_t buffer_size;
367
378
  //! Current batch index
368
379
  idx_t batch_index = 0;
380
+ idx_t local_batch_index = 0;
381
+
369
382
  //! Forces parallelism for small CSV Files, should only be used for testing.
370
383
  bool force_parallelism = false;
371
384
  //! Current (Global) position of CSV
@@ -379,9 +392,15 @@ private:
379
392
  //! positions where they started reading the first line.
380
393
  vector<vector<idx_t>> tuple_end;
381
394
  vector<set<idx_t>> tuple_start;
395
+ //! Tuple end to batch
396
+ vector<unordered_map<idx_t, idx_t>> tuple_end_to_batch;
397
+ //! Batch to Tuple End
398
+ vector<unordered_map<idx_t, idx_t>> batch_to_tuple_end;
382
399
  idx_t running_threads = 0;
383
400
  //! The column ids to read
384
401
  vector<column_t> column_ids;
402
+ //! Line Info used in error messages
403
+ LineInfo line_info;
385
404
  };
386
405
 
387
406
  idx_t ParallelCSVGlobalState::MaxThreads() const {
@@ -425,32 +444,61 @@ void ParallelCSVGlobalState::Verify() {
425
444
  return;
426
445
  }
427
446
  auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
428
- for (auto &last_pos : current_tuple_end) {
447
+ for (idx_t tpl_idx = 0; tpl_idx < current_tuple_end.size(); tpl_idx++) {
448
+ auto last_pos = current_tuple_end[tpl_idx];
429
449
  auto first_pos = current_tuple_start.find(last_pos);
430
450
  if (first_pos == current_tuple_start.end()) {
431
451
  // this might be necessary due to carriage returns outside buffer scopes.
432
452
  first_pos = current_tuple_start.find(last_pos + 1);
433
453
  }
434
454
  if (first_pos == current_tuple_start.end() && last_pos != max_value) {
435
- string error =
436
- "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
437
- " does not have a match\n";
438
- error += "End Lines: \n";
439
- for (auto &end_line : current_tuple_end) {
440
- error += to_string(end_line) + "\n";
441
- }
442
- error += "Start Lines: \n";
443
- for (auto &start_line : current_tuple_start) {
444
- error += to_string(start_line) + "\n";
445
- }
455
+ auto batch_idx = tuple_end_to_batch[i][last_pos];
456
+ auto problematic_line = line_info.GetLine(batch_idx);
446
457
  throw InvalidInputException(
447
- "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
458
+ "CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
459
+ "that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
460
+ "please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
461
+ problematic_line);
448
462
  }
449
463
  }
450
464
  }
451
465
  }
452
466
  }
453
467
 
468
+ void LineInfo::Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos) {
469
+ auto &tuple_start_set = tuple_start[file_idx];
470
+ auto &processed_batches = batch_to_tuple_end[file_idx];
471
+ auto &tuple_end_vec = tuple_end[file_idx];
472
+ bool has_error = false;
473
+ idx_t problematic_line;
474
+ if (batch_idx == 0 || tuple_start_set.empty()) {
475
+ return;
476
+ }
477
+ for (idx_t cur_batch = 0; cur_batch < batch_idx - 1; cur_batch++) {
478
+ auto cur_end = tuple_end_vec[processed_batches[cur_batch]];
479
+ auto first_pos = tuple_start_set.find(cur_end);
480
+ if (first_pos == tuple_start_set.end()) {
481
+ has_error = true;
482
+ problematic_line = GetLine(cur_batch);
483
+ break;
484
+ }
485
+ }
486
+ if (!has_error) {
487
+ auto cur_end = tuple_end_vec[processed_batches[batch_idx - 1]];
488
+ if (cur_end != cur_first_pos) {
489
+ has_error = true;
490
+ problematic_line = GetLine(batch_idx);
491
+ }
492
+ }
493
+ if (has_error) {
494
+ throw InvalidInputException(
495
+ "CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
496
+ "that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
497
+ "please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
498
+ problematic_line);
499
+ }
500
+ }
501
+
454
502
  bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bind_data,
455
503
  unique_ptr<ParallelCSVReader> &reader) {
456
504
  lock_guard<mutex> parallel_lock(main_mutex);
@@ -461,6 +509,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
461
509
  file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
462
510
  current_csv_position = 0;
463
511
  file_number++;
512
+ local_batch_index = 0;
464
513
  current_buffer =
465
514
  make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
466
515
  next_buffer = shared_ptr<CSVBuffer>(
@@ -472,11 +521,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
472
521
  }
473
522
  }
474
523
  // set up the current buffer
524
+ line_info.current_batches.back().insert(local_batch_index);
475
525
  auto result = make_uniq<CSVBufferRead>(current_buffer, next_buffer, next_byte, next_byte + bytes_per_local_state,
476
- batch_index++, estimated_linenr);
526
+ batch_index++, local_batch_index++, &line_info);
477
527
  // move the byte index of the CSV reader to the next buffer
478
528
  next_byte += bytes_per_local_state;
479
- estimated_linenr += bytes_per_local_state / (bind_data.csv_types.size() * 5); // estimate 5 bytes per column
480
529
  if (next_byte >= current_buffer->GetBufferSize()) {
481
530
  // We replace the current buffer with the next buffer
482
531
  next_byte = 0;
@@ -495,17 +544,17 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
495
544
  // we are doing UNION BY NAME - fetch the options from the union reader for this file
496
545
  auto &union_reader = *bind_data.union_readers[file_index - 1];
497
546
  reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
498
- union_reader.GetTypes());
547
+ union_reader.GetTypes(), file_index - 1);
499
548
  reader->names = union_reader.GetNames();
500
549
  } else if (file_index <= bind_data.column_info.size()) {
501
550
  // Serialized Union By name
502
551
  reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
503
- bind_data.column_info[file_index - 1].types);
552
+ bind_data.column_info[file_index - 1].types, file_index - 1);
504
553
  reader->names = bind_data.column_info[file_index - 1].names;
505
554
  } else {
506
555
  // regular file - use the standard options
507
556
  reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
508
- bind_data.csv_types);
557
+ bind_data.csv_types, file_index - 1);
509
558
  reader->names = bind_data.csv_names;
510
559
  }
511
560
  reader->options.file_path = current_file_path;
@@ -518,23 +567,64 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
518
567
  }
519
568
  return true;
520
569
  }
521
- void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
570
+ void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p, idx_t batch_idx) {
522
571
  lock_guard<mutex> parallel_lock(main_mutex);
523
572
  if (positions.beginning_of_first_line < positions.end_of_last_line) {
524
573
  if (positions.end_of_last_line > max_tuple_end) {
525
574
  max_tuple_end = positions.end_of_last_line;
526
575
  }
527
- while (file_number_p >= tuple_start.size()) {
528
- vector<idx_t> empty_tuple_end;
529
- set<idx_t> empty_set;
530
- tuple_start.emplace_back(empty_set);
531
- tuple_end.emplace_back(empty_tuple_end);
532
- }
576
+ tuple_end_to_batch[file_number_p][positions.end_of_last_line] = batch_idx;
577
+ batch_to_tuple_end[file_number_p][batch_idx] = tuple_end[file_number_p].size();
533
578
  tuple_start[file_number_p].insert(positions.beginning_of_first_line);
534
579
  tuple_end[file_number_p].push_back(positions.end_of_last_line);
535
580
  }
536
581
  }
537
582
 
583
+ void ParallelCSVGlobalState::UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx) {
584
+ auto batch_idx = buffer_read.local_batch_index;
585
+ auto lines_read = buffer_read.lines_read;
586
+ lock_guard<mutex> parallel_lock(main_mutex);
587
+ line_info.current_batches[file_idx].erase(batch_idx);
588
+ line_info.lines_read[batch_idx] += lines_read;
589
+ }
590
+
591
+ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
592
+ lock_guard<mutex> parallel_lock(main_mutex);
593
+ if (current_batches.empty() || done) {
594
+ return true;
595
+ }
596
+ auto min_value = *current_batches[file_idx].begin();
597
+ if (min_value >= batch_idx) {
598
+ return true;
599
+ }
600
+ return false;
601
+ }
602
+
603
+ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify) {
604
+ unique_ptr<lock_guard<mutex>> parallel_lock;
605
+ if (!verify) {
606
+ parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
607
+ }
608
+ idx_t line_count = 0;
609
+ if (done) {
610
+ return first_line;
611
+ }
612
+ for (idx_t i = 0; i <= batch_idx; i++) {
613
+ if (lines_read.find(i) == lines_read.end() && i != batch_idx) {
614
+ throw InternalException("Missing batch index on Parallel CSV Reader GetLine");
615
+ }
616
+ line_count += lines_read[i];
617
+ }
618
+
619
+ // before we are done, if this is not a call in Verify() we must check Verify up to this batch
620
+ if (!verify) {
621
+ Verify(file_idx, batch_idx, cur_start);
622
+ }
623
+ done = true;
624
+ first_line = line_count + line_error;
625
+ return first_line;
626
+ }
627
+
538
628
  static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
539
629
  TableFunctionInitInput &input) {
540
630
  auto &bind_data = (ReadCSVData &)*input.bind_data;
@@ -548,7 +638,8 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
548
638
  file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
549
639
  return make_uniq<ParallelCSVGlobalState>(
550
640
  context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
551
- bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
641
+ bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids,
642
+ bind_data.options.header && bind_data.options.has_header);
552
643
  }
553
644
 
554
645
  //===--------------------------------------------------------------------===//
@@ -597,9 +688,15 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
597
688
  auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
598
689
  if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
599
690
  csv_global_state.UpdateVerification(verification_updates,
600
- csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
691
+ csv_local_state.csv_reader->buffer->buffer->GetFileNumber(),
692
+ csv_local_state.csv_reader->buffer->local_batch_index);
601
693
  }
694
+ csv_global_state.UpdateLinesRead(*csv_local_state.csv_reader->buffer, csv_local_state.csv_reader->file_idx);
602
695
  auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
696
+ if (csv_local_state.csv_reader) {
697
+ csv_local_state.csv_reader->linenr = 0;
698
+ }
699
+
603
700
  if (!has_next) {
604
701
  csv_global_state.DecrementThread();
605
702
  break;
@@ -945,6 +1042,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
945
1042
  writer.WriteString(file_path);
946
1043
  writer.WriteString(decimal_separator);
947
1044
  writer.WriteField<bool>(null_padding);
1045
+ writer.WriteField<idx_t>(buffer_size);
948
1046
  writer.WriteSerializable(file_options);
949
1047
  // write options
950
1048
  writer.WriteListNoReference<bool>(force_quote);
@@ -980,6 +1078,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
980
1078
  file_path = reader.ReadRequired<string>();
981
1079
  decimal_separator = reader.ReadRequired<string>();
982
1080
  null_padding = reader.ReadRequired<bool>();
1081
+ buffer_size = reader.ReadRequired<idx_t>();
983
1082
  file_options = reader.ReadRequiredSerializable<MultiFileReaderOptions, MultiFileReaderOptions>();
984
1083
  // write options
985
1084
  force_quote = reader.ReadRequiredList<bool>();
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.7.2-dev3294"
2
+ #define DUCKDB_VERSION "0.7.2-dev3353"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "f8eae1c85d"
5
+ #define DUCKDB_SOURCE_ID "ec0ca94cdf"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -17,6 +17,8 @@
17
17
  #include "duckdb/common/queue.hpp"
18
18
  #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
19
19
  #include "duckdb/common/multi_file_reader.hpp"
20
+ #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
21
+
20
22
  #include <sstream>
21
23
 
22
24
  namespace duckdb {
@@ -74,6 +76,10 @@ public:
74
76
  const vector<LogicalType> &GetTypes() {
75
77
  return return_types;
76
78
  }
79
+ virtual idx_t GetLineError(idx_t line_error, idx_t buffer_idx) {
80
+ return line_error;
81
+ };
82
+
77
83
  //! Initialize projection indices to select all columns
78
84
  void InitializeProjection();
79
85
 
@@ -88,17 +94,18 @@ protected:
88
94
  bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
89
95
 
90
96
  //! Adds a value to the current row
91
- void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
97
+ void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
98
+ idx_t buffer_idx = 0);
92
99
  //! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
93
- bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message);
100
+ bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx = 0);
94
101
  //! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
95
- bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
102
+ bool Flush(DataChunk &insert_chunk, idx_t buffer_idx = 0, bool try_add_line = false);
96
103
 
97
104
  unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
98
105
 
99
106
  void VerifyUTF8(idx_t col_idx);
100
107
  void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
101
- static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
108
+ string GetLineNumberStr(idx_t linenr, bool linenr_estimated, idx_t buffer_idx = 0);
102
109
 
103
110
  //! Sets the newline delimiter
104
111
  void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);
@@ -57,7 +57,8 @@ public:
57
57
  const vector<LogicalType> &requested_types = vector<LogicalType>());
58
58
  BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
59
59
  const vector<LogicalType> &requested_types = vector<LogicalType>());
60
- ~BufferedCSVReader();
60
+ virtual ~BufferedCSVReader() {
61
+ }
61
62
 
62
63
  unique_ptr<char[]> buffer;
63
64
  idx_t buffer_size;
@@ -0,0 +1,40 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/persistent/csv_line_info.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ namespace duckdb {
12
+ struct LineInfo {
13
+ public:
14
+ explicit LineInfo(mutex &main_mutex_p, vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end_p,
15
+ vector<set<idx_t>> &tuple_start_p, vector<vector<idx_t>> &tuple_end_p)
16
+ : main_mutex(main_mutex_p), batch_to_tuple_end(batch_to_tuple_end_p), tuple_start(tuple_start_p),
17
+ tuple_end(tuple_end_p) {};
18
+ bool CanItGetLine(idx_t file_idx, idx_t batch_idx);
19
+
20
+ idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true);
21
+ //! Verify if the CSV File was read correctly from [0,batch_idx] batches.
22
+ void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos);
23
+ //! Lines read per batch, <batch_index,count>
24
+ unordered_map<idx_t, idx_t> lines_read;
25
+ //! Set of batches that have been initialized but are not yet finished.
26
+ vector<set<idx_t>> current_batches;
27
+ //! Pointer to CSV Reader Mutex
28
+ mutex &main_mutex;
29
+ //! Pointer Batch to Tuple End
30
+ vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end;
31
+ //! Pointer Batch to Tuple Start
32
+ vector<set<idx_t>> &tuple_start;
33
+ //! Pointer Batch to Tuple End
34
+ vector<vector<idx_t>> &tuple_end;
35
+ //! If we already threw an exception on a previous thread.
36
+ bool done = false;
37
+ idx_t first_line = 0;
38
+ };
39
+
40
+ } // namespace duckdb
@@ -1,7 +1,7 @@
1
1
  //===----------------------------------------------------------------------===//
2
2
  // DuckDB
3
3
  //
4
- // duckdb/execution/operator/persistent/buffered_csv_reader.hpp
4
+ // duckdb/execution/operator/persistent/parallel_csv_reader.hpp
5
5
  //
6
6
  //
7
7
  //===----------------------------------------------------------------------===//
@@ -12,6 +12,7 @@
12
12
  #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
13
13
  #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
14
14
  #include "duckdb/execution/operator/persistent/csv_buffer.hpp"
15
+ #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
15
16
 
16
17
  #include <sstream>
17
18
  #include <utility>
@@ -20,9 +21,9 @@ namespace duckdb {
20
21
 
21
22
  struct CSVBufferRead {
22
23
  CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
23
- idx_t estimated_linenr)
24
- : buffer(std::move(buffer_p)), buffer_start(buffer_start_p), buffer_end(buffer_end_p), batch_index(batch_index),
25
- estimated_linenr(estimated_linenr) {
24
+ idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
25
+ : buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
26
+ batch_index(batch_index), local_batch_index(local_batch_index_p) {
26
27
  if (buffer) {
27
28
  if (buffer_end > buffer->GetBufferSize()) {
28
29
  buffer_end = buffer->GetBufferSize();
@@ -34,8 +35,9 @@ struct CSVBufferRead {
34
35
  }
35
36
 
36
37
  CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
37
- idx_t buffer_end_p, idx_t batch_index, idx_t estimated_linenr)
38
- : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, estimated_linenr) {
38
+ idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
39
+ : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
40
+ line_info_p) {
39
41
  next_buffer = std::move(nxt_buffer_p);
40
42
  }
41
43
 
@@ -84,23 +86,27 @@ struct CSVBufferRead {
84
86
  shared_ptr<CSVBuffer> buffer;
85
87
  shared_ptr<CSVBuffer> next_buffer;
86
88
  vector<unique_ptr<char[]>> intersections;
89
+ optional_ptr<LineInfo> line_info;
87
90
 
88
91
  idx_t buffer_start;
89
92
  idx_t buffer_end;
90
93
  idx_t batch_index;
91
- idx_t estimated_linenr;
94
+ idx_t local_batch_index;
95
+ idx_t lines_read = 0;
92
96
  };
93
97
 
94
98
  struct VerificationPositions {
95
99
  idx_t beginning_of_first_line = 0;
96
100
  idx_t end_of_last_line = 0;
97
101
  };
98
- //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
102
+
103
+ //! CSV Reader for Parallel Reading
99
104
  class ParallelCSVReader : public BaseCSVReader {
100
105
  public:
101
106
  ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
102
- idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
103
- ~ParallelCSVReader();
107
+ idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
108
+ virtual ~ParallelCSVReader() {
109
+ }
104
110
 
105
111
  //! Current Position (Relative to the Buffer)
106
112
  idx_t position_buffer = 0;
@@ -118,13 +124,21 @@ public:
118
124
  bool finished = false;
119
125
 
120
126
  unique_ptr<CSVBufferRead> buffer;
127
+
128
+ idx_t file_idx;
129
+
121
130
  VerificationPositions GetVerificationPositions();
122
131
 
132
+ //! Position of the first read line and last read line for verification purposes
133
+ VerificationPositions verification_positions;
134
+
123
135
  public:
124
136
  void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
125
137
  //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
126
138
  void ParseCSV(DataChunk &insert_chunk);
127
139
 
140
+ idx_t GetLineError(idx_t line_error, idx_t buffer_idx) override;
141
+
128
142
  private:
129
143
  //! Initialize Parser
130
144
  void Initialize(const vector<LogicalType> &requested_types);
@@ -135,7 +149,7 @@ private:
135
149
  //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
136
150
  bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
137
151
  //! Sets Position depending on the byte_start of this thread
138
- bool SetPosition(DataChunk &insert_chunk);
152
+ bool SetPosition();
139
153
  //! Called when scanning the 1st buffer, skips empty lines
140
154
  void SkipEmptyLines();
141
155
  //! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
@@ -148,8 +162,9 @@ private:
148
162
 
149
163
  //! Parses a CSV file with a one-byte delimiter, escape and quote character
150
164
  bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
151
- //! Position of the first read line and last read line for verification purposes
152
- VerificationPositions verification_positions;
165
+ //! Verifies that the line length did not go over a pre-defined limit.
166
+ void VerifyLineLength(idx_t line_size);
167
+
153
168
  //! First Position of First Buffer
154
169
  idx_t first_pos_first_buffer = 0;
155
170
  };
@@ -226,19 +226,20 @@ bool ExtensionHelper::IsFullPath(const string &extension) {
226
226
  StringUtil::Contains(extension, "\\");
227
227
  }
228
228
 
229
- string ExtensionHelper::GetExtensionName(const string &extension) {
229
+ string ExtensionHelper::GetExtensionName(const string &original_name) {
230
+ auto extension = StringUtil::Lower(original_name);
230
231
  if (!IsFullPath(extension)) {
231
- return extension;
232
+ return ExtensionHelper::ApplyExtensionAlias(extension);
232
233
  }
233
234
  auto splits = StringUtil::Split(StringUtil::Replace(extension, "\\", "/"), '/');
234
235
  if (splits.empty()) {
235
- return extension;
236
+ return ExtensionHelper::ApplyExtensionAlias(extension);
236
237
  }
237
238
  splits = StringUtil::Split(splits.back(), '.');
238
239
  if (splits.empty()) {
239
- return extension;
240
+ return ExtensionHelper::ApplyExtensionAlias(extension);
240
241
  }
241
- return StringUtil::Lower(splits.front());
242
+ return ExtensionHelper::ApplyExtensionAlias(splits.front());
242
243
  }
243
244
 
244
245
  void ExtensionHelper::LoadExternalExtension(DatabaseInstance &db, FileOpener *opener, const string &extension) {
@@ -55,6 +55,9 @@ unique_ptr<Expression> MoveConstantsRule::Apply(LogicalOperator &op, vector<refe
55
55
  }
56
56
  auto result_value = Value::HUGEINT(outer_value);
57
57
  if (!result_value.DefaultTryCastAs(constant_type)) {
58
+ if (comparison.type != ExpressionType::COMPARE_EQUAL) {
59
+ return nullptr;
60
+ }
58
61
  // if the cast is not possible then the comparison is not possible
59
62
  // for example, if we have x + 5 = 3, where x is an unsigned number, we will get x = -2
60
63
  // since this is not possible we can remove the entire branch here
@@ -140,7 +140,7 @@ void StringStats::Update(BaseStatistics &stats, const string_t &value) {
140
140
  if (unicode == UnicodeType::UNICODE) {
141
141
  string_data.has_unicode = true;
142
142
  } else if (unicode == UnicodeType::INVALID) {
143
- throw InternalException(
143
+ throw InvalidInputException(
144
144
  ErrorManager::InvalidUnicodeError(string((char *)data, size), "segment statistics update"));
145
145
  }
146
146
  }