duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (23):
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  3. package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
  4. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
  5. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
  6. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
  7. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
  8. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
  9. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  10. package/src/duckdb/src/function/table/read_csv.cpp +124 -58
  11. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  12. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
  13. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
  14. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
  15. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
  16. package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
  17. package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
  18. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
  19. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
  20. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
  21. package/src/duckdb/src/main/settings/settings.cpp +3 -4
  22. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
  23. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp
@@ -25,8 +25,9 @@
 namespace duckdb {
 
 ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
-                                     unique_ptr<CSVBufferRead> buffer_p, const vector<LogicalType> &requested_types)
-    : BaseCSVReader(context, std::move(options_p), requested_types) {
+                                     unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
+                                     const vector<LogicalType> &requested_types)
+    : BaseCSVReader(context, std::move(options_p), requested_types), first_pos_first_buffer(first_pos_first_buffer_p) {
 	Initialize(requested_types);
 	SetBufferRead(std::move(buffer_p));
 	if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
@@ -52,9 +53,34 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
 
+void ParallelCSVReader::SkipEmptyLines() {
+	idx_t new_pos_buffer = position_buffer;
+	if (parse_chunk.data.size() == 1) {
+		// Empty lines are null data.
+		return;
+	}
+	for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
+		if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
+			bool carrier_return = (*buffer)[new_pos_buffer] == '\r';
+			new_pos_buffer++;
+			if (carrier_return && new_pos_buffer < buffer_size && (*buffer)[new_pos_buffer] == '\n') {
+				position_buffer++;
+			}
+			if (new_pos_buffer > end_buffer) {
+				return;
+			}
+			position_buffer = new_pos_buffer;
+		} else if ((*buffer)[new_pos_buffer] != ' ') {
+			return;
+		}
+	}
+}
+
 bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
-	    start_buffer == buffer->buffer->GetStart()) {
+	    start_buffer == first_pos_first_buffer) {
+		start_buffer = buffer->buffer->GetStart();
+		position_buffer = start_buffer;
 		verification_positions.beginning_of_first_line = position_buffer;
 		verification_positions.end_of_last_line = position_buffer;
 		// First buffer doesn't need any setting
@@ -70,11 +96,23 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 				if (position_buffer > end_buffer) {
 					return false;
 				}
+				SkipEmptyLines();
+				if (verification_positions.beginning_of_first_line == 0) {
+					verification_positions.beginning_of_first_line = position_buffer;
+				}
+
+				verification_positions.end_of_last_line = position_buffer;
 				return true;
 			}
 		}
 		return false;
 	}
+	SkipEmptyLines();
+	if (verification_positions.beginning_of_first_line == 0) {
+		verification_positions.beginning_of_first_line = position_buffer;
+	}
+
+	verification_positions.end_of_last_line = position_buffer;
 	return true;
 }
 
@@ -102,6 +140,11 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 			}
 		}
 	}
+	SkipEmptyLines();
+
+	if (position_buffer > buffer_size) {
+		break;
+	}
 
 	if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
 		break;
@@ -113,18 +156,20 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	}
 	idx_t position_set = position_buffer;
 	start_buffer = position_buffer;
-
 	// We check if we can add this line
 	// disable the projection pushdown while reading the first line
 	// otherwise the first line parsing can be influenced by which columns we are reading
 	auto column_ids = std::move(reader_data.column_ids);
 	auto column_mapping = std::move(reader_data.column_mapping);
 	InitializeProjection();
-	successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	try {
+		successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	} catch (...) {
+		successfully_read_first_line = false;
+	}
 	// restore the projection pushdown
 	reader_data.column_ids = std::move(column_ids);
 	reader_data.column_mapping = std::move(column_mapping);
-
 	end_buffer = end_buffer_real;
 	start_buffer = position_set;
 	if (position_buffer >= end_buffer) {
@@ -190,27 +235,55 @@ bool ParallelCSVReader::BufferRemainder() {
 	return true;
 }
 
+void VerifyLineLength(idx_t line_size, idx_t max_line_size) {
+	if (line_size > max_line_size) {
+		// FIXME: this should also output the correct estimated linenumber where it broke
+		throw InvalidInputException("Maximum line size of %llu bytes exceeded!", max_line_size);
+	}
+}
+
+bool AllNewLine(string_t value, idx_t column_amount) {
+	auto value_str = value.GetString();
+	if (value_str.empty() && column_amount == 1) {
+		// This is a one column (empty)
+		return false;
+	}
+	for (idx_t i = 0; i < value.GetSize(); i++) {
+		if (!StringUtil::CharacterIsNewline(value_str[i])) {
+			return false;
+		}
+	}
+	return true;
+}
+
 bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
 	// used for parsing algorithm
+	if (start_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
 	D_ASSERT(end_buffer <= buffer_size);
 	bool finished_chunk = false;
 	idx_t column = 0;
 	idx_t offset = 0;
 	bool has_quotes = false;
+
 	vector<idx_t> escape_positions;
 	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
 		if (!SetPosition(insert_chunk)) {
-			// This means the buffer size does not contain a new line
-			if (position_buffer - start_buffer == options.buffer_size) {
-				error_message = "Line does not fit in one buffer. Increase the buffer size.";
-				return false;
-			}
 			finished = true;
 			return true;
 		}
 	}
-
+	if (position_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
+	// Keep track of line size
+	idx_t line_start = position_buffer;
 	// start parsing the first value
 	goto value_start;
 
@@ -242,11 +315,16 @@ normal : {
 		if (c == options.delimiter[0]) {
 			// delimiter: end the value and add it to the chunk
 			goto add_value;
+		} else if (c == options.quote[0] && try_add_line) {
+			return false;
 		} else if (StringUtil::CharacterIsNewline(c)) {
 			// newline: add row
-			if (column > 0 || try_add_line || insert_chunk.data.size() == 1) {
+			if (column > 0 || try_add_line || parse_chunk.data.size() == 1) {
 				goto add_row;
 			}
+			if (column == 0 && position_buffer == start_buffer) {
+				start_buffer++;
+			}
 		}
 	}
 	if (!BufferRemainder()) {
@@ -285,12 +363,15 @@ add_row : {
 		parse_chunk.Reset();
 		return success;
 	} else {
+		VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+		line_start = position_buffer;
 		finished_chunk = AddRow(insert_chunk, column, error_message);
 	}
 	// increase position by 1 and move start to the new position
 	offset = 0;
 	has_quotes = false;
-	start_buffer = ++position_buffer;
+	position_buffer++;
+	start_buffer = position_buffer;
 	verification_positions.end_of_last_line = position_buffer;
 	if (reached_remainder_state) {
 		goto final_state;
@@ -309,7 +390,10 @@ add_row : {
 		// newline after carriage return: skip
 		// increase position by 1 and move start to the new position
 		start_buffer = ++position_buffer;
+
+		SkipEmptyLines();
 		verification_positions.end_of_last_line = position_buffer;
+		start_buffer = position_buffer;
 		if (reached_remainder_state) {
 			goto final_state;
 		}
@@ -331,6 +415,9 @@ add_row : {
 		error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
 		return false;
 	}
+	SkipEmptyLines();
+	verification_positions.end_of_last_line = position_buffer;
+	start_buffer = position_buffer;
 	// \n newline, move to value start
 	if (finished_chunk) {
 		goto final_state;
@@ -391,7 +478,7 @@ unquote : {
 	} else if (StringUtil::CharacterIsNewline(c)) {
 		offset = 1;
 		// FIXME: should this be an assertion?
-		D_ASSERT(column == parse_chunk.ColumnCount() - 1);
+		D_ASSERT(try_add_line || (!try_add_line && column == parse_chunk.ColumnCount() - 1));
 		goto add_row;
 	} else if (position_buffer >= end_buffer) {
 		// reached end of buffer
@@ -448,22 +535,27 @@ final_state : {
 	}
 	// If this is the last buffer, we have to read the last value
 	if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
-		if (column > 0 || try_add_line || (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
+		if (column > 0 || start_buffer != position_buffer || try_add_line ||
+		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk
 			auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
-			AddValue(str_value, column, escape_positions, has_quotes);
-			if (try_add_line) {
-				bool success = column == return_types.size();
-				if (success) {
+			if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) {
+				AddValue(str_value, column, escape_positions, has_quotes);
+				if (try_add_line) {
+					bool success = column == return_types.size();
+					if (success) {
+						AddRow(insert_chunk, column, error_message);
+						success = Flush(insert_chunk);
+					}
+					parse_chunk.Reset();
+					reached_remainder_state = false;
+					return success;
+				} else {
+					VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+					line_start = position_buffer;
 					AddRow(insert_chunk, column, error_message);
-					success = Flush(insert_chunk);
+					verification_positions.end_of_last_line = position_buffer;
 				}
-				parse_chunk.Reset();
-				reached_remainder_state = false;
-				return success;
-			} else {
-				AddRow(insert_chunk, column, error_message);
-				verification_positions.end_of_last_line = position_buffer;
 			}
 		}
 	}
@@ -471,11 +563,14 @@ final_state : {
 	if (mode == ParserMode::PARSING) {
 		Flush(insert_chunk);
 	}
-	if (position_buffer != verification_positions.end_of_last_line &&
-	    !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+	if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
 		error_message = "Line does not fit in one buffer. Increase the buffer size.";
 		return false;
 	}
+	end_buffer = buffer_size;
+	SkipEmptyLines();
+	end_buffer = buffer->buffer_end;
+	verification_positions.end_of_last_line = position_buffer;
 	if (position_buffer >= end_buffer) {
 		if (position_buffer >= end_buffer) {
 			if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
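
Note (not part of the diff): the empty-line handling added above is easier to follow in isolation. The standalone C++ sketch below mirrors the skipping logic of ParallelCSVReader::SkipEmptyLines on a plain std::string; the names and the driver are illustrative, only the \r\n pairing and the space rule come from the code above.

#include <iostream>
#include <string>

static bool IsNewline(char c) {
	return c == '\n' || c == '\r';
}

// Advance pos past consecutive empty lines, handling \r\n pairs; pos only
// moves once a full newline has been consumed, as in SkipEmptyLines above.
static size_t SkipEmptyLinesSketch(const std::string &buf, size_t pos) {
	size_t cur = pos;
	while (cur < buf.size()) {
		if (IsNewline(buf[cur])) {
			bool carriage_return = buf[cur] == '\r';
			cur++;
			if (carriage_return && cur < buf.size() && buf[cur] == '\n') {
				cur++;
			}
			pos = cur;
		} else if (buf[cur] != ' ') {
			break; // first byte of real data
		} else {
			cur++;
		}
	}
	return pos;
}

int main() {
	std::string buf = "\r\n\n  \na,b,c\n";
	std::cout << SkipEmptyLinesSketch(buf, 0) << "\n"; // prints 6: the index of 'a'
}
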

package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp
@@ -103,7 +103,7 @@ idx_t PhysicalTableScan::GetBatchIndex(ExecutionContext &context, DataChunk &chu
 }
 
 string PhysicalTableScan::GetName() const {
-	return StringUtil::Upper(function.name);
+	return StringUtil::Upper(function.name + " " + function.extra_info);
 }
 
 string PhysicalTableScan::ParamsToString() const {

package/src/duckdb/src/function/table/read_csv.cpp
@@ -12,6 +12,7 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/main/client_data.hpp"
 
 #include <limits>
 
@@ -23,21 +24,22 @@ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const string &file_path, FileCompress
 	auto opener = FileSystem::GetFileOpener(context);
 	auto file_handle =
 	    fs.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression, opener);
+	if (file_handle->CanSeek()) {
+		file_handle->Reset();
+	}
 	return make_uniq<CSVFileHandle>(std::move(file_handle));
 }
 
 void ReadCSVData::FinalizeRead(ClientContext &context) {
 	BaseCSVData::Finalize();
-	auto &config = DBConfig::GetConfig(context);
-	single_threaded = !config.options.experimental_parallel_csv_reader;
-	if (options.has_parallel) {
-		// Override the option set in the config
-		single_threaded = !options.use_parallel;
-	}
+	// Here we identify if we can run this CSV file on parallel or not.
 	bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
 	                     options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
 	bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
-	if (null_or_empty || complex_options || options.new_line == NewLineIdentifier::MIX) {
+	bool not_supported_options = options.null_padding;
+
+	if (!options.run_parallel || null_or_empty || not_supported_options || complex_options ||
+	    options.new_line == NewLineIdentifier::MIX) {
 		// not supported for parallel CSV reading
 		single_threaded = true;
 	}
@@ -175,6 +177,8 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 			options.all_varchar = BooleanValue::Get(kv.second);
 		} else if (loption == "normalize_names") {
 			options.normalize_names = BooleanValue::Get(kv.second);
+		} else if (loption == "parallel") {
+			options.run_parallel = BooleanValue::Get(kv.second);
 		} else {
 			options.SetReadOption(loption, kv.second, names);
 		}
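
Note (not part of the diff): the new `parallel` branch above wires a per-query reader option into the standard read_csv option handling. Assuming DuckDB's usual named-parameter syntax for CSV options (an inference from the option name, not shown in this diff), a call such as SELECT * FROM read_csv_auto('data.csv', parallel = false); would set options.run_parallel = false, and FinalizeRead above would then force the single-threaded reader.
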
@@ -214,6 +218,13 @@
 	if (options.file_options.union_by_name) {
 		result->reader_bind =
 		    MultiFileReader::BindUnionReader<BufferedCSVReader>(context, return_types, names, *result, options);
+		if (result->union_readers.size() > 1) {
+			result->column_info.emplace_back(result->csv_names, result->csv_types);
+			for (idx_t i = 1; i < result->union_readers.size(); i++) {
+				result->column_info.emplace_back(result->union_readers[i]->names,
+				                                 result->union_readers[i]->return_types);
+			}
+		}
 		if (!options.sql_types_per_column.empty()) {
 			auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
 			if (!exception.empty()) {
@@ -253,17 +264,27 @@ public:
 		file_size = file_handle->FileSize();
 		first_file_size = file_size;
 		bytes_read = 0;
-		if (buffer_size < file_size) {
+		if (buffer_size < file_size || file_size == 0) {
 			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
 		} else {
 			bytes_per_local_state = file_size / MaxThreads();
 		}
-		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
-		next_buffer =
-		    shared_ptr<CSVBuffer>(current_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+		if (bytes_per_local_state == 0) {
+			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
+			// this boy needs to be at least one.
+			bytes_per_local_state = 1;
+		}
+		for (idx_t i = 0; i < rows_to_skip; i++) {
+			file_handle->ReadLine();
+		}
+		first_position = current_csv_position;
+		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+		next_buffer = shared_ptr<CSVBuffer>(
+		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		running_threads = MaxThreads();
 	}
 	ParallelCSVGlobalState() {
+		running_threads = MaxThreads();
 	}
 
 	~ParallelCSVGlobalState() override {
@@ -281,7 +302,7 @@ public:
 	//! Verify if the CSV File was read correctly
 	void Verify();
 
-	void UpdateVerification(VerificationPositions positions);
+	void UpdateVerification(VerificationPositions positions, idx_t file_number);
 
 	void IncrementThread();
 
@@ -332,14 +353,18 @@ private:
 	//! Current batch index
 	idx_t batch_index = 0;
 	//! Forces parallelism for small CSV Files, should only be used for testing.
-	bool force_parallelism;
+	bool force_parallelism = false;
 	//! Current (Global) position of CSV
 	idx_t current_csv_position = 0;
+	//! First Position of First Buffer
+	idx_t first_position = 0;
+	//! Current File Number
+	idx_t file_number = 0;
 	idx_t max_tuple_end = 0;
 	//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
 	//! positions where they started reading the first line.
-	vector<idx_t> tuple_end;
-	set<idx_t> tuple_start;
+	vector<vector<idx_t>> tuple_end;
+	vector<set<idx_t>> tuple_start;
 	idx_t running_threads = 0;
 	//! The column ids to read
 	vector<column_t> column_ids;
@@ -349,10 +374,9 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	if (force_parallelism) {
 		return system_threads;
 	}
-
 	idx_t one_mb = 1000000; // We initialize max one thread per Mb
 	idx_t threads_per_mb = first_file_size / one_mb + 1;
-	if (threads_per_mb < system_threads) {
+	if (threads_per_mb < system_threads || threads_per_mb == 1) {
 		return threads_per_mb;
 	}
 
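
Note (not part of the diff): as a worked example of the one-thread-per-MB heuristic above, a 2.5 MB file gives threads_per_mb = 2500000 / 1000000 + 1 = 3 with integer division, so at most three threads scan it even on a many-core machine. A file under 1 MB gives threads_per_mb = 0 + 1 = 1; the new `threads_per_mb == 1` test also returns that single thread when system_threads is itself 1, a case the old `threads_per_mb < system_threads` comparison let fall through.
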
@@ -378,25 +402,36 @@ bool ParallelCSVGlobalState::Finished() {
 void ParallelCSVGlobalState::Verify() {
 	// All threads are done, we run some magic sweet verification code
 	if (running_threads == 0) {
-		for (auto &last_pos : tuple_end) {
-			auto first_pos = tuple_start.find(last_pos);
-			if (first_pos == tuple_start.end()) {
-				// this might be necessary due to carriage returns outside buffer scopes.
-				first_pos = tuple_start.find(last_pos + 1);
+		D_ASSERT(tuple_end.size() == tuple_start.size());
+		for (idx_t i = 0; i < tuple_start.size(); i++) {
+			auto &current_tuple_end = tuple_end[i];
+			auto &current_tuple_start = tuple_start[i];
+			// figure out max value of last_pos
+			if (current_tuple_end.empty()) {
+				return;
 			}
-			if (first_pos == tuple_start.end() && last_pos != NumericLimits<uint64_t>::Maximum()) {
-				string error = "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
-				               " does not have a match\n";
-				error += "End Lines: \n";
-				for (auto &end_line : tuple_end) {
-					error += to_string(end_line) + "\n";
+			auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
+			for (auto &last_pos : current_tuple_end) {
+				auto first_pos = current_tuple_start.find(last_pos);
+				if (first_pos == current_tuple_start.end()) {
+					// this might be necessary due to carriage returns outside buffer scopes.
+					first_pos = current_tuple_start.find(last_pos + 1);
 				}
-				error += "Start Lines: \n";
-				for (auto &start_line : tuple_start) {
-					error += to_string(start_line) + "\n";
+				if (first_pos == current_tuple_start.end() && last_pos != max_value) {
+					string error =
+					    "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
+					    " does not have a match\n";
+					error += "End Lines: \n";
+					for (auto &end_line : current_tuple_end) {
+						error += to_string(end_line) + "\n";
+					}
+					error += "Start Lines: \n";
+					for (auto &start_line : current_tuple_start) {
+						error += to_string(start_line) + "\n";
+					}
+					throw InvalidInputException(
+					    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 				}
-				throw InvalidInputException(
-				    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 			}
 		}
 	}
@@ -411,9 +446,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 			current_file_path = bind_data.files[file_index++];
 			file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
 			current_csv_position = 0;
-			current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
-			next_buffer =
-			    shared_ptr<CSVBuffer>(current_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+			file_number++;
+			current_buffer =
+			    make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+			next_buffer = shared_ptr<CSVBuffer>(
+			    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		} else {
 			// We are done scanning.
 			reader.reset();
@@ -433,8 +470,8 @@
 		current_buffer = next_buffer;
 		if (next_buffer) {
 			// Next buffer gets the next-next buffer
-			next_buffer =
-			    shared_ptr<CSVBuffer>(next_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+			next_buffer = shared_ptr<CSVBuffer>(
+			    next_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		}
 	}
 	if (!reader || reader->options.file_path != current_file_path) {
@@ -443,13 +480,18 @@
 		if (file_index > 0 && file_index <= bind_data.union_readers.size() && bind_data.union_readers[file_index - 1]) {
 			// we are doing UNION BY NAME - fetch the options from the union reader for this file
 			auto &union_reader = *bind_data.union_readers[file_index - 1];
-			reader =
-			    make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), union_reader.GetTypes());
+			reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
+			                                      union_reader.GetTypes());
 			reader->names = union_reader.GetNames();
+		} else if (file_index <= bind_data.column_info.size()) {
+			// Serialized Union By name
+			reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+			                                      bind_data.column_info[file_index - 1].types);
+			reader->names = bind_data.column_info[file_index - 1].names;
 		} else {
 			// regular file - use the standard options
-			reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), bind_data.csv_types);
-			reader->options.file_path = current_file_path;
+			reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+			                                      bind_data.csv_types);
 			reader->names = bind_data.csv_names;
 		}
 		reader->options.file_path = current_file_path;
@@ -461,14 +503,20 @@
 	}
 	return true;
 }
-void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
+void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	if (positions.beginning_of_first_line < positions.end_of_last_line) {
 		if (positions.end_of_last_line > max_tuple_end) {
 			max_tuple_end = positions.end_of_last_line;
 		}
-		tuple_start.insert(positions.beginning_of_first_line);
-		tuple_end.push_back(positions.end_of_last_line);
+		while (file_number_p >= tuple_start.size()) {
+			vector<idx_t> empty_tuple_end;
+			set<idx_t> empty_set;
+			tuple_start.emplace_back(empty_set);
+			tuple_end.emplace_back(empty_tuple_end);
+		}
+		tuple_start[file_number_p].insert(positions.beginning_of_first_line);
+		tuple_end[file_number_p].push_back(positions.end_of_last_line);
 	}
 }
 
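
Note (not part of the diff): the grow-on-demand loop in UpdateVerification above is effectively a resize of the per-file containers. The standalone C++ sketch below shows the same bookkeeping on plain STL containers; all names and values are illustrative, not DuckDB code.

#include <cassert>
#include <set>
#include <vector>

int main() {
	// Positions are now recorded per file, so the containers grow on demand.
	std::vector<std::set<size_t>> tuple_start;
	std::vector<std::vector<size_t>> tuple_end;
	size_t file_number = 2; // e.g. the third file of a multi-file scan
	// Equivalent to the while-loop in UpdateVerification: ensure the slot exists.
	if (file_number >= tuple_start.size()) {
		tuple_start.resize(file_number + 1);
		tuple_end.resize(file_number + 1);
	}
	tuple_start[file_number].insert(100);   // beginning_of_first_line
	tuple_end[file_number].push_back(250);  // end_of_last_line
	assert(tuple_start.size() == 3 && tuple_end[2].front() == 250);
}
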
@@ -483,11 +531,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
 
 	bind_data.options.file_path = bind_data.files[0];
 	file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
-	idx_t rows_to_skip =
-	    bind_data.options.skip_rows + (bind_data.options.has_header && bind_data.options.header ? 1 : 0);
-	return make_uniq<ParallelCSVGlobalState>(context, std::move(file_handle), bind_data.files,
-	                                         context.db->NumberOfThreads(), bind_data.options.buffer_size, rows_to_skip,
-	                                         ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
+	return make_uniq<ParallelCSVGlobalState>(
+	    context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
+	    bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
 }
 
 //===--------------------------------------------------------------------===//
@@ -534,11 +580,10 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 		}
 		if (csv_local_state.csv_reader->finished) {
 			auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
-			if (!csv_local_state.csv_reader->buffer->next_buffer) {
-				// if it's the last line of the file we mark as the maximum
-				verification_updates.end_of_last_line = NumericLimits<uint64_t>::Maximum();
+			if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
+				csv_global_state.UpdateVerification(verification_updates,
+				                                    csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
 			}
-			csv_global_state.UpdateVerification(verification_updates);
 			auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
 			if (!has_next) {
 				csv_global_state.DecrementThread();
@@ -642,14 +687,17 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
                                                                   TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
 	auto result = make_uniq<SingleThreadedCSVState>(bind_data.files.size());
-	if (bind_data.initial_reader) {
-		result->initial_reader = std::move(bind_data.initial_reader);
-	} else if (bind_data.files.empty()) {
+	if (bind_data.files.empty()) {
 		// This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
 		return std::move(result);
 	} else {
 		bind_data.options.file_path = bind_data.files[0];
-		result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+		if (bind_data.initial_reader && !bind_data.file_exists) {
+			// If this is not an on disk file we gotta reuse the reader.
+			result->initial_reader = std::move(bind_data.initial_reader);
+		} else {
+			result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+		}
 		if (!bind_data.options.file_options.union_by_name) {
 			result->initial_reader->names = bind_data.csv_names;
 		}
@@ -741,6 +789,14 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
 //===--------------------------------------------------------------------===//
 static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
+	auto &fs = FileSystem::GetFileSystem(context);
+	for (auto &file : bind_data.files) {
+		if (!fs.FileExists(file)) {
+			bind_data.file_exists = false;
+			break;
+		}
+	}
+	bind_data.single_threaded = bind_data.single_threaded || !bind_data.file_exists;
 	if (bind_data.single_threaded) {
 		return SingleThreadedCSVInit(context, input);
 	} else {
@@ -863,6 +919,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
 	writer.WriteField<idx_t>(buffer_sample_size);
 	writer.WriteString(null_str);
 	writer.WriteField<FileCompressionType>(compression);
+	writer.WriteField<NewLineIdentifier>(new_line);
 	// read options
 	writer.WriteField<idx_t>(skip_rows);
 	writer.WriteField<bool>(skip_rows_set);
@@ -896,6 +953,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
 	buffer_sample_size = reader.ReadRequired<idx_t>();
 	null_str = reader.ReadRequired<string>();
 	compression = reader.ReadRequired<FileCompressionType>();
+	new_line = reader.ReadRequired<NewLineIdentifier>();
 	// read options
 	skip_rows = reader.ReadRequired<idx_t>();
 	skip_rows_set = reader.ReadRequired<bool>();
@@ -926,6 +984,10 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
 	bind_data.options.Serialize(writer);
 	writer.WriteField<bool>(bind_data.single_threaded);
 	writer.WriteSerializable(bind_data.reader_bind);
+	writer.WriteField<uint32_t>(bind_data.column_info.size());
+	for (auto &col : bind_data.column_info) {
+		col.Serialize(writer);
+	}
 }
 
 static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,
@@ -941,6 +1003,10 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
 	result_data->options.Deserialize(reader);
 	result_data->single_threaded = reader.ReadField<bool>(true);
 	result_data->reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+	uint32_t file_number = reader.ReadRequired<uint32_t>();
+	for (idx_t i = 0; i < file_number; i++) {
+		result_data->column_info.emplace_back(ColumnInfo::Deserialize(reader));
+	}
 	return std::move(result_data);
 }
 
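
Note (not part of the diff): CSVReaderSerialize and CSVReaderDeserialize above depend on a ColumnInfo helper added to package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp in this release (file 18 in the list at the top); its definition is not shown in this diff. Inferred from the call sites, it plausibly looks like the sketch below (inside namespace duckdb, with DuckDB's internal headers available); the exact FieldWriter/FieldReader method names are an assumption.

// Hypothetical reconstruction of the ColumnInfo helper, inferred from usage.
struct ColumnInfo {
	ColumnInfo() {
	}
	// Matches column_info.emplace_back(names, types) in ReadCSVBind above.
	ColumnInfo(vector<string> names_p, vector<LogicalType> types_p)
	    : names(std::move(names_p)), types(std::move(types_p)) {
	}
	// Matches col.Serialize(writer) in CSVReaderSerialize above.
	void Serialize(FieldWriter &writer) const {
		writer.WriteList<string>(names);
		writer.WriteRegularSerializableList<LogicalType>(types);
	}
	// Matches ColumnInfo::Deserialize(reader) in CSVReaderDeserialize above.
	static ColumnInfo Deserialize(FieldReader &reader) {
		ColumnInfo info;
		info.names = reader.ReadRequiredList<string>();
		info.types = reader.ReadRequiredSerializableList<LogicalType>();
		return info;
	}

	vector<string> names;
	vector<LogicalType> types;
};
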

package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev2144"
+#define DUCKDB_VERSION "0.7.2-dev2233"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "82211fc11b"
+#define DUCKDB_SOURCE_ID "c81600ed51"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"