duckdb 0.7.2-dev3294.0 → 0.7.2-dev3353.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +8 -8
- package/package.json +1 -1
- package/src/duckdb/src/execution/index/art/art.cpp +3 -0
- package/src/duckdb/src/execution/index/art/prefix.cpp +11 -11
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +56 -28
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -3
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +85 -37
- package/src/duckdb/src/function/table/read_csv.cpp +136 -37
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +11 -4
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +2 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp +40 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +28 -13
- package/src/duckdb/src/main/extension/extension_load.cpp +6 -5
- package/src/duckdb/src/optimizer/rule/move_constants.cpp +3 -0
- package/src/duckdb/src/storage/statistics/string_stats.cpp +1 -1
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +10837 -10795
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +6 -6
@@ -14,6 +14,7 @@
|
|
14
14
|
#include "duckdb/main/extension_helper.hpp"
|
15
15
|
#include "duckdb/common/multi_file_reader.hpp"
|
16
16
|
#include "duckdb/main/client_data.hpp"
|
17
|
+
#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
|
17
18
|
|
18
19
|
#include <limits>
|
19
20
|
|
@@ -262,19 +263,21 @@ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFun
|
|
262
263
|
//===--------------------------------------------------------------------===//
|
263
264
|
// Parallel CSV Reader CSV Global State
|
264
265
|
//===--------------------------------------------------------------------===//
|
265
|
-
|
266
|
-
// Read CSV Global State
|
267
|
-
//===--------------------------------------------------------------------===//
|
266
|
+
|
268
267
|
struct ParallelCSVGlobalState : public GlobalTableFunctionState {
|
269
268
|
public:
|
270
269
|
ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
|
271
270
|
vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
|
272
|
-
idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p)
|
271
|
+
idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p, bool has_header)
|
273
272
|
: file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
|
274
|
-
force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p))
|
273
|
+
force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
|
274
|
+
line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
|
275
275
|
file_handle->DisableReset();
|
276
276
|
current_file_path = files_path_p[0];
|
277
|
-
|
277
|
+
line_info.lines_read[0] = rows_to_skip;
|
278
|
+
if (has_header) {
|
279
|
+
line_info.lines_read[0]++;
|
280
|
+
}
|
278
281
|
file_size = file_handle->FileSize();
|
279
282
|
first_file_size = file_size;
|
280
283
|
bytes_read = 0;
|
@@ -296,8 +299,16 @@ public:
|
|
296
299
|
next_buffer = shared_ptr<CSVBuffer>(
|
297
300
|
current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
|
298
301
|
running_threads = MaxThreads();
|
302
|
+
|
303
|
+
// Initialize all the book-keeping variables
|
304
|
+
auto file_count = files_path_p.size();
|
305
|
+
line_info.current_batches.resize(file_count);
|
306
|
+
tuple_start.resize(file_count);
|
307
|
+
tuple_end.resize(file_count);
|
308
|
+
tuple_end_to_batch.resize(file_count);
|
309
|
+
batch_to_tuple_end.resize(file_count);
|
299
310
|
}
|
300
|
-
ParallelCSVGlobalState() {
|
311
|
+
ParallelCSVGlobalState() : line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
|
301
312
|
running_threads = MaxThreads();
|
302
313
|
}
|
303
314
|
|
@@ -316,7 +327,9 @@ public:
|
|
316
327
|
//! Verify if the CSV File was read correctly
|
317
328
|
void Verify();
|
318
329
|
|
319
|
-
void UpdateVerification(VerificationPositions positions, idx_t file_number);
|
330
|
+
void UpdateVerification(VerificationPositions positions, idx_t file_number, idx_t batch_idx);
|
331
|
+
|
332
|
+
void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
|
320
333
|
|
321
334
|
void IncrementThread();
|
322
335
|
|
@@ -354,8 +367,6 @@ private:
|
|
354
367
|
mutex main_mutex;
|
355
368
|
//! Byte set from for last thread
|
356
369
|
idx_t next_byte = 0;
|
357
|
-
//! The current estimated line number
|
358
|
-
idx_t estimated_linenr;
|
359
370
|
//! How many bytes we should execute per local state
|
360
371
|
idx_t bytes_per_local_state;
|
361
372
|
//! Size of first file
|
@@ -366,6 +377,8 @@ private:
|
|
366
377
|
idx_t buffer_size;
|
367
378
|
//! Current batch index
|
368
379
|
idx_t batch_index = 0;
|
380
|
+
idx_t local_batch_index = 0;
|
381
|
+
|
369
382
|
//! Forces parallelism for small CSV Files, should only be used for testing.
|
370
383
|
bool force_parallelism = false;
|
371
384
|
//! Current (Global) position of CSV
|
@@ -379,9 +392,15 @@ private:
|
|
379
392
|
//! positions where they started reading the first line.
|
380
393
|
vector<vector<idx_t>> tuple_end;
|
381
394
|
vector<set<idx_t>> tuple_start;
|
395
|
+
//! Tuple end to batch
|
396
|
+
vector<unordered_map<idx_t, idx_t>> tuple_end_to_batch;
|
397
|
+
//! Batch to Tuple End
|
398
|
+
vector<unordered_map<idx_t, idx_t>> batch_to_tuple_end;
|
382
399
|
idx_t running_threads = 0;
|
383
400
|
//! The column ids to read
|
384
401
|
vector<column_t> column_ids;
|
402
|
+
//! Line Info used in error messages
|
403
|
+
LineInfo line_info;
|
385
404
|
};
|
386
405
|
|
387
406
|
idx_t ParallelCSVGlobalState::MaxThreads() const {
|
@@ -425,32 +444,61 @@ void ParallelCSVGlobalState::Verify() {
|
|
425
444
|
return;
|
426
445
|
}
|
427
446
|
auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
|
428
|
-
for (
|
447
|
+
for (idx_t tpl_idx = 0; tpl_idx < current_tuple_end.size(); tpl_idx++) {
|
448
|
+
auto last_pos = current_tuple_end[tpl_idx];
|
429
449
|
auto first_pos = current_tuple_start.find(last_pos);
|
430
450
|
if (first_pos == current_tuple_start.end()) {
|
431
451
|
// this might be necessary due to carriage returns outside buffer scopes.
|
432
452
|
first_pos = current_tuple_start.find(last_pos + 1);
|
433
453
|
}
|
434
454
|
if (first_pos == current_tuple_start.end() && last_pos != max_value) {
|
435
|
-
|
436
|
-
|
437
|
-
" does not have a match\n";
|
438
|
-
error += "End Lines: \n";
|
439
|
-
for (auto &end_line : current_tuple_end) {
|
440
|
-
error += to_string(end_line) + "\n";
|
441
|
-
}
|
442
|
-
error += "Start Lines: \n";
|
443
|
-
for (auto &start_line : current_tuple_start) {
|
444
|
-
error += to_string(start_line) + "\n";
|
445
|
-
}
|
455
|
+
auto batch_idx = tuple_end_to_batch[i][last_pos];
|
456
|
+
auto problematic_line = line_info.GetLine(batch_idx);
|
446
457
|
throw InvalidInputException(
|
447
|
-
"CSV File not supported for multithreading.
|
458
|
+
"CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
|
459
|
+
"that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
|
460
|
+
"please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
|
461
|
+
problematic_line);
|
448
462
|
}
|
449
463
|
}
|
450
464
|
}
|
451
465
|
}
|
452
466
|
}
|
453
467
|
|
468
|
+
//! Cross-checks the batch boundaries recorded for file `file_idx` up to (but excluding) `batch_idx`.
//! - Every batch before the immediately preceding one must end at a position that is also a recorded
//!   tuple start.
//! - The immediately preceding batch must end exactly at `cur_first_pos`.
//! Throws InvalidInputException naming the offending line when either check fails.
//! Does nothing when `batch_idx` is 0 or when no tuple start has been recorded for the file.
//! NOTE(review): no lock is taken here; the visible caller (GetLine with verify == false) already
//! holds main_mutex - confirm for any other caller.
void LineInfo::Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos) {
	auto &starts = tuple_start[file_idx];
	auto &batch_to_end_slot = batch_to_tuple_end[file_idx];
	auto &ends = tuple_end[file_idx];
	if (batch_idx == 0 || starts.empty()) {
		return;
	}

	const idx_t previous_batch = batch_idx - 1;
	bool mismatch = false;
	idx_t offending_line = 0;

	// Batches [0, previous_batch): the recorded end must coincide with some recorded start.
	// The scan stops at the first batch that does not line up.
	for (idx_t batch = 0; batch < previous_batch && !mismatch; batch++) {
		const auto batch_end = ends[batch_to_end_slot[batch]];
		if (starts.count(batch_end) == 0) {
			mismatch = true;
			offending_line = GetLine(batch);
		}
	}

	// The batch right before the current one: its end must be where the current batch starts.
	if (!mismatch && ends[batch_to_end_slot[previous_batch]] != cur_first_pos) {
		mismatch = true;
		offending_line = GetLine(batch_idx);
	}

	if (!mismatch) {
		return;
	}
	throw InvalidInputException(
	    "CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
	    "that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
	    "please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
	    offending_line);
}
|
501
|
+
|
454
502
|
bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bind_data,
|
455
503
|
unique_ptr<ParallelCSVReader> &reader) {
|
456
504
|
lock_guard<mutex> parallel_lock(main_mutex);
|
@@ -461,6 +509,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
461
509
|
file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
|
462
510
|
current_csv_position = 0;
|
463
511
|
file_number++;
|
512
|
+
local_batch_index = 0;
|
464
513
|
current_buffer =
|
465
514
|
make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
|
466
515
|
next_buffer = shared_ptr<CSVBuffer>(
|
@@ -472,11 +521,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
472
521
|
}
|
473
522
|
}
|
474
523
|
// set up the current buffer
|
524
|
+
line_info.current_batches.back().insert(local_batch_index);
|
475
525
|
auto result = make_uniq<CSVBufferRead>(current_buffer, next_buffer, next_byte, next_byte + bytes_per_local_state,
|
476
|
-
batch_index++,
|
526
|
+
batch_index++, local_batch_index++, &line_info);
|
477
527
|
// move the byte index of the CSV reader to the next buffer
|
478
528
|
next_byte += bytes_per_local_state;
|
479
|
-
estimated_linenr += bytes_per_local_state / (bind_data.csv_types.size() * 5); // estimate 5 bytes per column
|
480
529
|
if (next_byte >= current_buffer->GetBufferSize()) {
|
481
530
|
// We replace the current buffer with the next buffer
|
482
531
|
next_byte = 0;
|
@@ -495,17 +544,17 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
495
544
|
// we are doing UNION BY NAME - fetch the options from the union reader for this file
|
496
545
|
auto &union_reader = *bind_data.union_readers[file_index - 1];
|
497
546
|
reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
|
498
|
-
union_reader.GetTypes());
|
547
|
+
union_reader.GetTypes(), file_index - 1);
|
499
548
|
reader->names = union_reader.GetNames();
|
500
549
|
} else if (file_index <= bind_data.column_info.size()) {
|
501
550
|
// Serialized Union By name
|
502
551
|
reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
|
503
|
-
bind_data.column_info[file_index - 1].types);
|
552
|
+
bind_data.column_info[file_index - 1].types, file_index - 1);
|
504
553
|
reader->names = bind_data.column_info[file_index - 1].names;
|
505
554
|
} else {
|
506
555
|
// regular file - use the standard options
|
507
556
|
reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
|
508
|
-
bind_data.csv_types);
|
557
|
+
bind_data.csv_types, file_index - 1);
|
509
558
|
reader->names = bind_data.csv_names;
|
510
559
|
}
|
511
560
|
reader->options.file_path = current_file_path;
|
@@ -518,23 +567,64 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
518
567
|
}
|
519
568
|
return true;
|
520
569
|
}
|
521
|
-
void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
|
570
|
+
void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p, idx_t batch_idx) {
|
522
571
|
lock_guard<mutex> parallel_lock(main_mutex);
|
523
572
|
if (positions.beginning_of_first_line < positions.end_of_last_line) {
|
524
573
|
if (positions.end_of_last_line > max_tuple_end) {
|
525
574
|
max_tuple_end = positions.end_of_last_line;
|
526
575
|
}
|
527
|
-
|
528
|
-
|
529
|
-
set<idx_t> empty_set;
|
530
|
-
tuple_start.emplace_back(empty_set);
|
531
|
-
tuple_end.emplace_back(empty_tuple_end);
|
532
|
-
}
|
576
|
+
tuple_end_to_batch[file_number_p][positions.end_of_last_line] = batch_idx;
|
577
|
+
batch_to_tuple_end[file_number_p][batch_idx] = tuple_end[file_number_p].size();
|
533
578
|
tuple_start[file_number_p].insert(positions.beginning_of_first_line);
|
534
579
|
tuple_end[file_number_p].push_back(positions.end_of_last_line);
|
535
580
|
}
|
536
581
|
}
|
537
582
|
|
583
|
+
//! Records that the batch described by `buffer_read` has finished: removes its local batch index from
//! the set of in-flight batches of file `file_idx` and adds the number of lines it read to the
//! per-batch line counter. Both updates happen under main_mutex.
void ParallelCSVGlobalState::UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx) {
	// Snapshot the values from the buffer before taking the lock
	const idx_t finished_batch = buffer_read.local_batch_index;
	const idx_t line_count = buffer_read.lines_read;

	lock_guard<mutex> guard(main_mutex);
	auto &in_flight = line_info.current_batches[file_idx];
	in_flight.erase(finished_batch);
	// NOTE(review): the counter is keyed by the local batch index only, not by file - confirm this is
	// intended when several files are read.
	line_info.lines_read[finished_batch] += line_count;
}
|
590
|
+
|
591
|
+
//! Returns true when the line number for batch `batch_idx` of file `file_idx` can be computed, i.e.
//! when no batch with a smaller index is still in flight for that file.
//! Also returns true when a line number was already computed (`done`), when no batches are tracked at
//! all, or when nothing is tracked for `file_idx`.
//! Takes main_mutex for the duration of the check.
bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
	lock_guard<mutex> parallel_lock(main_mutex);
	if (current_batches.empty() || done) {
		return true;
	}
	// Guard the dereference below: begin() on an empty set (or indexing past the vector) is undefined
	// behaviour. No in-flight batch for this file means nothing can still be ahead of batch_idx.
	if (file_idx >= current_batches.size() || current_batches[file_idx].empty()) {
		return true;
	}
	// std::set is ordered, so the first element is the smallest in-flight batch index
	auto min_value = *current_batches[file_idx].begin();
	return min_value >= batch_idx;
}
|
602
|
+
|
603
|
+
//! Computes the line number to report for an error in batch `batch_idx`: the sum of the lines read by
//! batches [0, batch_idx] plus `line_error`. The first computed value is cached in `first_line` and
//! `done` is set, so every later call returns that same cached value.
//!
//! \param batch_idx  batch in which the error occurred
//! \param line_error offset that is added to the summed line count
//! \param file_idx   file the batch belongs to (only forwarded to Verify)
//! \param cur_start  first position of the current batch (only forwarded to Verify)
//! \param verify     true when called from a verification path: main_mutex is NOT taken and Verify is
//!                   not re-entered. NOTE(review): this presumes such callers already hold main_mutex.
//! Throws InternalException when a batch before `batch_idx` has no recorded line count.
idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify) {
	// Lock only on the non-verification path; a deferred unique_lock avoids heap-allocating a guard
	std::unique_lock<mutex> parallel_lock(main_mutex, std::defer_lock);
	if (!verify) {
		parallel_lock.lock();
	}
	if (done) {
		return first_line;
	}
	idx_t line_count = 0;
	for (idx_t i = 0; i <= batch_idx; i++) {
		// the batch that hit the error may not have reported its line count yet; earlier ones must have
		if (lines_read.find(i) == lines_read.end() && i != batch_idx) {
			throw InternalException("Missing batch index on Parallel CSV Reader GetLine");
		}
		line_count += lines_read[i];
	}

	// before we are done, if this is not a call in Verify() we must check Verify up to this batch
	if (!verify) {
		Verify(file_idx, batch_idx, cur_start);
	}
	done = true;
	first_line = line_count + line_error;
	return first_line;
}
|
627
|
+
|
538
628
|
static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
|
539
629
|
TableFunctionInitInput &input) {
|
540
630
|
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
@@ -548,7 +638,8 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
|
|
548
638
|
file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
|
549
639
|
return make_uniq<ParallelCSVGlobalState>(
|
550
640
|
context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
|
551
|
-
bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids
|
641
|
+
bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids,
|
642
|
+
bind_data.options.header && bind_data.options.has_header);
|
552
643
|
}
|
553
644
|
|
554
645
|
//===--------------------------------------------------------------------===//
|
@@ -597,9 +688,15 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
|
|
597
688
|
auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
|
598
689
|
if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
|
599
690
|
csv_global_state.UpdateVerification(verification_updates,
|
600
|
-
csv_local_state.csv_reader->buffer->buffer->GetFileNumber()
|
691
|
+
csv_local_state.csv_reader->buffer->buffer->GetFileNumber(),
|
692
|
+
csv_local_state.csv_reader->buffer->local_batch_index);
|
601
693
|
}
|
694
|
+
csv_global_state.UpdateLinesRead(*csv_local_state.csv_reader->buffer, csv_local_state.csv_reader->file_idx);
|
602
695
|
auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
|
696
|
+
if (csv_local_state.csv_reader) {
|
697
|
+
csv_local_state.csv_reader->linenr = 0;
|
698
|
+
}
|
699
|
+
|
603
700
|
if (!has_next) {
|
604
701
|
csv_global_state.DecrementThread();
|
605
702
|
break;
|
@@ -945,6 +1042,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
|
|
945
1042
|
writer.WriteString(file_path);
|
946
1043
|
writer.WriteString(decimal_separator);
|
947
1044
|
writer.WriteField<bool>(null_padding);
|
1045
|
+
writer.WriteField<idx_t>(buffer_size);
|
948
1046
|
writer.WriteSerializable(file_options);
|
949
1047
|
// write options
|
950
1048
|
writer.WriteListNoReference<bool>(force_quote);
|
@@ -980,6 +1078,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
|
|
980
1078
|
file_path = reader.ReadRequired<string>();
|
981
1079
|
decimal_separator = reader.ReadRequired<string>();
|
982
1080
|
null_padding = reader.ReadRequired<bool>();
|
1081
|
+
buffer_size = reader.ReadRequired<idx_t>();
|
983
1082
|
file_options = reader.ReadRequiredSerializable<MultiFileReaderOptions, MultiFileReaderOptions>();
|
984
1083
|
// write options
|
985
1084
|
force_quote = reader.ReadRequiredList<bool>();
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.7.2-
|
2
|
+
#define DUCKDB_VERSION "0.7.2-dev3353"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "ec0ca94cdf"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -17,6 +17,8 @@
|
|
17
17
|
#include "duckdb/common/queue.hpp"
|
18
18
|
#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
|
19
19
|
#include "duckdb/common/multi_file_reader.hpp"
|
20
|
+
#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
|
21
|
+
|
20
22
|
#include <sstream>
|
21
23
|
|
22
24
|
namespace duckdb {
|
@@ -74,6 +76,10 @@ public:
|
|
74
76
|
const vector<LogicalType> &GetTypes() {
|
75
77
|
return return_types;
|
76
78
|
}
|
79
|
+
//! Translates a line offset into the line number used in error messages.
//! This base implementation returns `line_error` unchanged and ignores `buffer_idx`;
//! ParallelCSVReader overrides it.
virtual idx_t GetLineError(idx_t line_error, idx_t buffer_idx) {
	return line_error;
}
|
82
|
+
|
77
83
|
//! Initialize projection indices to select all columns
|
78
84
|
void InitializeProjection();
|
79
85
|
|
@@ -88,17 +94,18 @@ protected:
|
|
88
94
|
bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
|
89
95
|
|
90
96
|
//! Adds a value to the current row
|
91
|
-
void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes
|
97
|
+
void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
|
98
|
+
idx_t buffer_idx = 0);
|
92
99
|
//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
|
93
|
-
bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message);
|
100
|
+
bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx = 0);
|
94
101
|
//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
|
95
|
-
bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
|
102
|
+
bool Flush(DataChunk &insert_chunk, idx_t buffer_idx = 0, bool try_add_line = false);
|
96
103
|
|
97
104
|
unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
|
98
105
|
|
99
106
|
void VerifyUTF8(idx_t col_idx);
|
100
107
|
void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
|
101
|
-
|
108
|
+
string GetLineNumberStr(idx_t linenr, bool linenr_estimated, idx_t buffer_idx = 0);
|
102
109
|
|
103
110
|
//! Sets the newline delimiter
|
104
111
|
void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);
|
@@ -57,7 +57,8 @@ public:
|
|
57
57
|
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
58
58
|
BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
|
59
59
|
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
60
|
-
~BufferedCSVReader()
|
60
|
+
virtual ~BufferedCSVReader() {
|
61
|
+
}
|
61
62
|
|
62
63
|
unique_ptr<char[]> buffer;
|
63
64
|
idx_t buffer_size;
|
@@ -0,0 +1,40 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/persistent/csv_line_info.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
namespace duckdb {
|
12
|
+
struct LineInfo {
|
13
|
+
public:
|
14
|
+
explicit LineInfo(mutex &main_mutex_p, vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end_p,
|
15
|
+
vector<set<idx_t>> &tuple_start_p, vector<vector<idx_t>> &tuple_end_p)
|
16
|
+
: main_mutex(main_mutex_p), batch_to_tuple_end(batch_to_tuple_end_p), tuple_start(tuple_start_p),
|
17
|
+
tuple_end(tuple_end_p) {};
|
18
|
+
bool CanItGetLine(idx_t file_idx, idx_t batch_idx);
|
19
|
+
|
20
|
+
idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true);
|
21
|
+
//! Verify if the CSV File was read correctly from [0,batch_idx] batches.
|
22
|
+
void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos);
|
23
|
+
//! Lines read per batch, <batch_index,count>
|
24
|
+
unordered_map<idx_t, idx_t> lines_read;
|
25
|
+
//! Set of batches that have been initialized but are not yet finished.
|
26
|
+
vector<set<idx_t>> current_batches;
|
27
|
+
//! Pointer to CSV Reader Mutex
|
28
|
+
mutex &main_mutex;
|
29
|
+
//! Pointer Batch to Tuple End
|
30
|
+
vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end;
|
31
|
+
//! Pointer Batch to Tuple Start
|
32
|
+
vector<set<idx_t>> &tuple_start;
|
33
|
+
//! Pointer Batch to Tuple End
|
34
|
+
vector<vector<idx_t>> &tuple_end;
|
35
|
+
//! If we already threw an exception on a previous thread.
|
36
|
+
bool done = false;
|
37
|
+
idx_t first_line = 0;
|
38
|
+
};
|
39
|
+
|
40
|
+
} // namespace duckdb
|
@@ -1,7 +1,7 @@
|
|
1
1
|
//===----------------------------------------------------------------------===//
|
2
2
|
// DuckDB
|
3
3
|
//
|
4
|
-
// duckdb/execution/operator/persistent/
|
4
|
+
// duckdb/execution/operator/persistent/parallel_csv_reader.hpp
|
5
5
|
//
|
6
6
|
//
|
7
7
|
//===----------------------------------------------------------------------===//
|
@@ -12,6 +12,7 @@
|
|
12
12
|
#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
|
13
13
|
#include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
|
14
14
|
#include "duckdb/execution/operator/persistent/csv_buffer.hpp"
|
15
|
+
#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
|
15
16
|
|
16
17
|
#include <sstream>
|
17
18
|
#include <utility>
|
@@ -20,9 +21,9 @@ namespace duckdb {
|
|
20
21
|
|
21
22
|
struct CSVBufferRead {
|
22
23
|
CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
|
23
|
-
idx_t
|
24
|
-
: buffer(std::move(buffer_p)), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
|
25
|
-
|
24
|
+
idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
|
25
|
+
: buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
|
26
|
+
batch_index(batch_index), local_batch_index(local_batch_index_p) {
|
26
27
|
if (buffer) {
|
27
28
|
if (buffer_end > buffer->GetBufferSize()) {
|
28
29
|
buffer_end = buffer->GetBufferSize();
|
@@ -34,8 +35,9 @@ struct CSVBufferRead {
|
|
34
35
|
}
|
35
36
|
|
36
37
|
CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
|
37
|
-
idx_t buffer_end_p, idx_t batch_index, idx_t
|
38
|
-
: CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index,
|
38
|
+
idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
|
39
|
+
: CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
|
40
|
+
line_info_p) {
|
39
41
|
next_buffer = std::move(nxt_buffer_p);
|
40
42
|
}
|
41
43
|
|
@@ -84,23 +86,27 @@ struct CSVBufferRead {
|
|
84
86
|
shared_ptr<CSVBuffer> buffer;
|
85
87
|
shared_ptr<CSVBuffer> next_buffer;
|
86
88
|
vector<unique_ptr<char[]>> intersections;
|
89
|
+
optional_ptr<LineInfo> line_info;
|
87
90
|
|
88
91
|
idx_t buffer_start;
|
89
92
|
idx_t buffer_end;
|
90
93
|
idx_t batch_index;
|
91
|
-
idx_t
|
94
|
+
idx_t local_batch_index;
|
95
|
+
idx_t lines_read = 0;
|
92
96
|
};
|
93
97
|
|
94
98
|
struct VerificationPositions {
|
95
99
|
idx_t beginning_of_first_line = 0;
|
96
100
|
idx_t end_of_last_line = 0;
|
97
101
|
};
|
98
|
-
|
102
|
+
|
103
|
+
//! CSV Reader for Parallel Reading
|
99
104
|
class ParallelCSVReader : public BaseCSVReader {
|
100
105
|
public:
|
101
106
|
ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
|
102
|
-
idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
|
103
|
-
~ParallelCSVReader()
|
107
|
+
idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
|
108
|
+
virtual ~ParallelCSVReader() {
|
109
|
+
}
|
104
110
|
|
105
111
|
//! Current Position (Relative to the Buffer)
|
106
112
|
idx_t position_buffer = 0;
|
@@ -118,13 +124,21 @@ public:
|
|
118
124
|
bool finished = false;
|
119
125
|
|
120
126
|
unique_ptr<CSVBufferRead> buffer;
|
127
|
+
|
128
|
+
idx_t file_idx;
|
129
|
+
|
121
130
|
VerificationPositions GetVerificationPositions();
|
122
131
|
|
132
|
+
//! Position of the first read line and last read line for verification purposes
|
133
|
+
VerificationPositions verification_positions;
|
134
|
+
|
123
135
|
public:
|
124
136
|
void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
|
125
137
|
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
126
138
|
void ParseCSV(DataChunk &insert_chunk);
|
127
139
|
|
140
|
+
idx_t GetLineError(idx_t line_error, idx_t buffer_idx) override;
|
141
|
+
|
128
142
|
private:
|
129
143
|
//! Initialize Parser
|
130
144
|
void Initialize(const vector<LogicalType> &requested_types);
|
@@ -135,7 +149,7 @@ private:
|
|
135
149
|
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
136
150
|
bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
|
137
151
|
//! Sets Position depending on the byte_start of this thread
|
138
|
-
bool SetPosition(
|
152
|
+
bool SetPosition();
|
139
153
|
//! Called when scanning the 1st buffer, skips empty lines
|
140
154
|
void SkipEmptyLines();
|
141
155
|
//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
|
@@ -148,8 +162,9 @@ private:
|
|
148
162
|
|
149
163
|
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
150
164
|
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
|
151
|
-
//!
|
152
|
-
|
165
|
+
//! Verifies that the line length did not go over a pre-defined limit.
|
166
|
+
void VerifyLineLength(idx_t line_size);
|
167
|
+
|
153
168
|
//! First Position of First Buffer
|
154
169
|
idx_t first_pos_first_buffer = 0;
|
155
170
|
};
|
@@ -226,19 +226,20 @@ bool ExtensionHelper::IsFullPath(const string &extension) {
|
|
226
226
|
StringUtil::Contains(extension, "\\");
|
227
227
|
}
|
228
228
|
|
229
|
-
string ExtensionHelper::GetExtensionName(const string &
|
229
|
+
string ExtensionHelper::GetExtensionName(const string &original_name) {
|
230
|
+
auto extension = StringUtil::Lower(original_name);
|
230
231
|
if (!IsFullPath(extension)) {
|
231
|
-
return extension;
|
232
|
+
return ExtensionHelper::ApplyExtensionAlias(extension);
|
232
233
|
}
|
233
234
|
auto splits = StringUtil::Split(StringUtil::Replace(extension, "\\", "/"), '/');
|
234
235
|
if (splits.empty()) {
|
235
|
-
return extension;
|
236
|
+
return ExtensionHelper::ApplyExtensionAlias(extension);
|
236
237
|
}
|
237
238
|
splits = StringUtil::Split(splits.back(), '.');
|
238
239
|
if (splits.empty()) {
|
239
|
-
return extension;
|
240
|
+
return ExtensionHelper::ApplyExtensionAlias(extension);
|
240
241
|
}
|
241
|
-
return
|
242
|
+
return ExtensionHelper::ApplyExtensionAlias(splits.front());
|
242
243
|
}
|
243
244
|
|
244
245
|
void ExtensionHelper::LoadExternalExtension(DatabaseInstance &db, FileOpener *opener, const string &extension) {
|
@@ -55,6 +55,9 @@ unique_ptr<Expression> MoveConstantsRule::Apply(LogicalOperator &op, vector<refe
|
|
55
55
|
}
|
56
56
|
auto result_value = Value::HUGEINT(outer_value);
|
57
57
|
if (!result_value.DefaultTryCastAs(constant_type)) {
|
58
|
+
if (comparison.type != ExpressionType::COMPARE_EQUAL) {
|
59
|
+
return nullptr;
|
60
|
+
}
|
58
61
|
// if the cast is not possible then the comparison is not possible
|
59
62
|
// for example, if we have x + 5 = 3, where x is an unsigned number, we will get x = -2
|
60
63
|
// since this is not possible we can remove the entire branch here
|
@@ -140,7 +140,7 @@ void StringStats::Update(BaseStatistics &stats, const string_t &value) {
|
|
140
140
|
if (unicode == UnicodeType::UNICODE) {
|
141
141
|
string_data.has_unicode = true;
|
142
142
|
} else if (unicode == UnicodeType::INVALID) {
|
143
|
-
throw
|
143
|
+
throw InvalidInputException(
|
144
144
|
ErrorManager::InvalidUnicodeError(string((char *)data, size), "segment statistics update"));
|
145
145
|
}
|
146
146
|
}
|