duckdb 0.6.2-dev735.0 → 0.6.2-dev750.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +16 -19
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +7 -4
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +7 -6
- package/src/duckdb/src/function/table/read_csv.cpp +168 -62
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/http_stats.hpp +5 -0
- package/src/duckdb/src/main/query_profiler.cpp +1 -1
package/package.json
CHANGED
package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp
CHANGED
@@ -160,7 +160,6 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
 } else {
 row_empty = false;
 }
-
 if (!sql_types.empty() && column == sql_types.size() && length == 0) {
 // skip a single trailing delimiter in last column
 return;
@@ -249,7 +248,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
 return false;
 } else {
 throw InvalidInputException(
-"Error in file \"%s\" on line %s: expected %lld values per row, but got %d
+"Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
 options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), column,
 options.ToString());
 }
@@ -309,7 +308,7 @@ void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, i
 int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
 D_ASSERT(error_line >= 0);
 throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
-"%s. Parser options
+"%s. Parser options:\n%s",
 options.file_path, error_line, col_name,
 ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
 }
@@ -332,29 +331,27 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
 // convert the columns in the parsed chunk to the types of the table
 insert_chunk.SetCardinality(parse_chunk);
 for (idx_t col_idx = 0; col_idx < sql_types.size(); col_idx++) {
-
+auto insert_idx = insert_cols_idx[col_idx];
+auto &type = sql_types[col_idx];
+if (type.id() == LogicalTypeId::VARCHAR) {
 // target type is varchar: no need to convert
 // just test that all strings are valid utf-8 strings
 VerifyUTF8(col_idx);
-insert_chunk.data[
+insert_chunk.data[insert_idx].Reference(parse_chunk.data[col_idx]);
 } else {
 string error_message;
 bool success;
-if (options.has_format[LogicalTypeId::DATE] &&
+if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
 // use the date format to cast the chunk
-success =
-
-
-} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
-sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
+success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
+parse_chunk.size(), error_message);
+} else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
 // use the date format to cast the chunk
-success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
-
-error_message);
+success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
+parse_chunk.size(), error_message);
 } else {
 // target type is not varchar: perform a cast
-success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
-insert_chunk.data[insert_cols_idx[col_idx]],
+success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
 parse_chunk.size(), &error_message);
 }
 if (success) {
@@ -385,13 +382,13 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
 auto error_line = linenr - (parse_chunk.size() - row_idx) + 1;

 if (options.auto_detect) {
-throw InvalidInputException("%s in column %s, at line %llu
-"options
+throw InvalidInputException("%s in column %s, at line %llu.\n\nParser "
+"options:\n%s.\n\nConsider either increasing the sample size "
 "(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), "
 "or skipping column conversion (ALL_VARCHAR=1)",
 error_message, col_name, error_line, options.ToString());
 } else {
-throw InvalidInputException("%s at line %llu in column %s. Parser options
+throw InvalidInputException("%s at line %llu in column %s. Parser options:\n%s ", error_message,
 error_line, col_name, options.ToString());
 }
 }
package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp
CHANGED

@@ -851,10 +851,13 @@ vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &reque
 // #######
 // type candidates, ordered by descending specificity (~ from high to low)
 vector<LogicalType> type_candidates = {
-LogicalType::VARCHAR,
-LogicalType::
-LogicalType::
-LogicalType::
+LogicalType::VARCHAR,
+LogicalType::TIMESTAMP,
+LogicalType::DATE,
+LogicalType::TIME,
+LogicalType::DOUBLE,
+/* LogicalType::FLOAT,*/ LogicalType::BIGINT,
+/*LogicalType::INTEGER,*/ /*LogicalType::SMALLINT, LogicalType::TINYINT,*/ LogicalType::BOOLEAN,
 LogicalType::SQLNULL};
 // format template candidates, ordered by descending specificity (~ from high to low)
 std::map<LogicalTypeId, vector<const char *>> format_template_candidates = {
package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp
CHANGED

@@ -251,13 +251,14 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
 }

 std::string BufferedCSVReaderOptions::ToString() const {
-return "
-
-
-"
+return " file=" + file_path + "\n delimiter='" + delimiter +
+(has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" + quote +
+(has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" + escape +
+(has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
+"\n header=" + std::to_string(header) +
 (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
-"
-"
+"\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
+"\n ignore_erros=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
 }

 } // namespace duckdb
package/src/duckdb/src/function/table/read_csv.cpp
CHANGED

@@ -271,6 +271,24 @@ public:
 atomic<idx_t> bytes_read;
 //! Size of current file
 idx_t file_size;
+//! The index of the next file to read (i.e. current file + 1)
+idx_t file_index = 1;
+
+double GetProgress(ReadCSVData &bind_data) const {
+idx_t total_files = bind_data.files.size();
+
+// get the progress WITHIN the current file
+double progress;
+if (file_size == 0) {
+progress = 1.0;
+} else {
+progress = double(bytes_read) / double(file_size);
+}
+// now get the total percentage of files read
+double percentage = double(file_index) / total_files;
+percentage += (double(1) / double(total_files)) * progress;
+return percentage * 100;
+}

 private:
 //! File Handle for current file
@@ -278,8 +296,6 @@ private:

 shared_ptr<CSVBuffer> current_buffer;
 shared_ptr<CSVBuffer> next_buffer;
-//! The index of the next file to read (i.e. current file + 1)
-idx_t file_index = 1;

 //! Mutex to lock when getting next batch of bytes (Parallel Only)
 mutex main_mutex;
@@ -348,6 +364,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
 }
 return result;
 }
+
 static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
 TableFunctionInitInput &input) {
 auto &bind_data = (ReadCSVData &)*input.bind_data;
@@ -359,7 +376,6 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext

 bind_data.options.file_path = bind_data.files[0];
 file_handle = ReadCSV::OpenCSV(bind_data.options, context);
-
 idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
 return make_unique<ParallelCSVGlobalState>(context, move(file_handle), bind_data.files,
 context.db->NumberOfThreads(), bind_data.options.buffer_size,
@@ -379,12 +395,9 @@ public:
 CSVBufferRead previous_buffer;
 };

-unique_ptr<LocalTableFunctionState>
-
+unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
+GlobalTableFunctionState *global_state_p) {
 auto &csv_data = (ReadCSVData &)*input.bind_data;
-if (csv_data.single_threaded) {
-return nullptr;
-}
 auto &global_state = (ParallelCSVGlobalState &)*global_state_p;
 auto next_local_buffer = global_state.Next(context.client, csv_data);
 unique_ptr<ParallelCSVReader> csv_reader;
@@ -416,7 +429,6 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 if (!next_chunk) {
 break;
 }
-// csv_local_state.previous_buffer = csv_local_state.csv_reader->buffer;
 csv_local_state.csv_reader->SetBufferRead(move(next_chunk));
 }
 csv_local_state.csv_reader->ParseCSV(output);
@@ -434,91 +446,172 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 }
 }

-static idx_t CSVReaderGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
-LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
-auto &bind_data = (ReadCSVData &)*bind_data_p;
-if (bind_data.single_threaded) {
-return 0;
-}
-auto &data = (ParallelCSVLocalState &)*local_state;
-return data.csv_reader->buffer->batch_index;
-}
-
 //===--------------------------------------------------------------------===//
 // Single-Threaded CSV Reader
 //===--------------------------------------------------------------------===//
 struct SingleThreadedCSVState : public GlobalTableFunctionState {
-
-
+explicit SingleThreadedCSVState(idx_t total_files) : total_files(total_files), next_file(0), progress_in_files(0) {
+}
+
+mutex csv_lock;
+unique_ptr<BufferedCSVReader> initial_reader;
+//! The total number of files to read from
+idx_t total_files;
 //! The index of the next file to read (i.e. current file + 1)
-idx_t
-//!
-
-
-
+atomic<idx_t> next_file;
+//! How far along we are in reading the current set of open files
+//! This goes from [0...next_file] * 100
+atomic<idx_t> progress_in_files;
+//! The set of SQL types
+vector<LogicalType> sql_types;

 idx_t MaxThreads() const override {
-return
+return total_files;
+}
+
+double GetProgress(ReadCSVData &bind_data) const {
+D_ASSERT(total_files == bind_data.files.size());
+D_ASSERT(progress_in_files <= total_files * 100);
+return (double(progress_in_files) / double(total_files));
+}
+
+unique_ptr<BufferedCSVReader> GetCSVReader(ClientContext &context, ReadCSVData &bind_data, idx_t &file_index,
+idx_t &total_size) {
+BufferedCSVReaderOptions options;
+{
+lock_guard<mutex> l(csv_lock);
+if (initial_reader) {
+return move(initial_reader);
+}
+if (next_file >= total_files) {
+return nullptr;
+}
+options = bind_data.options;
+file_index = next_file;
+next_file++;
+}
+// reuse csv_readers was created during binding
+unique_ptr<BufferedCSVReader> result;
+if (options.union_by_name) {
+result = move(bind_data.union_readers[file_index]);
+} else {
+options.file_path = bind_data.files[file_index];
+result = make_unique<BufferedCSVReader>(context, move(options), sql_types);
+}
+total_size = result->file_handle->FileSize();
+return result;
+}
+};
+
+struct SingleThreadedCSVLocalState : public LocalTableFunctionState {
+public:
+explicit SingleThreadedCSVLocalState() : bytes_read(0), total_size(0), current_progress(0), file_index(0) {
 }
+
+//! The CSV reader
+unique_ptr<BufferedCSVReader> csv_reader;
+//! The current amount of bytes read by this reader
+idx_t bytes_read;
+//! The total amount of bytes in the file
+idx_t total_size;
+//! The current progress from 0..100
+idx_t current_progress;
+//! The file index of this reader
+idx_t file_index;
 };

 static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext &context,
 TableFunctionInitInput &input) {
 auto &bind_data = (ReadCSVData &)*input.bind_data;
-auto result = make_unique<SingleThreadedCSVState>();
+auto result = make_unique<SingleThreadedCSVState>(bind_data.files.size());
 if (bind_data.initial_reader) {
-result->
+result->initial_reader = move(bind_data.initial_reader);
 } else if (bind_data.files.empty()) {
 // This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
 return move(result);
 } else {
 bind_data.options.file_path = bind_data.files[0];
-result->
+result->initial_reader = make_unique<BufferedCSVReader>(context, bind_data.options, bind_data.sql_types);
+if (bind_data.options.auto_detect) {
+bind_data.options = result->initial_reader->options;
+}
+}
+if (!bind_data.options.union_by_name) {
+// if we are reading multiple files - run auto-detect only on the first file
+// UNLESS union by name is turned on - in that case we assume that different files have different schemas
+// as such, we need to re-run the auto detection on each file
+bind_data.options.auto_detect = false;
+}
+result->next_file = 1;
+if (result->initial_reader) {
+result->sql_types = result->initial_reader->sql_types;
 }
-
-
+return move(result);
+}
+
+unique_ptr<LocalTableFunctionState> SingleThreadedReadCSVInitLocal(ExecutionContext &context,
+TableFunctionInitInput &input,
+GlobalTableFunctionState *global_state_p) {
+auto &bind_data = (ReadCSVData &)*input.bind_data;
+auto &data = (SingleThreadedCSVState &)*global_state_p;
+auto result = make_unique<SingleThreadedCSVLocalState>();
+result->csv_reader = data.GetCSVReader(context.client, bind_data, result->file_index, result->total_size);
 return move(result);
 }

 static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
 auto &bind_data = (ReadCSVData &)*data_p.bind_data;
 auto &data = (SingleThreadedCSVState &)*data_p.global_state;
-
-if (!
+auto &lstate = (SingleThreadedCSVLocalState &)*data_p.local_state;
+if (!lstate.csv_reader) {
 // no csv_reader was set, this can happen when a filename-based filter has filtered out all possible files
 return;
 }

 do {
-
-
-
-
-
-
-
-
-
-
-
-
+lstate.csv_reader->ParseCSV(output);
+// update the number of bytes read
+D_ASSERT(lstate.bytes_read <= lstate.csv_reader->bytes_in_chunk);
+auto bytes_read = MinValue<idx_t>(lstate.total_size, lstate.csv_reader->bytes_in_chunk);
+auto current_progress = lstate.total_size == 0 ? 100 : 100 * bytes_read / lstate.total_size;
+if (current_progress > lstate.current_progress) {
+if (current_progress > 100) {
+throw InternalException("Progress should never exceed 100");
+}
+data.progress_in_files += current_progress - lstate.current_progress;
+lstate.current_progress = current_progress;
+}
+if (output.size() == 0) {
+// exhausted this file, but we might have more files we can read
+auto csv_reader = data.GetCSVReader(context, bind_data, lstate.file_index, lstate.total_size);
+// add any left-over progress for this file to the progress bar
+if (lstate.current_progress < 100) {
+data.progress_in_files += 100 - lstate.current_progress;
+}
+// reset the current progress
+lstate.current_progress = 0;
+lstate.bytes_read = 0;
+lstate.csv_reader = move(csv_reader);
+if (!lstate.csv_reader) {
+// no more files - we are done
+return;
 }
-
+lstate.bytes_read = 0;
 } else {
 break;
 }
 } while (true);

 if (bind_data.options.union_by_name) {
-
+lstate.csv_reader->SetNullUnionCols(output);
 }
 if (bind_data.options.include_file_name) {
 auto &col = output.data[bind_data.filename_col_idx];
-col.SetValue(0, Value(
+col.SetValue(0, Value(lstate.csv_reader->options.file_path));
 col.SetVectorType(VectorType::CONSTANT_VECTOR);
 }
 if (bind_data.options.include_parsed_hive_partitions) {
-auto partitions = HivePartitioning::Parse(
+auto partitions = HivePartitioning::Parse(lstate.csv_reader->options.file_path);

 idx_t i = bind_data.hive_partition_col_idx;

@@ -531,7 +624,7 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
 for (auto &part : partitions) {
 if (bind_data.options.names[i] != part.first) {
 throw IOException("Hive partition names mismatch, expected '" + bind_data.options.names[i] +
-"' but found '" + part.first + "' for file '" +
+"' but found '" + part.first + "' for file '" + lstate.csv_reader->options.file_path +
 "'");
 }
 auto &col = output.data[i++];
@@ -553,6 +646,16 @@ static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &con
 }
 }

+unique_ptr<LocalTableFunctionState> ReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
+GlobalTableFunctionState *global_state_p) {
+auto &csv_data = (ReadCSVData &)*input.bind_data;
+if (csv_data.single_threaded) {
+return SingleThreadedReadCSVInitLocal(context, input, global_state_p);
+} else {
+return ParallelReadCSVInitLocal(context, input, global_state_p);
+}
+}
+
 static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
 auto &bind_data = (ReadCSVData &)*data_p.bind_data;
 if (bind_data.single_threaded) {
@@ -562,6 +665,17 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
 }
 }

+static idx_t CSVReaderGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
+LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+auto &bind_data = (ReadCSVData &)*bind_data_p;
+if (bind_data.single_threaded) {
+auto &data = (SingleThreadedCSVLocalState &)*local_state;
+return data.file_index;
+}
+auto &data = (ParallelCSVLocalState &)*local_state;
+return data.csv_reader->buffer->batch_index;
+}
+
 static void ReadCSVAddNamedParameters(TableFunction &table_function) {
 table_function.named_parameters["sep"] = LogicalType::VARCHAR;
 table_function.named_parameters["delim"] = LogicalType::VARCHAR;
@@ -592,21 +706,13 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
 double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
 const GlobalTableFunctionState *global_state) {
 auto &bind_data = (ReadCSVData &)*bind_data_p;
-idx_t file_size, bytes_read;
 if (bind_data.single_threaded) {
-auto &data = (
-
-bytes_read = data.bytes_read;
+auto &data = (SingleThreadedCSVState &)*global_state;
+return data.GetProgress(bind_data);
 } else {
 auto &data = (const ParallelCSVGlobalState &)*global_state;
-
-bytes_read = data.bytes_read;
-}
-if (file_size == 0) {
-return 100;
+return data.GetProgress(bind_data);
 }
-auto percentage = (bytes_read * 100.0) / file_size;
-return percentage;
 }

 void CSVComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.6.2-
+#define DUCKDB_VERSION "0.6.2-dev750"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "412d8c5074"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/main/query_profiler.cpp
CHANGED

@@ -377,7 +377,7 @@ void QueryProfiler::QueryTreeToStream(std::ostream &ss) const {
 return;
 }

-if (context.client_data->http_stats) {
+if (context.client_data->http_stats && !context.client_data->http_stats->IsEmpty()) {
 string read =
 "in: " + StringUtil::BytesToHumanReadableString(context.client_data->http_stats->total_bytes_received);
 string written =