duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +124 -58
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
- package/src/duckdb/src/main/settings/settings.cpp +3 -4
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp

@@ -25,8 +25,9 @@
 namespace duckdb {
 
 ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
-                                     unique_ptr<CSVBufferRead> buffer_p,
-
+                                     unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
+                                     const vector<LogicalType> &requested_types)
+    : BaseCSVReader(context, std::move(options_p), requested_types), first_pos_first_buffer(first_pos_first_buffer_p) {
 	Initialize(requested_types);
 	SetBufferRead(std::move(buffer_p));
 	if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {

@@ -52,9 +53,34 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
 
+void ParallelCSVReader::SkipEmptyLines() {
+	idx_t new_pos_buffer = position_buffer;
+	if (parse_chunk.data.size() == 1) {
+		// Empty lines are null data.
+		return;
+	}
+	for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
+		if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
+			bool carrier_return = (*buffer)[new_pos_buffer] == '\r';
+			new_pos_buffer++;
+			if (carrier_return && new_pos_buffer < buffer_size && (*buffer)[new_pos_buffer] == '\n') {
+				position_buffer++;
+			}
+			if (new_pos_buffer > end_buffer) {
+				return;
+			}
+			position_buffer = new_pos_buffer;
+		} else if ((*buffer)[new_pos_buffer] != ' ') {
+			return;
+		}
+	}
+}
+
 bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
-	    start_buffer ==
+	    start_buffer == first_pos_first_buffer) {
+		start_buffer = buffer->buffer->GetStart();
+		position_buffer = start_buffer;
 		verification_positions.beginning_of_first_line = position_buffer;
 		verification_positions.end_of_last_line = position_buffer;
 		// First buffer doesn't need any setting
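
Note: the new SkipEmptyLines pass above advances the parser past blank or space-only lines so a thread's chunk never starts in the middle of padding. A minimal standalone sketch of the same scan, with a plain std::string standing in for the reader's CSVBufferRead and the single-column special case omitted:

    #include <cstddef>
    #include <string>

    // Returns the position of the first character that starts a real line,
    // skipping \n, \r\n and space-only prefixes (illustrative only).
    static size_t SkipEmptyLines(const std::string &data, size_t pos, size_t end) {
        size_t probe = pos;
        while (probe < end) {
            char c = data[probe];
            if (c == '\n' || c == '\r') {
                probe++;
                if (c == '\r' && probe < end && data[probe] == '\n') {
                    probe++; // consume the \n of a \r\n pair
                }
                pos = probe; // a full empty line was consumed; commit the position
            } else if (c == ' ') {
                probe++; // scan past padding without committing
            } else {
                break; // first real character: the next line starts at `pos`
            }
        }
        return pos;
    }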
@@ -70,11 +96,23 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 			if (position_buffer > end_buffer) {
 				return false;
 			}
+			SkipEmptyLines();
+			if (verification_positions.beginning_of_first_line == 0) {
+				verification_positions.beginning_of_first_line = position_buffer;
+			}
+
+			verification_positions.end_of_last_line = position_buffer;
 			return true;
 		}
 		}
 		return false;
 	}
+	SkipEmptyLines();
+	if (verification_positions.beginning_of_first_line == 0) {
+		verification_positions.beginning_of_first_line = position_buffer;
+	}
+
+	verification_positions.end_of_last_line = position_buffer;
 	return true;
 }
 
@@ -102,6 +140,11 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 			}
 		}
 	}
+	SkipEmptyLines();
+
+	if (position_buffer > buffer_size) {
+		break;
+	}
 
 	if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
 		break;
@@ -113,18 +156,20 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	}
 	idx_t position_set = position_buffer;
 	start_buffer = position_buffer;
-
 	// We check if we can add this line
 	// disable the projection pushdown while reading the first line
 	// otherwise the first line parsing can be influenced by which columns we are reading
 	auto column_ids = std::move(reader_data.column_ids);
 	auto column_mapping = std::move(reader_data.column_mapping);
 	InitializeProjection();
-	successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	try {
+		successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	} catch (...) {
+		successfully_read_first_line = false;
+	}
 	// restore the projection pushdown
 	reader_data.column_ids = std::move(column_ids);
 	reader_data.column_mapping = std::move(column_mapping);
-
 	end_buffer = end_buffer_real;
 	start_buffer = position_set;
 	if (position_buffer >= end_buffer) {
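
Note: SetPosition above now probes a candidate line boundary with TryParseSimpleCSV inside a try/catch, so an exception thrown mid-probe is treated the same as a failed parse rather than aborting the scan. The pattern in isolation (stand-in names; ParseLineAt is a hypothetical placeholder for the real probe):

    #include <stdexcept>

    // Stand-in for the speculative parse: throws on malformed input.
    static bool ParseLineAt(int offset) {
        if (offset < 0) {
            throw std::runtime_error("malformed line");
        }
        return offset % 2 == 0; // arbitrary demo predicate
    }

    // Any exception during the probe is downgraded to "not a valid line start",
    // so the caller simply tries the next candidate boundary.
    static bool TryProbeLine(int offset) {
        try {
            return ParseLineAt(offset);
        } catch (...) {
            return false;
        }
    }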
@@ -190,27 +235,55 @@ bool ParallelCSVReader::BufferRemainder() {
 	return true;
 }
 
+void VerifyLineLength(idx_t line_size, idx_t max_line_size) {
+	if (line_size > max_line_size) {
+		// FIXME: this should also output the correct estimated linenumber where it broke
+		throw InvalidInputException("Maximum line size of %llu bytes exceeded!", max_line_size);
+	}
+}
+
+bool AllNewLine(string_t value, idx_t column_amount) {
+	auto value_str = value.GetString();
+	if (value_str.empty() && column_amount == 1) {
+		// This is a one column (empty)
+		return false;
+	}
+	for (idx_t i = 0; i < value.GetSize(); i++) {
+		if (!StringUtil::CharacterIsNewline(value_str[i])) {
+			return false;
+		}
+	}
+	return true;
+}
+
 bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
 	// used for parsing algorithm
+	if (start_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
 	D_ASSERT(end_buffer <= buffer_size);
 	bool finished_chunk = false;
 	idx_t column = 0;
 	idx_t offset = 0;
 	bool has_quotes = false;
+
 	vector<idx_t> escape_positions;
 	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
 		if (!SetPosition(insert_chunk)) {
-			// This means the buffer size does not contain a new line
-			if (position_buffer - start_buffer == options.buffer_size) {
-				error_message = "Line does not fit in one buffer. Increase the buffer size.";
-				return false;
-			}
 			finished = true;
 			return true;
 		}
 	}
-
+	if (position_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
+	// Keep track of line size
+	idx_t line_start = position_buffer;
 	// start parsing the first value
 	goto value_start;
 
@@ -242,11 +315,16 @@ normal : {
 	if (c == options.delimiter[0]) {
 		// delimiter: end the value and add it to the chunk
 		goto add_value;
+	} else if (c == options.quote[0] && try_add_line) {
+		return false;
 	} else if (StringUtil::CharacterIsNewline(c)) {
 		// newline: add row
-		if (column > 0 || try_add_line ||
+		if (column > 0 || try_add_line || parse_chunk.data.size() == 1) {
 			goto add_row;
 		}
+		if (column == 0 && position_buffer == start_buffer) {
+			start_buffer++;
+		}
 	}
 }
 if (!BufferRemainder()) {

@@ -285,12 +363,15 @@ add_row : {
 		parse_chunk.Reset();
 		return success;
 	} else {
+		VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+		line_start = position_buffer;
 		finished_chunk = AddRow(insert_chunk, column, error_message);
 	}
 	// increase position by 1 and move start to the new position
 	offset = 0;
 	has_quotes = false;
-	start_buffer = ++position_buffer;
+	position_buffer++;
+	start_buffer = position_buffer;
 	verification_positions.end_of_last_line = position_buffer;
 	if (reached_remainder_state) {
 		goto final_state;

@@ -309,7 +390,10 @@ add_row : {
 	// newline after carriage return: skip
 	// increase position by 1 and move start to the new position
 	start_buffer = ++position_buffer;
+
+	SkipEmptyLines();
 	verification_positions.end_of_last_line = position_buffer;
+	start_buffer = position_buffer;
 	if (reached_remainder_state) {
 		goto final_state;
 	}

@@ -331,6 +415,9 @@ add_row : {
 	error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
 	return false;
 }
+SkipEmptyLines();
+verification_positions.end_of_last_line = position_buffer;
+start_buffer = position_buffer;
 // \n newline, move to value start
 if (finished_chunk) {
 	goto final_state;

@@ -391,7 +478,7 @@ unquote : {
 } else if (StringUtil::CharacterIsNewline(c)) {
 	offset = 1;
 	// FIXME: should this be an assertion?
-	D_ASSERT(column == parse_chunk.ColumnCount() - 1);
+	D_ASSERT(try_add_line || (!try_add_line && column == parse_chunk.ColumnCount() - 1));
 	goto add_row;
 } else if (position_buffer >= end_buffer) {
 	// reached end of buffer

@@ -448,22 +535,27 @@ final_state : {
 	}
 	// If this is the last buffer, we have to read the last value
 	if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
-		if (column > 0 ||
+		if (column > 0 || start_buffer != position_buffer || try_add_line ||
+		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk
 			auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
-
-
-
-
+			if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) {
+				AddValue(str_value, column, escape_positions, has_quotes);
+				if (try_add_line) {
+					bool success = column == return_types.size();
+					if (success) {
+						AddRow(insert_chunk, column, error_message);
+						success = Flush(insert_chunk);
+					}
+					parse_chunk.Reset();
+					reached_remainder_state = false;
+					return success;
+				} else {
+					VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+					line_start = position_buffer;
 					AddRow(insert_chunk, column, error_message);
-
+					verification_positions.end_of_last_line = position_buffer;
 				}
-			parse_chunk.Reset();
-			reached_remainder_state = false;
-			return success;
-		} else {
-			AddRow(insert_chunk, column, error_message);
-			verification_positions.end_of_last_line = position_buffer;
 			}
 		}
 	}

@@ -471,11 +563,14 @@ final_state : {
 	if (mode == ParserMode::PARSING) {
 		Flush(insert_chunk);
 	}
-	if (position_buffer
-	    !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+	if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
 		error_message = "Line does not fit in one buffer. Increase the buffer size.";
 		return false;
 	}
+	end_buffer = buffer_size;
+	SkipEmptyLines();
+	end_buffer = buffer->buffer_end;
+	verification_positions.end_of_last_line = position_buffer;
 	if (position_buffer >= end_buffer) {
 		if (position_buffer >= end_buffer) {
 			if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp

@@ -103,7 +103,7 @@ idx_t PhysicalTableScan::GetBatchIndex(ExecutionContext &context, DataChunk &chu
 }
 
 string PhysicalTableScan::GetName() const {
-	return StringUtil::Upper(function.name);
+	return StringUtil::Upper(function.name + " " + function.extra_info);
 }
 
 string PhysicalTableScan::ParamsToString() const {
package/src/duckdb/src/function/table/read_csv.cpp

@@ -12,6 +12,7 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/main/client_data.hpp"
 
 #include <limits>
 
@@ -23,21 +24,22 @@ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const string &file_path, FileCompress
 	auto opener = FileSystem::GetFileOpener(context);
 	auto file_handle =
 	    fs.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression, opener);
+	if (file_handle->CanSeek()) {
+		file_handle->Reset();
+	}
 	return make_uniq<CSVFileHandle>(std::move(file_handle));
 }
 
 void ReadCSVData::FinalizeRead(ClientContext &context) {
 	BaseCSVData::Finalize();
-
-	single_threaded = !config.options.experimental_parallel_csv_reader;
-	if (options.has_parallel) {
-		// Override the option set in the config
-		single_threaded = !options.use_parallel;
-	}
+	// Here we identify if we can run this CSV file on parallel or not.
 	bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
 	                     options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
 	bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
-
+	bool not_supported_options = options.null_padding;
+
+	if (!options.run_parallel || null_or_empty || not_supported_options || complex_options ||
+	    options.new_line == NewLineIdentifier::MIX) {
 		// not supported for parallel CSV reading
 		single_threaded = true;
 	}
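
Note: FinalizeRead above now decides single- versus multi-threaded reading purely from the CSV options. The rule restated as one predicate (illustrative names; the real logic sets single_threaded inside ReadCSVData::FinalizeRead):

    #include <string>

    struct CsvOpts {
        std::string delimiter = ",", quote = "\"", escape = "\"";
        bool run_parallel = true;    // the new `parallel` option
        bool null_padding = false;   // not yet supported in parallel
        bool mixed_newlines = false; // stands in for NewLineIdentifier::MIX
    };

    static bool CanReadInParallel(const CsvOpts &o) {
        bool null_or_empty = o.delimiter.empty() || o.escape.empty() || o.quote.empty() ||
                             o.delimiter[0] == '\0' || o.escape[0] == '\0' || o.quote[0] == '\0';
        bool complex = o.delimiter.size() > 1 || o.escape.size() > 1 || o.quote.size() > 1;
        return o.run_parallel && !null_or_empty && !complex && !o.null_padding && !o.mixed_newlines;
    }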
@@ -175,6 +177,8 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 		options.all_varchar = BooleanValue::Get(kv.second);
 	} else if (loption == "normalize_names") {
 		options.normalize_names = BooleanValue::Get(kv.second);
+	} else if (loption == "parallel") {
+		options.run_parallel = BooleanValue::Get(kv.second);
 	} else {
 		options.SetReadOption(loption, kv.second, names);
 	}
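
Note: with the `parallel` option registered above, a scan can opt out of (or into) the parallel reader per call. A hypothetical end-to-end use through the C++ API (assuming the option is exposed through read_csv_auto the same way as through read_csv; file name is illustrative):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        // Force the single-threaded CSV reader for this one scan.
        auto result = con.Query("SELECT * FROM read_csv_auto('data.csv', parallel=false)");
        result->Print();
        return 0;
    }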
@@ -214,6 +218,13 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 	if (options.file_options.union_by_name) {
 		result->reader_bind =
 		    MultiFileReader::BindUnionReader<BufferedCSVReader>(context, return_types, names, *result, options);
+		if (result->union_readers.size() > 1) {
+			result->column_info.emplace_back(result->csv_names, result->csv_types);
+			for (idx_t i = 1; i < result->union_readers.size(); i++) {
+				result->column_info.emplace_back(result->union_readers[i]->names,
+				                                 result->union_readers[i]->return_types);
+			}
+		}
 		if (!options.sql_types_per_column.empty()) {
 			auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
 			if (!exception.empty()) {

@@ -253,17 +264,27 @@ public:
 		file_size = file_handle->FileSize();
 		first_file_size = file_size;
 		bytes_read = 0;
-		if (buffer_size < file_size) {
+		if (buffer_size < file_size || file_size == 0) {
 			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
 		} else {
 			bytes_per_local_state = file_size / MaxThreads();
 		}
-
-
-
+		if (bytes_per_local_state == 0) {
+			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
+			// this boy needs to be at least one.
+			bytes_per_local_state = 1;
+		}
+		for (idx_t i = 0; i < rows_to_skip; i++) {
+			file_handle->ReadLine();
+		}
+		first_position = current_csv_position;
+		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+		next_buffer = shared_ptr<CSVBuffer>(
+		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		running_threads = MaxThreads();
 	}
 	ParallelCSVGlobalState() {
+		running_threads = MaxThreads();
 	}
 
 	~ParallelCSVGlobalState() override {
@@ -281,7 +302,7 @@ public:
 	//! Verify if the CSV File was read correctly
 	void Verify();
 
-	void UpdateVerification(VerificationPositions positions);
+	void UpdateVerification(VerificationPositions positions, idx_t file_number);
 
 	void IncrementThread();
 
@@ -332,14 +353,18 @@ private:
 	//! Current batch index
 	idx_t batch_index = 0;
 	//! Forces parallelism for small CSV Files, should only be used for testing.
-	bool force_parallelism;
+	bool force_parallelism = false;
 	//! Current (Global) position of CSV
 	idx_t current_csv_position = 0;
+	//! First Position of First Buffer
+	idx_t first_position = 0;
+	//! Current File Number
+	idx_t file_number = 0;
 	idx_t max_tuple_end = 0;
 	//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
 	//! positions where they started reading the first line.
-	vector<idx_t> tuple_end;
-	set<idx_t> tuple_start;
+	vector<vector<idx_t>> tuple_end;
+	vector<set<idx_t>> tuple_start;
 	idx_t running_threads = 0;
 	//! The column ids to read
 	vector<column_t> column_ids;

@@ -349,10 +374,9 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	if (force_parallelism) {
 		return system_threads;
 	}
-
 	idx_t one_mb = 1000000; // We initialize max one thread per Mb
 	idx_t threads_per_mb = first_file_size / one_mb + 1;
-	if (threads_per_mb < system_threads) {
+	if (threads_per_mb < system_threads || threads_per_mb == 1) {
 		return threads_per_mb;
 	}
 
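
Note: MaxThreads above caps parallelism at roughly one thread per MB of the first file; for example a 2.5 MB file yields 2500000 / 1000000 + 1 = 3 threads, and a 500 KB file yields 1. The heuristic in isolation, folded into a single clamp (a sketch, not the DuckDB code):

    #include <algorithm>
    #include <cstdint>

    static uint64_t MaxCsvThreads(uint64_t first_file_size, uint64_t system_threads) {
        const uint64_t one_mb = 1000000; // max one thread per MB
        uint64_t threads_per_mb = first_file_size / one_mb + 1;
        return std::min(threads_per_mb, system_threads);
    }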
@@ -378,25 +402,36 @@ bool ParallelCSVGlobalState::Finished() {
 void ParallelCSVGlobalState::Verify() {
 	// All threads are done, we run some magic sweet verification code
 	if (running_threads == 0) {
-
-
-
-
-
+		D_ASSERT(tuple_end.size() == tuple_start.size());
+		for (idx_t i = 0; i < tuple_start.size(); i++) {
+			auto &current_tuple_end = tuple_end[i];
+			auto &current_tuple_start = tuple_start[i];
+			// figure out max value of last_pos
+			if (current_tuple_end.empty()) {
+				return;
 			}
-
-
-
-
-
-
+			auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
+			for (auto &last_pos : current_tuple_end) {
+				auto first_pos = current_tuple_start.find(last_pos);
+				if (first_pos == current_tuple_start.end()) {
+					// this might be necessary due to carriage returns outside buffer scopes.
+					first_pos = current_tuple_start.find(last_pos + 1);
 				}
-
-
-
+				if (first_pos == current_tuple_start.end() && last_pos != max_value) {
+					string error =
+					    "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
+					    " does not have a match\n";
+					error += "End Lines: \n";
+					for (auto &end_line : current_tuple_end) {
+						error += to_string(end_line) + "\n";
+					}
+					error += "Start Lines: \n";
+					for (auto &start_line : current_tuple_start) {
+						error += to_string(start_line) + "\n";
+					}
+					throw InvalidInputException(
+					    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 				}
-		throw InvalidInputException(
-		    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 		}
 	}
 }
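
Note: the invariant Verify() enforces per file is that every position where some thread finished its last line must appear as a position where another thread started its first line (allowing +1 for a carriage return split across buffers); only the largest end position may be unmatched. A compact restatement (illustrative, not the DuckDB code):

    #include <algorithm>
    #include <cstdint>
    #include <set>
    #include <vector>

    static bool BoundariesConsistent(const std::vector<uint64_t> &tuple_end,
                                     const std::set<uint64_t> &tuple_start) {
        if (tuple_end.empty()) {
            return true;
        }
        uint64_t max_end = *std::max_element(tuple_end.begin(), tuple_end.end());
        for (auto end : tuple_end) {
            if (end == max_end || tuple_start.count(end) || tuple_start.count(end + 1)) {
                continue; // this chunk boundary was picked up by another reader
            }
            return false; // a line was lost or read twice: unsafe parallel read
        }
        return true;
    }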
@@ -411,9 +446,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 		current_file_path = bind_data.files[file_index++];
 		file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
 		current_csv_position = 0;
-
-
-
+		file_number++;
+		current_buffer =
+		    make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+		next_buffer = shared_ptr<CSVBuffer>(
+		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 	} else {
 		// We are done scanning.
 		reader.reset();

@@ -433,8 +470,8 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	current_buffer = next_buffer;
 	if (next_buffer) {
 		// Next buffer gets the next-next buffer
-		next_buffer =
-
+		next_buffer = shared_ptr<CSVBuffer>(
+		    next_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 	}
 }
 if (!reader || reader->options.file_path != current_file_path) {

@@ -443,13 +480,18 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	if (file_index > 0 && file_index <= bind_data.union_readers.size() && bind_data.union_readers[file_index - 1]) {
 		// we are doing UNION BY NAME - fetch the options from the union reader for this file
 		auto &union_reader = *bind_data.union_readers[file_index - 1];
-		reader =
-
+		reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
+		                                      union_reader.GetTypes());
 		reader->names = union_reader.GetNames();
+	} else if (file_index <= bind_data.column_info.size()) {
+		// Serialized Union By name
+		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+		                                      bind_data.column_info[file_index - 1].types);
+		reader->names = bind_data.column_info[file_index - 1].names;
 	} else {
 		// regular file - use the standard options
-		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result),
-
+		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+		                                      bind_data.csv_types);
 		reader->names = bind_data.csv_names;
 	}
 	reader->options.file_path = current_file_path;
@@ -461,14 +503,20 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	}
 	return true;
 }
-void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
+void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	if (positions.beginning_of_first_line < positions.end_of_last_line) {
 		if (positions.end_of_last_line > max_tuple_end) {
 			max_tuple_end = positions.end_of_last_line;
 		}
-		tuple_start.
-
+		while (file_number_p >= tuple_start.size()) {
+			vector<idx_t> empty_tuple_end;
+			set<idx_t> empty_set;
+			tuple_start.emplace_back(empty_set);
+			tuple_end.emplace_back(empty_tuple_end);
+		}
+		tuple_start[file_number_p].insert(positions.beginning_of_first_line);
+		tuple_end[file_number_p].push_back(positions.end_of_last_line);
 	}
 }
 
@@ -483,11 +531,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
 
 	bind_data.options.file_path = bind_data.files[0];
 	file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
-
-
-
-	    context.db->NumberOfThreads(), bind_data.options.buffer_size, rows_to_skip,
-	    ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
+	return make_uniq<ParallelCSVGlobalState>(
+	    context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
+	    bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
 }
 
 //===--------------------------------------------------------------------===//

@@ -534,11 +580,10 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 	}
 	if (csv_local_state.csv_reader->finished) {
 		auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
-		if (
-
-
+		if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
+			csv_global_state.UpdateVerification(verification_updates,
+			                                    csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
 		}
-		csv_global_state.UpdateVerification(verification_updates);
 		auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
 		if (!has_next) {
 			csv_global_state.DecrementThread();
@@ -642,14 +687,17 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
 		TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
 	auto result = make_uniq<SingleThreadedCSVState>(bind_data.files.size());
-	if (bind_data.
-		result->initial_reader = std::move(bind_data.initial_reader);
-	} else if (bind_data.files.empty()) {
+	if (bind_data.files.empty()) {
 		// This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
 		return std::move(result);
 	} else {
 		bind_data.options.file_path = bind_data.files[0];
-
+		if (bind_data.initial_reader && !bind_data.file_exists) {
+			// If this is not an on disk file we gotta reuse the reader.
+			result->initial_reader = std::move(bind_data.initial_reader);
+		} else {
+			result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+		}
 		if (!bind_data.options.file_options.union_by_name) {
 			result->initial_reader->names = bind_data.csv_names;
 		}
@@ -741,6 +789,14 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
 //===--------------------------------------------------------------------===//
 static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
+	auto &fs = FileSystem::GetFileSystem(context);
+	for (auto &file : bind_data.files) {
+		if (!fs.FileExists(file)) {
+			bind_data.file_exists = false;
+			break;
+		}
+	}
+	bind_data.single_threaded = bind_data.single_threaded || !bind_data.file_exists;
 	if (bind_data.single_threaded) {
 		return SingleThreadedCSVInit(context, input);
 	} else {
@@ -863,6 +919,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
 	writer.WriteField<idx_t>(buffer_sample_size);
 	writer.WriteString(null_str);
 	writer.WriteField<FileCompressionType>(compression);
+	writer.WriteField<NewLineIdentifier>(new_line);
 	// read options
 	writer.WriteField<idx_t>(skip_rows);
 	writer.WriteField<bool>(skip_rows_set);

@@ -896,6 +953,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
 	buffer_sample_size = reader.ReadRequired<idx_t>();
 	null_str = reader.ReadRequired<string>();
 	compression = reader.ReadRequired<FileCompressionType>();
+	new_line = reader.ReadRequired<NewLineIdentifier>();
 	// read options
 	skip_rows = reader.ReadRequired<idx_t>();
 	skip_rows_set = reader.ReadRequired<bool>();

@@ -926,6 +984,10 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
 	bind_data.options.Serialize(writer);
 	writer.WriteField<bool>(bind_data.single_threaded);
 	writer.WriteSerializable(bind_data.reader_bind);
+	writer.WriteField<uint32_t>(bind_data.column_info.size());
+	for (auto &col : bind_data.column_info) {
+		col.Serialize(writer);
+	}
 }
 
 static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,

@@ -941,6 +1003,10 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
 	result_data->options.Deserialize(reader);
 	result_data->single_threaded = reader.ReadField<bool>(true);
 	result_data->reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+	uint32_t file_number = reader.ReadRequired<uint32_t>();
+	for (idx_t i = 0; i < file_number; i++) {
+		result_data->column_info.emplace_back(ColumnInfo::Deserialize(reader));
+	}
 	return std::move(result_data);
 }
 
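
Note: the column_info round-trip above follows the usual length-prefixed pattern: write a count, then each element; on the way back, read the count and deserialize exactly that many. A generic sketch against toy writer/reader interfaces (FieldWriter/FieldReader stand-ins; illustrative only):

    #include <cstdint>
    #include <vector>

    template <class Writer, class T>
    void WriteList(Writer &writer, const std::vector<T> &items) {
        writer.template WriteField<uint32_t>(items.size()); // count first
        for (auto &item : items) {
            item.Serialize(writer); // then each element in order
        }
    }

    template <class Reader, class T>
    std::vector<T> ReadList(Reader &reader) {
        auto count = reader.template ReadRequired<uint32_t>();
        std::vector<T> items;
        for (uint32_t i = 0; i < count; i++) {
            items.push_back(T::Deserialize(reader)); // must mirror the write order
        }
        return items;
    }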
package/src/duckdb/src/function/table/version/pragma_version.cpp

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev2144"
+#define DUCKDB_VERSION "0.7.2-dev2233"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "c81600ed51"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"