duckdb 0.7.2-dev3294.0 → 0.7.2-dev3353.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +8 -8
- package/package.json +1 -1
- package/src/duckdb/src/execution/index/art/art.cpp +3 -0
- package/src/duckdb/src/execution/index/art/prefix.cpp +11 -11
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +56 -28
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -3
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +85 -37
- package/src/duckdb/src/function/table/read_csv.cpp +136 -37
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +11 -4
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +2 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp +40 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +28 -13
- package/src/duckdb/src/main/extension/extension_load.cpp +6 -5
- package/src/duckdb/src/optimizer/rule/move_constants.cpp +3 -0
- package/src/duckdb/src/storage/statistics/string_stats.cpp +1 -1
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +10837 -10795
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +6 -6
package/binding.gyp
CHANGED
@@ -237,18 +237,18 @@
|
|
237
237
|
"src/duckdb/third_party/zstd/compress/zstd_lazy.cpp",
|
238
238
|
"src/duckdb/third_party/zstd/compress/zstd_ldm.cpp",
|
239
239
|
"src/duckdb/third_party/zstd/compress/zstd_opt.cpp",
|
240
|
-
"src/duckdb/extension/icu/./icu-makedate.cpp",
|
241
|
-
"src/duckdb/extension/icu/./icu-datepart.cpp",
|
242
|
-
"src/duckdb/extension/icu/./icu-timebucket.cpp",
|
243
|
-
"src/duckdb/extension/icu/./icu-list-range.cpp",
|
244
|
-
"src/duckdb/extension/icu/./icu-table-range.cpp",
|
245
240
|
"src/duckdb/extension/icu/./icu-dateadd.cpp",
|
246
241
|
"src/duckdb/extension/icu/./icu-datetrunc.cpp",
|
247
|
-
"src/duckdb/extension/icu/./icu-
|
242
|
+
"src/duckdb/extension/icu/./icu-datesub.cpp",
|
243
|
+
"src/duckdb/extension/icu/./icu-table-range.cpp",
|
244
|
+
"src/duckdb/extension/icu/./icu-timebucket.cpp",
|
245
|
+
"src/duckdb/extension/icu/./icu-list-range.cpp",
|
246
|
+
"src/duckdb/extension/icu/./icu-datepart.cpp",
|
248
247
|
"src/duckdb/extension/icu/./icu-timezone.cpp",
|
249
|
-
"src/duckdb/extension/icu/./icu-strptime.cpp",
|
250
248
|
"src/duckdb/extension/icu/./icu-extension.cpp",
|
251
|
-
"src/duckdb/extension/icu/./icu-
|
249
|
+
"src/duckdb/extension/icu/./icu-makedate.cpp",
|
250
|
+
"src/duckdb/extension/icu/./icu-datefunc.cpp",
|
251
|
+
"src/duckdb/extension/icu/./icu-strptime.cpp",
|
252
252
|
"src/duckdb/ub_extension_icu_third_party_icu_common.cpp",
|
253
253
|
"src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp",
|
254
254
|
"src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp",
|
package/package.json
CHANGED
package/src/duckdb/src/execution/index/art/art.cpp
CHANGED
@@ -1039,6 +1039,9 @@ void ART::InitializeMerge(ARTFlags &flags) {
|
|
1039
1039
|
bool ART::MergeIndexes(IndexLock &state, Index &other_index) {
|
1040
1040
|
|
1041
1041
|
auto &other_art = other_index.Cast<ART>();
|
1042
|
+
if (!other_art.tree->IsSet()) {
|
1043
|
+
return true;
|
1044
|
+
}
|
1042
1045
|
|
1043
1046
|
if (tree->IsSet()) {
|
1044
1047
|
// fully deserialize other_index, and traverse it to increment its buffer IDs
|
package/src/duckdb/src/execution/index/art/prefix.cpp
CHANGED
@@ -159,23 +159,15 @@ void Prefix::Concatenate(ART &art, const uint8_t byte, const Prefix &other) {
|
|
159
159
|
return;
|
160
160
|
}
|
161
161
|
|
162
|
+
auto this_inlined = IsInlined();
|
162
163
|
auto this_count = count;
|
163
164
|
auto this_data = data;
|
164
165
|
Initialize();
|
165
166
|
|
166
|
-
// append the other prefix
|
167
|
+
// append the other prefix and possibly move the data to a segment
|
167
168
|
Append(art, other);
|
168
|
-
|
169
169
|
if (IsInlined()) {
|
170
|
-
|
171
|
-
reference<PrefixSegment> segment(MoveInlinedToSegment(art));
|
172
|
-
// append the byte
|
173
|
-
segment = segment.get().Append(art, count, byte);
|
174
|
-
// append this prefix
|
175
|
-
for (idx_t i = 0; i < this_count; i++) {
|
176
|
-
segment = segment.get().Append(art, count, this_data.inlined[i]);
|
177
|
-
}
|
178
|
-
return;
|
170
|
+
MoveInlinedToSegment(art);
|
179
171
|
}
|
180
172
|
|
181
173
|
// get the tail
|
@@ -183,6 +175,14 @@ void Prefix::Concatenate(ART &art, const uint8_t byte, const Prefix &other) {
|
|
183
175
|
// append the byte
|
184
176
|
segment = segment.get().Append(art, count, byte);
|
185
177
|
|
178
|
+
if (this_inlined) {
|
179
|
+
// append this prefix
|
180
|
+
for (idx_t i = 0; i < this_count; i++) {
|
181
|
+
segment = segment.get().Append(art, count, this_data.inlined[i]);
|
182
|
+
}
|
183
|
+
return;
|
184
|
+
}
|
185
|
+
|
186
186
|
// iterate all segments of this prefix, copy their data, and free them
|
187
187
|
auto this_ptr = this_data.ptr;
|
188
188
|
auto remaining = this_count;
|
package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp
CHANGED
@@ -17,6 +17,7 @@
|
|
17
17
|
#include "utf8proc.hpp"
|
18
18
|
#include "duckdb/parser/keyword_helper.hpp"
|
19
19
|
#include "duckdb/main/error_manager.hpp"
|
20
|
+
#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
|
20
21
|
|
21
22
|
#include <algorithm>
|
22
23
|
#include <cctype>
|
@@ -25,9 +26,10 @@
|
|
25
26
|
|
26
27
|
namespace duckdb {
|
27
28
|
|
28
|
-
string BaseCSVReader::GetLineNumberStr(idx_t
|
29
|
-
|
30
|
-
|
29
|
+
string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated, idx_t buffer_idx) {
|
30
|
+
// If an error happens during auto-detect it is an estimated line
|
31
|
+
string estimated = (is_line_estimated ? string(" (estimated)") : string(""));
|
32
|
+
return to_string(GetLineError(line_error, buffer_idx)) + estimated;
|
31
33
|
}
|
32
34
|
|
33
35
|
BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
|
@@ -165,40 +167,48 @@ struct TryCastTimestampOperator {
|
|
165
167
|
|
166
168
|
template <class OP, class T>
|
167
169
|
static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
168
|
-
idx_t count, string &error_message) {
|
170
|
+
idx_t count, string &error_message, idx_t &line_error) {
|
169
171
|
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
170
172
|
bool all_converted = true;
|
173
|
+
idx_t cur_line = 0;
|
171
174
|
UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
|
172
175
|
T result;
|
173
176
|
if (!OP::Operation(options, input, result, error_message)) {
|
177
|
+
line_error = cur_line;
|
174
178
|
all_converted = false;
|
175
179
|
}
|
180
|
+
cur_line++;
|
176
181
|
return result;
|
177
182
|
});
|
178
183
|
return all_converted;
|
179
184
|
}
|
180
185
|
|
181
186
|
bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
182
|
-
string &error_message) {
|
187
|
+
string &error_message, idx_t &line_error) {
|
183
188
|
return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
|
184
|
-
error_message);
|
189
|
+
error_message, line_error);
|
185
190
|
}
|
186
191
|
|
187
192
|
bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
188
193
|
string &error_message) {
|
194
|
+
idx_t line_error;
|
189
195
|
return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
|
190
|
-
count, error_message);
|
196
|
+
count, error_message, line_error);
|
191
197
|
}
|
192
198
|
|
193
199
|
template <class OP, class T>
|
194
200
|
bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
195
|
-
idx_t count, string &error_message) {
|
201
|
+
idx_t count, string &error_message, idx_t &line_error) {
|
196
202
|
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
197
203
|
bool all_converted = true;
|
204
|
+
idx_t row = 0;
|
198
205
|
UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
|
199
206
|
T result;
|
200
207
|
if (!OP::Operation(input, result, &error_message)) {
|
208
|
+
line_error = row;
|
201
209
|
all_converted = false;
|
210
|
+
} else {
|
211
|
+
row++;
|
202
212
|
}
|
203
213
|
return result;
|
204
214
|
});
|
@@ -226,7 +236,8 @@ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const Log
|
|
226
236
|
if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
|
227
237
|
// use the date format to cast the chunk
|
228
238
|
string error_message;
|
229
|
-
|
239
|
+
idx_t line_error;
|
240
|
+
return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, line_error);
|
230
241
|
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
|
231
242
|
// use the timestamp format to cast the chunk
|
232
243
|
string error_message;
|
@@ -238,7 +249,8 @@ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const Log
|
|
238
249
|
}
|
239
250
|
}
|
240
251
|
|
241
|
-
void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes
|
252
|
+
void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
|
253
|
+
idx_t buffer_idx) {
|
242
254
|
auto length = str_val.GetSize();
|
243
255
|
if (length == 0 && column == 0) {
|
244
256
|
row_empty = true;
|
@@ -260,7 +272,8 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
260
272
|
} else {
|
261
273
|
throw InvalidInputException(
|
262
274
|
"Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
|
263
|
-
GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(),
|
275
|
+
GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(), return_types.size(),
|
276
|
+
options.ToString());
|
264
277
|
}
|
265
278
|
}
|
266
279
|
|
@@ -301,7 +314,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
301
314
|
column++;
|
302
315
|
}
|
303
316
|
|
304
|
-
bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message) {
|
317
|
+
bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx) {
|
305
318
|
linenr++;
|
306
319
|
|
307
320
|
if (row_empty) {
|
@@ -338,8 +351,8 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
338
351
|
} else {
|
339
352
|
throw InvalidInputException(
|
340
353
|
"Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
|
341
|
-
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(),
|
342
|
-
options.ToString());
|
354
|
+
options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(),
|
355
|
+
return_types.size(), column, options.ToString());
|
343
356
|
}
|
344
357
|
}
|
345
358
|
}
|
@@ -363,7 +376,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
363
376
|
}
|
364
377
|
|
365
378
|
if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
|
366
|
-
Flush(insert_chunk);
|
379
|
+
Flush(insert_chunk, buffer_idx);
|
367
380
|
return true;
|
368
381
|
}
|
369
382
|
|
@@ -426,20 +439,21 @@ bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vecto
|
|
426
439
|
}
|
427
440
|
|
428
441
|
bool TryCastFloatingVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
429
|
-
idx_t count, string &error_message, const LogicalType &result_type
|
442
|
+
idx_t count, string &error_message, const LogicalType &result_type,
|
443
|
+
idx_t &line_error) {
|
430
444
|
switch (result_type.InternalType()) {
|
431
445
|
case PhysicalType::DOUBLE:
|
432
446
|
return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, double>(
|
433
|
-
options, input_vector, result_vector, count, error_message);
|
447
|
+
options, input_vector, result_vector, count, error_message, line_error);
|
434
448
|
case PhysicalType::FLOAT:
|
435
449
|
return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, float>(
|
436
|
-
options, input_vector, result_vector, count, error_message);
|
450
|
+
options, input_vector, result_vector, count, error_message, line_error);
|
437
451
|
default:
|
438
452
|
throw InternalException("Unimplemented physical type for floating");
|
439
453
|
}
|
440
454
|
}
|
441
455
|
|
442
|
-
bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
456
|
+
bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_add_line) {
|
443
457
|
if (parse_chunk.size() == 0) {
|
444
458
|
return true;
|
445
459
|
}
|
@@ -468,9 +482,12 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
468
482
|
} else {
|
469
483
|
string error_message;
|
470
484
|
bool success;
|
485
|
+
idx_t line_error = 0;
|
486
|
+
bool target_type_not_varchar = false;
|
471
487
|
if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
|
472
488
|
// use the date format to cast the chunk
|
473
|
-
success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message
|
489
|
+
success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message,
|
490
|
+
line_error);
|
474
491
|
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
|
475
492
|
// use the date format to cast the chunk
|
476
493
|
success =
|
@@ -478,12 +495,13 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
478
495
|
} else if (options.decimal_separator != "." &&
|
479
496
|
(type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) {
|
480
497
|
success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
|
481
|
-
error_message, type);
|
498
|
+
error_message, type, line_error);
|
482
499
|
} else if (options.decimal_separator != "." && type.id() == LogicalTypeId::DECIMAL) {
|
483
500
|
success = TryCastDecimalVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
|
484
501
|
error_message, type);
|
485
502
|
} else {
|
486
503
|
// target type is not varchar: perform a cast
|
504
|
+
target_type_not_varchar = true;
|
487
505
|
success =
|
488
506
|
VectorOperations::TryCast(context, parse_vector, result_vector, parse_chunk.size(), &error_message);
|
489
507
|
}
|
@@ -503,15 +521,25 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
503
521
|
}
|
504
522
|
|
505
523
|
// figure out the exact line number
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
524
|
+
if (target_type_not_varchar) {
|
525
|
+
UnifiedVectorFormat inserted_column_data;
|
526
|
+
result_vector.ToUnifiedFormat(parse_chunk.size(), inserted_column_data);
|
527
|
+
for (; line_error < parse_chunk.size(); line_error++) {
|
528
|
+
if (!inserted_column_data.validity.RowIsValid(line_error) &&
|
529
|
+
!FlatVector::IsNull(parse_vector, line_error)) {
|
530
|
+
break;
|
531
|
+
}
|
512
532
|
}
|
513
533
|
}
|
514
|
-
|
534
|
+
|
535
|
+
idx_t error_line;
|
536
|
+
// The line_error must be summed with linenr (All lines emmited from this batch)
|
537
|
+
// But subtracted from the parse_chunk
|
538
|
+
D_ASSERT(line_error + linenr >= parse_chunk.size());
|
539
|
+
line_error += linenr;
|
540
|
+
line_error -= parse_chunk.size();
|
541
|
+
|
542
|
+
error_line = GetLineError(line_error, buffer_idx);
|
515
543
|
|
516
544
|
if (options.auto_detect) {
|
517
545
|
throw InvalidInputException("%s in column %s, at line %llu.\n\nParser "
|
package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp
CHANGED
@@ -39,9 +39,6 @@ BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, Bu
|
|
39
39
|
Initialize(requested_types);
|
40
40
|
}
|
41
41
|
|
42
|
-
BufferedCSVReader::~BufferedCSVReader() {
|
43
|
-
}
|
44
|
-
|
45
42
|
enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
|
46
43
|
|
47
44
|
static bool StartsWithNumericDate(string &separator, const string &value) {
|
package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp
CHANGED
@@ -15,19 +15,20 @@
|
|
15
15
|
#include "utf8proc.hpp"
|
16
16
|
#include "duckdb/parser/keyword_helper.hpp"
|
17
17
|
#include "duckdb/function/table/read_csv.hpp"
|
18
|
+
#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
|
18
19
|
|
19
20
|
#include <algorithm>
|
20
21
|
#include <cctype>
|
21
22
|
#include <cstring>
|
22
23
|
#include <fstream>
|
23
|
-
#include <utility>
|
24
24
|
|
25
25
|
namespace duckdb {
|
26
26
|
|
27
27
|
ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
|
28
28
|
unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
|
29
|
-
const vector<LogicalType> &requested_types)
|
30
|
-
: BaseCSVReader(context, std::move(options_p), requested_types),
|
29
|
+
const vector<LogicalType> &requested_types, idx_t file_idx_p)
|
30
|
+
: BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p),
|
31
|
+
first_pos_first_buffer(first_pos_first_buffer_p) {
|
31
32
|
Initialize(requested_types);
|
32
33
|
SetBufferRead(std::move(buffer_p));
|
33
34
|
if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
|
@@ -35,9 +36,6 @@ ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOp
|
|
35
36
|
}
|
36
37
|
}
|
37
38
|
|
38
|
-
ParallelCSVReader::~ParallelCSVReader() {
|
39
|
-
}
|
40
|
-
|
41
39
|
void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
42
40
|
return_types = requested_types;
|
43
41
|
InitParseChunk(return_types.size());
|
@@ -76,7 +74,7 @@ void ParallelCSVReader::SkipEmptyLines() {
|
|
76
74
|
}
|
77
75
|
}
|
78
76
|
|
79
|
-
bool ParallelCSVReader::SetPosition(
|
77
|
+
bool ParallelCSVReader::SetPosition() {
|
80
78
|
if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
|
81
79
|
start_buffer == first_pos_first_buffer) {
|
82
80
|
start_buffer = buffer->buffer->GetStart();
|
@@ -84,7 +82,7 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
|
|
84
82
|
verification_positions.beginning_of_first_line = position_buffer;
|
85
83
|
verification_positions.end_of_last_line = position_buffer;
|
86
84
|
// First buffer doesn't need any setting
|
87
|
-
|
85
|
+
|
88
86
|
if (options.header) {
|
89
87
|
for (; position_buffer < end_buffer; position_buffer++) {
|
90
88
|
if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
@@ -205,10 +203,8 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
|
|
205
203
|
} else {
|
206
204
|
buffer_size = buffer_read_p->buffer->GetBufferSize();
|
207
205
|
}
|
208
|
-
linenr = buffer_read_p->estimated_linenr;
|
209
206
|
buffer = std::move(buffer_read_p);
|
210
207
|
|
211
|
-
linenr_estimated = true;
|
212
208
|
reached_remainder_state = false;
|
213
209
|
verification_positions.beginning_of_first_line = 0;
|
214
210
|
verification_positions.end_of_last_line = 0;
|
@@ -239,10 +235,12 @@ bool ParallelCSVReader::BufferRemainder() {
|
|
239
235
|
return true;
|
240
236
|
}
|
241
237
|
|
242
|
-
void VerifyLineLength(idx_t line_size
|
243
|
-
if (line_size >
|
244
|
-
|
245
|
-
|
238
|
+
void ParallelCSVReader::VerifyLineLength(idx_t line_size) {
|
239
|
+
if (line_size > options.maximum_line_size) {
|
240
|
+
throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!",
|
241
|
+
options.file_path,
|
242
|
+
GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(),
|
243
|
+
options.maximum_line_size);
|
246
244
|
}
|
247
245
|
}
|
248
246
|
|
@@ -261,6 +259,33 @@ bool AllNewLine(string_t value, idx_t column_amount) {
|
|
261
259
|
}
|
262
260
|
|
263
261
|
bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
|
262
|
+
// If line is not set, we have to figure it out, we assume whatever is in the first line
|
263
|
+
if (options.new_line == NewLineIdentifier::NOT_SET) {
|
264
|
+
idx_t cur_pos = position_buffer;
|
265
|
+
// we can start in the middle of a new line, so move a bit forward.
|
266
|
+
while (cur_pos < end_buffer) {
|
267
|
+
if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) {
|
268
|
+
cur_pos++;
|
269
|
+
} else {
|
270
|
+
break;
|
271
|
+
}
|
272
|
+
}
|
273
|
+
for (; cur_pos < end_buffer; cur_pos++) {
|
274
|
+
if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) {
|
275
|
+
bool carriage_return = (*buffer)[cur_pos] == '\r';
|
276
|
+
bool carriage_return_followed = false;
|
277
|
+
cur_pos++;
|
278
|
+
if (cur_pos < end_buffer) {
|
279
|
+
if (carriage_return && (*buffer)[cur_pos] == '\n') {
|
280
|
+
carriage_return_followed = true;
|
281
|
+
cur_pos++;
|
282
|
+
}
|
283
|
+
}
|
284
|
+
SetNewLineDelimiter(carriage_return, carriage_return_followed);
|
285
|
+
break;
|
286
|
+
}
|
287
|
+
}
|
288
|
+
}
|
264
289
|
// used for parsing algorithm
|
265
290
|
if (start_buffer == buffer_size) {
|
266
291
|
// Nothing to read
|
@@ -276,7 +301,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
|
|
276
301
|
vector<idx_t> escape_positions;
|
277
302
|
if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
|
278
303
|
// First time reading this buffer piece
|
279
|
-
if (!SetPosition(
|
304
|
+
if (!SetPosition()) {
|
280
305
|
finished = true;
|
281
306
|
return true;
|
282
307
|
}
|
@@ -340,7 +365,8 @@ normal : {
|
|
340
365
|
|
341
366
|
add_value : {
|
342
367
|
/* state: Add value to string vector */
|
343
|
-
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes
|
368
|
+
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes,
|
369
|
+
buffer->local_batch_index);
|
344
370
|
// increase position by 1 and move start to the new position
|
345
371
|
offset = 0;
|
346
372
|
has_quotes = false;
|
@@ -356,20 +382,23 @@ add_row : {
|
|
356
382
|
// check type of newline (\r or \n)
|
357
383
|
bool carriage_return = (*buffer)[position_buffer] == '\r';
|
358
384
|
|
359
|
-
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes
|
385
|
+
AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes,
|
386
|
+
buffer->local_batch_index);
|
360
387
|
if (try_add_line) {
|
361
388
|
bool success = column == insert_chunk.ColumnCount();
|
362
389
|
if (success) {
|
363
|
-
|
364
|
-
|
390
|
+
idx_t cur_linenr = linenr;
|
391
|
+
AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
392
|
+
success = Flush(insert_chunk, buffer->local_batch_index, true);
|
393
|
+
linenr = cur_linenr;
|
365
394
|
}
|
366
395
|
reached_remainder_state = false;
|
367
396
|
parse_chunk.Reset();
|
368
397
|
return success;
|
369
398
|
} else {
|
370
|
-
VerifyLineLength(position_buffer - line_start
|
399
|
+
VerifyLineLength(position_buffer - line_start);
|
371
400
|
line_start = position_buffer;
|
372
|
-
finished_chunk = AddRow(insert_chunk, column, error_message);
|
401
|
+
finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
373
402
|
}
|
374
403
|
// increase position by 1 and move start to the new position
|
375
404
|
offset = 0;
|
@@ -377,15 +406,12 @@ add_row : {
|
|
377
406
|
position_buffer++;
|
378
407
|
start_buffer = position_buffer;
|
379
408
|
verification_positions.end_of_last_line = position_buffer;
|
380
|
-
if (reached_remainder_state) {
|
381
|
-
goto final_state;
|
382
|
-
}
|
383
|
-
if (!BufferRemainder()) {
|
384
|
-
goto final_state;
|
385
|
-
}
|
386
409
|
if (carriage_return) {
|
387
410
|
// \r newline, go to special state that parses an optional \n afterwards
|
388
411
|
// optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
412
|
+
if (!BufferRemainder()) {
|
413
|
+
goto final_state;
|
414
|
+
}
|
389
415
|
if ((*buffer)[position_buffer] == '\n') {
|
390
416
|
if (options.new_line == NewLineIdentifier::SINGLE) {
|
391
417
|
error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
|
@@ -419,6 +445,12 @@ add_row : {
|
|
419
445
|
error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
|
420
446
|
return false;
|
421
447
|
}
|
448
|
+
if (reached_remainder_state) {
|
449
|
+
goto final_state;
|
450
|
+
}
|
451
|
+
if (!BufferRemainder()) {
|
452
|
+
goto final_state;
|
453
|
+
}
|
422
454
|
SkipEmptyLines();
|
423
455
|
verification_positions.end_of_last_line = position_buffer;
|
424
456
|
start_buffer = position_buffer;
|
@@ -451,7 +483,8 @@ in_quotes:
|
|
451
483
|
}
|
452
484
|
// still in quoted state at the end of the file or at the end of a buffer when running multithreaded, error:
|
453
485
|
throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
|
454
|
-
GetLineNumberStr(linenr, linenr_estimated).c_str(),
|
486
|
+
GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(),
|
487
|
+
options.ToString());
|
455
488
|
} else {
|
456
489
|
goto final_state;
|
457
490
|
}
|
@@ -492,7 +525,8 @@ unquote : {
|
|
492
525
|
error_message = StringUtil::Format(
|
493
526
|
"Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
|
494
527
|
"row or another quote. (%s). ",
|
495
|
-
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(),
|
528
|
+
options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(),
|
529
|
+
options.ToString());
|
496
530
|
return false;
|
497
531
|
}
|
498
532
|
}
|
@@ -506,13 +540,13 @@ handle_escape : {
|
|
506
540
|
if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
|
507
541
|
error_message = StringUtil::Format(
|
508
542
|
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
509
|
-
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
543
|
+
GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
|
510
544
|
return false;
|
511
545
|
}
|
512
546
|
if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
|
513
547
|
error_message = StringUtil::Format(
|
514
548
|
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
515
|
-
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
549
|
+
GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
|
516
550
|
return false;
|
517
551
|
}
|
518
552
|
// escape was followed by quote or escape, go back to quoted state
|
@@ -535,6 +569,7 @@ final_state : {
|
|
535
569
|
finished = true;
|
536
570
|
}
|
537
571
|
}
|
572
|
+
buffer->lines_read += insert_chunk.size();
|
538
573
|
return true;
|
539
574
|
}
|
540
575
|
// If this is the last buffer, we have to read the last value
|
@@ -544,20 +579,22 @@ final_state : {
|
|
544
579
|
// remaining values to be added to the chunk
|
545
580
|
auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
|
546
581
|
if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) {
|
547
|
-
AddValue(str_value, column, escape_positions, has_quotes);
|
582
|
+
AddValue(str_value, column, escape_positions, has_quotes, buffer->local_batch_index);
|
548
583
|
if (try_add_line) {
|
549
584
|
bool success = column == return_types.size();
|
550
585
|
if (success) {
|
551
|
-
|
552
|
-
|
586
|
+
auto cur_linenr = linenr;
|
587
|
+
AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
588
|
+
success = Flush(insert_chunk, buffer->local_batch_index);
|
589
|
+
linenr = cur_linenr;
|
553
590
|
}
|
554
591
|
parse_chunk.Reset();
|
555
592
|
reached_remainder_state = false;
|
556
593
|
return success;
|
557
594
|
} else {
|
558
|
-
VerifyLineLength(position_buffer - line_start
|
595
|
+
VerifyLineLength(position_buffer - line_start);
|
559
596
|
line_start = position_buffer;
|
560
|
-
AddRow(insert_chunk, column, error_message);
|
597
|
+
AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
561
598
|
verification_positions.end_of_last_line = position_buffer;
|
562
599
|
}
|
563
600
|
}
|
@@ -565,7 +602,8 @@ final_state : {
|
|
565
602
|
}
|
566
603
|
// flush the parsed chunk and finalize parsing
|
567
604
|
if (mode == ParserMode::PARSING) {
|
568
|
-
Flush(insert_chunk);
|
605
|
+
Flush(insert_chunk, buffer->local_batch_index);
|
606
|
+
buffer->lines_read += insert_chunk.size();
|
569
607
|
}
|
570
608
|
if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
|
571
609
|
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
@@ -597,6 +635,16 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
|
|
597
635
|
}
|
598
636
|
}
|
599
637
|
|
638
|
+
idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx) {
|
639
|
+
|
640
|
+
while (true) {
|
641
|
+
if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
|
642
|
+
auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart();
|
643
|
+
return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false);
|
644
|
+
}
|
645
|
+
}
|
646
|
+
}
|
647
|
+
|
600
648
|
bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
|
601
649
|
DataChunk dummy_chunk;
|
602
650
|
string error_message;
|