duckdb 0.7.2-dev3294.0 → 0.7.2-dev3353.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/binding.gyp CHANGED
@@ -237,18 +237,18 @@
237
237
  "src/duckdb/third_party/zstd/compress/zstd_lazy.cpp",
238
238
  "src/duckdb/third_party/zstd/compress/zstd_ldm.cpp",
239
239
  "src/duckdb/third_party/zstd/compress/zstd_opt.cpp",
240
- "src/duckdb/extension/icu/./icu-makedate.cpp",
241
- "src/duckdb/extension/icu/./icu-datepart.cpp",
242
- "src/duckdb/extension/icu/./icu-timebucket.cpp",
243
- "src/duckdb/extension/icu/./icu-list-range.cpp",
244
- "src/duckdb/extension/icu/./icu-table-range.cpp",
245
240
  "src/duckdb/extension/icu/./icu-dateadd.cpp",
246
241
  "src/duckdb/extension/icu/./icu-datetrunc.cpp",
247
- "src/duckdb/extension/icu/./icu-datefunc.cpp",
242
+ "src/duckdb/extension/icu/./icu-datesub.cpp",
243
+ "src/duckdb/extension/icu/./icu-table-range.cpp",
244
+ "src/duckdb/extension/icu/./icu-timebucket.cpp",
245
+ "src/duckdb/extension/icu/./icu-list-range.cpp",
246
+ "src/duckdb/extension/icu/./icu-datepart.cpp",
248
247
  "src/duckdb/extension/icu/./icu-timezone.cpp",
249
- "src/duckdb/extension/icu/./icu-strptime.cpp",
250
248
  "src/duckdb/extension/icu/./icu-extension.cpp",
251
- "src/duckdb/extension/icu/./icu-datesub.cpp",
249
+ "src/duckdb/extension/icu/./icu-makedate.cpp",
250
+ "src/duckdb/extension/icu/./icu-datefunc.cpp",
251
+ "src/duckdb/extension/icu/./icu-strptime.cpp",
252
252
  "src/duckdb/ub_extension_icu_third_party_icu_common.cpp",
253
253
  "src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp",
254
254
  "src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp",
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.7.2-dev3294.0",
5
+ "version": "0.7.2-dev3353.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -1039,6 +1039,9 @@ void ART::InitializeMerge(ARTFlags &flags) {
1039
1039
  bool ART::MergeIndexes(IndexLock &state, Index &other_index) {
1040
1040
 
1041
1041
  auto &other_art = other_index.Cast<ART>();
1042
+ if (!other_art.tree->IsSet()) {
1043
+ return true;
1044
+ }
1042
1045
 
1043
1046
  if (tree->IsSet()) {
1044
1047
  // fully deserialize other_index, and traverse it to increment its buffer IDs
@@ -159,23 +159,15 @@ void Prefix::Concatenate(ART &art, const uint8_t byte, const Prefix &other) {
159
159
  return;
160
160
  }
161
161
 
162
+ auto this_inlined = IsInlined();
162
163
  auto this_count = count;
163
164
  auto this_data = data;
164
165
  Initialize();
165
166
 
166
- // append the other prefix
167
+ // append the other prefix and possibly move the data to a segment
167
168
  Append(art, other);
168
-
169
169
  if (IsInlined()) {
170
- // move to a segment
171
- reference<PrefixSegment> segment(MoveInlinedToSegment(art));
172
- // append the byte
173
- segment = segment.get().Append(art, count, byte);
174
- // append this prefix
175
- for (idx_t i = 0; i < this_count; i++) {
176
- segment = segment.get().Append(art, count, this_data.inlined[i]);
177
- }
178
- return;
170
+ MoveInlinedToSegment(art);
179
171
  }
180
172
 
181
173
  // get the tail
@@ -183,6 +175,14 @@ void Prefix::Concatenate(ART &art, const uint8_t byte, const Prefix &other) {
183
175
  // append the byte
184
176
  segment = segment.get().Append(art, count, byte);
185
177
 
178
+ if (this_inlined) {
179
+ // append this prefix
180
+ for (idx_t i = 0; i < this_count; i++) {
181
+ segment = segment.get().Append(art, count, this_data.inlined[i]);
182
+ }
183
+ return;
184
+ }
185
+
186
186
  // iterate all segments of this prefix, copy their data, and free them
187
187
  auto this_ptr = this_data.ptr;
188
188
  auto remaining = this_count;
@@ -17,6 +17,7 @@
17
17
  #include "utf8proc.hpp"
18
18
  #include "duckdb/parser/keyword_helper.hpp"
19
19
  #include "duckdb/main/error_manager.hpp"
20
+ #include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
20
21
 
21
22
  #include <algorithm>
22
23
  #include <cctype>
@@ -25,9 +26,10 @@
25
26
 
26
27
  namespace duckdb {
27
28
 
28
- string BaseCSVReader::GetLineNumberStr(idx_t linenr, bool linenr_estimated) {
29
- string estimated = (linenr_estimated ? string(" (estimated)") : string(""));
30
- return to_string(linenr + 1) + estimated;
29
+ string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated, idx_t buffer_idx) {
30
+ // If an error happens during auto-detect it is an estimated line
31
+ string estimated = (is_line_estimated ? string(" (estimated)") : string(""));
32
+ return to_string(GetLineError(line_error, buffer_idx)) + estimated;
31
33
  }
32
34
 
33
35
  BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
@@ -165,40 +167,48 @@ struct TryCastTimestampOperator {
165
167
 
166
168
  template <class OP, class T>
167
169
  static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
168
- idx_t count, string &error_message) {
170
+ idx_t count, string &error_message, idx_t &line_error) {
169
171
  D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
170
172
  bool all_converted = true;
173
+ idx_t cur_line = 0;
171
174
  UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
172
175
  T result;
173
176
  if (!OP::Operation(options, input, result, error_message)) {
177
+ line_error = cur_line;
174
178
  all_converted = false;
175
179
  }
180
+ cur_line++;
176
181
  return result;
177
182
  });
178
183
  return all_converted;
179
184
  }
180
185
 
181
186
  bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
182
- string &error_message) {
187
+ string &error_message, idx_t &line_error) {
183
188
  return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
184
- error_message);
189
+ error_message, line_error);
185
190
  }
186
191
 
187
192
  bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
188
193
  string &error_message) {
194
+ idx_t line_error;
189
195
  return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
190
- count, error_message);
196
+ count, error_message, line_error);
191
197
  }
192
198
 
193
199
  template <class OP, class T>
194
200
  bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
195
- idx_t count, string &error_message) {
201
+ idx_t count, string &error_message, idx_t &line_error) {
196
202
  D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
197
203
  bool all_converted = true;
204
+ idx_t row = 0;
198
205
  UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
199
206
  T result;
200
207
  if (!OP::Operation(input, result, &error_message)) {
208
+ line_error = row;
201
209
  all_converted = false;
210
+ } else {
211
+ row++;
202
212
  }
203
213
  return result;
204
214
  });
@@ -226,7 +236,8 @@ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const Log
226
236
  if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
227
237
  // use the date format to cast the chunk
228
238
  string error_message;
229
- return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message);
239
+ idx_t line_error;
240
+ return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, line_error);
230
241
  } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
231
242
  // use the timestamp format to cast the chunk
232
243
  string error_message;
@@ -238,7 +249,8 @@ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const Log
238
249
  }
239
250
  }
240
251
 
241
- void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes) {
252
+ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
253
+ idx_t buffer_idx) {
242
254
  auto length = str_val.GetSize();
243
255
  if (length == 0 && column == 0) {
244
256
  row_empty = true;
@@ -260,7 +272,8 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
260
272
  } else {
261
273
  throw InvalidInputException(
262
274
  "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
263
- GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), options.ToString());
275
+ GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(), return_types.size(),
276
+ options.ToString());
264
277
  }
265
278
  }
266
279
 
@@ -301,7 +314,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
301
314
  column++;
302
315
  }
303
316
 
304
- bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message) {
317
+ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx) {
305
318
  linenr++;
306
319
 
307
320
  if (row_empty) {
@@ -338,8 +351,8 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
338
351
  } else {
339
352
  throw InvalidInputException(
340
353
  "Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
341
- options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), column,
342
- options.ToString());
354
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(),
355
+ return_types.size(), column, options.ToString());
343
356
  }
344
357
  }
345
358
  }
@@ -363,7 +376,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
363
376
  }
364
377
 
365
378
  if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
366
- Flush(insert_chunk);
379
+ Flush(insert_chunk, buffer_idx);
367
380
  return true;
368
381
  }
369
382
 
@@ -426,20 +439,21 @@ bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vecto
426
439
  }
427
440
 
428
441
  bool TryCastFloatingVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
429
- idx_t count, string &error_message, const LogicalType &result_type) {
442
+ idx_t count, string &error_message, const LogicalType &result_type,
443
+ idx_t &line_error) {
430
444
  switch (result_type.InternalType()) {
431
445
  case PhysicalType::DOUBLE:
432
446
  return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, double>(
433
- options, input_vector, result_vector, count, error_message);
447
+ options, input_vector, result_vector, count, error_message, line_error);
434
448
  case PhysicalType::FLOAT:
435
449
  return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, float>(
436
- options, input_vector, result_vector, count, error_message);
450
+ options, input_vector, result_vector, count, error_message, line_error);
437
451
  default:
438
452
  throw InternalException("Unimplemented physical type for floating");
439
453
  }
440
454
  }
441
455
 
442
- bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
456
+ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_add_line) {
443
457
  if (parse_chunk.size() == 0) {
444
458
  return true;
445
459
  }
@@ -468,9 +482,12 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
468
482
  } else {
469
483
  string error_message;
470
484
  bool success;
485
+ idx_t line_error = 0;
486
+ bool target_type_not_varchar = false;
471
487
  if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
472
488
  // use the date format to cast the chunk
473
- success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message);
489
+ success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message,
490
+ line_error);
474
491
  } else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
475
492
  // use the timestamp format to cast the chunk
476
493
  success =
@@ -478,12 +495,13 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
478
495
  } else if (options.decimal_separator != "." &&
479
496
  (type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) {
480
497
  success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
481
- error_message, type);
498
+ error_message, type, line_error);
482
499
  } else if (options.decimal_separator != "." && type.id() == LogicalTypeId::DECIMAL) {
483
500
  success = TryCastDecimalVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
484
501
  error_message, type);
485
502
  } else {
486
503
  // target type is not varchar: perform a cast
504
+ target_type_not_varchar = true;
487
505
  success =
488
506
  VectorOperations::TryCast(context, parse_vector, result_vector, parse_chunk.size(), &error_message);
489
507
  }
@@ -503,15 +521,25 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
503
521
  }
504
522
 
505
523
  // figure out the exact line number
506
- UnifiedVectorFormat inserted_column_data;
507
- result_vector.ToUnifiedFormat(parse_chunk.size(), inserted_column_data);
508
- idx_t row_idx;
509
- for (row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
510
- if (!inserted_column_data.validity.RowIsValid(row_idx) && !FlatVector::IsNull(parse_vector, row_idx)) {
511
- break;
524
+ if (target_type_not_varchar) {
525
+ UnifiedVectorFormat inserted_column_data;
526
+ result_vector.ToUnifiedFormat(parse_chunk.size(), inserted_column_data);
527
+ for (; line_error < parse_chunk.size(); line_error++) {
528
+ if (!inserted_column_data.validity.RowIsValid(line_error) &&
529
+ !FlatVector::IsNull(parse_vector, line_error)) {
530
+ break;
531
+ }
512
532
  }
513
533
  }
514
- auto error_line = linenr - (parse_chunk.size() - row_idx) + 1;
534
+
535
+ idx_t error_line;
536
+ // The line_error must be summed with linenr (all lines emitted from this batch)
537
+ // But subtracted from the parse_chunk
538
+ D_ASSERT(line_error + linenr >= parse_chunk.size());
539
+ line_error += linenr;
540
+ line_error -= parse_chunk.size();
541
+
542
+ error_line = GetLineError(line_error, buffer_idx);
515
543
 
516
544
  if (options.auto_detect) {
517
545
  throw InvalidInputException("%s in column %s, at line %llu.\n\nParser "
@@ -39,9 +39,6 @@ BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, Bu
39
39
  Initialize(requested_types);
40
40
  }
41
41
 
42
- BufferedCSVReader::~BufferedCSVReader() {
43
- }
44
-
45
42
  enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 };
46
43
 
47
44
  static bool StartsWithNumericDate(string &separator, const string &value) {
@@ -15,19 +15,20 @@
15
15
  #include "utf8proc.hpp"
16
16
  #include "duckdb/parser/keyword_helper.hpp"
17
17
  #include "duckdb/function/table/read_csv.hpp"
18
+ #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
18
19
 
19
20
  #include <algorithm>
20
21
  #include <cctype>
21
22
  #include <cstring>
22
23
  #include <fstream>
23
- #include <utility>
24
24
 
25
25
  namespace duckdb {
26
26
 
27
27
  ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
28
28
  unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
29
- const vector<LogicalType> &requested_types)
30
- : BaseCSVReader(context, std::move(options_p), requested_types), first_pos_first_buffer(first_pos_first_buffer_p) {
29
+ const vector<LogicalType> &requested_types, idx_t file_idx_p)
30
+ : BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p),
31
+ first_pos_first_buffer(first_pos_first_buffer_p) {
31
32
  Initialize(requested_types);
32
33
  SetBufferRead(std::move(buffer_p));
33
34
  if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
@@ -35,9 +36,6 @@ ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOp
35
36
  }
36
37
  }
37
38
 
38
- ParallelCSVReader::~ParallelCSVReader() {
39
- }
40
-
41
39
  void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
42
40
  return_types = requested_types;
43
41
  InitParseChunk(return_types.size());
@@ -76,7 +74,7 @@ void ParallelCSVReader::SkipEmptyLines() {
76
74
  }
77
75
  }
78
76
 
79
- bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
77
+ bool ParallelCSVReader::SetPosition() {
80
78
  if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
81
79
  start_buffer == first_pos_first_buffer) {
82
80
  start_buffer = buffer->buffer->GetStart();
@@ -84,7 +82,7 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
84
82
  verification_positions.beginning_of_first_line = position_buffer;
85
83
  verification_positions.end_of_last_line = position_buffer;
86
84
  // First buffer doesn't need any setting
87
- // Unless we have a header
85
+
88
86
  if (options.header) {
89
87
  for (; position_buffer < end_buffer; position_buffer++) {
90
88
  if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
@@ -205,10 +203,8 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
205
203
  } else {
206
204
  buffer_size = buffer_read_p->buffer->GetBufferSize();
207
205
  }
208
- linenr = buffer_read_p->estimated_linenr;
209
206
  buffer = std::move(buffer_read_p);
210
207
 
211
- linenr_estimated = true;
212
208
  reached_remainder_state = false;
213
209
  verification_positions.beginning_of_first_line = 0;
214
210
  verification_positions.end_of_last_line = 0;
@@ -239,10 +235,12 @@ bool ParallelCSVReader::BufferRemainder() {
239
235
  return true;
240
236
  }
241
237
 
242
- void VerifyLineLength(idx_t line_size, idx_t max_line_size) {
243
- if (line_size > max_line_size) {
244
- // FIXME: this should also output the correct estimated linenumber where it broke
245
- throw InvalidInputException("Maximum line size of %llu bytes exceeded!", max_line_size);
238
+ void ParallelCSVReader::VerifyLineLength(idx_t line_size) {
239
+ if (line_size > options.maximum_line_size) {
240
+ throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!",
241
+ options.file_path,
242
+ GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(),
243
+ options.maximum_line_size);
246
244
  }
247
245
  }
248
246
 
@@ -261,6 +259,33 @@ bool AllNewLine(string_t value, idx_t column_amount) {
261
259
  }
262
260
 
263
261
  bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
262
+ // If the newline identifier is not set, we have to figure it out: we assume whatever terminates the first line
263
+ if (options.new_line == NewLineIdentifier::NOT_SET) {
264
+ idx_t cur_pos = position_buffer;
265
+ // we can start in the middle of a new line, so move a bit forward.
266
+ while (cur_pos < end_buffer) {
267
+ if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) {
268
+ cur_pos++;
269
+ } else {
270
+ break;
271
+ }
272
+ }
273
+ for (; cur_pos < end_buffer; cur_pos++) {
274
+ if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) {
275
+ bool carriage_return = (*buffer)[cur_pos] == '\r';
276
+ bool carriage_return_followed = false;
277
+ cur_pos++;
278
+ if (cur_pos < end_buffer) {
279
+ if (carriage_return && (*buffer)[cur_pos] == '\n') {
280
+ carriage_return_followed = true;
281
+ cur_pos++;
282
+ }
283
+ }
284
+ SetNewLineDelimiter(carriage_return, carriage_return_followed);
285
+ break;
286
+ }
287
+ }
288
+ }
264
289
  // used for parsing algorithm
265
290
  if (start_buffer == buffer_size) {
266
291
  // Nothing to read
@@ -276,7 +301,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
276
301
  vector<idx_t> escape_positions;
277
302
  if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
278
303
  // First time reading this buffer piece
279
- if (!SetPosition(insert_chunk)) {
304
+ if (!SetPosition()) {
280
305
  finished = true;
281
306
  return true;
282
307
  }
@@ -340,7 +365,8 @@ normal : {
340
365
 
341
366
  add_value : {
342
367
  /* state: Add value to string vector */
343
- AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
368
+ AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes,
369
+ buffer->local_batch_index);
344
370
  // increase position by 1 and move start to the new position
345
371
  offset = 0;
346
372
  has_quotes = false;
@@ -356,20 +382,23 @@ add_row : {
356
382
  // check type of newline (\r or \n)
357
383
  bool carriage_return = (*buffer)[position_buffer] == '\r';
358
384
 
359
- AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
385
+ AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes,
386
+ buffer->local_batch_index);
360
387
  if (try_add_line) {
361
388
  bool success = column == insert_chunk.ColumnCount();
362
389
  if (success) {
363
- AddRow(insert_chunk, column, error_message);
364
- success = Flush(insert_chunk);
390
+ idx_t cur_linenr = linenr;
391
+ AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
392
+ success = Flush(insert_chunk, buffer->local_batch_index, true);
393
+ linenr = cur_linenr;
365
394
  }
366
395
  reached_remainder_state = false;
367
396
  parse_chunk.Reset();
368
397
  return success;
369
398
  } else {
370
- VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
399
+ VerifyLineLength(position_buffer - line_start);
371
400
  line_start = position_buffer;
372
- finished_chunk = AddRow(insert_chunk, column, error_message);
401
+ finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
373
402
  }
374
403
  // increase position by 1 and move start to the new position
375
404
  offset = 0;
@@ -377,15 +406,12 @@ add_row : {
377
406
  position_buffer++;
378
407
  start_buffer = position_buffer;
379
408
  verification_positions.end_of_last_line = position_buffer;
380
- if (reached_remainder_state) {
381
- goto final_state;
382
- }
383
- if (!BufferRemainder()) {
384
- goto final_state;
385
- }
386
409
  if (carriage_return) {
387
410
  // \r newline, go to special state that parses an optional \n afterwards
388
411
  // optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
412
+ if (!BufferRemainder()) {
413
+ goto final_state;
414
+ }
389
415
  if ((*buffer)[position_buffer] == '\n') {
390
416
  if (options.new_line == NewLineIdentifier::SINGLE) {
391
417
  error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
@@ -419,6 +445,12 @@ add_row : {
419
445
  error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
420
446
  return false;
421
447
  }
448
+ if (reached_remainder_state) {
449
+ goto final_state;
450
+ }
451
+ if (!BufferRemainder()) {
452
+ goto final_state;
453
+ }
422
454
  SkipEmptyLines();
423
455
  verification_positions.end_of_last_line = position_buffer;
424
456
  start_buffer = position_buffer;
@@ -451,7 +483,8 @@ in_quotes:
451
483
  }
452
484
  // still in quoted state at the end of the file or at the end of a buffer when running multithreaded, error:
453
485
  throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
454
- GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
486
+ GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(),
487
+ options.ToString());
455
488
  } else {
456
489
  goto final_state;
457
490
  }
@@ -492,7 +525,8 @@ unquote : {
492
525
  error_message = StringUtil::Format(
493
526
  "Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
494
527
  "row or another quote. (%s). ",
495
- options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
528
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(),
529
+ options.ToString());
496
530
  return false;
497
531
  }
498
532
  }
@@ -506,13 +540,13 @@ handle_escape : {
506
540
  if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
507
541
  error_message = StringUtil::Format(
508
542
  "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
509
- GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
543
+ GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
510
544
  return false;
511
545
  }
512
546
  if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
513
547
  error_message = StringUtil::Format(
514
548
  "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
515
- GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
549
+ GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
516
550
  return false;
517
551
  }
518
552
  // escape was followed by quote or escape, go back to quoted state
@@ -535,6 +569,7 @@ final_state : {
535
569
  finished = true;
536
570
  }
537
571
  }
572
+ buffer->lines_read += insert_chunk.size();
538
573
  return true;
539
574
  }
540
575
  // If this is the last buffer, we have to read the last value
@@ -544,20 +579,22 @@ final_state : {
544
579
  // remaining values to be added to the chunk
545
580
  auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
546
581
  if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) {
547
- AddValue(str_value, column, escape_positions, has_quotes);
582
+ AddValue(str_value, column, escape_positions, has_quotes, buffer->local_batch_index);
548
583
  if (try_add_line) {
549
584
  bool success = column == return_types.size();
550
585
  if (success) {
551
- AddRow(insert_chunk, column, error_message);
552
- success = Flush(insert_chunk);
586
+ auto cur_linenr = linenr;
587
+ AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
588
+ success = Flush(insert_chunk, buffer->local_batch_index);
589
+ linenr = cur_linenr;
553
590
  }
554
591
  parse_chunk.Reset();
555
592
  reached_remainder_state = false;
556
593
  return success;
557
594
  } else {
558
- VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
595
+ VerifyLineLength(position_buffer - line_start);
559
596
  line_start = position_buffer;
560
- AddRow(insert_chunk, column, error_message);
597
+ AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
561
598
  verification_positions.end_of_last_line = position_buffer;
562
599
  }
563
600
  }
@@ -565,7 +602,8 @@ final_state : {
565
602
  }
566
603
  // flush the parsed chunk and finalize parsing
567
604
  if (mode == ParserMode::PARSING) {
568
- Flush(insert_chunk);
605
+ Flush(insert_chunk, buffer->local_batch_index);
606
+ buffer->lines_read += insert_chunk.size();
569
607
  }
570
608
  if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
571
609
  error_message = "Line does not fit in one buffer. Increase the buffer size.";
@@ -597,6 +635,16 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
597
635
  }
598
636
  }
599
637
 
638
+ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx) {
639
+
640
+ while (true) {
641
+ if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
642
+ auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart();
643
+ return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false);
644
+ }
645
+ }
646
+ }
647
+
600
648
  bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
601
649
  DataChunk dummy_chunk;
602
650
  string error_message;