duckdb 0.6.2-dev1687.0 → 0.6.2-dev1736.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.6.2-dev1687.0",
5
+ "version": "0.6.2-dev1736.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -551,4 +551,32 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
551
551
  parse_chunk.Reset();
552
552
  return true;
553
553
  }
554
+
555
+ void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
556
+ if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) ||
557
+ options.new_line == NewLineIdentifier::NOT_SET) {
558
+ if (options.new_line == NewLineIdentifier::MIX) {
559
+ return;
560
+ }
561
+ NewLineIdentifier this_line_identifier;
562
+ if (carry) {
563
+ if (carry_followed_by_nl) {
564
+ this_line_identifier = NewLineIdentifier::CARRY_ON;
565
+ } else {
566
+ this_line_identifier = NewLineIdentifier::SINGLE;
567
+ }
568
+ } else {
569
+ this_line_identifier = NewLineIdentifier::SINGLE;
570
+ }
571
+ if (options.new_line == NewLineIdentifier::NOT_SET) {
572
+ options.new_line = this_line_identifier;
573
+ return;
574
+ }
575
+ if (options.new_line != this_line_identifier) {
576
+ options.new_line = NewLineIdentifier::MIX;
577
+ return;
578
+ }
579
+ options.new_line = this_line_identifier;
580
+ }
581
+ }
554
582
  } // namespace duckdb
@@ -482,6 +482,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
482
482
  } else if ((more_values || single_column_before) && rows_consistent) {
483
483
  sniff_info.skip_rows = start_row;
484
484
  sniff_info.num_cols = num_cols;
485
+ sniff_info.new_line = options.new_line;
485
486
  best_consistent_rows = consistent_rows;
486
487
  best_num_cols = num_cols;
487
488
 
@@ -497,6 +498,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
497
498
  if (!same_quote_is_candidate) {
498
499
  sniff_info.skip_rows = start_row;
499
500
  sniff_info.num_cols = num_cols;
501
+ sniff_info.new_line = options.new_line;
500
502
  info_candidates.push_back(sniff_info);
501
503
  }
502
504
  }
@@ -1264,6 +1266,7 @@ add_row : {
1264
1266
  // \r newline, go to special state that parses an optional \n afterwards
1265
1267
  goto carriage_return;
1266
1268
  } else {
1269
+ SetNewLineDelimiter();
1267
1270
  // \n newline, move to value start
1268
1271
  if (finished_chunk) {
1269
1272
  return true;
@@ -1342,6 +1345,7 @@ carriage_return:
1342
1345
  /* state: carriage_return */
1343
1346
  // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
1344
1347
  if (buffer[position] == '\n') {
1348
+ SetNewLineDelimiter(true, true);
1345
1349
  // newline after carriage return: skip
1346
1350
  // increase position by 1 and move start to the new position
1347
1351
  start = ++position;
@@ -1349,6 +1353,8 @@ carriage_return:
1349
1353
  // file ends right after delimiter, go to final state
1350
1354
  goto final_state;
1351
1355
  }
1356
+ } else {
1357
+ SetNewLineDelimiter(true, false);
1352
1358
  }
1353
1359
  if (finished_chunk) {
1354
1360
  return true;
@@ -3,12 +3,15 @@
3
3
 
4
4
  namespace duckdb {
5
5
 
6
- CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle)
6
+ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
7
+ idx_t &global_csv_current_position)
7
8
  : context(context), first_buffer(true) {
8
9
  this->handle = AllocateBuffer(buffer_size_p);
9
10
 
10
11
  auto buffer = Ptr();
11
12
  actual_size = file_handle.Read(buffer, buffer_size_p);
13
+ global_csv_start = global_csv_current_position;
14
+ global_csv_current_position += actual_size;
12
15
  if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
13
16
  start_position += 3;
14
17
  }
@@ -16,21 +19,25 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
16
19
  }
17
20
 
18
21
  CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
19
- bool final_buffer)
20
- : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
22
+ bool final_buffer, idx_t global_csv_current_position)
23
+ : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer),
24
+ global_csv_start(global_csv_current_position) {
21
25
  }
22
26
 
23
- unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
27
+ unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size,
28
+ idx_t &global_csv_current_position) {
24
29
  if (file_handle.FinishedReading()) {
25
30
  // this was the last buffer
26
31
  return nullptr;
27
32
  }
28
33
 
29
- auto next_buffer = AllocateBuffer(set_buffer_size);
30
- idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), set_buffer_size);
34
+ auto next_buffer = AllocateBuffer(buffer_size);
35
+ idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size);
31
36
 
32
- return make_unique<CSVBuffer>(context, std::move(next_buffer), set_buffer_size, next_buffer_actual_size,
33
- file_handle.FinishedReading());
37
+ auto next_csv_buffer = make_unique<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
38
+ file_handle.FinishedReading(), global_csv_current_position);
39
+ global_csv_current_position += next_buffer_actual_size;
40
+ return next_csv_buffer;
34
41
  }
35
42
 
36
43
  BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
@@ -54,4 +61,8 @@ bool CSVBuffer::IsCSVFileFirstBuffer() {
54
61
  return first_buffer;
55
62
  }
56
63
 
64
+ idx_t CSVBuffer::GetCSVGlobalStart() {
65
+ return global_csv_start;
66
+ }
67
+
57
68
  } // namespace duckdb
@@ -117,6 +117,17 @@ void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
117
117
  }
118
118
  }
119
119
 
120
+ void BufferedCSVReaderOptions::SetNewline(const string &input) {
121
+ if (input == "\\n" || input == "\\r") {
122
+ new_line = NewLineIdentifier::SINGLE;
123
+ } else if (input == "\\r\\n") {
124
+ new_line = NewLineIdentifier::CARRY_ON;
125
+ } else {
126
+ throw InvalidInputException("This is not accepted as a newline: " + input);
127
+ }
128
+ has_newline = true;
129
+ }
130
+
120
131
  void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
121
132
  string error;
122
133
  if (read_format) {
@@ -233,6 +244,8 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
233
244
  } else if (loption == "quote") {
234
245
  quote = ParseString(value, loption);
235
246
  has_quote = true;
247
+ } else if (loption == "new_line") {
248
+ SetNewline(ParseString(value, loption));
236
249
  } else if (loption == "escape") {
237
250
  escape = ParseString(value, loption);
238
251
  has_escape = true;
@@ -43,15 +43,31 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
43
43
  InitInsertChunkIdx(return_types.size());
44
44
  }
45
45
 
46
+ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
47
+ // Set the delimiter if not set yet.
48
+ SetNewLineDelimiter(carry, carry_followed_by_nl);
49
+ D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON);
50
+ if (options.new_line == NewLineIdentifier::SINGLE) {
51
+ return (!carry) || (carry && !carry_followed_by_nl);
52
+ }
53
+ return (carry && carry_followed_by_nl) || (!carry && first_char);
54
+ }
55
+
46
56
  bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
47
57
  if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
48
58
  start_buffer == buffer->buffer->GetStart()) {
59
+ verification_positions.beginning_of_first_line = position_buffer;
60
+ verification_positions.end_of_last_line = position_buffer;
49
61
  // First buffer doesn't need any setting
50
62
  // Unless we have a header
51
63
  if (options.header && options.auto_detect) {
52
64
  for (; position_buffer < end_buffer; position_buffer++) {
53
65
  if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
66
+ bool carrier_return = (*buffer)[position_buffer] == '\r';
54
67
  position_buffer++;
68
+ if (carrier_return && position_buffer < end_buffer && (*buffer)[position_buffer] == '\n') {
69
+ position_buffer++;
70
+ }
55
71
  return true;
56
72
  }
57
73
  }
@@ -70,26 +86,50 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
70
86
  first_line_chunk.Initialize(allocator, insert_chunk.GetTypes());
71
87
  for (; position_buffer < end_buffer; position_buffer++) {
72
88
  if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
89
+ bool carriage_return = (*buffer)[position_buffer] == '\r';
90
+ bool carriage_return_followed = false;
73
91
  position_buffer++;
74
- break;
92
+ if (position_buffer < end_buffer) {
93
+ if (carriage_return && (*buffer)[position_buffer] == '\n') {
94
+ carriage_return_followed = true;
95
+ position_buffer++;
96
+ }
97
+ }
98
+ if (NewLineDelimiter(carriage_return, carriage_return_followed, position_buffer - 1 == start_buffer)) {
99
+ break;
100
+ }
75
101
  }
76
102
  }
77
- D_ASSERT(position_buffer <= end_buffer);
78
- if (position_buffer == end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
103
+
104
+ if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
105
+ break;
106
+ }
107
+
108
+ if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
109
+ (*buffer)[position_buffer - 1] == '\n') {
79
110
  break;
80
111
  }
81
112
  idx_t position_set = position_buffer;
82
113
  start_buffer = position_buffer;
83
114
  // We check if we can add this line
84
115
  successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
85
- start_buffer = position_set;
116
+
86
117
  end_buffer = end_buffer_real;
87
- position_buffer = position_set;
88
- if (end_buffer == position_buffer) {
118
+ start_buffer = position_set;
119
+ if (position_buffer >= end_buffer) {
120
+ if (successfully_read_first_line) {
121
+ position_buffer = position_set;
122
+ }
89
123
  break;
90
124
  }
125
+ position_buffer = position_set;
126
+ }
127
+ if (verification_positions.beginning_of_first_line == 0) {
128
+ verification_positions.beginning_of_first_line = position_buffer;
91
129
  }
92
130
 
131
+ verification_positions.end_of_last_line = position_buffer;
132
+ finished = false;
93
133
  return successfully_read_first_line;
94
134
  }
95
135
 
@@ -110,9 +150,18 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
110
150
 
111
151
  linenr_estimated = true;
112
152
  reached_remainder_state = false;
153
+ verification_positions.beginning_of_first_line = 0;
154
+ verification_positions.end_of_last_line = 0;
155
+ finished = false;
113
156
  D_ASSERT(end_buffer <= buffer_size);
114
157
  }
115
158
 
159
+ VerificationPositions ParallelCSVReader::GetVerificationPositions() {
160
+ verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart();
161
+ verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart();
162
+ return verification_positions;
163
+ }
164
+
116
165
  // If BufferRemainder returns false, it means we are done scanning this buffer and should go to the end_state
117
166
  bool ParallelCSVReader::BufferRemainder() {
118
167
  if (position_buffer >= end_buffer && !reached_remainder_state) {
@@ -131,7 +180,6 @@ bool ParallelCSVReader::BufferRemainder() {
131
180
  }
132
181
 
133
182
  bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
134
-
135
183
  // used for parsing algorithm
136
184
  D_ASSERT(end_buffer <= buffer_size);
137
185
  bool finished_chunk = false;
@@ -139,10 +187,15 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
139
187
  idx_t offset = 0;
140
188
  bool has_quotes = false;
141
189
  vector<idx_t> escape_positions;
142
- if (start_buffer == buffer->buffer_start && !try_add_line) {
190
+ if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
143
191
  // First time reading this buffer piece
144
192
  if (!SetPosition(insert_chunk)) {
145
193
  // This means the buffer size does not contain a new line
194
+ if (position_buffer - start_buffer == options.buffer_size) {
195
+ error_message = "Line does not fit in one buffer. Increase the buffer size.";
196
+ return false;
197
+ }
198
+ finished = true;
146
199
  return true;
147
200
  }
148
201
  }
@@ -180,7 +233,7 @@ normal : {
180
233
  goto add_value;
181
234
  } else if (StringUtil::CharacterIsNewline(c)) {
182
235
  // newline: add row
183
- if (column > 0 || try_add_line) {
236
+ if (column > 0 || try_add_line || insert_chunk.data.size() == 1) {
184
237
  goto add_row;
185
238
  }
186
239
  }
@@ -227,7 +280,8 @@ add_row : {
227
280
  offset = 0;
228
281
  has_quotes = false;
229
282
  start_buffer = ++position_buffer;
230
- if (reached_remainder_state || finished_chunk) {
283
+ verification_positions.end_of_last_line = position_buffer;
284
+ if (reached_remainder_state) {
231
285
  goto final_state;
232
286
  }
233
287
  if (!BufferRemainder()) {
@@ -235,8 +289,37 @@ add_row : {
235
289
  }
236
290
  if (carriage_return) {
237
291
  // \r newline, go to special state that parses an optional \n afterwards
238
- goto carriage_return;
292
+ // optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
293
+ if ((*buffer)[position_buffer] == '\n') {
294
+ if (options.new_line == NewLineIdentifier::SINGLE) {
295
+ error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
296
+ return false;
297
+ }
298
+ // newline after carriage return: skip
299
+ // increase position by 1 and move start to the new position
300
+ start_buffer = ++position_buffer;
301
+ verification_positions.end_of_last_line = position_buffer;
302
+ if (reached_remainder_state) {
303
+ goto final_state;
304
+ }
305
+ } else {
306
+ if (options.new_line == NewLineIdentifier::CARRY_ON) {
307
+ error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
308
+ return false;
309
+ }
310
+ }
311
+ if (!BufferRemainder()) {
312
+ goto final_state;
313
+ }
314
+ if (reached_remainder_state || finished_chunk) {
315
+ goto final_state;
316
+ }
317
+ goto value_start;
239
318
  } else {
319
+ if (options.new_line == NewLineIdentifier::CARRY_ON) {
320
+ error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
321
+ return false;
322
+ }
240
323
  // \n newline, move to value start
241
324
  if (finished_chunk) {
242
325
  goto final_state;
@@ -332,33 +415,31 @@ handle_escape : {
332
415
  // escape was followed by quote or escape, go back to quoted state
333
416
  goto in_quotes;
334
417
  }
335
-
336
- carriage_return : {
337
- /* state: carriage_return */
338
- // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
339
- if ((*buffer)[position_buffer] == '\n') {
340
- // newline after carriage return: skip
341
- // increase position by 1 and move start to the new position
342
- start_buffer = ++position_buffer;
343
- if (position_buffer >= buffer_size) {
344
- // file ends right after delimiter, go to final state
345
- goto final_state;
346
- }
347
- }
348
- goto value_start;
349
- }
350
418
  final_state : {
351
419
  /* state: final_stage reached after we finished reading the end_buffer of the csv buffer */
352
420
  // reset end buffer
353
421
  end_buffer = buffer->buffer_end;
422
+ if (position_buffer == end_buffer) {
423
+ reached_remainder_state = false;
424
+ }
354
425
  if (finished_chunk) {
426
+ if (position_buffer >= end_buffer) {
427
+ if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
428
+ position_buffer < buffer_size) {
429
+ // last position is a new line, we still have to go through one more line of this buffer
430
+ finished = false;
431
+ } else {
432
+ finished = true;
433
+ }
434
+ }
355
435
  return true;
356
436
  }
357
437
  // If this is the last buffer, we have to read the last value
358
438
  if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer->IsCSVFileLastBuffer())) {
359
- if (column > 0 || try_add_line) {
439
+ if (column > 0 || try_add_line || (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
360
440
  // remaining values to be added to the chunk
361
- AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
441
+ auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
442
+ AddValue(str_value, column, escape_positions, has_quotes);
362
443
  if (try_add_line) {
363
444
  bool success = column == return_types.size();
364
445
  if (success) {
@@ -370,6 +451,7 @@ final_state : {
370
451
  return success;
371
452
  } else {
372
453
  AddRow(insert_chunk, column, error_message);
454
+ verification_positions.end_of_last_line = position_buffer;
373
455
  }
374
456
  }
375
457
  }
@@ -377,6 +459,22 @@ final_state : {
377
459
  if (mode == ParserMode::PARSING) {
378
460
  Flush(insert_chunk);
379
461
  }
462
+ if (position_buffer != verification_positions.end_of_last_line &&
463
+ !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
464
+ error_message = "Line does not fit in one buffer. Increase the buffer size.";
465
+ return false;
466
+ }
467
+ if (position_buffer >= end_buffer) {
468
+ if (position_buffer >= end_buffer) {
469
+ if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
470
+ position_buffer < buffer_size) {
471
+ // last position is a new line, we still have to go through one more line of this buffer
472
+ finished = false;
473
+ } else {
474
+ finished = true;
475
+ }
476
+ }
477
+ }
380
478
  return true;
381
479
  };
382
480
  }
@@ -38,7 +38,10 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
38
38
  BaseCSVData::Finalize();
39
39
  auto &config = DBConfig::GetConfig(context);
40
40
  single_threaded = !config.options.experimental_parallel_csv_reader;
41
- if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
41
+ bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
42
+ options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
43
+ bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
44
+ if (null_or_empty || complex_options || options.new_line == NewLineIdentifier::MIX) {
42
45
  // not supported for parallel CSV reading
43
46
  single_threaded = true;
44
47
  }
@@ -239,8 +242,9 @@ struct ParallelCSVGlobalState : public GlobalTableFunctionState {
239
242
  public:
240
243
  ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
241
244
  vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
242
- idx_t rows_to_skip)
243
- : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
245
+ idx_t rows_to_skip, bool force_parallelism_p)
246
+ : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
247
+ force_parallelism(force_parallelism_p) {
244
248
  for (idx_t i = 0; i < rows_to_skip; i++) {
245
249
  file_handle->ReadLine();
246
250
  }
@@ -253,23 +257,34 @@ public:
253
257
  } else {
254
258
  bytes_per_local_state = file_size / MaxThreads();
255
259
  }
256
- current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
257
- next_buffer = current_buffer->Next(*file_handle, buffer_size);
260
+ current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
261
+ next_buffer = current_buffer->Next(*file_handle, buffer_size, current_csv_position);
262
+ running_threads = MaxThreads();
258
263
  }
259
264
  ParallelCSVGlobalState() {
260
265
  }
261
266
 
267
+ ~ParallelCSVGlobalState() override {
268
+ }
269
+
262
270
  idx_t MaxThreads() const override;
263
271
  //! Returns buffer and index that caller thread should read.
264
272
  unique_ptr<CSVBufferRead> Next(ClientContext &context, ReadCSVData &bind_data);
265
- //! If we finished reading all the CSV Files
273
+ //! Verify if the CSV File was read correctly
274
+ void Verify();
275
+
276
+ void UpdateVerification(VerificationPositions positions);
277
+
278
+ void IncrementThread();
279
+
280
+ void DecrementThread();
281
+
266
282
  bool Finished();
283
+
267
284
  //! How many bytes were read up to this point
268
285
  atomic<idx_t> bytes_read;
269
286
  //! Size of current file
270
287
  idx_t file_size;
271
- //! The index of the next file to read (i.e. current file + 1)
272
- idx_t file_index = 1;
273
288
 
274
289
  double GetProgress(ReadCSVData &bind_data) const {
275
290
  idx_t total_files = bind_data.files.size();
@@ -290,21 +305,20 @@ public:
290
305
  private:
291
306
  //! File Handle for current file
292
307
  unique_ptr<CSVFileHandle> file_handle;
293
-
294
308
  shared_ptr<CSVBuffer> current_buffer;
295
309
  shared_ptr<CSVBuffer> next_buffer;
296
310
 
311
+ //! The index of the next file to read (i.e. current file + 1)
312
+ idx_t file_index = 1;
313
+
297
314
  //! Mutex to lock when getting next batch of bytes (Parallel Only)
298
315
  mutex main_mutex;
299
316
  //! Byte set from for last thread
300
317
  idx_t next_byte = 0;
301
-
302
318
  //! The current estimated line number
303
319
  idx_t estimated_linenr;
304
-
305
320
  //! How many bytes we should execute per local state
306
321
  idx_t bytes_per_local_state;
307
-
308
322
  //! Size of first file
309
323
  idx_t first_file_size;
310
324
  //! Basically max number of threads in DuckDB
@@ -313,20 +327,73 @@ private:
313
327
  idx_t buffer_size;
314
328
  //! Current batch index
315
329
  idx_t batch_index = 0;
330
+ //! Forces parallelism for small CSV Files, should only be used for testing.
331
+ bool force_parallelism;
332
+ //! Current (Global) position of CSV
333
+ idx_t current_csv_position = 0;
334
+ idx_t max_tuple_end = 0;
335
+ //! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
336
+ //! positions where they started reading the first line.
337
+ vector<idx_t> tuple_end;
338
+ set<idx_t> tuple_start;
339
+ idx_t running_threads = 0;
316
340
  };
317
341
 
318
342
  idx_t ParallelCSVGlobalState::MaxThreads() const {
319
- // idx_t one_mb = 1000000;
320
- // idx_t threads_per_mb = first_file_size / one_mb + 1;
321
- // if (threads_per_mb < system_threads) {
322
- // return threads_per_mb;
323
- // }
343
+ if (force_parallelism) {
344
+ return system_threads;
345
+ }
346
+
347
+ idx_t one_mb = 1000000; // We initialize max one thread per Mb
348
+ idx_t threads_per_mb = first_file_size / one_mb + 1;
349
+ if (threads_per_mb < system_threads) {
350
+ return threads_per_mb;
351
+ }
352
+
324
353
  return system_threads;
325
354
  }
326
355
 
356
+ void ParallelCSVGlobalState::IncrementThread() {
357
+ lock_guard<mutex> parallel_lock(main_mutex);
358
+ running_threads++;
359
+ }
360
+
361
+ void ParallelCSVGlobalState::DecrementThread() {
362
+ lock_guard<mutex> parallel_lock(main_mutex);
363
+ D_ASSERT(running_threads > 0);
364
+ running_threads--;
365
+ }
366
+
327
367
  bool ParallelCSVGlobalState::Finished() {
328
368
  lock_guard<mutex> parallel_lock(main_mutex);
329
- return !current_buffer;
369
+ return running_threads == 0;
370
+ }
371
+
372
+ void ParallelCSVGlobalState::Verify() {
373
+ // All threads are done, we run some magic sweet verification code
374
+ if (running_threads == 0) {
375
+ for (auto &last_pos : tuple_end) {
376
+ auto first_pos = tuple_start.find(last_pos);
377
+ if (first_pos == tuple_start.end()) {
378
+ // this might be necessary due to carriage returns outside buffer scopes.
379
+ first_pos = tuple_start.find(last_pos + 1);
380
+ }
381
+ if (first_pos == tuple_start.end() && last_pos != max_tuple_end) {
382
+ string error = "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
383
+ " does not have a match\n";
384
+ error += "End Lines: \n";
385
+ for (auto &end_line : tuple_end) {
386
+ error += to_string(end_line) + "\n";
387
+ }
388
+ error += "Start Lines: \n";
389
+ for (auto &start_line : tuple_start) {
390
+ error += to_string(start_line) + "\n";
391
+ }
392
+ throw InvalidInputException(
393
+ "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
394
+ }
395
+ }
396
+ }
330
397
  }
331
398
 
332
399
  unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, ReadCSVData &bind_data) {
@@ -348,7 +415,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
348
415
  current_buffer = next_buffer;
349
416
  if (next_buffer) {
350
417
  // Next buffer gets the next-next buffer
351
- next_buffer = next_buffer->Next(*file_handle, buffer_size);
418
+ next_buffer = next_buffer->Next(*file_handle, buffer_size, current_csv_position);
352
419
  }
353
420
  }
354
421
  if (current_buffer && !next_buffer) {
@@ -356,11 +423,26 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
356
423
  if (file_index < bind_data.files.size()) {
357
424
  bind_data.options.file_path = bind_data.files[file_index++];
358
425
  file_handle = ReadCSV::OpenCSV(bind_data.options, context);
359
- next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
426
+ current_csv_position = 0;
427
+ // FIXME: This will probably require some changes on the verification code
428
+ next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
360
429
  }
361
430
  }
362
431
  return result;
363
432
  }
433
+ void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
434
+ lock_guard<mutex> parallel_lock(main_mutex);
435
+ if (positions.beginning_of_first_line < positions.end_of_last_line) {
436
+ if (positions.end_of_last_line > max_tuple_end) {
437
+ max_tuple_end = positions.end_of_last_line;
438
+ }
439
+ tuple_start.insert(positions.beginning_of_first_line);
440
+ tuple_end.push_back(positions.end_of_last_line);
441
+ }
442
+ }
443
+
444
//! Empty placeholder: the body performs no work.
void SetNewLine() {
}
364
446
 
365
447
  static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
366
448
  TableFunctionInitInput &input) {
@@ -373,10 +455,11 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
373
455
 
374
456
  bind_data.options.file_path = bind_data.files[0];
375
457
  file_handle = ReadCSV::OpenCSV(bind_data.options, context);
376
- idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
458
+ idx_t rows_to_skip =
459
+ bind_data.options.skip_rows + (bind_data.options.has_header && bind_data.options.header ? 1 : 0);
377
460
  return make_unique<ParallelCSVGlobalState>(context, std::move(file_handle), bind_data.files,
378
461
  context.db->NumberOfThreads(), bind_data.options.buffer_size,
379
- rows_to_skip);
462
+ rows_to_skip, ClientConfig::GetConfig(context).verify_parallelism);
380
463
  }
381
464
 
382
465
  //===--------------------------------------------------------------------===//
@@ -390,6 +473,7 @@ public:
390
473
  //! The CSV reader
391
474
  unique_ptr<ParallelCSVReader> csv_reader;
392
475
  CSVBufferRead previous_buffer;
476
+ bool done = false;
393
477
  };
394
478
 
395
479
  unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
@@ -401,9 +485,10 @@ unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &c
401
485
  if (next_local_buffer) {
402
486
  csv_reader = make_unique<ParallelCSVReader>(context.client, csv_data.options, std::move(next_local_buffer),
403
487
  csv_data.sql_types);
488
+ } else {
489
+ global_state.DecrementThread();
404
490
  }
405
- auto new_local_state = make_unique<ParallelCSVLocalState>(std::move(csv_reader));
406
- return std::move(new_local_state);
491
+ return make_unique<ParallelCSVLocalState>(std::move(csv_reader));
407
492
  }
408
493
 
409
494
  static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
@@ -417,13 +502,14 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
417
502
  }
418
503
 
419
504
  do {
420
- if (output.size() != 0 || (csv_global_state.Finished() && csv_local_state.csv_reader->position_buffer >=
421
- csv_local_state.csv_reader->end_buffer)) {
505
+ if (output.size() != 0) {
422
506
  break;
423
507
  }
424
- if (csv_local_state.csv_reader->position_buffer >= csv_local_state.csv_reader->end_buffer) {
508
+ if (csv_local_state.csv_reader->finished) {
509
+ csv_global_state.UpdateVerification(csv_local_state.csv_reader->GetVerificationPositions());
425
510
  auto next_chunk = csv_global_state.Next(context, bind_data);
426
511
  if (!next_chunk) {
512
+ csv_global_state.DecrementThread();
427
513
  break;
428
514
  }
429
515
  csv_local_state.csv_reader->SetBufferRead(std::move(next_chunk));
@@ -431,7 +517,9 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
431
517
  csv_local_state.csv_reader->ParseCSV(output);
432
518
 
433
519
  } while (true);
434
-
520
+ if (csv_global_state.Finished()) {
521
+ csv_global_state.Verify();
522
+ }
435
523
  if (bind_data.options.union_by_name) {
436
524
  throw InternalException("FIXME: union by name");
437
525
  }
@@ -678,6 +766,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
678
766
  table_function.named_parameters["sep"] = LogicalType::VARCHAR;
679
767
  table_function.named_parameters["delim"] = LogicalType::VARCHAR;
680
768
  table_function.named_parameters["quote"] = LogicalType::VARCHAR;
769
+ table_function.named_parameters["new_line"] = LogicalType::VARCHAR;
681
770
  table_function.named_parameters["escape"] = LogicalType::VARCHAR;
682
771
  table_function.named_parameters["nullstr"] = LogicalType::VARCHAR;
683
772
  table_function.named_parameters["columns"] = LogicalType::ANY;
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.6.2-dev1687"
2
+ #define DUCKDB_VERSION "0.6.2-dev1736"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "355d6ee967"
5
+ #define DUCKDB_SOURCE_ID "424848838c"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -14,6 +14,10 @@
14
14
 
15
15
  #include <assert.h>
16
16
  #define D_ASSERT assert
17
+ namespace duckdb {
18
+ DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
19
+ }
20
+
17
21
  #else
18
22
  namespace duckdb {
19
23
  DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
@@ -97,6 +97,9 @@ protected:
97
97
  void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
98
98
  static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
99
99
 
100
+ //! Sets the newline delimiter
101
+ void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);
102
+
100
103
  protected:
101
104
  //! Whether or not the current row's columns have overflown return_types.size()
102
105
  bool error_column_overflow = false;
@@ -75,7 +75,6 @@ public:
75
75
  public:
76
76
  //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
77
77
  void ParseCSV(DataChunk &insert_chunk);
78
-
79
78
  static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
80
79
 
81
80
  private:
@@ -20,13 +20,15 @@ public:
20
20
  static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
21
21
 
22
22
  //! Constructor for Initial Buffer
23
- CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle);
23
+ CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
24
+ idx_t &global_csv_current_position);
24
25
 
25
26
  //! Constructor for `Next()` Buffers
26
- CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
27
+ CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
28
+ idx_t global_csv_current_position);
27
29
 
28
30
  //! Creates a new buffer with the next part of the CSV File
29
- unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
31
+ unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
30
32
 
31
33
  //! Gets the buffer actual size
32
34
  idx_t GetBufferSize();
@@ -40,6 +42,8 @@ public:
40
42
  //! If this buffer is the first buffer of the CSV File
41
43
  bool IsCSVFileFirstBuffer();
42
44
 
45
+ idx_t GetCSVGlobalStart();
46
+
43
47
  BufferHandle AllocateBuffer(idx_t buffer_size);
44
48
 
45
49
  char *Ptr() {
@@ -59,5 +63,7 @@ private:
59
63
  bool last_buffer = false;
60
64
  //! If this is the first buffer of the CSV File
61
65
  bool first_buffer = false;
66
+ //! Global position from the CSV File where this buffer starts
67
+ idx_t global_csv_start = 0;
62
68
  };
63
69
  } // namespace duckdb
@@ -17,6 +17,13 @@
17
17
 
18
18
  namespace duckdb {
19
19
 
20
+ enum NewLineIdentifier {
21
+ SINGLE = 1, // Either \r or \n
22
+ CARRY_ON = 2, // \r\n
23
+ MIX = 3, // Hippie-Land, can't run it multithreaded
24
+ NOT_SET = 4
25
+ };
26
+
20
27
  struct BufferedCSVReaderOptions {
21
28
  //===--------------------------------------------------------------------===//
22
29
  // CommonCSVOptions
@@ -26,7 +33,11 @@ struct BufferedCSVReaderOptions {
26
33
  bool has_delimiter = false;
27
34
  //! Delimiter to separate columns within each line
28
35
  string delimiter = ",";
29
- //! Whether or not a quote sign was defined by the user
36
+ //! Whether or not a new_line was defined by the user
37
+ bool has_newline = false;
38
+ //! New Line separator
39
+ NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
40
+
30
41
  bool has_quote = false;
31
42
  //! Quote used for columns that contain reserved characters, e.g., delimiter
32
43
  string quote = "\"";
@@ -112,6 +123,8 @@ struct BufferedCSVReaderOptions {
112
123
  void Deserialize(FieldReader &reader);
113
124
 
114
125
  void SetDelimiter(const string &delimiter);
126
+
127
+ void SetNewline(const string &input);
115
128
  //! Set an option that is supported by both reading and writing functions, called by
116
129
  //! the SetReadOption and SetWriteOption methods
117
130
  bool SetBaseOption(const string &loption, const Value &value);
@@ -91,6 +91,10 @@ struct CSVBufferRead {
91
91
  idx_t estimated_linenr;
92
92
  };
93
93
 
94
+ struct VerificationPositions {
95
+ idx_t beginning_of_first_line = 0;
96
+ idx_t end_of_last_line = 0;
97
+ };
94
98
  //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
95
99
  class ParallelCSVReader : public BaseCSVReader {
96
100
  public:
@@ -111,7 +115,10 @@ public:
111
115
  //! If this flag is set, it means we are about to try to read our last row.
112
116
  bool reached_remainder_state = false;
113
117
 
118
+ bool finished = false;
119
+
114
120
  unique_ptr<CSVBufferRead> buffer;
121
+ VerificationPositions GetVerificationPositions();
115
122
 
116
123
  public:
117
124
  void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
@@ -134,8 +141,13 @@ private:
134
141
  //! when changing the buffer end the first time.
135
142
  //! It returns FALSE if the parser should jump to the final state of parsing or not
136
143
  bool BufferRemainder();
144
+
145
+ bool NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char);
146
+
137
147
  //! Parses a CSV file with a one-byte delimiter, escape and quote character
138
148
  bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
149
+ //! Position of the first read line and last read line for verification purposes
150
+ VerificationPositions verification_positions;
139
151
  };
140
152
 
141
153
  } // namespace duckdb