duckdb 0.6.2-dev1687.0 → 0.6.2-dev1736.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +28 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +6 -0
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +19 -8
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +13 -0
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +126 -28
- package/src/duckdb/src/function/table/read_csv.cpp +117 -28
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/assert.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +3 -0
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +9 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +14 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +12 -0
package/package.json
CHANGED
|
@@ -551,4 +551,32 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
|
|
|
551
551
|
parse_chunk.Reset();
|
|
552
552
|
return true;
|
|
553
553
|
}
|
|
554
|
+
|
|
555
|
+
void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
|
|
556
|
+
if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) ||
|
|
557
|
+
options.new_line == NewLineIdentifier::NOT_SET) {
|
|
558
|
+
if (options.new_line == NewLineIdentifier::MIX) {
|
|
559
|
+
return;
|
|
560
|
+
}
|
|
561
|
+
NewLineIdentifier this_line_identifier;
|
|
562
|
+
if (carry) {
|
|
563
|
+
if (carry_followed_by_nl) {
|
|
564
|
+
this_line_identifier = NewLineIdentifier::CARRY_ON;
|
|
565
|
+
} else {
|
|
566
|
+
this_line_identifier = NewLineIdentifier::SINGLE;
|
|
567
|
+
}
|
|
568
|
+
} else {
|
|
569
|
+
this_line_identifier = NewLineIdentifier::SINGLE;
|
|
570
|
+
}
|
|
571
|
+
if (options.new_line == NewLineIdentifier::NOT_SET) {
|
|
572
|
+
options.new_line = this_line_identifier;
|
|
573
|
+
return;
|
|
574
|
+
}
|
|
575
|
+
if (options.new_line != this_line_identifier) {
|
|
576
|
+
options.new_line = NewLineIdentifier::MIX;
|
|
577
|
+
return;
|
|
578
|
+
}
|
|
579
|
+
options.new_line = this_line_identifier;
|
|
580
|
+
}
|
|
581
|
+
}
|
|
554
582
|
} // namespace duckdb
|
|
@@ -482,6 +482,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
|
|
|
482
482
|
} else if ((more_values || single_column_before) && rows_consistent) {
|
|
483
483
|
sniff_info.skip_rows = start_row;
|
|
484
484
|
sniff_info.num_cols = num_cols;
|
|
485
|
+
sniff_info.new_line = options.new_line;
|
|
485
486
|
best_consistent_rows = consistent_rows;
|
|
486
487
|
best_num_cols = num_cols;
|
|
487
488
|
|
|
@@ -497,6 +498,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
|
|
|
497
498
|
if (!same_quote_is_candidate) {
|
|
498
499
|
sniff_info.skip_rows = start_row;
|
|
499
500
|
sniff_info.num_cols = num_cols;
|
|
501
|
+
sniff_info.new_line = options.new_line;
|
|
500
502
|
info_candidates.push_back(sniff_info);
|
|
501
503
|
}
|
|
502
504
|
}
|
|
@@ -1264,6 +1266,7 @@ add_row : {
|
|
|
1264
1266
|
// \r newline, go to special state that parses an optional \n afterwards
|
|
1265
1267
|
goto carriage_return;
|
|
1266
1268
|
} else {
|
|
1269
|
+
SetNewLineDelimiter();
|
|
1267
1270
|
// \n newline, move to value start
|
|
1268
1271
|
if (finished_chunk) {
|
|
1269
1272
|
return true;
|
|
@@ -1342,6 +1345,7 @@ carriage_return:
|
|
|
1342
1345
|
/* state: carriage_return */
|
|
1343
1346
|
// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
|
1344
1347
|
if (buffer[position] == '\n') {
|
|
1348
|
+
SetNewLineDelimiter(true, true);
|
|
1345
1349
|
// newline after carriage return: skip
|
|
1346
1350
|
// increase position by 1 and move start to the new position
|
|
1347
1351
|
start = ++position;
|
|
@@ -1349,6 +1353,8 @@ carriage_return:
|
|
|
1349
1353
|
// file ends right after delimiter, go to final state
|
|
1350
1354
|
goto final_state;
|
|
1351
1355
|
}
|
|
1356
|
+
} else {
|
|
1357
|
+
SetNewLineDelimiter(true, false);
|
|
1352
1358
|
}
|
|
1353
1359
|
if (finished_chunk) {
|
|
1354
1360
|
return true;
|
|
@@ -3,12 +3,15 @@
|
|
|
3
3
|
|
|
4
4
|
namespace duckdb {
|
|
5
5
|
|
|
6
|
-
CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle
|
|
6
|
+
CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
|
|
7
|
+
idx_t &global_csv_current_position)
|
|
7
8
|
: context(context), first_buffer(true) {
|
|
8
9
|
this->handle = AllocateBuffer(buffer_size_p);
|
|
9
10
|
|
|
10
11
|
auto buffer = Ptr();
|
|
11
12
|
actual_size = file_handle.Read(buffer, buffer_size_p);
|
|
13
|
+
global_csv_start = global_csv_current_position;
|
|
14
|
+
global_csv_current_position += actual_size;
|
|
12
15
|
if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
|
|
13
16
|
start_position += 3;
|
|
14
17
|
}
|
|
@@ -16,21 +19,25 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
|
|
|
16
19
|
}
|
|
17
20
|
|
|
18
21
|
CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
|
|
19
|
-
bool final_buffer)
|
|
20
|
-
: context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer)
|
|
22
|
+
bool final_buffer, idx_t global_csv_current_position)
|
|
23
|
+
: context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer),
|
|
24
|
+
global_csv_start(global_csv_current_position) {
|
|
21
25
|
}
|
|
22
26
|
|
|
23
|
-
unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t
|
|
27
|
+
unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size,
|
|
28
|
+
idx_t &global_csv_current_position) {
|
|
24
29
|
if (file_handle.FinishedReading()) {
|
|
25
30
|
// this was the last buffer
|
|
26
31
|
return nullptr;
|
|
27
32
|
}
|
|
28
33
|
|
|
29
|
-
auto next_buffer = AllocateBuffer(
|
|
30
|
-
idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(),
|
|
34
|
+
auto next_buffer = AllocateBuffer(buffer_size);
|
|
35
|
+
idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size);
|
|
31
36
|
|
|
32
|
-
|
|
33
|
-
|
|
37
|
+
auto next_csv_buffer = make_unique<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
|
|
38
|
+
file_handle.FinishedReading(), global_csv_current_position);
|
|
39
|
+
global_csv_current_position += next_buffer_actual_size;
|
|
40
|
+
return next_csv_buffer;
|
|
34
41
|
}
|
|
35
42
|
|
|
36
43
|
BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
|
|
@@ -54,4 +61,8 @@ bool CSVBuffer::IsCSVFileFirstBuffer() {
|
|
|
54
61
|
return first_buffer;
|
|
55
62
|
}
|
|
56
63
|
|
|
64
|
+
idx_t CSVBuffer::GetCSVGlobalStart() {
|
|
65
|
+
return global_csv_start;
|
|
66
|
+
}
|
|
67
|
+
|
|
57
68
|
} // namespace duckdb
|
|
@@ -117,6 +117,17 @@ void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
|
|
|
117
117
|
}
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
+
void BufferedCSVReaderOptions::SetNewline(const string &input) {
|
|
121
|
+
if (input == "\\n" || input == "\\r") {
|
|
122
|
+
new_line = NewLineIdentifier::SINGLE;
|
|
123
|
+
} else if (input == "\\r\\n") {
|
|
124
|
+
new_line = NewLineIdentifier::CARRY_ON;
|
|
125
|
+
} else {
|
|
126
|
+
throw InvalidInputException("This is not accepted as a newline: " + input);
|
|
127
|
+
}
|
|
128
|
+
has_newline = true;
|
|
129
|
+
}
|
|
130
|
+
|
|
120
131
|
void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
|
|
121
132
|
string error;
|
|
122
133
|
if (read_format) {
|
|
@@ -233,6 +244,8 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
|
|
|
233
244
|
} else if (loption == "quote") {
|
|
234
245
|
quote = ParseString(value, loption);
|
|
235
246
|
has_quote = true;
|
|
247
|
+
} else if (loption == "new_line") {
|
|
248
|
+
SetNewline(ParseString(value, loption));
|
|
236
249
|
} else if (loption == "escape") {
|
|
237
250
|
escape = ParseString(value, loption);
|
|
238
251
|
has_escape = true;
|
|
@@ -43,15 +43,31 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
|
43
43
|
InitInsertChunkIdx(return_types.size());
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
+
bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
|
|
47
|
+
// Set the delimiter if not set yet.
|
|
48
|
+
SetNewLineDelimiter(carry, carry_followed_by_nl);
|
|
49
|
+
D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON);
|
|
50
|
+
if (options.new_line == NewLineIdentifier::SINGLE) {
|
|
51
|
+
return (!carry) || (carry && !carry_followed_by_nl);
|
|
52
|
+
}
|
|
53
|
+
return (carry && carry_followed_by_nl) || (!carry && first_char);
|
|
54
|
+
}
|
|
55
|
+
|
|
46
56
|
bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
|
|
47
57
|
if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
|
|
48
58
|
start_buffer == buffer->buffer->GetStart()) {
|
|
59
|
+
verification_positions.beginning_of_first_line = position_buffer;
|
|
60
|
+
verification_positions.end_of_last_line = position_buffer;
|
|
49
61
|
// First buffer doesn't need any setting
|
|
50
62
|
// Unless we have a header
|
|
51
63
|
if (options.header && options.auto_detect) {
|
|
52
64
|
for (; position_buffer < end_buffer; position_buffer++) {
|
|
53
65
|
if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
|
66
|
+
bool carrier_return = (*buffer)[position_buffer] == '\r';
|
|
54
67
|
position_buffer++;
|
|
68
|
+
if (carrier_return && position_buffer < end_buffer && (*buffer)[position_buffer] == '\n') {
|
|
69
|
+
position_buffer++;
|
|
70
|
+
}
|
|
55
71
|
return true;
|
|
56
72
|
}
|
|
57
73
|
}
|
|
@@ -70,26 +86,50 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
|
|
|
70
86
|
first_line_chunk.Initialize(allocator, insert_chunk.GetTypes());
|
|
71
87
|
for (; position_buffer < end_buffer; position_buffer++) {
|
|
72
88
|
if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
|
89
|
+
bool carriage_return = (*buffer)[position_buffer] == '\r';
|
|
90
|
+
bool carriage_return_followed = false;
|
|
73
91
|
position_buffer++;
|
|
74
|
-
|
|
92
|
+
if (position_buffer < end_buffer) {
|
|
93
|
+
if (carriage_return && (*buffer)[position_buffer] == '\n') {
|
|
94
|
+
carriage_return_followed = true;
|
|
95
|
+
position_buffer++;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
if (NewLineDelimiter(carriage_return, carriage_return_followed, position_buffer - 1 == start_buffer)) {
|
|
99
|
+
break;
|
|
100
|
+
}
|
|
75
101
|
}
|
|
76
102
|
}
|
|
77
|
-
|
|
78
|
-
if (position_buffer
|
|
103
|
+
|
|
104
|
+
if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
|
|
105
|
+
break;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
|
|
109
|
+
(*buffer)[position_buffer - 1] == '\n') {
|
|
79
110
|
break;
|
|
80
111
|
}
|
|
81
112
|
idx_t position_set = position_buffer;
|
|
82
113
|
start_buffer = position_buffer;
|
|
83
114
|
// We check if we can add this line
|
|
84
115
|
successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
|
|
85
|
-
|
|
116
|
+
|
|
86
117
|
end_buffer = end_buffer_real;
|
|
87
|
-
|
|
88
|
-
if (
|
|
118
|
+
start_buffer = position_set;
|
|
119
|
+
if (position_buffer >= end_buffer) {
|
|
120
|
+
if (successfully_read_first_line) {
|
|
121
|
+
position_buffer = position_set;
|
|
122
|
+
}
|
|
89
123
|
break;
|
|
90
124
|
}
|
|
125
|
+
position_buffer = position_set;
|
|
126
|
+
}
|
|
127
|
+
if (verification_positions.beginning_of_first_line == 0) {
|
|
128
|
+
verification_positions.beginning_of_first_line = position_buffer;
|
|
91
129
|
}
|
|
92
130
|
|
|
131
|
+
verification_positions.end_of_last_line = position_buffer;
|
|
132
|
+
finished = false;
|
|
93
133
|
return successfully_read_first_line;
|
|
94
134
|
}
|
|
95
135
|
|
|
@@ -110,9 +150,18 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
|
|
|
110
150
|
|
|
111
151
|
linenr_estimated = true;
|
|
112
152
|
reached_remainder_state = false;
|
|
153
|
+
verification_positions.beginning_of_first_line = 0;
|
|
154
|
+
verification_positions.end_of_last_line = 0;
|
|
155
|
+
finished = false;
|
|
113
156
|
D_ASSERT(end_buffer <= buffer_size);
|
|
114
157
|
}
|
|
115
158
|
|
|
159
|
+
VerificationPositions ParallelCSVReader::GetVerificationPositions() {
|
|
160
|
+
verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart();
|
|
161
|
+
verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart();
|
|
162
|
+
return verification_positions;
|
|
163
|
+
}
|
|
164
|
+
|
|
116
165
|
// If BufferRemainder returns false, it means we are done scanning this buffer and should go to the end_state
|
|
117
166
|
bool ParallelCSVReader::BufferRemainder() {
|
|
118
167
|
if (position_buffer >= end_buffer && !reached_remainder_state) {
|
|
@@ -131,7 +180,6 @@ bool ParallelCSVReader::BufferRemainder() {
|
|
|
131
180
|
}
|
|
132
181
|
|
|
133
182
|
bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
|
|
134
|
-
|
|
135
183
|
// used for parsing algorithm
|
|
136
184
|
D_ASSERT(end_buffer <= buffer_size);
|
|
137
185
|
bool finished_chunk = false;
|
|
@@ -139,10 +187,15 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
|
|
|
139
187
|
idx_t offset = 0;
|
|
140
188
|
bool has_quotes = false;
|
|
141
189
|
vector<idx_t> escape_positions;
|
|
142
|
-
if (start_buffer == buffer->buffer_start && !try_add_line) {
|
|
190
|
+
if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
|
|
143
191
|
// First time reading this buffer piece
|
|
144
192
|
if (!SetPosition(insert_chunk)) {
|
|
145
193
|
// This means the buffer size does not contain a new line
|
|
194
|
+
if (position_buffer - start_buffer == options.buffer_size) {
|
|
195
|
+
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
|
196
|
+
return false;
|
|
197
|
+
}
|
|
198
|
+
finished = true;
|
|
146
199
|
return true;
|
|
147
200
|
}
|
|
148
201
|
}
|
|
@@ -180,7 +233,7 @@ normal : {
|
|
|
180
233
|
goto add_value;
|
|
181
234
|
} else if (StringUtil::CharacterIsNewline(c)) {
|
|
182
235
|
// newline: add row
|
|
183
|
-
if (column > 0 || try_add_line) {
|
|
236
|
+
if (column > 0 || try_add_line || insert_chunk.data.size() == 1) {
|
|
184
237
|
goto add_row;
|
|
185
238
|
}
|
|
186
239
|
}
|
|
@@ -227,7 +280,8 @@ add_row : {
|
|
|
227
280
|
offset = 0;
|
|
228
281
|
has_quotes = false;
|
|
229
282
|
start_buffer = ++position_buffer;
|
|
230
|
-
|
|
283
|
+
verification_positions.end_of_last_line = position_buffer;
|
|
284
|
+
if (reached_remainder_state) {
|
|
231
285
|
goto final_state;
|
|
232
286
|
}
|
|
233
287
|
if (!BufferRemainder()) {
|
|
@@ -235,8 +289,37 @@ add_row : {
|
|
|
235
289
|
}
|
|
236
290
|
if (carriage_return) {
|
|
237
291
|
// \r newline, go to special state that parses an optional \n afterwards
|
|
238
|
-
|
|
292
|
+
// optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
|
293
|
+
if ((*buffer)[position_buffer] == '\n') {
|
|
294
|
+
if (options.new_line == NewLineIdentifier::SINGLE) {
|
|
295
|
+
error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
|
|
296
|
+
return false;
|
|
297
|
+
}
|
|
298
|
+
// newline after carriage return: skip
|
|
299
|
+
// increase position by 1 and move start to the new position
|
|
300
|
+
start_buffer = ++position_buffer;
|
|
301
|
+
verification_positions.end_of_last_line = position_buffer;
|
|
302
|
+
if (reached_remainder_state) {
|
|
303
|
+
goto final_state;
|
|
304
|
+
}
|
|
305
|
+
} else {
|
|
306
|
+
if (options.new_line == NewLineIdentifier::CARRY_ON) {
|
|
307
|
+
error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
|
|
308
|
+
return false;
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
if (!BufferRemainder()) {
|
|
312
|
+
goto final_state;
|
|
313
|
+
}
|
|
314
|
+
if (reached_remainder_state || finished_chunk) {
|
|
315
|
+
goto final_state;
|
|
316
|
+
}
|
|
317
|
+
goto value_start;
|
|
239
318
|
} else {
|
|
319
|
+
if (options.new_line == NewLineIdentifier::CARRY_ON) {
|
|
320
|
+
error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
|
|
321
|
+
return false;
|
|
322
|
+
}
|
|
240
323
|
// \n newline, move to value start
|
|
241
324
|
if (finished_chunk) {
|
|
242
325
|
goto final_state;
|
|
@@ -332,33 +415,31 @@ handle_escape : {
|
|
|
332
415
|
// escape was followed by quote or escape, go back to quoted state
|
|
333
416
|
goto in_quotes;
|
|
334
417
|
}
|
|
335
|
-
|
|
336
|
-
carriage_return : {
|
|
337
|
-
/* state: carriage_return */
|
|
338
|
-
// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
|
339
|
-
if ((*buffer)[position_buffer] == '\n') {
|
|
340
|
-
// newline after carriage return: skip
|
|
341
|
-
// increase position by 1 and move start to the new position
|
|
342
|
-
start_buffer = ++position_buffer;
|
|
343
|
-
if (position_buffer >= buffer_size) {
|
|
344
|
-
// file ends right after delimiter, go to final state
|
|
345
|
-
goto final_state;
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
goto value_start;
|
|
349
|
-
}
|
|
350
418
|
final_state : {
|
|
351
419
|
/* state: final_stage reached after we finished reading the end_buffer of the csv buffer */
|
|
352
420
|
// reset end buffer
|
|
353
421
|
end_buffer = buffer->buffer_end;
|
|
422
|
+
if (position_buffer == end_buffer) {
|
|
423
|
+
reached_remainder_state = false;
|
|
424
|
+
}
|
|
354
425
|
if (finished_chunk) {
|
|
426
|
+
if (position_buffer >= end_buffer) {
|
|
427
|
+
if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
|
|
428
|
+
position_buffer < buffer_size) {
|
|
429
|
+
// last position is a new line, we still have to go through one more line of this buffer
|
|
430
|
+
finished = false;
|
|
431
|
+
} else {
|
|
432
|
+
finished = true;
|
|
433
|
+
}
|
|
434
|
+
}
|
|
355
435
|
return true;
|
|
356
436
|
}
|
|
357
437
|
// If this is the last buffer, we have to read the last value
|
|
358
438
|
if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer->IsCSVFileLastBuffer())) {
|
|
359
|
-
if (column > 0 || try_add_line) {
|
|
439
|
+
if (column > 0 || try_add_line || (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
|
|
360
440
|
// remaining values to be added to the chunk
|
|
361
|
-
|
|
441
|
+
auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
|
|
442
|
+
AddValue(str_value, column, escape_positions, has_quotes);
|
|
362
443
|
if (try_add_line) {
|
|
363
444
|
bool success = column == return_types.size();
|
|
364
445
|
if (success) {
|
|
@@ -370,6 +451,7 @@ final_state : {
|
|
|
370
451
|
return success;
|
|
371
452
|
} else {
|
|
372
453
|
AddRow(insert_chunk, column, error_message);
|
|
454
|
+
verification_positions.end_of_last_line = position_buffer;
|
|
373
455
|
}
|
|
374
456
|
}
|
|
375
457
|
}
|
|
@@ -377,6 +459,22 @@ final_state : {
|
|
|
377
459
|
if (mode == ParserMode::PARSING) {
|
|
378
460
|
Flush(insert_chunk);
|
|
379
461
|
}
|
|
462
|
+
if (position_buffer != verification_positions.end_of_last_line &&
|
|
463
|
+
!StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
|
|
464
|
+
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
|
465
|
+
return false;
|
|
466
|
+
}
|
|
467
|
+
if (position_buffer >= end_buffer) {
|
|
468
|
+
if (position_buffer >= end_buffer) {
|
|
469
|
+
if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
|
|
470
|
+
position_buffer < buffer_size) {
|
|
471
|
+
// last position is a new line, we still have to go through one more line of this buffer
|
|
472
|
+
finished = false;
|
|
473
|
+
} else {
|
|
474
|
+
finished = true;
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
}
|
|
380
478
|
return true;
|
|
381
479
|
};
|
|
382
480
|
}
|
|
@@ -38,7 +38,10 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
|
|
|
38
38
|
BaseCSVData::Finalize();
|
|
39
39
|
auto &config = DBConfig::GetConfig(context);
|
|
40
40
|
single_threaded = !config.options.experimental_parallel_csv_reader;
|
|
41
|
-
|
|
41
|
+
bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
|
|
42
|
+
options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
|
|
43
|
+
bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
|
|
44
|
+
if (null_or_empty || complex_options || options.new_line == NewLineIdentifier::MIX) {
|
|
42
45
|
// not supported for parallel CSV reading
|
|
43
46
|
single_threaded = true;
|
|
44
47
|
}
|
|
@@ -239,8 +242,9 @@ struct ParallelCSVGlobalState : public GlobalTableFunctionState {
|
|
|
239
242
|
public:
|
|
240
243
|
ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
|
|
241
244
|
vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
|
|
242
|
-
idx_t rows_to_skip)
|
|
243
|
-
: file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p)
|
|
245
|
+
idx_t rows_to_skip, bool force_parallelism_p)
|
|
246
|
+
: file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
|
|
247
|
+
force_parallelism(force_parallelism_p) {
|
|
244
248
|
for (idx_t i = 0; i < rows_to_skip; i++) {
|
|
245
249
|
file_handle->ReadLine();
|
|
246
250
|
}
|
|
@@ -253,23 +257,34 @@ public:
|
|
|
253
257
|
} else {
|
|
254
258
|
bytes_per_local_state = file_size / MaxThreads();
|
|
255
259
|
}
|
|
256
|
-
current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
|
|
257
|
-
next_buffer = current_buffer->Next(*file_handle, buffer_size);
|
|
260
|
+
current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
|
|
261
|
+
next_buffer = current_buffer->Next(*file_handle, buffer_size, current_csv_position);
|
|
262
|
+
running_threads = MaxThreads();
|
|
258
263
|
}
|
|
259
264
|
ParallelCSVGlobalState() {
|
|
260
265
|
}
|
|
261
266
|
|
|
267
|
+
~ParallelCSVGlobalState() override {
|
|
268
|
+
}
|
|
269
|
+
|
|
262
270
|
idx_t MaxThreads() const override;
|
|
263
271
|
//! Returns buffer and index that caller thread should read.
|
|
264
272
|
unique_ptr<CSVBufferRead> Next(ClientContext &context, ReadCSVData &bind_data);
|
|
265
|
-
//!
|
|
273
|
+
//! Verify if the CSV File was read correctly
|
|
274
|
+
void Verify();
|
|
275
|
+
|
|
276
|
+
void UpdateVerification(VerificationPositions positions);
|
|
277
|
+
|
|
278
|
+
void IncrementThread();
|
|
279
|
+
|
|
280
|
+
void DecrementThread();
|
|
281
|
+
|
|
266
282
|
bool Finished();
|
|
283
|
+
|
|
267
284
|
//! How many bytes were read up to this point
|
|
268
285
|
atomic<idx_t> bytes_read;
|
|
269
286
|
//! Size of current file
|
|
270
287
|
idx_t file_size;
|
|
271
|
-
//! The index of the next file to read (i.e. current file + 1)
|
|
272
|
-
idx_t file_index = 1;
|
|
273
288
|
|
|
274
289
|
double GetProgress(ReadCSVData &bind_data) const {
|
|
275
290
|
idx_t total_files = bind_data.files.size();
|
|
@@ -290,21 +305,20 @@ public:
|
|
|
290
305
|
private:
|
|
291
306
|
//! File Handle for current file
|
|
292
307
|
unique_ptr<CSVFileHandle> file_handle;
|
|
293
|
-
|
|
294
308
|
shared_ptr<CSVBuffer> current_buffer;
|
|
295
309
|
shared_ptr<CSVBuffer> next_buffer;
|
|
296
310
|
|
|
311
|
+
//! The index of the next file to read (i.e. current file + 1)
|
|
312
|
+
idx_t file_index = 1;
|
|
313
|
+
|
|
297
314
|
//! Mutex to lock when getting next batch of bytes (Parallel Only)
|
|
298
315
|
mutex main_mutex;
|
|
299
316
|
//! Byte set from for last thread
|
|
300
317
|
idx_t next_byte = 0;
|
|
301
|
-
|
|
302
318
|
//! The current estimated line number
|
|
303
319
|
idx_t estimated_linenr;
|
|
304
|
-
|
|
305
320
|
//! How many bytes we should execute per local state
|
|
306
321
|
idx_t bytes_per_local_state;
|
|
307
|
-
|
|
308
322
|
//! Size of first file
|
|
309
323
|
idx_t first_file_size;
|
|
310
324
|
//! Basically max number of threads in DuckDB
|
|
@@ -313,20 +327,73 @@ private:
|
|
|
313
327
|
idx_t buffer_size;
|
|
314
328
|
//! Current batch index
|
|
315
329
|
idx_t batch_index = 0;
|
|
330
|
+
//! Forces parallelism for small CSV Files, should only be used for testing.
|
|
331
|
+
bool force_parallelism;
|
|
332
|
+
//! Current (Global) position of CSV
|
|
333
|
+
idx_t current_csv_position = 0;
|
|
334
|
+
idx_t max_tuple_end = 0;
|
|
335
|
+
//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
|
|
336
|
+
//! positions where they started reading the first line.
|
|
337
|
+
vector<idx_t> tuple_end;
|
|
338
|
+
set<idx_t> tuple_start;
|
|
339
|
+
idx_t running_threads = 0;
|
|
316
340
|
};
|
|
317
341
|
|
|
318
342
|
idx_t ParallelCSVGlobalState::MaxThreads() const {
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
//
|
|
343
|
+
if (force_parallelism) {
|
|
344
|
+
return system_threads;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
idx_t one_mb = 1000000; // We initialize max one thread per Mb
|
|
348
|
+
idx_t threads_per_mb = first_file_size / one_mb + 1;
|
|
349
|
+
if (threads_per_mb < system_threads) {
|
|
350
|
+
return threads_per_mb;
|
|
351
|
+
}
|
|
352
|
+
|
|
324
353
|
return system_threads;
|
|
325
354
|
}
|
|
326
355
|
|
|
356
|
+
void ParallelCSVGlobalState::IncrementThread() {
|
|
357
|
+
lock_guard<mutex> parallel_lock(main_mutex);
|
|
358
|
+
running_threads++;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
void ParallelCSVGlobalState::DecrementThread() {
|
|
362
|
+
lock_guard<mutex> parallel_lock(main_mutex);
|
|
363
|
+
D_ASSERT(running_threads > 0);
|
|
364
|
+
running_threads--;
|
|
365
|
+
}
|
|
366
|
+
|
|
327
367
|
bool ParallelCSVGlobalState::Finished() {
|
|
328
368
|
lock_guard<mutex> parallel_lock(main_mutex);
|
|
329
|
-
return
|
|
369
|
+
return running_threads == 0;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
void ParallelCSVGlobalState::Verify() {
|
|
373
|
+
// All threads are done, we run some magic sweet verification code
|
|
374
|
+
if (running_threads == 0) {
|
|
375
|
+
for (auto &last_pos : tuple_end) {
|
|
376
|
+
auto first_pos = tuple_start.find(last_pos);
|
|
377
|
+
if (first_pos == tuple_start.end()) {
|
|
378
|
+
// this might be necessary due to carriage returns outside buffer scopes.
|
|
379
|
+
first_pos = tuple_start.find(last_pos + 1);
|
|
380
|
+
}
|
|
381
|
+
if (first_pos == tuple_start.end() && last_pos != max_tuple_end) {
|
|
382
|
+
string error = "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
|
|
383
|
+
" does not have a match\n";
|
|
384
|
+
error += "End Lines: \n";
|
|
385
|
+
for (auto &end_line : tuple_end) {
|
|
386
|
+
error += to_string(end_line) + "\n";
|
|
387
|
+
}
|
|
388
|
+
error += "Start Lines: \n";
|
|
389
|
+
for (auto &start_line : tuple_start) {
|
|
390
|
+
error += to_string(start_line) + "\n";
|
|
391
|
+
}
|
|
392
|
+
throw InvalidInputException(
|
|
393
|
+
"CSV File not supported for multithreading. Please run single-threaded CSV Reading");
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
}
|
|
330
397
|
}
|
|
331
398
|
|
|
332
399
|
unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, ReadCSVData &bind_data) {
|
|
@@ -348,7 +415,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
|
|
|
348
415
|
current_buffer = next_buffer;
|
|
349
416
|
if (next_buffer) {
|
|
350
417
|
// Next buffer gets the next-next buffer
|
|
351
|
-
next_buffer = next_buffer->Next(*file_handle, buffer_size);
|
|
418
|
+
next_buffer = next_buffer->Next(*file_handle, buffer_size, current_csv_position);
|
|
352
419
|
}
|
|
353
420
|
}
|
|
354
421
|
if (current_buffer && !next_buffer) {
|
|
@@ -356,11 +423,26 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
|
|
|
356
423
|
if (file_index < bind_data.files.size()) {
|
|
357
424
|
bind_data.options.file_path = bind_data.files[file_index++];
|
|
358
425
|
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
359
|
-
|
|
426
|
+
current_csv_position = 0;
|
|
427
|
+
// FIXME: This will probably require some changes on the verification code
|
|
428
|
+
next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
|
|
360
429
|
}
|
|
361
430
|
}
|
|
362
431
|
return result;
|
|
363
432
|
}
|
|
433
|
+
void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
|
|
434
|
+
lock_guard<mutex> parallel_lock(main_mutex);
|
|
435
|
+
if (positions.beginning_of_first_line < positions.end_of_last_line) {
|
|
436
|
+
if (positions.end_of_last_line > max_tuple_end) {
|
|
437
|
+
max_tuple_end = positions.end_of_last_line;
|
|
438
|
+
}
|
|
439
|
+
tuple_start.insert(positions.beginning_of_first_line);
|
|
440
|
+
tuple_end.push_back(positions.end_of_last_line);
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
void SetNewLine() {
|
|
445
|
+
}
|
|
364
446
|
|
|
365
447
|
static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
|
|
366
448
|
TableFunctionInitInput &input) {
|
|
@@ -373,10 +455,11 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
|
|
|
373
455
|
|
|
374
456
|
bind_data.options.file_path = bind_data.files[0];
|
|
375
457
|
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
376
|
-
idx_t rows_to_skip =
|
|
458
|
+
idx_t rows_to_skip =
|
|
459
|
+
bind_data.options.skip_rows + (bind_data.options.has_header && bind_data.options.header ? 1 : 0);
|
|
377
460
|
return make_unique<ParallelCSVGlobalState>(context, std::move(file_handle), bind_data.files,
|
|
378
461
|
context.db->NumberOfThreads(), bind_data.options.buffer_size,
|
|
379
|
-
rows_to_skip);
|
|
462
|
+
rows_to_skip, ClientConfig::GetConfig(context).verify_parallelism);
|
|
380
463
|
}
|
|
381
464
|
|
|
382
465
|
//===--------------------------------------------------------------------===//
|
|
@@ -390,6 +473,7 @@ public:
|
|
|
390
473
|
//! The CSV reader
|
|
391
474
|
unique_ptr<ParallelCSVReader> csv_reader;
|
|
392
475
|
CSVBufferRead previous_buffer;
|
|
476
|
+
bool done = false;
|
|
393
477
|
};
|
|
394
478
|
|
|
395
479
|
unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
|
|
@@ -401,9 +485,10 @@ unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &c
|
|
|
401
485
|
if (next_local_buffer) {
|
|
402
486
|
csv_reader = make_unique<ParallelCSVReader>(context.client, csv_data.options, std::move(next_local_buffer),
|
|
403
487
|
csv_data.sql_types);
|
|
488
|
+
} else {
|
|
489
|
+
global_state.DecrementThread();
|
|
404
490
|
}
|
|
405
|
-
|
|
406
|
-
return std::move(new_local_state);
|
|
491
|
+
return make_unique<ParallelCSVLocalState>(std::move(csv_reader));
|
|
407
492
|
}
|
|
408
493
|
|
|
409
494
|
static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
@@ -417,13 +502,14 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
|
|
|
417
502
|
}
|
|
418
503
|
|
|
419
504
|
do {
|
|
420
|
-
if (output.size() != 0
|
|
421
|
-
csv_local_state.csv_reader->end_buffer)) {
|
|
505
|
+
if (output.size() != 0) {
|
|
422
506
|
break;
|
|
423
507
|
}
|
|
424
|
-
if (csv_local_state.csv_reader->
|
|
508
|
+
if (csv_local_state.csv_reader->finished) {
|
|
509
|
+
csv_global_state.UpdateVerification(csv_local_state.csv_reader->GetVerificationPositions());
|
|
425
510
|
auto next_chunk = csv_global_state.Next(context, bind_data);
|
|
426
511
|
if (!next_chunk) {
|
|
512
|
+
csv_global_state.DecrementThread();
|
|
427
513
|
break;
|
|
428
514
|
}
|
|
429
515
|
csv_local_state.csv_reader->SetBufferRead(std::move(next_chunk));
|
|
@@ -431,7 +517,9 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
|
|
|
431
517
|
csv_local_state.csv_reader->ParseCSV(output);
|
|
432
518
|
|
|
433
519
|
} while (true);
|
|
434
|
-
|
|
520
|
+
if (csv_global_state.Finished()) {
|
|
521
|
+
csv_global_state.Verify();
|
|
522
|
+
}
|
|
435
523
|
if (bind_data.options.union_by_name) {
|
|
436
524
|
throw InternalException("FIXME: union by name");
|
|
437
525
|
}
|
|
@@ -678,6 +766,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
|
|
|
678
766
|
table_function.named_parameters["sep"] = LogicalType::VARCHAR;
|
|
679
767
|
table_function.named_parameters["delim"] = LogicalType::VARCHAR;
|
|
680
768
|
table_function.named_parameters["quote"] = LogicalType::VARCHAR;
|
|
769
|
+
table_function.named_parameters["new_line"] = LogicalType::VARCHAR;
|
|
681
770
|
table_function.named_parameters["escape"] = LogicalType::VARCHAR;
|
|
682
771
|
table_function.named_parameters["nullstr"] = LogicalType::VARCHAR;
|
|
683
772
|
table_function.named_parameters["columns"] = LogicalType::ANY;
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
|
2
|
-
#define DUCKDB_VERSION "0.6.2-dev1687"
|
|
2
|
+
#define DUCKDB_VERSION "0.6.2-dev1736"
|
|
3
3
|
#endif
|
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
|
5
|
+
#define DUCKDB_SOURCE_ID "424848838c"
|
|
6
6
|
#endif
|
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
|
8
8
|
#include "duckdb/main/database.hpp"
|
|
@@ -14,6 +14,10 @@
|
|
|
14
14
|
|
|
15
15
|
#include <assert.h>
|
|
16
16
|
#define D_ASSERT assert
|
|
17
|
+
//! Declare DuckDBAssertInternal in this branch as well (the one where D_ASSERT is
//! defined as the standard assert macro); the #else branch carries the same declaration.
namespace duckdb {
|
|
18
|
+
//! NOTE(review): parameters are presumably the evaluated condition, its source text,
//! and the file/line of the assertion - confirm against the definition.
DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
|
|
19
|
+
}
|
|
20
|
+
|
|
17
21
|
#else
|
|
18
22
|
namespace duckdb {
|
|
19
23
|
DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
|
|
@@ -97,6 +97,9 @@ protected:
|
|
|
97
97
|
void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
|
|
98
98
|
static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
|
|
99
99
|
|
|
100
|
+
//! Sets the newline delimiter
|
|
101
|
+
void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);
|
|
102
|
+
|
|
100
103
|
protected:
|
|
101
104
|
//! Whether or not the current row's columns have overflown return_types.size()
|
|
102
105
|
bool error_column_overflow = false;
|
|
@@ -75,7 +75,6 @@ public:
|
|
|
75
75
|
public:
|
|
76
76
|
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
|
77
77
|
void ParseCSV(DataChunk &insert_chunk);
|
|
78
|
-
|
|
79
78
|
static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
|
|
80
79
|
|
|
81
80
|
private:
|
|
@@ -20,13 +20,15 @@ public:
|
|
|
20
20
|
static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
|
|
21
21
|
|
|
22
22
|
//! Constructor for Initial Buffer
|
|
23
|
-
CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle
|
|
23
|
+
CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
|
|
24
|
+
idx_t &global_csv_current_position);
|
|
24
25
|
|
|
25
26
|
//! Constructor for `Next()` Buffers
|
|
26
|
-
CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer
|
|
27
|
+
CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
|
|
28
|
+
idx_t global_csv_current_position);
|
|
27
29
|
|
|
28
30
|
//! Creates a new buffer with the next part of the CSV File
|
|
29
|
-
unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t
|
|
31
|
+
unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
|
|
30
32
|
|
|
31
33
|
//! Gets the buffer actual size
|
|
32
34
|
idx_t GetBufferSize();
|
|
@@ -40,6 +42,8 @@ public:
|
|
|
40
42
|
//! If this buffer is the first buffer of the CSV File
|
|
41
43
|
bool IsCSVFileFirstBuffer();
|
|
42
44
|
|
|
45
|
+
idx_t GetCSVGlobalStart();
|
|
46
|
+
|
|
43
47
|
BufferHandle AllocateBuffer(idx_t buffer_size);
|
|
44
48
|
|
|
45
49
|
char *Ptr() {
|
|
@@ -59,5 +63,7 @@ private:
|
|
|
59
63
|
bool last_buffer = false;
|
|
60
64
|
//! If this is the first buffer of the CSV File
|
|
61
65
|
bool first_buffer = false;
|
|
66
|
+
//! Global position from the CSV File where this buffer starts
|
|
67
|
+
idx_t global_csv_start = 0;
|
|
62
68
|
};
|
|
63
69
|
} // namespace duckdb
|
|
@@ -17,6 +17,13 @@
|
|
|
17
17
|
|
|
18
18
|
namespace duckdb {
|
|
19
19
|
|
|
20
|
+
//! Newline convention of a CSV file.
//! Stored in BufferedCSVReaderOptions::new_line, which defaults to NOT_SET.
enum NewLineIdentifier {
|
|
21
|
+
SINGLE = 1, // Either \r or \n
|
|
22
|
+
CARRY_ON = 2, // \r\n
|
|
23
|
+
MIX = 3, // Mixed newline styles ("Hippie-Land"), can't run it multithreaded
|
|
24
|
+
// No newline convention selected (the default value of the option)
NOT_SET = 4
|
|
25
|
+
};
|
|
26
|
+
|
|
20
27
|
struct BufferedCSVReaderOptions {
|
|
21
28
|
//===--------------------------------------------------------------------===//
|
|
22
29
|
// CommonCSVOptions
|
|
@@ -26,7 +33,11 @@ struct BufferedCSVReaderOptions {
|
|
|
26
33
|
bool has_delimiter = false;
|
|
27
34
|
//! Delimiter to separate columns within each line
|
|
28
35
|
string delimiter = ",";
|
|
29
|
-
//! Whether or not a
|
|
36
|
+
//! Whether or not a new_line was defined by the user
|
|
37
|
+
bool has_newline = false;
|
|
38
|
+
//! New Line separator
|
|
39
|
+
NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
|
|
40
|
+
|
|
30
41
|
bool has_quote = false;
|
|
31
42
|
//! Quote used for columns that contain reserved characters, e.g., delimiter
|
|
32
43
|
string quote = "\"";
|
|
@@ -112,6 +123,8 @@ struct BufferedCSVReaderOptions {
|
|
|
112
123
|
void Deserialize(FieldReader &reader);
|
|
113
124
|
|
|
114
125
|
void SetDelimiter(const string &delimiter);
|
|
126
|
+
|
|
127
|
+
void SetNewline(const string &input);
|
|
115
128
|
//! Set an option that is supported by both reading and writing functions, called by
|
|
116
129
|
//! the SetReadOption and SetWriteOption methods
|
|
117
130
|
bool SetBaseOption(const string &loption, const Value &value);
|
|
@@ -91,6 +91,10 @@ struct CSVBufferRead {
|
|
|
91
91
|
idx_t estimated_linenr;
|
|
92
92
|
};
|
|
93
93
|
|
|
94
|
+
//! Position of the first read line and of the last read line of a reader, kept for
//! verification purposes. Returned by ParallelCSVReader::GetVerificationPositions()
//! and passed to ParallelCSVGlobalState::UpdateVerification().
//! NOTE(review): presumably offsets into the CSV file - confirm the unit.
struct VerificationPositions {
|
|
95
|
+
//! Where the first line read begins; defaults to 0
idx_t beginning_of_first_line = 0;
|
|
96
|
+
//! Where the last line read ends; defaults to 0
idx_t end_of_last_line = 0;
|
|
97
|
+
};
|
|
94
98
|
//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
|
|
95
99
|
class ParallelCSVReader : public BaseCSVReader {
|
|
96
100
|
public:
|
|
@@ -111,7 +115,10 @@ public:
|
|
|
111
115
|
//! If this flag is set, it means we are about to try to read our last row.
|
|
112
116
|
bool reached_remainder_state = false;
|
|
113
117
|
|
|
118
|
+
bool finished = false;
|
|
119
|
+
|
|
114
120
|
unique_ptr<CSVBufferRead> buffer;
|
|
121
|
+
VerificationPositions GetVerificationPositions();
|
|
115
122
|
|
|
116
123
|
public:
|
|
117
124
|
void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
|
|
@@ -134,8 +141,13 @@ private:
|
|
|
134
141
|
//! when changing the buffer end the first time.
|
|
135
142
|
//! It returns FALSE if the parser should jump to the final state of parsing or not
|
|
136
143
|
bool BufferRemainder();
|
|
144
|
+
|
|
145
|
+
bool NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char);
|
|
146
|
+
|
|
137
147
|
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
|
138
148
|
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
|
|
149
|
+
//! Position of the first read line and last read line for verification purposes
|
|
150
|
+
VerificationPositions verification_positions;
|
|
139
151
|
};
|
|
140
152
|
|
|
141
153
|
} // namespace duckdb
|