duckdb 0.9.1-dev19.0 → 0.9.1-dev67.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +13 -9
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +1 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +3 -5
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +7 -9
- package/src/duckdb/src/function/table/read_csv.cpp +1 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -1
- package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp +2 -2
package/package.json
CHANGED
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
|
|
49
49
|
return (carry && carry_followed_by_nl) || (!carry && first_char);
|
50
50
|
}
|
51
51
|
|
52
|
-
void ParallelCSVReader::SkipEmptyLines() {
|
52
|
+
bool ParallelCSVReader::SkipEmptyLines() {
|
53
|
+
const idx_t initial_position_buffer = position_buffer;
|
53
54
|
idx_t new_pos_buffer = position_buffer;
|
54
55
|
if (parse_chunk.data.size() == 1) {
|
55
56
|
// Empty lines are null data.
|
56
|
-
return;
|
57
|
+
return initial_position_buffer != position_buffer;
|
57
58
|
}
|
58
59
|
for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
|
59
60
|
if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
|
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
|
|
63
64
|
position_buffer++;
|
64
65
|
}
|
65
66
|
if (new_pos_buffer > end_buffer) {
|
66
|
-
return;
|
67
|
+
return initial_position_buffer != position_buffer;
|
67
68
|
}
|
68
69
|
position_buffer = new_pos_buffer;
|
69
70
|
} else if ((*buffer)[new_pos_buffer] != ' ') {
|
70
|
-
return;
|
71
|
+
return initial_position_buffer != position_buffer;
|
71
72
|
}
|
72
73
|
}
|
74
|
+
return initial_position_buffer != position_buffer;
|
73
75
|
}
|
74
76
|
|
75
77
|
bool ParallelCSVReader::SetPosition() {
|
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
|
|
185
187
|
}
|
186
188
|
// Ensure that parse_chunk has no gunk when trying to figure new line
|
187
189
|
parse_chunk.Reset();
|
188
|
-
|
189
190
|
verification_positions.end_of_last_line = position_buffer;
|
190
191
|
finished = false;
|
191
192
|
return successfully_read_first_line;
|
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
|
|
288
289
|
idx_t column = 0;
|
289
290
|
idx_t offset = 0;
|
290
291
|
bool has_quotes = false;
|
291
|
-
|
292
|
+
bool last_line_empty = false;
|
292
293
|
vector<idx_t> escape_positions;
|
293
294
|
if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
|
294
295
|
// First time reading this buffer piece
|
@@ -454,7 +455,10 @@ add_row : {
|
|
454
455
|
if (!BufferRemainder()) {
|
455
456
|
goto final_state;
|
456
457
|
}
|
457
|
-
SkipEmptyLines();
|
458
|
+
if (SkipEmptyLines() && reached_remainder_state) {
|
459
|
+
last_line_empty = true;
|
460
|
+
goto final_state;
|
461
|
+
}
|
458
462
|
if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
|
459
463
|
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
460
464
|
return false;
|
@@ -583,8 +587,8 @@ final_state : {
|
|
583
587
|
return true;
|
584
588
|
}
|
585
589
|
// If this is the last buffer, we have to read the last value
|
586
|
-
if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
|
587
|
-
(buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
590
|
+
if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
|
591
|
+
(buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
|
588
592
|
if (column > 0 || start_buffer != position_buffer || try_add_line ||
|
589
593
|
(insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
|
590
594
|
// remaining values to be added to the chunk
|
@@ -26,8 +26,7 @@ struct SniffDialect {
|
|
26
26
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
27
27
|
machine.column_count += machine.previous_state == CSVState::DELIMITER;
|
28
28
|
sniffed_column_counts[machine.cur_rows] = machine.column_count;
|
29
|
-
machine.cur_rows +=
|
30
|
-
machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
|
29
|
+
machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
|
31
30
|
machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
|
32
31
|
|
33
32
|
// It means our carriage return is actually a record separator
|
@@ -143,7 +143,7 @@ struct SniffValue {
|
|
143
143
|
machine.rows_read++;
|
144
144
|
}
|
145
145
|
|
146
|
-
if ((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
|
146
|
+
if ((machine.previous_state == CSVState::RECORD_SEPARATOR) ||
|
147
147
|
(machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) {
|
148
148
|
sniffed_values[machine.cur_rows].position = machine.line_start_pos;
|
149
149
|
sniffed_values[machine.cur_rows].set = true;
|
@@ -153,8 +153,7 @@ struct SniffValue {
|
|
153
153
|
machine.Transition(current_char);
|
154
154
|
|
155
155
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
156
|
-
if (machine.previous_state == CSVState::DELIMITER ||
|
157
|
-
(machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
|
156
|
+
if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
|
158
157
|
(machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
|
159
158
|
// Started a new value
|
160
159
|
// Check if it's UTF-8
|
@@ -173,8 +172,7 @@ struct SniffValue {
|
|
173
172
|
(machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
|
174
173
|
machine.value += current_char;
|
175
174
|
}
|
176
|
-
machine.cur_rows +=
|
177
|
-
machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
|
175
|
+
machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
|
178
176
|
// It means our carriage return is actually a record separator
|
179
177
|
machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
|
180
178
|
if (machine.cur_rows >= sniffed_values.size()) {
|
@@ -3,9 +3,9 @@
|
|
3
3
|
namespace duckdb {
|
4
4
|
struct Parse {
|
5
5
|
inline static void Initialize(CSVStateMachine &machine) {
|
6
|
-
machine.state = CSVState::
|
7
|
-
machine.previous_state = CSVState::
|
8
|
-
machine.pre_previous_state = CSVState::
|
6
|
+
machine.state = CSVState::EMPTY_LINE;
|
7
|
+
machine.previous_state = CSVState::EMPTY_LINE;
|
8
|
+
machine.pre_previous_state = CSVState::EMPTY_LINE;
|
9
9
|
|
10
10
|
machine.cur_rows = 0;
|
11
11
|
machine.column_count = 0;
|
@@ -17,16 +17,15 @@ struct Parse {
|
|
17
17
|
machine.Transition(current_char);
|
18
18
|
|
19
19
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
20
|
-
if (machine.previous_state == CSVState::DELIMITER ||
|
21
|
-
(machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
|
20
|
+
if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
|
22
21
|
(machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
|
23
22
|
// Started a new value
|
24
23
|
// Check if it's UTF-8 (Or not?)
|
25
24
|
machine.VerifyUTF8();
|
26
25
|
auto &v = parse_chunk.data[machine.column_count++];
|
27
26
|
auto parse_data = FlatVector::GetData<string_t>(v);
|
28
|
-
auto &validity_mask = FlatVector::Validity(v);
|
29
27
|
if (machine.value.empty()) {
|
28
|
+
auto &validity_mask = FlatVector::Validity(v);
|
30
29
|
validity_mask.SetInvalid(machine.cur_rows);
|
31
30
|
} else {
|
32
31
|
parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
|
@@ -47,12 +46,11 @@ struct Parse {
|
|
47
46
|
(machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
|
48
47
|
machine.value += current_char;
|
49
48
|
}
|
50
|
-
machine.cur_rows +=
|
51
|
-
machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
|
49
|
+
machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0;
|
52
50
|
machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);
|
53
51
|
|
54
52
|
// It means our carriage return is actually a record separator
|
55
|
-
machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
|
53
|
+
machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0;
|
56
54
|
machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
|
57
55
|
|
58
56
|
if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
|
@@ -38,7 +38,7 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
|
|
38
38
|
auto number_of_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
|
39
39
|
//! If we have many csv files, we run single-threaded on each file and parallelize on the number of files
|
40
40
|
bool many_csv_files = files.size() > 1 && int64_t(files.size() * 2) >= number_of_threads;
|
41
|
-
if (options.parallel_mode != ParallelMode::PARALLEL && many_csv_files) {
|
41
|
+
if (options.parallel_mode != ParallelMode::PARALLEL && (many_csv_files || number_of_threads == 1)) {
|
42
42
|
single_threaded = true;
|
43
43
|
}
|
44
44
|
if (options.parallel_mode == ParallelMode::SINGLE_THREADED || not_supported_options ||
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "v0.9.1-dev19"
|
2
|
+
#define DUCKDB_VERSION "v0.9.1-dev67"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "7512d7ff4f"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -148,7 +148,7 @@ private:
|
|
148
148
|
//! Sets Position depending on the byte_start of this thread
|
149
149
|
bool SetPosition();
|
150
150
|
//! Called when scanning the 1st buffer, skips empty lines
|
151
|
-
void SkipEmptyLines();
|
151
|
+
bool SkipEmptyLines();
|
152
152
|
//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
|
153
153
|
//! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
|
154
154
|
//! when changing the buffer end the first time.
|
@@ -38,8 +38,8 @@ void CommonAggregateOptimizer::ExtractCommonAggregates(LogicalAggregate &aggr) {
|
|
38
38
|
// aggregate does not exist yet: add it to the map
|
39
39
|
aggregate_remap[*aggr.expressions[i]] = i;
|
40
40
|
if (i != original_index) {
|
41
|
-
// this aggregate is not erased, however an
|
42
|
-
// so we need to remap this
|
41
|
+
// this aggregate is not erased, however an aggregate BEFORE it has been erased
|
42
|
+
// so we need to remap this aggregate
|
43
43
|
ColumnBinding original_binding(aggr.aggregate_index, original_index);
|
44
44
|
ColumnBinding new_binding(aggr.aggregate_index, i);
|
45
45
|
aggregate_map[original_binding] = new_binding;
|