duckdb 0.8.2-dev4653.0 → 0.8.2-dev4711.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +7 -0
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
- package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
- package/src/duckdb/src/function/table/read_csv.cpp +12 -9
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
- package/src/duckdb/src/main/query_result.cpp +16 -10
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
package/package.json
CHANGED
package/src/duckdb/src/common/types/row/tuple_data_collection.cpp
CHANGED
@@ -433,6 +433,13 @@ bool TupleDataCollection::Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result) {
     return true;
 }
 
+bool TupleDataCollection::ScanComplete(const TupleDataScanState &state) const {
+    if (Count() == 0) {
+        return true;
+    }
+    return state.segment_index == segments.size() - 1 && state.chunk_index == segments.back().ChunkCount();
+}
+
 void TupleDataCollection::FinalizePinState(TupleDataPinState &pin_state, TupleDataSegment &segment) {
     segment.allocator->ReleaseOrStoreHandles(pin_state, segment);
 }
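The new ScanComplete treats an empty collection as trivially complete; otherwise the scan cursor must sit on the last segment with every chunk of it consumed. A minimal standalone sketch of the same end-of-cursor test (ScanCursor, Segment and Collection are illustrative stand-ins, not DuckDB API):

#include <cstddef>
#include <vector>

// Illustrative stand-ins for TupleDataScanState / TupleDataCollection.
struct ScanCursor {
    std::size_t segment_index = 0; // segment currently being read
    std::size_t chunk_index = 0;   // chunks consumed within that segment
};

struct Segment {
    std::size_t chunk_count;
};

struct Collection {
    std::vector<Segment> segments;

    bool ScanComplete(const ScanCursor &c) const {
        if (segments.empty()) {
            return true; // nothing to scan: trivially complete
        }
        // Complete only when the cursor sits on the last segment and has
        // consumed every chunk of it.
        return c.segment_index == segments.size() - 1 && c.chunk_index == segments.back().chunk_count;
    }
};
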
package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp
CHANGED
@@ -782,13 +782,13 @@ public:
     }
 
     auto &ht_state = op.sink_state->Cast<HashAggregateGlobalSinkState>();
-    idx_t …
+    idx_t partitions = 0;
     for (size_t sidx = 0; sidx < op.groupings.size(); ++sidx) {
         auto &grouping = op.groupings[sidx];
         auto &grouping_gstate = ht_state.grouping_states[sidx];
-
+        partitions += grouping.table_data.NumberOfPartitions(*grouping_gstate.table_state);
     }
-    return MaxValue<idx_t>(1, …
+    return MaxValue<idx_t>(1, partitions);
 }
 };
package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp
CHANGED
@@ -263,7 +263,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error…
         return true;
     }
 
-    if (mode == ParserMode::SNIFFING_DATATYPES…
+    if (mode == ParserMode::SNIFFING_DATATYPES) {
         return true;
     }
@@ -480,6 +480,10 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad…
 
         bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
         if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
+            Increment(buffer_idx);
+            auto bla = GetLineError(global_row_idx, buffer_idx, false);
+            row_idx += bla;
+            row_idx -= bla;
             row_failed = true;
             failed_cells.emplace_back(row_idx, col_idx, row_line);
         }
package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp
CHANGED
@@ -8,10 +8,14 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle…
     : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
     AllocateBuffer(buffer_size_p);
     auto buffer = Ptr();
-    …
+    actual_buffer_size = file_handle.Read(buffer, buffer_size_p);
+    while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) {
+        // We keep reading until this block is full
+        actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size_p - actual_buffer_size);
+    }
     global_csv_start = global_csv_current_position;
     // BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
-    if (…
+    if (actual_buffer_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
         start_position += 3;
     }
     last_buffer = file_handle.FinishedReading();
@@ -22,13 +26,18 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b…
     : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
       can_seek(file_handle.CanSeek()) {
     AllocateBuffer(buffer_size);
-    …
+    auto buffer = handle.Ptr();
+    actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size);
+    while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) {
+        // We keep reading until this block is full
+        actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size - actual_buffer_size);
+    }
     last_buffer = file_handle.FinishedReading();
 }
 
 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
     auto next_csv_buffer =
-        make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + …
+        make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p);
     if (next_csv_buffer->GetBufferSize() == 0) {
         // We are done reading
         return nullptr;
@@ -43,13 +52,13 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
 }
 
 idx_t CSVBuffer::GetBufferSize() {
-    return …
+    return actual_buffer_size;
 }
 
 void CSVBuffer::Reload(CSVFileHandle &file_handle) {
-    AllocateBuffer(…
+    AllocateBuffer(actual_buffer_size);
     file_handle.Seek(global_csv_start);
-    file_handle.Read(handle.Ptr(), …
+    file_handle.Read(handle.Ptr(), actual_buffer_size);
 }
 
 unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
@@ -59,8 +68,8 @@ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
         block = nullptr;
         Reload(file_handle);
     }
-    return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), …
-                                      start_position, file_number);
+    return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), actual_buffer_size, first_buffer, last_buffer,
+                                      global_csv_start, start_position, file_number);
 }
 
 void CSVBuffer::Unpin() {
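Both constructors now loop on short reads so a CSV buffer is always filled to capacity unless the file runs out, and the BOM check only fires when at least three bytes were actually read. A minimal sketch of the same fill-then-skip-BOM logic against a generic reader (FillBuffer and the callback signatures are illustrative, not DuckDB API):

#include <cstddef>
#include <cstring>
#include <functional>

// Illustrative fill-then-skip-BOM logic; read_fn stands in for
// CSVFileHandle::Read and may return fewer bytes than requested,
// finished_fn for CSVFileHandle::FinishedReading.
std::size_t FillBuffer(char *buffer, std::size_t buffer_size,
                       const std::function<std::size_t(char *, std::size_t)> &read_fn,
                       const std::function<bool()> &finished_fn, std::size_t &start_position) {
    std::size_t actual_size = read_fn(buffer, buffer_size);
    while (actual_size < buffer_size && !finished_fn()) {
        // Keep reading until the block is full or the file is exhausted.
        actual_size += read_fn(buffer + actual_size, buffer_size - actual_size);
    }
    // UTF-8 byte order mark (EF BB BF) must be skipped, and is only checked
    // when at least three bytes were actually read.
    if (actual_size >= 3 && std::memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) {
        start_position += 3;
    }
    return actual_size;
}
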
package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp
CHANGED
@@ -168,38 +168,24 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,…
     if (loption == "auto_detect") {
         auto_detect = ParseBoolean(value, loption);
     } else if (loption == "sample_size") {
-        int64_t …
-        if (…
+        int64_t sample_size_option = ParseInteger(value, loption);
+        if (sample_size_option < 1 && sample_size_option != -1) {
             throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
         }
-        if (…
-        …
-        …
-        } else if (sample_size <= STANDARD_VECTOR_SIZE) {
-            sample_chunk_size = sample_size;
-            sample_chunks = 1;
+        if (sample_size_option == -1) {
+            // If -1, we basically read the whole thing
+            sample_size_chunks = NumericLimits<idx_t>().Maximum();
         } else {
-            …
-            …
+            sample_size_chunks = sample_size_option / STANDARD_VECTOR_SIZE;
+            if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
+                sample_size_chunks++;
+            }
         }
+
     } else if (loption == "skip") {
         SetSkipRows(ParseInteger(value, loption));
     } else if (loption == "max_line_size" || loption == "maximum_line_size") {
         maximum_line_size = ParseInteger(value, loption);
-    } else if (loption == "sample_chunk_size") {
-        sample_chunk_size = ParseInteger(value, loption);
-        if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
-            throw BinderException(
-                "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
-                STANDARD_VECTOR_SIZE);
-        } else if (sample_chunk_size < 1) {
-            throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
-        }
-    } else if (loption == "sample_chunks") {
-        sample_chunks = ParseInteger(value, loption);
-        if (sample_chunks < 1) {
-            throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
-        }
     } else if (loption == "force_not_null") {
         force_not_null = ParseColumnList(value, expected_names, loption);
     } else if (loption == "date_format" || loption == "dateformat") {
@@ -322,7 +308,7 @@ string CSVReaderOptions::ToString() const {
            (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
            "\n header=" + std::to_string(dialect_options.header) +
            (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
-           "\n sample_size=" + std::to_string(…
+           "\n sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) +
            "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
 }
 
@@ -489,8 +475,6 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
     if (skip_rows_set) {
         named_params["skip"] = Value::BIGINT(GetSkipRows());
     }
-    named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
-    named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
     named_params["null_padding"] = Value::BOOLEAN(null_padding);
     if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
         named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
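sample_size is now translated into a whole number of vector-sized chunks: -1 means "sample everything", anything else is rounded up by ceiling division. A standalone sketch of that conversion (kVectorSize stands in for STANDARD_VECTOR_SIZE):

#include <cstdint>
#include <limits>
#include <stdexcept>

// kVectorSize stands in for STANDARD_VECTOR_SIZE (commonly 2048).
static constexpr int64_t kVectorSize = 2048;

// Mirrors the new sample_size handling: -1 means "sample everything",
// anything else is rounded up to a whole number of chunks.
uint64_t SampleSizeToChunks(int64_t sample_size_rows) {
    if (sample_size_rows < 1 && sample_size_rows != -1) {
        throw std::invalid_argument("sample_size cannot be smaller than 1");
    }
    if (sample_size_rows == -1) {
        return std::numeric_limits<uint64_t>::max();
    }
    uint64_t chunks = static_cast<uint64_t>(sample_size_rows) / kVectorSize;
    if (sample_size_rows % kVectorSize != 0) {
        chunks++; // ceiling division: a partial chunk still costs one chunk
    }
    return chunks;
}
// e.g. with kVectorSize == 2048: SampleSizeToChunks(20480) == 10 and
// SampleSizeToChunks(1) == 1, matching the new default of 20480 rows.
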
package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp
CHANGED
@@ -29,8 +29,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op…
         InitializeTransitionArray(transition_array[i], quoted_state);
         break;
     case unquoted_state:
-        …
-        break;
+    case invalid_state:
     case escape_state:
         InitializeTransitionArray(transition_array[i], invalid_state);
         break;
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp
CHANGED
@@ -647,6 +647,10 @@ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool s…
     }
 }
 
+void ParallelCSVReader::Increment(idx_t buffer_idx) {
+    return buffer->line_info->Increment(file_idx, buffer_idx);
+}
+
 bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
     DataChunk dummy_chunk;
     string error_message;
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
CHANGED
@@ -3,8 +3,9 @@
 namespace duckdb {
 
 CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
-                       CSVStateMachineCache &state_machine_cache_p)
-    : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
+                       CSVStateMachineCache &state_machine_cache_p, bool explicit_set_columns_p)
+    : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
+      explicit_set_columns(explicit_set_columns_p) {
 
     // Check if any type is BLOB
     for (auto &type : options.sql_type_list) {
@@ -24,6 +25,14 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>…
 SnifferResult CSVSniffer::SniffCSV() {
     // 1. Dialect Detection
     DetectDialect();
+    if (explicit_set_columns) {
+        if (!candidates.empty()) {
+            options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
+            options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
+        }
+        // We do not need to run type and header detection as these were defined by the user
+        return SnifferResult(detected_types, names);
+    }
     // 2. Type Detection
     DetectTypes();
     // 3. Header Detection
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
CHANGED
@@ -15,7 +15,7 @@ struct SniffDialect {
     inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
                                idx_t current_pos) {
 
-        D_ASSERT(sniffed_column_counts.size() == …
+        D_ASSERT(sniffed_column_counts.size() == STANDARD_VECTOR_SIZE);
 
         if (machine.state == CSVState::INVALID) {
             sniffed_column_counts.clear();
@@ -45,7 +45,7 @@ struct SniffDialect {
         machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
                                            (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
                                           machine.single_record_separator;
-        if (machine.cur_rows >= …
+        if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
             // We sniffed enough rows
             return true;
         }
@@ -55,10 +55,10 @@ struct SniffDialect {
         if (machine.state == CSVState::INVALID) {
             return;
         }
-        if (machine.cur_rows < …
+        if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state == CSVState::DELIMITER) {
             sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
         }
-        if (machine.cur_rows < …
+        if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
             sniffed_column_counts[machine.cur_rows++] = machine.column_count;
         }
         NewLineIdentifier suggested_newline;
@@ -145,7 +145,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachi…
 void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
                                          idx_t &best_consistent_rows, idx_t &prev_padding_count) {
     // The sniffed_column_counts variable keeps track of the number of columns found for each row
-    vector<idx_t> sniffed_column_counts(…
+    vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
 
     state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
     idx_t start_row = options.dialect_options.skip_rows;
@@ -244,7 +244,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachi…
 }
 
 bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
-    vector<idx_t> sniffed_column_counts(…
+    vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
     candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
     bool allow_padding = options.null_padding;
 
@@ -268,9 +268,9 @@ void CSVSniffer::RefineCandidates() {
         return;
     }
     for (auto &cur_candidate : candidates) {
-        for (idx_t i = 1; i <= options.…
+        for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
             bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
-            if (finished_file || i == options.…
+            if (finished_file || i == options.sample_size_chunks) {
                 // we finished the file or our chunk sample successfully: stop
                 auto successful_candidate = std::move(cur_candidate);
                 candidates.clear();
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp
CHANGED
@@ -283,11 +283,7 @@ void CSVSniffer::DetectTypes() {
         candidate->Reset();
 
         // Parse chunk and read csv with info candidate
-        …
-        if (options.sample_chunk_size == 1) {
-            sample_size++;
-        }
-        vector<TupleSniffing> tuples(sample_size);
+        vector<TupleSniffing> tuples(STANDARD_VECTOR_SIZE);
         candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
         // Potentially Skip empty rows (I find this dirty, but it is what the original code does)
         idx_t true_start = 0;
@@ -311,8 +307,10 @@ void CSVSniffer::DetectTypes() {
                 break;
             }
         }
+        if (values_start > 0) {
+            tuples.erase(tuples.begin(), tuples.begin() + values_start);
+        }
 
-        tuples.erase(tuples.begin(), tuples.begin() + values_start);
         idx_t row_idx = 0;
         if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
             // This means we have more than one row, hence we can use the first row to detect if we have a header
@@ -327,6 +325,9 @@ void CSVSniffer::DetectTypes() {
         for (; row_idx < tuples.size(); row_idx++) {
             for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
                 auto &col_type_candidates = info_sql_types_candidates[col];
+                // col_type_candidates can't be empty since anything in a CSV file should at least be a string
+                // and we validate utf-8 compatibility when creating the type
+                D_ASSERT(!col_type_candidates.empty());
                 auto cur_top_candidate = col_type_candidates.back();
                 auto dummy_val = tuples[row_idx].values[col];
                 // try cast from string to sql_type
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
CHANGED
@@ -46,7 +46,8 @@ struct Parse {
             validity_mask.SetInvalid(machine.cur_rows);
         }
     }
-    if (machine.state == CSVState::STANDARD…
+    if (machine.state == CSVState::STANDARD ||
+        (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
         machine.value += current_char;
     }
     machine.cur_rows +=…
@@ -57,7 +58,7 @@ struct Parse {
     machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
     machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
 
-    if (machine.cur_rows >= …
+    if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
         // We sniffed enough rows
         return true;
     }
@@ -65,11 +66,22 @@ struct Parse {
 }
 
 inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
-    if (machine.cur_rows < …
+    if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
         machine.VerifyUTF8();
         auto &v = parse_chunk.data[machine.column_count++];
         auto parse_data = FlatVector::GetData<string_t>(v);
-        …
+        if (machine.value.empty()) {
+            auto &validity_mask = FlatVector::Validity(v);
+            validity_mask.SetInvalid(machine.cur_rows);
+        } else {
+            parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+        }
+        while (machine.column_count < parse_chunk.ColumnCount()) {
+            auto &v_pad = parse_chunk.data[machine.column_count++];
+            auto &validity_mask = FlatVector::Validity(v_pad);
+            validity_mask.SetInvalid(machine.cur_rows);
+        }
+        machine.cur_rows++;
     }
     parse_chunk.SetCardinality(machine.cur_rows);
 }
@@ -104,8 +116,8 @@ void CSVSniffer::RefineTypes() {
         return;
     }
     DataChunk parse_chunk;
-    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, …
-    for (idx_t i = 1; i < best_candidate->options.…
+    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, STANDARD_VECTOR_SIZE);
+    for (idx_t i = 1; i < best_candidate->options.sample_size_chunks; i++) {
         bool finished_file = best_candidate->csv_buffer_iterator.Finished();
         if (finished_file) {
             // we finished the file: stop
@@ -124,6 +136,7 @@ void CSVSniffer::RefineTypes() {
         best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
         for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
             vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
+            bool is_bool_type = col_type_candidates.back() == LogicalType::BOOLEAN;
             while (col_type_candidates.size() > 1) {
                 const auto &sql_type = col_type_candidates.back();
                 // narrow down the date formats
@@ -154,6 +167,14 @@ void CSVSniffer::RefineTypes() {
                 if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
                     break;
                 } else {
+                    if (col_type_candidates.back() == LogicalType::BOOLEAN && is_bool_type) {
+                        // If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
+                        // immediately pop to varchar.
+                        while (col_type_candidates.back() != LogicalType::VARCHAR) {
+                            col_type_candidates.pop_back();
+                        }
+                        break;
+                    }
                     col_type_candidates.pop_back();
                 }
             }
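The refinement loop keeps a per-column stack of candidate types with VARCHAR (most general) at the front and the most specific candidate at the back; the new branch skips the intermediate candidates entirely when a column that looked boolean stops casting. A sketch of that short-circuit (the Type enum and stack layout are illustrative, not DuckDB API):

#include <vector>

// Illustrative per-column candidate stack, mirroring the sniffer's
// best_sql_types_candidates_per_column_idx: VARCHAR at the front,
// most specific candidate at the back.
enum class Type { VARCHAR, DOUBLE, BIGINT, BOOLEAN };

void OnCastFailure(std::vector<Type> &candidates, bool started_as_bool) {
    if (started_as_bool && candidates.back() == Type::BOOLEAN) {
        // A column that looked boolean ("T"/"F"/"True"/"False") just stopped
        // casting; no numeric candidate can rescue it, so fall straight
        // through to VARCHAR instead of trying each type in between.
        while (candidates.back() != Type::VARCHAR) {
            candidates.pop_back();
        }
        return;
    }
    candidates.pop_back(); // otherwise move on to the next, more general type
}
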
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
CHANGED
@@ -474,14 +474,9 @@ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstat…
 //===--------------------------------------------------------------------===//
 // Source
 //===--------------------------------------------------------------------===//
-idx_t RadixPartitionedHashTable::…
-    const auto count = CountInternal(sink_p);
-    return count == 0 && grouping_set.empty() ? 1 : count;
-}
-
-idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
+idx_t RadixPartitionedHashTable::NumberOfPartitions(GlobalSinkState &sink_p) const {
     auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
-    return sink.…
+    return sink.partitions.size();
 }
 
 void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
@@ -570,8 +565,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL…
     D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);
 
     const auto n_partitions = sink.partitions.size();
-    if (…
-        finished = true;
+    if (finished) {
         return false;
     }
     // We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
@@ -595,6 +589,11 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL…
         return true;
     }
 
+    // We didn't assign a Scan task
+    if (sink.finalize_idx >= n_partitions) {
+        return false; // No finalize tasks left
+    }
+
     // We can just increment the atomic here, much simpler than assigning the scan task
     lstate.task_idx = sink.finalize_idx++;
     if (lstate.task_idx < n_partitions) {
@@ -603,7 +602,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL…
         return true;
     }
 
-    // We didn't manage to assign a …
+    // We didn't manage to assign a Finalize task
     return false;
 }
 
@@ -693,15 +692,18 @@ void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSo…
 
     if (!data_collection.Scan(scan_state, scan_chunk)) {
         scan_status = RadixHTScanStatus::DONE;
-        if (++gstate.scan_done == sink.partitions.size()) {
-            gstate.finished = true;
-        }
         if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
             data_collection.Reset();
         }
         return;
     }
 
+    if (data_collection.ScanComplete(scan_state)) {
+        if (++gstate.scan_done == sink.partitions.size()) {
+            gstate.finished = true;
+        }
+    }
+
     RowOperationsState row_state(aggregate_allocator);
     const auto group_cols = layout.ColumnCount() - 1;
     RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
@@ -758,36 +760,38 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D…
         return SourceResultType::FINISHED;
     }
 
-    …
-        aggr.function.destructor…
+    if (sink.count_before_combining == 0) {
+        if (grouping_set.empty()) {
+            // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+            D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
+            // For each column in the aggregates, set to initial state
+            chunk.SetCardinality(1);
+            for (auto null_group : null_groups) {
+                chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+                ConstantVector::SetNull(chunk.data[null_group], true);
+            }
+            ArenaAllocator allocator(BufferAllocator::Get(context.client));
+            for (idx_t i = 0; i < op.aggregates.size(); i++) {
+                D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
+                auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
+                auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
+                aggr.function.initialize(aggr_state.get());
+
+                AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
+                Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
+                aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
+                if (aggr.function.destructor) {
+                    aggr.function.destructor(state_vector, aggr_input_data, 1);
+                }
+            }
+            // Place the grouping values (all the groups of the grouping_set condensed into a single value)
+            // Behind the null groups + aggregates
+            for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
+                chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
             }
-    }
-    // Place the grouping values (all the groups of the grouping_set condensed into a single value)
-    // Behind the null groups + aggregates
-    for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-        chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
         }
         gstate.finished = true;
-        return SourceResultType::…
+        return SourceResultType::FINISHED;
     }
 
     while (!gstate.finished && chunk.size() == 0) {
@@ -796,7 +800,11 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D…
         }
     }
 
-    …
+    if (chunk.size() != 0) {
+        return SourceResultType::HAVE_MORE_OUTPUT;
+    } else {
+        return SourceResultType::FINISHED;
+    }
 }
 
 } // namespace duckdb
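The scan-done accounting moves from "Scan returned false" to "the cursor actually reached the end of the collection", so each partition is counted exactly once before the global flag flips. A sketch of the completion pattern (the struct is illustrative; the names mirror gstate.scan_done and gstate.finished):

#include <atomic>
#include <cstddef>

// Illustrative completion accounting: a partition is counted exactly once,
// when its scan cursor reaches the end, and the last one flips the flag.
struct GlobalScanState {
    std::atomic<std::size_t> scan_done {0};
    std::atomic<bool> finished {false};
};

void OnScanCursorAtEnd(GlobalScanState &gstate, std::size_t partition_count) {
    if (++gstate.scan_done == partition_count) {
        gstate.finished = true;
    }
}
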
package/src/duckdb/src/function/table/read_csv.cpp
CHANGED
@@ -107,11 +107,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio…
     // Initialize Buffer Manager and Sniffer
     auto file_handle = BaseCSVReader::OpenCSV(context, options);
     result->buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
-    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache);
+    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache, explicitly_set_columns);
     auto sniffer_result = sniffer.SniffCSV();
-    return_types = sniffer_result.return_types;
     if (names.empty()) {
         names = sniffer_result.names;
+        return_types = sniffer_result.return_types;
     } else {
         if (explicitly_set_columns) {
             // The user has influenced the names, can't assume they are valid anymore
@@ -195,6 +195,7 @@ public:
     auto file_count = files_path_p.size();
     line_info.current_batches.resize(file_count);
     line_info.lines_read.resize(file_count);
+    line_info.lines_errored.resize(file_count);
     tuple_start.resize(file_count);
     tuple_end.resize(file_count);
     tuple_end_to_batch.resize(file_count);
@@ -509,6 +510,11 @@ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
     return false;
 }
 
+void LineInfo::Increment(idx_t file_idx, idx_t batch_idx) {
+    auto parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+    lines_errored[file_idx][batch_idx]++;
+}
+
 // Returns the 1-indexed line number
 idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify,
                         bool stop_at_first) {
@@ -520,12 +526,11 @@ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t…
 
     if (!stop_at_first) {
         // Figure out the amount of lines read in the current file
-        …
-        …
-        …
-            break;
+        for (idx_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
+            if (cur_batch_idx < batch_idx) {
+                line_count += lines_errored[file_idx][cur_batch_idx];
             }
-            line_count += lines_read[file_idx][…
+            line_count += lines_read[file_idx][cur_batch_idx];
         }
         return line_count + line_error + 1;
     }
@@ -880,8 +885,6 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
     table_function.named_parameters["header"] = LogicalType::BOOLEAN;
     table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
     table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunk_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunks"] = LogicalType::BIGINT;
     table_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
     table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
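With lines_errored tracked per batch, the 1-indexed error line is the sum of lines read in all batches up to and including the error's batch, plus the errored lines of the earlier batches, plus the in-batch offset, plus one. A standalone sketch of that computation (ErrorLineNumber is a hypothetical helper mirroring LineInfo::GetLine with stop_at_first == false):

#include <cstddef>
#include <unordered_map>

// Hypothetical helper: batches before the error's batch contribute both
// their read and errored lines; the error's own batch contributes only
// its read lines.
std::size_t ErrorLineNumber(const std::unordered_map<std::size_t, std::size_t> &lines_read,
                            const std::unordered_map<std::size_t, std::size_t> &lines_errored,
                            std::size_t batch_idx, std::size_t line_error) {
    std::size_t line_count = 0;
    for (std::size_t cur = 0; cur <= batch_idx; cur++) {
        if (cur < batch_idx) {
            auto errored = lines_errored.find(cur);
            line_count += errored == lines_errored.end() ? 0 : errored->second;
        }
        auto read = lines_read.find(cur);
        line_count += read == lines_read.end() ? 0 : read->second;
    }
    return line_count + line_error + 1; // +1 because lines are 1-indexed
}
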
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4653"
+#define DUCKDB_VERSION "0.8.2-dev4711"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "…
+#define DUCKDB_SOURCE_ID "474a0bd683"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp
CHANGED
@@ -159,6 +159,8 @@ public:
     bool Scan(TupleDataScanState &state, DataChunk &result);
     //! Scans a DataChunk from the TupleDataCollection
     bool Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
+    //! Whether the last scan has been completed on this TupleDataCollection
+    bool ScanComplete(const TupleDataScanState &state) const;
 
     //! Gathers a DataChunk from the TupleDataCollection, given the specific row locations (requires full pin)
     void Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, DataChunk &result,
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp
CHANGED
@@ -89,7 +89,7 @@ public:
 private:
     ClientContext &context;
     //! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
-    idx_t …
+    idx_t actual_buffer_size;
     //! We need to check for Byte Order Mark, to define the start position of this buffer
     //! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
     idx_t start_position = 0;
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp
CHANGED
@@ -20,10 +20,14 @@ public:
     //! Return the 1-indexed line number
     idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true,
                   bool stop_at_first = true);
+    //! In case an error happened we have to increment the lines read of that batch
+    void Increment(idx_t file_idx, idx_t batch_idx);
     //! Verify if the CSV File was read correctly from [0,batch_idx] batches.
     void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos);
     //! Lines read per batch, <file_index, <batch_index, count>>
     vector<unordered_map<idx_t, idx_t>> lines_read;
+    //! Lines errored per batch, <file_index, <batch_index, count>>
+    vector<unordered_map<idx_t, idx_t>> lines_errored;
     //! Set of batches that have been initialized but are not yet finished.
     vector<set<idx_t>> current_batches;
     //! Pointer to CSV Reader Mutex
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp
CHANGED
@@ -126,12 +126,10 @@ struct CSVReaderOptions {
     bool normalize_names = false;
     //! True, if column with that index must skip null check
     vector<bool> force_not_null;
+    //! Number of sample chunks used in auto-detection
+    idx_t sample_size_chunks = 20480 / STANDARD_VECTOR_SIZE;
     //! Consider all columns to be of type varchar
     bool all_varchar = false;
-    //! Size of sample chunk used for dialect and type detection
-    idx_t sample_chunk_size = STANDARD_VECTOR_SIZE;
-    //! Number of sample chunks used for type detection
-    idx_t sample_chunks = 10;
     //! Whether or not to automatically detect dialect and datatypes
     bool auto_detect = false;
     //! The file path of the CSV file to read
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp
CHANGED
@@ -28,7 +28,7 @@ struct SnifferResult {
 class CSVSniffer {
 public:
     explicit CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
-                        CSVStateMachineCache &state_machine_cache);
+                        CSVStateMachineCache &state_machine_cache, bool explicit_set_columns = false);
 
     //! Main method that sniffs the CSV file, returns the types, names and options as a result
     //! CSV Sniffing consists of five steps:
@@ -110,6 +110,8 @@ private:
     //! ------------------------------------------------------//
     void DetectHeader();
     vector<string> names;
+    //! If Column Names and Types have been explicitly set
+    const bool explicit_set_columns;
 
     //! ------------------------------------------------------//
     //! ------------------ Type Replacement ----------------- //
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
CHANGED
@@ -13,7 +13,7 @@
 #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
 
 namespace duckdb {
-static constexpr uint32_t NUM_STATES = …
+static constexpr uint32_t NUM_STATES = 9;
 static constexpr uint32_t NUM_TRANSITIONS = 256;
 typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];
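Pinning NUM_STATES at 9 fixes the shape of the 9 x 256 transition table, and parsing then advances one table lookup per input byte. A minimal sketch of driving a table of that shape (RunMachine and the constant names are illustrative, not DuckDB API):

#include <cstddef>
#include <cstdint>

// Illustrative table of the same shape as state_machine_t: 9 states x 256
// input bytes. Each input byte indexes straight into the current state's
// row, so scanning costs one table load per character.
static constexpr std::uint32_t kNumStates = 9;
static constexpr std::uint32_t kNumTransitions = 256;
typedef std::uint8_t StateMachine[kNumStates][kNumTransitions];

std::uint8_t RunMachine(const StateMachine &machine, const char *data, std::size_t len,
                        std::uint8_t state) {
    for (std::size_t i = 0; i < len; i++) {
        state = machine[state][static_cast<std::uint8_t>(data[i])];
    }
    return state;
}
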
package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp
CHANGED
@@ -51,13 +51,12 @@ public:
                                OperatorSourceInput &input) const;
 
     const TupleDataLayout &GetLayout() const;
-    idx_t …
+    idx_t NumberOfPartitions(GlobalSinkState &sink) const;
     static void SetMultiScan(GlobalSinkState &sink);
 
 private:
     void SetGroupingValues();
     void PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const;
-    idx_t CountInternal(GlobalSinkState &sink) const;
 
     TupleDataLayout layout;
 };
package/src/duckdb/src/include/duckdb/main/extension_entries.hpp
CHANGED
@@ -240,6 +240,7 @@ static constexpr ExtensionEntry EXTENSION_FILE_CONTAINS[] = {{".parquet?", "parq…
 
 static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
     // "azure",
+    "arrow",
     "aws",
     "autocomplete",
     "excel",
@@ -249,7 +250,9 @@ static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
     // "icu",
     "json",
     "parquet",
+    "postgres_scanner",
     "sqlsmith",
+    "sqlite_scanner",
     "tpcds",
     "tpch",
     "visualizer",
package/src/duckdb/src/main/query_result.cpp
CHANGED
@@ -1,8 +1,9 @@
 #include "duckdb/main/query_result.hpp"
+
+#include "duckdb/common/box_renderer.hpp"
 #include "duckdb/common/printer.hpp"
 #include "duckdb/common/vector.hpp"
 #include "duckdb/main/client_context.hpp"
-#include "duckdb/common/box_renderer.hpp"
 namespace duckdb {
 
 BaseQueryResult::BaseQueryResult(QueryResultType type, StatementType statement_type, StatementProperties properties_p,
@@ -100,9 +101,17 @@ bool QueryResult::Equals(QueryResult &other) { // LCOV_EXCL_START
     }
     // now compare the actual values
     // fetch chunks
+    unique_ptr<DataChunk> lchunk, rchunk;
+    idx_t lindex = 0, rindex = 0;
     while (true) {
-        …
-        …
+        if (!lchunk || lindex == lchunk->size()) {
+            lchunk = Fetch();
+            lindex = 0;
+        }
+        if (!rchunk || rindex == rchunk->size()) {
+            rchunk = other.Fetch();
+            rindex = 0;
+        }
         if (!lchunk && !rchunk) {
             return true;
         }
@@ -112,14 +121,11 @@ bool QueryResult::Equals(QueryResult &other) { // LCOV_EXCL_START
         if (lchunk->size() == 0 && rchunk->size() == 0) {
             return true;
         }
-        if (lchunk->size() != rchunk->size()) {
-            return false;
-        }
         D_ASSERT(lchunk->ColumnCount() == rchunk->ColumnCount());
-        for (…
-            for (idx_t …
-                auto lvalue = lchunk->GetValue(col, …
-                auto rvalue = rchunk->GetValue(col, …
+        for (; lindex < lchunk->size() && rindex < rchunk->size(); lindex++, rindex++) {
+            for (idx_t col = 0; col < rchunk->ColumnCount(); col++) {
+                auto lvalue = lchunk->GetValue(col, lindex);
+                auto rvalue = rchunk->GetValue(col, rindex);
                 if (lvalue.IsNull() && rvalue.IsNull()) {
                     continue;
                 }
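The rewritten Equals walks both results with independent chunk cursors, refetching a side only when its cursor falls off the end of its current chunk, so equal streams that happen to be chunked differently still compare equal. A simplified sketch over plain int chunks (StreamsEqual is illustrative; the fetchers stand in for QueryResult::Fetch and must return nullptr once exhausted):

#include <cstddef>
#include <memory>
#include <vector>

using Chunk = std::vector<int>;

template <class FetchL, class FetchR>
bool StreamsEqual(FetchL fetch_left, FetchR fetch_right) {
    std::unique_ptr<Chunk> lchunk, rchunk;
    std::size_t lindex = 0, rindex = 0;
    while (true) {
        // Refetch a side only when its cursor ran off its current chunk.
        if (!lchunk || lindex == lchunk->size()) {
            lchunk = fetch_left();
            lindex = 0;
        }
        if (!rchunk || rindex == rchunk->size()) {
            rchunk = fetch_right();
            rindex = 0;
        }
        if (!lchunk && !rchunk) {
            return true; // both streams exhausted together
        }
        if (!lchunk || !rchunk) {
            return false; // one stream ran out early
        }
        for (; lindex < lchunk->size() && rindex < rchunk->size(); lindex++, rindex++) {
            if ((*lchunk)[lindex] != (*rchunk)[rindex]) {
                return false;
            }
        }
    }
}
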
package/src/duckdb/src/storage/serialization/serialize_nodes.cpp
CHANGED
@@ -101,28 +101,27 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const {
     serializer.WriteProperty(111, "normalize_names", normalize_names);
     serializer.WriteProperty(112, "force_not_null", force_not_null);
     serializer.WriteProperty(113, "all_varchar", all_varchar);
-    serializer.WriteProperty(114, "…
-    serializer.WriteProperty(115, "…
-    serializer.WriteProperty(116, "…
-    serializer.WriteProperty(117, "…
-    serializer.WriteProperty(118, "…
-    serializer.WriteProperty(119, "…
-    serializer.WriteProperty(120, "…
-    serializer.WriteProperty(121, "…
-    serializer.WriteProperty(122, "…
-    serializer.WriteProperty(123, "…
-    serializer.WriteProperty(124, "…
-    serializer.WriteProperty(125, "…
-    serializer.WriteProperty(126, "…
-    serializer.WriteProperty(127, "dialect_options.state_machine_options.…
-    serializer.WriteProperty(128, "dialect_options.state_machine_options.…
-    serializer.WriteProperty(129, "dialect_options.…
-    serializer.WriteProperty(130, "dialect_options.…
-    serializer.WriteProperty(131, "dialect_options.…
-    serializer.WriteProperty(132, "dialect_options.…
-    serializer.WriteProperty(133, "dialect_options.…
-    serializer.WriteProperty(134, "dialect_options.…
-    serializer.WriteProperty(135, "dialect_options.has_format", dialect_options.has_format);
+    serializer.WriteProperty(114, "sample_size_chunks", sample_size_chunks);
+    serializer.WriteProperty(115, "auto_detect", auto_detect);
+    serializer.WriteProperty(116, "file_path", file_path);
+    serializer.WriteProperty(117, "decimal_separator", decimal_separator);
+    serializer.WriteProperty(118, "null_padding", null_padding);
+    serializer.WriteProperty(119, "buffer_size", buffer_size);
+    serializer.WriteProperty(120, "file_options", file_options);
+    serializer.WriteProperty(121, "force_quote", force_quote);
+    serializer.WriteProperty(122, "rejects_table_name", rejects_table_name);
+    serializer.WriteProperty(123, "rejects_limit", rejects_limit);
+    serializer.WriteProperty(124, "rejects_recovery_columns", rejects_recovery_columns);
+    serializer.WriteProperty(125, "rejects_recovery_column_ids", rejects_recovery_column_ids);
+    serializer.WriteProperty(126, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter);
+    serializer.WriteProperty(127, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote);
+    serializer.WriteProperty(128, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape);
+    serializer.WriteProperty(129, "dialect_options.header", dialect_options.header);
+    serializer.WriteProperty(130, "dialect_options.num_cols", dialect_options.num_cols);
+    serializer.WriteProperty(131, "dialect_options.new_line", dialect_options.new_line);
+    serializer.WriteProperty(132, "dialect_options.skip_rows", dialect_options.skip_rows);
+    serializer.WriteProperty(133, "dialect_options.date_format", dialect_options.date_format);
+    serializer.WriteProperty(134, "dialect_options.has_format", dialect_options.has_format);
 }
 
 CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) {
@@ -141,28 +140,27 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) {
     deserializer.ReadProperty(111, "normalize_names", result.normalize_names);
     deserializer.ReadProperty(112, "force_not_null", result.force_not_null);
     deserializer.ReadProperty(113, "all_varchar", result.all_varchar);
-    deserializer.ReadProperty(114, "…
-    deserializer.ReadProperty(115, "…
-    deserializer.ReadProperty(116, "…
-    deserializer.ReadProperty(117, "…
-    deserializer.ReadProperty(118, "…
-    deserializer.ReadProperty(119, "…
-    deserializer.ReadProperty(120, "…
-    deserializer.ReadProperty(121, "…
-    deserializer.ReadProperty(122, "…
-    deserializer.ReadProperty(123, "…
-    deserializer.ReadProperty(124, "…
-    deserializer.ReadProperty(125, "…
-    deserializer.ReadProperty(126, "…
-    deserializer.ReadProperty(127, "dialect_options.state_machine_options.…
-    deserializer.ReadProperty(128, "dialect_options.state_machine_options.…
-    deserializer.ReadProperty(129, "dialect_options.…
-    deserializer.ReadProperty(130, "dialect_options.…
-    deserializer.ReadProperty(131, "dialect_options.…
-    deserializer.ReadProperty(132, "dialect_options.…
-    deserializer.ReadProperty(133, "dialect_options.…
-    deserializer.ReadProperty(134, "dialect_options.…
-    deserializer.ReadProperty(135, "dialect_options.has_format", result.dialect_options.has_format);
+    deserializer.ReadProperty(114, "sample_size_chunks", result.sample_size_chunks);
+    deserializer.ReadProperty(115, "auto_detect", result.auto_detect);
+    deserializer.ReadProperty(116, "file_path", result.file_path);
+    deserializer.ReadProperty(117, "decimal_separator", result.decimal_separator);
+    deserializer.ReadProperty(118, "null_padding", result.null_padding);
+    deserializer.ReadProperty(119, "buffer_size", result.buffer_size);
+    deserializer.ReadProperty(120, "file_options", result.file_options);
+    deserializer.ReadProperty(121, "force_quote", result.force_quote);
+    deserializer.ReadProperty(122, "rejects_table_name", result.rejects_table_name);
+    deserializer.ReadProperty(123, "rejects_limit", result.rejects_limit);
+    deserializer.ReadProperty(124, "rejects_recovery_columns", result.rejects_recovery_columns);
+    deserializer.ReadProperty(125, "rejects_recovery_column_ids", result.rejects_recovery_column_ids);
+    deserializer.ReadProperty(126, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter);
+    deserializer.ReadProperty(127, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote);
+    deserializer.ReadProperty(128, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape);
+    deserializer.ReadProperty(129, "dialect_options.header", result.dialect_options.header);
+    deserializer.ReadProperty(130, "dialect_options.num_cols", result.dialect_options.num_cols);
+    deserializer.ReadProperty(131, "dialect_options.new_line", result.dialect_options.new_line);
+    deserializer.ReadProperty(132, "dialect_options.skip_rows", result.dialect_options.skip_rows);
+    deserializer.ReadProperty(133, "dialect_options.date_format", result.dialect_options.date_format);
+    deserializer.ReadProperty(134, "dialect_options.has_format", result.dialect_options.has_format);
     return result;
 }