duckdb 0.8.2-dev4653.0 → 0.8.2-dev4711.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/package.json +1 -1
  2. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +7 -0
  3. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
  4. package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
  5. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
  6. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
  7. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
  8. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
  9. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
  10. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
  11. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
  12. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
  13. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
  14. package/src/duckdb/src/function/table/read_csv.cpp +12 -9
  15. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  16. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +2 -0
  17. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
  18. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
  19. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
  20. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
  21. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
  22. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
  23. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
  24. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
  25. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
  26. package/src/duckdb/src/main/query_result.cpp +16 -10
  27. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
- "version": "0.8.2-dev4653.0",
+ "version": "0.8.2-dev4711.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/src/common/types/row/tuple_data_collection.cpp CHANGED
@@ -433,6 +433,13 @@ bool TupleDataCollection::Scan(TupleDataParallelScanState &gstate, TupleDataLoca
   return true;
   }

+ bool TupleDataCollection::ScanComplete(const TupleDataScanState &state) const {
+ if (Count() == 0) {
+ return true;
+ }
+ return state.segment_index == segments.size() - 1 && state.chunk_index == segments.back().ChunkCount();
+ }
+
   void TupleDataCollection::FinalizePinState(TupleDataPinState &pin_state, TupleDataSegment &segment) {
   segment.allocator->ReleaseOrStoreHandles(pin_state, segment);
   }
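
The new ScanComplete helper lets a caller tell whether a Scan call just returned the collection's final chunk; the radix hash table source further down uses it to count each fully scanned partition exactly once. As a minimal sketch (not part of the diff; ScanProgress and OnChunkScanned are hypothetical names), the counting pattern looks roughly like this:

#include <atomic>
#include <cstddef>

// Count a partition as done only when its final chunk has been scanned,
// and flip to finished once every partition has been fully scanned.
struct ScanProgress {
	std::atomic<std::size_t> partitions_scanned{0};
	std::atomic<bool> finished{false};

	void OnChunkScanned(bool scan_complete_for_partition, std::size_t total_partitions) {
		if (scan_complete_for_partition && ++partitions_scanned == total_partitions) {
			finished = true;
		}
	}
};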

package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp CHANGED
@@ -782,13 +782,13 @@ public:
   }

   auto &ht_state = op.sink_state->Cast<HashAggregateGlobalSinkState>();
- idx_t count = 0;
+ idx_t partitions = 0;
   for (size_t sidx = 0; sidx < op.groupings.size(); ++sidx) {
   auto &grouping = op.groupings[sidx];
   auto &grouping_gstate = ht_state.grouping_states[sidx];
- count += grouping.table_data.Count(*grouping_gstate.table_state);
+ partitions += grouping.table_data.NumberOfPartitions(*grouping_gstate.table_state);
   }
- return MaxValue<idx_t>(1, count / STANDARD_VECTOR_SIZE);
+ return MaxValue<idx_t>(1, partitions);
   }
   };

package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp CHANGED
@@ -263,7 +263,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
   return true;
   }

- if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
+ if (mode == ParserMode::SNIFFING_DATATYPES) {
   return true;
   }

@@ -480,6 +480,10 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad

   bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
   if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
+ Increment(buffer_idx);
+ auto bla = GetLineError(global_row_idx, buffer_idx, false);
+ row_idx += bla;
+ row_idx -= bla;
   row_failed = true;
   failed_cells.emplace_back(row_idx, col_idx, row_line);
   }

package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp CHANGED
@@ -8,10 +8,14 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
   : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
   AllocateBuffer(buffer_size_p);
   auto buffer = Ptr();
- file_size = file_handle.Read(buffer, buffer_size_p);
+ actual_buffer_size = file_handle.Read(buffer, buffer_size_p);
+ while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size_p - actual_buffer_size);
+ }
   global_csv_start = global_csv_current_position;
   // BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
- if (file_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
+ if (actual_buffer_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
   start_position += 3;
   }
   last_buffer = file_handle.FinishedReading();
@@ -22,13 +26,18 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
   : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
   can_seek(file_handle.CanSeek()) {
   AllocateBuffer(buffer_size);
- file_size = file_handle.Read(handle.Ptr(), buffer_size);
+ auto buffer = handle.Ptr();
+ actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size);
+ while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size - actual_buffer_size);
+ }
   last_buffer = file_handle.FinishedReading();
   }

   shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
   auto next_csv_buffer =
- make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + file_size, file_number_p);
+ make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p);
   if (next_csv_buffer->GetBufferSize() == 0) {
   // We are done reading
   return nullptr;
@@ -43,13 +52,13 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
   }

   idx_t CSVBuffer::GetBufferSize() {
- return file_size;
+ return actual_buffer_size;
   }

   void CSVBuffer::Reload(CSVFileHandle &file_handle) {
- AllocateBuffer(file_size);
+ AllocateBuffer(actual_buffer_size);
   file_handle.Seek(global_csv_start);
- file_handle.Read(handle.Ptr(), file_size);
+ file_handle.Read(handle.Ptr(), actual_buffer_size);
   }

   unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
@@ -59,8 +68,8 @@ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
   block = nullptr;
   Reload(file_handle);
   }
- return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), file_size, first_buffer, last_buffer, global_csv_start,
- start_position, file_number);
+ return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), actual_buffer_size, first_buffer, last_buffer,
+ global_csv_start, start_position, file_number);
   }

   void CSVBuffer::Unpin() {
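
The constructor changes above replace the single Read call with a loop that keeps reading until the block is full or the file is exhausted, so actual_buffer_size reflects the bytes actually present in the buffer. A minimal standalone sketch of that fill loop (illustration only; ReadFn and DoneFn are hypothetical stand-ins for CSVFileHandle::Read and FinishedReading):

#include <cstddef>
#include <functional>

using ReadFn = std::function<std::size_t(char *target, std::size_t requested)>;
using DoneFn = std::function<bool()>;

// Keep reading until the buffer is full or the source reports end-of-file.
std::size_t FillBuffer(char *buffer, std::size_t buffer_size, const ReadFn &read, const DoneFn &finished) {
	std::size_t actual_size = read(buffer, buffer_size);
	while (actual_size < buffer_size && !finished()) {
		actual_size += read(buffer + actual_size, buffer_size - actual_size);
	}
	return actual_size; // may be smaller than buffer_size for the last block of a file
}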

package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp CHANGED
@@ -168,38 +168,24 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
   if (loption == "auto_detect") {
   auto_detect = ParseBoolean(value, loption);
   } else if (loption == "sample_size") {
- int64_t sample_size = ParseInteger(value, loption);
- if (sample_size < 1 && sample_size != -1) {
+ int64_t sample_size_option = ParseInteger(value, loption);
+ if (sample_size_option < 1 && sample_size_option != -1) {
   throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
   }
- if (sample_size == -1) {
- sample_chunks = std::numeric_limits<uint64_t>::max();
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- } else if (sample_size <= STANDARD_VECTOR_SIZE) {
- sample_chunk_size = sample_size;
- sample_chunks = 1;
+ if (sample_size_option == -1) {
+ // If -1, we basically read the whole thing
+ sample_size_chunks = NumericLimits<idx_t>().Maximum();
   } else {
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
+ sample_size_chunks = sample_size_option / STANDARD_VECTOR_SIZE;
+ if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
+ sample_size_chunks++;
+ }
   }
+
   } else if (loption == "skip") {
   SetSkipRows(ParseInteger(value, loption));
   } else if (loption == "max_line_size" || loption == "maximum_line_size") {
   maximum_line_size = ParseInteger(value, loption);
- } else if (loption == "sample_chunk_size") {
- sample_chunk_size = ParseInteger(value, loption);
- if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
- throw BinderException(
- "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
- STANDARD_VECTOR_SIZE);
- } else if (sample_chunk_size < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
- }
- } else if (loption == "sample_chunks") {
- sample_chunks = ParseInteger(value, loption);
- if (sample_chunks < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
- }
   } else if (loption == "force_not_null") {
   force_not_null = ParseColumnList(value, expected_names, loption);
   } else if (loption == "date_format" || loption == "dateformat") {
@@ -322,7 +308,7 @@ string CSVReaderOptions::ToString() const {
   (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
   "\n header=" + std::to_string(dialect_options.header) +
   (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
- "\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
+ "\n sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) +
   "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
   }

@@ -489,8 +475,6 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
   if (skip_rows_set) {
   named_params["skip"] = Value::BIGINT(GetSkipRows());
   }
- named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
- named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
   named_params["null_padding"] = Value::BOOLEAN(null_padding);
   if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
   named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
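
With the separate sample_chunk_size and sample_chunks options gone, sample_size is now converted into a number of STANDARD_VECTOR_SIZE-row chunks by rounding up, and -1 means "sample the whole file". A hedged sketch of that conversion (illustration only; kVectorSize stands in for STANDARD_VECTOR_SIZE, which is 2048 in a default DuckDB build):

#include <cstdint>
#include <limits>

static constexpr std::uint64_t kVectorSize = 2048;

// Convert a row-based sample_size into a chunk count, rounding up; -1 samples everything.
std::uint64_t SampleSizeToChunks(std::int64_t sample_size) {
	if (sample_size == -1) {
		return std::numeric_limits<std::uint64_t>::max();
	}
	std::uint64_t chunks = static_cast<std::uint64_t>(sample_size) / kVectorSize;
	if (static_cast<std::uint64_t>(sample_size) % kVectorSize != 0) {
		chunks++; // e.g. sample_size = 3000 rows -> 2 chunks of 2048 rows
	}
	return chunks;
}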

package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp CHANGED
@@ -29,8 +29,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
   InitializeTransitionArray(transition_array[i], quoted_state);
   break;
   case unquoted_state:
- InitializeTransitionArray(transition_array[i], invalid_state);
- break;
+ case invalid_state:
   case escape_state:
   InitializeTransitionArray(transition_array[i], invalid_state);
   break;

package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp CHANGED
@@ -647,6 +647,10 @@ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool s
   }
   }

+ void ParallelCSVReader::Increment(idx_t buffer_idx) {
+ return buffer->line_info->Increment(file_idx, buffer_idx);
+ }
+
   bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
   DataChunk dummy_chunk;
   string error_message;

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp CHANGED
@@ -3,8 +3,9 @@
   namespace duckdb {

   CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
- CSVStateMachineCache &state_machine_cache_p)
- : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
+ CSVStateMachineCache &state_machine_cache_p, bool explicit_set_columns_p)
+ : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
+ explicit_set_columns(explicit_set_columns_p) {

   // Check if any type is BLOB
   for (auto &type : options.sql_type_list) {
@@ -24,6 +25,14 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
   SnifferResult CSVSniffer::SniffCSV() {
   // 1. Dialect Detection
   DetectDialect();
+ if (explicit_set_columns) {
+ if (!candidates.empty()) {
+ options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
+ options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
+ }
+ // We do not need to run type and header detection as these were defined by the user
+ return SnifferResult(detected_types, names);
+ }
   // 2. Type Detection
   DetectTypes();
   // 3. Header Detection

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED
@@ -15,7 +15,7 @@ struct SniffDialect {
   inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
   idx_t current_pos) {

- D_ASSERT(sniffed_column_counts.size() == machine.options.sample_chunk_size);
+ D_ASSERT(sniffed_column_counts.size() == STANDARD_VECTOR_SIZE);

   if (machine.state == CSVState::INVALID) {
   sniffed_column_counts.clear();
@@ -45,7 +45,7 @@ struct SniffDialect {
   machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
   (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
   machine.single_record_separator;
- if (machine.cur_rows >= machine.options.sample_chunk_size) {
+ if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
   // We sniffed enough rows
   return true;
   }
@@ -55,10 +55,10 @@ struct SniffDialect {
   if (machine.state == CSVState::INVALID) {
   return;
   }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state == CSVState::DELIMITER) {
   sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
   }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
   sniffed_column_counts[machine.cur_rows++] = machine.column_count;
   }
   NewLineIdentifier suggested_newline;
@@ -145,7 +145,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachi
   void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
   idx_t &best_consistent_rows, idx_t &prev_padding_count) {
   // The sniffed_column_counts variable keeps track of the number of columns found for each row
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);

   state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
   idx_t start_row = options.dialect_options.skip_rows;
@@ -244,7 +244,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machi
   }

   bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
   candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
   bool allow_padding = options.null_padding;

@@ -268,9 +268,9 @@ void CSVSniffer::RefineCandidates() {
   return;
   }
   for (auto &cur_candidate : candidates) {
- for (idx_t i = 1; i <= options.sample_chunks; i++) {
+ for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
   bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
- if (finished_file || i == options.sample_chunks) {
+ if (finished_file || i == options.sample_size_chunks) {
   // we finished the file or our chunk sample successfully: stop
   auto successful_candidate = std::move(cur_candidate);
   candidates.clear();

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp CHANGED
@@ -283,11 +283,7 @@ void CSVSniffer::DetectTypes() {
   candidate->Reset();

   // Parse chunk and read csv with info candidate
- idx_t sample_size = options.sample_chunk_size;
- if (options.sample_chunk_size == 1) {
- sample_size++;
- }
- vector<TupleSniffing> tuples(sample_size);
+ vector<TupleSniffing> tuples(STANDARD_VECTOR_SIZE);
   candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
   // Potentially Skip empty rows (I find this dirty, but it is what the original code does)
   idx_t true_start = 0;
@@ -311,8 +307,10 @@ void CSVSniffer::DetectTypes() {
   break;
   }
   }
+ if (values_start > 0) {
+ tuples.erase(tuples.begin(), tuples.begin() + values_start);
+ }

- tuples.erase(tuples.begin(), tuples.begin() + values_start);
   idx_t row_idx = 0;
   if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
   // This means we have more than one row, hence we can use the first row to detect if we have a header
@@ -327,6 +325,9 @@ void CSVSniffer::DetectTypes() {
   for (; row_idx < tuples.size(); row_idx++) {
   for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
   auto &col_type_candidates = info_sql_types_candidates[col];
+ // col_type_candidates can't be empty since anything in a CSV file should at least be a string
+ // and we validate utf-8 compatibility when creating the type
+ D_ASSERT(!col_type_candidates.empty());
   auto cur_top_candidate = col_type_candidates.back();
   auto dummy_val = tuples[row_idx].values[col];
   // try cast from string to sql_type

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp CHANGED
@@ -46,7 +46,8 @@ struct Parse {
   validity_mask.SetInvalid(machine.cur_rows);
   }
   }
- if (machine.state == CSVState::STANDARD) {
+ if (machine.state == CSVState::STANDARD ||
+ (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
   machine.value += current_char;
   }
   machine.cur_rows +=
@@ -57,7 +58,7 @@ struct Parse {
   machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
   machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);

- if (machine.cur_rows >= machine.options.sample_chunk_size) {
+ if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
   // We sniffed enough rows
   return true;
   }
@@ -65,11 +66,22 @@ struct Parse {
   }

   inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
   machine.VerifyUTF8();
   auto &v = parse_chunk.data[machine.column_count++];
   auto parse_data = FlatVector::GetData<string_t>(v);
- parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+ if (machine.value.empty()) {
+ auto &validity_mask = FlatVector::Validity(v);
+ validity_mask.SetInvalid(machine.cur_rows);
+ } else {
+ parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+ }
+ while (machine.column_count < parse_chunk.ColumnCount()) {
+ auto &v_pad = parse_chunk.data[machine.column_count++];
+ auto &validity_mask = FlatVector::Validity(v_pad);
+ validity_mask.SetInvalid(machine.cur_rows);
+ }
+ machine.cur_rows++;
   }
   parse_chunk.SetCardinality(machine.cur_rows);
   }
@@ -104,8 +116,8 @@ void CSVSniffer::RefineTypes() {
   return;
   }
   DataChunk parse_chunk;
- parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, options.sample_chunk_size);
- for (idx_t i = 1; i < best_candidate->options.sample_chunks; i++) {
+ parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, STANDARD_VECTOR_SIZE);
+ for (idx_t i = 1; i < best_candidate->options.sample_size_chunks; i++) {
   bool finished_file = best_candidate->csv_buffer_iterator.Finished();
   if (finished_file) {
   // we finished the file: stop
@@ -124,6 +136,7 @@ void CSVSniffer::RefineTypes() {
   best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
   for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
   vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
+ bool is_bool_type = col_type_candidates.back() == LogicalType::BOOLEAN;
   while (col_type_candidates.size() > 1) {
   const auto &sql_type = col_type_candidates.back();
   // narrow down the date formats
@@ -154,6 +167,14 @@ void CSVSniffer::RefineTypes() {
   if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
   break;
   } else {
+ if (col_type_candidates.back() == LogicalType::BOOLEAN && is_bool_type) {
+ // If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
+ // immediately pop to varchar.
+ while (col_type_candidates.back() != LogicalType::VARCHAR) {
+ col_type_candidates.pop_back();
+ }
+ break;
+ }
   col_type_candidates.pop_back();
   }
   }

package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp CHANGED
@@ -474,14 +474,9 @@ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstat
   //===--------------------------------------------------------------------===//
   // Source
   //===--------------------------------------------------------------------===//
- idx_t RadixPartitionedHashTable::Count(GlobalSinkState &sink_p) const {
- const auto count = CountInternal(sink_p);
- return count == 0 && grouping_set.empty() ? 1 : count;
- }
-
- idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
+ idx_t RadixPartitionedHashTable::NumberOfPartitions(GlobalSinkState &sink_p) const {
   auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
- return sink.count_before_combining;
+ return sink.partitions.size();
   }

   void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
@@ -570,8 +565,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
   D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);

   const auto n_partitions = sink.partitions.size();
- if (scan_done == n_partitions) {
- finished = true;
+ if (finished) {
   return false;
   }
   // We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
@@ -595,6 +589,11 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
   return true;
   }

+ // We didn't assign a Scan task
+ if (sink.finalize_idx >= n_partitions) {
+ return false; // No finalize tasks left
+ }
+
   // We can just increment the atomic here, much simpler than assigning the scan task
   lstate.task_idx = sink.finalize_idx++;
   if (lstate.task_idx < n_partitions) {
@@ -603,7 +602,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
   return true;
   }

- // We didn't manage to assign a finalize task
+ // We didn't manage to assign a Finalize task
   return false;
   }

@@ -693,15 +692,18 @@ void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSo

   if (!data_collection.Scan(scan_state, scan_chunk)) {
   scan_status = RadixHTScanStatus::DONE;
- if (++gstate.scan_done == sink.partitions.size()) {
- gstate.finished = true;
- }
   if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
   data_collection.Reset();
   }
   return;
   }

+ if (data_collection.ScanComplete(scan_state)) {
+ if (++gstate.scan_done == sink.partitions.size()) {
+ gstate.finished = true;
+ }
+ }
+
   RowOperationsState row_state(aggregate_allocator);
   const auto group_cols = layout.ColumnCount() - 1;
   RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
@@ -758,36 +760,38 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
   return SourceResultType::FINISHED;
   }

- // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
- if (CountInternal(sink_p) == 0 && grouping_set.empty()) {
- D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
- // For each column in the aggregates, set to initial state
- chunk.SetCardinality(1);
- for (auto null_group : null_groups) {
- chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
- ConstantVector::SetNull(chunk.data[null_group], true);
- }
- ArenaAllocator allocator(BufferAllocator::Get(context.client));
- for (idx_t i = 0; i < op.aggregates.size(); i++) {
- D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
- auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
- auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
- aggr.function.initialize(aggr_state.get());
-
- AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
- Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
- aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
- if (aggr.function.destructor) {
- aggr.function.destructor(state_vector, aggr_input_data, 1);
+ if (sink.count_before_combining == 0) {
+ if (grouping_set.empty()) {
+ // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+ D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
+ // For each column in the aggregates, set to initial state
+ chunk.SetCardinality(1);
+ for (auto null_group : null_groups) {
+ chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+ ConstantVector::SetNull(chunk.data[null_group], true);
+ }
+ ArenaAllocator allocator(BufferAllocator::Get(context.client));
+ for (idx_t i = 0; i < op.aggregates.size(); i++) {
+ D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
+ auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
+ auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
+ aggr.function.initialize(aggr_state.get());
+
+ AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
+ Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
+ aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
+ if (aggr.function.destructor) {
+ aggr.function.destructor(state_vector, aggr_input_data, 1);
+ }
+ }
+ // Place the grouping values (all the groups of the grouping_set condensed into a single value)
+ // Behind the null groups + aggregates
+ for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
+ chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
   }
- }
- // Place the grouping values (all the groups of the grouping_set condensed into a single value)
- // Behind the null groups + aggregates
- for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
- chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
   }
   gstate.finished = true;
- return SourceResultType::HAVE_MORE_OUTPUT;
+ return SourceResultType::FINISHED;
   }

   while (!gstate.finished && chunk.size() == 0) {
@@ -796,7 +800,11 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
   }
   }

- return SourceResultType::HAVE_MORE_OUTPUT;
+ if (chunk.size() != 0) {
+ return SourceResultType::HAVE_MORE_OUTPUT;
+ } else {
+ return SourceResultType::FINISHED;
+ }
   }

   } // namespace duckdb

package/src/duckdb/src/function/table/read_csv.cpp CHANGED
@@ -107,11 +107,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
   // Initialize Buffer Manager and Sniffer
   auto file_handle = BaseCSVReader::OpenCSV(context, options);
   result->buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
- CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache);
+ CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache, explicitly_set_columns);
   auto sniffer_result = sniffer.SniffCSV();
- return_types = sniffer_result.return_types;
   if (names.empty()) {
   names = sniffer_result.names;
+ return_types = sniffer_result.return_types;
   } else {
   if (explicitly_set_columns) {
   // The user has influenced the names, can't assume they are valid anymore
@@ -195,6 +195,7 @@ public:
   auto file_count = files_path_p.size();
   line_info.current_batches.resize(file_count);
   line_info.lines_read.resize(file_count);
+ line_info.lines_errored.resize(file_count);
   tuple_start.resize(file_count);
   tuple_end.resize(file_count);
   tuple_end_to_batch.resize(file_count);
@@ -509,6 +510,11 @@ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
   return false;
   }

+ void LineInfo::Increment(idx_t file_idx, idx_t batch_idx) {
+ auto parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+ lines_errored[file_idx][batch_idx]++;
+ }
+
   // Returns the 1-indexed line number
   idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify,
   bool stop_at_first) {
@@ -520,12 +526,11 @@ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t

   if (!stop_at_first) {
   // Figure out the amount of lines read in the current file
- auto &file_batches = current_batches[file_idx];
- for (auto &batch : file_batches) {
- if (batch > batch_idx) {
- break;
+ for (idx_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
+ if (cur_batch_idx < batch_idx) {
+ line_count += lines_errored[file_idx][cur_batch_idx];
   }
- line_count += lines_read[file_idx][batch];
+ line_count += lines_read[file_idx][cur_batch_idx];
   }
   return line_count + line_error + 1;
   }
@@ -880,8 +885,6 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
   table_function.named_parameters["header"] = LogicalType::BOOLEAN;
   table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
   table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
- table_function.named_parameters["sample_chunk_size"] = LogicalType::BIGINT;
- table_function.named_parameters["sample_chunks"] = LogicalType::BIGINT;
   table_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
   table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
   table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED
@@ -1,8 +1,8 @@
   #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.8.2-dev4653"
+ #define DUCKDB_VERSION "0.8.2-dev4711"
   #endif
   #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "bb287d4b22"
+ #define DUCKDB_SOURCE_ID "474a0bd683"
   #endif
   #include "duckdb/function/table/system_functions.hpp"
   #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp CHANGED
@@ -159,6 +159,8 @@ public:
   bool Scan(TupleDataScanState &state, DataChunk &result);
   //! Scans a DataChunk from the TupleDataCollection
   bool Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
+ //! Whether the last scan has been completed on this TupleDataCollection
+ bool ScanComplete(const TupleDataScanState &state) const;

   //! Gathers a DataChunk from the TupleDataCollection, given the specific row locations (requires full pin)
   void Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, DataChunk &result,

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp CHANGED
@@ -78,6 +78,10 @@ public:
   return line_error + 1;
   };

+ virtual void Increment(idx_t buffer_idx) {
+ return;
+ }
+
   //! Initialize projection indices to select all columns
   void InitializeProjection();

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp CHANGED
@@ -89,7 +89,7 @@ public:
   private:
   ClientContext &context;
   //! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
- idx_t file_size;
+ idx_t actual_buffer_size;
   //! We need to check for Byte Order Mark, to define the start position of this buffer
   //! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
   idx_t start_position = 0;

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp CHANGED
@@ -20,10 +20,14 @@ public:
   //! Return the 1-indexed line number
   idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true,
   bool stop_at_first = true);
+ //! In case an error happened we have to increment the lines read of that batch
+ void Increment(idx_t file_idx, idx_t batch_idx);
   //! Verify if the CSV File was read correctly from [0,batch_idx] batches.
   void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos);
   //! Lines read per batch, <file_index, <batch_index, count>>
   vector<unordered_map<idx_t, idx_t>> lines_read;
+ //! Lines read per batch, <file_index, <batch_index, count>>
+ vector<unordered_map<idx_t, idx_t>> lines_errored;
   //! Set of batches that have been initialized but are not yet finished.
   vector<set<idx_t>> current_batches;
   //! Pointer to CSV Reader Mutex

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp CHANGED
@@ -126,12 +126,10 @@ struct CSVReaderOptions {
   bool normalize_names = false;
   //! True, if column with that index must skip null check
   vector<bool> force_not_null;
+ //! Number of sample chunks used in auto-detection
+ idx_t sample_size_chunks = 20480 / STANDARD_VECTOR_SIZE;
   //! Consider all columns to be of type varchar
   bool all_varchar = false;
- //! Size of sample chunk used for dialect and type detection
- idx_t sample_chunk_size = STANDARD_VECTOR_SIZE;
- //! Number of sample chunks used for type detection
- idx_t sample_chunks = 10;
   //! Whether or not to automatically detect dialect and datatypes
   bool auto_detect = false;
   //! The file path of the CSV file to read

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp CHANGED
@@ -28,7 +28,7 @@ struct SnifferResult {
   class CSVSniffer {
   public:
   explicit CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
- CSVStateMachineCache &state_machine_cache);
+ CSVStateMachineCache &state_machine_cache, bool explicit_set_columns = false);

   //! Main method that sniffs the CSV file, returns the types, names and options as a result
   //! CSV Sniffing consists of five steps:
@@ -110,6 +110,8 @@ private:
   //! ------------------------------------------------------//
   void DetectHeader();
   vector<string> names;
+ //! If Column Names and Types have been explicitly set
+ const bool explicit_set_columns;

   //! ------------------------------------------------------//
   //! ------------------ Type Replacement ----------------- //

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp CHANGED
@@ -13,7 +13,7 @@
   #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"

   namespace duckdb {
- static constexpr uint32_t NUM_STATES = 8;
+ static constexpr uint32_t NUM_STATES = 9;
   static constexpr uint32_t NUM_TRANSITIONS = 256;
   typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp CHANGED
@@ -134,6 +134,7 @@ public:
   void ParseCSV(DataChunk &insert_chunk);

   idx_t GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first = true) override;
+ void Increment(idx_t buffer_idx) override;

   private:
   //! Initialize Parser

package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp CHANGED
@@ -51,13 +51,12 @@ public:
   OperatorSourceInput &input) const;

   const TupleDataLayout &GetLayout() const;
- idx_t Count(GlobalSinkState &sink) const;
+ idx_t NumberOfPartitions(GlobalSinkState &sink) const;
   static void SetMultiScan(GlobalSinkState &sink);

   private:
   void SetGroupingValues();
   void PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const;
- idx_t CountInternal(GlobalSinkState &sink) const;

   TupleDataLayout layout;
   };

package/src/duckdb/src/include/duckdb/main/extension_entries.hpp CHANGED
@@ -240,6 +240,7 @@ static constexpr ExtensionEntry EXTENSION_FILE_CONTAINS[] = {{".parquet?", "parq

   static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
   // "azure",
+ "arrow",
   "aws",
   "autocomplete",
   "excel",
@@ -249,7 +250,9 @@ static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
   // "icu",
   "json",
   "parquet",
+ "postgres_scanner",
   "sqlsmith",
+ "sqlite_scanner",
   "tpcds",
   "tpch",
   "visualizer",

package/src/duckdb/src/main/query_result.cpp CHANGED
@@ -1,8 +1,9 @@
   #include "duckdb/main/query_result.hpp"
+
+ #include "duckdb/common/box_renderer.hpp"
   #include "duckdb/common/printer.hpp"
   #include "duckdb/common/vector.hpp"
   #include "duckdb/main/client_context.hpp"
- #include "duckdb/common/box_renderer.hpp"

   namespace duckdb {

   BaseQueryResult::BaseQueryResult(QueryResultType type, StatementType statement_type, StatementProperties properties_p,
@@ -100,9 +101,17 @@ bool QueryResult::Equals(QueryResult &other) { // LCOV_EXCL_START
   }
   // now compare the actual values
   // fetch chunks
+ unique_ptr<DataChunk> lchunk, rchunk;
+ idx_t lindex = 0, rindex = 0;
   while (true) {
- auto lchunk = Fetch();
- auto rchunk = other.Fetch();
+ if (!lchunk || lindex == lchunk->size()) {
+ lchunk = Fetch();
+ lindex = 0;
+ }
+ if (!rchunk || rindex == rchunk->size()) {
+ rchunk = other.Fetch();
+ rindex = 0;
+ }
   if (!lchunk && !rchunk) {
   return true;
   }
@@ -112,14 +121,11 @@ bool QueryResult::Equals(QueryResult &other) { // LCOV_EXCL_START
   if (lchunk->size() == 0 && rchunk->size() == 0) {
   return true;
   }
- if (lchunk->size() != rchunk->size()) {
- return false;
- }
   D_ASSERT(lchunk->ColumnCount() == rchunk->ColumnCount());
- for (idx_t col = 0; col < rchunk->ColumnCount(); col++) {
- for (idx_t row = 0; row < rchunk->size(); row++) {
- auto lvalue = lchunk->GetValue(col, row);
- auto rvalue = rchunk->GetValue(col, row);
+ for (; lindex < lchunk->size() && rindex < rchunk->size(); lindex++, rindex++) {
+ for (idx_t col = 0; col < rchunk->ColumnCount(); col++) {
+ auto lvalue = lchunk->GetValue(col, lindex);
+ auto rvalue = rchunk->GetValue(col, rindex);
   if (lvalue.IsNull() && rvalue.IsNull()) {
   continue;
   }
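
QueryResult::Equals now keeps an independent cursor into each result and refetches a chunk only when the current one is exhausted, so two results holding the same rows split across different chunk boundaries compare equal. A minimal sketch of the two-cursor idea over plain vectors (illustration only, not the DuckDB API):

#include <cstddef>
#include <vector>

// Compare two row streams that are chunked differently, advancing an independent
// position in each stream, in the spirit of the updated QueryResult::Equals.
bool EqualAcrossChunkBoundaries(const std::vector<std::vector<int>> &left,
                                const std::vector<std::vector<int>> &right) {
	std::size_t lchunk = 0, lindex = 0, rchunk = 0, rindex = 0;
	while (true) {
		// "Refetch" whenever a cursor has exhausted its current chunk
		while (lchunk < left.size() && lindex == left[lchunk].size()) {
			lchunk++;
			lindex = 0;
		}
		while (rchunk < right.size() && rindex == right[rchunk].size()) {
			rchunk++;
			rindex = 0;
		}
		const bool l_done = lchunk == left.size();
		const bool r_done = rchunk == right.size();
		if (l_done || r_done) {
			return l_done && r_done; // equal only if both streams end together
		}
		if (left[lchunk][lindex++] != right[rchunk][rindex++]) {
			return false;
		}
	}
}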

package/src/duckdb/src/storage/serialization/serialize_nodes.cpp CHANGED
@@ -101,28 +101,27 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const {
   serializer.WriteProperty(111, "normalize_names", normalize_names);
   serializer.WriteProperty(112, "force_not_null", force_not_null);
   serializer.WriteProperty(113, "all_varchar", all_varchar);
- serializer.WriteProperty(114, "sample_chunk_size", sample_chunk_size);
- serializer.WriteProperty(115, "sample_chunks", sample_chunks);
- serializer.WriteProperty(116, "auto_detect", auto_detect);
- serializer.WriteProperty(117, "file_path", file_path);
- serializer.WriteProperty(118, "decimal_separator", decimal_separator);
- serializer.WriteProperty(119, "null_padding", null_padding);
- serializer.WriteProperty(120, "buffer_size", buffer_size);
- serializer.WriteProperty(121, "file_options", file_options);
- serializer.WriteProperty(122, "force_quote", force_quote);
- serializer.WriteProperty(123, "rejects_table_name", rejects_table_name);
- serializer.WriteProperty(124, "rejects_limit", rejects_limit);
- serializer.WriteProperty(125, "rejects_recovery_columns", rejects_recovery_columns);
- serializer.WriteProperty(126, "rejects_recovery_column_ids", rejects_recovery_column_ids);
- serializer.WriteProperty(127, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter);
- serializer.WriteProperty(128, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote);
- serializer.WriteProperty(129, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape);
- serializer.WriteProperty(130, "dialect_options.header", dialect_options.header);
- serializer.WriteProperty(131, "dialect_options.num_cols", dialect_options.num_cols);
- serializer.WriteProperty(132, "dialect_options.new_line", dialect_options.new_line);
- serializer.WriteProperty(133, "dialect_options.skip_rows", dialect_options.skip_rows);
- serializer.WriteProperty(134, "dialect_options.date_format", dialect_options.date_format);
- serializer.WriteProperty(135, "dialect_options.has_format", dialect_options.has_format);
+ serializer.WriteProperty(114, "sample_size_chunks", sample_size_chunks);
+ serializer.WriteProperty(115, "auto_detect", auto_detect);
+ serializer.WriteProperty(116, "file_path", file_path);
+ serializer.WriteProperty(117, "decimal_separator", decimal_separator);
+ serializer.WriteProperty(118, "null_padding", null_padding);
+ serializer.WriteProperty(119, "buffer_size", buffer_size);
+ serializer.WriteProperty(120, "file_options", file_options);
+ serializer.WriteProperty(121, "force_quote", force_quote);
+ serializer.WriteProperty(122, "rejects_table_name", rejects_table_name);
+ serializer.WriteProperty(123, "rejects_limit", rejects_limit);
+ serializer.WriteProperty(124, "rejects_recovery_columns", rejects_recovery_columns);
+ serializer.WriteProperty(125, "rejects_recovery_column_ids", rejects_recovery_column_ids);
+ serializer.WriteProperty(126, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter);
+ serializer.WriteProperty(127, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote);
+ serializer.WriteProperty(128, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape);
+ serializer.WriteProperty(129, "dialect_options.header", dialect_options.header);
+ serializer.WriteProperty(130, "dialect_options.num_cols", dialect_options.num_cols);
+ serializer.WriteProperty(131, "dialect_options.new_line", dialect_options.new_line);
+ serializer.WriteProperty(132, "dialect_options.skip_rows", dialect_options.skip_rows);
+ serializer.WriteProperty(133, "dialect_options.date_format", dialect_options.date_format);
+ serializer.WriteProperty(134, "dialect_options.has_format", dialect_options.has_format);
   }

   CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) {
@@ -141,28 +140,27 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) {
   deserializer.ReadProperty(111, "normalize_names", result.normalize_names);
   deserializer.ReadProperty(112, "force_not_null", result.force_not_null);
   deserializer.ReadProperty(113, "all_varchar", result.all_varchar);
- deserializer.ReadProperty(114, "sample_chunk_size", result.sample_chunk_size);
- deserializer.ReadProperty(115, "sample_chunks", result.sample_chunks);
- deserializer.ReadProperty(116, "auto_detect", result.auto_detect);
- deserializer.ReadProperty(117, "file_path", result.file_path);
- deserializer.ReadProperty(118, "decimal_separator", result.decimal_separator);
- deserializer.ReadProperty(119, "null_padding", result.null_padding);
- deserializer.ReadProperty(120, "buffer_size", result.buffer_size);
- deserializer.ReadProperty(121, "file_options", result.file_options);
- deserializer.ReadProperty(122, "force_quote", result.force_quote);
- deserializer.ReadProperty(123, "rejects_table_name", result.rejects_table_name);
- deserializer.ReadProperty(124, "rejects_limit", result.rejects_limit);
- deserializer.ReadProperty(125, "rejects_recovery_columns", result.rejects_recovery_columns);
- deserializer.ReadProperty(126, "rejects_recovery_column_ids", result.rejects_recovery_column_ids);
- deserializer.ReadProperty(127, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter);
- deserializer.ReadProperty(128, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote);
- deserializer.ReadProperty(129, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape);
- deserializer.ReadProperty(130, "dialect_options.header", result.dialect_options.header);
- deserializer.ReadProperty(131, "dialect_options.num_cols", result.dialect_options.num_cols);
- deserializer.ReadProperty(132, "dialect_options.new_line", result.dialect_options.new_line);
- deserializer.ReadProperty(133, "dialect_options.skip_rows", result.dialect_options.skip_rows);
- deserializer.ReadProperty(134, "dialect_options.date_format", result.dialect_options.date_format);
- deserializer.ReadProperty(135, "dialect_options.has_format", result.dialect_options.has_format);
+ deserializer.ReadProperty(114, "sample_size_chunks", result.sample_size_chunks);
+ deserializer.ReadProperty(115, "auto_detect", result.auto_detect);
+ deserializer.ReadProperty(116, "file_path", result.file_path);
+ deserializer.ReadProperty(117, "decimal_separator", result.decimal_separator);
+ deserializer.ReadProperty(118, "null_padding", result.null_padding);
+ deserializer.ReadProperty(119, "buffer_size", result.buffer_size);
+ deserializer.ReadProperty(120, "file_options", result.file_options);
+ deserializer.ReadProperty(121, "force_quote", result.force_quote);
+ deserializer.ReadProperty(122, "rejects_table_name", result.rejects_table_name);
+ deserializer.ReadProperty(123, "rejects_limit", result.rejects_limit);
+ deserializer.ReadProperty(124, "rejects_recovery_columns", result.rejects_recovery_columns);
+ deserializer.ReadProperty(125, "rejects_recovery_column_ids", result.rejects_recovery_column_ids);
+ deserializer.ReadProperty(126, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter);
+ deserializer.ReadProperty(127, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote);
+ deserializer.ReadProperty(128, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape);
+ deserializer.ReadProperty(129, "dialect_options.header", result.dialect_options.header);
+ deserializer.ReadProperty(130, "dialect_options.num_cols", result.dialect_options.num_cols);
+ deserializer.ReadProperty(131, "dialect_options.new_line", result.dialect_options.new_line);
+ deserializer.ReadProperty(132, "dialect_options.skip_rows", result.dialect_options.skip_rows);
+ deserializer.ReadProperty(133, "dialect_options.date_format", result.dialect_options.date_format);
+ deserializer.ReadProperty(134, "dialect_options.has_format", result.dialect_options.has_format);
   return result;
   }