duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/binding.gyp +0 -1
  2. package/binding.gyp.in +0 -1
  3. package/package.json +1 -1
  4. package/src/connection.cpp +10 -23
  5. package/src/data_chunk.cpp +1 -3
  6. package/src/database.cpp +4 -9
  7. package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
  8. package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
  9. package/src/duckdb/extension/json/json_functions.cpp +4 -6
  10. package/src/duckdb/src/common/enum_util.cpp +10 -5
  11. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  12. package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
  13. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
  14. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
  15. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
  16. package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
  17. package/src/duckdb/src/core_functions/function_list.cpp +1 -1
  18. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
  19. package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
  20. package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
  21. package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
  22. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
  23. package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
  24. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
  25. package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
  26. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
  27. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
  28. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
  30. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
  32. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
  33. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
  34. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
  35. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
  36. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
  37. package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
  38. package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
  39. package/src/duckdb/src/function/function_binder.cpp +10 -9
  40. package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
  41. package/src/duckdb/src/function/table/read_csv.cpp +12 -9
  42. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  43. package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
  44. package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
  45. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
  46. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
  47. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
  48. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
  49. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
  50. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
  51. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
  52. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
  54. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
  55. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
  57. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
  58. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
  59. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
  60. package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
  61. package/src/duckdb/src/main/config.cpp +1 -1
  62. package/src/duckdb/src/main/query_result.cpp +16 -10
  63. package/src/duckdb/src/main/relation.cpp +10 -0
  64. package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
  65. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
  66. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
  67. package/src/duckdb/src/storage/data_table.cpp +10 -0
  68. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
  69. package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
  70. package/src/statement.cpp +2 -4
  71. package/test/database_fail.test.ts +6 -0
  72. package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
@@ -45,6 +45,7 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
  // Append hash column to the end and initialise the row layout
  group_types_p.emplace_back(LogicalType::HASH);
  layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
+
  hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];

  // Partitioned data and pointer table
@@ -52,7 +53,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
  Resize(initial_capacity);

  // Predicates
- predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
+ predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_NOT_DISTINCT_FROM);
+ row_matcher.Initialize(true, layout, predicates);
  }

  void GroupedAggregateHashTable::InitializePartitionedData() {
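
Note on the predicate change above: COMPARE_NOT_DISTINCT_FROM carries SQL's IS NOT DISTINCT FROM semantics, under which two NULLs compare as equal, which is what grouping needs so that all NULL keys land in a single group. A minimal standalone sketch of the distinction, using std::optional to stand in for a nullable value (not DuckDB's actual comparator):

    #include <cassert>
    #include <optional>

    // SQL '=' semantics: any NULL operand means no match.
    static bool CompareEqual(const std::optional<int> &a, const std::optional<int> &b) {
        if (!a.has_value() || !b.has_value()) {
            return false;
        }
        return *a == *b;
    }

    // SQL 'IS NOT DISTINCT FROM' semantics: two NULLs match each other.
    static bool CompareNotDistinctFrom(const std::optional<int> &a, const std::optional<int> &b) {
        if (a.has_value() != b.has_value()) {
            return false;
        }
        if (!a.has_value()) {
            return true; // both NULL
        }
        return *a == *b;
    }

    int main() {
        std::optional<int> null_value; // an empty optional models SQL NULL
        assert(!CompareEqual(null_value, null_value));          // '=' never matches NULLs
        assert(CompareNotDistinctFrom(null_value, null_value)); // NULL keys form one group
        return 0;
    }
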
@@ -414,9 +416,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
  }

  // Perform group comparisons
- RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses_v, predicates,
- state.group_compare_vector, need_compare_count, &state.no_match_vector,
- no_match_count);
+ row_matcher.Match(state.group_chunk, chunk_state.vector_data, state.group_compare_vector,
+ need_compare_count, layout, addresses_v, &state.no_match_vector, no_match_count);
  }

  // Linear probing: each of the entries that do not match move to the next entry in the HT
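
Note: the linear-probing comment above is the collision strategy of the aggregate hash table: entries whose keys do not match the probed slot advance to the next slot, wrapping around a power-of-two table. A toy open-addressing insert showing just that step (not DuckDB's actual table layout):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        const uint64_t capacity = 8; // power of two, so (slot & (capacity - 1)) wraps cheaply
        std::vector<int64_t> table(capacity, -1); // -1 marks an empty slot

        auto insert = [&](int64_t key) {
            uint64_t slot = static_cast<uint64_t>(key) & (capacity - 1);
            while (table[slot] != -1 && table[slot] != key) {
                slot = (slot + 1) & (capacity - 1); // linear probing: move to the next entry
            }
            table[slot] = key;
            return slot;
        };

        insert(1);
        std::cout << "slot of 9: " << insert(9) << '\n'; // 9 & 7 == 1 collides, so 9 lands in slot 2
        return 0;
    }
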
@@ -173,6 +173,19 @@ bool FixedSizeAllocator::InitializeVacuum() {
  return false;
  }

+ // remove all empty buffers
+ auto buffer_it = buffers.begin();
+ while (buffer_it != buffers.end()) {
+ if (!buffer_it->second.segment_count) {
+ buffers_with_free_space.erase(buffer_it->first);
+ buffer_it->second.Destroy();
+ buffer_it = buffers.erase(buffer_it);
+ } else {
+ buffer_it++;
+ }
+ }
+
+ // determine if a vacuum is necessary
  multimap<idx_t, idx_t> temporary_vacuum_buffers;
  D_ASSERT(vacuum_buffers.empty());
  idx_t available_segments_in_memory = 0;
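
Note: the empty-buffer sweep added above uses the standard erase-while-iterating idiom for associative containers, where erase() invalidates only the erased iterator and returns the next valid one. The same idiom on a plain std::map (the buffer bookkeeping here is a stand-in for DuckDB's types):

    #include <iostream>
    #include <map>

    int main() {
        // Keys are buffer ids, values are segment counts; drop every empty entry.
        std::map<int, int> buffers = {{0, 3}, {1, 0}, {2, 5}, {3, 0}};

        auto it = buffers.begin();
        while (it != buffers.end()) {
            if (it->second == 0) {
                it = buffers.erase(it); // erase() hands back the next valid iterator
            } else {
                ++it;
            }
        }

        for (const auto &entry : buffers) {
            std::cout << entry.first << " -> " << entry.second << '\n'; // 0 -> 3, 2 -> 5
        }
        return 0;
    }
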
@@ -19,15 +19,15 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
  : buffer_manager(buffer_manager_p), conditions(conditions_p), build_types(std::move(btypes)), entry_size(0),
  tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
  external(false), radix_bits(4), partition_start(0), partition_end(0) {
+
  for (auto &condition : conditions) {
  D_ASSERT(condition.left->return_type == condition.right->return_type);
  auto type = condition.left->return_type;
  if (condition.comparison == ExpressionType::COMPARE_EQUAL ||
- condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM ||
- condition.comparison == ExpressionType::COMPARE_DISTINCT_FROM) {
- // all equality conditions should be at the front
- // all other conditions at the back
- // this assert checks that
+ condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) {
+
+ // ensure that all equality conditions are at the front,
+ // and that all other conditions are at the back
  D_ASSERT(equality_types.size() == condition_types.size());
  equality_types.push_back(type);
  }
@@ -51,6 +51,8 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
  }
  layout_types.emplace_back(LogicalType::HASH);
  layout.Initialize(layout_types, false);
+ row_matcher.Initialize(false, layout, predicates);
+ row_matcher_no_match_sel.Initialize(true, layout, predicates);

  const auto &offsets = layout.GetOffsets();
  tuple_size = offsets[condition_types.size() + build_types.size()];
@@ -142,30 +144,6 @@ static idx_t FilterNullValues(UnifiedVectorFormat &vdata, const SelectionVector
  return result_count;
  }

- idx_t JoinHashTable::PrepareKeys(DataChunk &keys, unsafe_unique_array<UnifiedVectorFormat> &key_data,
- const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
- key_data = keys.ToUnifiedFormat();
-
- // figure out which keys are NULL, and create a selection vector out of them
- current_sel = FlatVector::IncrementalSelectionVector();
- idx_t added_count = keys.size();
- if (build_side && IsRightOuterJoin(join_type)) {
- // in case of a right or full outer join, we cannot remove NULL keys from the build side
- return added_count;
- }
- for (idx_t i = 0; i < keys.ColumnCount(); i++) {
- if (!null_values_are_equal[i]) {
- if (key_data[i].validity.AllValid()) {
- continue;
- }
- added_count = FilterNullValues(key_data[i], *current_sel, added_count, sel);
- // null values are NOT equal for this column, filter them out
- current_sel = &sel;
- }
- }
- return added_count;
- }
-
  void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
  D_ASSERT(!finalized);
  D_ASSERT(keys.size() == payload.size());
@@ -194,23 +172,6 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
  info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
  }

- // prepare the keys for processing
- unsafe_unique_array<UnifiedVectorFormat> key_data;
- const SelectionVector *current_sel;
- SelectionVector sel(STANDARD_VECTOR_SIZE);
- idx_t added_count = PrepareKeys(keys, key_data, current_sel, sel, true);
- if (added_count < keys.size()) {
- has_null = true;
- }
- if (added_count == 0) {
- return;
- }
-
- // hash the keys and obtain an entry in the list
- // note that we only hash the keys used in the equality comparison
- Vector hash_values(LogicalType::HASH);
- Hash(keys, *current_sel, added_count, hash_values);
-
  // build a chunk to append to the data collection [keys, payload, (optional "found" boolean), hash]
  DataChunk source_chunk;
  source_chunk.InitializeEmpty(layout.GetTypes());
@@ -228,13 +189,58 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
  source_chunk.data[col_offset].Reference(vfound);
  col_offset++;
  }
+ Vector hash_values(LogicalType::HASH);
  source_chunk.data[col_offset].Reference(hash_values);
  source_chunk.SetCardinality(keys);

+ // ToUnifiedFormat the source chunk
+ TupleDataCollection::ToUnifiedFormat(append_state.chunk_state, source_chunk);
+
+ // prepare the keys for processing
+ const SelectionVector *current_sel;
+ SelectionVector sel(STANDARD_VECTOR_SIZE);
+ idx_t added_count = PrepareKeys(keys, append_state.chunk_state.vector_data, current_sel, sel, true);
  if (added_count < keys.size()) {
- source_chunk.Slice(*current_sel, added_count);
+ has_null = true;
+ }
+ if (added_count == 0) {
+ return;
  }
- sink_collection->Append(append_state, source_chunk);
+
+ // hash the keys and obtain an entry in the list
+ // note that we only hash the keys used in the equality comparison
+ Hash(keys, *current_sel, added_count, hash_values);
+
+ // Re-reference and ToUnifiedFormat the hash column after computing it
+ source_chunk.data[col_offset].Reference(hash_values);
+ hash_values.ToUnifiedFormat(source_chunk.size(), append_state.chunk_state.vector_data.back().unified);
+
+ // We already called TupleDataCollection::ToUnifiedFormat, so we can AppendUnified here
+ sink_collection->AppendUnified(append_state, source_chunk, *current_sel, added_count);
+ }
+
+ idx_t JoinHashTable::PrepareKeys(DataChunk &keys, vector<TupleDataVectorFormat> &vector_data,
+ const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
+ // figure out which keys are NULL, and create a selection vector out of them
+ current_sel = FlatVector::IncrementalSelectionVector();
+ idx_t added_count = keys.size();
+ if (build_side && IsRightOuterJoin(join_type)) {
+ // in case of a right or full outer join, we cannot remove NULL keys from the build side
+ return added_count;
+ }
+
+ for (idx_t col_idx = 0; col_idx < keys.ColumnCount(); col_idx++) {
+ if (!null_values_are_equal[col_idx]) {
+ auto &col_key_data = vector_data[col_idx].unified;
+ if (col_key_data.validity.AllValid()) {
+ continue;
+ }
+ added_count = FilterNullValues(col_key_data, *current_sel, added_count, sel);
+ // null values are NOT equal for this column, filter them out
+ current_sel = &sel;
+ }
+ }
+ return added_count;
  }

  template <bool PARALLEL>
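
Note: the rewritten Build above converts the source chunk to unified format once, filters NULL keys into a selection vector, and appends through that selection (AppendUnified) instead of slicing the chunk first. A toy version of reading through a selection vector rather than materializing a filtered copy (plain std::vector, not DuckDB's Vector):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> keys = {7, -1, 3, -1, 9}; // -1 stands in for a NULL key
        std::vector<size_t> sel;                   // selection vector over 'keys'

        for (size_t i = 0; i < keys.size(); i++) {
            if (keys[i] != -1) {
                sel.push_back(i); // keep only the non-NULL rows
            }
        }

        // Downstream code reads through the selection instead of a sliced copy.
        for (size_t out = 0; out < sel.size(); out++) {
            std::cout << keys[sel[out]] << '\n'; // prints 7, 3, 9
        }
        return 0;
    }
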
@@ -322,12 +328,13 @@ void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool para
  } while (iterator.Next());
  }

- unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, const SelectionVector *&current_sel) {
+ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, TupleDataChunkState &key_state,
+ const SelectionVector *&current_sel) {
  D_ASSERT(Count() > 0); // should be handled before
  D_ASSERT(finalized);

  // set up the scan structure
- auto ss = make_uniq<ScanStructure>(*this);
+ auto ss = make_uniq<ScanStructure>(*this, key_state);

  if (join_type != JoinType::INNER) {
  ss->found_match = make_unsafe_uniq_array<bool>(STANDARD_VECTOR_SIZE);
@@ -335,13 +342,15 @@ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys
  }

  // first prepare the keys for probing
- ss->count = PrepareKeys(keys, ss->key_data, current_sel, ss->sel_vector, false);
+ TupleDataCollection::ToUnifiedFormat(key_state, keys);
+ ss->count = PrepareKeys(keys, key_state.vector_data, current_sel, ss->sel_vector, false);
  return ss;
  }

- unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precomputed_hashes) {
+ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, TupleDataChunkState &key_state,
+ Vector *precomputed_hashes) {
  const SelectionVector *current_sel;
- auto ss = InitializeScanStructure(keys, current_sel);
+ auto ss = InitializeScanStructure(keys, key_state, current_sel);
  if (ss->count == 0) {
  return ss;
  }
@@ -363,8 +372,9 @@ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precompu
  return ss;
  }

- ScanStructure::ScanStructure(JoinHashTable &ht)
- : pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht), finished(false) {
+ ScanStructure::ScanStructure(JoinHashTable &ht_p, TupleDataChunkState &key_state_p)
+ : key_state(key_state_p), pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht_p),
+ finished(false) {
  }

  void ScanStructure::Next(DataChunk &keys, DataChunk &left, DataChunk &result) {
@@ -404,8 +414,9 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
  }
  idx_t no_match_count = 0;

- return RowOperations::Match(keys, key_data.get(), ht.layout, pointers, ht.predicates, match_sel, this->count,
- no_match_sel, no_match_count);
+ auto &matcher = no_match_sel ? ht.row_matcher_no_match_sel : ht.row_matcher;
+ return matcher.Match(keys, key_state.vector_data, match_sel, this->count, ht.layout, pointers, no_match_sel,
+ no_match_count);
  }

  idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
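
Note: ResolvePredicates above now just selects between two matchers that were initialized once in the JoinHashTable constructor, one of which additionally records a no-match selection vector. A sketch of that idea, branching once per call to dispatch to a precompiled specialization (the matcher body is invented for illustration):

    #include <iostream>

    // Two specializations are compiled up front; a flag picks one per call,
    // keeping the "track no-matches?" branch out of the per-row loop.
    template <bool TRACK_NO_MATCH>
    static int CountMatches(const int *rows, int count, int key, int *no_match, int &no_match_count) {
        int matches = 0;
        for (int i = 0; i < count; i++) {
            if (rows[i] == key) {
                matches++;
            } else if (TRACK_NO_MATCH) {
                no_match[no_match_count++] = i; // record the non-matching row
            }
        }
        return matches;
    }

    int main() {
        int rows[] = {1, 2, 1, 3};
        int no_match[4];
        int no_match_count = 0;

        bool want_no_match = true; // e.g. outer joins need the non-matching rows too
        int matches = want_no_match ? CountMatches<true>(rows, 4, 1, no_match, no_match_count)
                                    : CountMatches<false>(rows, 4, 1, no_match, no_match_count);
        std::cout << matches << " matches, " << no_match_count << " non-matches\n";
        return 0;
    }
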
@@ -990,7 +1001,8 @@ static void CreateSpillChunk(DataChunk &spill_chunk, DataChunk &keys, DataChunk
  spill_chunk.data[spill_col_idx].Reference(hashes);
  }

- unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChunk &payload, ProbeSpill &probe_spill,
+ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, TupleDataChunkState &key_state,
+ DataChunk &payload, ProbeSpill &probe_spill,
  ProbeSpillLocalAppendState &spill_state,
  DataChunk &spill_chunk) {
  // hash all the keys
@@ -1019,7 +1031,7 @@ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChun
  payload.Slice(true_sel, true_count);

  const SelectionVector *current_sel;
- auto ss = InitializeScanStructure(keys, current_sel);
+ auto ss = InitializeScanStructure(keys, key_state, current_sel);
  if (ss->count == 0) {
  return ss;
  }
@@ -782,13 +782,13 @@ public:
  }

  auto &ht_state = op.sink_state->Cast<HashAggregateGlobalSinkState>();
- idx_t count = 0;
+ idx_t partitions = 0;
  for (size_t sidx = 0; sidx < op.groupings.size(); ++sidx) {
  auto &grouping = op.groupings[sidx];
  auto &grouping_gstate = ht_state.grouping_states[sidx];
- count += grouping.table_data.Count(*grouping_gstate.table_state);
+ partitions += grouping.table_data.NumberOfPartitions(*grouping_gstate.table_state);
  }
- return MaxValue<idx_t>(1, count / STANDARD_VECTOR_SIZE);
+ return MaxValue<idx_t>(1, partitions);
  }
  };

@@ -263,7 +263,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
  return true;
  }

- if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
+ if (mode == ParserMode::SNIFFING_DATATYPES) {
  return true;
  }

@@ -480,6 +480,10 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad

  bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
  if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
+ Increment(buffer_idx);
+ auto bla = GetLineError(global_row_idx, buffer_idx, false);
+ row_idx += bla;
+ row_idx -= bla;
  row_failed = true;
  failed_cells.emplace_back(row_idx, col_idx, row_line);
  }
@@ -8,10 +8,14 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
  : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
  AllocateBuffer(buffer_size_p);
  auto buffer = Ptr();
- file_size = file_handle.Read(buffer, buffer_size_p);
+ actual_buffer_size = file_handle.Read(buffer, buffer_size_p);
+ while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size_p - actual_buffer_size);
+ }
  global_csv_start = global_csv_current_position;
  // BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
- if (file_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
+ if (actual_buffer_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
  start_position += 3;
  }
  last_buffer = file_handle.FinishedReading();
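
Note on the BOM check above: a UTF-8 byte order mark is the three-byte sequence 0xEF 0xBB 0xBF at the very start of a file, and CSV readers typically skip it rather than parse it as data. A self-contained check of the same shape:

    #include <cassert>
    #include <cstddef>

    // Returns how many leading bytes to skip if the buffer starts with a UTF-8 BOM.
    static size_t Utf8BomLength(const unsigned char *buffer, size_t size) {
        if (size >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
            return 3;
        }
        return 0;
    }

    int main() {
        const unsigned char with_bom[] = {0xEF, 0xBB, 0xBF, 'a'};
        const unsigned char without_bom[] = {'a', 'b', 'c'};
        assert(Utf8BomLength(with_bom, sizeof(with_bom)) == 3);
        assert(Utf8BomLength(without_bom, sizeof(without_bom)) == 0);
        return 0;
    }
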
@@ -22,13 +26,18 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
  : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
  can_seek(file_handle.CanSeek()) {
  AllocateBuffer(buffer_size);
- file_size = file_handle.Read(handle.Ptr(), buffer_size);
+ auto buffer = handle.Ptr();
+ actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size);
+ while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size - actual_buffer_size);
+ }
  last_buffer = file_handle.FinishedReading();
  }

  shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
  auto next_csv_buffer =
- make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + file_size, file_number_p);
+ make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p);
  if (next_csv_buffer->GetBufferSize() == 0) {
  // We are done reading
  return nullptr;
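
Note: both CSVBuffer constructors now loop until the buffer is full or the file is exhausted, because a single Read call may legally return fewer bytes than requested (a short read). The same pattern with plain stdio, using a hypothetical input file name:

    #include <cstdio>

    // Read repeatedly until 'buffer' is full or the stream ends; a single
    // fread() call may return fewer bytes than requested.
    static size_t ReadFully(FILE *file, char *buffer, size_t buffer_size) {
        size_t total = 0;
        while (total < buffer_size && !feof(file) && !ferror(file)) {
            total += fread(buffer + total, 1, buffer_size - total, file);
        }
        return total; // the "actual buffer size": smaller than buffer_size only at EOF
    }

    int main() {
        FILE *file = fopen("data.csv", "rb"); // hypothetical input file
        if (!file) {
            return 1;
        }
        char buffer[4096];
        size_t actual_buffer_size = ReadFully(file, buffer, sizeof(buffer));
        printf("read %zu bytes\n", actual_buffer_size);
        fclose(file);
        return 0;
    }
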
@@ -43,13 +52,13 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
  }

  idx_t CSVBuffer::GetBufferSize() {
- return file_size;
+ return actual_buffer_size;
  }

  void CSVBuffer::Reload(CSVFileHandle &file_handle) {
- AllocateBuffer(file_size);
+ AllocateBuffer(actual_buffer_size);
  file_handle.Seek(global_csv_start);
- file_handle.Read(handle.Ptr(), file_size);
+ file_handle.Read(handle.Ptr(), actual_buffer_size);
  }

  unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
@@ -59,8 +68,8 @@ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
  block = nullptr;
  Reload(file_handle);
  }
- return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), file_size, first_buffer, last_buffer, global_csv_start,
- start_position, file_number);
+ return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), actual_buffer_size, first_buffer, last_buffer,
+ global_csv_start, start_position, file_number);
  }

  void CSVBuffer::Unpin() {
@@ -168,38 +168,24 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
  if (loption == "auto_detect") {
  auto_detect = ParseBoolean(value, loption);
  } else if (loption == "sample_size") {
- int64_t sample_size = ParseInteger(value, loption);
- if (sample_size < 1 && sample_size != -1) {
+ int64_t sample_size_option = ParseInteger(value, loption);
+ if (sample_size_option < 1 && sample_size_option != -1) {
  throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
  }
- if (sample_size == -1) {
- sample_chunks = std::numeric_limits<uint64_t>::max();
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- } else if (sample_size <= STANDARD_VECTOR_SIZE) {
- sample_chunk_size = sample_size;
- sample_chunks = 1;
+ if (sample_size_option == -1) {
+ // If -1, we basically read the whole thing
+ sample_size_chunks = NumericLimits<idx_t>().Maximum();
  } else {
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
+ sample_size_chunks = sample_size_option / STANDARD_VECTOR_SIZE;
+ if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
+ sample_size_chunks++;
+ }
  }
+
  } else if (loption == "skip") {
  SetSkipRows(ParseInteger(value, loption));
  } else if (loption == "max_line_size" || loption == "maximum_line_size") {
  maximum_line_size = ParseInteger(value, loption);
- } else if (loption == "sample_chunk_size") {
- sample_chunk_size = ParseInteger(value, loption);
- if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
- throw BinderException(
- "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
- STANDARD_VECTOR_SIZE);
- } else if (sample_chunk_size < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
- }
- } else if (loption == "sample_chunks") {
- sample_chunks = ParseInteger(value, loption);
- if (sample_chunks < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
- }
  } else if (loption == "force_not_null") {
  force_not_null = ParseColumnList(value, expected_names, loption);
  } else if (loption == "date_format" || loption == "dateformat") {
@@ -322,7 +308,7 @@ string CSVReaderOptions::ToString() const {
  (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
  "\n header=" + std::to_string(dialect_options.header) +
  (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
- "\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
+ "\n sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) +
  "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
  }

@@ -489,8 +475,6 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
  if (skip_rows_set) {
  named_params["skip"] = Value::BIGINT(GetSkipRows());
  }
- named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
- named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
  named_params["null_padding"] = Value::BOOLEAN(null_padding);
  if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
  named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
@@ -29,8 +29,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
  InitializeTransitionArray(transition_array[i], quoted_state);
  break;
  case unquoted_state:
- InitializeTransitionArray(transition_array[i], invalid_state);
- break;
+ case invalid_state:
  case escape_state:
  InitializeTransitionArray(transition_array[i], invalid_state);
  break;
@@ -647,6 +647,10 @@ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool s
  }
  }

+ void ParallelCSVReader::Increment(idx_t buffer_idx) {
+ return buffer->line_info->Increment(file_idx, buffer_idx);
+ }
+
  bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
  DataChunk dummy_chunk;
  string error_message;
@@ -3,8 +3,9 @@
  namespace duckdb {

  CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
- CSVStateMachineCache &state_machine_cache_p)
- : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
+ CSVStateMachineCache &state_machine_cache_p, bool explicit_set_columns_p)
+ : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
+ explicit_set_columns(explicit_set_columns_p) {

  // Check if any type is BLOB
  for (auto &type : options.sql_type_list) {
@@ -24,6 +25,14 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
  SnifferResult CSVSniffer::SniffCSV() {
  // 1. Dialect Detection
  DetectDialect();
+ if (explicit_set_columns) {
+ if (!candidates.empty()) {
+ options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
+ options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
+ }
+ // We do not need to run type and header detection as these were defined by the user
+ return SnifferResult(detected_types, names);
+ }
  // 2. Type Detection
  DetectTypes();
  // 3. Header Detection
@@ -15,7 +15,7 @@ struct SniffDialect {
  inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
  idx_t current_pos) {

- D_ASSERT(sniffed_column_counts.size() == machine.options.sample_chunk_size);
+ D_ASSERT(sniffed_column_counts.size() == STANDARD_VECTOR_SIZE);

  if (machine.state == CSVState::INVALID) {
  sniffed_column_counts.clear();
@@ -45,7 +45,7 @@ struct SniffDialect {
  machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
  (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
  machine.single_record_separator;
- if (machine.cur_rows >= machine.options.sample_chunk_size) {
+ if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
  // We sniffed enough rows
  return true;
  }
@@ -55,10 +55,10 @@ struct SniffDialect {
  if (machine.state == CSVState::INVALID) {
  return;
  }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state == CSVState::DELIMITER) {
  sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
  }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
  sniffed_column_counts[machine.cur_rows++] = machine.column_count;
  }
  NewLineIdentifier suggested_newline;
@@ -145,7 +145,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachi
  void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
  idx_t &best_consistent_rows, idx_t &prev_padding_count) {
  // The sniffed_column_counts variable keeps track of the number of columns found for each row
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);

  state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
  idx_t start_row = options.dialect_options.skip_rows;
@@ -244,7 +244,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machi
  }

  bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
  candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
  bool allow_padding = options.null_padding;

@@ -268,9 +268,9 @@ void CSVSniffer::RefineCandidates() {
  return;
  }
  for (auto &cur_candidate : candidates) {
- for (idx_t i = 1; i <= options.sample_chunks; i++) {
+ for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
  bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
- if (finished_file || i == options.sample_chunks) {
+ if (finished_file || i == options.sample_size_chunks) {
  // we finished the file or our chunk sample successfully: stop
  auto successful_candidate = std::move(cur_candidate);
  candidates.clear();
@@ -283,11 +283,7 @@ void CSVSniffer::DetectTypes() {
  candidate->Reset();

  // Parse chunk and read csv with info candidate
- idx_t sample_size = options.sample_chunk_size;
- if (options.sample_chunk_size == 1) {
- sample_size++;
- }
- vector<TupleSniffing> tuples(sample_size);
+ vector<TupleSniffing> tuples(STANDARD_VECTOR_SIZE);
  candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
  // Potentially Skip empty rows (I find this dirty, but it is what the original code does)
  idx_t true_start = 0;
@@ -311,8 +307,10 @@ void CSVSniffer::DetectTypes() {
  break;
  }
  }
+ if (values_start > 0) {
+ tuples.erase(tuples.begin(), tuples.begin() + values_start);
+ }

- tuples.erase(tuples.begin(), tuples.begin() + values_start);
  idx_t row_idx = 0;
  if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
  // This means we have more than one row, hence we can use the first row to detect if we have a header
@@ -327,6 +325,9 @@ void CSVSniffer::DetectTypes() {
  for (; row_idx < tuples.size(); row_idx++) {
  for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
  auto &col_type_candidates = info_sql_types_candidates[col];
+ // col_type_candidates can't be empty since anything in a CSV file should at least be a string
+ // and we validate utf-8 compatibility when creating the type
+ D_ASSERT(!col_type_candidates.empty());
  auto cur_top_candidate = col_type_candidates.back();
  auto dummy_val = tuples[row_idx].values[col];
  // try cast from string to sql_type