npm - duckdb - Versions diffs - 1.1.4-dev9.0 → 1.2.1-dev4.0 - Mend

duckdb 1.1.4-dev9.0 → 1.2.1-dev4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221)

package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp CHANGED Viewed

@@ -126,6 +126,10 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
 			SkipBOM();
 		}
 	}
+	ignore_empty_values = state_machine.dialect_options.state_machine_options.delimiter.GetValue()[0] != ' ' &&
+	                      state_machine.dialect_options.state_machine_options.quote != ' ' &&
+	                      state_machine.dialect_options.state_machine_options.escape != ' ' &&
+	                      state_machine.dialect_options.state_machine_options.comment != ' ';
 }
 StringValueResult::~StringValueResult() {
@@ -148,7 +152,7 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) {
-	if (cur_col_id >= number_of_columns) {
+	if (cur_col_id >= number_of_columns && state_machine.state_machine_options.strict_mode.GetValue()) {
 		bool error = true;
 		if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
 			// we make an exception if the first over-value is null
@@ -220,6 +224,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 		return;
 	}
 	if (cur_col_id >= number_of_columns) {
+		if (!state_machine.state_machine_options.strict_mode.GetValue()) {
+			return;
+		}
 		bool error = true;
 		if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
 			// we make an exception if the first over-value is null
@@ -245,9 +252,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 	}
 	if (((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
-		// Check for the occurrence of escaped null string like \N only if RFC 4180 conformance is disabled
+		// Check for the occurrence of escaped null string like \N only if strict_mode is disabled
 		const bool check_unquoted_escaped_null =
-		    state_machine.state_machine_options.rfc_4180.GetValue() == false && escaped && !quoted && size == 1;
+		    state_machine.state_machine_options.strict_mode.GetValue() == false && escaped && !quoted && size == 1;
 		for (idx_t i = 0; i < null_str_count; i++) {
 			bool is_null = false;
 			if (null_str_size[i] == 2 && null_str_ptr[i][0] == state_machine.state_machine_options.escape.GetValue()) {
@@ -485,19 +492,30 @@ void StringValueResult::Reset() {
 		cur_buffer = buffer_handles[iterator.GetBufferIdx()];
 	}
 	buffer_handles.clear();
+	idx_t actual_size = 0;
 	if (cur_buffer) {
 		buffer_handles[cur_buffer->buffer_idx] = cur_buffer;
+		actual_size = cur_buffer->actual_size;
 	}
 	current_errors.Reset();
 	borked_rows.clear();
+	current_line_position.begin = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, actual_size};
+	current_line_position.end = current_line_position.begin;
 }
 void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) {
 	if (!result.unquoted) {
 		result.current_errors.Insert(UNTERMINATED_QUOTES, result.cur_col_id, result.chunk_col_id, result.last_position);
 	}
-	AddPossiblyEscapedValue(result, buffer_pos, result.buffer_ptr + result.quoted_position + 1,
-	                        buffer_pos - result.quoted_position - 2, buffer_pos < result.last_position.buffer_pos + 2);
+	// remove potential empty values
+	idx_t length = buffer_pos - result.quoted_position - 1;
+	while (length > 0 && result.ignore_empty_values &&
+	       result.buffer_ptr[result.quoted_position + 1 + length - 1] == ' ') {
+		length--;
+	}
+	length--;
+	AddPossiblyEscapedValue(result, buffer_pos, result.buffer_ptr + result.quoted_position + 1, length,
+	                        buffer_pos < result.last_position.buffer_pos + 2);
 	result.quoted = false;
 }
@@ -511,6 +529,10 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
 				return;
 			}
 		}
+		if (result.cur_col_id >= result.number_of_columns &&
+		    !result.state_machine.state_machine_options.strict_mode.GetValue()) {
+			return;
+		}
 		if (!result.HandleTooManyColumnsError(value_ptr, length)) {
 			// If it's an escaped value we have to remove all the escapes, this is not really great
 			// If we are going to escape, this vector must be a varchar vector
@@ -520,7 +542,6 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
 					// We have to write the cast error message.
 					std::ostringstream error;
 					// Casting Error Message
 					error << "Could not convert string \"" << std::string(value_ptr, length) << "\" to \'"
 					      << LogicalTypeIdToString(result.parse_types[result.chunk_col_id].type_id) << "\'";
 					auto error_string = error.str();
@@ -533,6 +554,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
 				auto value = StringValueScanner::RemoveEscape(
 				    value_ptr, length, result.state_machine.dialect_options.state_machine_options.escape.GetValue(),
 				    result.state_machine.dialect_options.state_machine_options.quote.GetValue(),
+				    result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
 				    result.parse_chunk.data[result.chunk_col_id]);
 				result.AddValueToVector(value.GetData(), value.GetSize());
 			}
@@ -806,7 +828,7 @@ bool StringValueResult::AddRowInternal() {
 	quoted_new_line = false;
 	// We need to check if we are getting the correct number of columns here.
 	// If columns are correct, we add it, and that's it.
-	if (cur_col_id != number_of_columns) {
+	if (cur_col_id < number_of_columns) {
 		// We have too few columns:
 		if (null_padding) {
 			while (cur_col_id < number_of_columns) {
@@ -1231,7 +1253,8 @@ void StringValueScanner::ProcessExtraRow() {
 	}
 }
-string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char escape, char quote, Vector &vector) {
+string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char escape, char quote, bool strict_mode,
+                                          Vector &vector) {
 	// Figure out the exact size
 	idx_t str_pos = 0;
 	bool just_escaped = false;
@@ -1239,7 +1262,7 @@ string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char e
 		if (str_ptr[cur_pos] == escape && !just_escaped) {
 			just_escaped = true;
 		} else if (str_ptr[cur_pos] == quote) {
-			if (just_escaped) {
+			if (just_escaped || !strict_mode) {
 				str_pos++;
 			}
 			just_escaped = false;
@@ -1259,7 +1282,7 @@ string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char e
 		if (c == escape && !just_escaped) {
 			just_escaped = true;
 		} else if (str_ptr[cur_pos] == quote) {
-			if (just_escaped) {
+			if (just_escaped || !strict_mode) {
 				removed_escapes_ptr[str_pos++] = c;
 			}
 			just_escaped = false;
@@ -1289,10 +1312,8 @@ void StringValueScanner::ProcessOverBufferValue() {
 		}
 		if (states.NewRow() || states.NewValue()) {
 			break;
-		} else {
-			if (!result.comment) {
-				over_buffer_string += previous_buffer[i];
-			}
+		} else if (!result.comment) {
+			over_buffer_string += previous_buffer[i];
 		}
 		if (states.IsQuoted()) {
 			result.SetQuoted(result, j);
@@ -1323,16 +1344,13 @@ void StringValueScanner::ProcessOverBufferValue() {
 		if (states.EmptyLine()) {
 			if (state_machine->dialect_options.num_cols == 1) {
 				break;
-			} else {
-				continue;
 			}
+			continue;
 		}
 		if (states.NewRow() || states.NewValue()) {
 			break;
-		} else {
-			if (!result.comment && !states.IsComment()) {
-				over_buffer_string += buffer_handle_ptr[iterator.pos.buffer_pos];
-			}
+		} else if (!result.comment && !states.IsComment()) {
+			over_buffer_string += buffer_handle_ptr[iterator.pos.buffer_pos];
 		}
 		if (states.IsQuoted()) {
 			result.SetQuoted(result, j);
@@ -1357,26 +1375,34 @@ void StringValueScanner::ProcessOverBufferValue() {
 	}
 	if (!skip_value) {
 		string_t value;
-		if (result.quoted) {
-			value = string_t(over_buffer_string.c_str() + result.quoted_position,
-			                 UnsafeNumericCast<uint32_t>(over_buffer_string.size() - 1 - result.quoted_position));
+		if (result.quoted && !result.comment) {
+			idx_t length = over_buffer_string.size() - 1 - result.quoted_position;
+			while (length > 0 && result.ignore_empty_values &&
+			       over_buffer_string.c_str()[result.quoted_position + length] == ' ') {
+				length--;
+			}
+			value = string_t(over_buffer_string.c_str() + result.quoted_position, UnsafeNumericCast<uint32_t>(length));
 			if (result.escaped) {
 				if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
 					const auto str_ptr = over_buffer_string.c_str() + result.quoted_position;
-					value = RemoveEscape(str_ptr, over_buffer_string.size() - 2,
-					                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
-					                     state_machine->dialect_options.state_machine_options.quote.GetValue(),
-					                     result.parse_chunk.data[result.chunk_col_id]);
+					value =
+					    RemoveEscape(str_ptr, over_buffer_string.size() - 2,
+					                 state_machine->dialect_options.state_machine_options.escape.GetValue(),
+					                 state_machine->dialect_options.state_machine_options.quote.GetValue(),
+					                 result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
+					                 result.parse_chunk.data[result.chunk_col_id]);
 				}
 			}
 		} else {
 			value = string_t(over_buffer_string.c_str(), UnsafeNumericCast<uint32_t>(over_buffer_string.size()));
 			if (result.escaped) {
 				if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
-					value = RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(),
-					                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
-					                     state_machine->dialect_options.state_machine_options.quote.GetValue(),
-					                     result.parse_chunk.data[result.chunk_col_id]);
+					value =
+					    RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(),
+					                 state_machine->dialect_options.state_machine_options.escape.GetValue(),
+					                 state_machine->dialect_options.state_machine_options.quote.GetValue(),
+					                 result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
+					                 result.parse_chunk.data[result.chunk_col_id]);
 				}
 			}
 		}
@@ -1436,7 +1462,7 @@ bool StringValueScanner::MoveToNextBuffer() {
 			// This means we reached the end of the file, we must add a last line if there is any to be added
 			if (states.EmptyLine() || states.NewRow() || result.added_last_line || states.IsCurrentNewRow() ||
 			    states.IsNotSet()) {
-				if (result.cur_col_id == result.number_of_columns) {
+				if (result.cur_col_id == result.number_of_columns && !result.IsStateCurrent(CSVState::INVALID)) {
 					result.number_of_rows++;
 				}
 				result.cur_col_id = 0;
@@ -1453,7 +1479,7 @@ bool StringValueScanner::MoveToNextBuffer() {
 				}
 				lines_read++;
 			} else if (states.IsQuotedCurrent() &&
-			           state_machine->dialect_options.state_machine_options.rfc_4180.GetValue()) {
+			           state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
 				// Unterminated quote
 				LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos,
 				                                   result.buffer_size};
@@ -1465,7 +1491,7 @@ bool StringValueScanner::MoveToNextBuffer() {
 					result.UnsetComment(result, iterator.pos.buffer_pos);
 				} else {
 					if (result.quoted && states.IsDelimiterBytes() &&
-					    state_machine->dialect_options.state_machine_options.rfc_4180.GetValue()) {
+					    state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
 						result.current_errors.Insert(UNTERMINATED_QUOTES, result.cur_col_id, result.chunk_col_id,
 						                             result.last_position);
 					}
@@ -1519,8 +1545,8 @@ bool StringValueScanner::FirstValueEndsOnQuote(CSVIterator iterator) const {
 	const idx_t to_pos = iterator.GetEndPos();
 	while (iterator.pos.buffer_pos < to_pos) {
 		state_machine->Transition(current_state, buffer_handle_ptr[iterator.pos.buffer_pos++]);
-		if ((current_state.IsState(CSVState::DELIMITER) || current_state.IsState(CSVState::CARRIAGE_RETURN) ||
-		     current_state.IsState(CSVState::RECORD_SEPARATOR))) {
+		if (current_state.IsState(CSVState::DELIMITER) || current_state.IsState(CSVState::CARRIAGE_RETURN) ||
+		    current_state.IsState(CSVState::RECORD_SEPARATOR)) {
 			return buffer_handle_ptr[iterator.pos.buffer_pos - 2] ==
 			       state_machine->dialect_options.state_machine_options.quote.GetValue();
 		}
@@ -1675,9 +1701,9 @@ void StringValueScanner::SetStart() {
 		// We need to initialize our strict state machine
 		auto &state_machine_cache = CSVStateMachineCache::Get(buffer_manager->context);
 		auto state_options = state_machine->state_machine_options;
-		// To set the state machine to be strict we ensure that rfc_4180 is set to true
-		if (!state_options.rfc_4180.IsSetByUser()) {
-			state_options.rfc_4180 = true;
+		// To set the state machine to be strict we ensure that strict_mode is set to true
+		if (!state_options.strict_mode.IsSetByUser()) {
+			state_options.strict_mode = true;
 		}
 		state_machine_strict =
 		    make_shared_ptr<CSVStateMachine>(state_machine_cache.Get(state_options), state_machine->options);
@@ -1699,6 +1725,9 @@ void StringValueScanner::SetStart() {
 		if (!best_row.is_valid && !quoted_row.is_valid && best_row.start_pos < quoted_row.start_pos) {
 			best_row = quoted_row;
 		}
+		if (quoted_row.is_valid && quoted_row.start_pos < best_row.start_pos) {
+			best_row = quoted_row;
+		}
 	}
 	// 3. We are in an escaped value
 	if (!best_row.is_valid && state_machine->dialect_options.state_machine_options.escape.GetValue() != '\0' &&
@@ -1794,7 +1823,7 @@ void StringValueScanner::FinalizeChunkProcess() {
 			}
 		}
 		if (states.IsQuotedCurrent() && !found_error &&
-		    state_machine->dialect_options.state_machine_options.rfc_4180.GetValue()) {
+		    state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
 			// If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
 			// quotes
 			result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED Viewed

@@ -156,11 +156,6 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
 	} else {
 		new_line_id = DetectNewLineDelimiter(*buffer_manager);
 	}
-	// We only sniff RFC 4180 rules, unless manually set by user.
-	bool rfc_4180 = true;
-	if (options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
-		rfc_4180 = options.dialect_options.state_machine_options.rfc_4180.GetValue();
-	}
 	CSVIterator first_iterator;
 	bool iterator_set = false;
 	for (const auto quote_rule : dialect_candidates.quote_rule_candidates) {
@@ -172,8 +167,9 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
 				for (const auto &escape : escape_candidates) {
 					for (const auto &comment : dialect_candidates.comment_candidates) {
 						D_ASSERT(buffer_manager);
-						CSVStateMachineOptions state_machine_options(delimiter, quote, escape, comment, new_line_id,
-						                                             rfc_4180);
+						CSVStateMachineOptions state_machine_options(
+						    delimiter, quote, escape, comment, new_line_id,
+						    options.dialect_options.state_machine_options.strict_mode.GetValue());
 						auto sniffing_state_machine =
 						    make_shared_ptr<CSVStateMachine>(options, state_machine_options, state_machine_cache);
 						if (options.dialect_options.skip_rows.IsSetByUser()) {

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp CHANGED Viewed

@@ -117,9 +117,7 @@ static void ReplaceNames(vector<string> &detected_names, CSVStateMachine &state_
 					detected_names.push_back(GenerateColumnName(options.name_list.size(), col++));
 					best_sql_types_candidates_per_column_idx[i] = {LogicalType::VARCHAR};
 				}
 				dialect_options.num_cols = options.name_list.size();
 			} else {
 				// we throw an error
 				const auto error = CSVError::HeaderSniffingError(
@@ -128,8 +126,16 @@ static void ReplaceNames(vector<string> &detected_names, CSVStateMachine &state_
 				error_handler.Error(error);
 			}
 		}
-		for (idx_t i = 0; i < options.name_list.size(); i++) {
-			detected_names[i] = options.name_list[i];
+		if (options.name_list.size() > detected_names.size()) {
+			// we throw an error
+			const auto error =
+			    CSVError::HeaderSniffingError(options, best_header_row, options.name_list.size(),
+			                                  state_machine.dialect_options.state_machine_options.delimiter.GetValue());
+			error_handler.Error(error);
+		} else {
+			for (idx_t i = 0; i < options.name_list.size(); i++) {
+				detected_names[i] = options.name_list[i];
+			}
 		}
 	}
 }
@@ -335,7 +341,7 @@ void CSVSniffer::DetectHeader() {
 	auto &sniffer_state_machine = best_candidate->GetStateMachine();
 	names = DetectHeaderInternal(buffer_manager->context, best_header_row, sniffer_state_machine, set_columns,
 	                             best_sql_types_candidates_per_column_idx, options, *error_handler);
-	if (single_row_file && sniffer_state_machine.dialect_options.header.GetValue()) {
+	if (EmptyOrOnlyHeader()) {
 		// This file only contains a header, lets default to the lowest type of all.
 		detected_types.clear();
 		for (idx_t i = 0; i < names.size(); i++) {

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp CHANGED Viewed

@@ -99,6 +99,10 @@ idx_t CSVSniffer::LinesSniffed() const {
 	return lines_sniffed;
 }
+bool CSVSniffer::EmptyOrOnlyHeader() const {
+	return (single_row_file && best_candidate->state_machine->dialect_options.header.GetValue()) || lines_sniffed == 0;
+}
 bool CSVSniffer::CanYouCastIt(ClientContext &context, const string_t value, const LogicalType &type,
                               const DialectOptions &dialect_options, const bool is_null, const char decimal_separator) {
 	if (is_null) {

package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp CHANGED Viewed

@@ -31,7 +31,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 			InitializeTransitionArray(transition_array, cur_state, CSVState::QUOTED);
 			break;
 		case CSVState::UNQUOTED:
-			if (state_machine_options.rfc_4180.GetValue()) {
+			if (state_machine_options.strict_mode.GetValue()) {
 				// If we have an unquoted state, following rfc 4180, our base state is invalid
 				InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID);
 			} else {
@@ -58,7 +58,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 	const bool multi_byte_delimiter = delimiter_value.size() != 1;
-	const bool enable_unquoted_escape = state_machine_options.rfc_4180.GetValue() == false &&
+	const bool enable_unquoted_escape = state_machine_options.strict_mode.GetValue() == false &&
 	                                    state_machine_options.quote != state_machine_options.escape &&
 	                                    state_machine_options.escape != '\0';
 	// Now set values depending on configuration
@@ -75,7 +75,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 			transition_array[static_cast<uint8_t>('\r')][state] = CSVState::CARRIAGE_RETURN;
 			if (state == static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)) {
 				transition_array[static_cast<uint8_t>('\n')][state] = CSVState::STANDARD;
-			} else if (!state_machine_options.rfc_4180.GetValue()) {
+			} else if (!state_machine_options.strict_mode.GetValue()) {
 				transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
 			} else {
 				transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
@@ -227,7 +227,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 	if (state_machine_options.quote == state_machine_options.escape) {
 		transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
 	}
-	if (state_machine_options.rfc_4180 == false) {
+	if (state_machine_options.strict_mode == false) {
 		if (escape == '\0') {
 			// If escape is defined, it limits a bit how relaxed quotes can be in a reliable way.
 			transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::MAYBE_QUOTED;
@@ -413,10 +413,10 @@ CSVStateMachineCache::CSVStateMachineCache() {
 				const auto &escape_candidates = default_escape[static_cast<uint8_t>(quote_rule)];
 				for (const auto &escape : escape_candidates) {
 					for (const auto &comment : default_comment) {
-						for (const bool rfc_4180 : {true, false}) {
-							Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_N, rfc_4180});
-							Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_R, rfc_4180});
-							Insert({delimiter, quote, escape, comment, NewLineIdentifier::CARRY_ON, rfc_4180});
+						for (const bool strict_mode : {true, false}) {
+							Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_N, strict_mode});
+							Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_R, strict_mode});
+							Insert({delimiter, quote, escape, comment, NewLineIdentifier::CARRY_ON, strict_mode});
 						}
 					}
 				}

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp CHANGED Viewed

@@ -194,11 +194,18 @@ void CSVErrorHandler::FillRejectsTable(InternalAppender &errors_appender, const
 				errors_appender.Append(Value());
 				break;
 			case CSVErrorType::TOO_FEW_COLUMNS:
-				D_ASSERT(bind_data.return_names.size() > col_idx + 1);
-				errors_appender.Append(string_t(bind_data.return_names[col_idx + 1]));
+				if (col_idx + 1 < bind_data.return_names.size()) {
+					errors_appender.Append(string_t(bind_data.return_names[col_idx + 1]));
+				} else {
+					errors_appender.Append(Value());
+				}
 				break;
 			default:
-				errors_appender.Append(string_t(bind_data.return_names[col_idx]));
+				if (col_idx < bind_data.return_names.size()) {
+					errors_appender.Append(string_t(bind_data.return_names[col_idx]));
+				} else {
+					errors_appender.Append(Value());
+				}
 			}
 			// 8. Error Type
 			errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
@@ -321,11 +328,13 @@ CSVError CSVError::InvalidState(const CSVReaderOptions &options, idx_t current_c
 	std::ostringstream error;
 	error << "The CSV Parser state machine reached an invalid state.\nThis can happen when is not possible to parse "
 	         "your CSV File with the given options, or the CSV File is not RFC 4180 compliant ";
 	std::ostringstream how_to_fix_it;
-	how_to_fix_it << "Possible fixes:" << '\n';
-	how_to_fix_it << "* Enable scanning files that are not RFC 4180 compliant (rfc_4180=false)." << '\n';
+	if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
+		how_to_fix_it << "Possible fixes:" << '\n';
+		how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
+		                 "comply with the CSV standard."
+		              << '\n';
+	}
 	return CSVError(error.str(), INVALID_STATE, current_column, csv_row, error_info, row_byte_position, byte_position,
 	                options, how_to_fix_it.str(), current_path);
 }
@@ -356,6 +365,11 @@ CSVError CSVError::HeaderSniffingError(const CSVReaderOptions &options, const ve
 	// 3. Suggest how to fix it!
 	error << "Possible fixes:" << '\n';
+	if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
+		error << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not comply with "
+		         "the CSV standard."
+		      << '\n';
+	}
 	// header
 	if (!options.dialect_options.header.IsSetByUser()) {
 		error << "* Set header (header = true) if your CSV has a header, or (header = false) if it doesn't" << '\n';
@@ -395,6 +409,11 @@ CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &
 	// 3. Suggest how to fix it!
 	error << "Possible fixes:" << '\n';
 	// 3.1 Inform the reader of the dialect
+	if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
+		error << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not comply with "
+		         "the CSV standard."
+		      << '\n';
+	}
 	// delimiter
 	if (!options.dialect_options.state_machine_options.delimiter.IsSetByUser()) {
 		error << "* Set delimiter (e.g., delim=\',\')" << '\n';
@@ -440,11 +459,6 @@ CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &
 	error << "* Be sure that the maximum line size is set to an appropriate value, otherwise set it (e.g., "
 	         "max_line_size=10000000)"
 	      << "\n";
-	if (options.dialect_options.state_machine_options.rfc_4180.GetValue() != false ||
-	    !options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
-		error << "* Enable scanning files that are not RFC 4180 compliant (rfc_4180=false). " << '\n';
-	}
 	return CSVError(error.str(), SNIFFING, {});
 }
@@ -466,6 +480,11 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_
 	error << "Value with unterminated quote found." << '\n';
 	std::ostringstream how_to_fix_it;
 	how_to_fix_it << "Possible fixes:" << '\n';
+	if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
+		how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
+		                 "comply with the CSV standard."
+		              << '\n';
+	}
 	how_to_fix_it << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
 	how_to_fix_it << "* Set quote to empty or to a different value (e.g., quote=\'\')" << '\n';
 	return CSVError(error.str(), UNTERMINATED_QUOTES, current_column, csv_row, error_info, row_byte_position,
@@ -479,6 +498,11 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i
 	// We don't have a fix for this
 	std::ostringstream how_to_fix_it;
 	how_to_fix_it << "Possible fixes:" << '\n';
+	if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
+		how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
+		                 "comply with the CSV standard."
+		              << '\n';
+	}
 	if (!options.null_padding) {
 		how_to_fix_it << "* Enable null padding (null_padding=true) to replace missing values with NULL" << '\n';
 	}

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp CHANGED Viewed

@@ -189,11 +189,11 @@ void CSVReaderOptions::SetNewline(const string &input) {
 }
 bool CSVReaderOptions::GetRFC4180() const {
-	return this->dialect_options.state_machine_options.rfc_4180.GetValue();
+	return this->dialect_options.state_machine_options.strict_mode.GetValue();
 }
 void CSVReaderOptions::SetRFC4180(bool input) {
-	this->dialect_options.state_machine_options.rfc_4180.Set(input);
+	this->dialect_options.state_machine_options.strict_mode.Set(input);
 }
 bool CSVReaderOptions::IgnoreErrors() const {
@@ -413,7 +413,7 @@ bool CSVReaderOptions::SetBaseOption(const string &loption, const Value &value,
 	} else if (loption == "compression") {
 		SetCompression(ParseString(value, loption));
-	} else if (loption == "rfc_4180") {
+	} else if (loption == "strict_mode") {
 		SetRFC4180(ParseBoolean(value, loption));
 	} else {
 		// unrecognized option in base CSV
@@ -440,7 +440,7 @@ string CSVReaderOptions::ToString(const string &current_file_path) const {
 	auto &escape = dialect_options.state_machine_options.escape;
 	auto &comment = dialect_options.state_machine_options.comment;
 	auto &new_line = dialect_options.state_machine_options.new_line;
-	auto &rfc_4180 = dialect_options.state_machine_options.rfc_4180;
+	auto &strict_mode = dialect_options.state_machine_options.strict_mode;
 	auto &skip_rows = dialect_options.skip_rows;
 	auto &header = dialect_options.header;
@@ -460,8 +460,8 @@ string CSVReaderOptions::ToString(const string &current_file_path) const {
 	error += FormatOptionLine("skip_rows", skip_rows);
 	// comment
 	error += FormatOptionLine("comment", comment);
-	// rfc_4180
-	error += FormatOptionLine("rfc_4180", rfc_4180);
+	// strict_mode
+	error += FormatOptionLine("strict_mode", strict_mode);
 	// date format
 	error += FormatOptionLine("date_format", dialect_options.date_format.at(LogicalType::DATE));
 	// timestamp format
@@ -638,6 +638,9 @@ void CSVReaderOptions::FromNamedParameters(const named_parameter_map_t &in, Clie
 			}
 			auto &children = ListValue::GetChildren(kv.second);
 			for (auto &child : children) {
+				if (child.IsNull()) {
+					throw BinderException("read_csv %s parameter cannot have a NULL value", kv.first);
+				}
 				name_list.push_back(StringValue::Get(child));
 			}
 			for (auto &name : name_list) {
@@ -716,7 +719,7 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) co
 	auto &quote = dialect_options.state_machine_options.quote;
 	auto &escape = dialect_options.state_machine_options.escape;
 	auto &comment = dialect_options.state_machine_options.comment;
-	auto &rfc_4180 = dialect_options.state_machine_options.rfc_4180;
+	auto &strict_mode = dialect_options.state_machine_options.strict_mode;
 	auto &header = dialect_options.header;
 	if (delimiter.IsSetByUser()) {
 		named_params["delim"] = Value(GetDelimiter());
@@ -736,8 +739,8 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) co
 	if (header.IsSetByUser()) {
 		named_params["header"] = Value(GetHeader());
 	}
-	if (rfc_4180.IsSetByUser()) {
-		named_params["rfc_4180"] = Value(GetRFC4180());
+	if (strict_mode.IsSetByUser()) {
+		named_params["strict_mode"] = Value(GetRFC4180());
 	}
 	named_params["max_line_size"] = Value::BIGINT(NumericCast<int64_t>(maximum_line_size.GetValue()));
 	if (dialect_options.skip_rows.IsSetByUser()) {

package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp CHANGED Viewed

@@ -638,7 +638,6 @@ void JoinFilterPushdownInfo::PushInFilter(const JoinFilterPushdownFilter &info,
 	// generate the OR filter
 	auto in_filter = make_uniq<InFilter>(std::move(in_list));
-	in_filter->origin_is_hash_join = true;
 	// we push the OR filter as an OptionalFilter so that we can use it for zonemap pruning only
 	// the IN-list is expensive to execute otherwise

package/src/duckdb/src/execution/operator/persistent/physical_copy_database.cpp CHANGED Viewed

@@ -1,6 +1,8 @@
 #include "duckdb/execution/operator/persistent/physical_copy_database.hpp"
 #include "duckdb/catalog/catalog.hpp"
 #include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp"
+#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
 #include "duckdb/planner/binder.hpp"
 #include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
 #include "duckdb/parser/parsed_data/create_schema_info.hpp"
@@ -9,6 +11,8 @@
 #include "duckdb/parser/parsed_data/create_type_info.hpp"
 #include "duckdb/parser/parsed_data/create_view_info.hpp"
 #include "duckdb/parser/parsed_data/create_index_info.hpp"
+#include "duckdb/execution/index/unbound_index.hpp"
+#include "duckdb/storage/data_table.hpp"
 namespace duckdb {
@@ -52,7 +56,7 @@ SourceResultType PhysicalCopyDatabase::GetData(ExecutionContext &context, DataCh
 			break;
 		}
 		case CatalogType::INDEX_ENTRY: {
-			catalog.CreateIndex(context.client, create_info->Cast<CreateIndexInfo>());
+			// Skip for now.
 			break;
 		}
 		default:
@@ -60,6 +64,30 @@ SourceResultType PhysicalCopyDatabase::GetData(ExecutionContext &context, DataCh
 			                              CatalogTypeToString(create_info->type));
 		}
 	}
+	// Create the indexes after table creation.
+	for (auto &create_info : info->entries) {
+		if (!create_info || create_info->type != CatalogType::INDEX_ENTRY) {
+			continue;
+		}
+		catalog.CreateIndex(context.client, create_info->Cast<CreateIndexInfo>());
+		auto &create_index_info = create_info->Cast<CreateIndexInfo>();
+		auto &catalog_table = catalog.GetEntry(context.client, CatalogType::TABLE_ENTRY, create_index_info.schema,
+		                                       create_index_info.table);
+		auto &table_entry = catalog_table.Cast<TableCatalogEntry>();
+		auto &data_table = table_entry.GetStorage();
+		IndexStorageInfo storage_info(create_index_info.index_name);
+		storage_info.options.emplace("v1_0_0_storage", false);
+		auto unbound_index = make_uniq<UnboundIndex>(create_index_info.Copy(), storage_info,
+		                                             data_table.GetTableIOManager(), catalog.GetAttached());
+		data_table.AddIndex(std::move(unbound_index));
+		auto &data_table_info = *data_table.GetDataTableInfo();
+		data_table_info.GetIndexes().InitializeIndexes(context.client, data_table_info);
+	}
 	return SourceResultType::FINISHED;
 }