duckdb 1.4.3-dev0.0 → 1.4.3

This diff shows the changes between publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Files changed (92)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +14 -5
  4. package/src/duckdb/extension/parquet/column_writer.cpp +4 -4
  5. package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +12 -4
  6. package/src/duckdb/src/common/encryption_key_manager.cpp +4 -0
  7. package/src/duckdb/src/common/local_file_system.cpp +23 -0
  8. package/src/duckdb/src/common/types/column/column_data_collection.cpp +6 -0
  9. package/src/duckdb/src/common/types/conflict_manager.cpp +1 -1
  10. package/src/duckdb/src/execution/index/art/base_node.cpp +3 -1
  11. package/src/duckdb/src/execution/index/art/prefix.cpp +5 -8
  12. package/src/duckdb/src/execution/index/bound_index.cpp +68 -25
  13. package/src/duckdb/src/execution/index/unbound_index.cpp +21 -10
  14. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +4 -0
  15. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +36 -28
  16. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +3 -2
  17. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +12 -6
  18. package/src/duckdb/src/execution/operator/scan/physical_positional_scan.cpp +8 -4
  19. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  20. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +4 -3
  21. package/src/duckdb/src/execution/physical_plan/plan_distinct.cpp +3 -2
  22. package/src/duckdb/src/execution/physical_plan/plan_filter.cpp +0 -1
  23. package/src/duckdb/src/execution/physical_plan/plan_window.cpp +6 -8
  24. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +4 -3
  25. package/src/duckdb/src/function/macro_function.cpp +20 -2
  26. package/src/duckdb/src/function/table/system/duckdb_log.cpp +3 -0
  27. package/src/duckdb/src/function/table/system/test_all_types.cpp +26 -13
  28. package/src/duckdb/src/function/table/table_scan.cpp +72 -38
  29. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  30. package/src/duckdb/src/function/table_function.cpp +24 -0
  31. package/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +1 -0
  32. package/src/duckdb/src/include/duckdb/common/limits.hpp +4 -2
  33. package/src/duckdb/src/include/duckdb/common/local_file_system.hpp +2 -0
  34. package/src/duckdb/src/include/duckdb/common/types/row/block_iterator.hpp +2 -0
  35. package/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +2 -0
  36. package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +2 -2
  37. package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +41 -7
  39. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +15 -1
  40. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -0
  41. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -1
  42. package/src/duckdb/src/include/duckdb/execution/physical_plan_generator.hpp +3 -1
  43. package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
  44. package/src/duckdb/src/include/duckdb/function/table_function.hpp +2 -0
  45. package/src/duckdb/src/include/duckdb/main/db_instance_cache.hpp +5 -0
  46. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +2 -0
  47. package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +1 -0
  48. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +4 -4
  49. package/src/duckdb/src/include/duckdb/optimizer/rule/ordered_aggregate_optimizer.hpp +3 -1
  50. package/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp +3 -0
  51. package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/planner/bound_result_modifier.hpp +4 -2
  53. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +1 -2
  54. package/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +1 -1
  55. package/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +3 -3
  57. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +2 -6
  58. package/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp +4 -1
  59. package/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp +2 -0
  60. package/src/duckdb/src/logging/log_storage.cpp +17 -23
  61. package/src/duckdb/src/main/capi/duckdb-c.cpp +1 -1
  62. package/src/duckdb/src/main/connection.cpp +0 -5
  63. package/src/duckdb/src/main/database_manager.cpp +12 -9
  64. package/src/duckdb/src/main/db_instance_cache.cpp +15 -1
  65. package/src/duckdb/src/main/extension/extension_alias.cpp +1 -0
  66. package/src/duckdb/src/optimizer/filter_combiner.cpp +38 -4
  67. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +15 -15
  68. package/src/duckdb/src/optimizer/late_materialization.cpp +5 -0
  69. package/src/duckdb/src/optimizer/rule/ordered_aggregate_optimizer.cpp +6 -3
  70. package/src/duckdb/src/parser/transform/helpers/transform_sample.cpp +3 -2
  71. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +1 -1
  72. package/src/duckdb/src/planner/binder/query_node/plan_select_node.cpp +1 -1
  73. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +4 -1
  74. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +17 -10
  75. package/src/duckdb/src/planner/binder.cpp +3 -3
  76. package/src/duckdb/src/planner/bound_result_modifier.cpp +22 -5
  77. package/src/duckdb/src/planner/expression/bound_function_expression.cpp +4 -1
  78. package/src/duckdb/src/planner/expression_binder/constant_binder.cpp +1 -1
  79. package/src/duckdb/src/planner/expression_binder.cpp +1 -2
  80. package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +57 -24
  81. package/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +5 -3
  82. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +9 -0
  83. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  84. package/src/duckdb/src/storage/table/chunk_info.cpp +3 -3
  85. package/src/duckdb/src/storage/table/column_data.cpp +5 -1
  86. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +1 -1
  87. package/src/duckdb/src/storage/table/column_segment.cpp +3 -1
  88. package/src/duckdb/src/storage/table/row_group.cpp +6 -8
  89. package/src/duckdb/src/storage/table/row_group_collection.cpp +41 -1
  90. package/src/duckdb/src/storage/table/row_version_manager.cpp +37 -23
  91. package/src/duckdb/src/storage/table/standard_column_data.cpp +5 -5
  92. package/src/duckdb/src/storage/table/validity_column_data.cpp +17 -0
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "1.4.3-dev0.0",
+  "version": "1.4.3",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
package/src/duckdb/extension/core_functions/aggregate/holistic/approximate_quantile.cpp CHANGED
@@ -355,11 +355,11 @@ AggregateFunction GetApproxQuantileListAggregateFunction(const LogicalType &type
         return GetTypedApproxQuantileListAggregateFunction<int16_t, int16_t>(type);
     case LogicalTypeId::INTEGER:
     case LogicalTypeId::DATE:
-    case LogicalTypeId::TIME:
         return GetTypedApproxQuantileListAggregateFunction<int32_t, int32_t>(type);
     case LogicalTypeId::BIGINT:
     case LogicalTypeId::TIMESTAMP:
     case LogicalTypeId::TIMESTAMP_TZ:
+    case LogicalTypeId::TIME:
         return GetTypedApproxQuantileListAggregateFunction<int64_t, int64_t>(type);
     case LogicalTypeId::TIME_TZ:
         // Not binary comparable
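
Note: this reorder routes TIME through the 64-bit template instantiation. DuckDB represents TIME internally as a 64-bit count of microseconds since midnight, so the old placement in the int32_t branch truncated the value. A minimal stand-alone illustration of that truncation (not DuckDB code):

    #include <cstdint>
    #include <iostream>

    int main() {
        // 13:00:00 as microseconds since midnight, the representation TIME uses internally.
        const int64_t time_micros = 13LL * 60 * 60 * 1000000; // 46,800,000,000
        // Forcing the value through a 32-bit path, as the old int32_t
        // instantiation effectively did, silently wraps it.
        const int32_t truncated = static_cast<int32_t>(time_micros);
        std::cout << time_micros << " != " << truncated << "\n";
    }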
package/src/duckdb/extension/icu/icu_extension.cpp CHANGED
@@ -230,8 +230,16 @@ static string NormalizeTimeZone(const string &tz_str) {
     }
 
     idx_t pos = 3;
-    const auto sign = tz_str[pos++];
-    if (sign != '+' && sign != '-') {
+    const auto utc = tz_str[pos++];
+    // Invert the sign (UTC and Etc use opposite sign conventions)
+    // https://en.wikipedia.org/wiki/Tz_database#Area
+    auto sign = utc;
+    if (utc == '+') {
+        sign = '-';
+        ;
+    } else if (utc == '-') {
+        sign = '+';
+    } else {
         break;
     }
 
@@ -424,12 +432,13 @@ static void LoadInternal(ExtensionLoader &loader) {
     auto locales = icu::Collator::getAvailableLocales(count);
     for (int32_t i = 0; i < count; i++) {
         string collation;
-        if (string(locales[i].getCountry()).empty()) {
+        const auto &locale = locales[i]; // NOLINT
+        if (string(locale.getCountry()).empty()) {
             // language only
-            collation = locales[i].getLanguage();
+            collation = locale.getLanguage();
         } else {
             // language + country
-            collation = locales[i].getLanguage() + string("_") + locales[i].getCountry();
+            collation = locale.getLanguage() + string("_") + locale.getCountry();
         }
         collation = StringUtil::Lower(collation);
 
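
Note: the NormalizeTimeZone change maps aliases such as "UTC+5" onto the tz database's "Etc/GMT-5". The Etc area uses POSIX sign conventions, the opposite of everyday usage, hence the sign flip. A stand-alone sketch of the same mapping (hypothetical helper, not the extension's API):

    #include <string>

    // "UTC+5" -> "Etc/GMT-5": Etc/GMT zones negate east-of-Greenwich offsets.
    std::string NormalizeUtcAlias(const std::string &tz) {
        if (tz.rfind("UTC", 0) != 0 || tz.size() < 4) {
            return tz; // not a UTC+n/UTC-n alias; leave unchanged
        }
        const char utc_sign = tz[3];
        if (utc_sign != '+' && utc_sign != '-') {
            return tz;
        }
        const char etc_sign = utc_sign == '+' ? '-' : '+';
        return "Etc/GMT" + std::string(1, etc_sign) + tz.substr(4);
    }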
package/src/duckdb/extension/parquet/column_writer.cpp CHANGED
@@ -534,10 +534,10 @@ ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &write
 template <>
 struct NumericLimits<float_na_equal> {
     static constexpr float Minimum() {
-        return std::numeric_limits<float>::lowest();
+        return NumericLimits<float>::Minimum();
     };
     static constexpr float Maximum() {
-        return std::numeric_limits<float>::max();
+        return NumericLimits<float>::Maximum();
     };
     static constexpr bool IsSigned() {
         return std::is_signed<float>::value;
@@ -550,10 +550,10 @@ struct NumericLimits<float_na_equal> {
 template <>
 struct NumericLimits<double_na_equal> {
     static constexpr double Minimum() {
-        return std::numeric_limits<double>::lowest();
+        return NumericLimits<double>::Minimum();
     };
     static constexpr double Maximum() {
-        return std::numeric_limits<double>::max();
+        return NumericLimits<double>::Maximum();
     };
     static constexpr bool IsSigned() {
         return std::is_signed<double>::value;
package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp CHANGED
@@ -126,7 +126,8 @@ public:
 public:
     unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
         auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
-        result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY;
+        result->encoding = writer.GetParquetVersion() == ParquetVersion::V1 ? duckdb_parquet::Encoding::PLAIN_DICTIONARY
+                                                                            : duckdb_parquet::Encoding::RLE_DICTIONARY;
         RegisterToRowGroup(row_group);
         return std::move(result);
     }
@@ -150,6 +151,8 @@ public:
         }
         page_state.dbp_encoder.FinishWrite(temp_writer);
         break;
+    case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+        // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
     case duckdb_parquet::Encoding::RLE_DICTIONARY:
         D_ASSERT(page_state.dict_bit_width != 0);
         if (!page_state.dict_written_value) {
@@ -265,7 +268,8 @@ public:
 
     bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY;
+        return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+               state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY;
     }
 
     idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
@@ -285,7 +289,8 @@ public:
 
     void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY);
+        D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+                 state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY);
 
         if (writer.EnableBloomFilters()) {
             state.bloom_filter =
@@ -310,7 +315,8 @@ public:
     idx_t GetRowSize(const Vector &vector, const idx_t index,
                      const PrimitiveColumnWriterState &state_p) const override {
         auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
-        if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) {
+        if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY ||
+            state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY) {
             return (state.key_bit_width + 7) / 8;
         } else {
             return OP::template GetRowSize<SRC, TGT>(vector, index);
@@ -328,6 +334,8 @@ private:
         const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
 
         switch (page_state.encoding) {
+        case duckdb_parquet::Encoding::PLAIN_DICTIONARY:
+            // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY
         case duckdb_parquet::Encoding::RLE_DICTIONARY: {
             idx_t r = chunk_start;
             if (!page_state.dict_written_value) {
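
Note: Parquet format v1 predates the RLE_DICTIONARY label; v1 readers expect dictionary pages to be tagged PLAIN_DICTIONARY, while v2 uses RLE_DICTIONARY. As the comments in this diff state, the page payload is written the same either way; only the encoding tag differs. A minimal sketch of the selection logic (stand-alone enums mirroring the ternary above):

    #include <cstdint>

    enum class ParquetVersion : uint8_t { V1, V2 };
    enum class Encoding : uint8_t { PLAIN_DICTIONARY, RLE_DICTIONARY };

    // Same dictionary pages, different label depending on the target format version.
    Encoding SelectDictionaryEncoding(ParquetVersion version) {
        return version == ParquetVersion::V1 ? Encoding::PLAIN_DICTIONARY : Encoding::RLE_DICTIONARY;
    }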
package/src/duckdb/src/common/encryption_key_manager.cpp CHANGED
@@ -72,21 +72,25 @@ string EncryptionKeyManager::GenerateRandomKeyID() {
 }
 
 void EncryptionKeyManager::AddKey(const string &key_name, data_ptr_t key) {
+    lock_guard<mutex> guard(lock);
     derived_keys.emplace(key_name, EncryptionKey(key));
     // Zero-out the encryption key
     duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, DERIVED_KEY_LENGTH);
 }
 
 bool EncryptionKeyManager::HasKey(const string &key_name) const {
+    lock_guard<mutex> guard(lock);
     return derived_keys.find(key_name) != derived_keys.end();
 }
 
 const_data_ptr_t EncryptionKeyManager::GetKey(const string &key_name) const {
     D_ASSERT(HasKey(key_name));
+    lock_guard<mutex> guard(lock);
     return derived_keys.at(key_name).GetPtr();
 }
 
 void EncryptionKeyManager::DeleteKey(const string &key_name) {
+    lock_guard<mutex> guard(lock);
     derived_keys.erase(key_name);
 }
 
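
Note: HasKey and GetKey are const member functions, so guarding them with lock_guard requires the lock member to be declared mutable (the matching header change adds one line to encryption_key_manager.hpp, presumably that member). A minimal sketch of the pattern using standard-library types:

    #include <mutex>
    #include <string>
    #include <unordered_map>

    class KeyRegistry {
    public:
        void Add(const std::string &name, std::string key) {
            std::lock_guard<std::mutex> guard(lock);
            keys.emplace(name, std::move(key));
        }
        bool Has(const std::string &name) const {
            std::lock_guard<std::mutex> guard(lock); // allowed because lock is mutable
            return keys.find(name) != keys.end();
        }

    private:
        mutable std::mutex lock;
        std::unordered_map<std::string, std::string> keys;
    };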
package/src/duckdb/src/common/local_file_system.cpp CHANGED
@@ -1283,6 +1283,29 @@ bool LocalFileSystem::OnDiskFile(FileHandle &handle) {
     return true;
 }
 
+string LocalFileSystem::GetVersionTag(FileHandle &handle) {
+    // TODO: Fix using FileSystem::Stats for v1.5, which should also fix it for Windows
+#ifdef _WIN32
+    return "";
+#else
+    int fd = handle.Cast<UnixFileHandle>().fd;
+    struct stat s;
+    if (fstat(fd, &s) == -1) {
+        throw IOException("Failed to get file size for file \"%s\": %s", {{"errno", std::to_string(errno)}},
+                          handle.path, strerror(errno));
+    }
+
+    // dev/ino should be enough, but to guard against in-place writes we also add file size and modification time
+    uint64_t version_tag[4];
+    Store(NumericCast<uint64_t>(s.st_dev), data_ptr_cast(&version_tag[0]));
+    Store(NumericCast<uint64_t>(s.st_ino), data_ptr_cast(&version_tag[1]));
+    Store(NumericCast<uint64_t>(s.st_size), data_ptr_cast(&version_tag[2]));
+    Store(Timestamp::FromEpochSeconds(s.st_mtime).value, data_ptr_cast(&version_tag[3]));
+
+    return string(char_ptr_cast(version_tag), sizeof(uint64_t) * 4);
+#endif
+}
+
 void LocalFileSystem::Seek(FileHandle &handle, idx_t location) {
     if (!CanSeek()) {
         throw IOException("Cannot seek in files of this type");
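
Note: the version tag packs four fixed-width fields into a 32-byte opaque string, so two handles compare equal only if device, inode, size, and mtime all match: dev/ino identify the file, while size and mtime guard against in-place rewrites of the same inode. A self-contained POSIX sketch of the same idea, using memcpy instead of DuckDB's Store helpers:

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <sys/stat.h>

    std::string VersionTagFromFd(int fd) {
        struct stat s;
        if (fstat(fd, &s) == -1) {
            return ""; // caller treats an empty tag as "unknown version"
        }
        const uint64_t fields[4] = {
            static_cast<uint64_t>(s.st_dev),  // which filesystem
            static_cast<uint64_t>(s.st_ino),  // which file on it
            static_cast<uint64_t>(s.st_size), // size and mtime guard against
            static_cast<uint64_t>(s.st_mtime) // in-place rewrites of the inode
        };
        std::string tag(sizeof(fields), '\0');
        std::memcpy(&tag[0], fields, sizeof(fields));
        return tag;
    }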
package/src/duckdb/src/common/types/column/column_data_collection.cpp CHANGED
@@ -1036,6 +1036,7 @@ void ColumnDataCollection::InitializeScan(ColumnDataParallelScanState &state, ve
 
 bool ColumnDataCollection::Scan(ColumnDataParallelScanState &state, ColumnDataLocalScanState &lstate,
                                 DataChunk &result) const {
+    D_ASSERT(result.GetTypes() == types);
     result.Reset();
 
     idx_t chunk_index;
@@ -1129,6 +1130,10 @@ void ColumnDataCollection::ScanAtIndex(ColumnDataParallelScanState &state, Colum
 }
 
 bool ColumnDataCollection::Scan(ColumnDataScanState &state, DataChunk &result) const {
+    for (idx_t i = 0; i < state.column_ids.size(); i++) {
+        D_ASSERT(result.GetTypes()[i] == types[state.column_ids[i]]);
+    }
+
     result.Reset();
 
     idx_t chunk_index;
@@ -1213,6 +1218,7 @@ idx_t ColumnDataCollection::ChunkCount() const {
 }
 
 void ColumnDataCollection::FetchChunk(idx_t chunk_idx, DataChunk &result) const {
+    D_ASSERT(result.GetTypes() == types);
     D_ASSERT(chunk_idx < ChunkCount());
     for (auto &segment : segments) {
         if (chunk_idx >= segment->ChunkCount()) {
package/src/duckdb/src/common/types/conflict_manager.cpp CHANGED
@@ -87,7 +87,7 @@ optional_idx ConflictManager::GetFirstInvalidIndex(const idx_t count, const bool
     for (idx_t i = 0; i < count; i++) {
         if (negate && !validity.RowIsValid(i)) {
             return i;
-        } else if (validity.RowIsValid(i)) {
+        } else if (!negate && validity.RowIsValid(i)) {
             return i;
         }
     }
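
Note: before this fix, when negate was true a valid row still fell into the bare `else if (validity.RowIsValid(i))` branch, so the function returned valid rows it was asked to skip. The corrected predicate returns the first row whose validity matches the requested polarity. A stand-alone version of the fixed logic:

    #include <cstddef>
    #include <optional>
    #include <vector>

    // Return the first index whose validity matches the requested polarity:
    // invalid rows when negate is set, valid rows otherwise.
    std::optional<size_t> FirstMatch(const std::vector<bool> &valid, bool negate) {
        for (size_t i = 0; i < valid.size(); i++) {
            if (negate ? !valid[i] : valid[i]) {
                return i;
            }
        }
        return std::nullopt;
    }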
package/src/duckdb/src/execution/index/art/base_node.cpp CHANGED
@@ -95,7 +95,9 @@ void Node4::DeleteChild(ART &art, Node &node, Node &parent, const uint8_t byte,
 
     auto prev_node4_status = node.GetGateStatus();
     Node::FreeNode(art, node);
-    Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status);
+    // Propagate both the prev_node_4 status and the general gate status (if the gate was earlier on),
+    // since the concatenation logic depends on both.
+    Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status, status);
 }
 
 void Node4::ShrinkNode16(ART &art, Node &node4, Node &node16) {
package/src/duckdb/src/execution/index/art/prefix.cpp CHANGED
@@ -65,8 +65,8 @@ void Prefix::New(ART &art, reference<Node> &ref, const ARTKey &key, const idx_t
     }
 }
 
-void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                    const GateStatus node4_status) {
+void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, const GateStatus node4_status,
+                    const GateStatus status) {
     // We have four situations from which we enter here:
     // 1: PREFIX (parent) - Node4 (prev_node4) - PREFIX (child) - INLINED_LEAF, or
     // 2: PREFIX (parent) - Node4 (prev_node4) - INLINED_LEAF (child), or
@@ -90,10 +90,7 @@ void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8
         ConcatChildIsGate(art, parent, node4, child, byte);
         return;
     }
-
-    auto inside_gate = parent.GetGateStatus() == GateStatus::GATE_SET;
-    ConcatInternal(art, parent, node4, child, byte, inside_gate);
-    return;
+    ConcatInternal(art, parent, node4, child, byte, status);
 }
 
 void Prefix::Reduce(ART &art, Node &node, const idx_t pos) {
@@ -286,9 +283,9 @@ Prefix Prefix::GetTail(ART &art, const Node &node) {
 }
 
 void Prefix::ConcatInternal(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte,
-                            const bool inside_gate) {
+                            const GateStatus status) {
     if (child.GetType() == NType::LEAF_INLINED) {
-        if (inside_gate) {
+        if (status == GateStatus::GATE_SET) {
             if (parent.GetType() == NType::PREFIX) {
                 // The parent only contained the Node4, so we can now inline 'all the way up',
                 // and the gate is no longer nested.
package/src/duckdb/src/execution/index/bound_index.cpp CHANGED
@@ -1,11 +1,13 @@
 #include "duckdb/execution/index/bound_index.hpp"
 
+#include "duckdb/common/array.hpp"
 #include "duckdb/common/radix.hpp"
 #include "duckdb/common/serializer/serializer.hpp"
 #include "duckdb/planner/expression/bound_columnref_expression.hpp"
 #include "duckdb/planner/expression/bound_reference_expression.hpp"
 #include "duckdb/planner/expression_iterator.hpp"
 #include "duckdb/storage/table/append_state.hpp"
+#include "duckdb/common/types/selection_vector.hpp"
 
 namespace duckdb {
 
@@ -154,39 +156,80 @@ string BoundIndex::AppendRowError(DataChunk &input, idx_t index) {
     return error;
 }
 
-void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types,
-                                      vector<BufferedIndexData> &buffered_replays,
+namespace {
+
+struct BufferedReplayState {
+    optional_ptr<ColumnDataCollection> buffer = nullptr;
+    ColumnDataScanState scan_state;
+    DataChunk current_chunk;
+    bool scan_initialized = false;
+};
+} // namespace
+
+void BoundIndex::ApplyBufferedReplays(const vector<LogicalType> &table_types, BufferedIndexReplays &buffered_replays,
                                       const vector<StorageIndex> &mapped_column_ids) {
-    for (auto &replay : buffered_replays) {
-        ColumnDataScanState state;
-        auto &buffered_data = *replay.data;
-        buffered_data.InitializeScan(state);
-
-        DataChunk scan_chunk;
-        buffered_data.InitializeScanChunk(scan_chunk);
-        DataChunk table_chunk;
-        table_chunk.InitializeEmpty(table_types);
-
-        while (buffered_data.Scan(state, scan_chunk)) {
-            for (idx_t i = 0; i < scan_chunk.ColumnCount() - 1; i++) {
-                auto col_id = mapped_column_ids[i].GetPrimaryIndex();
-                table_chunk.data[col_id].Reference(scan_chunk.data[i]);
+    if (!buffered_replays.HasBufferedReplays()) {
+        return;
+    }
+
+    // We have two replay states: one for inserts and one for deletes. These are indexed into using the
+    // replay_type. Both scans are interleaved, so the state maintains the position of each scan.
+    array<BufferedReplayState, 2> replay_states;
+    DataChunk table_chunk;
+    table_chunk.InitializeEmpty(table_types);
+
+    for (const auto &replay_range : buffered_replays.ranges) {
+        const auto type_idx = static_cast<idx_t>(replay_range.type);
+        auto &state = replay_states[type_idx];
+
+        // Initialize the scan state if necessary. Take ownership of buffered operations, since we won't need
+        // them after replaying anyways.
+        if (!state.scan_initialized) {
+            state.buffer = buffered_replays.GetBuffer(replay_range.type);
+            state.buffer->InitializeScan(state.scan_state);
+            state.buffer->InitializeScanChunk(state.current_chunk);
+            state.scan_initialized = true;
+        }
+
+        idx_t current_row = replay_range.start;
+        while (current_row < replay_range.end) {
+            // Scan the next DataChunk from the ColumnDataCollection buffer if the current row is on or after
+            // that chunk's starting row index.
+            if (current_row >= state.scan_state.next_row_index) {
+                if (!state.buffer->Scan(state.scan_state, state.current_chunk)) {
+                    throw InternalException("Buffered index data exhausted during replay");
+                }
             }
-            table_chunk.SetCardinality(scan_chunk.size());
 
-            switch (replay.type) {
-            case BufferedIndexReplay::INSERT_ENTRY: {
-                IndexAppendInfo index_append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
-                auto error = Append(table_chunk, scan_chunk.data.back(), index_append_info);
+            // We need to process the remaining rows in the current chunk, which is the minimum of the available
+            // rows in the chunk and the remaining rows in the current range.
+            const auto offset_in_chunk = current_row - state.scan_state.current_row_index;
+            const auto available_in_chunk = state.current_chunk.size() - offset_in_chunk;
+            // [start, end) in ReplayRange is [inclusive, exclusive).
+            const auto range_remaining = replay_range.end - current_row;
+            const auto rows_to_process = MinValue<idx_t>(available_in_chunk, range_remaining);
+
+            SelectionVector sel(offset_in_chunk, rows_to_process);
+
+            for (idx_t col_idx = 0; col_idx < state.current_chunk.ColumnCount() - 1; col_idx++) {
+                const auto col_id = mapped_column_ids[col_idx].GetPrimaryIndex();
+                table_chunk.data[col_id].Reference(state.current_chunk.data[col_idx]);
+                table_chunk.data[col_id].Slice(sel, rows_to_process);
+            }
+            table_chunk.SetCardinality(rows_to_process);
+            Vector row_ids(state.current_chunk.data.back(), sel, rows_to_process);
+
+            if (replay_range.type == BufferedIndexReplay::INSERT_ENTRY) {
+                IndexAppendInfo append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr);
+                const auto error = Append(table_chunk, row_ids, append_info);
                 if (error.HasError()) {
                     throw InternalException("error while applying buffered appends: " + error.Message());
                 }
+                current_row += rows_to_process;
                 continue;
             }
-            case BufferedIndexReplay::DEL_ENTRY: {
-                Delete(table_chunk, scan_chunk.data.back());
-            }
-            }
+            Delete(table_chunk, row_ids);
+            current_row += rows_to_process;
         }
     }
 }
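
Note: the rewrite replaces one ColumnDataCollection per buffered chunk with two shared buffers (inserts and deletes) plus a list of ranges recording the original arrival order; the SelectionVector then slices each scanned chunk down to the rows belonging to the current range. A toy model of the replay ordering under those assumptions (simplified data layout, not DuckDB's types):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    enum class ReplayType { INSERT_ENTRY, DEL_ENTRY };

    struct ReplayRange {
        ReplayType type;
        size_t start; // inclusive offset into that type's buffer
        size_t end;   // exclusive
    };

    // Replaying range by range reproduces the original interleaving of
    // inserts and deletes, even though each buffer is append-only.
    void Replay(const std::vector<ReplayRange> &ranges, const std::vector<int> &inserts,
                const std::vector<int> &deletes, std::vector<int> &index) {
        for (const auto &range : ranges) {
            for (size_t row = range.start; row < range.end; row++) {
                if (range.type == ReplayType::INSERT_ENTRY) {
                    index.push_back(inserts[row]);
                } else {
                    index.erase(std::remove(index.begin(), index.end(), deletes[row]), index.end());
                }
            }
        }
    }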
package/src/duckdb/src/execution/index/unbound_index.cpp CHANGED
@@ -8,10 +8,6 @@
 
 namespace duckdb {
 
-BufferedIndexData::BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr<ColumnDataCollection> data_p)
-    : type(replay_type), data(std::move(data_p)) {
-}
-
 UnboundIndex::UnboundIndex(unique_ptr<CreateInfo> create_info, IndexStorageInfo storage_info_p,
                            TableIOManager &table_io_manager, AttachedDatabase &db)
     : Index(create_info->Cast<CreateIndexInfo>().column_ids, table_io_manager, db), create_info(std::move(create_info)),
@@ -40,15 +36,13 @@ void UnboundIndex::CommitDrop() {
 }
 
 void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
-                               const vector<StorageIndex> &mapped_column_ids_p, BufferedIndexReplay replay_type) {
+                               const vector<StorageIndex> &mapped_column_ids_p, const BufferedIndexReplay replay_type) {
     D_ASSERT(!column_ids.empty());
     auto types = index_column_chunk.GetTypes(); // column types
    types.push_back(LogicalType::ROW_TYPE);
 
     auto &allocator = Allocator::Get(db);
 
-    BufferedIndexData buffered_data(replay_type, make_uniq<ColumnDataCollection>(allocator, types));
-
     //! First time we are buffering data, canonical column_id mapping is stored.
     //! This should be a sorted list of all the physical offsets of Indexed columns on this table.
     if (mapped_column_ids.empty()) {
@@ -56,7 +50,7 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
     }
     D_ASSERT(mapped_column_ids == mapped_column_ids_p);
 
-    // Combined chunk has all the indexed columns and rowids.
+    // combined_chunk has all the indexed columns according to mapped_column_ids ordering, as well as a rowid column.
     DataChunk combined_chunk;
     combined_chunk.InitializeEmpty(types);
     for (idx_t i = 0; i < index_column_chunk.ColumnCount(); i++) {
@@ -64,8 +58,25 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids,
     }
     combined_chunk.data.back().Reference(row_ids);
     combined_chunk.SetCardinality(index_column_chunk.size());
-    buffered_data.data->Append(combined_chunk);
-    buffered_replays.emplace_back(std::move(buffered_data));
+
+    auto &buffer = buffered_replays.GetBuffer(replay_type);
+    if (buffer == nullptr) {
+        buffer = make_uniq<ColumnDataCollection>(allocator, types);
+    }
+    // The starting index of the buffer range is the size of the buffer.
+    const idx_t start = buffer->Count();
+    const idx_t end = start + combined_chunk.size();
+    auto &ranges = buffered_replays.ranges;
+
+    if (ranges.empty() || ranges.back().type != replay_type) {
+        // If there are no buffered ranges, or the replay types don't match, append a new range.
+        ranges.emplace_back(replay_type, start, end);
+        buffer->Append(combined_chunk);
+        return;
+    }
+    // Otherwise merge the range with the previous one.
+    ranges.back().end = end;
+    buffer->Append(combined_chunk);
 }
 
 } // namespace duckdb
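
Note: BufferChunk now appends into a per-type buffer and merges the new [start, end) range into the previous one whenever the replay type repeats, so the range list grows with the number of insert/delete alternations rather than the number of buffered chunks. A sketch of that bookkeeping (stand-alone types, same shape as the code above):

    #include <cstddef>
    #include <vector>

    enum class ReplayType { INSERT_ENTRY, DEL_ENTRY };

    struct ReplayRange {
        ReplayType type;
        size_t start;
        size_t end;
    };

    void RecordChunk(std::vector<ReplayRange> &ranges, ReplayType type, size_t buffer_count, size_t chunk_size) {
        const size_t start = buffer_count; // next free offset in that type's buffer
        const size_t end = start + chunk_size;
        if (ranges.empty() || ranges.back().type != type) {
            ranges.push_back({type, start, end}); // new alternation: open a range
            return;
        }
        ranges.back().end = end; // same type as the last chunk: extend the range
    }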
package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp CHANGED
@@ -26,6 +26,10 @@ BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_p
     }
 }
 
+void BaseScanner::Print() const {
+    state_machine->Print();
+}
+
 string BaseScanner::RemoveSeparator(const char *value_ptr, const idx_t size, char thousands_separator) {
     string result;
     result.reserve(size);
package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp CHANGED
@@ -22,7 +22,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
                                      idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_hander_p,
                                      CSVIterator &iterator_p, bool store_line_size_p,
                                      shared_ptr<CSVFileScan> csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p,
-                                     string path_p, idx_t scan_id)
+                                     const string &path_p, idx_t scan_id, bool &used_unstrictness)
     : ScannerResult(states, state_machine, result_size_p),
       number_of_columns(NumericCast<uint32_t>(state_machine.dialect_options.num_cols)),
       null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors.GetValue()),
@@ -30,8 +30,8 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
           ? 0
           : state_machine.dialect_options.state_machine_options.delimiter.GetValue().size() - 1),
       error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p),
-      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p),
-      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(std::move(path_p)) {
+      csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), used_unstrictness(used_unstrictness),
+      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(path_p) {
     // Vector information
     D_ASSERT(number_of_columns > 0);
     if (!buffer_handle) {
@@ -154,23 +154,26 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) {
-    if (cur_col_id >= number_of_columns && state_machine.state_machine_options.strict_mode.GetValue()) {
-        bool error = true;
-        if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
-            // we make an exception if the first over-value is null
-            bool is_value_null = false;
-            for (idx_t i = 0; i < null_str_count; i++) {
-                is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size);
+    if (cur_col_id >= number_of_columns) {
+        if (state_machine.state_machine_options.strict_mode.GetValue()) {
+            bool error = true;
+            if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
+                // we make an exception if the first over-value is null
+                bool is_value_null = false;
+                for (idx_t i = 0; i < null_str_count; i++) {
+                    is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size);
+                }
+                error = !is_value_null;
             }
-            error = !is_value_null;
-        }
-        if (error) {
-            // We error pointing to the current value error.
-            current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position);
-            cur_col_id++;
+            if (error) {
+                // We error pointing to the current value error.
+                current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position);
+                cur_col_id++;
+            }
+            // We had an error
+            return true;
         }
-        // We had an error
-        return true;
+        used_unstrictness = true;
     }
     return false;
 }
@@ -231,6 +234,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, idx_t size, bool
     }
     if (cur_col_id >= number_of_columns) {
        if (!state_machine.state_machine_options.strict_mode.GetValue()) {
+            used_unstrictness = true;
             return;
         }
         bool error = true;
@@ -549,6 +553,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
     }
     if (result.cur_col_id >= result.number_of_columns &&
         !result.state_machine.state_machine_options.strict_mode.GetValue()) {
+        result.used_unstrictness = true;
         return;
     }
     if (!result.HandleTooManyColumnsError(value_ptr, length)) {
@@ -980,7 +985,7 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
              iterator.pos.buffer_pos, *error_handler, iterator,
              buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p),
+             buffer_manager->GetFilePath(), scanner_idx_p, used_unstrictness),
       start_pos(0) {
     if (scanner_idx == 0 && csv_file_scan) {
         lines_read += csv_file_scan->skipped_rows;
@@ -997,7 +1002,7 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-            buffer_manager->GetFilePath(), 0),
+            buffer_manager->GetFilePath(), 0, used_unstrictness),
       start_pos(0) {
     if (scanner_idx == 0 && csv_file_scan) {
         lines_read += csv_file_scan->skipped_rows;
@@ -1939,14 +1944,17 @@ void StringValueScanner::FinalizeChunkProcess() {
         if (result.current_errors.HandleErrors(result)) {
             result.number_of_rows++;
         }
-        if (states.IsQuotedCurrent() && !found_error &&
-            state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
-            type = UNTERMINATED_QUOTES;
-            // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
-            // quotes
-            result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
-            if (result.current_errors.HandleErrors(result)) {
-                result.number_of_rows++;
+        if (states.IsQuotedCurrent() && !found_error) {
+            if (state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
+                type = UNTERMINATED_QUOTES;
+                // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
+                // quotes
+                result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
+                if (result.current_errors.HandleErrors(result)) {
+                    result.number_of_rows++;
+                }
+            } else {
+                used_unstrictness = true;
             }
         }
         if (!iterator.done) {
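
Note: used_unstrictness threads a reference from the scanner into StringValueResult, so every code path that tolerates a violation in non-strict mode (extra columns dropped, unterminated quotes accepted) can record that the relaxed behavior was actually exercised. A minimal sketch of the reference-flag pattern:

    struct ScanResult {
        // Reference to a flag owned by the scanner, set whenever a non-strict
        // code path silently accepts malformed input.
        bool &used_unstrictness;

        void DropExtraColumn() {
            used_unstrictness = true; // value discarded instead of raising an error
        }
    };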
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp CHANGED
@@ -14,7 +14,7 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, const MultiFileOptions &file
         auto &logical_type = format_template.first;
         best_format_candidates[logical_type].clear();
     }
-    // Initialize max columns found to either 0 or however many were set
+    // Initialize max columns found to either 0, or however many were set
     max_columns_found = set_columns.Size();
     error_handler = make_shared_ptr<CSVErrorHandler>(options.ignore_errors.GetValue());
     detection_error_handler = make_shared_ptr<CSVErrorHandler>(true);
@@ -193,7 +193,8 @@ SnifferResult CSVSniffer::SniffCSV(const bool force_match) {
         buffer_manager->ResetBufferManager();
     }
     buffer_manager->sniffing = false;
-    if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue()) {
+    if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue() &&
+        best_candidate->state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
         best_candidate->error_handler->ErrorIfTypeExists(MAXIMUM_LINE_SIZE);
     }
     D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());