npm - duckdb - Versions diffs - 1.2.1-dev4.0 → 1.2.1-dev8.0 - Mend

duckdb 1.2.1-dev4.0 → 1.2.1-dev8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

package/src/duckdb/extension/parquet/column_writer.cpp CHANGED Viewed

@@ -309,6 +309,7 @@ struct PageInformation {
 	idx_t offset = 0;
 	idx_t row_count = 0;
 	idx_t empty_count = 0;
+	idx_t null_count = 0;
 	idx_t estimated_page_size = 0;
 };
@@ -388,7 +389,7 @@ protected:
 	virtual unique_ptr<ColumnWriterStatistics> InitializeStatsState();
 	//! Initialize the writer for a specific page. Only used for scalar types.
-	virtual unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state);
+	virtual unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx);
 	//! Flushes the writer for a specific page. Only used for scalar types.
 	virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state);
@@ -427,7 +428,8 @@ void BasicColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group)
 	row_group.columns.push_back(std::move(column_chunk));
 }
-unique_ptr<ColumnWriterPageState> BasicColumnWriter::InitializePageState(BasicColumnWriterState &state) {
+unique_ptr<ColumnWriterPageState> BasicColumnWriter::InitializePageState(BasicColumnWriterState &state,
+                                                                         idx_t page_idx) {
 	return nullptr;
 }
@@ -463,6 +465,8 @@ void BasicColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *p
 				state.page_info.push_back(new_info);
 				page_info_ref = state.page_info.back();
 			}
+		} else {
+			page_info.null_count++;
 		}
 		vector_index++;
 	}
@@ -502,7 +506,7 @@ void BasicColumnWriter::BeginWrite(ColumnWriterState &state_p) {
 		    MaxValue<idx_t>(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY));
 		write_info.write_count = page_info.empty_count;
 		write_info.max_write_count = page_info.row_count;
-		write_info.page_state = InitializePageState(state);
+		write_info.page_state = InitializePageState(state, page_idx);
 		write_info.compressed_size = 0;
 		write_info.compressed_data = nullptr;
@@ -796,7 +800,6 @@ public:
 };
 struct BaseParquetOperator {
 	template <class SRC, class TGT>
 	static void WriteToStream(const TGT &input, WriteStream &ser) {
 		ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT));
@@ -815,6 +818,11 @@ struct BaseParquetOperator {
 	template <class SRC, class TGT>
 	static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
 	}
+	template <class SRC, class TGT>
+	static idx_t GetRowSize(const Vector &, idx_t) {
+		return sizeof(TGT);
+	}
 };
 struct ParquetCastOperator : public BaseParquetOperator {
@@ -936,6 +944,11 @@ struct ParquetStringOperator : public BaseParquetOperator {
 	static uint64_t XXHash64(const TGT &target_value) {
 		return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
 	}
+	template <class SRC, class TGT>
+	static idx_t GetRowSize(const Vector &vector, idx_t index) {
+		return FlatVector::GetData<string_t>(vector)[index].GetSize();
+	}
 };
 struct ParquetIntervalTargetType {
@@ -1066,6 +1079,7 @@ public:
 	// analysis state for integer values for DELTA_BINARY_PACKED/DELTA_LENGTH_BYTE_ARRAY
 	idx_t total_value_count = 0;
 	idx_t total_string_size = 0;
+	uint32_t key_bit_width = 0;
 	unordered_map<T, uint32_t> dictionary;
 	duckdb_parquet::Encoding::type encoding;
@@ -1222,11 +1236,12 @@ public:
 		return std::move(result);
 	}
-	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state_p) override {
+	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state_p, idx_t page_idx) override {
 		auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
-		auto result = make_uniq<StandardWriterPageState<SRC, TGT>>(state.total_value_count, state.total_string_size,
-		                                                           state.encoding, state.dictionary);
+		const auto &page_info = state_p.page_info[page_idx];
+		auto result = make_uniq<StandardWriterPageState<SRC, TGT>>(
+		    page_info.row_count - (page_info.empty_count + page_info.null_count), state.total_string_size,
+		    state.encoding, state.dictionary);
 		return std::move(result);
 	}
@@ -1335,6 +1350,8 @@ public:
 				}
 			}
 			state.dictionary.clear();
+		} else {
+			state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.size());
 		}
 	}
@@ -1488,9 +1505,13 @@ public:
 		// bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up
 	}
-	// TODO this now vastly over-estimates the page size
 	idx_t GetRowSize(const Vector &vector, const idx_t index, const BasicColumnWriterState &state_p) const override {
-		return sizeof(TGT);
+		auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
+		if (state.encoding == Encoding::RLE_DICTIONARY) {
+			return (state.key_bit_width + 7) / 8;
+		} else {
+			return OP::template GetRowSize<SRC, TGT>(vector, index);
+		}
 	}
 };
@@ -1570,7 +1591,7 @@ public:
 		}
 	}
-	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state) override {
+	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override {
 		return make_uniq<BooleanWriterPageState>();
 	}
@@ -1812,7 +1833,7 @@ public:
 		}
 	}
-	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state) override {
+	unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override {
 		return make_uniq<EnumWriterPageState>(bit_width);
 	}

package/src/duckdb/extension/parquet/include/column_reader.hpp CHANGED Viewed

@@ -160,7 +160,6 @@ protected:
 private:
 	void AllocateBlock(idx_t size);
-	void AllocateCompressed(idx_t size);
 	void PrepareRead(parquet_filter_t &filter);
 	void PreparePage(PageHeader &page_hdr);
 	void PrepareDataPage(PageHeader &page_hdr);
@@ -178,7 +177,6 @@ private:
 	shared_ptr<ResizeableBuffer> block;
-	ResizeableBuffer compressed_buffer;
 	ResizeableBuffer offset_buffer;
 	unique_ptr<RleBpDecoder> dict_decoder;

package/src/duckdb/extension/parquet/include/parquet_bss_encoder.hpp CHANGED Viewed

@@ -30,7 +30,6 @@ public:
 	}
 	void FinishWrite(WriteStream &writer) {
-		D_ASSERT(count == total_value_count);
 		writer.WriteData(buffer.get(), total_value_count * bit_width);
 	}

package/src/duckdb/extension/parquet/include/parquet_dlba_encoder.hpp CHANGED Viewed

@@ -33,9 +33,8 @@ public:
 	}
 	void FinishWrite(WriteStream &writer) {
-		D_ASSERT(stream->GetPosition() == total_string_size);
 		dbp_encoder.FinishWrite(writer);
-		writer.WriteData(buffer.get(), total_string_size);
+		writer.WriteData(buffer.get(), stream->GetPosition());
 	}
 private:

package/src/duckdb/src/catalog/catalog.cpp CHANGED Viewed

@@ -769,6 +769,12 @@ CatalogEntryLookup Catalog::TryLookupEntry(CatalogEntryRetriever &retriever, Cat
 	if (if_not_found == OnEntryNotFound::RETURN_NULL) {
 		return {nullptr, nullptr, ErrorData()};
+	}
+	// Check if the default database is actually attached. CreateMissingEntryException will throw binder exception
+	// otherwise.
+	if (!GetCatalogEntry(context, GetDefaultCatalog(retriever))) {
+		auto except = CatalogException("%s with name %s does not exist!", CatalogTypeToString(type), name);
+		return {nullptr, nullptr, ErrorData(except)};
 	} else {
 		auto except = CreateMissingEntryException(retriever, name, type, schemas, error_context);
 		return {nullptr, nullptr, ErrorData(except)};
@@ -805,6 +811,12 @@ CatalogEntryLookup Catalog::TryLookupEntry(CatalogEntryRetriever &retriever, vec
 	if (if_not_found == OnEntryNotFound::RETURN_NULL) {
 		return {nullptr, nullptr, ErrorData()};
+	}
+	// Check if the default database is actually attached. CreateMissingEntryException will throw binder exception
+	// otherwise.
+	if (!GetCatalogEntry(context, GetDefaultCatalog(retriever))) {
+		auto except = CatalogException("%s with name %s does not exist!", CatalogTypeToString(type), name);
+		return {nullptr, nullptr, ErrorData(except)};
 	} else {
 		auto except = CreateMissingEntryException(retriever, name, type, schemas, error_context);
 		return {nullptr, nullptr, ErrorData(except)};

package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp CHANGED Viewed

@@ -863,7 +863,7 @@ unique_ptr<CatalogEntry> DuckTableEntry::Copy(ClientContext &context) const {
 	}
 	auto binder = Binder::CreateBinder(context);
-	auto bound_create_info = binder->BindCreateTableInfo(std::move(create_info), schema);
+	auto bound_create_info = binder->BindCreateTableCheckpoint(std::move(create_info), schema);
 	return make_uniq<DuckTableEntry>(catalog, schema, *bound_create_info, storage);
 }

package/src/duckdb/src/catalog/catalog_entry_retriever.cpp CHANGED Viewed

@@ -76,7 +76,7 @@ void CatalogEntryRetriever::Inherit(const CatalogEntryRetriever &parent) {
 	this->search_path = parent.search_path;
 }
-CatalogSearchPath &CatalogEntryRetriever::GetSearchPath() {
+const CatalogSearchPath &CatalogEntryRetriever::GetSearchPath() const {
 	if (search_path) {
 		return *search_path;
 	}

package/src/duckdb/src/catalog/catalog_search_path.cpp CHANGED Viewed

@@ -189,11 +189,11 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
 	Set(std::move(new_paths), set_type);
 }
-const vector<CatalogSearchEntry> &CatalogSearchPath::Get() {
+const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
 	return paths;
 }
-string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -205,7 +205,7 @@ string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
 	return DEFAULT_SCHEMA;
 }
-string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -221,7 +221,7 @@ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string
 	return DEFAULT_SCHEMA;
 }
-string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
+string CatalogSearchPath::GetDefaultCatalog(const string &schema) const {
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		return SYSTEM_CATALOG;
 	}
@@ -236,7 +236,7 @@ string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
 	return INVALID_CATALOG;
 }
-vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
+vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) const {
 	vector<string> catalogs;
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		catalogs.push_back(SYSTEM_CATALOG);
@@ -250,7 +250,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
 	return catalogs;
 }
-vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
+vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
 	vector<string> schemas;
 	for (auto &path : paths) {
 		if (StringUtil::CIEquals(path.catalog, catalog)) {
@@ -260,7 +260,7 @@ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
 	return schemas;
 }
-const CatalogSearchEntry &CatalogSearchPath::GetDefault() {
+const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
 	const auto &paths = Get();
 	D_ASSERT(paths.size() >= 2);
 	return paths[1];
@@ -281,7 +281,7 @@ void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
 }
 bool CatalogSearchPath::SchemaInSearchPath(ClientContext &context, const string &catalog_name,
-                                           const string &schema_name) {
+                                           const string &schema_name) const {
 	for (auto &path : paths) {
 		if (!StringUtil::CIEquals(path.schema, schema_name)) {
 			continue;

package/src/duckdb/src/common/bind_helpers.cpp CHANGED Viewed

@@ -56,6 +56,9 @@ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const st
 		}
 		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
 	}
+	if (value.IsNull()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter, it can't be a NULL value", loption);
+	}
 	auto &children = ListValue::GetChildren(value);
 	// accept '*' as single argument
 	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&

package/src/duckdb/src/common/compressed_file_system.cpp CHANGED Viewed

@@ -31,6 +31,8 @@ void CompressedFile::Initialize(bool write) {
 	stream_data.out_buff_start = stream_data.out_buff.get();
 	stream_data.out_buff_end = stream_data.out_buff.get();
+	current_position = 0;
 	stream_wrapper = compressed_fs.CreateStream();
 	stream_wrapper->Initialize(*this, write);
 }

package/src/duckdb/src/common/hive_partitioning.cpp CHANGED Viewed

@@ -245,7 +245,7 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector<HivePartitionK
 	const auto &type = input.GetType();
-	const auto reinterpret = Value::CreateValue<T>(data[0]).GetTypeMutable() != type;
+	const auto reinterpret = Value::CreateValue<T>(data[sel.get_index(0)]).GetTypeMutable() != type;
 	if (reinterpret) {
 		for (idx_t i = 0; i < count; i++) {
 			auto &key = keys[i];

package/src/duckdb/src/common/multi_file_reader.cpp CHANGED Viewed

@@ -508,14 +508,14 @@ void MultiFileReader::CreateMapping(const string &file_name,
 	// copy global columns and inject any different defaults
 	CreateColumnMapping(file_name, local_columns, global_columns, global_column_ids, reader_data, bind_data,
 	                    initial_file, global_state);
-	CreateFilterMap(global_columns, filters, reader_data, global_state);
+	CreateFilterMap(global_column_ids, filters, reader_data, global_state);
 }
-void MultiFileReader::CreateFilterMap(const vector<MultiFileReaderColumnDefinition> &global_columns,
+void MultiFileReader::CreateFilterMap(const vector<ColumnIndex> &global_column_ids,
                                       optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
 	if (filters) {
-		auto filter_map_size = global_columns.size();
+		auto filter_map_size = global_column_ids.size();
 		if (global_state) {
 			filter_map_size += global_state->extra_columns.size();
 		}

package/src/duckdb/src/execution/aggregate_hashtable.cpp CHANGED Viewed

@@ -329,7 +329,7 @@ optional_idx GroupedAggregateHashTable::TryAddDictionaryGroups(DataChunk &groups
 	if (dictionary_id.empty()) {
 		// dictionary has no id, we can't cache across vectors
 		// only use dictionary compression if there are fewer entries than groups
-		if (dict_size >= groups.size() * DICTIONARY_THRESHOLD) {
+		if (dict_size * DICTIONARY_THRESHOLD >= groups.size()) {
 			// dictionary is too large - use regular aggregation
 			return optional_idx();
 		}

package/src/duckdb/src/execution/index/art/art.cpp CHANGED Viewed

@@ -1038,9 +1038,11 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	}
 	case VerifyExistenceType::DELETE_FK: {
 		// DELETE_FK that still exists in a FK table, i.e., not a valid delete.
-		return StringUtil::Format("Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
-		                          "key in a different table",
-		                          key_name);
+		return StringUtil::Format(
+		    "Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
+		    "key in a different table. If this is an unexpected constraint violation, please refer to our "
+		    "foreign key limitations in the documentation",
+		    key_name);
 	}
 	default:
 		throw NotImplementedException("Type not implemented for VerifyExistenceType");
@@ -1091,16 +1093,27 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> dele
 		return;
 	}
+	// Fast path for FOREIGN KEY constraints.
+	// Up to here, the above code paths work implicitly for FKs, as the leaf is inlined.
 	// FIXME: proper foreign key + delete ART support.
-	// This implicitly works for foreign keys, as we do not have to consider the actual row IDs.
-	// We only need to know that there are conflicts (for now), as we still perform over-eager constraint checking.
+	if (index_constraint_type == IndexConstraintType::FOREIGN) {
+		D_ASSERT(!deleted_leaf);
+		// We don't handle FK conflicts in UPSERT, so the row ID should not matter.
+		if (manager.AddHit(i, MAX_ROW_ID)) {
+			conflict_idx = i;
+		}
+		return;
+	}
 	// Scan the two row IDs in the leaf.
 	Iterator it(*this);
 	it.FindMinimum(leaf);
 	ARTKey empty_key = ARTKey();
 	unsafe_vector<row_t> row_ids;
-	it.Scan(empty_key, 2, row_ids, false);
+	auto success = it.Scan(empty_key, 2, row_ids, false);
+	if (!success || row_ids.size() != 2) {
+		throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned");
+	}
 	if (!deleted_leaf) {
 		if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {

package/src/duckdb/src/execution/index/art/iterator.cpp CHANGED Viewed

@@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 	bool has_next;
 	do {
 		// An empty upper bound indicates that no upper bound exists.
-		if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) {
-			if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
-				return true;
+		if (!upper_bound.Empty()) {
+			if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) {
+				if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
+					return true;
+				}
 			}
 		}
@@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 			throw InternalException("Invalid leaf type for index scan.");
 		}
+		entered_nested_leaf = false;
 		has_next = Next();
 	} while (has_next);
 	return true;
@@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) {
 	if (node.GetGateStatus() == GateStatus::GATE_SET) {
 		D_ASSERT(status == GateStatus::GATE_NOT_SET);
 		status = GateStatus::GATE_SET;
+		entered_nested_leaf = true;
 		nested_depth = 0;
 	}

package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp CHANGED Viewed

@@ -575,6 +575,11 @@ public:
 	explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);
+	void ReleaseLocalStates() {
+		auto &local_states = window_hash_group->thread_states.at(task->thread_idx);
+		local_states.clear();
+	}
 	//! Does the task have more work to do?
 	bool TaskFinished() const {
 		return !task || task->begin_idx == task->end_idx;
@@ -792,6 +797,12 @@ void WindowGlobalSourceState::FinishTask(TaskPtr task) {
 }
 bool WindowLocalSourceState::TryAssignTask() {
+	D_ASSERT(TaskFinished());
+	if (task && task->stage == WindowGroupStage::GETDATA) {
+		// If this state completed the last block in the previous iteration,
+		// release out local state memory.
+		ReleaseLocalStates();
+	}
 	// Because downstream operators may be using our internal buffers,
 	// we can't "finish" a task until we are about to get the next one.
@@ -888,10 +899,6 @@ void WindowLocalSourceState::GetData(DataChunk &result) {
 		++task->begin_idx;
 	}
-	// If that was the last block, release out local state memory.
-	if (TaskFinished()) {
-		local_states.clear();
-	}
 	result.Verify();
 }

package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp CHANGED Viewed

@@ -4,7 +4,7 @@
 namespace duckdb {
 CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-                     idx_t &global_csv_current_position, idx_t file_number_p)
+                     const idx_t &global_csv_current_position, idx_t file_number_p)
     : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()),
       is_pipe(file_handle.IsPipe()) {
 	AllocateBuffer(buffer_size_p);
@@ -34,7 +34,7 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
 }
 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p,
-                                      bool &has_seaked) {
+                                      bool &has_seaked) const {
 	if (has_seaked) {
 		// This means that at some point a reload was done, and we are currently on the incorrect position in our file
 		// handle

package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp CHANGED Viewed

@@ -36,7 +36,7 @@ void CSVEncoderBuffer::Reset() {
 	actual_encoded_buffer_size = 0;
 }
-CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
+CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
 	encoding_name = StringUtil::Lower(encoding_name_to_find);
 	auto function = config.GetEncodeFunction(encoding_name_to_find);
 	if (!function) {
@@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, id
 	}
 	// We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work
 	idx_t encoded_buffer_size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
+	if (encoded_buffer_size == 0) {
+		// This might happen if buffer size = 1
+		encoded_buffer_size = 2;
+	}
 	D_ASSERT(encoded_buffer_size > 0);
 	encoded_buffer.Initialize(encoded_buffer_size);
 	remaining_bytes_buffer.Initialize(function->GetBytesPerIteration());

package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp CHANGED Viewed

@@ -11,9 +11,10 @@ ScannerResult::ScannerResult(CSVStates &states_p, CSVStateMachine &state_machine
 BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_ptr<CSVStateMachine> state_machine_p,
                          shared_ptr<CSVErrorHandler> error_handler_p, bool sniffing_p,
-                         shared_ptr<CSVFileScan> csv_file_scan_p, CSVIterator iterator_p)
+                         shared_ptr<CSVFileScan> csv_file_scan_p, const CSVIterator &iterator_p)
     : csv_file_scan(std::move(csv_file_scan_p)), sniffing(sniffing_p), error_handler(std::move(error_handler_p)),
-      state_machine(std::move(state_machine_p)), buffer_manager(std::move(buffer_manager_p)), iterator(iterator_p) {
+      state_machine(std::move(state_machine_p)), states(), buffer_manager(std::move(buffer_manager_p)),
+      iterator(iterator_p) {
 	D_ASSERT(buffer_manager);
 	D_ASSERT(state_machine);
 	// Initialize current buffer handle

package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp CHANGED Viewed

@@ -76,8 +76,8 @@ void CSVSchema::MergeSchemas(CSVSchema &other, bool null_padding) {
 	}
 }
-CSVSchema::CSVSchema(vector<string> &names, vector<LogicalType> &types, const string &file_path, idx_t rows_read_p,
-                     const bool empty_p)
+CSVSchema::CSVSchema(const vector<string> &names, const vector<LogicalType> &types, const string &file_path,
+                     idx_t rows_read_p, const bool empty_p)
     : rows_read(rows_read_p), empty(empty_p) {
 	Initialize(names, types, file_path);
 }

package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp CHANGED Viewed

@@ -13,7 +13,7 @@ CSVBoundary::CSVBoundary(idx_t buffer_idx_p, idx_t buffer_pos_p, idx_t boundary_
 CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_pos(NumericLimits<idx_t>::Maximum()) {
 }
-CSVIterator::CSVIterator() : is_set(false) {
+CSVIterator::CSVIterator() : buffer_size(0), is_set(false) {
 }
 void CSVBoundary::Print() const {

package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp CHANGED Viewed

@@ -688,23 +688,29 @@ bool LineError::HandleErrors(StringValueResult &result) {
 				    line_pos.GetGlobalPosition(result.requested_size), result.path);
 			}
 			break;
-		case CAST_ERROR:
+		case CAST_ERROR: {
+			string column_name;
+			LogicalTypeId type_id;
+			if (cur_error.col_idx < result.names.size()) {
+				column_name = result.names[cur_error.col_idx];
+			}
+			if (cur_error.col_idx < result.number_of_columns) {
+				type_id = result.parse_types[cur_error.chunk_idx].type_id;
+			}
 			if (result.current_line_position.begin == line_pos) {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size, first_nl),
-				    result.parse_types[cur_error.chunk_idx].type_id, result.path);
+				    line_pos.GetGlobalPosition(result.requested_size, first_nl), type_id, result.path);
 			} else {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size), result.parse_types[cur_error.chunk_idx].type_id,
-				    result.path);
+				    line_pos.GetGlobalPosition(result.requested_size), type_id, result.path);
 			}
-			break;
+		} break;
 		case MAXIMUM_LINE_SIZE:
 			csv_error = CSVError::LineSizeError(
 			    result.state_machine.options, lines_per_batch, borked_line,
@@ -964,7 +970,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
              iterator.pos.buffer_pos, *error_handler, iterator,
              buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p) {
+             buffer_manager->GetFilePath(), scanner_idx_p),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
@@ -976,7 +983,8 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
              iterator.pos.buffer_pos, *error_handler, iterator,
              buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), 0) {
+             buffer_manager->GetFilePath(), 0),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }