npm - duckdb - Versions diffs - 0.8.2-dev4376.0 → 0.8.2-dev4474.0 - Mend

duckdb 0.8.2-dev4376.0 → 0.8.2-dev4474.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (30)

package/binding.gyp CHANGED Viewed

@@ -80,6 +80,7 @@
                 "src/duckdb/ub_src_function_aggregate.cpp",
                 "src/duckdb/ub_src_function.cpp",
                 "src/duckdb/ub_src_function_cast.cpp",
+                "src/duckdb/ub_src_function_cast_union.cpp",
                 "src/duckdb/ub_src_function_pragma.cpp",
                 "src/duckdb/ub_src_function_scalar_compressed_materialization.cpp",
                 "src/duckdb/ub_src_function_scalar.cpp",

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.8.2-dev4376.0",
+  "version": "0.8.2-dev4474.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/extension/parquet/column_writer.cpp CHANGED Viewed

@@ -1825,7 +1825,7 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
 		}
 	}
-	if (type.id() == LogicalTypeId::STRUCT) {
+	if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
 		auto &child_types = StructType::GetChildTypes(type);
 		// set up the schema element for this struct
 		duckdb_parquet::format::SchemaElement schema_element;

package/src/duckdb/extension/parquet/include/parquet_writer.hpp CHANGED Viewed

@@ -15,6 +15,7 @@
 #include "duckdb/common/mutex.hpp"
 #include "duckdb/common/serializer/buffered_file_writer.hpp"
 #include "duckdb/common/types/column/column_data_collection.hpp"
+#include "duckdb/function/copy_function.hpp"
 #endif
 #include "column_writer.hpp"
@@ -75,11 +76,11 @@ public:
 		return *writer;
 	}
-	static bool TypeIsSupported(const LogicalType &type);
+	static CopyTypeSupport TypeIsSupported(const LogicalType &type);
 private:
-	static bool DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
-	                                            duckdb_parquet::format::Type::type &type);
+	static CopyTypeSupport DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
+	                                                       duckdb_parquet::format::Type::type &type);
 	string file_name;
 	vector<LogicalType> sql_types;
 	vector<string> column_names;

package/src/duckdb/extension/parquet/parquet_writer.cpp CHANGED Viewed

@@ -77,7 +77,8 @@ private:
 	WriteStream &serializer;
 };
-bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type, Type::type &parquet_type) {
+CopyTypeSupport ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
+                                                               Type::type &parquet_type) {
 	switch (duckdb_type.id()) {
 	case LogicalTypeId::BOOLEAN:
 		parquet_type = Type::BOOLEAN;
@@ -95,9 +96,11 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
 		parquet_type = Type::FLOAT;
 		break;
 	case LogicalTypeId::DOUBLE:
-	case LogicalTypeId::HUGEINT:
 		parquet_type = Type::DOUBLE;
 		break;
+	case LogicalTypeId::HUGEINT:
+		parquet_type = Type::DOUBLE;
+		return CopyTypeSupport::LOSSY;
 	case LogicalTypeId::ENUM:
 	case LogicalTypeId::BLOB:
 	case LogicalTypeId::VARCHAR:
@@ -141,47 +144,62 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
 		}
 		break;
 	default:
-		// Anything that is not supported returns false
-		return false;
+		// Anything that is not supported
+		return CopyTypeSupport::UNSUPPORTED;
 	}
-	return true;
+	return CopyTypeSupport::SUPPORTED;
 }
 Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type) {
 	Type::type result;
-	if (!DuckDBTypeToParquetTypeInternal(duckdb_type, result)) {
+	auto type_supports = DuckDBTypeToParquetTypeInternal(duckdb_type, result);
+	if (type_supports == CopyTypeSupport::UNSUPPORTED) {
 		throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
 	}
 	return result;
 }
-bool ParquetWriter::TypeIsSupported(const LogicalType &type) {
+CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
 	Type::type unused;
 	auto id = type.id();
 	if (id == LogicalTypeId::LIST) {
 		auto &child_type = ListType::GetChildType(type);
 		return TypeIsSupported(child_type);
 	}
+	if (id == LogicalTypeId::UNION) {
+		auto count = UnionType::GetMemberCount(type);
+		for (idx_t i = 0; i < count; i++) {
+			auto &member_type = UnionType::GetMemberType(type, i);
+			auto type_support = TypeIsSupported(member_type);
+			if (type_support != CopyTypeSupport::SUPPORTED) {
+				return type_support;
+			}
+		}
+		return CopyTypeSupport::SUPPORTED;
+	}
 	if (id == LogicalTypeId::STRUCT) {
 		auto &children = StructType::GetChildTypes(type);
 		for (auto &child : children) {
 			auto &child_type = child.second;
-			if (!TypeIsSupported(child_type)) {
-				return false;
+			auto type_support = TypeIsSupported(child_type);
+			if (type_support != CopyTypeSupport::SUPPORTED) {
+				return type_support;
 			}
 		}
-		return true;
+		return CopyTypeSupport::SUPPORTED;
 	}
 	if (id == LogicalTypeId::MAP) {
 		auto &key_type = MapType::KeyType(type);
 		auto &value_type = MapType::ValueType(type);
-		if (!TypeIsSupported(key_type)) {
-			return false;
+		auto key_type_support = TypeIsSupported(key_type);
+		if (key_type_support != CopyTypeSupport::SUPPORTED) {
+			return key_type_support;
 		}
-		if (!TypeIsSupported(value_type)) {
-			return false;
+		auto value_type_support = TypeIsSupported(value_type);
+		if (value_type_support != CopyTypeSupport::SUPPORTED) {
+			return value_type_support;
 		}
-		return true;
+		return CopyTypeSupport::SUPPORTED;
 	}
 	return DuckDBTypeToParquetTypeInternal(type, unused);
 }

package/src/duckdb/src/common/enum_util.cpp CHANGED Viewed

@@ -5974,6 +5974,8 @@ const char* EnumUtil::ToChars<UnionInvalidReason>(UnionInvalidReason value) {
 		return "NO_MEMBERS";
 	case UnionInvalidReason::VALIDITY_OVERLAP:
 		return "VALIDITY_OVERLAP";
+	case UnionInvalidReason::TAG_MISMATCH:
+		return "TAG_MISMATCH";
 	default:
 		throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
 	}
@@ -5993,6 +5995,9 @@ UnionInvalidReason EnumUtil::FromString<UnionInvalidReason>(const char *value) {
 	if (StringUtil::Equals(value, "VALIDITY_OVERLAP")) {
 		return UnionInvalidReason::VALIDITY_OVERLAP;
 	}
+	if (StringUtil::Equals(value, "TAG_MISMATCH")) {
+		return UnionInvalidReason::TAG_MISMATCH;
+	}
 	throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
 }

package/src/duckdb/src/common/types/date.cpp CHANGED Viewed

@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {
 int64_t Date::ExtractJulianDay(date_t date) {
 	// Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
-	static const auto JULIAN_EPOCH = -2440588;
+	static const int64_t JULIAN_EPOCH = -2440588;
 	return date.days - JULIAN_EPOCH;
 }

package/src/duckdb/src/common/types/vector.cpp CHANGED Viewed

@@ -2007,6 +2007,9 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector, idx_t count,
 					return UnionInvalidReason::VALIDITY_OVERLAP;
 				}
 				found_valid = true;
+				if (tag != static_cast<union_tag_t>(member_idx)) {
+					return UnionInvalidReason::TAG_MISMATCH;
+				}
 			}
 		}
 	}

package/src/duckdb/src/common/types.cpp CHANGED Viewed

@@ -398,7 +398,7 @@ string LogicalType::ToString() const {
 			if (i > 0) {
 				ret += ", ";
 			}
-			ret += "'" + KeywordHelper::WriteOptionallyQuoted(EnumType::GetString(*this, i).GetString(), '\'') + "'";
+			ret += KeywordHelper::WriteQuoted(EnumType::GetString(*this, i).GetString(), '\'');
 		}
 		ret += ")";
 		return ret;

package/src/duckdb/src/execution/index/fixed_size_buffer.cpp CHANGED Viewed

@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {
 uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
-	// this function calls Get() on the buffer, so the buffer must already be in memory
-	D_ASSERT(InMemory());
 	// get the bitmask data
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
 	ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
 uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
-	// this function calls Get() on the buffer, so the buffer must already be in memory
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 	// finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
 	}
 	// there are no allocations in this buffer
-	// FIXME: put this line back in and then fix the missing vacuum bug in
-	// FIXME: test_index_large_aborted_append.test with force_restart
-	// FIXME: test if we still have non-dirty buffer to serialize after fixing this
-	//	throw InternalException("tried to serialize empty buffer");
-	return 0;
+	throw InternalException("tried to serialize empty buffer");
 }
 void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
                                               const idx_t offset, const idx_t bitmask_offset) {
-	// this function calls Get() on the buffer, so the buffer must already be in memory
+	// this function calls Get() on the buffer
 	D_ASSERT(InMemory());
 	auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());

package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp CHANGED Viewed

@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
 						position_buffer++;
 					}
 					if (position_buffer > end_buffer) {
+						VerifyLineLength(position_buffer, buffer->batch_index);
 						return false;
 					}
 					SkipEmptyLines();
 					if (verification_positions.beginning_of_first_line == 0) {
 						verification_positions.beginning_of_first_line = position_buffer;
 					}
+					VerifyLineLength(position_buffer, buffer->batch_index);
 					verification_positions.end_of_last_line = position_buffer;
 					return true;
 				}
 			}
+			VerifyLineLength(position_buffer, buffer->batch_index);
 			return false;
 		}
 		SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
 			break;
 		}
-		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+		auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
+		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
 			break;
 		}
 		if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
-		    (*buffer)[position_buffer - 1] == '\n') {
+		    (*buffer)[pos_check] == '\n') {
 			break;
 		}
 		idx_t position_set = position_buffer;

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED Viewed

@@ -55,6 +55,9 @@ struct SniffDialect {
 		if (machine.state == CSVState::INVALID) {
 			return;
 		}
+		if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
+			sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
+		}
 		if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
 			sniffed_column_counts[machine.cur_rows++] = machine.column_count;
 		}

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp CHANGED Viewed

@@ -148,12 +148,18 @@ void CSVSniffer::DetectHeader() {
 			names.push_back(col_name);
 			name_collision_count[col_name] = 0;
 		}
+		if (best_header_row.size() < best_candidate->dialect_options.num_cols && options.null_padding) {
+			for (idx_t col = best_header_row.size(); col < best_candidate->dialect_options.num_cols; col++) {
+				names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
+			}
+		} else if (best_header_row.size() < best_candidate->dialect_options.num_cols) {
+			throw InternalException("Detected header has number of columns inferior to dialect detection");
+		}
 	} else {
 		best_candidate->dialect_options.header = false;
 		for (idx_t col = 0; col < best_candidate->dialect_options.num_cols; col++) {
-			string column_name = GenerateColumnName(best_candidate->dialect_options.num_cols, col);
-			names.push_back(column_name);
+			names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
 		}
 	}

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp CHANGED Viewed

@@ -183,6 +183,10 @@ struct SniffValue {
 	}
 	inline static void Finalize(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values) {
+		if (machine.cur_rows < sniffed_values.size() && machine.state == CSVState::DELIMITER) {
+			// Started a new empty value
+			sniffed_values[machine.cur_rows].values.push_back(Value(machine.value));
+		}
 		if (machine.cur_rows < sniffed_values.size() && machine.state != CSVState::EMPTY_LINE) {
 			machine.VerifyUTF8();
 			sniffed_values[machine.cur_rows].line_number = machine.rows_read;
@@ -300,7 +304,7 @@ void CSVSniffer::DetectTypes() {
 		// Potentially Skip Notes (I also find this dirty, but it is what the original code does)
 		while (true_start < tuples.size()) {
-			if (tuples[true_start].values.size() < max_columns_found) {
+			if (tuples[true_start].values.size() < max_columns_found && !options.null_padding) {
 				true_start = tuples[true_start].line_number;
 				values_start++;
 			} else {

package/src/duckdb/src/function/cast/union/from_struct.cpp ADDED Viewed

@@ -0,0 +1,114 @@
+#include "duckdb/function/cast/bound_cast_data.hpp"
+namespace duckdb {
+bool StructToUnionCast::AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target) {
+	if (source.id() != LogicalTypeId::STRUCT) {
+		return false;
+	}
+	auto target_fields = StructType::GetChildTypes(target);
+	auto fields = StructType::GetChildTypes(source);
+	if (target_fields.size() != fields.size()) {
+		// Struct should have the same amount of fields as the union
+		return false;
+	}
+	for (idx_t i = 0; i < target_fields.size(); i++) {
+		auto &target_field = target_fields[i].second;
+		auto &target_field_name = target_fields[i].first;
+		auto &field = fields[i].second;
+		auto &field_name = fields[i].first;
+		if (i == 0) {
+			// For the tag field we don't accept a type substitute as varchar
+			if (target_field != field) {
+				return false;
+			}
+			continue;
+		}
+		if (!StringUtil::CIEquals(target_field_name, field_name)) {
+			return false;
+		}
+		if (target_field != field && field != LogicalType::VARCHAR) {
+			// We allow the field to be VARCHAR, since unsupported types get cast to VARCHAR by EXPORT DATABASE (format
+			// PARQUET) i.e UNION(a BIT) becomes STRUCT(a VARCHAR)
+			return false;
+		}
+	}
+	return true;
+}
+// Physical Cast execution
+bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
+	auto &cast_data = parameters.cast_data->Cast<StructBoundCastData>();
+	auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
+	D_ASSERT(source.GetType().id() == LogicalTypeId::STRUCT);
+	D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
+	D_ASSERT(cast_data.target.id() == LogicalTypeId::UNION);
+	auto &source_children = StructVector::GetEntries(source);
+	auto &target_children = StructVector::GetEntries(result);
+	for (idx_t i = 0; i < source_children.size(); i++) {
+		auto &result_child_vector = *target_children[i];
+		auto &source_child_vector = *source_children[i];
+		CastParameters child_parameters(parameters, cast_data.child_cast_info[i].cast_data, lstate.local_states[i]);
+		auto converted =
+		    cast_data.child_cast_info[i].function(source_child_vector, result_child_vector, count, child_parameters);
+		(void)converted;
+		D_ASSERT(converted);
+	}
+	auto check_tags = UnionVector::CheckUnionValidity(result, count);
+	switch (check_tags) {
+	case UnionInvalidReason::TAG_OUT_OF_RANGE:
+		throw ConversionException("One or more of the tags do not point to a valid union member");
+	case UnionInvalidReason::VALIDITY_OVERLAP:
+		throw ConversionException("One or more rows in the produced UNION have validity set for more than 1 member");
+	case UnionInvalidReason::TAG_MISMATCH:
+		throw ConversionException(
+		    "One or more rows in the produced UNION have tags that don't point to the valid member");
+	case UnionInvalidReason::VALID:
+		break;
+	default:
+		throw InternalException("Struct to union cast failed for unknown reason");
+	}
+	if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+		result.SetVectorType(VectorType::CONSTANT_VECTOR);
+		ConstantVector::SetNull(result, ConstantVector::IsNull(source));
+	} else {
+		source.Flatten(count);
+		FlatVector::Validity(result) = FlatVector::Validity(source);
+	}
+	result.Verify(count);
+	return true;
+}
+// Bind cast
+unique_ptr<BoundCastData> StructToUnionCast::BindData(BindCastInput &input, const LogicalType &source,
+                                                      const LogicalType &target) {
+	vector<BoundCastInfo> child_cast_info;
+	D_ASSERT(source.id() == LogicalTypeId::STRUCT);
+	D_ASSERT(target.id() == LogicalTypeId::UNION);
+	auto result_child_count = StructType::GetChildCount(target);
+	D_ASSERT(result_child_count == StructType::GetChildCount(source));
+	for (idx_t i = 0; i < result_child_count; i++) {
+		auto &source_child = StructType::GetChildType(source, i);
+		auto &target_child = StructType::GetChildType(target, i);
+		auto child_cast = input.GetCastFunction(source_child, target_child);
+		child_cast_info.push_back(std::move(child_cast));
+	}
+	return make_uniq<StructBoundCastData>(std::move(child_cast_info), target);
+}
+BoundCastInfo StructToUnionCast::Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
+	auto cast_data = StructToUnionCast::BindData(input, source, target);
+	return BoundCastInfo(&StructToUnionCast::Cast, std::move(cast_data), StructBoundCastData::InitStructCastLocalState);
+}
+} // namespace duckdb

package/src/duckdb/src/function/cast/union_casts.cpp CHANGED Viewed

@@ -11,33 +11,10 @@ namespace duckdb {
 //--------------------------------------------------------------------------------------------------
 // if the source can be implicitly cast to a member of the target union, the cast is valid
-struct ToUnionBoundCastData : public BoundCastData {
-	ToUnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
-	                     BoundCastInfo member_cast_info)
-	    : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
-	      member_cast_info(std::move(member_cast_info)) {
-	}
-	union_tag_t tag;
-	string name;
-	LogicalType type;
-	int64_t cost;
-	BoundCastInfo member_cast_info;
-public:
-	unique_ptr<BoundCastData> Copy() const override {
-		return make_uniq<ToUnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
-	}
-	static bool SortByCostAscending(const ToUnionBoundCastData &left, const ToUnionBoundCastData &right) {
-		return left.cost < right.cost;
-	}
-};
 unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
 	D_ASSERT(target.id() == LogicalTypeId::UNION);
-	vector<ToUnionBoundCastData> candidates;
+	vector<UnionBoundCastData> candidates;
 	for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(target); member_idx++) {
 		auto member_type = UnionType::GetMemberType(target, member_idx);
@@ -68,7 +45,7 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
 	}
 	// sort the candidate casts by cost
-	std::sort(candidates.begin(), candidates.end(), ToUnionBoundCastData::SortByCostAscending);
+	std::sort(candidates.begin(), candidates.end(), UnionBoundCastData::SortByCostAscending);
 	// select the lowest possible cost cast
 	auto &selected_cast = candidates[0];
@@ -95,11 +72,11 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
 	}
 	// otherwise, return the selected cast
-	return make_uniq<ToUnionBoundCastData>(std::move(selected_cast));
+	return make_uniq<UnionBoundCastData>(std::move(selected_cast));
 }
 unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &parameters) {
-	auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
+	auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
 	if (!cast_data.member_cast_info.init_local_state) {
 		return nullptr;
 	}
@@ -109,7 +86,7 @@ unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &p
 static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
 	D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
-	auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
+	auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
 	auto &selected_member_vector = UnionVector::GetMember(result, cast_data.tag);
 	CastParameters child_parameters(parameters, cast_data.member_cast_info.cast_data, parameters.local_state);
@@ -127,7 +104,13 @@ static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParamet
 BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const LogicalType &source,
                                                 const LogicalType &target) {
-	return BoundCastInfo(&ToUnionCast, BindToUnionCast(input, source, target), InitToUnionLocalState);
+	D_ASSERT(target.id() == LogicalTypeId::UNION);
+	if (StructToUnionCast::AllowImplicitCastFromStruct(source, target)) {
+		return StructToUnionCast::Bind(input, source, target);
+	}
+	auto cast_data = BindToUnionCast(input, source, target);
+	return BoundCastInfo(&ToUnionCast, std::move(cast_data), InitToUnionLocalState);
 }
 //--------------------------------------------------------------------------------------------------
@@ -143,7 +126,7 @@ BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const Logi
 // INVALID:	UNION(A, B) 	->	UNION(A, C)		if B can't be implicitly cast to C
 // INVALID:	UNION(A, B, D) 	->	UNION(A, B, C)
-struct UnionToUnionBoundCastData : public BoundCastData {
+struct UnionUnionBoundCastData : public BoundCastData {
 	// mapping from source member index to target member index
 	// these are always the same size as the source member count
@@ -153,7 +136,7 @@ struct UnionToUnionBoundCastData : public BoundCastData {
 	LogicalType target_type;
-	UnionToUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
+	UnionUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
 	    : tag_map(std::move(tag_map)), member_casts(std::move(member_casts)), target_type(std::move(target_type)) {
 	}
@@ -163,7 +146,7 @@ public:
 		for (auto &member_cast : member_casts) {
 			member_casts_copy.push_back(member_cast.Copy());
 		}
-		return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
+		return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
 	}
 };
@@ -203,11 +186,11 @@ unique_ptr<BoundCastData> BindUnionToUnionCast(BindCastInput &input, const Logic
 		}
 	}
-	return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts), target);
+	return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts), target);
 }
 unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParameters &parameters) {
-	auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
+	auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
 	auto result = make_uniq<StructCastLocalState>();
 	for (auto &entry : cast_data.member_casts) {
@@ -222,7 +205,7 @@ unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParamete
 }
 static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
-	auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
+	auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
 	auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
 	auto source_member_count = UnionType::GetMemberCount(source.GetType());
@@ -313,7 +296,7 @@ static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastPa
 static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
 	auto constant = source.GetVectorType() == VectorType::CONSTANT_VECTOR;
 	// first cast all union members to varchar
-	auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
+	auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
 	Vector varchar_union(cast_data.target_type, count);
 	UnionToUnionCast(source, varchar_union, count, parameters);
@@ -356,6 +339,7 @@ static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, Cast
 BoundCastInfo DefaultCasts::UnionCastSwitch(BindCastInput &input, const LogicalType &source,
                                             const LogicalType &target) {
+	D_ASSERT(source.id() == LogicalTypeId::UNION);
 	switch (target.id()) {
 	case LogicalTypeId::VARCHAR: {
 		// bind a cast in which we convert all members to VARCHAR first

package/src/duckdb/src/function/table/read_csv.cpp CHANGED Viewed

@@ -300,7 +300,7 @@ public:
 	                       const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
 	                       bool force_parallelism_p, vector<column_t> column_ids_p)
 	    : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
-	      buffer_size(options.buffer_size), force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
+	      force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
 	      line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
 		current_file_path = files_path_p[0];
 		CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
 		first_file_size = file_size;
 		on_disk_file = file_handle_ptr->OnDiskFile();
 		bytes_read = 0;
-		if (buffer_size < file_size || file_size == 0) {
-			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
-		} else {
-			bytes_per_local_state = file_size / MaxThreads();
-		}
-		if (bytes_per_local_state == 0) {
-			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
-			// this boy needs to be at least one.
-			bytes_per_local_state = 1;
-		}
 		running_threads = MaxThreads();
 		// Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:
 	void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
-	void IncrementThread();
 	void DecrementThread();
 	bool Finished();
@@ -402,16 +390,12 @@ private:
 	mutex main_mutex;
 	//! Byte set from for last thread
 	idx_t next_byte = 0;
-	//! How many bytes we should execute per local state
-	idx_t bytes_per_local_state;
 	//! Size of first file
 	idx_t first_file_size = 0;
 	//! Whether or not this is an on-disk file
 	bool on_disk_file = true;
 	//! Basically max number of threads in DuckDB
 	idx_t system_threads;
-	//! Size of the buffers
-	idx_t buffer_size;
 	//! Current batch index
 	idx_t batch_index = 0;
 	idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	return system_threads;
 }
-void ParallelCSVGlobalState::IncrementThread() {
-	lock_guard<mutex> parallel_lock(main_mutex);
-	running_threads++;
-}
 void ParallelCSVGlobalState::DecrementThread() {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	}
 	// set up the current buffer
 	line_info.current_batches[file_index - 1].insert(local_batch_index);
+	idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
 	auto result = make_uniq<CSVBufferRead>(
 	    buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
 	    next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
 	if (StringUtil::EndsWith(lower_name, ".gz")) {
 		lower_name = lower_name.substr(0, lower_name.size() - 3);
 	} else if (StringUtil::EndsWith(lower_name, ".zst")) {
+		if (!Catalog::TryAutoLoad(context, "parquet")) {
+			throw MissingExtensionException("parquet extension is required for reading zst compressed file");
+		}
 		lower_name = lower_name.substr(0, lower_name.size() - 4);
 	}
 	if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED Viewed

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4376"
+#define DUCKDB_VERSION "0.8.2-dev4474"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "312b995450"
+#define DUCKDB_SOURCE_ID "ba71015ee7"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/common/types/vector.hpp CHANGED Viewed

@@ -447,7 +447,7 @@ struct StructVector {
 	DUCKDB_API static vector<unique_ptr<Vector>> &GetEntries(Vector &vector);
 };
-enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP };
+enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP, TAG_MISMATCH };
 struct UnionVector {
 	// Unions are stored as structs, but the first child is always the "tag"

package/src/duckdb/src/include/duckdb/function/cast/bound_cast_data.hpp CHANGED Viewed

@@ -81,4 +81,36 @@ public:
 	unique_ptr<FunctionLocalState> value_state;
 };
+struct UnionBoundCastData : public BoundCastData {
+	UnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
+	                   BoundCastInfo member_cast_info)
+	    : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
+	      member_cast_info(std::move(member_cast_info)) {
+	}
+	union_tag_t tag;
+	string name;
+	LogicalType type;
+	int64_t cost;
+	BoundCastInfo member_cast_info;
+public:
+	unique_ptr<BoundCastData> Copy() const override {
+		return make_uniq<UnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
+	}
+	static bool SortByCostAscending(const UnionBoundCastData &left, const UnionBoundCastData &right) {
+		return left.cost < right.cost;
+	}
+};
+struct StructToUnionCast {
+public:
+	static bool AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target);
+	static bool Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters);
+	static unique_ptr<BoundCastData> BindData(BindCastInput &input, const LogicalType &source,
+	                                          const LogicalType &target);
+	static BoundCastInfo Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target);
+};
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/function/copy_function.hpp CHANGED Viewed

@@ -99,7 +99,9 @@ typedef void (*copy_flush_batch_t)(ClientContext &context, FunctionData &bind_da
                                    PreparedBatchData &batch);
 typedef idx_t (*copy_desired_batch_size_t)(ClientContext &context, FunctionData &bind_data);
-typedef bool (*copy_supports_type_t)(const LogicalType &type);
+enum class CopyTypeSupport { SUPPORTED, LOSSY, UNSUPPORTED };
+typedef CopyTypeSupport (*copy_supports_type_t)(const LogicalType &type);
 class CopyFunction : public Function {
 public:

package/src/duckdb/src/include/duckdb/main/query_result.hpp CHANGED Viewed

@@ -40,7 +40,7 @@ public:
 	vector<string> names;
 public:
-	DUCKDB_API void ThrowError(const string &prepended_message = "") const;
+	[[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
 	DUCKDB_API void SetError(PreservedError error);
 	DUCKDB_API bool HasError() const;
 	DUCKDB_API const ExceptionType &GetErrorType() const;

package/src/duckdb/src/parser/keyword_helper.cpp CHANGED Viewed

@@ -36,7 +36,7 @@ string KeywordHelper::EscapeQuotes(const string &text, char quote) {
 string KeywordHelper::WriteQuoted(const string &text, char quote) {
 	// 1. Escapes all occurrences of 'quote' by doubling them (escape in SQL)
 	// 2. Adds quotes around the string
-	return string(1, quote) + EscapeQuotes(text) + string(1, quote);
+	return string(1, quote) + EscapeQuotes(text, quote) + string(1, quote);
 }
 string KeywordHelper::WriteOptionallyQuoted(const string &text, char quote, bool allow_caps) {

package/src/duckdb/src/planner/binder/statement/bind_export.cpp CHANGED Viewed

@@ -115,7 +115,95 @@ string CreateFileName(const string &id_suffix, TableCatalogEntry &table, const s
 	return StringUtil::Format("%s_%s%s.%s", schema, name, id_suffix, extension);
 }
-unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_ptr<ParsedExpression>> select_list) {
+static bool IsSupported(CopyTypeSupport support_level) {
+	// For export purposes we don't want to lose information, so we only accept fully supported types
+	return support_level == CopyTypeSupport::SUPPORTED;
+}
+static LogicalType AlterLogicalType(const LogicalType &original, copy_supports_type_t type_check) {
+	D_ASSERT(type_check);
+	auto id = original.id();
+	switch (id) {
+	case LogicalTypeId::LIST: {
+		auto child = AlterLogicalType(ListType::GetChildType(original), type_check);
+		return LogicalType::LIST(child);
+	}
+	case LogicalTypeId::STRUCT: {
+		auto &original_children = StructType::GetChildTypes(original);
+		child_list_t<LogicalType> new_children;
+		for (auto &child : original_children) {
+			auto &child_name = child.first;
+			auto &child_type = child.second;
+			LogicalType new_type;
+			if (!IsSupported(type_check(child_type))) {
+				new_type = AlterLogicalType(child_type, type_check);
+			} else {
+				new_type = child_type;
+			}
+			new_children.push_back(std::make_pair(child_name, new_type));
+		}
+		return LogicalType::STRUCT(std::move(new_children));
+	}
+	case LogicalTypeId::UNION: {
+		auto member_count = UnionType::GetMemberCount(original);
+		child_list_t<LogicalType> new_children;
+		for (idx_t i = 0; i < member_count; i++) {
+			auto &child_name = UnionType::GetMemberName(original, i);
+			auto &child_type = UnionType::GetMemberType(original, i);
+			LogicalType new_type;
+			if (!IsSupported(type_check(child_type))) {
+				new_type = AlterLogicalType(child_type, type_check);
+			} else {
+				new_type = child_type;
+			}
+			new_children.push_back(std::make_pair(child_name, new_type));
+		}
+		return LogicalType::UNION(std::move(new_children));
+	}
+	case LogicalTypeId::MAP: {
+		auto &key_type = MapType::KeyType(original);
+		auto &value_type = MapType::ValueType(original);
+		LogicalType new_key_type;
+		LogicalType new_value_type;
+		if (!IsSupported(type_check(key_type))) {
+			new_key_type = AlterLogicalType(key_type, type_check);
+		} else {
+			new_key_type = key_type;
+		}
+		if (!IsSupported(type_check(value_type))) {
+			new_value_type = AlterLogicalType(value_type, type_check);
+		} else {
+			new_value_type = value_type;
+		}
+		return LogicalType::MAP(new_key_type, new_value_type);
+	}
+	default: {
+		D_ASSERT(!IsSupported(type_check(original)));
+		return LogicalType::VARCHAR;
+	}
+	}
+}
+static bool NeedsCast(LogicalType &type, copy_supports_type_t type_check) {
+	if (!type_check) {
+		return false;
+	}
+	if (IsSupported(type_check(type))) {
+		// The type is supported in its entirety, no cast is required
+		return false;
+	}
+	// Change the type to something that is supported
+	type = AlterLogicalType(type, type_check);
+	return true;
+}
+static unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, child_list_t<LogicalType> &select_list,
+                                                   copy_supports_type_t type_check) {
 	auto ref = make_uniq<BaseTableRef>();
 	ref->catalog_name = stmt.info->catalog;
 	ref->schema_name = stmt.info->schema;
@@ -123,7 +211,21 @@ unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_p
 	auto statement = make_uniq<SelectNode>();
 	statement->from_table = std::move(ref);
-	statement->select_list = std::move(select_list);
+	vector<unique_ptr<ParsedExpression>> expressions;
+	for (auto &col : select_list) {
+		auto &name = col.first;
+		auto &type = col.second;
+		auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(name);
+		if (NeedsCast(type, type_check)) {
+			// Add a cast to a type supported by the copy function
+			expression = make_uniq_base<ParsedExpression, CastExpression>(type, std::move(expression));
+		}
+		expressions.push_back(std::move(expression));
+	}
+	statement->select_list = std::move(expressions);
 	return std::move(statement);
 }
@@ -194,16 +296,10 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
 		info->table = table.name;
 		// We can not export generated columns
-		vector<unique_ptr<ParsedExpression>> expressions;
+		child_list_t<LogicalType> select_list;
 		for (auto &col : table.GetColumns().Physical()) {
-			auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(col.GetName());
-			auto is_supported = copy_function.function.supports_type;
-			if (is_supported && !is_supported(col.Type())) {
-				expression =
-				    make_uniq_base<ParsedExpression, CastExpression>(LogicalType::VARCHAR, std::move(expression));
-			}
-			expressions.push_back(std::move(expression));
-			info->select_list.push_back(col.GetName());
+			select_list.push_back(std::make_pair(col.Name(), col.Type()));
 		}
 		ExportedTableData exported_data;
@@ -220,7 +316,8 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
 		// generate the copy statement and bind it
 		CopyStatement copy_stmt;
 		copy_stmt.info = std::move(info);
-		copy_stmt.select_statement = CreateSelectStatement(copy_stmt, std::move(expressions));
+		copy_stmt.select_statement =
+		    CreateSelectStatement(copy_stmt, select_list, copy_function.function.supports_type);
 		auto copy_binder = Binder::CreateBinder(context, this);
 		auto bound_statement = copy_binder->Bind(copy_stmt);

package/src/duckdb/src/storage/checkpoint_manager.cpp CHANGED Viewed

@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &
 void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {
-	// Deserialize the index metadata
-	auto info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
-	auto &index_info = info->Cast<CreateIndexInfo>();
-	// Create the index in the catalog
-	auto &schema_catalog = catalog.GetSchema(context, info->schema);
-	auto &table_catalog =
-	    catalog.GetEntry(context, CatalogType::TABLE_ENTRY, info->schema, index_info.table).Cast<DuckTableEntry>();
-	auto &index_catalog = schema_catalog.CreateIndex(context, index_info, table_catalog)->Cast<DuckIndexEntry>();
-	index_catalog.info = table_catalog.GetStorage().info;
-	// We deserialize the index lazily, i.e., we do not need to load any node information
+	// deserialize the index create info
+	auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
+	auto &info = create_info->Cast<CreateIndexInfo>();
+	// create the index in the catalog
+	auto &schema = catalog.GetSchema(context, create_info->schema);
+	auto &table =
+	    catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
+	auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
+	index.info = table.GetStorage().info;
+	// insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
+	// checkpoints
+	for (auto &parsed_expr : info.parsed_expressions) {
+		index.parsed_expressions.push_back(parsed_expr->Copy());
+	}
+	// we deserialize the index lazily, i.e., we do not need to load any node information
 	// except the root block pointer
-	auto index_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
+	auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
-	// obtain the expressions of the ART from the index metadata
-	vector<unique_ptr<Expression>> unbound_expressions;
+	// obtain the parsed expressions of the ART from the index metadata
 	vector<unique_ptr<ParsedExpression>> parsed_expressions;
-	for (auto &p_exp : index_info.parsed_expressions) {
-		parsed_expressions.push_back(p_exp->Copy());
+	for (auto &parsed_expr : info.parsed_expressions) {
+		parsed_expressions.push_back(parsed_expr->Copy());
 	}
+	D_ASSERT(!parsed_expressions.empty());
-	// bind the parsed expressions
-	// add the table to the bind context
+	// add the table to the bind context to bind the parsed expressions
 	auto binder = Binder::CreateBinder(context);
 	vector<LogicalType> column_types;
 	vector<string> column_names;
-	for (auto &col : table_catalog.GetColumns().Logical()) {
+	for (auto &col : table.GetColumns().Logical()) {
 		column_types.push_back(col.Type());
 		column_names.push_back(col.Name());
 	}
+	// create a binder to bind the parsed expressions
 	vector<column_t> column_ids;
-	binder->bind_context.AddBaseTable(0, index_info.table, column_names, column_types, column_ids, &table_catalog);
+	binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
 	IndexBinder idx_binder(*binder, context);
+	// bind the parsed expressions to create unbound expressions
+	vector<unique_ptr<Expression>> unbound_expressions;
 	unbound_expressions.reserve(parsed_expressions.size());
 	for (auto &expr : parsed_expressions) {
 		unbound_expressions.push_back(idx_binder.Bind(expr));
 	}
-	if (parsed_expressions.empty()) {
-		// this is a PK/FK index: we create the necessary bound column ref expressions
-		unbound_expressions.reserve(index_info.column_ids.size());
-		for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
-			auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
-			unbound_expressions.push_back(
-			    make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
-		}
-	}
 	// create the index and add it to the storage
-	switch (index_info.index_type) {
+	switch (info.index_type) {
 	case IndexType::ART: {
-		auto &storage = table_catalog.GetStorage();
-		auto art = make_uniq<ART>(index_info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
-		                          index_info.constraint_type, storage.db, nullptr, index_block_pointer);
+		auto &storage = table.GetStorage();
+		auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
+		                          info.constraint_type, storage.db, nullptr, root_block_pointer);
-		index_catalog.index = art.get();
+		index.index = art.get();
 		storage.info->indexes.AddIndex(std::move(art));
 	} break;
 	default:

package/src/duckdb/src/storage/compression/rle.cpp CHANGED Viewed

@@ -292,7 +292,11 @@ void RLESkip(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count) {
 	scan_state.Skip(segment, skip_count);
 }
+template <bool ENTIRE_VECTOR>
 static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_count) {
+	if (!ENTIRE_VECTOR) {
+		return false;
+	}
 	if (scan_count != STANDARD_VECTOR_SIZE) {
 		// Only when we can fill an entire Vector can we emit a ConstantVector, because subsequent scans require the
 		// input Vector to be flat
@@ -330,9 +334,9 @@ static void RLEScanConstant(RLEScanState<T> &scan_state, rle_count_t *index_poin
 	return;
 }
-template <class T>
-void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
-                    idx_t result_offset) {
+template <class T, bool ENTIRE_VECTOR>
+void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
+                            idx_t result_offset) {
 	auto &scan_state = state.scan_state->Cast<RLEScanState<T>>();
 	auto data = scan_state.handle.Ptr() + segment.GetBlockOffset();
@@ -340,7 +344,8 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
 	auto index_pointer = reinterpret_cast<rle_count_t *>(data + scan_state.rle_count_offset);
 	// If we are scanning an entire Vector and it contains only a single run
-	if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], scan_count)) {
+	if (CanEmitConstantVector<ENTIRE_VECTOR>(scan_state.position_in_entry, index_pointer[scan_state.entry_pos],
+	                                         scan_count)) {
 		RLEScanConstant<T>(scan_state, index_pointer, data_pointer, scan_count, result);
 		return;
 	}
@@ -357,9 +362,15 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
 	}
 }
+template <class T>
+void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
+                    idx_t result_offset) {
+	return RLEScanPartialInternal<T, false>(segment, state, scan_count, result, result_offset);
+}
 template <class T>
 void RLEScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result) {
-	RLEScanPartial<T>(segment, state, scan_count, result, 0);
+	RLEScanPartialInternal<T, true>(segment, state, scan_count, result, 0);
 }
 //===--------------------------------------------------------------------===//

package/src/duckdb/src/storage/local_storage.cpp CHANGED Viewed

@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
 		    AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
 	}
 	if (error) {
-		// need to revert the append
+		// need to revert all appended row ids
 		row_t current_row = append_state.row_start;
 		// remove the data from the indexes, if there are any indexes
 		row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
 		if (append_to_table) {
 			table.RevertAppendInternal(append_state.row_start, append_count);
 		}
+		// we need to vacuum the indexes to remove any buffers that are now empty
+		// due to reverting the appends
+		table.info->indexes.Scan([&](Index &index) {
+			index.Vacuum();
+			return false;
+		});
 		error.Throw();
 	}
 }

package/src/duckdb/ub_src_function_cast_union.cpp ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ #include "src/function/cast/union/from_struct.cpp"
2	+

package/test/prepare.test.ts CHANGED Viewed

@@ -652,7 +652,16 @@ describe('prepare', function() {
             });
             it("should aggregate kurtosis(num)", function (done) {
                 db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
-                    assert.equal(res[0].kurtosis, -1.1999999999999997);
+                    // The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
+                    // This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
+                    // See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
+                    const expected = -1.2;
+                    // The calculated value can differ from the exact answer by small amounts on different platforms due
+                    // to floating-point errors. This tolerance was determined experimentally.
+                    const tolerance = Number.EPSILON * 10;
+                    assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
                     done(err);
                 });
             });

package/test/test_all_types.test.ts CHANGED Viewed

@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
   date_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T00:00:00.000Z"),
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
   timestamp_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T23:59:59.999Z"),
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
   timestamptz_array: [
     [],
     [
-      new Date(1970, 0, 1),
+      new Date(Date.UTC(1970, 0, 1)),
       null,
       new Date("0001-01-01T00:00:00.000Z"),
       new Date("9999-12-31T23:59:59.999Z"),
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
   ],
   timestamp: [
-    new Date("1990-01-01T00:00"),
+    new Date(Date.UTC(1990, 0, 1)),
     new Date("9999-12-31T23:59:59.000Z"),
     null,
   ],