npm - duckdb - Versions diffs - 0.8.2-dev4711.0 → 0.8.2-dev5002.0 - Mend

duckdb 0.8.2-dev4711.0 → 0.8.2-dev5002.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

package/src/duckdb/src/common/row_operations/row_matcher.cpp ADDED Viewed

@@ -0,0 +1,375 @@
+#include "duckdb/common/row_operations/row_matcher.hpp"
+#include "duckdb/common/enum_util.hpp"
+#include "duckdb/common/exception.hpp"
+#include "duckdb/common/types/row/tuple_data_collection.hpp"
+namespace duckdb {
+using ValidityBytes = TupleDataLayout::ValidityBytes;
+template <bool NO_MATCH_SEL, class T, class OP> /*
+ * Filters `sel` by comparing one column of values of type T between the LHS vector data described by
+ * `lhs_format` and the RHS rows pointed to by `rhs_row_locations`, using ComparisonOperationWrapper<OP>.
+ *
+ * The first `count` entries of `sel` are visited. Entries for which the comparison returns true are written
+ * back to the front of `sel`. When NO_MATCH_SEL is true, every other entry is appended to `*no_match_sel` at
+ * position `no_match_count`, which is advanced accordingly.
+ * The unnamed Vector and vector<MatchFunction> parameters are not used by this function.
+ * Returns the number of entries kept in `sel`.
+ */
+static idx_t TemplatedMatch(Vector &, const TupleDataVectorFormat &lhs_format, SelectionVector &sel, const idx_t count,
+                            const TupleDataLayout &rhs_layout, Vector &rhs_row_locations, const idx_t col_idx,
+                            const vector<MatchFunction> &, SelectionVector *no_match_sel, idx_t &no_match_count) {
+	using COMPARISON_OP = ComparisonOperationWrapper<OP>;
+	// LHS
+	const auto &lhs_sel = *lhs_format.unified.sel;
+	const auto lhs_data = UnifiedVectorFormat::GetData<T>(lhs_format.unified);
+	const auto &lhs_validity = lhs_format.unified.validity;
+	// RHS
+	const auto rhs_locations = FlatVector::GetData<data_ptr_t>(rhs_row_locations);
+	const auto rhs_offset_in_row = rhs_layout.GetOffsets()[col_idx]; // byte offset added to a row pointer below
+	idx_t entry_idx;
+	idx_t idx_in_entry;
+	ValidityBytes::GetEntryIndex(col_idx, entry_idx, idx_in_entry); // filled once; used for every RHS validity test
+	idx_t match_count = 0;
+	for (idx_t i = 0; i < count; i++) {
+		const auto idx = sel.get_index(i);
+		const auto lhs_idx = lhs_sel.get_index(idx); // LHS goes through its own selection vector
+		const auto lhs_null = lhs_validity.AllValid() ? false : !lhs_validity.RowIsValid(lhs_idx);
+		const auto &rhs_location = rhs_locations[idx]; // row pointers are indexed by the sel entry, not by i
+		const ValidityBytes rhs_mask(rhs_location); // mask is constructed over the row pointer itself
+		const auto rhs_null = !rhs_mask.RowIsValid(rhs_mask.GetValidityEntryUnsafe(entry_idx), idx_in_entry);
+		if (COMPARISON_OP::template Operation<T>(lhs_data[lhs_idx], Load<T>(rhs_location + rhs_offset_in_row), lhs_null,
+		                                         rhs_null)) {
+			sel.set_index(match_count++, idx); // in-place compaction: match_count never exceeds i
+		} else if (NO_MATCH_SEL) {
+			no_match_sel->set_index(no_match_count++, idx); // no_match_sel is only dereferenced on this path
+		}
+	}
+	return match_count;
+}
+template <bool NO_MATCH_SEL, class OP> /*
+ * Match function for a STRUCT column, carried out in two stages.
+ *
+ * Stage 1 filters the first `count` entries of `sel` on the NULL-ness of the struct value itself (see the
+ * condition inside the loop). Rejected entries are appended to `*no_match_sel` when NO_MATCH_SEL is true.
+ * Stage 2 builds a vector of pointers to the nested struct data of each surviving row (row pointer plus the
+ * column offset) and then calls `child_functions[i]` for every struct field against
+ * `rhs_layout.GetStructLayout(col_idx)`. Each call receives the current `sel` and match count and returns the
+ * new match count. The same `no_match_sel` / `no_match_count` are handed to every child function.
+ *
+ * Returns the number of entries left in `sel` after the last field.
+ */
+static idx_t StructMatchEquality(Vector &lhs_vector, const TupleDataVectorFormat &lhs_format, SelectionVector &sel,
+                                 const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                                 const idx_t col_idx, const vector<MatchFunction> &child_functions,
+                                 SelectionVector *no_match_sel, idx_t &no_match_count) {
+	using COMPARISON_OP = ComparisonOperationWrapper<OP>;
+	// LHS
+	const auto &lhs_sel = *lhs_format.unified.sel;
+	const auto &lhs_validity = lhs_format.unified.validity;
+	// RHS
+	const auto rhs_locations = FlatVector::GetData<data_ptr_t>(rhs_row_locations);
+	idx_t entry_idx;
+	idx_t idx_in_entry;
+	ValidityBytes::GetEntryIndex(col_idx, entry_idx, idx_in_entry);
+	idx_t match_count = 0;
+	for (idx_t i = 0; i < count; i++) {
+		const auto idx = sel.get_index(i);
+		const auto lhs_idx = lhs_sel.get_index(idx);
+		const auto lhs_null = lhs_validity.AllValid() ? false : !lhs_validity.RowIsValid(lhs_idx);
+		const auto &rhs_location = rhs_locations[idx];
+		const ValidityBytes rhs_mask(rhs_location);
+		const auto rhs_null = !rhs_mask.RowIsValid(rhs_mask.GetValidityEntryUnsafe(entry_idx), idx_in_entry);
+		// For structs there is no value to compare, here we match NULLs and let recursion do the rest
+		// So we use the comparison only if RHS or LHS is NULL and COMPARE_NULL is true
+		if (!(lhs_null || rhs_null) ||
+		    (COMPARISON_OP::COMPARE_NULL && COMPARISON_OP::template Operation<uint32_t>(0, 0, lhs_null, rhs_null))) {
+			sel.set_index(match_count++, idx); // the 0, 0 operands above are placeholders; only the NULL flags vary
+		} else if (NO_MATCH_SEL) {
+			no_match_sel->set_index(no_match_count++, idx);
+		}
+	}
+	// Create a Vector of pointers to the start of the TupleDataLayout of the STRUCT
+	Vector rhs_struct_row_locations(LogicalType::POINTER);
+	const auto rhs_offset_in_row = rhs_layout.GetOffsets()[col_idx];
+	auto rhs_struct_locations = FlatVector::GetData<data_ptr_t>(rhs_struct_row_locations);
+	for (idx_t i = 0; i < match_count; i++) {
+		const auto idx = sel.get_index(i);
+		rhs_struct_locations[idx] = rhs_locations[idx] + rhs_offset_in_row; // only slots named by sel are filled
+	}
+	// Get the struct layout and struct entries
+	const auto &rhs_struct_layout = rhs_layout.GetStructLayout(col_idx);
+	auto &lhs_struct_vectors = StructVector::GetEntries(lhs_vector);
+	D_ASSERT(rhs_struct_layout.ColumnCount() == lhs_struct_vectors.size());
+	for (idx_t struct_col_idx = 0; struct_col_idx < rhs_struct_layout.ColumnCount(); struct_col_idx++) {
+		auto &lhs_struct_vector = *lhs_struct_vectors[struct_col_idx];
+		auto &lhs_struct_format = lhs_format.children[struct_col_idx];
+		const auto &child_function = child_functions[struct_col_idx]; // one child function per struct field
+		match_count = child_function.function(lhs_struct_vector, lhs_struct_format, sel, match_count, rhs_struct_layout,
+		                                      rhs_struct_row_locations, struct_col_idx, child_function.child_functions,
+		                                      no_match_sel, no_match_count);
+	}
+	return match_count;
+}
+template <typename OP> /*
+ * Primary template of SelectComparison: used for any OP that has no explicit specialization.
+ * It ignores all arguments and always throws NotImplementedException.
+ */
+static idx_t SelectComparison(Vector &, Vector &, const SelectionVector &, idx_t, SelectionVector *,
+                              SelectionVector *) {
+	throw NotImplementedException("Unsupported list comparison operand for RowMatcher::GetMatchFunction");
+}
+template <> // Equals: forwards all arguments to VectorOperations::NestedEquals (sel is passed by reference)
+idx_t SelectComparison<Equals>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                               SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::NestedEquals(left, right, sel, count, true_sel, false_sel);
+}
+template <> // NotEquals: forwards all arguments to VectorOperations::NestedNotEquals (sel is passed by reference)
+idx_t SelectComparison<NotEquals>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                  SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::NestedNotEquals(left, right, sel, count, true_sel, false_sel);
+}
+template <> // DistinctFrom: forwards to VectorOperations::DistinctFrom (sel is passed as a pointer)
+idx_t SelectComparison<DistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                     SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::DistinctFrom(left, right, &sel, count, true_sel, false_sel);
+}
+template <> // NotDistinctFrom: forwards to VectorOperations::NotDistinctFrom (sel is passed as a pointer)
+idx_t SelectComparison<NotDistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                        SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::NotDistinctFrom(left, right, &sel, count, true_sel, false_sel);
+}
+template <> // GreaterThan: forwards to VectorOperations::DistinctGreaterThan (sel is passed as a pointer)
+idx_t SelectComparison<GreaterThan>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                    SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::DistinctGreaterThan(left, right, &sel, count, true_sel, false_sel);
+}
+template <> // GreaterThanEquals: forwards to VectorOperations::DistinctGreaterThanEquals (sel is passed as a pointer)
+idx_t SelectComparison<GreaterThanEquals>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                          SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::DistinctGreaterThanEquals(left, right, &sel, count, true_sel, false_sel);
+}
+template <> // LessThan: forwards to VectorOperations::DistinctLessThan (sel is passed as a pointer)
+idx_t SelectComparison<LessThan>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                 SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::DistinctLessThan(left, right, &sel, count, true_sel, false_sel);
+}
+template <> // LessThanEquals: forwards to VectorOperations::DistinctLessThanEquals (sel is passed as a pointer)
+idx_t SelectComparison<LessThanEquals>(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
+                                       SelectionVector *true_sel, SelectionVector *false_sel) {
+	return VectorOperations::DistinctLessThanEquals(left, right, &sel, count, true_sel, false_sel);
+}
+template <bool NO_MATCH_SEL, class OP> /*
+ * Match function that compares a nested column as a whole through SelectComparison<OP>.
+ *
+ * It gathers RHS column `col_idx` for the `count` rows named by `sel` into a temporary Vector, slices
+ * `lhs_vector` by `sel`, and calls SelectComparison<OP> with `sel` as the true-selection output.
+ * When NO_MATCH_SEL is true, a selection vector that starts `no_match_count` entries into `*no_match_sel` is
+ * passed as the false-selection output, and `no_match_count` is increased by `count - match_count`.
+ * The unnamed TupleDataVectorFormat and vector<MatchFunction> parameters are not used.
+ * Returns the value returned by SelectComparison<OP>.
+ */
+static idx_t GenericNestedMatch(Vector &lhs_vector, const TupleDataVectorFormat &, SelectionVector &sel,
+                                const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                                const idx_t col_idx, const vector<MatchFunction> &, SelectionVector *no_match_sel,
+                                idx_t &no_match_count) {
+	const auto &type = rhs_layout.GetTypes()[col_idx];
+	// Gather a dense Vector containing the column values being matched
+	Vector key(type);
+	const auto gather_function = TupleDataCollection::GetGatherFunction(type);
+	gather_function.function(rhs_layout, rhs_row_locations, col_idx, sel, count, key, // NOTE(review): `key` is passed
+	                         *FlatVector::IncrementalSelectionVector(), key, gather_function.child_functions); // twice; confirm the role of the second one
+	// Densify the input column
+	Vector sliced(lhs_vector, sel, count);
+	if (NO_MATCH_SEL) {
+		SelectionVector no_match_sel_offset(no_match_sel->data() + no_match_count); // skips entries already recorded
+		auto match_count = SelectComparison<OP>(sliced, key, sel, count, &sel, &no_match_sel_offset); // sel is input and output
+		no_match_count += count - match_count; // everything that did not match is counted as a no-match
+		return match_count;
+	}
+	return SelectComparison<OP>(sliced, key, sel, count, &sel, nullptr); // no false-selection output requested
+}
+void RowMatcher::Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates) { /*
+	 * Appends one MatchFunction per entry of `predicates` to `match_functions`. Predicate i is paired with the
+	 * type of layout column i, so predicates are taken to line up with the leading columns of `layout`.
+	 * `no_match_sel` selects the variants that also record non-matching rows.
+	 * Note: `match_functions` is appended to here and is not cleared first.
+	 */
+	match_functions.reserve(predicates.size());
+	for (idx_t col_idx = 0; col_idx < predicates.size(); col_idx++) {
+		match_functions.push_back(GetMatchFunction(no_match_sel, layout.GetTypes()[col_idx], predicates[col_idx]));
+	}
+}
+idx_t RowMatcher::Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel,
+                        idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                        SelectionVector *no_match_sel, idx_t &no_match_count) { /*
+	 * Runs the stored match functions one column after another. Each call receives the current `sel` and
+	 * `count` and returns the count that the next column starts from. `no_match_sel` and `no_match_count`
+	 * are passed through to every match function unchanged.
+	 * Asserts (debug) that `match_functions` is not empty. Returns the count after the last column.
+	 */
+	D_ASSERT(!match_functions.empty());
+	for (idx_t col_idx = 0; col_idx < match_functions.size(); col_idx++) {
+		const auto &match_function = match_functions[col_idx];
+		count = // the same col_idx indexes lhs.data, lhs_formats and the RHS layout column
+		    match_function.function(lhs.data[col_idx], lhs_formats[col_idx], sel, count, rhs_layout, rhs_row_locations,
+		                            col_idx, match_function.child_functions, no_match_sel, no_match_count);
+	}
+	return count;
+}
+MatchFunction RowMatcher::GetMatchFunction(const bool no_match_sel, const LogicalType &type,
+                                           const ExpressionType predicate) { // runtime entry point
+	// Turn the runtime flag into the NO_MATCH_SEL template argument of the type-dispatching overload
+	return no_match_sel ? GetMatchFunction<true>(type, predicate) : GetMatchFunction<false>(type, predicate);
+}
+template <bool NO_MATCH_SEL> /*
+ * Chooses a MatchFunction from the physical type of `type`.
+ * The scalar physical types listed below are forwarded to the overload templated on the matching C++ type,
+ * STRUCT goes to GetStructMatchFunction and LIST to GetListMatchFunction.
+ * Any other physical type throws InternalException.
+ */
+MatchFunction RowMatcher::GetMatchFunction(const LogicalType &type, const ExpressionType predicate) {
+	switch (type.InternalType()) {
+	case PhysicalType::BOOL:
+		return GetMatchFunction<NO_MATCH_SEL, bool>(predicate);
+	case PhysicalType::INT8:
+		return GetMatchFunction<NO_MATCH_SEL, int8_t>(predicate);
+	case PhysicalType::INT16:
+		return GetMatchFunction<NO_MATCH_SEL, int16_t>(predicate);
+	case PhysicalType::INT32:
+		return GetMatchFunction<NO_MATCH_SEL, int32_t>(predicate);
+	case PhysicalType::INT64:
+		return GetMatchFunction<NO_MATCH_SEL, int64_t>(predicate);
+	case PhysicalType::INT128:
+		return GetMatchFunction<NO_MATCH_SEL, hugeint_t>(predicate);
+	case PhysicalType::UINT8:
+		return GetMatchFunction<NO_MATCH_SEL, uint8_t>(predicate);
+	case PhysicalType::UINT16:
+		return GetMatchFunction<NO_MATCH_SEL, uint16_t>(predicate);
+	case PhysicalType::UINT32:
+		return GetMatchFunction<NO_MATCH_SEL, uint32_t>(predicate);
+	case PhysicalType::UINT64:
+		return GetMatchFunction<NO_MATCH_SEL, uint64_t>(predicate);
+	case PhysicalType::FLOAT:
+		return GetMatchFunction<NO_MATCH_SEL, float>(predicate);
+	case PhysicalType::DOUBLE:
+		return GetMatchFunction<NO_MATCH_SEL, double>(predicate);
+	case PhysicalType::INTERVAL:
+		return GetMatchFunction<NO_MATCH_SEL, interval_t>(predicate);
+	case PhysicalType::VARCHAR:
+		return GetMatchFunction<NO_MATCH_SEL, string_t>(predicate);
+	case PhysicalType::STRUCT:
+		return GetStructMatchFunction<NO_MATCH_SEL>(type, predicate); // needs `type` to reach the field types
+	case PhysicalType::LIST:
+		return GetListMatchFunction<NO_MATCH_SEL>(predicate); // chosen from the predicate alone
+	default:
+		throw InternalException("Unsupported PhysicalType for RowMatcher::GetMatchFunction: %s",
+		                        EnumUtil::ToString(type.InternalType()));
+	}
+}
+template <bool NO_MATCH_SEL, class T> /*
+ * Chooses the TemplatedMatch instantiation for value type T that corresponds to `predicate`.
+ * Only `result.function` is assigned; `result.child_functions` is left as default-constructed.
+ * A predicate outside the eight comparison types handled below throws InternalException.
+ */
+MatchFunction RowMatcher::GetMatchFunction(const ExpressionType predicate) {
+	MatchFunction result;
+	switch (predicate) {
+	case ExpressionType::COMPARE_EQUAL:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, Equals>;
+		break;
+	case ExpressionType::COMPARE_NOTEQUAL:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, NotEquals>;
+		break;
+	case ExpressionType::COMPARE_DISTINCT_FROM:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, DistinctFrom>;
+		break;
+	case ExpressionType::COMPARE_NOT_DISTINCT_FROM:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, NotDistinctFrom>;
+		break;
+	case ExpressionType::COMPARE_GREATERTHAN:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, GreaterThan>;
+		break;
+	case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, GreaterThanEquals>;
+		break;
+	case ExpressionType::COMPARE_LESSTHAN:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, LessThan>;
+		break;
+	case ExpressionType::COMPARE_LESSTHANOREQUALTO:
+		result.function = TemplatedMatch<NO_MATCH_SEL, T, LessThanEquals>;
+		break;
+	default:
+		throw InternalException("Unsupported ExpressionType for RowMatcher::GetMatchFunction: %s",
+		                        EnumUtil::ToString(predicate));
+	}
+	return result;
+}
+template <bool NO_MATCH_SEL> /*
+ * Builds the MatchFunction for a STRUCT column.
+ * COMPARE_EQUAL and COMPARE_NOT_DISTINCT_FROM use StructMatchEquality and get one child function per struct
+ * field. Every other supported predicate uses GenericNestedMatch and is returned without child functions.
+ * An unsupported predicate throws InternalException.
+ */
+MatchFunction RowMatcher::GetStructMatchFunction(const LogicalType &type, const ExpressionType predicate) {
+	// We perform equality conditions like it's just a row, but we cannot perform inequality conditions like a row,
+	// because for equality conditions we need to always loop through all columns, but for inequality conditions,
+	// we need to find the first inequality, so the loop looks very different
+	MatchFunction result;
+	ExpressionType child_predicate = predicate; // predicate handed to the per-field match functions
+	switch (predicate) {
+	case ExpressionType::COMPARE_EQUAL:
+		result.function = StructMatchEquality<NO_MATCH_SEL, Equals>;
+		child_predicate = ExpressionType::COMPARE_NOT_DISTINCT_FROM; // fields of an '=' struct use NOT DISTINCT FROM
+		break;
+	case ExpressionType::COMPARE_NOTEQUAL:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, NotEquals>;
+		return result; // early return: child_functions stays empty
+	case ExpressionType::COMPARE_DISTINCT_FROM:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, DistinctFrom>;
+		return result;
+	case ExpressionType::COMPARE_NOT_DISTINCT_FROM:
+		result.function = StructMatchEquality<NO_MATCH_SEL, NotDistinctFrom>;
+		break; // child_predicate keeps its initial value, i.e. NOT DISTINCT FROM
+	case ExpressionType::COMPARE_GREATERTHAN:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, GreaterThan>;
+		return result;
+	case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, GreaterThanEquals>;
+		return result;
+	case ExpressionType::COMPARE_LESSTHAN:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, LessThan>;
+		return result;
+	case ExpressionType::COMPARE_LESSTHANOREQUALTO:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, LessThanEquals>;
+		return result;
+	default:
+		throw InternalException("Unsupported ExpressionType for RowMatcher::GetStructMatchFunction: %s",
+		                        EnumUtil::ToString(predicate));
+	}
+	result.child_functions.reserve(StructType::GetChildCount(type)); // reached only by the two StructMatchEquality cases
+	for (const auto &child_type : StructType::GetChildTypes(type)) {
+		result.child_functions.push_back(GetMatchFunction<NO_MATCH_SEL>(child_type.second, child_predicate));
+	}
+	return result;
+}
+template <bool NO_MATCH_SEL> /*
+ * Builds the MatchFunction for a LIST column. Every supported predicate maps to the GenericNestedMatch
+ * instantiation for the corresponding operator; no child functions are set.
+ * An unsupported predicate throws InternalException.
+ */
+MatchFunction RowMatcher::GetListMatchFunction(const ExpressionType predicate) {
+	MatchFunction result;
+	switch (predicate) {
+	case ExpressionType::COMPARE_EQUAL:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, Equals>;
+		break;
+	case ExpressionType::COMPARE_NOTEQUAL:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, NotEquals>;
+		break;
+	case ExpressionType::COMPARE_DISTINCT_FROM:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, DistinctFrom>;
+		break;
+	case ExpressionType::COMPARE_NOT_DISTINCT_FROM:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, NotDistinctFrom>;
+		break;
+	case ExpressionType::COMPARE_GREATERTHAN:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, GreaterThan>;
+		break;
+	case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, GreaterThanEquals>;
+		break;
+	case ExpressionType::COMPARE_LESSTHAN:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, LessThan>;
+		break;
+	case ExpressionType::COMPARE_LESSTHANOREQUALTO:
+		result.function = GenericNestedMatch<NO_MATCH_SEL, LessThanEquals>;
+		break;
+	default:
+		throw InternalException("Unsupported ExpressionType for RowMatcher::GetListMatchFunction: %s",
+		                        EnumUtil::ToString(predicate));
+	}
+	return result;
+}
+} // namespace duckdb

package/src/duckdb/src/common/types/data_chunk.cpp CHANGED Viewed

@@ -13,6 +13,10 @@
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/execution/execution_context.hpp"
+#include "duckdb/common/serializer/memory_stream.hpp"
+#include "duckdb/common/serializer/binary_serializer.hpp"
+#include "duckdb/common/serializer/binary_deserializer.hpp"
 namespace duckdb {
 DataChunk::DataChunk() : count(0), capacity(STANDARD_VECTOR_SIZE) {
@@ -231,16 +235,20 @@ string DataChunk::ToString() const {
 }
 void DataChunk::Serialize(Serializer &serializer) const {
 	// write the count
 	auto row_count = size();
 	serializer.WriteProperty<sel_t>(100, "rows", row_count);
+	// we should never try to serialize empty data chunks
 	auto column_count = ColumnCount();
+	D_ASSERT(column_count);
-	// Write the types
+	// write the types
 	serializer.WriteList(101, "types", column_count,
 	                     [&](Serializer::List &list, idx_t i) { list.WriteElement(data[i].GetType()); });
-	// Write the data
+	// write the data
 	serializer.WriteList(102, "columns", column_count, [&](Serializer::List &list, idx_t i) {
 		list.WriteObject([&](Serializer &object) {
 			// Reference the vector to avoid potentially mutating it during serialization
@@ -252,21 +260,23 @@ void DataChunk::Serialize(Serializer &serializer) const {
 }
 void DataChunk::Deserialize(Deserializer &deserializer) {
-	// read the count
+	// read and set the row count
 	auto row_count = deserializer.ReadProperty<sel_t>(100, "rows");
+	SetCardinality(row_count);
-	// Read the types
+	// read the types
 	vector<LogicalType> types;
 	deserializer.ReadList(101, "types", [&](Deserializer::List &list, idx_t i) {
 		auto type = list.ReadElement<LogicalType>();
 		types.push_back(type);
 	});
-	Initialize(Allocator::DefaultAllocator(), types);
-	// now load the column data
-	SetCardinality(row_count);
+	// initialize the data chunk
+	D_ASSERT(!types.empty());
+	Initialize(Allocator::DefaultAllocator(), types);
-	// Read the data
+	// read the data
 	deserializer.ReadList(102, "columns", [&](Deserializer::List &list, idx_t i) {
 		list.ReadObject([&](Deserializer &object) { data[i].Deserialize(object, row_count); });
 	});
@@ -296,11 +306,11 @@ void DataChunk::Slice(DataChunk &other, const SelectionVector &sel, idx_t count_
 }
 unsafe_unique_array<UnifiedVectorFormat> DataChunk::ToUnifiedFormat() {
-	auto orrified_data = make_unsafe_uniq_array<UnifiedVectorFormat>(ColumnCount());
+	auto unified_data = make_unsafe_uniq_array<UnifiedVectorFormat>(ColumnCount());
 	for (idx_t col_idx = 0; col_idx < ColumnCount(); col_idx++) {
-		data[col_idx].ToUnifiedFormat(size(), orrified_data[col_idx]);
+		data[col_idx].ToUnifiedFormat(size(), unified_data[col_idx]);
 	}
-	return orrified_data;
+	return unified_data;
 }
 void DataChunk::Hash(Vector &result) {
@@ -324,10 +334,37 @@ void DataChunk::Hash(vector<idx_t> &column_ids, Vector &result) {
 void DataChunk::Verify() {
 #ifdef DEBUG
 	D_ASSERT(size() <= capacity);
 	// verify that all vectors in this chunk have the chunk selection vector
 	for (idx_t i = 0; i < ColumnCount(); i++) {
 		data[i].Verify(size());
 	}
+	if (!ColumnCount()) {
+		// don't try to round-trip dummy data chunks with no data
+		// e.g., these exist in queries like 'SELECT distinct(col0, col1) FROM tbl', where we have groups, but no
+		// payload so the payload will be such an empty data chunk
+		return;
+	}
+	// verify that we can round-trip chunk serialization
+	MemoryStream mem_stream;
+	BinarySerializer serializer(mem_stream);
+	serializer.Begin();
+	Serialize(serializer);
+	serializer.End();
+	mem_stream.Rewind();
+	BinaryDeserializer deserializer(mem_stream);
+	DataChunk new_chunk;
+	deserializer.Begin();
+	new_chunk.Deserialize(deserializer);
+	deserializer.End();
+	D_ASSERT(size() == new_chunk.size());
 #endif
 }

package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp CHANGED Viewed

@@ -294,7 +294,7 @@ static inline void VerifyStrings(const LogicalTypeId type_id, const data_ptr_t r
 	for (idx_t i = 0; i < count; i++) {
 		const auto &row_location = row_locations[offset + i] + base_col_offset;
 		ValidityBytes row_mask(row_location);
-		if (row_mask.RowIsValid(row_mask.GetValidityEntry(entry_idx), idx_in_entry)) {
+		if (row_mask.RowIsValid(row_mask.GetValidityEntryUnsafe(entry_idx), idx_in_entry)) {
 			auto recomputed_string = Load<string_t>(row_location + col_offset);
 			recomputed_string.Verify();
 		}
@@ -328,7 +328,7 @@ void TupleDataAllocator::RecomputeHeapPointers(Vector &old_heap_ptrs, const Sele
 				const auto idx = offset + i;
 				const auto &row_location = row_locations[idx] + base_col_offset;
 				ValidityBytes row_mask(row_location);
-				if (!row_mask.RowIsValid(row_mask.GetValidityEntry(entry_idx), idx_in_entry)) {
+				if (!row_mask.RowIsValid(row_mask.GetValidityEntryUnsafe(entry_idx), idx_in_entry)) {
 					continue;
 				}
@@ -352,7 +352,7 @@ void TupleDataAllocator::RecomputeHeapPointers(Vector &old_heap_ptrs, const Sele
 				const auto idx = offset + i;
 				const auto &row_location = row_locations[idx] + base_col_offset;
 				ValidityBytes row_mask(row_location);
-				if (!row_mask.RowIsValid(row_mask.GetValidityEntry(entry_idx), idx_in_entry)) {
+				if (!row_mask.RowIsValid(row_mask.GetValidityEntryUnsafe(entry_idx), idx_in_entry)) {
 					continue;
 				}

package/src/duckdb/src/common/types/row/tuple_data_collection.cpp CHANGED Viewed

@@ -37,13 +37,17 @@ void TupleDataCollection::Initialize() {
 	}
 }
-void TupleDataCollection::GetAllColumnIDs(vector<column_t> &column_ids) {
-	column_ids.reserve(layout.ColumnCount());
-	for (idx_t col_idx = 0; col_idx < layout.ColumnCount(); col_idx++) {
+void GetAllColumnIDsInternal(vector<column_t> &column_ids, const idx_t column_count) {
+	column_ids.reserve(column_count);
+	for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
 		column_ids.emplace_back(col_idx);
 	}
 }
+void TupleDataCollection::GetAllColumnIDs(vector<column_t> &column_ids) {
+	GetAllColumnIDsInternal(column_ids, layout.ColumnCount());
+}
 const TupleDataLayout &TupleDataCollection::GetLayout() const {
 	return layout;
 }
@@ -108,7 +112,7 @@ void TupleDataCollection::InitializeAppend(TupleDataAppendState &append_state, v
                                            TupleDataPinProperties properties) {
 	VerifyAppendColumns(layout, column_ids);
 	InitializeAppend(append_state.pin_state, properties);
-	InitializeAppend(append_state.chunk_state, std::move(column_ids));
+	InitializeChunkState(append_state.chunk_state, std::move(column_ids));
 }
 void TupleDataCollection::InitializeAppend(TupleDataPinState &pin_state, TupleDataPinProperties properties) {
@@ -130,11 +134,11 @@ static void InitializeVectorFormat(vector<TupleDataVectorFormat> &vector_data, c
 			for (const auto &child_entry : child_list) {
 				child_types.emplace_back(child_entry.second);
 			}
-			InitializeVectorFormat(vector_data[col_idx].child_formats, child_types);
+			InitializeVectorFormat(vector_data[col_idx].children, child_types);
 			break;
 		}
 		case PhysicalType::LIST:
-			InitializeVectorFormat(vector_data[col_idx].child_formats, {ListType::GetChildType(type)});
+			InitializeVectorFormat(vector_data[col_idx].children, {ListType::GetChildType(type)});
 			break;
 		default:
 			break;
@@ -142,11 +146,16 @@ static void InitializeVectorFormat(vector<TupleDataVectorFormat> &vector_data, c
 	}
 }
-void TupleDataCollection::InitializeAppend(TupleDataChunkState &chunk_state, vector<column_t> column_ids) {
+void TupleDataCollection::InitializeChunkState(TupleDataChunkState &chunk_state, vector<column_t> column_ids) {
+	TupleDataCollection::InitializeChunkState(chunk_state, layout.GetTypes(), std::move(column_ids));
+}
+void TupleDataCollection::InitializeChunkState(TupleDataChunkState &chunk_state, const vector<LogicalType> &types,
+                                               vector<column_t> column_ids) {
 	if (column_ids.empty()) {
-		GetAllColumnIDs(column_ids);
+		GetAllColumnIDsInternal(column_ids, types.size());
 	}
-	InitializeVectorFormat(chunk_state.vector_data, layout.GetTypes());
+	InitializeVectorFormat(chunk_state.vector_data, types);
 	chunk_state.column_ids = std::move(column_ids);
 }
@@ -211,21 +220,23 @@ void TupleDataCollection::AppendUnified(TupleDataPinState &pin_state, TupleDataC
 }
 static inline void ToUnifiedFormatInternal(TupleDataVectorFormat &format, Vector &vector, const idx_t count) {
-	vector.ToUnifiedFormat(count, format.data);
-	format.original_sel = format.data.sel;
-	format.original_owned_sel.Initialize(format.data.owned_sel);
+	vector.ToUnifiedFormat(count, format.unified);
+	format.original_sel = format.unified.sel;
+	format.original_owned_sel.Initialize(format.unified.owned_sel);
 	switch (vector.GetType().InternalType()) {
 	case PhysicalType::STRUCT: {
 		auto &entries = StructVector::GetEntries(vector);
-		D_ASSERT(format.child_formats.size() == entries.size());
+		D_ASSERT(format.children.size() == entries.size());
 		for (idx_t struct_col_idx = 0; struct_col_idx < entries.size(); struct_col_idx++) {
-			ToUnifiedFormatInternal(format.child_formats[struct_col_idx], *entries[struct_col_idx], count);
+			ToUnifiedFormatInternal(reinterpret_cast<TupleDataVectorFormat &>(format.children[struct_col_idx]),
+			                        *entries[struct_col_idx], count);
 		}
 		break;
 	}
 	case PhysicalType::LIST:
-		D_ASSERT(format.child_formats.size() == 1);
-		ToUnifiedFormatInternal(format.child_formats[0], ListVector::GetEntry(vector), ListVector::GetListSize(vector));
+		D_ASSERT(format.children.size() == 1);
+		ToUnifiedFormatInternal(reinterpret_cast<TupleDataVectorFormat &>(format.children[0]),
+		                        ListVector::GetEntry(vector), ListVector::GetListSize(vector));
 		break;
 	default:
 		break;
@@ -242,7 +253,7 @@ void TupleDataCollection::ToUnifiedFormat(TupleDataChunkState &chunk_state, Data
 void TupleDataCollection::GetVectorData(const TupleDataChunkState &chunk_state, UnifiedVectorFormat result[]) {
 	const auto &vector_data = chunk_state.vector_data;
 	for (idx_t i = 0; i < vector_data.size(); i++) {
-		const auto &source = vector_data[i].data;
+		const auto &source = vector_data[i].unified;
 		auto &target = result[i];
 		target.sel = source.sel;
 		target.data = source.data;