npm - duckdb - Versions diffs - 0.8.2-dev4711.0 → 0.8.2-dev4871.0 - Mend

duckdb 0.8.2-dev4711.0 → 0.8.2-dev4871.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/src/duckdb/src/execution/aggregate_hashtable.cpp CHANGED Viewed

@@ -45,6 +45,7 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	// Append hash column to the end and initialise the row layout
 	group_types_p.emplace_back(LogicalType::HASH);
 	layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
 	hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];
 	// Partitioned data and pointer table
@@ -52,7 +53,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	Resize(initial_capacity);
 	// Predicates
-	predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
+	predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_NOT_DISTINCT_FROM);
+	row_matcher.Initialize(true, layout, predicates);
 }
 void GroupedAggregateHashTable::InitializePartitionedData() {
@@ -414,9 +416,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 			}
 			// Perform group comparisons
-			RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses_v, predicates,
-			                     state.group_compare_vector, need_compare_count, &state.no_match_vector,
-			                     no_match_count);
+			row_matcher.Match(state.group_chunk, chunk_state.vector_data, state.group_compare_vector,
+			                  need_compare_count, layout, addresses_v, &state.no_match_vector, no_match_count);
 		}
 		// Linear probing: each of the entries that do not match move to the next entry in the HT

package/src/duckdb/src/execution/index/fixed_size_allocator.cpp CHANGED Viewed

@@ -173,6 +173,19 @@ bool FixedSizeAllocator::InitializeVacuum() {
 		return false;
 	}
+	// remove all empty buffers
+	auto buffer_it = buffers.begin();
+	while (buffer_it != buffers.end()) {
+		if (!buffer_it->second.segment_count) {
+			buffers_with_free_space.erase(buffer_it->first);
+			buffer_it->second.Destroy();
+			buffer_it = buffers.erase(buffer_it);
+		} else {
+			buffer_it++;
+		}
+	}
+	// determine if a vacuum is necessary
 	multimap<idx_t, idx_t> temporary_vacuum_buffers;
 	D_ASSERT(vacuum_buffers.empty());
 	idx_t available_segments_in_memory = 0;

package/src/duckdb/src/execution/join_hashtable.cpp CHANGED Viewed

@@ -19,15 +19,15 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
     : buffer_manager(buffer_manager_p), conditions(conditions_p), build_types(std::move(btypes)), entry_size(0),
       tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
       external(false), radix_bits(4), partition_start(0), partition_end(0) {
 	for (auto &condition : conditions) {
 		D_ASSERT(condition.left->return_type == condition.right->return_type);
 		auto type = condition.left->return_type;
 		if (condition.comparison == ExpressionType::COMPARE_EQUAL ||
-		    condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM ||
-		    condition.comparison == ExpressionType::COMPARE_DISTINCT_FROM) {
-			// all equality conditions should be at the front
-			// all other conditions at the back
-			// this assert checks that
+		    condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) {
+			// ensure that all equality conditions are at the front,
+			// and that all other conditions are at the back
 			D_ASSERT(equality_types.size() == condition_types.size());
 			equality_types.push_back(type);
 		}
@@ -51,6 +51,8 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
 	}
 	layout_types.emplace_back(LogicalType::HASH);
 	layout.Initialize(layout_types, false);
+	row_matcher.Initialize(false, layout, predicates);
+	row_matcher_no_match_sel.Initialize(true, layout, predicates);
 	const auto &offsets = layout.GetOffsets();
 	tuple_size = offsets[condition_types.size() + build_types.size()];
@@ -142,30 +144,6 @@ static idx_t FilterNullValues(UnifiedVectorFormat &vdata, const SelectionVector
 	return result_count;
 }
-idx_t JoinHashTable::PrepareKeys(DataChunk &keys, unsafe_unique_array<UnifiedVectorFormat> &key_data,
-                                 const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
-	key_data = keys.ToUnifiedFormat();
-	// figure out which keys are NULL, and create a selection vector out of them
-	current_sel = FlatVector::IncrementalSelectionVector();
-	idx_t added_count = keys.size();
-	if (build_side && IsRightOuterJoin(join_type)) {
-		// in case of a right or full outer join, we cannot remove NULL keys from the build side
-		return added_count;
-	}
-	for (idx_t i = 0; i < keys.ColumnCount(); i++) {
-		if (!null_values_are_equal[i]) {
-			if (key_data[i].validity.AllValid()) {
-				continue;
-			}
-			added_count = FilterNullValues(key_data[i], *current_sel, added_count, sel);
-			// null values are NOT equal for this column, filter them out
-			current_sel = &sel;
-		}
-	}
-	return added_count;
-}
 void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
 	D_ASSERT(!finalized);
 	D_ASSERT(keys.size() == payload.size());
@@ -194,23 +172,6 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
 		info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
 	}
-	// prepare the keys for processing
-	unsafe_unique_array<UnifiedVectorFormat> key_data;
-	const SelectionVector *current_sel;
-	SelectionVector sel(STANDARD_VECTOR_SIZE);
-	idx_t added_count = PrepareKeys(keys, key_data, current_sel, sel, true);
-	if (added_count < keys.size()) {
-		has_null = true;
-	}
-	if (added_count == 0) {
-		return;
-	}
-	// hash the keys and obtain an entry in the list
-	// note that we only hash the keys used in the equality comparison
-	Vector hash_values(LogicalType::HASH);
-	Hash(keys, *current_sel, added_count, hash_values);
 	// build a chunk to append to the data collection [keys, payload, (optional "found" boolean), hash]
 	DataChunk source_chunk;
 	source_chunk.InitializeEmpty(layout.GetTypes());
@@ -228,13 +189,58 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
 		source_chunk.data[col_offset].Reference(vfound);
 		col_offset++;
 	}
+	Vector hash_values(LogicalType::HASH);
 	source_chunk.data[col_offset].Reference(hash_values);
 	source_chunk.SetCardinality(keys);
+	// ToUnifiedFormat the source chunk
+	TupleDataCollection::ToUnifiedFormat(append_state.chunk_state, source_chunk);
+	// prepare the keys for processing
+	const SelectionVector *current_sel;
+	SelectionVector sel(STANDARD_VECTOR_SIZE);
+	idx_t added_count = PrepareKeys(keys, append_state.chunk_state.vector_data, current_sel, sel, true);
 	if (added_count < keys.size()) {
-		source_chunk.Slice(*current_sel, added_count);
+		has_null = true;
+	}
+	if (added_count == 0) {
+		return;
 	}
-	sink_collection->Append(append_state, source_chunk);
+	// hash the keys and obtain an entry in the list
+	// note that we only hash the keys used in the equality comparison
+	Hash(keys, *current_sel, added_count, hash_values);
+	// Re-reference and ToUnifiedFormat the hash column after computing it
+	source_chunk.data[col_offset].Reference(hash_values);
+	hash_values.ToUnifiedFormat(source_chunk.size(), append_state.chunk_state.vector_data.back().unified);
+	// We already called TupleDataCollection::ToUnifiedFormat, so we can AppendUnified here
+	sink_collection->AppendUnified(append_state, source_chunk, *current_sel, added_count);
+}
+idx_t JoinHashTable::PrepareKeys(DataChunk &keys, vector<TupleDataVectorFormat> &vector_data,
+                                 const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
+	// figure out which keys are NULL, and create a selection vector out of them
+	current_sel = FlatVector::IncrementalSelectionVector();
+	idx_t added_count = keys.size();
+	if (build_side && IsRightOuterJoin(join_type)) {
+		// in case of a right or full outer join, we cannot remove NULL keys from the build side
+		return added_count;
+	}
+	for (idx_t col_idx = 0; col_idx < keys.ColumnCount(); col_idx++) {
+		if (!null_values_are_equal[col_idx]) {
+			auto &col_key_data = vector_data[col_idx].unified;
+			if (col_key_data.validity.AllValid()) {
+				continue;
+			}
+			added_count = FilterNullValues(col_key_data, *current_sel, added_count, sel);
+			// null values are NOT equal for this column, filter them out
+			current_sel = &sel;
+		}
+	}
+	return added_count;
 }
 template <bool PARALLEL>
@@ -322,12 +328,13 @@ void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool para
 	} while (iterator.Next());
 }
-unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, const SelectionVector *&current_sel) {
+unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, TupleDataChunkState &key_state,
+                                                                 const SelectionVector *&current_sel) {
 	D_ASSERT(Count() > 0); // should be handled before
 	D_ASSERT(finalized);
 	// set up the scan structure
-	auto ss = make_uniq<ScanStructure>(*this);
+	auto ss = make_uniq<ScanStructure>(*this, key_state);
 	if (join_type != JoinType::INNER) {
 		ss->found_match = make_unsafe_uniq_array<bool>(STANDARD_VECTOR_SIZE);
@@ -335,13 +342,15 @@ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys
 	}
 	// first prepare the keys for probing
-	ss->count = PrepareKeys(keys, ss->key_data, current_sel, ss->sel_vector, false);
+	TupleDataCollection::ToUnifiedFormat(key_state, keys);
+	ss->count = PrepareKeys(keys, key_state.vector_data, current_sel, ss->sel_vector, false);
 	return ss;
 }
-unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precomputed_hashes) {
+unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, TupleDataChunkState &key_state,
+                                               Vector *precomputed_hashes) {
 	const SelectionVector *current_sel;
-	auto ss = InitializeScanStructure(keys, current_sel);
+	auto ss = InitializeScanStructure(keys, key_state, current_sel);
 	if (ss->count == 0) {
 		return ss;
 	}
@@ -363,8 +372,9 @@ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precompu
 	return ss;
 }
-ScanStructure::ScanStructure(JoinHashTable &ht)
-    : pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht), finished(false) {
+ScanStructure::ScanStructure(JoinHashTable &ht_p, TupleDataChunkState &key_state_p)
+    : key_state(key_state_p), pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht_p),
+      finished(false) {
 }
 void ScanStructure::Next(DataChunk &keys, DataChunk &left, DataChunk &result) {
@@ -404,8 +414,9 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
 	}
 	idx_t no_match_count = 0;
-	return RowOperations::Match(keys, key_data.get(), ht.layout, pointers, ht.predicates, match_sel, this->count,
-	                            no_match_sel, no_match_count);
+	auto &matcher = no_match_sel ? ht.row_matcher_no_match_sel : ht.row_matcher;
+	return matcher.Match(keys, key_state.vector_data, match_sel, this->count, ht.layout, pointers, no_match_sel,
+	                     no_match_count);
 }
 idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
@@ -990,7 +1001,8 @@ static void CreateSpillChunk(DataChunk &spill_chunk, DataChunk &keys, DataChunk
 	spill_chunk.data[spill_col_idx].Reference(hashes);
 }
-unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChunk &payload, ProbeSpill &probe_spill,
+unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, TupleDataChunkState &key_state,
+                                                       DataChunk &payload, ProbeSpill &probe_spill,
                                                        ProbeSpillLocalAppendState &spill_state,
                                                        DataChunk &spill_chunk) {
 	// hash all the keys
@@ -1019,7 +1031,7 @@ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChun
 	payload.Slice(true_sel, true_count);
 	const SelectionVector *current_sel;
-	auto ss = InitializeScanStructure(keys, current_sel);
+	auto ss = InitializeScanStructure(keys, key_state, current_sel);
 	if (ss->count == 0) {
 		return ss;
 	}

package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp CHANGED Viewed

@@ -420,6 +420,8 @@ public:
 	}
 	DataChunk join_keys;
+	TupleDataChunkState join_key_state;
 	ExpressionExecutor probe_executor;
 	unique_ptr<JoinHashTable::ScanStructure> scan_structure;
 	unique_ptr<OperatorState> perfect_hash_join_state;
@@ -446,6 +448,7 @@ unique_ptr<OperatorState> PhysicalHashJoin::GetOperatorState(ExecutionContext &c
 		for (auto &cond : conditions) {
 			state->probe_executor.AddExpression(*cond.left);
 		}
+		TupleDataCollection::InitializeChunkState(state->join_key_state, condition_types);
 	}
 	if (sink.external) {
 		state->spill_chunk.Initialize(allocator, sink.probe_types);
@@ -502,10 +505,10 @@ OperatorResultType PhysicalHashJoin::ExecuteInternal(ExecutionContext &context,
 	// perform the actual probe
 	if (sink.external) {
-		state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, input, *sink.probe_spill,
-		                                                      state.spill_state, state.spill_chunk);
+		state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, state.join_key_state, input,
+		                                                      *sink.probe_spill, state.spill_state, state.spill_chunk);
 	} else {
-		state.scan_structure = sink.hash_table->Probe(state.join_keys);
+		state.scan_structure = sink.hash_table->Probe(state.join_keys, state.join_key_state);
 	}
 	state.scan_structure->Next(state.join_keys, input, chunk);
 	return OperatorResultType::HAVE_MORE_OUTPUT;
@@ -605,6 +608,7 @@ public:
 	DataChunk probe_chunk;
 	DataChunk join_keys;
 	DataChunk payload;
+	TupleDataChunkState join_key_state;
 	//! Column indices to easily reference the join keys/payload columns in probe_chunk
 	vector<idx_t> join_key_indices;
 	vector<idx_t> payload_indices;
@@ -782,6 +786,7 @@ HashJoinLocalSourceState::HashJoinLocalSourceState(const PhysicalHashJoin &op, A
 	probe_chunk.Initialize(allocator, sink.probe_types);
 	join_keys.Initialize(allocator, op.condition_types);
 	payload.Initialize(allocator, op.children[0]->types);
+	TupleDataCollection::InitializeChunkState(join_key_state, op.condition_types);
 	// Store the indices of the columns to reference them easily
 	idx_t col_idx = 0;
@@ -871,7 +876,7 @@ void HashJoinLocalSourceState::ExternalProbe(HashJoinGlobalSinkState &sink, Hash
 	}
 	// Perform the probe
-	scan_structure = sink.hash_table->Probe(join_keys, precomputed_hashes);
+	scan_structure = sink.hash_table->Probe(join_keys, join_key_state, precomputed_hashes);
 	scan_structure->Next(join_keys, payload, chunk);
 }

package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp CHANGED Viewed

@@ -254,7 +254,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
 	}
 	bool has_equality = false;
-	// bool has_inequality = false;
 	size_t has_range = 0;
 	for (size_t c = 0; c < op.conditions.size(); ++c) {
 		auto &cond = op.conditions[c];
@@ -271,7 +270,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
 			break;
 		case ExpressionType::COMPARE_NOTEQUAL:
 		case ExpressionType::COMPARE_DISTINCT_FROM:
-			// has_inequality = true;
 			break;
 		default:
 			throw NotImplementedException("Unimplemented comparison join");

package/src/duckdb/src/execution/reservoir_sample.cpp CHANGED Viewed

@@ -107,25 +107,19 @@ void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) {
 			if (append_to_next_sample > 0) {
 				// we need to also add to the next sample
 				DataChunk new_chunk;
-				new_chunk.Initialize(allocator, input.GetTypes());
-				SelectionVector sel(append_to_current_sample_count);
-				for (idx_t r = 0; r < append_to_current_sample_count; r++) {
-					sel.set_index(r, r);
-				}
-				new_chunk.Slice(sel, append_to_current_sample_count);
+				new_chunk.InitializeEmpty(input.GetTypes());
+				new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count);
 				new_chunk.Flatten();
 				current_sample->AddToReservoir(new_chunk);
 			} else {
 				input.Flatten();
 				input.SetCardinality(append_to_current_sample_count);
 				current_sample->AddToReservoir(input);
 			}
 		}
 		if (append_to_next_sample > 0) {
 			// slice the input for the remainder
-			SelectionVector sel(STANDARD_VECTOR_SIZE);
+			SelectionVector sel(append_to_next_sample);
 			for (idx_t i = 0; i < append_to_next_sample; i++) {
 				sel.set_index(i, append_to_current_sample_count + i);
 			}

package/src/duckdb/src/function/cast/vector_cast_helpers.cpp CHANGED Viewed

@@ -20,10 +20,16 @@ inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
 static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
 	char quote = buf[pos];
 	pos++;
+	bool escaped = false;
 	while (pos < len) {
-		if (buf[pos] == quote) {
-			return true;
+		if (buf[pos] == '\\') {
+			escaped = !escaped;
+		} else {
+			if (buf[pos] == quote && !escaped) {
+				return true;
+			}
+			escaped = false;
 		}
 		pos++;
 	}

package/src/duckdb/src/function/function_binder.cpp CHANGED Viewed

@@ -1,16 +1,16 @@
 #include "duckdb/function/function_binder.hpp"
-#include "duckdb/common/limits.hpp"
-#include "duckdb/planner/expression/bound_cast_expression.hpp"
-#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
-#include "duckdb/planner/expression/bound_function_expression.hpp"
-#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/catalog/catalog.hpp"
 #include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
-#include "duckdb/planner/expression_binder.hpp"
+#include "duckdb/common/limits.hpp"
+#include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/function/aggregate_function.hpp"
 #include "duckdb/function/cast_rules.hpp"
-#include "duckdb/catalog/catalog.hpp"
+#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
+#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/planner/expression/bound_function_expression.hpp"
+#include "duckdb/planner/expression_binder.hpp"
 namespace duckdb {
@@ -268,7 +268,8 @@ unique_ptr<Expression> FunctionBinder::BindScalarFunction(ScalarFunctionCatalogE
 	if (bound_function.null_handling == FunctionNullHandling::DEFAULT_NULL_HANDLING) {
 		for (auto &child : children) {
-			if (child->return_type == LogicalTypeId::SQLNULL) {
+			if (child->return_type == LogicalTypeId::SQLNULL ||
+			    (child->IsFoldable() && ExpressionExecutor::EvaluateScalar(context, *child).IsNull())) {
 				return make_uniq<BoundConstantExpression>(Value(LogicalType::SQLNULL));
 			}
 		}

package/src/duckdb/src/function/scalar/string/like.cpp CHANGED Viewed

@@ -196,9 +196,6 @@ static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarF
 	D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
 	if (arguments[1]->IsFoldable()) {
 		Value pattern_str = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
-		if (pattern_str.IsNull()) {
-			return nullptr;
-		}
 		return LikeMatcher::CreateLikeMatcher(pattern_str.ToString());
 	}
 	return nullptr;

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED Viewed

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4711"
+#define DUCKDB_VERSION "0.8.2-dev4871"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "474a0bd683"
+#define DUCKDB_SOURCE_ID "5a29c99891"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp CHANGED Viewed

@@ -25,7 +25,6 @@ enum class DatePartSpecifier : uint8_t {
 	SECOND,
 	MINUTE,
 	HOUR,
-	EPOCH,
 	DOW,
 	ISODOW,
 	WEEK,
@@ -39,11 +38,20 @@ enum class DatePartSpecifier : uint8_t {
 	TIMEZONE_MINUTE,
 	//	DOUBLE values
-	JULIAN_DAY
+	EPOCH,
+	JULIAN_DAY,
+	//	Invalid
+	INVALID,
+	//	Type ranges
+	BEGIN_BIGINT = YEAR,
+	BEGIN_DOUBLE = EPOCH,
+	BEGIN_INVALID = INVALID,
 };
 inline bool IsBigintDatepart(DatePartSpecifier part_code) {
-	return size_t(part_code) < size_t(DatePartSpecifier::JULIAN_DAY);
+	return size_t(part_code) < size_t(DatePartSpecifier::BEGIN_DOUBLE);
 }
 DUCKDB_API bool TryGetDatePartSpecifier(const string &specifier, DatePartSpecifier &result);

package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp ADDED Viewed

@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/common/row_operations/row_matcher.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "duckdb/common/enums/expression_type.hpp"
+#include "duckdb/common/types.hpp"
+namespace duckdb {
+class Vector;
+class DataChunk;
+class TupleDataLayout;
+struct TupleDataVectorFormat;
+struct SelectionVector;
+struct MatchFunction;
+typedef idx_t (*match_function_t)(Vector &lhs_vector, const TupleDataVectorFormat &lhs_format, SelectionVector &sel,
+                                  const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                                  const idx_t col_idx, const vector<MatchFunction> &child_functions,
+                                  SelectionVector *no_match_sel, idx_t &no_match_count);
+struct MatchFunction {
+	match_function_t function;
+	vector<MatchFunction> child_functions;
+};
+struct RowMatcher {
+public:
+	using Predicates = vector<ExpressionType>;
+	//! Initializes the RowMatcher, filling match_functions using layout and predicates
+	void Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates);
+	//! Given a DataChunk on the LHS, on which we've called TupleDataCollection::ToUnifiedFormat,
+	//! we match it with rows on the RHS, according to the given layout and locations.
+	//! Initially, 'sel' has 'count' entries which point to what needs to be compared.
+	//! After matching is done, this returns how many matching entries there are, which 'sel' is modified to point to
+	idx_t Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel, idx_t count,
+	            const TupleDataLayout &rhs_layout, Vector &rhs_row_locations, SelectionVector *no_match_sel,
+	            idx_t &no_match_count);
+private:
+	//! Gets the templated match function for a given column
+	MatchFunction GetMatchFunction(const bool no_match_sel, const LogicalType &type, const ExpressionType predicate);
+	template <bool NO_MATCH_SEL>
+	MatchFunction GetMatchFunction(const LogicalType &type, const ExpressionType predicate);
+	template <bool NO_MATCH_SEL, class T>
+	MatchFunction GetMatchFunction(const ExpressionType predicate);
+	template <bool NO_MATCH_SEL>
+	MatchFunction GetStructMatchFunction(const LogicalType &type, const ExpressionType predicate);
+	template <bool NO_MATCH_SEL>
+	MatchFunction GetListMatchFunction(const ExpressionType predicate);
+private:
+	vector<MatchFunction> match_functions;
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp CHANGED Viewed

@@ -21,7 +21,7 @@ struct RowOperationsState;
 typedef void (*tuple_data_scatter_function_t)(const Vector &source, const TupleDataVectorFormat &source_format,
                                               const SelectionVector &append_sel, const idx_t append_count,
-                                              const TupleDataLayout &layout, Vector &row_locations,
+                                              const TupleDataLayout &layout, const Vector &row_locations,
                                               Vector &heap_locations, const idx_t col_idx,
                                               const UnifiedVectorFormat &list_format,
                                               const vector<TupleDataScatterFunction> &child_functions);
@@ -84,7 +84,11 @@ public:
 	                      TupleDataPinProperties = TupleDataPinProperties::UNPIN_AFTER_DONE);
 	//! Initializes the Chunk state of an Append state
 	//! - Useful for optimizing many appends made to the same tuple data collection
-	void InitializeAppend(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+	void InitializeChunkState(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+	//! Initializes the Chunk state of an Append state
+	//! - Useful for optimizing many appends made to the same tuple data collection
+	static void InitializeChunkState(TupleDataChunkState &chunk_state, const vector<LogicalType> &types,
+	                                 vector<column_t> column_ids = {});
 	//! Append a DataChunk directly to this TupleDataCollection - calls InitializeAppend and Append internally
 	void Append(DataChunk &new_chunk, const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
 	            idx_t append_count = DConstants::INVALID_INDEX);

package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp CHANGED Viewed

@@ -42,8 +42,8 @@ struct TupleDataVectorFormat {
 	const SelectionVector *original_sel;
 	SelectionVector original_owned_sel;
-	UnifiedVectorFormat data;
-	vector<TupleDataVectorFormat> child_formats;
+	UnifiedVectorFormat unified;
+	vector<TupleDataVectorFormat> children;
 	unique_ptr<CombinedListData> combined_list_data;
 };

package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp CHANGED Viewed

@@ -148,6 +148,9 @@ public:
 		if (!validity_mask) {
 			return ValidityBuffer::MAX_ENTRY;
 		}
+		return GetValidityEntryUnsafe(entry_idx);
+	}
+	inline V &GetValidityEntryUnsafe(idx_t entry_idx) const {
 		return validity_mask[entry_idx];
 	}
 	static inline bool AllValid(V entry) {
@@ -156,7 +159,7 @@ public:
 	static inline bool NoneValid(V entry) {
 		return entry == 0;
 	}
-	static inline bool RowIsValid(V entry, idx_t idx_in_entry) {
+	static inline bool RowIsValid(const V &entry, const idx_t &idx_in_entry) {
 		return entry & (V(1) << V(idx_in_entry));
 	}
 	static inline void GetEntryIndex(idx_t row_idx, idx_t &entry_idx, idx_t &idx_in_entry) {

package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp CHANGED Viewed

@@ -285,7 +285,7 @@ struct RepeatFun {
 	static constexpr const char *Description = "Repeats the string count number of times";
 	static constexpr const char *Example = "repeat('A', 5)";
-	static ScalarFunction GetFunction();
+	static ScalarFunctionSet GetFunctions();
 };
 struct ReplaceFun {

package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp CHANGED Viewed

@@ -8,6 +8,7 @@
 #pragma once
+#include "duckdb/common/row_operations/row_matcher.hpp"
 #include "duckdb/common/types/row/partitioned_tuple_data.hpp"
 #include "duckdb/execution/base_aggregate_hashtable.hpp"
 #include "duckdb/storage/arena_allocator.hpp"
@@ -143,6 +144,9 @@ public:
 	void UnpinData();
 private:
+	//! Efficiently matches groups
+	RowMatcher row_matcher;
 	//! Append state
 	struct AggregateHTAppendState {
 		AggregateHTAppendState();