duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,49 +1,28 @@
 #include "duckdb/execution/radix_partitioned_hashtable.hpp"
 
 #include "duckdb/common/radix_partitioning.hpp"
+#include "duckdb/common/row_operations/row_operations.hpp"
 #include "duckdb/common/types/row/tuple_data_collection.hpp"
+#include "duckdb/common/types/row/tuple_data_iterator.hpp"
+#include "duckdb/execution/aggregate_hashtable.hpp"
 #include "duckdb/execution/executor.hpp"
 #include "duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp"
+#include "duckdb/main/config.hpp"
 #include "duckdb/parallel/event.hpp"
-#include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/planner/expression/bound_reference_expression.hpp"
 
 namespace duckdb {
 
-// compute the GROUPING values
-// for each parameter to the GROUPING clause, we check if the hash table groups on this particular group
-// if it does, we return 0, otherwise we return 1
-// we then use bitshifts to combine these values
-void RadixPartitionedHashTable::SetGroupingValues() {
-	auto &grouping_functions = op.GetGroupingFunctions();
-	for (auto &grouping : grouping_functions) {
-		int64_t grouping_value = 0;
-		D_ASSERT(grouping.size() < sizeof(int64_t) * 8);
-		for (idx_t i = 0; i < grouping.size(); i++) {
-			if (grouping_set.find(grouping[i]) == grouping_set.end()) {
-				// we don't group on this value!
-				grouping_value += (int64_t)1 << (grouping.size() - (i + 1));
-			}
-		}
-		grouping_values.push_back(Value::BIGINT(grouping_value));
-	}
-}
-
 RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p, const GroupedAggregateData &op_p)
     : grouping_set(grouping_set_p), op(op_p) {
-
 	auto groups_count = op.GroupCount();
 	for (idx_t i = 0; i < groups_count; i++) {
 		if (grouping_set.find(i) == grouping_set.end()) {
 			null_groups.push_back(i);
 		}
 	}
-
-	// 10000 seems like a good compromise here
-	radix_limit = 10000;
-
 	if (grouping_set.empty()) {
-		// fake a single group with a constant value for aggregation without groups
+		// Fake a single group with a constant value for aggregation without groups
 		group_types.emplace_back(LogicalType::TINYINT);
 	}
 	for (auto &entry : grouping_set) {
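
The `SetGroupingValues` function removed above is re-added further down in this diff: for each argument of a `GROUPING(...)` call it emits one bit, 0 if the hash table groups on that column and 1 if it does not, with the first argument in the most significant position. A minimal standalone sketch of that bit arithmetic, using plain C++ containers and a hypothetical two-column example rather than DuckDB's internal types:

```cpp
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Mirror of the bitshift logic in RadixPartitionedHashTable::SetGroupingValues:
// for GROUPING(c0, c1, ..., c{n-1}), bit (n-1-i) is set iff column ci is NOT grouped on.
int64_t GroupingValue(const std::vector<size_t> &grouping_args, const std::set<size_t> &grouping_set) {
	int64_t grouping_value = 0;
	for (size_t i = 0; i < grouping_args.size(); i++) {
		if (grouping_set.find(grouping_args[i]) == grouping_set.end()) {
			// This column is not grouped on -> contribute a 1 bit
			grouping_value += int64_t(1) << (grouping_args.size() - (i + 1));
		}
	}
	return grouping_value;
}

int main() {
	// Hypothetical example: the hash table groups on column 0 only.
	std::set<size_t> grouping_set {0};
	// GROUPING(col0, col1): col0 is grouped (bit 1 = 0), col1 is not (bit 0 = 1) -> 0b01
	std::cout << GroupingValue({0, 1}, grouping_set) << "\n"; // prints 1
	// GROUPING(col1, col0): -> 0b10
	std::cout << GroupingValue({1, 0}, grouping_set) << "\n"; // prints 2
}
```
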
@@ -51,79 +30,279 @@ RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p
 		group_types.push_back(op.group_types[entry]);
 	}
 	SetGroupingValues();
+
+	auto group_types_copy = group_types;
+	group_types_copy.emplace_back(LogicalType::HASH);
+	layout.Initialize(std::move(group_types_copy), AggregateObject::CreateAggregateObjects(op.bindings));
+}
+
+void RadixPartitionedHashTable::SetGroupingValues() {
+	// Compute the GROUPING values:
+	// For each parameter to the GROUPING clause, we check if the hash table groups on this particular group
+	// If it does, we return 0, otherwise we return 1
+	// We then use bitshifts to combine these values
+	auto &grouping_functions = op.GetGroupingFunctions();
+	for (auto &grouping : grouping_functions) {
+		int64_t grouping_value = 0;
+		D_ASSERT(grouping.size() < sizeof(int64_t) * 8);
+		for (idx_t i = 0; i < grouping.size(); i++) {
+			if (grouping_set.find(grouping[i]) == grouping_set.end()) {
+				// We don't group on this value!
+				grouping_value += (int64_t)1 << (grouping.size() - (i + 1));
+			}
+		}
+		grouping_values.push_back(Value::BIGINT(grouping_value));
+	}
+}
+
+const TupleDataLayout &RadixPartitionedHashTable::GetLayout() const {
+	return layout;
+}
+
+unique_ptr<GroupedAggregateHashTable> RadixPartitionedHashTable::CreateHT(ClientContext &context, const idx_t capacity,
+                                                                          const idx_t radix_bits) const {
+	return make_uniq<GroupedAggregateHashTable>(context, BufferAllocator::Get(context), group_types, op.payload_types,
+	                                            op.bindings, capacity, radix_bits);
 }
 
 //===--------------------------------------------------------------------===//
 // Sink
 //===--------------------------------------------------------------------===//
-class RadixHTGlobalState : public GlobalSinkState {
-	constexpr const static idx_t MAX_RADIX_PARTITIONS = 32;
+struct AggregatePartition {
+	explicit AggregatePartition(unique_ptr<TupleDataCollection> data_p) : data(std::move(data_p)), finalized(false) {
+	}
+	unique_ptr<TupleDataCollection> data;
+	atomic<bool> finalized;
+};
 
+class RadixHTGlobalSinkState;
+
+struct RadixHTConfig {
 public:
-	explicit RadixHTGlobalState(ClientContext &context)
-	    : is_empty(true), multi_scan(true), partitioned(false),
-	      partition_info(make_uniq<RadixPartitionInfo>(
-	          MinValue<idx_t>(MAX_RADIX_PARTITIONS, TaskScheduler::GetScheduler(context).NumberOfThreads()))) {
-	}
+	explicit RadixHTConfig(ClientContext &context, RadixHTGlobalSinkState &sink);
 
-	vector<unique_ptr<PartitionableHashTable>> intermediate_hts;
-	vector<shared_ptr<GroupedAggregateHashTable>> finalized_hts;
+	void SetRadixBits(idx_t radix_bits_p);
+	bool SetRadixBitsToExternal();
+	idx_t GetRadixBits() const;
 
-	//! Whether or not any tuples were added to the HT
-	bool is_empty;
-	//! Whether or not the hash table should be scannable multiple times
-	bool multi_scan;
-	//! The lock for updating the global aggregate state
-	mutex lock;
-	//! Whether or not any thread has crossed the partitioning threshold
-	atomic<bool> partitioned;
-
-	bool is_finalized = false;
-	bool is_partitioned = false;
-
-	unique_ptr<RadixPartitionInfo> partition_info;
-	AggregateHTAppendState append_state;
-
-	//! Repartitioned HT info
-	bool repartitioned = false;
-	idx_t repartition_tasks_per_partition;
-	vector<vector<unique_ptr<PartitionableHashTable>>> repartition_tasks;
-	unique_array<atomic<idx_t>> repartition_tasks_assigned;
-	unique_array<atomic<idx_t>> repartition_tasks_done;
-	unique_array<atomic<bool>> finalize_assigned;
+private:
+	void SetRadixBitsInternal(const idx_t radix_bits_p, bool external);
+	static idx_t InitialSinkRadixBits(ClientContext &context);
+	static idx_t MaximumSinkRadixBits(ClientContext &context);
+	static idx_t ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p);
+	static idx_t SinkCapacity(ClientContext &context);
+
+private:
+	//! Assume (1 << 15) = 32KB L1 cache per core, divided by two because hyperthreading
+	static constexpr const idx_t L1_CACHE_SIZE = 32768 / 2;
+	//! Assume (1 << 20) = 1MB L2 cache per core, divided by two because hyperthreading
+	static constexpr const idx_t L2_CACHE_SIZE = 1048576 / 2;
+	//! Assume (1 << 20) + (1 << 19) = 1.5MB L3 cache per core (shared), divided by two because hyperthreading
+	static constexpr const idx_t L3_CACHE_SIZE = 1572864 / 2;
+
+	//! Sink radix bits to initialize with
+	static constexpr const idx_t MAXIMUM_INITIAL_SINK_RADIX_BITS = 3;
+	//! Maximum Sink radix bits (independent of threads)
+	static constexpr const idx_t MAXIMUM_FINAL_SINK_RADIX_BITS = 7;
+	//! By how many radix bits to increment if we go external
+	static constexpr const idx_t EXTERNAL_RADIX_BITS_INCREMENT = 3;
+
+	//! The global sink state
+	RadixHTGlobalSinkState &sink;
+	//! Current thread-global sink radix bits
+	atomic<idx_t> sink_radix_bits;
+	//! Maximum Sink radix bits (set based on number of threads)
+	const idx_t maximum_sink_radix_bits;
+	//! Radix bits if we go external
+	const idx_t external_radix_bits;
+
+public:
+	//! Capacity of HTs during the Sink
+	const idx_t sink_capacity;
+
+	//! If we fill this many blocks per partition, we trigger a repartition
+	static constexpr const double BLOCK_FILL_FACTOR = 1.8;
+	//! By how many bits to repartition if a repartition is triggered
+	static constexpr const idx_t REPARTITION_RADIX_BITS = 2;
 };
 
-class RadixHTLocalState : public LocalSinkState {
+class RadixHTGlobalSinkState : public GlobalSinkState {
+public:
+	RadixHTGlobalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
+
+	//! Destroys aggregate states (if multi-scan)
+	~RadixHTGlobalSinkState() override;
+	void Destroy();
+
 public:
-	explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : total_groups(0), is_empty(true) {
-		// if there are no groups we create a fake group so everything has the same group
-		group_chunk.InitializeEmpty(ht.group_types);
-		if (ht.grouping_set.empty()) {
-			group_chunk.data[0].Reference(Value::TINYINT(42));
+	//! The radix HT
+	const RadixPartitionedHashTable &radix_ht;
+	//! Config for partitioning
+	RadixHTConfig config;
+
+	//! Whether we've called Finalize
+	bool finalized;
+	//! Whether we are doing an external aggregation
+	atomic<bool> external;
+	//! Threads that have called Sink
+	atomic<idx_t> active_threads;
+	//! If any thread has called combine
+	atomic<bool> any_combined;
+
+	//! Lock for uncombined_data/stored_allocators
+	mutex lock;
+	//! Uncombined partitioned data that will be put into the AggregatePartitions
+	unique_ptr<PartitionedTupleData> uncombined_data;
+	//! Allocators used during the Sink/Finalize
+	vector<shared_ptr<ArenaAllocator>> stored_allocators;
+
+	//! Partitions that are finalized during GetData
+	vector<unique_ptr<AggregatePartition>> partitions;
+
+	//! For synchronizing finalize tasks
+	atomic<idx_t> finalize_idx;
+
+	//! Pin properties when scanning
+	TupleDataPinProperties scan_pin_properties;
+	//! Total count before combining
+	idx_t count_before_combining;
+};
+
+RadixHTGlobalSinkState::RadixHTGlobalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht_p)
+    : radix_ht(radix_ht_p), config(context, *this), finalized(false), external(false), active_threads(0),
+      any_combined(false), finalize_idx(0), scan_pin_properties(TupleDataPinProperties::DESTROY_AFTER_DONE),
+      count_before_combining(0) {
+}
+
+RadixHTGlobalSinkState::~RadixHTGlobalSinkState() {
+	Destroy();
+}
+
+// LCOV_EXCL_START
+void RadixHTGlobalSinkState::Destroy() {
+	if (scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE || count_before_combining == 0 ||
+	    partitions.empty()) {
+		// Already destroyed / empty
+		return;
+	}
+
+	TupleDataLayout layout = partitions[0]->data->GetLayout().Copy();
+	if (!layout.HasDestructor()) {
+		return; // No destructors, exit
+	}
+
+	// There are aggregates with destructors: Call the destructor for each of the aggregates
+	RowOperationsState row_state(*stored_allocators.back());
+	for (auto &partition : partitions) {
+		auto &data_collection = *partition->data;
+		if (data_collection.Count() == 0) {
+			continue;
 		}
+		TupleDataChunkIterator iterator(data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
+		auto &row_locations = iterator.GetChunkState().row_locations;
+		do {
+			RowOperations::DestroyStates(row_state, layout, row_locations, iterator.GetCurrentChunkCount());
+		} while (iterator.Next());
+		data_collection.Reset();
 	}
+}
+// LCOV_EXCL_STOP
 
+RadixHTConfig::RadixHTConfig(ClientContext &context, RadixHTGlobalSinkState &sink_p)
+    : sink(sink_p), sink_radix_bits(InitialSinkRadixBits(context)),
+      maximum_sink_radix_bits(MaximumSinkRadixBits(context)),
+      external_radix_bits(ExternalRadixBits(maximum_sink_radix_bits)), sink_capacity(SinkCapacity(context)) {
+}
+
+void RadixHTConfig::SetRadixBits(idx_t radix_bits_p) {
+	SetRadixBitsInternal(MinValue(radix_bits_p, maximum_sink_radix_bits), false);
+}
+
+bool RadixHTConfig::SetRadixBitsToExternal() {
+	SetRadixBitsInternal(external_radix_bits, true);
+	return sink.external;
+}
+
+idx_t RadixHTConfig::GetRadixBits() const {
+	return sink_radix_bits;
+}
+
+void RadixHTConfig::SetRadixBitsInternal(const idx_t radix_bits_p, bool external) {
+	if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
+		return;
+	}
+
+	lock_guard<mutex> guard(sink.lock);
+	if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
+		return;
+	}
+
+	if (external) {
+		sink.external = true;
+	}
+	sink_radix_bits = radix_bits_p;
+	return;
+}
+
+idx_t RadixHTConfig::InitialSinkRadixBits(ClientContext &context) {
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	return MinValue(RadixPartitioning::RadixBits(NextPowerOfTwo(active_threads)), MAXIMUM_INITIAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::MaximumSinkRadixBits(ClientContext &context) {
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	return MinValue(RadixPartitioning::RadixBits(NextPowerOfTwo(active_threads)), MAXIMUM_FINAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) {
+	return MinValue(maximum_sink_radix_bits_p + EXTERNAL_RADIX_BITS_INCREMENT, MAXIMUM_FINAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::SinkCapacity(ClientContext &context) {
+	// Get active and maximum number of threads
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	const auto max_threads = DBConfig::GetSystemMaxThreads(FileSystem::GetFileSystem(context));
+
+	// Compute cache size per active thread (assuming cache is shared)
+	const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE;
+	const auto cache_per_active_thread = L1_CACHE_SIZE + L2_CACHE_SIZE + total_shared_cache_size / active_threads;
+
+	// Divide cache per active thread by entry size, round up to next power of two, to get capacity
+	const auto size_per_entry = sizeof(aggr_ht_entry_t) * GroupedAggregateHashTable::LOAD_FACTOR;
+	const auto capacity = NextPowerOfTwo(cache_per_active_thread / size_per_entry);
+
+	// Capacity must be at least the minimum capacity
+	return MaxValue<idx_t>(capacity, GroupedAggregateHashTable::InitialCapacity());
+}
+
+class RadixHTLocalSinkState : public LocalSinkState {
+public:
+	RadixHTLocalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
+
+public:
+	//! Thread-local HT that is re-used after abandoning
+	unique_ptr<GroupedAggregateHashTable> ht;
+	//! Chunk with group columns
 	DataChunk group_chunk;
-	//! The aggregate HT
-	unique_ptr<PartitionableHashTable> ht;
-	//! The total number of groups found by this thread
-	idx_t total_groups;
 
-	//! Whether or not any tuples were added to the HT
-	bool is_empty;
+	//! Data that is abandoned ends up here (only if we're doing external aggregation)
+	unique_ptr<PartitionedTupleData> abandoned_data;
 };
 
-void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	gstate.multi_scan = true;
+RadixHTLocalSinkState::RadixHTLocalSinkState(ClientContext &, const RadixPartitionedHashTable &radix_ht) {
+	// If there are no groups we create a fake group so everything has the same group
+	group_chunk.InitializeEmpty(radix_ht.group_types);
+	if (radix_ht.grouping_set.empty()) {
+		group_chunk.data[0].Reference(Value::TINYINT(42));
+	}
 }
 
 unique_ptr<GlobalSinkState> RadixPartitionedHashTable::GetGlobalSinkState(ClientContext &context) const {
-	return make_uniq<RadixHTGlobalState>(context);
+	return make_uniq<RadixHTGlobalSinkState>(context, *this);
 }
 
 unique_ptr<LocalSinkState> RadixPartitionedHashTable::GetLocalSinkState(ExecutionContext &context) const {
-	return make_uniq<RadixHTLocalState>(*this);
+	return make_uniq<RadixHTLocalSinkState>(context.client, *this);
 }
 
 void RadixPartitionedHashTable::PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const {
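
The new `RadixHTConfig::SinkCapacity` above sizes each thread-local hash table so that its pointer table stays roughly cache-resident: per-thread cache is estimated as L1 + L2 + (max_threads * L3) / active_threads, divided by the bytes one entry occupies at the target load factor, then rounded up to a power of two. A self-contained sketch of the same arithmetic; the 8-byte entry size, 1.5 load factor, and 2048 minimum capacity are illustrative assumptions, not values stated in this diff:

```cpp
#include <cstdint>
#include <iostream>

// Assumed per-core cache sizes, halved for hyperthreading (as in RadixHTConfig)
static constexpr uint64_t L1_CACHE_SIZE = 32768 / 2;
static constexpr uint64_t L2_CACHE_SIZE = 1048576 / 2;
static constexpr uint64_t L3_CACHE_SIZE = 1572864 / 2;

// Round up to the next power of two (returns v itself if v is already one)
static uint64_t NextPowerOfTwo(uint64_t v) {
	uint64_t result = 1;
	while (result < v) {
		result *= 2;
	}
	return result;
}

// Same shape as RadixHTConfig::SinkCapacity; entry size, load factor, and
// minimum capacity are illustrative assumptions, not the exact DuckDB constants.
static uint64_t SinkCapacity(uint64_t active_threads, uint64_t max_threads) {
	const uint64_t entry_size = 8;          // assumed sizeof(aggr_ht_entry_t)
	const double load_factor = 1.5;         // assumed LOAD_FACTOR
	const uint64_t minimum_capacity = 2048; // assumed InitialCapacity()

	const uint64_t total_shared_cache = max_threads * L3_CACHE_SIZE;
	const uint64_t cache_per_thread = L1_CACHE_SIZE + L2_CACHE_SIZE + total_shared_cache / active_threads;
	const auto size_per_entry = uint64_t(entry_size * load_factor);
	const uint64_t capacity = NextPowerOfTwo(cache_per_thread / size_per_entry);
	return capacity < minimum_capacity ? minimum_capacity : capacity;
}

int main() {
	// E.g., 8 active threads on an 8-core machine:
	// cache/thread = 16KiB + 512KiB + 768KiB = 1296KiB; 1327104B / 12B = 110592 -> 131072
	std::cout << SinkCapacity(8, 8) << "\n";
}
```
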
@@ -141,507 +320,448 @@ void RadixPartitionedHashTable::PopulateGroupChunk(DataChunk &group_chunk, DataC
141
320
  group_chunk.Verify();
142
321
  }
143
322
 
144
- void RadixPartitionedHashTable::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input,
145
- DataChunk &payload_input, const unsafe_vector<idx_t> &filter) const {
146
- auto &llstate = input.local_state.Cast<RadixHTLocalState>();
147
- auto &gstate = input.global_state.Cast<RadixHTGlobalState>();
148
- D_ASSERT(!gstate.is_finalized);
149
-
150
- DataChunk &group_chunk = llstate.group_chunk;
151
- PopulateGroupChunk(group_chunk, chunk);
323
+ bool MaybeRepartition(ClientContext &context, RadixHTGlobalSinkState &gstate, RadixHTLocalSinkState &lstate) {
324
+ auto &config = gstate.config;
325
+ auto &ht = *lstate.ht;
326
+ auto &partitioned_data = ht.GetPartitionedData();
327
+
328
+ // Check if we're approaching the memory limit
329
+ const idx_t n_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
330
+ const idx_t limit = BufferManager::GetBufferManager(context).GetMaxMemory();
331
+ const idx_t thread_limit = 0.6 * limit / n_threads;
332
+ if (ht.GetPartitionedData()->SizeInBytes() > thread_limit || context.config.force_external) {
333
+ if (gstate.config.SetRadixBitsToExternal()) {
334
+ // We're approaching the memory limit, unpin the data
335
+ if (!lstate.abandoned_data) {
336
+ lstate.abandoned_data = make_uniq<RadixPartitionedTupleData>(
337
+ BufferManager::GetBufferManager(context), gstate.radix_ht.GetLayout(), config.GetRadixBits(),
338
+ gstate.radix_ht.GetLayout().ColumnCount() - 1);
339
+ }
152
340
 
153
- // if we have non-combinable aggregates (e.g. string_agg) we cannot keep parallel hash
154
- // tables
155
- if (ForceSingleHT(input.global_state)) {
156
- lock_guard<mutex> glock(gstate.lock);
157
- gstate.is_empty = gstate.is_empty && group_chunk.size() == 0;
158
- if (gstate.finalized_hts.empty()) {
159
- // Create a finalized ht in the global state, that we can populate
160
- gstate.finalized_hts.push_back(make_shared<GroupedAggregateHashTable>(
161
- context.client, BufferAllocator::Get(context.client), group_types, op.payload_types, op.bindings,
162
- HtEntryType::HT_WIDTH_64));
341
+ ht.UnpinData();
342
+ partitioned_data->Repartition(*lstate.abandoned_data);
343
+ ht.SetRadixBits(gstate.config.GetRadixBits());
344
+ ht.InitializePartitionedData();
345
+ return true;
163
346
  }
164
- D_ASSERT(gstate.finalized_hts.size() == 1);
165
- D_ASSERT(gstate.finalized_hts[0]);
166
- llstate.total_groups +=
167
- gstate.finalized_hts[0]->AddChunk(gstate.append_state, group_chunk, payload_input, filter);
168
- return;
169
347
  }
170
348
 
171
- if (group_chunk.size() > 0) {
172
- llstate.is_empty = false;
173
- }
349
+ const auto partition_count = partitioned_data->PartitionCount();
350
+ const auto current_radix_bits = RadixPartitioning::RadixBits(partition_count);
351
+ D_ASSERT(current_radix_bits <= config.GetRadixBits());
174
352
 
175
- if (!llstate.ht) {
176
- llstate.ht =
177
- make_uniq<PartitionableHashTable>(context.client, BufferAllocator::Get(context.client),
178
- *gstate.partition_info, group_types, op.payload_types, op.bindings);
179
- if (context.client.config.force_external) {
180
- gstate.partitioned = true;
181
- }
353
+ const auto row_size_per_partition =
354
+ partitioned_data->Count() * partitioned_data->GetLayout().GetRowWidth() / partition_count;
355
+ if (row_size_per_partition > config.BLOCK_FILL_FACTOR * Storage::BLOCK_SIZE) {
356
+ // We crossed our block filling threshold, try to increment radix bits
357
+ config.SetRadixBits(current_radix_bits + config.REPARTITION_RADIX_BITS);
182
358
  }
183
359
 
184
- llstate.total_groups += llstate.ht->AddChunk(group_chunk, payload_input,
185
- gstate.partitioned && gstate.partition_info->n_partitions > 1, filter);
186
- if (llstate.total_groups >= radix_limit) {
187
- gstate.partitioned = true;
360
+ const auto global_radix_bits = config.GetRadixBits();
361
+ if (current_radix_bits == global_radix_bits) {
362
+ return false; // We're already on the right number of radix bits
188
363
  }
364
+
365
+ // We're out-of-sync with the global radix bits, repartition
366
+ ht.UnpinData();
367
+ auto old_partitioned_data = std::move(partitioned_data);
368
+ ht.SetRadixBits(global_radix_bits);
369
+ ht.InitializePartitionedData();
370
+ old_partitioned_data->Repartition(*ht.GetPartitionedData());
371
+ return true;
189
372
  }
190
373
 
191
- void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &state,
192
- LocalSinkState &lstate) const {
193
- auto &llstate = lstate.Cast<RadixHTLocalState>();
194
- auto &gstate = state.Cast<RadixHTGlobalState>();
195
- D_ASSERT(!gstate.is_finalized);
374
+ void RadixPartitionedHashTable::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input,
375
+ DataChunk &payload_input, const unsafe_vector<idx_t> &filter) const {
376
+ auto &gstate = input.global_state.Cast<RadixHTGlobalSinkState>();
377
+ auto &lstate = input.local_state.Cast<RadixHTLocalSinkState>();
378
+ if (!lstate.ht) {
379
+ lstate.ht = CreateHT(context.client, gstate.config.sink_capacity, gstate.config.GetRadixBits());
380
+ gstate.active_threads++;
381
+ }
196
382
 
197
- // this actually does not do a lot but just pushes the local HTs into the global state so we can later combine them
198
- // in parallel
383
+ auto &group_chunk = lstate.group_chunk;
384
+ PopulateGroupChunk(group_chunk, chunk);
199
385
 
200
- if (ForceSingleHT(state)) {
201
- D_ASSERT(gstate.finalized_hts.size() <= 1);
202
- return;
203
- }
386
+ auto &ht = *lstate.ht;
387
+ ht.AddChunk(group_chunk, payload_input, filter);
204
388
 
205
- if (!llstate.ht) {
206
- return; // no data
389
+ if (ht.Count() + STANDARD_VECTOR_SIZE < ht.ResizeThreshold()) {
390
+ return; // We can fit another chunk
207
391
  }
208
392
 
209
- if (!llstate.ht->IsPartitioned() && gstate.partition_info->n_partitions > 1 && gstate.partitioned) {
210
- llstate.ht->Partition(true);
393
+ if (gstate.active_threads > 2) {
394
+ // 'Reset' the HT without taking its data, we can just keep appending to the same collection
395
+ // This only works because we never resize the HT
396
+ ht.ClearPointerTable();
397
+ ht.ResetCount();
398
+ // We don't do this when running with 1 or 2 threads, it only makes sense when there's many threads
211
399
  }
212
400
 
213
- // we will never add new values to these HTs so we can drop the first part of the HT
214
- llstate.ht->Finalize();
401
+ // Check if we need to repartition
402
+ auto repartitioned = MaybeRepartition(context.client, gstate, lstate);
215
403
 
216
- lock_guard<mutex> glock(gstate.lock);
217
- if (!llstate.is_empty) {
218
- gstate.is_empty = false;
404
+ if (repartitioned && ht.Count() != 0) {
405
+ // We repartitioned, but we didn't clear the pointer table / reset the count because we're on 1 or 2 threads
406
+ ht.ClearPointerTable();
407
+ ht.ResetCount();
219
408
  }
220
- // at this point we just collect them the PhysicalHashAggregateFinalizeTask (below) will merge them in parallel
221
- gstate.intermediate_hts.push_back(std::move(llstate.ht));
409
+
410
+ // TODO: combine early and often
222
411
  }
223
412
 
224
- void RadixPartitionedHashTable::InitializeFinalizedHTs(ClientContext &context, GlobalSinkState &gstate_p) const {
225
- auto &gstate = gstate_p.Cast<RadixHTGlobalState>();
226
- auto &allocator = BufferAllocator::Get(context);
227
- gstate.finalized_hts.resize(gstate.partition_info->n_partitions);
228
- for (idx_t r = 0; r < gstate.partition_info->n_partitions; r++) {
229
- gstate.finalized_hts[r] = make_shared<GroupedAggregateHashTable>(
230
- context, allocator, group_types, op.payload_types, op.bindings, HtEntryType::HT_WIDTH_64);
413
+ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &gstate_p,
414
+ LocalSinkState &lstate_p) const {
415
+ auto &gstate = gstate_p.Cast<RadixHTGlobalSinkState>();
416
+ auto &lstate = lstate_p.Cast<RadixHTLocalSinkState>();
417
+ if (!lstate.ht) {
418
+ return;
231
419
  }
232
- }
233
420
 
234
- bool RadixPartitionedHashTable::Finalize(ClientContext &context, GlobalSinkState &gstate_p) const {
235
- auto &gstate = gstate_p.Cast<RadixHTGlobalState>();
236
- D_ASSERT(!gstate.is_finalized);
237
- gstate.is_finalized = true;
421
+ // Set any_combined, then check one last time whether we need to repartition
422
+ gstate.any_combined = true;
423
+ MaybeRepartition(context.client, gstate, lstate);
238
424
 
239
- // special case if we have non-combinable aggregates
240
- // we have already aggregated into a global shared HT that does not require any additional finalization steps
241
- if (ForceSingleHT(gstate)) {
242
- D_ASSERT(gstate.finalized_hts.size() <= 1);
243
- D_ASSERT(gstate.finalized_hts.empty() || gstate.finalized_hts[0]);
244
- return false;
425
+ auto &ht = *lstate.ht;
426
+ ht.UnpinData();
427
+
428
+ if (lstate.abandoned_data) {
429
+ D_ASSERT(gstate.external);
430
+ D_ASSERT(lstate.abandoned_data->PartitionCount() == lstate.ht->GetPartitionedData()->PartitionCount());
431
+ D_ASSERT(lstate.abandoned_data->PartitionCount() ==
432
+ RadixPartitioning::NumberOfPartitions(gstate.config.GetRadixBits()));
433
+ lstate.abandoned_data->Combine(*lstate.ht->GetPartitionedData());
434
+ } else {
435
+ lstate.abandoned_data = std::move(ht.GetPartitionedData());
245
436
  }
246
437
 
247
- // we can have two cases now, non-partitioned for few groups and radix-partitioned for very many groups.
248
- auto &allocator = BufferAllocator::Get(context);
249
- if (AnyPartitioned(gstate_p)) {
250
- // if one is partitioned, all have to be
251
- // this should mostly have already happened in Combine, but if not we do it here
252
- for (auto &pht : gstate.intermediate_hts) {
253
- if (!pht->IsPartitioned()) {
254
- pht->Partition(true);
255
- }
256
- }
257
- // schedule additional tasks to combine the partial HTs
258
- InitializeFinalizedHTs(context, gstate_p);
259
- gstate.is_partitioned = true;
260
- return true;
261
- } else { // in the non-partitioned case we immediately combine all the unpartitioned hts created by the threads.
262
- // TODO possible optimization, if total count < limit for 32 bit ht, use that one
263
- // create this ht here so finalize needs no lock on gstate
264
-
265
- gstate.finalized_hts.push_back(make_shared<GroupedAggregateHashTable>(
266
- context, allocator, group_types, op.payload_types, op.bindings, HtEntryType::HT_WIDTH_64));
267
- for (auto &pht : gstate.intermediate_hts) {
268
- auto unpartitioned = pht->GetUnpartitioned();
269
- for (auto &unpartitioned_ht : unpartitioned) {
270
- D_ASSERT(unpartitioned_ht);
271
- gstate.finalized_hts[0]->Combine(*unpartitioned_ht);
272
- unpartitioned_ht.reset();
273
- }
274
- unpartitioned.clear();
275
- }
276
- D_ASSERT(gstate.finalized_hts[0]);
277
- gstate.finalized_hts[0]->Finalize();
278
- return false;
438
+ lock_guard<mutex> guard(gstate.lock);
439
+ if (gstate.uncombined_data) {
440
+ gstate.uncombined_data->Combine(*lstate.abandoned_data);
441
+ } else {
442
+ gstate.uncombined_data = std::move(lstate.abandoned_data);
279
443
  }
444
+ gstate.stored_allocators.emplace_back(ht.GetAggregateAllocator());
280
445
  }
281
446
 
282
- // this task is run in multiple threads and combines the radix-partitioned hash tables into a single one and then
283
- // folds them into the global ht finally.
284
- class RadixAggregateFinalizeTask : public ExecutorTask {
285
- public:
286
- RadixAggregateFinalizeTask(Executor &executor, shared_ptr<Event> event_p, RadixHTGlobalState &state_p,
287
- idx_t radix_p)
288
- : ExecutorTask(executor), event(std::move(event_p)), state(state_p), radix(radix_p) {
289
- }
290
-
291
- static void FinalizeHT(RadixHTGlobalState &gstate, idx_t radix) {
292
- D_ASSERT(gstate.partition_info->n_partitions <= gstate.finalized_hts.size());
293
- D_ASSERT(gstate.finalized_hts[radix]);
294
-
295
- idx_t pht_idx_from = 0;
296
- idx_t pht_idx_to = gstate.intermediate_hts.size();
297
- if (gstate.repartitioned) {
298
- const auto num_partitions_before = gstate.repartition_tasks.size();
299
- const auto multiplier = gstate.partition_info->n_partitions / num_partitions_before;
300
- const auto radix_before = radix / multiplier;
301
- pht_idx_from = radix_before * gstate.repartition_tasks_per_partition;
302
- pht_idx_to = pht_idx_from + gstate.repartition_tasks_per_partition;
303
- }
447
+ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstate_p) const {
448
+ auto &gstate = gstate_p.Cast<RadixHTGlobalSinkState>();
449
+
450
+ if (gstate.uncombined_data) {
451
+ auto &uncombined_data = *gstate.uncombined_data;
452
+ gstate.count_before_combining = uncombined_data.Count();
453
+
454
+ // If true there is no need to combine, it was all done by a single thread in a single HT
455
+ const auto single_ht = !gstate.external && gstate.active_threads == 1;
304
456
 
305
- for (idx_t i = pht_idx_from; i < pht_idx_to; i++) {
306
- for (auto &ht : gstate.intermediate_hts[i]->GetPartition(radix)) {
307
- gstate.finalized_hts[radix]->Combine(*ht);
308
- ht.reset();
457
+ auto &uncombined_partition_data = uncombined_data.GetPartitions();
458
+ const auto n_partitions = uncombined_partition_data.size();
459
+ gstate.partitions.reserve(n_partitions);
460
+ for (idx_t i = 0; i < n_partitions; i++) {
461
+ gstate.partitions.emplace_back(make_uniq<AggregatePartition>(std::move(uncombined_partition_data[i])));
462
+ if (single_ht) {
463
+ gstate.finalize_idx++;
464
+ gstate.partitions.back()->finalized = true;
309
465
  }
310
466
  }
311
- gstate.finalized_hts[radix]->Finalize();
467
+ } else {
468
+ gstate.count_before_combining = 0;
312
469
  }
313
470
 
314
- TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
315
- FinalizeHT(state, radix);
316
- event->FinishTask();
317
- return TaskExecutionResult::TASK_FINISHED;
318
- }
471
+ gstate.finalized = true;
472
+ }
319
473
 
320
- private:
321
- shared_ptr<Event> event;
322
- RadixHTGlobalState &state;
323
- idx_t radix;
474
+ //===--------------------------------------------------------------------===//
475
+ // Source
476
+ //===--------------------------------------------------------------------===//
477
+ idx_t RadixPartitionedHashTable::Count(GlobalSinkState &sink_p) const {
478
+ const auto count = CountInternal(sink_p);
479
+ return count == 0 && grouping_set.empty() ? 1 : count;
480
+ }
481
+
482
+ idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
483
+ auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
484
+ return sink.count_before_combining;
485
+ }
486
+
487
+ void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
488
+ auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
489
+ sink.scan_pin_properties = TupleDataPinProperties::UNPIN_AFTER_DONE;
490
+ }
491
+
492
+ enum class RadixHTSourceTaskType : uint8_t { NO_TASK, FINALIZE, SCAN };
493
+
494
+ class RadixHTLocalSourceState;
495
+
496
+ class RadixHTGlobalSourceState : public GlobalSourceState {
497
+ public:
498
+ RadixHTGlobalSourceState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
499
+
500
+ //! Assigns a task to a local source state
501
+ bool AssignTask(RadixHTGlobalSinkState &sink, RadixHTLocalSourceState &lstate);
502
+
503
+ public:
504
+ //! The client context
505
+ ClientContext &context;
506
+ //! For synchronizing the source phase
507
+ atomic<bool> finished;
508
+
509
+ //! Column ids for scanning
510
+ vector<column_t> column_ids;
511
+
512
+ //! For synchronizing scan tasks
513
+ atomic<idx_t> scan_idx;
514
+ atomic<idx_t> scan_done;
324
515
  };
325
516
 
326
- class RadixAggregateRepartitionTask : public ExecutorTask {
517
+ enum class RadixHTScanStatus : uint8_t { INIT, IN_PROGRESS, DONE };
518
+
519
+ class RadixHTLocalSourceState : public LocalSourceState {
327
520
  public:
328
- RadixAggregateRepartitionTask(Executor &executor, shared_ptr<Event> event_p, RadixHTGlobalState &state_p,
329
- idx_t num_partitions_before_p)
330
- : ExecutorTask(executor), event(std::move(event_p)), state(state_p),
331
- num_partitions_before(num_partitions_before_p) {
332
- }
333
-
334
- TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
335
- const auto multiplier = state.partition_info->n_partitions / num_partitions_before;
336
-
337
- idx_t repartition_radix = 0;
338
- idx_t finalize_radix = 0;
339
- while (repartition_radix < num_partitions_before && finalize_radix < state.partition_info->n_partitions) {
340
- // Loop over original partitions until we find one that we can repartition
341
- for (; repartition_radix < num_partitions_before; repartition_radix++) {
342
- auto task_idx = state.repartition_tasks_assigned[repartition_radix]++;
343
- if (task_idx >= state.repartition_tasks_per_partition) {
344
- continue;
345
- }
346
- auto &ht = state.repartition_tasks[repartition_radix][task_idx];
347
- ht->Partition(true);
348
- state.intermediate_hts[repartition_radix * state.repartition_tasks_per_partition + task_idx] =
349
- std::move(ht);
350
- state.repartition_tasks_done[repartition_radix]++;
351
- break;
352
- }
521
+ explicit RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &radix_ht);
353
522
 
354
- // Loop over repartitioned partitions
355
- for (; finalize_radix < state.partition_info->n_partitions; finalize_radix++) {
356
- const auto original_radix = finalize_radix / multiplier;
357
- if (state.repartition_tasks_done[original_radix] != state.repartition_tasks_per_partition) {
358
- break; // Needs more repartitioning
359
- }
360
-
361
- if (state.finalize_assigned[finalize_radix]) {
362
- continue; // Already assigned
363
- }
364
-
365
- {
366
- lock_guard<mutex> guard(state.lock);
367
- if (state.finalize_assigned[finalize_radix]) {
368
- // LCOV_EXCL_START
369
- continue; // Check again with lock, but already assigned
370
- // LCOV_EXCL_STOP
371
- }
372
- state.finalize_assigned[finalize_radix] = true;
373
- }
374
-
375
- // We can finalize!
376
- RadixAggregateFinalizeTask::FinalizeHT(state, finalize_radix);
377
- }
378
- }
379
- event->FinishTask();
380
- return TaskExecutionResult::TASK_FINISHED;
381
- }
523
+ public:
524
+ //! Do the work this thread has been assigned
525
+ void ExecuteTask(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk);
526
+ //! Whether this thread has finished the work it has been assigned
527
+ bool TaskFinished();
382
528
 
383
529
  private:
384
- shared_ptr<Event> event;
385
- RadixHTGlobalState &state;
386
- const idx_t num_partitions_before;
530
+ //! Execute the finalize or scan task
531
+ void Finalize(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate);
532
+ void Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk);
533
+
534
+ public:
535
+ //! Current task and index
536
+ RadixHTSourceTaskType task;
537
+ idx_t task_idx;
538
+
539
+ //! Thread-local HT that is re-used to Finalize
540
+ unique_ptr<GroupedAggregateHashTable> ht;
541
+ //! Current status of a Scan
542
+ RadixHTScanStatus scan_status;
543
+
544
+ private:
545
+ //! Allocator and layout for finalizing state
546
+ TupleDataLayout layout;
547
+ ArenaAllocator aggregate_allocator;
548
+
549
+ //! State and chunk for scanning
550
+ TupleDataScanState scan_state;
551
+ DataChunk scan_chunk;
387
552
  };
388
553
 
389
- void RadixPartitionedHashTable::ScheduleTasks(Executor &executor, const shared_ptr<Event> &event,
390
- GlobalSinkState &state, vector<shared_ptr<Task>> &tasks) const {
391
- auto &gstate = state.Cast<RadixHTGlobalState>();
392
- if (!gstate.is_partitioned) {
393
- return;
554
+ unique_ptr<GlobalSourceState> RadixPartitionedHashTable::GetGlobalSourceState(ClientContext &context) const {
555
+ return make_uniq<RadixHTGlobalSourceState>(context, *this);
556
+ }
557
+
558
+ unique_ptr<LocalSourceState> RadixPartitionedHashTable::GetLocalSourceState(ExecutionContext &context) const {
559
+ return make_uniq<RadixHTLocalSourceState>(context, *this);
560
+ }
561
+
562
+ RadixHTGlobalSourceState::RadixHTGlobalSourceState(ClientContext &context_p, const RadixPartitionedHashTable &radix_ht)
563
+ : context(context_p), finished(false), scan_idx(0), scan_done(0) {
564
+ for (column_t column_id = 0; column_id < radix_ht.group_types.size(); column_id++) {
565
+ column_ids.push_back(column_id);
394
566
  }
567
+ }
395
568
 
396
- idx_t repartition_radix_bits;
397
- idx_t concurrent_repartitions;
398
- idx_t tasks_per_partition;
399
- GetRepartitionInfo(executor.context, state, repartition_radix_bits, concurrent_repartitions, tasks_per_partition);
400
- if (repartition_radix_bits == gstate.partition_info->radix_bits) {
401
- // No repartitioning necessary
402
- for (idx_t r = 0; r < gstate.partition_info->n_partitions; r++) {
403
- D_ASSERT(gstate.partition_info->n_partitions <= gstate.finalized_hts.size());
404
- D_ASSERT(gstate.finalized_hts[r]);
405
- tasks.push_back(make_uniq<RadixAggregateFinalizeTask>(executor, event, gstate, r));
406
- }
407
- } else {
408
- // Schedule repartition / finalize tasks
409
- ScheduleRepartitionTasks(executor, event, state, tasks, repartition_radix_bits, concurrent_repartitions,
410
- tasks_per_partition);
411
- }
412
- }
413
-
414
- void RadixPartitionedHashTable::ScheduleRepartitionTasks(Executor &executor, const shared_ptr<Event> &event,
415
- GlobalSinkState &state, vector<shared_ptr<Task>> &tasks,
416
- const idx_t repartition_radix_bits,
417
- const idx_t concurrent_repartitions,
418
- const idx_t tasks_per_partition) const {
419
- auto &gstate = state.Cast<RadixHTGlobalState>();
-	D_ASSERT(repartition_radix_bits > gstate.partition_info->radix_bits);
-	const auto num_partitions_before = gstate.partition_info->n_partitions;
-	const auto multiplier = RadixPartitioning::NumberOfPartitions(repartition_radix_bits) / num_partitions_before;
-
-	// Initialize gstate
-	auto new_partition_info =
-	    make_uniq<RadixPartitionInfo>(RadixPartitioning::NumberOfPartitions(repartition_radix_bits));
-	gstate.repartitioned = true;
-	gstate.repartition_tasks_per_partition = tasks_per_partition;
-	gstate.repartition_tasks.resize(num_partitions_before);
-	gstate.repartition_tasks_assigned = make_uniq_array<atomic<idx_t>>(num_partitions_before);
-	gstate.repartition_tasks_done = make_uniq_array<atomic<idx_t>>(num_partitions_before);
-	gstate.finalize_assigned = make_uniq_array<atomic<bool>>(new_partition_info->n_partitions);
-	for (idx_t partition_idx = 0; partition_idx < num_partitions_before; partition_idx++) {
-		gstate.repartition_tasks_assigned[partition_idx] = 0;
-		gstate.repartition_tasks_done[partition_idx] = 0;
-
-		// Grab intermediate data from gstate
-		HashTableList partition_list;
-		for (auto &pht : gstate.intermediate_hts) {
-			for (auto &ht : pht->GetPartition(partition_idx)) {
-				partition_list.push_back(std::move(ht));
-			}
-		}
+bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTLocalSourceState &lstate) {
+	D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);
 
-		// Spread the data across the tasks
-		const idx_t hts_per_task = (partition_list.size() + tasks_per_partition - 1) / tasks_per_partition;
-		idx_t ht_idx = 0;
-		for (idx_t task_idx = 0; task_idx < tasks_per_partition; task_idx++) {
-			auto task_ht =
-			    make_uniq<PartitionableHashTable>(executor.context, BufferAllocator::Get(executor.context),
-			                                      *new_partition_info, group_types, op.payload_types, op.bindings);
-			auto ht_idx_to = MinValue<idx_t>(ht_idx + hts_per_task, partition_list.size());
-			for (; ht_idx < ht_idx_to; ht_idx++) {
-				auto &ht = partition_list[ht_idx];
-				task_ht->Append(*ht);
-				ht.reset();
-			}
-			gstate.repartition_tasks[partition_idx].push_back(std::move(task_ht));
-		}
+	const auto n_partitions = sink.partitions.size();
+	if (scan_done == n_partitions) {
+		finished = true;
+		return false;
+	}
+	// We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
 
-		for (idx_t i = 0; i < multiplier; i++) {
-			gstate.finalize_assigned[partition_idx * multiplier + i] = false;
+	// We need an atomic compare-and-swap to assign a Scan task, because we need to only increment
+	// the 'scan_idx' atomic if the 'finalize' of that partition is true, i.e., ready to be scanned
+	bool scan_assigned = true;
+	do {
+		lstate.task_idx = scan_idx.load();
+		if (lstate.task_idx >= n_partitions || !sink.partitions[lstate.task_idx]->finalized) {
+			scan_assigned = false;
+			break;
 		}
-	}
+	} while (!std::atomic_compare_exchange_weak(&scan_idx, &lstate.task_idx, lstate.task_idx + 1));
 
-	// Schedule tasks equal to number of threads
-	const idx_t num_threads = TaskScheduler::GetScheduler(executor.context).NumberOfThreads();
-	for (idx_t i = 0; i < num_threads; i++) {
-		tasks.emplace_back(make_shared<RadixAggregateRepartitionTask>(executor, event, gstate, num_partitions_before));
+	if (scan_assigned) {
+		// We successfully assigned a Scan task
+		D_ASSERT(lstate.task_idx < n_partitions && sink.partitions[lstate.task_idx]->finalized);
+		lstate.task = RadixHTSourceTaskType::SCAN;
+		lstate.scan_status = RadixHTScanStatus::INIT;
+		return true;
 	}
 
-	gstate.intermediate_hts.clear();
-	gstate.intermediate_hts.resize(num_partitions_before * tasks_per_partition);
+	// We can just increment the atomic here, much simpler than assigning the scan task
+	lstate.task_idx = sink.finalize_idx++;
+	if (lstate.task_idx < n_partitions) {
+		// We successfully assigned a Finalize task
+		lstate.task = RadixHTSourceTaskType::FINALIZE;
+		return true;
+	}
 
-	gstate.partition_info = std::move(new_partition_info);
-	InitializeFinalizedHTs(executor.context, state);
+	// We didn't manage to assign a finalize task
+	return false;
 }
 
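A note on the lock-free assignment in `AssignTask` above: `scan_idx` may only move past a partition once that partition's `finalized` flag is set, so a plain `fetch_add` would not be safe for Scan tasks; the compare-and-swap re-checks readiness every time it retries. Below is a minimal, self-contained sketch of the same claiming pattern (illustrative only, not DuckDB code; `Partition`, `TryClaimScan`, and `NUM_PARTITIONS` are invented names):

```cpp
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdio>

// Illustrative stand-ins; the real types in the diff above differ.
struct Partition {
	std::atomic<bool> finalized {false};
};

constexpr std::size_t NUM_PARTITIONS = 8;
static std::array<Partition, NUM_PARTITIONS> partitions;
static std::atomic<std::size_t> scan_idx {0};

// Try to claim the next partition for scanning. The index is only
// advanced if the candidate partition is ready, mirroring AssignTask.
static bool TryClaimScan(std::size_t &out_idx) {
	std::size_t idx = scan_idx.load();
	do {
		if (idx >= NUM_PARTITIONS || !partitions[idx].finalized.load()) {
			return false; // next partition is not ready (or none are left)
		}
		// On failure, compare_exchange_weak reloads 'idx' with the current
		// value of scan_idx, so the readiness check above runs again for
		// the new candidate before the next attempt.
	} while (!scan_idx.compare_exchange_weak(idx, idx + 1));
	out_idx = idx;
	return true;
}

int main() {
	partitions[0].finalized = true;
	std::size_t idx;
	if (TryClaimScan(idx)) {
		std::printf("claimed partition %zu\n", idx); // prints: claimed partition 0
	}
	return 0;
}
```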
-bool RadixPartitionedHashTable::ForceSingleHT(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	return gstate.partition_info->n_partitions < 2;
+RadixHTLocalSourceState::RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &radix_ht)
+    : task(RadixHTSourceTaskType::NO_TASK), scan_status(RadixHTScanStatus::DONE), layout(radix_ht.GetLayout().Copy()),
+      aggregate_allocator(BufferAllocator::Get(context.client)) {
+	auto &allocator = BufferAllocator::Get(context.client);
+	auto scan_chunk_types = radix_ht.group_types;
+	for (auto &aggr_type : radix_ht.op.aggregate_return_types) {
+		scan_chunk_types.push_back(aggr_type);
+	}
+	scan_chunk.Initialize(allocator, scan_chunk_types);
 }
 
-bool RadixPartitionedHashTable::AnyPartitioned(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	for (auto &pht : gstate.intermediate_hts) {
-		if (pht->IsPartitioned()) {
-			return true;
-		}
+void RadixHTLocalSourceState::ExecuteTask(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate,
+                                          DataChunk &chunk) {
+	switch (task) {
+	case RadixHTSourceTaskType::FINALIZE:
+		Finalize(sink, gstate);
+		break;
+	case RadixHTSourceTaskType::SCAN:
+		Scan(sink, gstate, chunk);
+		break;
+	default:
+		throw InternalException("Unexpected RadixHTSourceTaskType in ExecuteTask!");
 	}
-	return false;
 }
 
-void RadixPartitionedHashTable::GetRepartitionInfo(ClientContext &context, GlobalSinkState &state,
-                                                   idx_t &repartition_radix_bits, idx_t &concurrent_repartitions,
-                                                   idx_t &tasks_per_partition) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	const auto num_partitions = gstate.partition_info->n_partitions;
-	const auto radix_bits = gstate.partition_info->radix_bits;
-	D_ASSERT(IsPowerOfTwo(num_partitions));
-
-	vector<idx_t> partition_counts(num_partitions, 0);
-	vector<idx_t> partition_sizes(num_partitions, 0);
-	for (const auto &ht : gstate.intermediate_hts) {
-		for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
-			partition_counts[partition_idx] += ht->GetPartitionCount(partition_idx);
-			partition_sizes[partition_idx] += ht->GetPartitionSize(partition_idx);
-		}
-	}
+void RadixHTLocalSourceState::Finalize(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate) {
+	D_ASSERT(task == RadixHTSourceTaskType::FINALIZE);
+	D_ASSERT(scan_status != RadixHTScanStatus::IN_PROGRESS);
 
-	idx_t total_size = 0;
-	idx_t max_partition_idx = 0;
-	idx_t max_partition_size = 0;
-	for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
-		const auto &partition_count = partition_counts[partition_idx];
-		const auto &partition_size = partition_sizes[partition_idx];
-		auto partition_ht_size =
-		    partition_size + GroupedAggregateHashTable::FirstPartSize(partition_count, HtEntryType::HT_WIDTH_64);
-		if (partition_ht_size > max_partition_size) {
-			max_partition_idx = partition_idx;
-			max_partition_size = partition_ht_size;
-		}
-		total_size += partition_ht_size;
-	}
-
-	// Switch to out-of-core finalize at ~60%
-	const auto max_ht_size = double(0.6) * BufferManager::GetBufferManager(context).GetMaxMemory();
-	const idx_t n_threads = PreviousPowerOfTwo(TaskScheduler::GetScheduler(context).NumberOfThreads());
-	D_ASSERT(IsPowerOfTwo(n_threads));
-	if (!context.config.force_external && total_size < max_ht_size) {
-		// In-memory finalize
-		if (num_partitions >= n_threads) { // Can already keep all threads busy
-			repartition_radix_bits = radix_bits;
-			tasks_per_partition = 1;
-		} else { // Repartition to keep all threads busy
-			// Can't have coverage because RadixHTGlobalState::MAX_RADIX_PARTITIONS > threads on github actions
-			// LCOV_EXCL_START
-			repartition_radix_bits = RadixPartitioning::RadixBits(NextPowerOfTwo(n_threads));
-			tasks_per_partition = n_threads / num_partitions;
-			// LCOV_EXCL_STOP
-		}
-		concurrent_repartitions = num_partitions;
+	auto &partition = *sink.partitions[task_idx];
+	if (partition.data->Count() == 0) {
+		partition.finalized = true;
 		return;
 	}
 
-	// Out-of-core finalize
-	const auto partition_count = partition_counts[max_partition_idx];
-	const auto partition_size = partition_sizes[max_partition_idx];
+	if (!ht) {
+		// Create a HT with sufficient capacity
+		const auto capacity = GroupedAggregateHashTable::GetCapacityForCount(partition.data->Count());
+		ht = sink.radix_ht.CreateHT(gstate.context, capacity, 0);
+	} else {
+		// We may want to resize here to the size of this partition, but for now we just assume uniform partition sizes
+		ht->InitializePartitionedData();
+		ht->ClearPointerTable();
+		ht->ResetCount();
+	}
 
-	const auto max_added_bits = RadixPartitioning::MAX_RADIX_BITS - radix_bits;
-	idx_t added_bits;
-	for (added_bits = 1; added_bits < max_added_bits; added_bits++) {
-		double partition_multiplier = RadixPartitioning::NumberOfPartitions(added_bits);
+	// Now combine the uncombined data using this thread's HT
+	ht->Combine(*partition.data);
+	ht->UnpinData();
 
-		auto new_estimated_count = double(partition_count) / partition_multiplier;
-		auto new_estimated_size = double(partition_size) / partition_multiplier;
-		auto new_estimated_ht_size = new_estimated_size + GroupedAggregateHashTable::FirstPartSize(
-		                                                      new_estimated_count, HtEntryType::HT_WIDTH_64);
+	// Move the combined data back to the partition
+	partition.data =
+	    make_uniq<TupleDataCollection>(BufferManager::GetBufferManager(gstate.context), sink.radix_ht.GetLayout());
+	partition.data->Combine(*ht->GetPartitionedData()->GetPartitions()[0]);
 
-		if (new_estimated_ht_size <= max_ht_size / n_threads) {
-			break; // Max HT size is safe
-		}
-	}
-	repartition_radix_bits = radix_bits + added_bits;
-	concurrent_repartitions = MinValue<idx_t>(MaxValue<idx_t>(1, max_ht_size / max_partition_size), n_threads);
-	tasks_per_partition = NextPowerOfTwo(n_threads / concurrent_repartitions);
+	// Mark partition as ready to scan
+	partition.finalized = true;
+
+	// Make sure this thread's aggregate allocator does not get lost
+	lock_guard<mutex> guard(sink.lock);
+	sink.stored_allocators.emplace_back(ht->GetAggregateAllocator());
 }
 
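The Finalize task above has a deliberate shape: each worker keeps one thread-local hash table that it reuses across partitions (clearing the pointer table and count rather than reallocating), combines a partition's uncombined data into it, writes the combined result back into the partition, and only then sets `finalized` so scanners never observe partial data. Here is a simplified sketch of that combine-then-publish pattern, with a plain `std::unordered_map` standing in for `GroupedAggregateHashTable` (all names below are invented for illustration):

```cpp
#include <atomic>
#include <unordered_map>
#include <utility>
#include <vector>

// Invented stand-ins for the partition/HT machinery in the diff above.
struct SketchPartition {
	std::vector<std::pair<int, long>> data; // uncombined (group, value) pairs
	std::vector<std::pair<int, long>> combined;
	std::atomic<bool> finalized {false};
};

struct SketchWorker {
	// One scratch table per worker thread, reused across partitions;
	// analogous to ClearPointerTable/ResetCount instead of reallocating.
	std::unordered_map<int, long> ht;

	void Finalize(SketchPartition &partition) {
		ht.clear();
		for (const auto &kv : partition.data) {
			ht[kv.first] += kv.second; // combine the uncombined data
		}
		// Publish the combined result back into the partition first...
		partition.combined.assign(ht.begin(), ht.end());
		// ...and only then mark it ready, so a concurrent scanner that
		// observes 'finalized == true' never sees partial data.
		partition.finalized.store(true);
	}
};
```

The real code additionally hands the worker's aggregate allocator to the sink under a lock, so any state the combined rows still reference outlives the worker.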
-//===--------------------------------------------------------------------===//
-// Source
-//===--------------------------------------------------------------------===//
-class RadixHTGlobalSourceState : public GlobalSourceState {
-public:
-	explicit RadixHTGlobalSourceState(Allocator &allocator, const RadixPartitionedHashTable &ht)
-	    : ht_index(0), initialized(false), finished(false) {
-	}
+void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk) {
+	D_ASSERT(task == RadixHTSourceTaskType::SCAN);
+	D_ASSERT(scan_status != RadixHTScanStatus::DONE);
 
-	//! Heavy handed for now.
-	mutex lock;
-	//! The current position to scan the HT for output tuples
-	idx_t ht_index;
-	//! The set of aggregate scan states
-	unsafe_unique_array<TupleDataParallelScanState> ht_scan_states;
-	atomic<bool> initialized;
-	atomic<bool> finished;
-};
+	auto &partition = *sink.partitions[task_idx];
+	D_ASSERT(partition.finalized);
+	auto &data_collection = *partition.data;
 
-class RadixHTLocalSourceState : public LocalSourceState {
-public:
-	explicit RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &ht) {
-		auto &allocator = BufferAllocator::Get(context.client);
-		auto scan_chunk_types = ht.group_types;
-		for (auto &aggr_type : ht.op.aggregate_return_types) {
-			scan_chunk_types.push_back(aggr_type);
+	if (data_collection.Count() == 0) {
+		scan_status = RadixHTScanStatus::DONE;
+		if (++gstate.scan_done == sink.partitions.size()) {
+			gstate.finished = true;
 		}
-		scan_chunk.Initialize(allocator, scan_chunk_types);
+		return;
 	}
 
-	//! Materialized GROUP BY expressions & aggregates
-	DataChunk scan_chunk;
-	//! HT index
-	idx_t ht_index = DConstants::INVALID_INDEX;
-	//! A reference to the current HT that we are scanning
-	shared_ptr<GroupedAggregateHashTable> ht;
-	//! Scan state for the current HT
-	TupleDataLocalScanState scan_state;
-};
+	if (scan_status == RadixHTScanStatus::INIT) {
+		data_collection.InitializeScan(scan_state, gstate.column_ids, sink.scan_pin_properties);
+		scan_status = RadixHTScanStatus::IN_PROGRESS;
+	}
 
-unique_ptr<GlobalSourceState> RadixPartitionedHashTable::GetGlobalSourceState(ClientContext &context) const {
-	return make_uniq<RadixHTGlobalSourceState>(BufferAllocator::Get(context), *this);
-}
+	if (!data_collection.Scan(scan_state, scan_chunk)) {
+		scan_status = RadixHTScanStatus::DONE;
+		if (++gstate.scan_done == sink.partitions.size()) {
+			gstate.finished = true;
+		}
+		if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
+			data_collection.Reset();
+		}
+		return;
+	}
 
-unique_ptr<LocalSourceState> RadixPartitionedHashTable::GetLocalSourceState(ExecutionContext &context) const {
-	return make_uniq<RadixHTLocalSourceState>(context, *this);
-}
+	RowOperationsState row_state(aggregate_allocator);
+	const auto group_cols = layout.ColumnCount() - 1;
+	RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
 
-idx_t RadixPartitionedHashTable::Size(GlobalSinkState &sink_state) const {
-	auto &gstate = sink_state.Cast<RadixHTGlobalState>();
-	if (gstate.is_empty && grouping_set.empty()) {
-		return 1;
+	if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE && layout.HasDestructor()) {
+		RowOperations::DestroyStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk.size());
 	}
 
-	idx_t count = 0;
-	for (const auto &ht : gstate.finalized_hts) {
-		count += ht->Count();
+	auto &radix_ht = sink.radix_ht;
+	idx_t chunk_index = 0;
+	for (auto &entry : radix_ht.grouping_set) {
+		chunk.data[entry].Reference(scan_chunk.data[chunk_index++]);
+	}
+	for (auto null_group : radix_ht.null_groups) {
+		chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+		ConstantVector::SetNull(chunk.data[null_group], true);
+	}
+	D_ASSERT(radix_ht.grouping_set.size() + radix_ht.null_groups.size() == radix_ht.op.GroupCount());
+	for (idx_t col_idx = 0; col_idx < radix_ht.op.aggregates.size(); col_idx++) {
+		chunk.data[radix_ht.op.GroupCount() + col_idx].Reference(
+		    scan_chunk.data[radix_ht.group_types.size() + col_idx]);
+	}
+	D_ASSERT(radix_ht.op.grouping_functions.size() == radix_ht.grouping_values.size());
+	for (idx_t i = 0; i < radix_ht.op.grouping_functions.size(); i++) {
+		chunk.data[radix_ht.op.GroupCount() + radix_ht.op.aggregates.size() + i].Reference(radix_ht.grouping_values[i]);
+	}
+	chunk.SetCardinality(scan_chunk);
+	D_ASSERT(chunk.size() != 0);
+}
+
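The Scan routine above is a small per-task state machine: INIT performs one-time scan initialization, IN_PROGRESS emits chunks, and the transition to DONE (on an empty or exhausted collection) bumps the shared `scan_done` counter, with the last partition flipping the global `finished` flag. A reduced sketch of those transitions (illustrative only; `ScanSketch` and `Emit` are invented names):

```cpp
#include <atomic>
#include <cstddef>

// The scan lifecycle from the diff above, reduced to its state machine.
enum class ScanState { INIT, IN_PROGRESS, DONE };

struct ScanSketch {
	ScanState state = ScanState::INIT;

	// 'rows_left' stands in for data_collection.Scan() producing chunks.
	// Returns true while output was produced; on exhaustion, the last
	// partition to finish marks the whole source as finished.
	bool Emit(std::size_t &rows_left, std::atomic<std::size_t> &scan_done,
	          std::size_t n_partitions, std::atomic<bool> &finished) {
		if (state == ScanState::INIT) {
			state = ScanState::IN_PROGRESS; // one-time scan initialization
		}
		if (rows_left == 0) {
			state = ScanState::DONE;
			if (++scan_done == n_partitions) {
				finished.store(true);
			}
			return false;
		}
		--rows_left; // emit one "chunk"
		return true;
	}
};
```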
+bool RadixHTLocalSourceState::TaskFinished() {
+	switch (task) {
+	case RadixHTSourceTaskType::FINALIZE:
+		return true;
+	case RadixHTSourceTaskType::SCAN:
+		return scan_status == RadixHTScanStatus::DONE;
+	default:
+		D_ASSERT(task == RadixHTSourceTaskType::NO_TASK);
+		return true;
 	}
-	return count;
 }
 
 SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, DataChunk &chunk,
-                                                    GlobalSinkState &sink_state, OperatorSourceInput &input) const {
-	auto &gstate = sink_state.Cast<RadixHTGlobalState>();
-	auto &state = input.global_state.Cast<RadixHTGlobalSourceState>();
+                                                    GlobalSinkState &sink_p, OperatorSourceInput &input) const {
+	auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
+	D_ASSERT(sink.finalized);
+
+	auto &gstate = input.global_state.Cast<RadixHTGlobalSourceState>();
 	auto &lstate = input.local_state.Cast<RadixHTLocalSourceState>();
-	D_ASSERT(gstate.is_finalized);
-	if (state.finished) {
+	D_ASSERT(sink.scan_pin_properties == TupleDataPinProperties::UNPIN_AFTER_DONE ||
+	         sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE);
+
+	if (gstate.finished) {
 		return SourceResultType::FINISHED;
 	}
 
-	// special case hack to sort out aggregating from empty intermediates
-	// for aggregations without groups
-	if (gstate.is_empty && grouping_set.empty()) {
+	// Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+	if (CountInternal(sink_p) == 0 && grouping_set.empty()) {
 		D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
-		// for each column in the aggregates, set to initial state
+		// For each column in the aggregates, set to initial state
 		chunk.SetCardinality(1);
 		for (auto null_group : null_groups) {
 			chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -666,97 +786,17 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
 		for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
 			chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
 		}
-		state.finished = true;
-		return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-	}
-	if (gstate.is_empty) {
-		state.finished = true;
-		return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-	}
-	idx_t elements_found = 0;
-
-	lstate.scan_chunk.Reset();
-	if (!state.initialized) {
-		lock_guard<mutex> l(state.lock);
-		if (!state.initialized) {
-			auto &finalized_hts = gstate.finalized_hts;
-			state.ht_scan_states = make_unsafe_uniq_array<TupleDataParallelScanState>(finalized_hts.size());
-
-			const auto &layout = gstate.finalized_hts[0]->GetDataCollection().GetLayout();
-			vector<column_t> column_ids;
-			column_ids.reserve(layout.ColumnCount() - 1);
-			for (idx_t col_idx = 0; col_idx < layout.ColumnCount() - 1; col_idx++) {
-				column_ids.emplace_back(col_idx);
-			}
-
-			for (idx_t ht_idx = 0; ht_idx < finalized_hts.size(); ht_idx++) {
-				gstate.finalized_hts[ht_idx]->GetDataCollection().InitializeScan(
-				    state.ht_scan_states.get()[ht_idx].scan_state, column_ids);
-			}
-			state.initialized = true;
-		}
+		gstate.finished = true;
+		return SourceResultType::HAVE_MORE_OUTPUT;
 	}
 
-	auto &local_scan_state = lstate.scan_state;
-	while (true) {
-		D_ASSERT(state.ht_scan_states);
-		idx_t ht_index;
-		{
-			lock_guard<mutex> l(state.lock);
-			ht_index = state.ht_index;
-			if (ht_index >= gstate.finalized_hts.size()) {
-				state.finished = true;
-				return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-			}
-		}
-		D_ASSERT(ht_index < gstate.finalized_hts.size());
-		if (lstate.ht_index != DConstants::INVALID_INDEX && ht_index != lstate.ht_index) {
-			lstate.ht->GetDataCollection().FinalizePinState(local_scan_state.pin_state);
-		}
-		lstate.ht_index = ht_index;
-		lstate.ht = gstate.finalized_hts[ht_index];
-		D_ASSERT(lstate.ht);
-
-		auto &global_scan_state = state.ht_scan_states[ht_index];
-		elements_found = lstate.ht->Scan(global_scan_state, local_scan_state, lstate.scan_chunk);
-		if (elements_found > 0) {
-			break;
-		}
-		lstate.ht->GetDataCollection().FinalizePinState(local_scan_state.pin_state);
-
-		// move to the next hash table
-		lock_guard<mutex> l(state.lock);
-		ht_index++;
-		if (ht_index > state.ht_index) {
-			// we have not yet worked on the table
-			// move the global index forwards
-			if (!gstate.multi_scan) {
-				gstate.finalized_hts[state.ht_index].reset();
-			}
-			state.ht_index = ht_index;
+	while (!gstate.finished && chunk.size() == 0) {
+		if (!lstate.TaskFinished() || gstate.AssignTask(sink, lstate)) {
+			lstate.ExecuteTask(sink, gstate, chunk);
 		}
 	}
 
-	// compute the final projection list
-	chunk.SetCardinality(elements_found);
-
-	idx_t chunk_index = 0;
-	for (auto &entry : grouping_set) {
-		chunk.data[entry].Reference(lstate.scan_chunk.data[chunk_index++]);
-	}
-	for (auto null_group : null_groups) {
-		chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
-		ConstantVector::SetNull(chunk.data[null_group], true);
-	}
-	D_ASSERT(grouping_set.size() + null_groups.size() == op.GroupCount());
-	for (idx_t col_idx = 0; col_idx < op.aggregates.size(); col_idx++) {
-		chunk.data[op.GroupCount() + col_idx].Reference(lstate.scan_chunk.data[group_types.size() + col_idx]);
-	}
-	D_ASSERT(op.grouping_functions.size() == grouping_values.size());
-	for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-		chunk.data[op.GroupCount() + op.aggregates.size() + i].Reference(grouping_values[i]);
-	}
-	return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
+	return SourceResultType::HAVE_MORE_OUTPUT;
 }
 
 } // namespace duckdb
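Taken together, the reworked `GetData` is a small cooperative scheduler: each call keeps the thread busy until it either produces a chunk or all partitions have been finalized and scanned, resuming an unfinished task before claiming a new one. A compile-only sketch of that control flow (invented names; as in `AssignTask` above, the assignment step is what eventually sets `finished`):

```cpp
#include <atomic>
#include <functional>

// Invented stand-ins for the source-state interplay in GetData above.
struct LoopSketch {
	std::atomic<bool> finished {false};
	bool task_unfinished = false;          // stands in for !lstate.TaskFinished()
	std::function<bool()> try_assign_task; // stands in for gstate.AssignTask(...)
	std::function<bool()> execute_task;    // stands in for lstate.ExecuteTask(...); true if a chunk was produced

	// Mirrors: while (!gstate.finished && chunk.size() == 0) { ... }
	bool PullChunk() {
		bool have_output = false;
		while (!finished.load() && !have_output) {
			// Resume an unfinished task before claiming a new one; if neither
			// applies, keep looping until another thread (or try_assign_task
			// itself) sets 'finished' once all partitions are scanned.
			if (task_unfinished || try_assign_task()) {
				have_output = execute_task();
			}
		}
		return have_output;
	}
};
```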