duckdb 0.7.2-dev1188.0 → 0.7.2-dev1238.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_timestamp.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -0
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +24 -5
- package/src/duckdb/extension/parquet/parquet_timestamp.cpp +1 -1
- package/src/duckdb/src/common/string_util.cpp +14 -0
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +88 -69
- package/src/duckdb/src/execution/join_hashtable.cpp +3 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +2 -1
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +7 -4
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +6 -6
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +84 -33
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +24 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -20
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -1
package/package.json
CHANGED
package/src/duckdb/extension/parquet/column_reader.cpp
CHANGED
@@ -1401,7 +1401,7 @@ unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const
 	case LogicalTypeId::TIME_TZ:
 		if (schema_p.__isset.logicalType && schema_p.logicalType.__isset.TIME) {
 			if (schema_p.logicalType.TIME.unit.__isset.MILLIS) {
-				return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTimeMs>>(
+				return make_unique<CallbackColumnReader<int32_t, dtime_t, ParquetIntToTimeMs>>(
 				    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 			} else if (schema_p.logicalType.TIME.unit.__isset.MICROS) {
 				return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTime>>(
@@ -1416,7 +1416,7 @@ unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const
 		return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTime>>(
 		    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 	case ConvertedType::TIME_MILLIS:
-		return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTimeMs>>(
+		return make_unique<CallbackColumnReader<int32_t, dtime_t, ParquetIntToTimeMs>>(
 		    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 	default:
 		break;
package/src/duckdb/extension/parquet/include/parquet_timestamp.hpp
CHANGED
@@ -22,7 +22,7 @@ timestamp_t ParquetTimestampMicrosToTimestamp(const int64_t &raw_ts);
 timestamp_t ParquetTimestampMsToTimestamp(const int64_t &raw_ts);
 timestamp_t ParquetTimestampNsToTimestamp(const int64_t &raw_ts);
 date_t ParquetIntToDate(const int32_t &raw_date);
-dtime_t ParquetIntToTimeMs(const int64_t &raw_time);
+dtime_t ParquetIntToTimeMs(const int32_t &raw_time);
 dtime_t ParquetIntToTime(const int64_t &raw_time);
 dtime_t ParquetIntToTimeNs(const int64_t &raw_time);
 
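This signature narrowing is the thread running through all the Parquet changes in this release: TIME_MILLIS values are physically 32-bit integers in Parquet, so the converter now takes an int32_t. The function body is not part of this diff, so the sketch below is an assumption built only on DuckDB's convention that dtime_t stores microseconds since midnight:

#include <cstdint>
#include <iostream>

// Minimal stand-in for duckdb::dtime_t, which stores microseconds since midnight.
struct dtime_t {
	int64_t micros;
};

// Assumed conversion body (only the declaration appears in this diff): widen the
// 32-bit millisecond count before scaling so the multiply cannot overflow.
static dtime_t ParquetIntToTimeMs(const int32_t &raw_time) {
	return dtime_t {static_cast<int64_t>(raw_time) * 1000};
}

int main() {
	// 12:30:45.678 is 45,045,678 ms after midnight -> prints 45045678000.
	std::cout << ParquetIntToTimeMs(45045678).micros << "\n";
	return 0;
}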
package/src/duckdb/extension/parquet/parquet_reader.cpp
CHANGED
@@ -200,6 +200,11 @@ LogicalType ParquetReader::DeriveLogicalType(const SchemaElement &s_ele, bool bi
 			throw IOException("UTF8 converted type can only be set for Type::(FIXED_LEN_)BYTE_ARRAY");
 		}
 	case ConvertedType::TIME_MILLIS:
+		if (s_ele.type == Type::INT32) {
+			return LogicalType::TIME;
+		} else {
+			throw IOException("TIME_MILLIS converted type can only be set for value of Type::INT32");
+		}
 	case ConvertedType::TIME_MICROS:
 		if (s_ele.type == Type::INT64) {
 			return LogicalType::TIME;
package/src/duckdb/extension/parquet/parquet_statistics.cpp
CHANGED
@@ -1,12 +1,11 @@
 #include "parquet_statistics.hpp"
 #include "parquet_decimal_utils.hpp"
 #include "parquet_timestamp.hpp"
-
 #include "duckdb.hpp"
 #ifndef DUCKDB_AMALGAMATION
 #include "duckdb/common/types/blob.hpp"
 #include "duckdb/common/types/value.hpp"
-
+#include "duckdb/common/types/time.hpp"
 #endif
 
 namespace duckdb {
@@ -155,11 +154,31 @@ Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type,
 		return Value::DATE(date_t(Load<int32_t>((data_ptr_t)stats.c_str())));
 	case LogicalTypeId::TIME:
 	case LogicalTypeId::TIME_TZ: {
-		if (stats.size() != sizeof(int64_t)) {
+		int64_t val;
+		if (stats.size() == sizeof(int32_t)) {
+			val = Load<int32_t>((data_ptr_t)stats.c_str());
+		} else if (stats.size() == sizeof(int64_t)) {
+			val = Load<int64_t>((data_ptr_t)stats.c_str());
+		} else {
 			throw InternalException("Incorrect stats size for type TIME");
 		}
-		return Value::TIME(dtime_t(Load<int64_t>((data_ptr_t)stats.c_str())));
-
+		if (schema_ele.__isset.logicalType && schema_ele.logicalType.__isset.TIME) {
+			// logical type
+			if (schema_ele.logicalType.TIME.unit.__isset.MILLIS) {
+				return Value::TIME(Time::FromTimeMs(val));
+			} else if (schema_ele.logicalType.TIME.unit.__isset.NANOS) {
+				return Value::TIME(Time::FromTimeNs(val));
+			} else if (schema_ele.logicalType.TIME.unit.__isset.MICROS) {
+				return Value::TIME(dtime_t(val));
+			} else {
+				throw InternalException("Time logicalType is set but unit is not defined");
+			}
+		}
+		if (schema_ele.converted_type == duckdb_parquet::format::ConvertedType::TIME_MILLIS) {
+			return Value::TIME(Time::FromTimeMs(val));
+		} else {
+			return Value::TIME(dtime_t(val));
+		}
 	}
 	case LogicalTypeId::TIMESTAMP:
 	case LogicalTypeId::TIMESTAMP_TZ: {
package/src/duckdb/src/common/string_util.cpp
CHANGED
@@ -11,9 +11,23 @@
 #include <sstream>
 #include <stdarg.h>
 #include <string.h>
+#include <random>
 
 namespace duckdb {
 
+string StringUtil::GenerateRandomName(idx_t length) {
+	std::random_device rd;
+	std::mt19937 gen(rd());
+	std::uniform_int_distribution<> dis(0, 15);
+
+	std::stringstream ss;
+	ss << std::hex;
+	for (idx_t i = 0; i < length; i++) {
+		ss << dis(gen);
+	}
+	return ss.str();
+}
+
 bool StringUtil::Contains(const string &haystack, const string &needle) {
 	return (haystack.find(needle) != string::npos);
 }
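The new helper emits one hex digit per iteration, so callers get a lowercase hexadecimal string of exactly `length` characters, reseeded from std::random_device on every call. A minimal usage sketch against the declaration this release adds to string_util.hpp:

#include <iostream>
#include "duckdb/common/string_util.hpp"

int main() {
	// Two calls will almost certainly differ,
	// e.g. "3fa9c04d17b2e86a" then "b01c77d2a94e03f5" (illustrative values).
	auto a = duckdb::StringUtil::GenerateRandomName(16);
	auto b = duckdb::StringUtil::GenerateRandomName(16);
	std::cout << a << "\n" << b << "\n";
	return 0;
}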
package/src/duckdb/src/execution/aggregate_hashtable.cpp
CHANGED
@@ -21,9 +21,9 @@ using ValidityBytes = RowLayout::ValidityBytes;
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
                                                      vector<LogicalType> group_types, vector<LogicalType> payload_types,
                                                      const vector<BoundAggregateExpression *> &bindings,
-                                                     HtEntryType entry_type)
+                                                     HtEntryType entry_type, idx_t initial_capacity)
     : GroupedAggregateHashTable(context, allocator, std::move(group_types), std::move(payload_types),
-                                AggregateObject::CreateAggregateObjects(bindings), entry_type) {
+                                AggregateObject::CreateAggregateObjects(bindings), entry_type, initial_capacity) {
 }
 
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
@@ -31,17 +31,19 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
     : GroupedAggregateHashTable(context, allocator, std::move(group_types), {}, vector<AggregateObject>()) {
 }
 
+AggregateHTAppendState::AggregateHTAppendState()
+    : ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
+      group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
+      empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE), addresses(LogicalType::POINTER) {
+}
+
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
                                                      vector<LogicalType> group_types_p,
                                                      vector<LogicalType> payload_types_p,
                                                      vector<AggregateObject> aggregate_objects_p,
-                                                     HtEntryType entry_type)
+                                                     HtEntryType entry_type, idx_t initial_capacity)
     : BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)),
-      entry_type(entry_type), capacity(0), entries(0), payload_page_offset(0), is_finalized(false),
-      ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
-      group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
-      empty_vector(STANDARD_VECTOR_SIZE) {
-
+      entry_type(entry_type), capacity(0), entries(0), payload_page_offset(0), is_finalized(false) {
 	// Append hash column to the end and initialise the row layout
 	group_types_p.emplace_back(LogicalType::HASH);
 	layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
@@ -59,12 +61,12 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	switch (entry_type) {
 	case HtEntryType::HT_WIDTH_64: {
 		hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_64::salt)) * 8;
-		Resize<aggr_ht_entry_64>(STANDARD_VECTOR_SIZE * 2ULL);
+		Resize<aggr_ht_entry_64>(initial_capacity);
 		break;
 	}
 	case HtEntryType::HT_WIDTH_32: {
 		hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_32::salt)) * 8;
-		Resize<aggr_ht_entry_32>(STANDARD_VECTOR_SIZE * 2ULL);
+		Resize<aggr_ht_entry_32>(initial_capacity);
 		break;
 	}
 	default:
@@ -155,6 +157,10 @@ void GroupedAggregateHashTable::VerifyInternal() {
 	D_ASSERT(count == entries);
 }
 
+idx_t GroupedAggregateHashTable::InitialCapacity() {
+	return STANDARD_VECTOR_SIZE * 2ULL;
+}
+
 idx_t GroupedAggregateHashTable::GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size) {
 	idx_t max_pages;
 	idx_t max_tuples;
@@ -213,7 +219,6 @@ void GroupedAggregateHashTable::Resize(idx_t size) {
 		hashes_hdl_ptr = hashes_hdl.Ptr();
 	}
 	memset(hashes_hdl_ptr, 0, byte_size);
-	hashes_end_ptr = hashes_hdl_ptr + byte_size;
 	capacity = size;
 
 	auto hashes_arr = (ENTRY *)hashes_hdl_ptr;
@@ -240,7 +245,8 @@ void GroupedAggregateHashTable::Resize(idx_t size) {
 	Verify();
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, AggregateType filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
+                                          AggregateType filter) {
 	vector<idx_t> aggregate_filter;
 
 	auto &aggregates = layout.GetAggregates();
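With InitialCapacity() above and ResizeThreshold() (added a few hunks below), the growth policy reads: start at twice the vector size, and double whenever the incoming group chunk no longer fits or the entry count crosses capacity divided by the load factor. A self-contained trace of that rule, with STANDARD_VECTOR_SIZE and LOAD_FACTOR values assumed purely for illustration (the real constants live in the DuckDB headers):

#include <cstdint>
#include <iostream>

int main() {
	const uint64_t kVectorSize = 2048;   // assumed STANDARD_VECTOR_SIZE
	const uint64_t kLoadFactor = 2;      // assumed LOAD_FACTOR
	uint64_t capacity = kVectorSize * 2; // InitialCapacity()
	uint64_t entries = 0;
	for (uint64_t added = 0; added < 10000; ++added) {
		// mirrors: capacity - entries <= groups.size() || entries > ResizeThreshold()
		if (capacity - entries <= 1 || entries > capacity / kLoadFactor) {
			capacity *= 2; // Resize<ENTRY>(capacity * 2)
		}
		++entries;
	}
	std::cout << "entries=" << entries << " capacity=" << capacity << "\n";
	// prints: entries=10000 capacity=32768
	return 0;
}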
@@ -250,34 +256,32 @@ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload,
 			aggregate_filter.push_back(i);
 		}
 	}
-	return AddChunk(groups, payload, aggregate_filter);
+	return AddChunk(state, groups, payload, aggregate_filter);
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
+                                          const vector<idx_t> &filter) {
 	Vector hashes(LogicalType::HASH);
 	groups.Hash(hashes);
 
-	return AddChunk(groups, hashes, payload, filter);
+	return AddChunk(state, groups, hashes, payload, filter);
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashes,
-                                          DataChunk &payload, const vector<idx_t> &filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+                                          DataChunk &payload, const vector<idx_t> &filter) {
 	D_ASSERT(!is_finalized);
 
 	if (groups.size() == 0) {
 		return 0;
 	}
-	// dummy
-	SelectionVector new_groups(STANDARD_VECTOR_SIZE);
 
 	D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
 	for (idx_t i = 0; i < groups.ColumnCount(); i++) {
 		D_ASSERT(groups.GetTypes()[i] == layout.GetTypes()[i]);
 	}
 
-	Vector addresses(LogicalType::POINTER);
-	auto new_group_count = FindOrCreateGroups(groups, group_hashes, addresses, new_groups);
-	VectorOperations::AddInPlace(addresses, layout.GetAggrOffset(), payload.size());
+	auto new_group_count = FindOrCreateGroups(state, groups, group_hashes, state.addresses, state.new_groups);
+	VectorOperations::AddInPlace(state.addresses, layout.GetAggrOffset(), payload.size());
 
 	// now every cell has an entry
 	// update the aggregates
@@ -290,20 +294,21 @@ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashe
 		if (filter_idx >= filter.size() || i < filter[filter_idx]) {
 			// Skip all the aggregates that are not in the filter
 			payload_idx += aggr.child_count;
-			VectorOperations::AddInPlace(addresses, aggr.payload_size, payload.size());
+			VectorOperations::AddInPlace(state.addresses, aggr.payload_size, payload.size());
 			continue;
 		}
 		D_ASSERT(i == filter[filter_idx]);
 
 		if (aggr.aggr_type != AggregateType::DISTINCT && aggr.filter) {
-			RowOperations::UpdateFilteredStates(filter_set.GetFilterData(i), aggr, addresses, payload, payload_idx);
+			RowOperations::UpdateFilteredStates(filter_set.GetFilterData(i), aggr, state.addresses, payload,
+			                                    payload_idx);
 		} else {
-			RowOperations::UpdateStates(aggr, addresses, payload, payload_idx, payload.size());
+			RowOperations::UpdateStates(aggr, state.addresses, payload, payload_idx, payload.size());
 		}
 
 		// move to the next aggregate
 		payload_idx += aggr.child_count;
-		VectorOperations::AddInPlace(addresses, aggr.payload_size, payload.size());
+		VectorOperations::AddInPlace(state.addresses, aggr.payload_size, payload.size());
 		filter_idx++;
 	}
 
@@ -321,16 +326,23 @@ void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &re
 	if (groups.size() == 0) {
 		return;
 	}
+
 	// find the groups associated with the addresses
 	// FIXME: this should not use the FindOrCreateGroups, creating them is unnecessary
+	AggregateHTAppendState append_state;
 	Vector addresses(LogicalType::POINTER);
-	FindOrCreateGroups(groups, addresses);
+	FindOrCreateGroups(append_state, groups, addresses);
 	// now fetch the aggregates
 	RowOperations::FinalizeStates(layout, addresses, result, 0);
 }
 
+idx_t GroupedAggregateHashTable::ResizeThreshold() {
+	return capacity / LOAD_FACTOR;
+}
+
 template <class ENTRY>
-idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes, Vector &addresses,
+idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups,
+                                                            Vector &group_hashes, Vector &addresses,
                                                             SelectionVector &new_groups_out) {
 	D_ASSERT(!is_finalized);
 
@@ -339,7 +351,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 	}
 
 	// resize at 50% capacity, also need to fit the entire vector
-	if (capacity - entries <= groups.size() || entries > capacity / LOAD_FACTOR) {
+	if (capacity - entries <= groups.size() || entries > ResizeThreshold()) {
 		Resize<ENTRY>(capacity * 2);
 	}
 
@@ -352,42 +364,47 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 	group_hashes.Flatten(groups.size());
 	auto group_hashes_ptr = FlatVector::GetData<hash_t>(group_hashes);
 
-	D_ASSERT(ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
-	D_ASSERT(ht_offsets.GetType() == LogicalType::BIGINT);
+	D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
+	D_ASSERT(state.ht_offsets.GetType() == LogicalType::BIGINT);
 
 	D_ASSERT(addresses.GetType() == LogicalType::POINTER);
 	addresses.Flatten(groups.size());
 	auto addresses_ptr = FlatVector::GetData<data_ptr_t>(addresses);
 
-	// compute the entry in the table based on the hash using a modulo
-	UnaryExecutor::Execute<hash_t, uint64_t>(group_hashes, ht_offsets, groups.size(), [&](hash_t element) {
+	// compute the entry in the table based on the hash using a modulo
+	// and precompute the hash salts for faster comparison below
+	D_ASSERT(state.hash_salts.GetType() == LogicalType::SMALLINT);
+	auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(state.ht_offsets);
+	auto hash_salts_ptr = FlatVector::GetData<uint16_t>(state.hash_salts);
+	for (idx_t r = 0; r < groups.size(); r++) {
+		auto element = group_hashes_ptr[r];
 		D_ASSERT((element & bitmask) == (element % capacity));
-		return (element & bitmask);
-	});
-	auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(ht_offsets);
-
-	// precompute the hash salts for faster comparison below
-	D_ASSERT(hash_salts.GetType() == LogicalType::SMALLINT);
-	UnaryExecutor::Execute<hash_t, uint16_t>(group_hashes, hash_salts, groups.size(),
-	                                         [&](hash_t element) { return (element >> hash_prefix_shift); });
-	auto hash_salts_ptr = FlatVector::GetData<uint16_t>(hash_salts);
-
+		ht_offsets_ptr[r] = element & bitmask;
+		hash_salts_ptr[r] = element >> hash_prefix_shift;
+	}
 	// we start out with all entries [0, 1, 2, ..., groups.size()]
 	const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector();
 
 	idx_t remaining_entries = groups.size();
 
 	// make a chunk that references the groups and the hashes
-	DataChunk group_chunk;
-	group_chunk.InitializeEmpty(layout.GetTypes());
+	if (state.group_chunk.ColumnCount() == 0) {
+		state.group_chunk.InitializeEmpty(layout.GetTypes());
+	}
+	D_ASSERT(state.group_chunk.ColumnCount() == layout.GetTypes().size());
 	for (idx_t grp_idx = 0; grp_idx < groups.ColumnCount(); grp_idx++) {
-		group_chunk.data[grp_idx].Reference(groups.data[grp_idx]);
+		state.group_chunk.data[grp_idx].Reference(groups.data[grp_idx]);
 	}
-	group_chunk.data[groups.ColumnCount()].Reference(group_hashes);
-	group_chunk.SetCardinality(groups);
+	state.group_chunk.data[groups.ColumnCount()].Reference(group_hashes);
+	state.group_chunk.SetCardinality(groups);
 
 	// convert all vectors to unified format
-	auto group_data = group_chunk.ToUnifiedFormat();
+	if (!state.group_data) {
+		state.group_data = unique_ptr<UnifiedVectorFormat[]>(new UnifiedVectorFormat[state.group_chunk.ColumnCount()]);
+	}
+	for (idx_t col_idx = 0; col_idx < state.group_chunk.ColumnCount(); col_idx++) {
+		state.group_chunk.data[col_idx].ToUnifiedFormat(state.group_chunk.size(), state.group_data[col_idx]);
+	}
 
 	idx_t new_group_count = 0;
 	while (remaining_entries > 0) {
@@ -420,7 +437,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 				ht_entry_ptr->page_offset = payload_page_offset++;
 
 				// update selection lists for outer loops
-				empty_vector.set_index(new_entry_count++, index);
+				state.empty_vector.set_index(new_entry_count++, index);
 				new_groups_out.set_index(new_group_count++, index);
 				entries++;
 
@@ -430,37 +447,37 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 			// cell is occupied: add to check list
 			// only need to check if hash salt in ptr == prefix of hash in payload
 			if (ht_entry_ptr->salt == hash_salts_ptr[index]) {
-				group_compare_vector.set_index(need_compare_count++, index);
+				state.group_compare_vector.set_index(need_compare_count++, index);
 
 				auto page_ptr = payload_hds_ptrs[ht_entry_ptr->page_nr - 1];
 				auto page_offset = ht_entry_ptr->page_offset * tuple_size;
 				addresses_ptr[index] = page_ptr + page_offset;
 
 			} else {
-				no_match_vector.set_index(no_match_count++, index);
+				state.no_match_vector.set_index(no_match_count++, index);
 			}
 		}
 	}
 
 	// for each of the locations that are empty, serialize the group columns to the locations
-	RowOperations::Scatter(group_chunk, group_data.get(), layout, addresses, *string_heap, empty_vector,
-	                       new_entry_count);
-	RowOperations::InitializeStates(layout, addresses, empty_vector, new_entry_count);
+	RowOperations::Scatter(state.group_chunk, state.group_data.get(), layout, addresses, *string_heap,
+	                       state.empty_vector, new_entry_count);
+	RowOperations::InitializeStates(layout, addresses, state.empty_vector, new_entry_count);
 
 	// now we have only the tuples remaining that might match to an existing group
 	// start performing comparisons with each of the groups
-	RowOperations::Match(group_chunk, group_data.get(), layout, addresses, predicates, group_compare_vector,
-	                     need_compare_count, &no_match_vector, no_match_count);
+	RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses, predicates,
+	                     state.group_compare_vector, need_compare_count, &state.no_match_vector, no_match_count);
 
 	// each of the entries that do not match we move them to the next entry in the HT
 	for (idx_t i = 0; i < no_match_count; i++) {
-		idx_t index = no_match_vector.get_index(i);
+		idx_t index = state.no_match_vector.get_index(i);
 		ht_offsets_ptr[index]++;
 		if (ht_offsets_ptr[index] >= capacity) {
 			ht_offsets_ptr[index] = 0;
 		}
 	}
-	sel_vector = &no_match_vector;
+	sel_vector = &state.no_match_vector;
 	remaining_entries = no_match_count;
 }
@@ -469,29 +486,30 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 
 // this is to support distinct aggregations where we need to record whether we
 // have already seen a value for a group
-idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
+idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                    Vector &group_hashes, Vector &addresses_out,
                                                     SelectionVector &new_groups_out) {
 	switch (entry_type) {
 	case HtEntryType::HT_WIDTH_64:
-		return FindOrCreateGroupsInternal<aggr_ht_entry_64>(groups, group_hashes, addresses_out, new_groups_out);
+		return FindOrCreateGroupsInternal<aggr_ht_entry_64>(state, groups, group_hashes, addresses_out, new_groups_out);
 	case HtEntryType::HT_WIDTH_32:
-		return FindOrCreateGroupsInternal<aggr_ht_entry_32>(groups, group_hashes, addresses_out, new_groups_out);
+		return FindOrCreateGroupsInternal<aggr_ht_entry_32>(state, groups, group_hashes, addresses_out, new_groups_out);
 	default:
 		throw InternalException("Unknown HT entry width");
 	}
 }
 
-void GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses) {
+void GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                   Vector &addresses) {
 	// create a dummy new_groups sel vector
-	SelectionVector new_groups(STANDARD_VECTOR_SIZE);
-	FindOrCreateGroups(groups, addresses, new_groups);
+	FindOrCreateGroups(state, groups, addresses, state.new_groups);
 }
 
-idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses_out,
-                                                    SelectionVector &new_groups_out) {
+idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                    Vector &addresses_out, SelectionVector &new_groups_out) {
 	Vector hashes(LogicalType::HASH);
 	groups.Hash(hashes);
-	return FindOrCreateGroups(groups, hashes, addresses_out, new_groups_out);
+	return FindOrCreateGroups(state, groups, hashes, addresses_out, new_groups_out);
 }
 
 struct FlushMoveState {
@@ -521,7 +539,8 @@ void GroupedAggregateHashTable::FlushMove(FlushMoveState &state, Vector &source_
 		                       *FlatVector::IncrementalSelectionVector(), count, layout, col_no);
 	}
 
-	FindOrCreateGroups(state.groups, source_hashes, state.group_addresses, state.new_groups_sel);
+	AggregateHTAppendState append_state;
+	FindOrCreateGroups(append_state, state.groups, source_hashes, state.group_addresses, state.new_groups_sel);
 
 	RowOperations::CombineStates(layout, source_addresses, state.group_addresses, count);
 }
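The common thread of the hashtable hunks above: every scratch buffer the append path previously allocated per call, or parked on the hash table itself, now lives in a caller-owned AggregateHTAppendState, so one state object can be reused across many AddChunk calls. A sketch of the resulting calling pattern using the signatures from this diff; NextChunk, ht, and aggregate_filter are hypothetical stand-ins for the caller's own plumbing:

// One append state per producer, reused across every AddChunk call, so the
// selection vectors, offset buffers, and group_chunk inside it are allocated once.
duckdb::AggregateHTAppendState append_state;
duckdb::DataChunk groups, payload;
while (NextChunk(groups, payload)) { // hypothetical chunk source
	ht.AddChunk(append_state, groups, payload, aggregate_filter);
}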
package/src/duckdb/src/execution/join_hashtable.cpp
CHANGED
@@ -219,7 +219,9 @@ void JoinHashTable::Build(DataChunk &keys, DataChunk &payload) {
 	}
 	info.correlated_payload.SetCardinality(keys);
 	info.correlated_payload.data[0].Reference(keys.data[info.correlated_types.size()]);
-	info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
+	AggregateHTAppendState append_state;
+	info.correlated_counts->AddChunk(append_state, info.group_chunk, info.correlated_payload,
+	                                 AggregateType::NON_DISTINCT);
 }
 
 // prepare the keys for processing
package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp
CHANGED
@@ -42,6 +42,7 @@ public:
 	bool initialized = false;
 	bool finished_scan = false;
 	SelectionVector new_groups;
+	AggregateHTAppendState append_state;
 };
 
 unique_ptr<GlobalSinkState> PhysicalRecursiveCTE::GetGlobalSinkState(ClientContext &context) const {
@@ -52,7 +53,7 @@ idx_t PhysicalRecursiveCTE::ProbeHT(DataChunk &chunk, RecursiveCTEState &state)
 	Vector dummy_addresses(LogicalType::POINTER);
 
 	// Use the HT to eliminate duplicate rows
-	idx_t new_group_count = state.ht->FindOrCreateGroups(chunk, dummy_addresses, state.new_groups);
+	idx_t new_group_count = state.ht->FindOrCreateGroups(state.append_state, chunk, dummy_addresses, state.new_groups);
 
 	// we only return entries we have not seen before (i.e. new groups)
 	chunk.Slice(state.new_groups, new_group_count);
package/src/duckdb/src/execution/partitionable_hashtable.cpp
CHANGED
@@ -80,15 +80,17 @@ idx_t PartitionableHashTable::ListAddChunk(HashTableList &list, DataChunk &group
                                            DataChunk &payload, const vector<idx_t> &filter) {
 	// If this is false, a single AddChunk would overflow the max capacity
 	D_ASSERT(list.empty() || groups.size() <= list.back()->MaxCapacity());
-	if (list.empty() || list.back()->Size() + groups.size() > list.back()->MaxCapacity()) {
+	if (list.empty() || list.back()->Size() + groups.size() >= list.back()->MaxCapacity()) {
+		idx_t new_capacity = GroupedAggregateHashTable::InitialCapacity();
 		if (!list.empty()) {
+			new_capacity = list.back()->Capacity();
 			// early release first part of ht and prevent adding of more data
 			list.back()->Finalize();
 		}
 		list.push_back(make_unique<GroupedAggregateHashTable>(context, allocator, group_types, payload_types, bindings,
-		                                                      GetHTEntrySize()));
+		                                                      GetHTEntrySize(), new_capacity));
 	}
-	return list.back()->AddChunk(groups, group_hashes, payload, filter);
+	return list.back()->AddChunk(append_state, groups, group_hashes, payload, filter);
 }
 
 idx_t PartitionableHashTable::AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition,
@@ -150,6 +152,7 @@ void PartitionableHashTable::Partition() {
 	D_ASSERT(partition_info.n_partitions > 1);
 
 	vector<GroupedAggregateHashTable *> partition_hts(partition_info.n_partitions);
+	radix_partitioned_hts.resize(partition_info.n_partitions);
 	for (auto &unpartitioned_ht : unpartitioned_hts) {
 		for (idx_t r = 0; r < partition_info.n_partitions; r++) {
 			radix_partitioned_hts[r].push_back(make_unique<GroupedAggregateHashTable>(
@@ -181,7 +184,7 @@ HashTableList PartitionableHashTable::GetUnpartitioned() {
 void PartitionableHashTable::Finalize() {
 	if (IsPartitioned()) {
 		for (auto &ht_list : radix_partitioned_hts) {
-			for (auto &ht : ht_list.second) {
+			for (auto &ht : ht_list) {
 				D_ASSERT(ht);
 				ht->Finalize();
 			}
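Seeding each successor table with list.back()->Capacity() matters because these hash tables only grow by doubling; without the carried-over value, every overflow table would restart at InitialCapacity() and replay the same resize sequence. A small standalone illustration of the cost avoided (the concrete sizes are assumed for the example):

#include <cstdint>
#include <iostream>

// Number of doublings needed to grow from `start` to at least `target`.
static int DoublingsToReach(uint64_t start, uint64_t target) {
	int n = 0;
	while (start < target) {
		start *= 2;
		++n;
	}
	return n;
}

int main() {
	const uint64_t initial = 4096;          // assumed InitialCapacity()
	const uint64_t reached = 4096ULL << 10; // capacity the previous HT grew to
	std::cout << DoublingsToReach(initial, reached) << "\n"; // 10 resizes replayed
	std::cout << DoublingsToReach(reached, reached) << "\n"; // 0 with the new seeding
	return 0;
}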
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
CHANGED
@@ -78,6 +78,7 @@ public:
 	bool is_partitioned = false;
 
 	RadixPartitionInfo partition_info;
+	AggregateHTAppendState append_state;
 };
 
 class RadixHTLocalState : public LocalSinkState {
@@ -151,7 +152,8 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
 		}
 		D_ASSERT(gstate.finalized_hts.size() == 1);
 		D_ASSERT(gstate.finalized_hts[0]);
-		llstate.total_groups += gstate.finalized_hts[0]->AddChunk(group_chunk, payload_input, filter);
+		llstate.total_groups +=
+		    gstate.finalized_hts[0]->AddChunk(gstate.append_state, group_chunk, payload_input, filter);
 		return;
 	}
 
@@ -194,15 +196,13 @@ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkSta
 		llstate.ht->Partition();
 	}
 
-	lock_guard<mutex> glock(gstate.lock);
+	// we will never add new values to these HTs so we can drop the first part of the HT
+	llstate.ht->Finalize();
 
+	lock_guard<mutex> glock(gstate.lock);
 	if (!llstate.is_empty) {
 		gstate.is_empty = false;
 	}
-
-	// we will never add new values to these HTs so we can drop the first part of the HT
-	llstate.ht->Finalize();
-
 	// at this point we just collect them the PhysicalHashAggregateFinalizeTask (below) will merge them in parallel
 	gstate.intermediate_hts.push_back(std::move(llstate.ht));
 }
package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp
CHANGED
@@ -10,25 +10,29 @@
 namespace duckdb {
 
 struct SortedAggregateBindData : public FunctionData {
-	SortedAggregateBindData(ClientContext &context, const AggregateFunction &function_p,
-	                        const vector<unique_ptr<Expression>> &children, unique_ptr<FunctionData> bind_info_p,
-	                        const BoundOrderModifier &order_bys)
-	    : buffer_manager(BufferManager::GetBufferManager(context)), function(function_p),
-	      bind_info(std::move(bind_info_p)) {
+	SortedAggregateBindData(ClientContext &context, BoundAggregateExpression &expr)
+	    : buffer_manager(BufferManager::GetBufferManager(context)), function(expr.function),
+	      bind_info(std::move(expr.bind_info)) {
+		auto &children = expr.children;
 		arg_types.reserve(children.size());
 		for (const auto &child : children) {
 			arg_types.emplace_back(child->return_type);
 		}
+		auto &order_bys = *expr.order_bys;
 		sort_types.reserve(order_bys.orders.size());
 		for (auto &order : order_bys.orders) {
 			orders.emplace_back(order.Copy());
 			sort_types.emplace_back(order.expression->return_type);
 		}
+		sorted_on_args = (children.size() == order_bys.orders.size());
+		for (size_t i = 0; sorted_on_args && i < children.size(); ++i) {
+			sorted_on_args = children[i]->Equals(order_bys.orders[i].expression.get());
+		}
 	}
 
 	SortedAggregateBindData(const SortedAggregateBindData &other)
 	    : buffer_manager(other.buffer_manager), function(other.function), arg_types(other.arg_types),
-	      sort_types(other.sort_types) {
+	      sort_types(other.sort_types), sorted_on_args(other.sorted_on_args) {
 		if (other.bind_info) {
 			bind_info = other.bind_info->Copy();
 		}
@@ -71,13 +75,14 @@ struct SortedAggregateBindData : public FunctionData {
 
 	vector<BoundOrderByNode> orders;
 	vector<LogicalType> sort_types;
+	bool sorted_on_args;
 };
 
 struct SortedAggregateState {
 	//! Default buffer size, optimised for small group to avoid blowing out memory.
 	static const idx_t BUFFER_CAPACITY = 16;
 
-	SortedAggregateState() : nsel(0) {
+	SortedAggregateState() : nsel(0), offset(0) {
 	}
 
 	static inline void InitializeBuffer(DataChunk &chunk, const vector<LogicalType> &types) {
@@ -103,23 +108,31 @@ struct SortedAggregateState {
 		ordering->Append(sort_buffer);
 		ResetBuffer(sort_buffer, order_bind.sort_types);
 
-		arguments = make_unique<ColumnDataCollection>(order_bind.buffer_manager, order_bind.arg_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
-		arguments->Append(arg_buffer);
-		ResetBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			arguments = make_unique<ColumnDataCollection>(order_bind.buffer_manager, order_bind.arg_types);
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+			arguments->Append(arg_buffer);
+			ResetBuffer(arg_buffer, order_bind.arg_types);
+		}
 	}
 
 	void Update(SortedAggregateBindData &order_bind, DataChunk &sort_chunk, DataChunk &arg_chunk) {
 		// Lazy instantiation of the buffer chunks
 		InitializeBuffer(sort_buffer, order_bind.sort_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+		}
 
 		if (sort_chunk.size() + sort_buffer.size() > STANDARD_VECTOR_SIZE) {
 			Flush(order_bind);
 		}
-		if (ordering) {
+		if (arguments) {
 			ordering->Append(sort_chunk);
 			arguments->Append(arg_chunk);
+		} else if (ordering) {
+			ordering->Append(sort_chunk);
+		} else if (order_bind.sorted_on_args) {
+			sort_buffer.Append(sort_chunk, true);
 		} else {
 			sort_buffer.Append(sort_chunk, true);
 			arg_buffer.Append(arg_chunk, true);
@@ -129,12 +142,14 @@ struct SortedAggregateState {
 	void UpdateSlice(SortedAggregateBindData &order_bind, DataChunk &sort_inputs, DataChunk &arg_inputs) {
 		// Lazy instantiation of the buffer chunks
 		InitializeBuffer(sort_buffer, order_bind.sort_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+		}
 
 		if (nsel + sort_buffer.size() > STANDARD_VECTOR_SIZE) {
 			Flush(order_bind);
 		}
-		if (ordering) {
+		if (arguments) {
 			sort_buffer.Reset();
 			sort_buffer.Slice(sort_inputs, sel, nsel);
 			ordering->Append(sort_buffer);
@@ -142,27 +157,38 @@ struct SortedAggregateState {
 			arg_buffer.Reset();
 			arg_buffer.Slice(arg_inputs, sel, nsel);
 			arguments->Append(arg_buffer);
+		} else if (ordering) {
+			sort_buffer.Reset();
+			sort_buffer.Slice(sort_inputs, sel, nsel);
+			ordering->Append(sort_buffer);
+		} else if (order_bind.sorted_on_args) {
+			sort_buffer.Append(sort_inputs, true, &sel, nsel);
 		} else {
 			sort_buffer.Append(sort_inputs, true, &sel, nsel);
 			arg_buffer.Append(arg_inputs, true, &sel, nsel);
 		}
 
 		nsel = 0;
+		offset = 0;
 	}
 
 	void Combine(SortedAggregateBindData &order_bind, SortedAggregateState &other) {
-		if (other.ordering) {
-			// Force CDC if the other has it
+		if (other.arguments) {
+			// Force CDC if the other has it
 			Flush(order_bind);
 			ordering->Combine(*other.ordering);
 			arguments->Combine(*other.arguments);
+		} else if (other.ordering) {
+			// Force CDC if the other has it
+			Flush(order_bind);
+			ordering->Combine(*other.ordering);
 		} else if (other.sort_buffer.size()) {
 			Update(order_bind, other.sort_buffer, other.arg_buffer);
 		}
 	}
 
-	void Finalize(LocalSortState &local_sort) {
-		if (ordering) {
+	void Finalize(SortedAggregateBindData &order_bind, LocalSortState &local_sort) {
+		if (arguments) {
 			ColumnDataScanState sort_state;
 			ordering->InitializeScan(sort_state);
 			ColumnDataScanState arg_state;
@@ -174,6 +200,15 @@ struct SortedAggregateState {
 			}
 			ordering->Reset();
 			arguments->Reset();
+		} else if (ordering) {
+			ColumnDataScanState sort_state;
+			ordering->InitializeScan(sort_state);
+			for (sort_buffer.Reset(); ordering->Scan(sort_state, sort_buffer); sort_buffer.Reset()) {
+				local_sort.SinkChunk(sort_buffer, sort_buffer);
+			}
+			ordering->Reset();
+		} else if (order_bind.sorted_on_args) {
+			local_sort.SinkChunk(sort_buffer, sort_buffer);
 		} else {
 			local_sort.SinkChunk(sort_buffer, arg_buffer);
 		}
@@ -188,6 +223,7 @@ struct SortedAggregateState {
 	// Selection for scattering
 	SelectionVector sel;
 	idx_t nsel;
+	idx_t offset;
 };
 
 struct SortedAggregateFunction {
@@ -205,11 +241,13 @@ struct SortedAggregateFunction {
 	                        DataChunk &arg_chunk, DataChunk &sort_chunk) {
 		idx_t col = 0;
 
-		arg_chunk.InitializeEmpty(order_bind->arg_types);
-		for (auto &dst : arg_chunk.data) {
-			dst.Reference(inputs[col++]);
+		if (!order_bind->sorted_on_args) {
+			arg_chunk.InitializeEmpty(order_bind->arg_types);
+			for (auto &dst : arg_chunk.data) {
+				dst.Reference(inputs[col++]);
+			}
+			arg_chunk.SetCardinality(count);
 		}
-		arg_chunk.SetCardinality(count);
 
 		sort_chunk.InitializeEmpty(order_bind->sort_types);
 		for (auto &dst : sort_chunk.data) {
@@ -246,15 +284,27 @@ struct SortedAggregateFunction {
 		UnifiedVectorFormat svdata;
 		states.ToUnifiedFormat(count, svdata);
 
-		// Build the selection vector for each state.
+		// Size the selection vector for each state.
 		auto sdata = (SortedAggregateState **)svdata.data;
 		for (idx_t i = 0; i < count; ++i) {
 			auto sidx = svdata.sel->get_index(i);
 			auto order_state = sdata[sidx];
-			if (!order_state->sel.data()) {
-				order_state->sel.Initialize();
+			order_state->nsel++;
+		}
+
+		// Build the selection vector for each state.
+		vector<sel_t> sel_data(count);
+		idx_t start = 0;
+		for (idx_t i = 0; i < count; ++i) {
+			auto sidx = svdata.sel->get_index(i);
+			auto order_state = sdata[sidx];
+			if (!order_state->offset) {
+				// First one
+				order_state->offset = start;
+				order_state->sel.Initialize(sel_data.data() + order_state->offset);
+				start += order_state->nsel;
 			}
-			order_state->sel.set_index(order_state->nsel++, sidx);
+			sel_data[order_state->offset++] = sidx;
 		}
 
 		// Append nonempty slices to the arguments
@@ -317,7 +367,7 @@ struct SortedAggregateFunction {
 		auto global_sort = make_unique<GlobalSortState>(buffer_manager, orders, payload_layout);
 		LocalSortState local_sort;
 		local_sort.Initialize(*global_sort, global_sort->buffer_manager);
-		state->Finalize(local_sort);
+		state->Finalize(*order_bind, local_sort);
 		global_sort->AddLocalState(local_sort);
 
 		if (!global_sort->sorted_blocks.empty()) {
@@ -399,12 +449,13 @@ void FunctionBinder::BindSortedAggregate(ClientContext &context, BoundAggregateE
 	auto &bound_function = expr.function;
 	auto &children = expr.children;
 	auto &order_bys = *expr.order_bys;
-	auto sorted_bind = make_unique<SortedAggregateBindData>(context, bound_function, children,
-	                                                        std::move(expr.bind_info), order_bys);
+	auto sorted_bind = make_unique<SortedAggregateBindData>(context, expr);
 
-	// The arguments are the children plus the sort columns.
-	for (auto &order : order_bys.orders) {
-		children.emplace_back(std::move(order.expression));
+	if (!sorted_bind->sorted_on_args) {
+		// The arguments are the children plus the sort columns.
+		for (auto &order : order_bys.orders) {
+			children.emplace_back(std::move(order.expression));
+		}
 	}
 
 	vector<LogicalType> arguments;
package/src/duckdb/src/function/pragma/pragma_queries.cpp
CHANGED
@@ -1,8 +1,11 @@
+#include "duckdb/catalog/catalog_search_path.hpp"
 #include "duckdb/common/constants.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/string_util.hpp"
 #include "duckdb/function/pragma/pragma_functions.hpp"
 #include "duckdb/main/config.hpp"
+#include "duckdb/main/database_manager.hpp"
+#include "duckdb/main/client_data.hpp"
 #include "duckdb/parser/parser.hpp"
 #include "duckdb/parser/qualified_name.hpp"
 #include "duckdb/parser/statement/copy_statement.hpp"
@@ -15,7 +18,27 @@ string PragmaTableInfo(ClientContext &context, const FunctionParameters &paramet
 }
 
 string PragmaShowTables(ClientContext &context, const FunctionParameters &parameters) {
-	return "SELECT name FROM sqlite_master ORDER BY name";
+	auto catalog = DatabaseManager::GetDefaultDatabase(context);
+	auto schema = ClientData::Get(context).catalog_search_path->GetDefault().schema;
+	schema = (schema == INVALID_SCHEMA) ? DEFAULT_SCHEMA : schema; // NOLINT
+
+	auto where_clause =
+	    StringUtil::Join({"where database_name = '", catalog, "' and schema_name = '", schema, "'"}, "");
+	// clang-format off
+	auto pragma_query = StringUtil::Join(
+	    {"with tables as (",
+	     "   SELECT table_name as name FROM duckdb_tables ", where_clause,
+	     "), views as (",
+	     "   SELECT view_name as name FROM duckdb_views ", where_clause,
+	     "), indexes as (",
+	     "   SELECT index_name as name FROM duckdb_indexes ", where_clause,
+	     "), db_objects as (",
+	     "   SELECT name FROM tables UNION ALL SELECT name FROM views UNION ALL SELECT name FROM indexes",
+	     ") SELECT name FROM db_objects ORDER BY name;"
+	    }, "");
+	// clang-format on
+
+	return pragma_query;
 }
 
 string PragmaShowTablesExpanded(ClientContext &context, const FunctionParameters &parameters) {
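Concatenating the fragments above yields one query over the duckdb_tables, duckdb_views, and duckdb_indexes system views, scoped to the default catalog and schema. For reference, the assembled string, reflowed across lines for readability and with illustrative memory/main values substituted for the runtime lookups:

// Assembled form of the pragma query built above ("memory" and "main" stand in
// for DatabaseManager::GetDefaultDatabase and the search-path default schema).
static const char *kShowTablesQuery = R"SQL(
with tables as (
   SELECT table_name as name FROM duckdb_tables where database_name = 'memory' and schema_name = 'main'
), views as (
   SELECT view_name as name FROM duckdb_views where database_name = 'memory' and schema_name = 'main'
), indexes as (
   SELECT index_name as name FROM duckdb_indexes where database_name = 'memory' and schema_name = 'main'
), db_objects as (
   SELECT name FROM tables UNION ALL SELECT name FROM views UNION ALL SELECT name FROM indexes
) SELECT name FROM db_objects ORDER BY name;
)SQL";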
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev1188"
+#define DUCKDB_VERSION "0.7.2-dev1238"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "4be6bdb565"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp
CHANGED
@@ -62,6 +62,20 @@ struct AggregateHTScanState {
 	idx_t scan_position = 0;
 };
 
+struct AggregateHTAppendState {
+	AggregateHTAppendState();
+
+	Vector ht_offsets;
+	Vector hash_salts;
+	SelectionVector group_compare_vector;
+	SelectionVector no_match_vector;
+	SelectionVector empty_vector;
+	SelectionVector new_groups;
+	Vector addresses;
+	unique_ptr<UnifiedVectorFormat[]> group_data;
+	DataChunk group_chunk;
+};
+
 class GroupedAggregateHashTable : public BaseAggregateHashTable {
 public:
 	//! The hash table load factor, when a resize is triggered
@@ -71,10 +85,12 @@ public:
 public:
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
 	                          vector<LogicalType> payload_types, const vector<BoundAggregateExpression *> &aggregates,
-	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64);
+	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
+	                          idx_t initial_capacity = InitialCapacity());
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
 	                          vector<LogicalType> payload_types, vector<AggregateObject> aggregates,
-	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64);
+	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
+	                          idx_t initial_capacity = InitialCapacity());
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types);
 	~GroupedAggregateHashTable() override;
 
@@ -85,9 +101,10 @@ public:
 	//! Add the given data to the HT, computing the aggregates grouped by the
 	//! data in the group chunk. When resize = true, aggregates will not be
 	//! computed but instead just assigned.
-	idx_t AddChunk(DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter);
-	idx_t AddChunk(DataChunk &groups, Vector &group_hashes, DataChunk &payload,
-	               const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes, DataChunk &payload,
+	               const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload, AggregateType filter);
 
 	//! Scan the HT starting from the scan_position until the result and group
 	//! chunks are filled. scan_position will be updated by this function.
@@ -100,18 +117,24 @@ public:
 	//! Finds or creates groups in the hashtable using the specified group keys. The addresses vector will be filled
 	//! with pointers to the groups in the hash table, and the new_groups selection vector will point to the newly
 	//! created groups. The return value is the amount of newly created groups.
-	idx_t FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
+	idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+	                         Vector &addresses_out, SelectionVector &new_groups_out);
+	idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out,
 	                         SelectionVector &new_groups_out);
-	idx_t FindOrCreateGroups(DataChunk &groups, Vector &addresses_out, SelectionVector &new_groups_out);
-	void FindOrCreateGroups(DataChunk &groups, Vector &addresses_out);
+	void FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out);
 
 	//! Executes the filter(if any) and update the aggregates
 	void Combine(GroupedAggregateHashTable &other);
 
+	static idx_t InitialCapacity();
 	idx_t Size() {
 		return entries;
 	}
+	idx_t Capacity() {
+		return capacity;
+	}
 
+	idx_t ResizeThreshold();
 	idx_t MaxCapacity();
 	static idx_t GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size);
 
@@ -138,8 +161,7 @@ private:
 	//! The hashes of the HT
	BufferHandle hashes_hdl;
 	data_ptr_t hashes_hdl_ptr;
-	data_ptr_t hashes_end_ptr;
-	idx_t hash_offset; // Offset into the layout of the hash column
+	idx_t hash_offset; // Offset into the layout of the hash column
 
 	hash_t hash_prefix_shift;
 	idx_t payload_page_offset;
@@ -147,16 +169,8 @@ private:
 	//! Bitmask for getting relevant bits from the hashes to determine the position
 	hash_t bitmask;
 
-	vector<unique_ptr<GroupedAggregateHashTable>> distinct_hashes;
-
 	bool is_finalized;
 
-	// some stuff from FindOrCreateGroupsInternal() to avoid allocation there
-	Vector ht_offsets;
-	Vector hash_salts;
-	SelectionVector group_compare_vector;
-	SelectionVector no_match_vector;
-	SelectionVector empty_vector;
 	vector<ExpressionType> predicates;
 
 private:
@@ -176,8 +190,8 @@ private:
 	template <class ENTRY>
 	void Resize(idx_t size);
 	template <class ENTRY>
-	idx_t FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes, Vector &addresses,
-	                                 SelectionVector &new_groups);
+	idx_t FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+	                                 Vector &addresses, SelectionVector &new_groups);
 
 	template <class FUNC = std::function<void(idx_t, idx_t, data_ptr_t)>>
 	void PayloadApply(FUNC fun);
package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp
CHANGED
@@ -54,9 +54,10 @@ private:
 	vector<idx_t> sel_vector_sizes;
 	DataChunk group_subset, payload_subset;
 	Vector hashes, hashes_subset;
+	AggregateHTAppendState append_state;
 
 	HashTableList unpartitioned_hts;
-	unordered_map<hash_t, HashTableList> radix_partitioned_hts;
+	vector<HashTableList> radix_partitioned_hts;
 	idx_t tuple_size;
 
 private:
|