duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/binding.gyp +0 -1
  2. package/binding.gyp.in +0 -1
  3. package/package.json +1 -1
  4. package/src/connection.cpp +10 -23
  5. package/src/data_chunk.cpp +1 -3
  6. package/src/database.cpp +4 -9
  7. package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
  8. package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
  9. package/src/duckdb/extension/json/json_functions.cpp +4 -6
  10. package/src/duckdb/src/common/enum_util.cpp +10 -5
  11. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  12. package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
  13. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
  14. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
  15. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
  16. package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
  17. package/src/duckdb/src/core_functions/function_list.cpp +1 -1
  18. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
  19. package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
  20. package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
  21. package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
  22. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
  23. package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
  24. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
  25. package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
  26. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
  27. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
  28. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
  30. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
  32. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
  33. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
  34. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
  35. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
  36. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
  37. package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
  38. package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
  39. package/src/duckdb/src/function/function_binder.cpp +10 -9
  40. package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
  41. package/src/duckdb/src/function/table/read_csv.cpp +12 -9
  42. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  43. package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
  44. package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
  45. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
  46. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
  47. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
  48. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
  49. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
  50. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
  51. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
  52. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
  54. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
  55. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
  57. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
  58. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
  59. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
  60. package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
  61. package/src/duckdb/src/main/config.cpp +1 -1
  62. package/src/duckdb/src/main/query_result.cpp +16 -10
  63. package/src/duckdb/src/main/relation.cpp +10 -0
  64. package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
  65. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
  66. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
  67. package/src/duckdb/src/storage/data_table.cpp +10 -0
  68. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
  69. package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
  70. package/src/statement.cpp +2 -4
  71. package/test/database_fail.test.ts +6 -0
  72. package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -46,7 +46,8 @@ struct Parse {
                 validity_mask.SetInvalid(machine.cur_rows);
             }
         }
-        if (machine.state == CSVState::STANDARD) {
+        if (machine.state == CSVState::STANDARD ||
+            (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
             machine.value += current_char;
         }
         machine.cur_rows +=
@@ -57,7 +58,7 @@ struct Parse {
         machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
         machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);

-        if (machine.cur_rows >= machine.options.sample_chunk_size) {
+        if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
             // We sniffed enough rows
             return true;
         }
@@ -65,11 +66,22 @@ struct Parse {
     }

     inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
-        if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+        if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
             machine.VerifyUTF8();
             auto &v = parse_chunk.data[machine.column_count++];
             auto parse_data = FlatVector::GetData<string_t>(v);
-            parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+            if (machine.value.empty()) {
+                auto &validity_mask = FlatVector::Validity(v);
+                validity_mask.SetInvalid(machine.cur_rows);
+            } else {
+                parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+            }
+            while (machine.column_count < parse_chunk.ColumnCount()) {
+                auto &v_pad = parse_chunk.data[machine.column_count++];
+                auto &validity_mask = FlatVector::Validity(v_pad);
+                validity_mask.SetInvalid(machine.cur_rows);
+            }
+            machine.cur_rows++;
         }
         parse_chunk.SetCardinality(machine.cur_rows);
     }
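The Finalize change handles a truncated last row in two ways: an empty trailing value becomes NULL rather than an empty string, and columns the row never reached are padded with NULLs so the chunk stays rectangular. A standalone sketch of that behavior (stand-in types, not the actual DuckDB code):

    #include <cstddef>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    using Row = std::vector<std::optional<std::string>>; // nullopt models a NULL cell

    Row FinalizeRow(std::string last_value, std::size_t filled_columns, std::size_t total_columns) {
        Row row(total_columns); // every cell starts as NULL, like the padding loop above
        if (filled_columns == 0 || filled_columns > total_columns) {
            return row; // nothing parsed on this row
        }
        for (std::size_t i = 0; i + 1 < filled_columns; i++) {
            row[i] = "..."; // values parsed earlier in the row (elided in this sketch)
        }
        if (!last_value.empty()) { // empty value stays NULL, mirroring SetInvalid
            row[filled_columns - 1] = std::move(last_value);
        }
        return row;
    }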
@@ -104,8 +116,8 @@ void CSVSniffer::RefineTypes() {
         return;
     }
     DataChunk parse_chunk;
-    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, options.sample_chunk_size);
-    for (idx_t i = 1; i < best_candidate->options.sample_chunks; i++) {
+    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, STANDARD_VECTOR_SIZE);
+    for (idx_t i = 1; i < best_candidate->options.sample_size_chunks; i++) {
         bool finished_file = best_candidate->csv_buffer_iterator.Finished();
         if (finished_file) {
             // we finished the file: stop
@@ -124,6 +136,7 @@ void CSVSniffer::RefineTypes() {
         best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
         for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
             vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
+            bool is_bool_type = col_type_candidates.back() == LogicalType::BOOLEAN;
             while (col_type_candidates.size() > 1) {
                 const auto &sql_type = col_type_candidates.back();
                 // narrow down the date formats
@@ -154,6 +167,14 @@ void CSVSniffer::RefineTypes() {
                 if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
                     break;
                 } else {
+                    if (col_type_candidates.back() == LogicalType::BOOLEAN && is_bool_type) {
+                        // If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
+                        // immediately pop to varchar.
+                        while (col_type_candidates.back() != LogicalType::VARCHAR) {
+                            col_type_candidates.pop_back();
+                        }
+                        break;
+                    }
                    col_type_candidates.pop_back();
                 }
             }
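A standalone sketch of the narrowing strategy (an assumed simplification, using strings in place of LogicalType): candidates sit on a stack with the most general type (VARCHAR) at the bottom, and a failed BOOLEAN guess skips the numeric candidates and falls straight through to VARCHAR:

    #include <string>
    #include <vector>

    void NarrowCandidates(std::vector<std::string> &candidates, bool cast_failed, bool was_bool) {
        if (!cast_failed) {
            return; // the current candidate still fits all sampled values
        }
        if (candidates.back() == "BOOLEAN" && was_bool) {
            // e.g. a column of T/F values followed by "TX": no numeric candidate
            // can apply either, so pop down to VARCHAR in one step
            while (candidates.back() != "VARCHAR") {
                candidates.pop_back();
            }
            return;
        }
        candidates.pop_back(); // otherwise try the next, more general candidate
    }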
package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp
@@ -420,6 +420,8 @@ public:
     }

     DataChunk join_keys;
+    TupleDataChunkState join_key_state;
+
     ExpressionExecutor probe_executor;
     unique_ptr<JoinHashTable::ScanStructure> scan_structure;
     unique_ptr<OperatorState> perfect_hash_join_state;
@@ -446,6 +448,7 @@ unique_ptr<OperatorState> PhysicalHashJoin::GetOperatorState(ExecutionContext &c
         for (auto &cond : conditions) {
             state->probe_executor.AddExpression(*cond.left);
         }
+        TupleDataCollection::InitializeChunkState(state->join_key_state, condition_types);
     }
     if (sink.external) {
         state->spill_chunk.Initialize(allocator, sink.probe_types);
@@ -502,10 +505,10 @@ OperatorResultType PhysicalHashJoin::ExecuteInternal(ExecutionContext &context,

     // perform the actual probe
     if (sink.external) {
-        state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, input, *sink.probe_spill,
-                                                              state.spill_state, state.spill_chunk);
+        state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, state.join_key_state, input,
+                                                              *sink.probe_spill, state.spill_state, state.spill_chunk);
     } else {
-        state.scan_structure = sink.hash_table->Probe(state.join_keys);
+        state.scan_structure = sink.hash_table->Probe(state.join_keys, state.join_key_state);
     }
     state.scan_structure->Next(state.join_keys, input, chunk);
     return OperatorResultType::HAVE_MORE_OUTPUT;
@@ -605,6 +608,7 @@ public:
     DataChunk probe_chunk;
     DataChunk join_keys;
     DataChunk payload;
+    TupleDataChunkState join_key_state;
     //! Column indices to easily reference the join keys/payload columns in probe_chunk
     vector<idx_t> join_key_indices;
     vector<idx_t> payload_indices;
@@ -782,6 +786,7 @@ HashJoinLocalSourceState::HashJoinLocalSourceState(const PhysicalHashJoin &op, A
     probe_chunk.Initialize(allocator, sink.probe_types);
     join_keys.Initialize(allocator, op.condition_types);
     payload.Initialize(allocator, op.children[0]->types);
+    TupleDataCollection::InitializeChunkState(join_key_state, op.condition_types);

     // Store the indices of the columns to reference them easily
     idx_t col_idx = 0;
@@ -871,7 +876,7 @@ void HashJoinLocalSourceState::ExternalProbe(HashJoinGlobalSinkState &sink, Hash
     }

     // Perform the probe
-    scan_structure = sink.hash_table->Probe(join_keys, precomputed_hashes);
+    scan_structure = sink.hash_table->Probe(join_keys, join_key_state, precomputed_hashes);
     scan_structure->Next(join_keys, payload, chunk);
 }

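These probe hunks thread a caller-owned TupleDataChunkState through Probe and ProbeAndSpill, so per-probe scratch lives in the operator state and is initialized once rather than rebuilt for every input chunk. A generic sketch of that pattern (hypothetical names, not DuckDB's API):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct ProbeState {
        std::vector<uint64_t> hashes;            // reused across calls
        std::vector<const void *> row_locations; // grows once, then stays
    };

    struct HashTable {
        std::size_t Probe(const std::vector<int64_t> &keys, ProbeState &state) {
            // resize is a no-op after the first full-size chunk
            state.hashes.resize(keys.size());
            state.row_locations.resize(keys.size());
            // ... hash the keys and collect candidate row locations ...
            return 0; // number of candidate matches (elided)
        }
    };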
package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp
@@ -254,7 +254,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
     }

     bool has_equality = false;
-    // bool has_inequality = false;
     size_t has_range = 0;
     for (size_t c = 0; c < op.conditions.size(); ++c) {
         auto &cond = op.conditions[c];
@@ -271,7 +270,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
             break;
         case ExpressionType::COMPARE_NOTEQUAL:
         case ExpressionType::COMPARE_DISTINCT_FROM:
-            // has_inequality = true;
             break;
         default:
             throw NotImplementedException("Unimplemented comparison join");
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
@@ -474,14 +474,9 @@ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstat
 //===--------------------------------------------------------------------===//
 // Source
 //===--------------------------------------------------------------------===//
-idx_t RadixPartitionedHashTable::Count(GlobalSinkState &sink_p) const {
-    const auto count = CountInternal(sink_p);
-    return count == 0 && grouping_set.empty() ? 1 : count;
-}
-
-idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
+idx_t RadixPartitionedHashTable::NumberOfPartitions(GlobalSinkState &sink_p) const {
     auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
-    return sink.count_before_combining;
+    return sink.partitions.size();
 }

 void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
@@ -570,8 +565,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
     D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);

     const auto n_partitions = sink.partitions.size();
-    if (scan_done == n_partitions) {
-        finished = true;
+    if (finished) {
         return false;
     }
     // We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
@@ -595,6 +589,11 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
         return true;
     }

+    // We didn't assign a Scan task
+    if (sink.finalize_idx >= n_partitions) {
+        return false; // No finalize tasks left
+    }
+
     // We can just increment the atomic here, much simpler than assigning the scan task
     lstate.task_idx = sink.finalize_idx++;
     if (lstate.task_idx < n_partitions) {
@@ -603,7 +602,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
         return true;
     }

-    // We didn't manage to assign a finalize task
+    // We didn't manage to assign a Finalize task
     return false;
 }

@@ -693,15 +692,18 @@ void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSo

     if (!data_collection.Scan(scan_state, scan_chunk)) {
         scan_status = RadixHTScanStatus::DONE;
-        if (++gstate.scan_done == sink.partitions.size()) {
-            gstate.finished = true;
-        }
         if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
             data_collection.Reset();
         }
         return;
     }

+    if (data_collection.ScanComplete(scan_state)) {
+        if (++gstate.scan_done == sink.partitions.size()) {
+            gstate.finished = true;
+        }
+    }
+
     RowOperationsState row_state(aggregate_allocator);
     const auto group_cols = layout.ColumnCount() - 1;
     RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
@@ -758,36 +760,38 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
         return SourceResultType::FINISHED;
     }

-    // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
-    if (CountInternal(sink_p) == 0 && grouping_set.empty()) {
-        D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
-        // For each column in the aggregates, set to initial state
-        chunk.SetCardinality(1);
-        for (auto null_group : null_groups) {
-            chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
-            ConstantVector::SetNull(chunk.data[null_group], true);
-        }
-        ArenaAllocator allocator(BufferAllocator::Get(context.client));
-        for (idx_t i = 0; i < op.aggregates.size(); i++) {
-            D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
-            auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
-            auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
-            aggr.function.initialize(aggr_state.get());
-
-            AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
-            Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
-            aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
-            if (aggr.function.destructor) {
-                aggr.function.destructor(state_vector, aggr_input_data, 1);
+    if (sink.count_before_combining == 0) {
+        if (grouping_set.empty()) {
+            // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+            D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
+            // For each column in the aggregates, set to initial state
+            chunk.SetCardinality(1);
+            for (auto null_group : null_groups) {
+                chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+                ConstantVector::SetNull(chunk.data[null_group], true);
+            }
+            ArenaAllocator allocator(BufferAllocator::Get(context.client));
+            for (idx_t i = 0; i < op.aggregates.size(); i++) {
+                D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
+                auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
+                auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
+                aggr.function.initialize(aggr_state.get());
+
+                AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
+                Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
+                aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
+                if (aggr.function.destructor) {
+                    aggr.function.destructor(state_vector, aggr_input_data, 1);
+                }
+            }
+            // Place the grouping values (all the groups of the grouping_set condensed into a single value)
+            // Behind the null groups + aggregates
+            for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
+                chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
             }
-        }
-        // Place the grouping values (all the groups of the grouping_set condensed into a single value)
-        // Behind the null groups + aggregates
-        for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-            chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
         }
         gstate.finished = true;
-        return SourceResultType::HAVE_MORE_OUTPUT;
+        return SourceResultType::FINISHED;
     }

     while (!gstate.finished && chunk.size() == 0) {
@@ -796,7 +800,11 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
         }
     }

-    return SourceResultType::HAVE_MORE_OUTPUT;
+    if (chunk.size() != 0) {
+        return SourceResultType::HAVE_MORE_OUTPUT;
+    } else {
+        return SourceResultType::FINISHED;
+    }
 }

 } // namespace duckdb
package/src/duckdb/src/execution/reservoir_sample.cpp
@@ -107,25 +107,19 @@ void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) {
         if (append_to_next_sample > 0) {
             // we need to also add to the next sample
             DataChunk new_chunk;
-            new_chunk.Initialize(allocator, input.GetTypes());
-            SelectionVector sel(append_to_current_sample_count);
-            for (idx_t r = 0; r < append_to_current_sample_count; r++) {
-                sel.set_index(r, r);
-            }
-            new_chunk.Slice(sel, append_to_current_sample_count);
+            new_chunk.InitializeEmpty(input.GetTypes());
+            new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count);
             new_chunk.Flatten();
-
             current_sample->AddToReservoir(new_chunk);
         } else {
             input.Flatten();
-
             input.SetCardinality(append_to_current_sample_count);
             current_sample->AddToReservoir(input);
         }
     }
     if (append_to_next_sample > 0) {
         // slice the input for the remainder
-        SelectionVector sel(STANDARD_VECTOR_SIZE);
+        SelectionVector sel(append_to_next_sample);
         for (idx_t i = 0; i < append_to_next_sample; i++) {
             sel.set_index(i, append_to_current_sample_count + i);
         }
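The rewrite appears to fix two things: the old path sliced the freshly initialized new_chunk instead of the input, and the remainder's SelectionVector was sized to STANDARD_VECTOR_SIZE rather than to the slice it addresses. The split itself as a standalone sketch (plain index vectors standing in for SelectionVector):

    #include <cstddef>
    #include <vector>

    struct Split {
        std::vector<std::size_t> current; // indices 0 .. current_count-1
        std::vector<std::size_t> next;    // indices current_count .. current_count+next_count-1
    };

    Split SplitRows(std::size_t current_count, std::size_t next_count) {
        Split s;
        s.current.reserve(current_count);
        for (std::size_t r = 0; r < current_count; r++) {
            s.current.push_back(r);
        }
        // sized exactly to the remainder, mirroring SelectionVector sel(append_to_next_sample)
        s.next.reserve(next_count);
        for (std::size_t i = 0; i < next_count; i++) {
            s.next.push_back(current_count + i);
        }
        return s;
    }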
package/src/duckdb/src/function/cast/vector_cast_helpers.cpp
@@ -20,10 +20,16 @@ inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
 static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
     char quote = buf[pos];
     pos++;
+    bool escaped = false;

     while (pos < len) {
-        if (buf[pos] == quote) {
-            return true;
+        if (buf[pos] == '\\') {
+            escaped = !escaped;
+        } else {
+            if (buf[pos] == quote && !escaped) {
+                return true;
+            }
+            escaped = false;
         }
         pos++;
     }
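The same escape handling as a self-contained program (simplified signature; the real function also takes the buffer length by reference): a backslash toggles the escaped flag, so an escaped quote or escaped backslash no longer terminates the quoted section early.

    #include <cassert>
    #include <cstddef>
    #include <string>

    static bool SkipToCloseQuotes(std::size_t &pos, const std::string &buf) {
        char quote = buf[pos++];
        bool escaped = false;
        while (pos < buf.size()) {
            if (buf[pos] == '\\') {
                escaped = !escaped; // two backslashes cancel out
            } else {
                if (buf[pos] == quote && !escaped) {
                    return true; // found the unescaped closing quote
                }
                escaped = false;
            }
            pos++;
        }
        return false; // unterminated quoted section
    }

    int main() {
        std::string s = R"("a\"b" rest)";
        std::size_t pos = 0;
        assert(SkipToCloseQuotes(pos, s) && pos == 5); // stops at the real quote, not the escaped one
        return 0;
    }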
package/src/duckdb/src/function/function_binder.cpp
@@ -1,16 +1,16 @@
 #include "duckdb/function/function_binder.hpp"
-#include "duckdb/common/limits.hpp"

-#include "duckdb/planner/expression/bound_cast_expression.hpp"
-#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
-#include "duckdb/planner/expression/bound_function_expression.hpp"
-#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/catalog/catalog.hpp"
 #include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
-
-#include "duckdb/planner/expression_binder.hpp"
+#include "duckdb/common/limits.hpp"
+#include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/function/aggregate_function.hpp"
 #include "duckdb/function/cast_rules.hpp"
-#include "duckdb/catalog/catalog.hpp"
+#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
+#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/planner/expression/bound_function_expression.hpp"
+#include "duckdb/planner/expression_binder.hpp"

 namespace duckdb {

@@ -268,7 +268,8 @@ unique_ptr<Expression> FunctionBinder::BindScalarFunction(ScalarFunctionCatalogE

     if (bound_function.null_handling == FunctionNullHandling::DEFAULT_NULL_HANDLING) {
         for (auto &child : children) {
-            if (child->return_type == LogicalTypeId::SQLNULL) {
+            if (child->return_type == LogicalTypeId::SQLNULL ||
+                (child->IsFoldable() && ExpressionExecutor::EvaluateScalar(context, *child).IsNull())) {
                 return make_uniq<BoundConstantExpression>(Value(LogicalType::SQLNULL));
             }
         }
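With this, a call under default NULL handling folds to a NULL constant when any argument is a foldable expression that evaluates to NULL, not only when it is statically typed as SQLNULL; that is presumably also why the like.cpp hunk below can drop its pattern IsNull early-return. A simplified standalone sketch (stand-in Expr type, not the binder's real classes):

    #include <memory>
    #include <optional>
    #include <vector>

    struct Expr {
        bool is_sql_null = false;        // statically typed as SQLNULL
        bool foldable = false;           // constant-foldable at bind time
        std::optional<int> folded_value; // nullopt models a NULL result
        std::optional<int> Evaluate() const {
            return folded_value;
        }
    };

    // Returns a NULL constant if the whole call folds away, nullptr otherwise
    std::unique_ptr<Expr> TryFoldToNull(const std::vector<Expr> &children) {
        for (const auto &child : children) {
            if (child.is_sql_null || (child.foldable && !child.Evaluate().has_value())) {
                auto null_const = std::make_unique<Expr>();
                null_const->is_sql_null = null_const->foldable = true;
                return null_const;
            }
        }
        return nullptr; // no fold; bind the function normally
    }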
package/src/duckdb/src/function/scalar/string/like.cpp
@@ -196,9 +196,6 @@ static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarF
     D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
     if (arguments[1]->IsFoldable()) {
         Value pattern_str = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
-        if (pattern_str.IsNull()) {
-            return nullptr;
-        }
         return LikeMatcher::CreateLikeMatcher(pattern_str.ToString());
     }
     return nullptr;
package/src/duckdb/src/function/table/read_csv.cpp
@@ -107,11 +107,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
     // Initialize Buffer Manager and Sniffer
     auto file_handle = BaseCSVReader::OpenCSV(context, options);
     result->buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
-    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache);
+    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache, explicitly_set_columns);
     auto sniffer_result = sniffer.SniffCSV();
-    return_types = sniffer_result.return_types;
     if (names.empty()) {
         names = sniffer_result.names;
+        return_types = sniffer_result.return_types;
     } else {
         if (explicitly_set_columns) {
             // The user has influenced the names, can't assume they are valid anymore
@@ -195,6 +195,7 @@ public:
     auto file_count = files_path_p.size();
     line_info.current_batches.resize(file_count);
     line_info.lines_read.resize(file_count);
+    line_info.lines_errored.resize(file_count);
     tuple_start.resize(file_count);
     tuple_end.resize(file_count);
     tuple_end_to_batch.resize(file_count);
@@ -509,6 +510,11 @@ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
     return false;
 }

+void LineInfo::Increment(idx_t file_idx, idx_t batch_idx) {
+    auto parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+    lines_errored[file_idx][batch_idx]++;
+}
+
 // Returns the 1-indexed line number
 idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify,
                         bool stop_at_first) {
@@ -520,12 +526,11 @@ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t

     if (!stop_at_first) {
         // Figure out the amount of lines read in the current file
-        auto &file_batches = current_batches[file_idx];
-        for (auto &batch : file_batches) {
-            if (batch > batch_idx) {
-                break;
+        for (idx_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
+            if (cur_batch_idx < batch_idx) {
+                line_count += lines_errored[file_idx][cur_batch_idx];
             }
-            line_count += lines_read[file_idx][batch];
+            line_count += lines_read[file_idx][cur_batch_idx];
         }
         return line_count + line_error + 1;
     }
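A standalone version of the computation above (an assumed simplification for a single file): lines read are summed per batch up to and including the error's batch, and lines that errored in earlier batches are now counted too, since they still occupied a line in the file.

    #include <cstddef>
    #include <vector>

    std::size_t GlobalLineNumber(const std::vector<std::size_t> &lines_read,
                                 const std::vector<std::size_t> &lines_errored,
                                 std::size_t batch_idx, std::size_t line_error) {
        std::size_t line_count = 0;
        for (std::size_t cur = 0; cur <= batch_idx; cur++) {
            if (cur < batch_idx) {
                line_count += lines_errored[cur]; // errored lines still count toward position
            }
            line_count += lines_read[cur];
        }
        return line_count + line_error + 1; // 1-indexed
    }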
@@ -880,8 +885,6 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
     table_function.named_parameters["header"] = LogicalType::BOOLEAN;
     table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
     table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunk_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunks"] = LogicalType::BIGINT;
     table_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
     table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4653"
+#define DUCKDB_VERSION "0.8.2-dev4871"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "bb287d4b22"
+#define DUCKDB_SOURCE_ID "5a29c99891"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp
@@ -25,7 +25,6 @@ enum class DatePartSpecifier : uint8_t {
     SECOND,
     MINUTE,
     HOUR,
-    EPOCH,
     DOW,
     ISODOW,
     WEEK,
@@ -39,11 +38,20 @@ enum class DatePartSpecifier : uint8_t {
     TIMEZONE_MINUTE,

     // DOUBLE values
-    JULIAN_DAY
+    EPOCH,
+    JULIAN_DAY,
+
+    // Invalid
+    INVALID,
+
+    // Type ranges
+    BEGIN_BIGINT = YEAR,
+    BEGIN_DOUBLE = EPOCH,
+    BEGIN_INVALID = INVALID,
 };

 inline bool IsBigintDatepart(DatePartSpecifier part_code) {
-    return size_t(part_code) < size_t(DatePartSpecifier::JULIAN_DAY);
+    return size_t(part_code) < size_t(DatePartSpecifier::BEGIN_DOUBLE);
 }

 DUCKDB_API bool TryGetDatePartSpecifier(const string &specifier, DatePartSpecifier &result);
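The range-sentinel pattern the reordered enum enables, in isolation: members are grouped by result type and BEGIN_* aliases mark where each group starts, so classification is a single comparison instead of a per-member switch.

    #include <cstdint>

    enum class PartKind : uint8_t {
        YEAR, MONTH, DAY,  // ... BIGINT results
        EPOCH, JULIAN_DAY, // ... DOUBLE results
        INVALID,           // sentinel for unrecognized specifiers
        BEGIN_BIGINT = YEAR,
        BEGIN_DOUBLE = EPOCH,
        BEGIN_INVALID = INVALID,
    };

    inline bool IsBigintPart(PartKind p) {
        return static_cast<uint8_t>(p) < static_cast<uint8_t>(PartKind::BEGIN_DOUBLE);
    }

    inline bool IsDoublePart(PartKind p) {
        return static_cast<uint8_t>(p) >= static_cast<uint8_t>(PartKind::BEGIN_DOUBLE) &&
               static_cast<uint8_t>(p) < static_cast<uint8_t>(PartKind::BEGIN_INVALID);
    }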
package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp (new file)
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/common/row_operations/row_matcher.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb/common/enums/expression_type.hpp"
+#include "duckdb/common/types.hpp"
+
+namespace duckdb {
+
+class Vector;
+class DataChunk;
+class TupleDataLayout;
+struct TupleDataVectorFormat;
+struct SelectionVector;
+struct MatchFunction;
+
+typedef idx_t (*match_function_t)(Vector &lhs_vector, const TupleDataVectorFormat &lhs_format, SelectionVector &sel,
+                                  const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                                  const idx_t col_idx, const vector<MatchFunction> &child_functions,
+                                  SelectionVector *no_match_sel, idx_t &no_match_count);
+
+struct MatchFunction {
+    match_function_t function;
+    vector<MatchFunction> child_functions;
+};
+
+struct RowMatcher {
+public:
+    using Predicates = vector<ExpressionType>;
+
+    //! Initializes the RowMatcher, filling match_functions using layout and predicates
+    void Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates);
+    //! Given a DataChunk on the LHS, on which we've called TupleDataCollection::ToUnifiedFormat,
+    //! we match it with rows on the RHS, according to the given layout and locations.
+    //! Initially, 'sel' has 'count' entries which point to what needs to be compared.
+    //! After matching is done, this returns how many matching entries there are, which 'sel' is modified to point to
+    idx_t Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel, idx_t count,
+                const TupleDataLayout &rhs_layout, Vector &rhs_row_locations, SelectionVector *no_match_sel,
+                idx_t &no_match_count);
+
+private:
+    //! Gets the templated match function for a given column
+    MatchFunction GetMatchFunction(const bool no_match_sel, const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetMatchFunction(const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL, class T>
+    MatchFunction GetMatchFunction(const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetStructMatchFunction(const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetListMatchFunction(const ExpressionType predicate);
+
+private:
+    vector<MatchFunction> match_functions;
+};
+
+} // namespace duckdb
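This header appears to replace the monolithic row_match.cpp removed at the end of the file list with per-column match functions resolved once from the layout and predicates. The dispatch shape, as a simplified sketch with stand-in types (not the real signatures):

    #include <cstddef>
    #include <vector>

    struct Column {}; // stand-in for a probe-side vector in unified format
    struct Rows {};   // stand-in for row-layout (RHS) data

    using match_fn = std::size_t (*)(const Column &lhs, const Rows &rhs,
                                     std::vector<std::size_t> &sel, std::size_t count);

    struct Matcher {
        std::vector<match_fn> fns; // one resolved function per column

        std::size_t Match(const std::vector<Column> &lhs, const Rows &rhs,
                          std::vector<std::size_t> &sel, std::size_t count) const {
            // each column function keeps only still-matching row indices in 'sel';
            // the surviving count feeds the next column's comparison
            for (std::size_t col = 0; col < fns.size() && count > 0; col++) {
                count = fns[col](lhs[col], rhs, sel, count);
            }
            return count;
        }
    };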
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp
@@ -21,7 +21,7 @@ struct RowOperationsState;

 typedef void (*tuple_data_scatter_function_t)(const Vector &source, const TupleDataVectorFormat &source_format,
                                               const SelectionVector &append_sel, const idx_t append_count,
-                                              const TupleDataLayout &layout, Vector &row_locations,
+                                              const TupleDataLayout &layout, const Vector &row_locations,
                                               Vector &heap_locations, const idx_t col_idx,
                                               const UnifiedVectorFormat &list_format,
                                               const vector<TupleDataScatterFunction> &child_functions);
@@ -84,7 +84,11 @@ public:
                           TupleDataPinProperties = TupleDataPinProperties::UNPIN_AFTER_DONE);
     //! Initializes the Chunk state of an Append state
     //! - Useful for optimizing many appends made to the same tuple data collection
-    void InitializeAppend(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+    void InitializeChunkState(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+    //! Initializes the Chunk state of an Append state
+    //! - Useful for optimizing many appends made to the same tuple data collection
+    static void InitializeChunkState(TupleDataChunkState &chunk_state, const vector<LogicalType> &types,
+                                     vector<column_t> column_ids = {});
     //! Append a DataChunk directly to this TupleDataCollection - calls InitializeAppend and Append internally
     void Append(DataChunk &new_chunk, const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
                 idx_t append_count = DConstants::INVALID_INDEX);
@@ -159,6 +163,8 @@ public:
     bool Scan(TupleDataScanState &state, DataChunk &result);
     //! Scans a DataChunk from the TupleDataCollection
     bool Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
+    //! Whether the last scan has been completed on this TupleDataCollection
+    bool ScanComplete(const TupleDataScanState &state) const;

     //! Gathers a DataChunk from the TupleDataCollection, given the specific row locations (requires full pin)
     void Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, DataChunk &result,
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp
@@ -42,8 +42,8 @@ struct TupleDataVectorFormat {
     const SelectionVector *original_sel;
     SelectionVector original_owned_sel;

-    UnifiedVectorFormat data;
-    vector<TupleDataVectorFormat> child_formats;
+    UnifiedVectorFormat unified;
+    vector<TupleDataVectorFormat> children;
     unique_ptr<CombinedListData> combined_list_data;
 };

package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp
@@ -148,6 +148,9 @@ public:
         if (!validity_mask) {
             return ValidityBuffer::MAX_ENTRY;
         }
+        return GetValidityEntryUnsafe(entry_idx);
+    }
+    inline V &GetValidityEntryUnsafe(idx_t entry_idx) const {
         return validity_mask[entry_idx];
     }
     static inline bool AllValid(V entry) {
@@ -156,7 +159,7 @@ public:
     static inline bool NoneValid(V entry) {
         return entry == 0;
     }
-    static inline bool RowIsValid(V entry, idx_t idx_in_entry) {
+    static inline bool RowIsValid(const V &entry, const idx_t &idx_in_entry) {
         return entry & (V(1) << V(idx_in_entry));
     }
     static inline void GetEntryIndex(idx_t row_idx, idx_t &entry_idx, idx_t &idx_in_entry) {
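For orientation, the bit layout RowIsValid tests, in isolation (a standalone simplification): a validity mask packs one bit per row into 64-bit entries, so row i lives at bit (i % 64) of entry (i / 64).

    #include <cassert>
    #include <cstdint>

    static inline bool RowIsValid(uint64_t entry, uint64_t idx_in_entry) {
        return entry & (uint64_t(1) << idx_in_entry);
    }

    int main() {
        uint64_t entry = ~uint64_t(0) & ~(uint64_t(1) << 3); // all rows valid except row 3
        assert(RowIsValid(entry, 2));
        assert(!RowIsValid(entry, 3));
        return 0;
    }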
package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp
@@ -285,7 +285,7 @@ struct RepeatFun {
     static constexpr const char *Description = "Repeats the string count number of times";
     static constexpr const char *Example = "repeat('A', 5)";

-    static ScalarFunction GetFunction();
+    static ScalarFunctionSet GetFunctions();
 };

 struct ReplaceFun {
package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp
@@ -8,6 +8,7 @@

 #pragma once

+#include "duckdb/common/row_operations/row_matcher.hpp"
 #include "duckdb/common/types/row/partitioned_tuple_data.hpp"
 #include "duckdb/execution/base_aggregate_hashtable.hpp"
 #include "duckdb/storage/arena_allocator.hpp"
@@ -143,6 +144,9 @@ public:
     void UnpinData();

 private:
+    //! Efficiently matches groups
+    RowMatcher row_matcher;
+
     //! Append state
     struct AggregateHTAppendState {
         AggregateHTAppendState();