duckdb 1.2.1-dev6.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/core_functions/aggregate/distributive/string_agg.cpp +14 -22
  3. package/src/duckdb/extension/core_functions/aggregate/nested/list.cpp +0 -1
  4. package/src/duckdb/extension/core_functions/lambda_functions.cpp +0 -11
  5. package/src/duckdb/extension/core_functions/scalar/list/list_aggregates.cpp +18 -6
  6. package/src/duckdb/extension/icu/icu-datefunc.cpp +9 -2
  7. package/src/duckdb/extension/icu/icu-strptime.cpp +7 -11
  8. package/src/duckdb/extension/icu/include/icu-datefunc.hpp +3 -1
  9. package/src/duckdb/extension/json/buffered_json_reader.cpp +18 -31
  10. package/src/duckdb/extension/json/json_extension.cpp +8 -3
  11. package/src/duckdb/extension/parquet/column_reader.cpp +4 -6
  12. package/src/duckdb/extension/parquet/column_writer.cpp +33 -12
  13. package/src/duckdb/extension/parquet/include/column_reader.hpp +0 -2
  14. package/src/duckdb/extension/parquet/include/parquet_bss_encoder.hpp +0 -1
  15. package/src/duckdb/extension/parquet/include/parquet_dlba_encoder.hpp +1 -2
  16. package/src/duckdb/src/catalog/catalog.cpp +12 -0
  17. package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +1 -1
  18. package/src/duckdb/src/catalog/catalog_entry_retriever.cpp +1 -1
  19. package/src/duckdb/src/catalog/catalog_search_path.cpp +8 -8
  20. package/src/duckdb/src/common/bind_helpers.cpp +3 -0
  21. package/src/duckdb/src/common/compressed_file_system.cpp +2 -0
  22. package/src/duckdb/src/common/hive_partitioning.cpp +1 -1
  23. package/src/duckdb/src/common/multi_file_reader.cpp +3 -3
  24. package/src/duckdb/src/execution/aggregate_hashtable.cpp +1 -1
  25. package/src/duckdb/src/execution/index/art/art.cpp +19 -6
  26. package/src/duckdb/src/execution/index/art/iterator.cpp +7 -3
  27. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +11 -4
  28. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +2 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +5 -1
  30. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +3 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +2 -2
  32. package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp +1 -1
  33. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +20 -12
  34. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +19 -22
  35. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
  36. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +1 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +16 -0
  38. package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp +1 -0
  39. package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp +16 -7
  40. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +3 -1
  41. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +11 -1
  42. package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +5 -7
  43. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +11 -0
  44. package/src/duckdb/src/execution/physical_plan/plan_sample.cpp +1 -3
  45. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +14 -5
  46. package/src/duckdb/src/execution/sample/reservoir_sample.cpp +24 -12
  47. package/src/duckdb/src/function/scalar/generic/getvariable.cpp +3 -3
  48. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  49. package/src/duckdb/src/function/window/window_aggregate_states.cpp +3 -0
  50. package/src/duckdb/src/function/window/window_boundaries_state.cpp +108 -48
  51. package/src/duckdb/src/function/window/window_constant_aggregator.cpp +5 -5
  52. package/src/duckdb/src/function/window/window_distinct_aggregator.cpp +6 -0
  53. package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
  54. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +10 -9
  55. package/src/duckdb/src/include/duckdb/common/adbc/adbc-init.hpp +1 -1
  56. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +2 -2
  57. package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +2 -0
  58. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +1 -1
  59. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +5 -4
  60. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp +1 -1
  61. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +2 -2
  62. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +2 -2
  65. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +3 -7
  66. package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +2 -1
  67. package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +11 -3
  68. package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +4 -0
  69. package/src/duckdb/src/include/duckdb/main/client_context_state.hpp +4 -0
  70. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +25 -7
  71. package/src/duckdb/src/include/duckdb/main/pending_query_result.hpp +2 -0
  72. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -0
  73. package/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +2 -2
  74. package/src/duckdb/src/include/duckdb/optimizer/late_materialization.hpp +2 -1
  75. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +11 -5
  76. package/src/duckdb/src/include/duckdb/parallel/executor_task.hpp +4 -1
  77. package/src/duckdb/src/include/duckdb/parallel/pipeline.hpp +0 -1
  78. package/src/duckdb/src/include/duckdb/parallel/task_executor.hpp +3 -0
  79. package/src/duckdb/src/include/duckdb/parallel/task_notifier.hpp +27 -0
  80. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +4 -0
  81. package/src/duckdb/src/include/duckdb/planner/expression/bound_subquery_expression.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/planner/tableref/bound_cteref.hpp +1 -0
  83. package/src/duckdb/src/include/duckdb/storage/checkpoint/table_data_writer.hpp +3 -1
  84. package/src/duckdb/src/include/duckdb/storage/checkpoint_manager.hpp +7 -1
  85. package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +3 -2
  86. package/src/duckdb/src/include/duckdb.h +495 -480
  87. package/src/duckdb/src/main/attached_database.cpp +1 -1
  88. package/src/duckdb/src/main/capi/duckdb-c.cpp +5 -1
  89. package/src/duckdb/src/main/capi/helper-c.cpp +8 -0
  90. package/src/duckdb/src/main/config.cpp +7 -1
  91. package/src/duckdb/src/main/database.cpp +8 -8
  92. package/src/duckdb/src/main/extension/extension_helper.cpp +3 -1
  93. package/src/duckdb/src/main/extension/extension_load.cpp +12 -12
  94. package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +1 -0
  95. package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +2 -2
  96. package/src/duckdb/src/optimizer/late_materialization.cpp +26 -5
  97. package/src/duckdb/src/optimizer/optimizer.cpp +12 -1
  98. package/src/duckdb/src/parallel/executor_task.cpp +10 -6
  99. package/src/duckdb/src/parallel/task_executor.cpp +4 -1
  100. package/src/duckdb/src/parallel/task_notifier.cpp +23 -0
  101. package/src/duckdb/src/parallel/task_scheduler.cpp +33 -0
  102. package/src/duckdb/src/parser/transform/expression/transform_subquery.cpp +4 -1
  103. package/src/duckdb/src/planner/binder/expression/bind_subquery_expression.cpp +1 -1
  104. package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +4 -2
  105. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +7 -2
  106. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +6 -5
  107. package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +4 -2
  108. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  109. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +21 -10
  110. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  111. package/src/duckdb/src/storage/storage_manager.cpp +2 -2
  112. package/src/duckdb/src/storage/table/row_group.cpp +5 -6
  113. package/src/duckdb/src/storage/table/scan_state.cpp +6 -0
  114. package/src/duckdb/src/transaction/duck_transaction.cpp +11 -3
  115. package/src/duckdb/src/transaction/duck_transaction_manager.cpp +2 -2
  116. package/src/duckdb/third_party/concurrentqueue/concurrentqueue.h +17 -0
  117. package/src/duckdb/ub_src_parallel.cpp +2 -0

package/src/duckdb/src/catalog/catalog_search_path.cpp
@@ -189,11 +189,11 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
 	Set(std::move(new_paths), set_type);
 }
 
-const vector<CatalogSearchEntry> &CatalogSearchPath::Get() {
+const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
 	return paths;
 }
 
-string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -205,7 +205,7 @@ string CatalogSearchPath::GetDefaultSchema(const string &catalog) {
 	return DEFAULT_SCHEMA;
 }
 
-string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) {
+string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string &catalog) const {
 	for (auto &path : paths) {
 		if (path.catalog == TEMP_CATALOG) {
 			continue;
@@ -221,7 +221,7 @@ string CatalogSearchPath::GetDefaultSchema(ClientContext &context, const string
 	return DEFAULT_SCHEMA;
 }
 
-string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
+string CatalogSearchPath::GetDefaultCatalog(const string &schema) const {
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		return SYSTEM_CATALOG;
 	}
@@ -236,7 +236,7 @@ string CatalogSearchPath::GetDefaultCatalog(const string &schema) {
 	return INVALID_CATALOG;
 }
 
-vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
+vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) const {
 	vector<string> catalogs;
 	if (DefaultSchemaGenerator::IsDefaultSchema(schema)) {
 		catalogs.push_back(SYSTEM_CATALOG);
@@ -250,7 +250,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) {
 	return catalogs;
 }
 
-vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
+vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
 	vector<string> schemas;
 	for (auto &path : paths) {
 		if (StringUtil::CIEquals(path.catalog, catalog)) {
@@ -260,7 +260,7 @@ vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) {
 	return schemas;
 }
 
-const CatalogSearchEntry &CatalogSearchPath::GetDefault() {
+const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
 	const auto &paths = Get();
 	D_ASSERT(paths.size() >= 2);
 	return paths[1];
@@ -281,7 +281,7 @@ void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
 }
 
 bool CatalogSearchPath::SchemaInSearchPath(ClientContext &context, const string &catalog_name,
-                                           const string &schema_name) {
+                                           const string &schema_name) const {
 	for (auto &path : paths) {
 		if (!StringUtil::CIEquals(path.schema, schema_name)) {
			continue;
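
The const qualifiers above make the read-only accessors callable through a const CatalogSearchPath. A minimal standalone sketch of what the qualifier buys (not DuckDB code, names hypothetical):

    #include <string>
    #include <vector>

    struct SearchPath {
        std::vector<std::string> paths;
        // Without the trailing const, this accessor could not be called on a
        // const SearchPath, even though it never mutates state.
        const std::vector<std::string> &Get() const {
            return paths;
        }
    };

    // Compiles only because Get() is const-qualified.
    void Inspect(const SearchPath &search_path) {
        for (const auto &entry : search_path.Get()) {
            (void)entry;
        }
    }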

package/src/duckdb/src/common/bind_helpers.cpp
@@ -56,6 +56,9 @@ vector<bool> ParseColumnList(const Value &value, vector<string> &names, const st
 		}
 		throw BinderException("\"%s\" expects a column list or * as parameter", loption);
 	}
+	if (value.IsNull()) {
+		throw BinderException("\"%s\" expects a column list or * as parameter, it can't be a NULL value", loption);
+	}
 	auto &children = ListValue::GetChildren(value);
 	// accept '*' as single argument
 	if (children.size() == 1 && children[0].type().id() == LogicalTypeId::VARCHAR &&
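
The added guard rejects NULL before ListValue::GetChildren touches the value. A hedged sketch of the same validate-before-use pattern with hypothetical types (DuckDB's real Value API is richer than this):

    #include <stdexcept>
    #include <string>
    #include <vector>

    struct OptionValue {
        bool is_null = false;
        std::vector<std::string> children;
    };

    // Reject NULL up front so the child list is never read from a NULL value.
    std::vector<std::string> ParseColumns(const OptionValue &value, const std::string &option) {
        if (value.is_null) {
            throw std::invalid_argument("\"" + option + "\" expects a column list or *, not NULL");
        }
        return value.children;
    }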

package/src/duckdb/src/common/compressed_file_system.cpp
@@ -31,6 +31,8 @@ void CompressedFile::Initialize(bool write) {
 	stream_data.out_buff_start = stream_data.out_buff.get();
 	stream_data.out_buff_end = stream_data.out_buff.get();
 
+	current_position = 0;
+
 	stream_wrapper = compressed_fs.CreateStream();
 	stream_wrapper->Initialize(*this, write);
 }

package/src/duckdb/src/common/hive_partitioning.cpp
@@ -245,7 +245,7 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector<HivePartitionK
 
 	const auto &type = input.GetType();
 
-	const auto reinterpret = Value::CreateValue<T>(data[0]).GetTypeMutable() != type;
+	const auto reinterpret = Value::CreateValue<T>(data[sel.get_index(0)]).GetTypeMutable() != type;
 	if (reinterpret) {
 		for (idx_t i = 0; i < count; i++) {
 			auto &key = keys[i];
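
The fix reads the first value through the selection vector instead of assuming it sits in physical slot 0: for dictionary or filtered vectors, logical row i lives at physical position sel.get_index(i). A minimal sketch of that indirection (hypothetical types, not DuckDB's Vector):

    #include <cstdint>
    #include <vector>

    // A selection vector maps logical row positions to physical slots.
    struct Selection {
        std::vector<uint32_t> sel;
        uint32_t get_index(uint32_t logical) const {
            return sel[logical];
        }
    };

    int FirstLogicalValue(const std::vector<int> &data, const Selection &sel) {
        // data[0] may belong to a filtered-out row; the first row the operator
        // actually sees is data[sel.get_index(0)].
        return data[sel.get_index(0)];
    }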

package/src/duckdb/src/common/multi_file_reader.cpp
@@ -508,14 +508,14 @@ void MultiFileReader::CreateMapping(const string &file_name,
 	// copy global columns and inject any different defaults
 	CreateColumnMapping(file_name, local_columns, global_columns, global_column_ids, reader_data, bind_data,
 	                    initial_file, global_state);
-	CreateFilterMap(global_columns, filters, reader_data, global_state);
+	CreateFilterMap(global_column_ids, filters, reader_data, global_state);
 }
 
-void MultiFileReader::CreateFilterMap(const vector<MultiFileReaderColumnDefinition> &global_columns,
+void MultiFileReader::CreateFilterMap(const vector<ColumnIndex> &global_column_ids,
                                       optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
 	if (filters) {
-		auto filter_map_size = global_columns.size();
+		auto filter_map_size = global_column_ids.size();
 		if (global_state) {
 			filter_map_size += global_state->extra_columns.size();
 		}

package/src/duckdb/src/execution/aggregate_hashtable.cpp
@@ -329,7 +329,7 @@ optional_idx GroupedAggregateHashTable::TryAddDictionaryGroups(DataChunk &groups
 	if (dictionary_id.empty()) {
 		// dictionary has no id, we can't cache across vectors
 		// only use dictionary compression if there are fewer entries than groups
-		if (dict_size >= groups.size() * DICTIONARY_THRESHOLD) {
+		if (dict_size * DICTIONARY_THRESHOLD >= groups.size()) {
 			// dictionary is too large - use regular aggregation
 			return optional_idx();
 		}
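
The two inequalities are not equivalent, so this changes when the dictionary path is taken. A worked comparison (the threshold value below is hypothetical; the real DICTIONARY_THRESHOLD constant is defined elsewhere in DuckDB):

    #include <cstdio>

    int main() {
        const double kThreshold = 2.0; // hypothetical stand-in for DICTIONARY_THRESHOLD
        const double dict_size = 700;
        const double group_count = 1024;

        // Old predicate: rejects the dictionary path only when the dictionary
        // dwarfs the group count.
        bool old_reject = dict_size >= group_count * kThreshold; // 700 >= 2048 -> false

        // New predicate: rejects much earlier for the same inputs.
        bool new_reject = dict_size * kThreshold >= group_count; // 1400 >= 1024 -> true

        std::printf("old=%d new=%d\n", old_reject, new_reject);
    }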

package/src/duckdb/src/execution/index/art/art.cpp
@@ -1038,9 +1038,11 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	}
 	case VerifyExistenceType::DELETE_FK: {
 		// DELETE_FK that still exists in a FK table, i.e., not a valid delete.
-		return StringUtil::Format("Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
-		                          "key in a different table",
-		                          key_name);
+		return StringUtil::Format(
+		    "Violates foreign key constraint because key \"%s\" is still referenced by a foreign "
+		    "key in a different table. If this is an unexpected constraint violation, please refer to our "
+		    "foreign key limitations in the documentation",
+		    key_name);
 	}
 	default:
 		throw NotImplementedException("Type not implemented for VerifyExistenceType");
@@ -1091,16 +1093,27 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> dele
 		return;
 	}
 
+	// Fast path for FOREIGN KEY constraints.
+	// Up to here, the above code paths work implicitly for FKs, as the leaf is inlined.
 	// FIXME: proper foreign key + delete ART support.
-	// This implicitly works for foreign keys, as we do not have to consider the actual row IDs.
-	// We only need to know that there are conflicts (for now), as we still perform over-eager constraint checking.
+	if (index_constraint_type == IndexConstraintType::FOREIGN) {
+		D_ASSERT(!deleted_leaf);
+		// We don't handle FK conflicts in UPSERT, so the row ID should not matter.
+		if (manager.AddHit(i, MAX_ROW_ID)) {
+			conflict_idx = i;
+		}
+		return;
+	}
 
 	// Scan the two row IDs in the leaf.
 	Iterator it(*this);
 	it.FindMinimum(leaf);
 	ARTKey empty_key = ARTKey();
 	unsafe_vector<row_t> row_ids;
-	it.Scan(empty_key, 2, row_ids, false);
+	auto success = it.Scan(empty_key, 2, row_ids, false);
+	if (!success || row_ids.size() != 2) {
+		throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned");
+	}
 
 	if (!deleted_leaf) {
		if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
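
For FOREIGN KEY constraints the verifier only needs to know that some conflict exists, not which row caused it, hence the MAX_ROW_ID sentinel in the fast path above. A hedged standalone sketch of that idea (all names hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <utility>
    #include <vector>

    // Stand-in for MAX_ROW_ID: a row ID no real row can have.
    constexpr int64_t kSentinelRowId = std::numeric_limits<int64_t>::max();

    struct ConflictLog {
        std::vector<std::pair<std::size_t, int64_t>> hits;
        bool AddHit(std::size_t chunk_idx, int64_t row_id) {
            hits.emplace_back(chunk_idx, row_id);
            return true; // pretend every hit is a reportable conflict
        }
    };

    // Existence of a conflict is all that matters for FK verification.
    void VerifyForeignKey(ConflictLog &log, std::size_t i, std::size_t &conflict_idx) {
        if (log.AddHit(i, kSentinelRowId)) {
            conflict_idx = i;
        }
    }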

package/src/duckdb/src/execution/index/art/iterator.cpp
@@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 	bool has_next;
 	do {
 		// An empty upper bound indicates that no upper bound exists.
-		if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) {
-			if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
-				return true;
+		if (!upper_bound.Empty()) {
+			if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) {
+				if (current_key.GreaterThan(upper_bound, equal, nested_depth)) {
+					return true;
+				}
 			}
 		}
 
@@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec
 			throw InternalException("Invalid leaf type for index scan.");
 		}
 
+		entered_nested_leaf = false;
 		has_next = Next();
 	} while (has_next);
 	return true;
@@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) {
 	if (node.GetGateStatus() == GateStatus::GATE_SET) {
 		D_ASSERT(status == GateStatus::GATE_NOT_SET);
 		status = GateStatus::GATE_SET;
+		entered_nested_leaf = true;
 		nested_depth = 0;
 	}
 

package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp
@@ -575,6 +575,11 @@ public:
 
 	explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);
 
+	void ReleaseLocalStates() {
+		auto &local_states = window_hash_group->thread_states.at(task->thread_idx);
+		local_states.clear();
+	}
+
 	//! Does the task have more work to do?
 	bool TaskFinished() const {
 		return !task || task->begin_idx == task->end_idx;
@@ -792,6 +797,12 @@ void WindowGlobalSourceState::FinishTask(TaskPtr task) {
 }
 
 bool WindowLocalSourceState::TryAssignTask() {
+	D_ASSERT(TaskFinished());
+	if (task && task->stage == WindowGroupStage::GETDATA) {
+		// If this state completed the last block in the previous iteration,
+		// release out local state memory.
+		ReleaseLocalStates();
+	}
 	// Because downstream operators may be using our internal buffers,
 	// we can't "finish" a task until we are about to get the next one.
 
@@ -888,10 +899,6 @@ void WindowLocalSourceState::GetData(DataChunk &result) {
 		++task->begin_idx;
 	}
 
-	// If that was the last block, release out local state memory.
-	if (TaskFinished()) {
-		local_states.clear();
-	}
 	result.Verify();
 }
 

package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp
@@ -4,7 +4,7 @@
 namespace duckdb {
 
 CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-                     idx_t &global_csv_current_position, idx_t file_number_p)
+                     const idx_t &global_csv_current_position, idx_t file_number_p)
     : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()),
       is_pipe(file_handle.IsPipe()) {
 	AllocateBuffer(buffer_size_p);
@@ -34,7 +34,7 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
 }
 
 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p,
-                                      bool &has_seaked) {
+                                      bool &has_seaked) const {
 	if (has_seaked) {
 		// This means that at some point a reload was done, and we are currently on the incorrect position in our file
 		// handle

package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp
@@ -36,7 +36,7 @@ void CSVEncoderBuffer::Reset() {
 	actual_encoded_buffer_size = 0;
 }
 
-CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
+CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, idx_t buffer_size) {
 	encoding_name = StringUtil::Lower(encoding_name_to_find);
 	auto function = config.GetEncodeFunction(encoding_name_to_find);
 	if (!function) {
@@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(const DBConfig &config, const string &encoding_name_to_find, id
 	}
 	// We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work
 	idx_t encoded_buffer_size = buffer_size % 2 != 0 ? buffer_size - 1 : buffer_size;
+	if (encoded_buffer_size == 0) {
+		// This might happen if buffer size = 1
+		encoded_buffer_size = 2;
+	}
 	D_ASSERT(encoded_buffer_size > 0);
 	encoded_buffer.Initialize(encoded_buffer_size);
 	remaining_bytes_buffer.Initialize(function->GetBytesPerIteration());
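
The sizing arithmetic: a requested size of 1 rounds down to 0 (1 % 2 != 0, so 1 - 1), which previously tripped the D_ASSERT; the new branch bumps it to the smallest usable even size. A standalone sketch of the computation:

    #include <cassert>
    #include <cstddef>

    // Mirrors the sizing logic above: the UTF-16 two-byte lookup needs an even
    // buffer, and a requested size of 1 must not collapse to 0.
    std::size_t EncodedBufferSize(std::size_t requested) {
        std::size_t size = requested % 2 != 0 ? requested - 1 : requested;
        if (size == 0) {
            size = 2; // smallest even buffer that can hold one UTF-16 code unit
        }
        assert(size > 0 && size % 2 == 0);
        return size;
    }

    int main() {
        assert(EncodedBufferSize(1) == 2); // the previously failing edge case
        assert(EncodedBufferSize(7) == 6);
        assert(EncodedBufferSize(8) == 8);
    }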

package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp
@@ -11,9 +11,10 @@ ScannerResult::ScannerResult(CSVStates &states_p, CSVStateMachine &state_machine
 
 BaseScanner::BaseScanner(shared_ptr<CSVBufferManager> buffer_manager_p, shared_ptr<CSVStateMachine> state_machine_p,
                          shared_ptr<CSVErrorHandler> error_handler_p, bool sniffing_p,
-                         shared_ptr<CSVFileScan> csv_file_scan_p, CSVIterator iterator_p)
+                         shared_ptr<CSVFileScan> csv_file_scan_p, const CSVIterator &iterator_p)
     : csv_file_scan(std::move(csv_file_scan_p)), sniffing(sniffing_p), error_handler(std::move(error_handler_p)),
-      state_machine(std::move(state_machine_p)), buffer_manager(std::move(buffer_manager_p)), iterator(iterator_p) {
+      state_machine(std::move(state_machine_p)), states(), buffer_manager(std::move(buffer_manager_p)),
+      iterator(iterator_p) {
 	D_ASSERT(buffer_manager);
 	D_ASSERT(state_machine);
 	// Initialize current buffer handle

package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp
@@ -76,8 +76,8 @@ void CSVSchema::MergeSchemas(CSVSchema &other, bool null_padding) {
 	}
 }
 
-CSVSchema::CSVSchema(vector<string> &names, vector<LogicalType> &types, const string &file_path, idx_t rows_read_p,
-                     const bool empty_p)
+CSVSchema::CSVSchema(const vector<string> &names, const vector<LogicalType> &types, const string &file_path,
+                     idx_t rows_read_p, const bool empty_p)
     : rows_read(rows_read_p), empty(empty_p) {
 	Initialize(names, types, file_path);
 }

package/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp
@@ -13,7 +13,7 @@ CSVBoundary::CSVBoundary(idx_t buffer_idx_p, idx_t buffer_pos_p, idx_t boundary_
 CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_pos(NumericLimits<idx_t>::Maximum()) {
 }
 
-CSVIterator::CSVIterator() : is_set(false) {
+CSVIterator::CSVIterator() : buffer_size(0), is_set(false) {
 }
 
 void CSVBoundary::Print() const {

package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -688,23 +688,29 @@ bool LineError::HandleErrors(StringValueResult &result) {
 			                                  line_pos.GetGlobalPosition(result.requested_size), result.path);
 			}
 			break;
-		case CAST_ERROR:
+		case CAST_ERROR: {
+			string column_name;
+			LogicalTypeId type_id;
+			if (cur_error.col_idx < result.names.size()) {
+				column_name = result.names[cur_error.col_idx];
+			}
+			if (cur_error.col_idx < result.number_of_columns) {
+				type_id = result.parse_types[cur_error.chunk_idx].type_id;
+			}
 			if (result.current_line_position.begin == line_pos) {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size, first_nl),
-				    result.parse_types[cur_error.chunk_idx].type_id, result.path);
+				    line_pos.GetGlobalPosition(result.requested_size, first_nl), type_id, result.path);
 			} else {
 				csv_error = CSVError::CastError(
-				    result.state_machine.options, result.names[cur_error.col_idx], cur_error.error_message,
-				    cur_error.col_idx, borked_line, lines_per_batch,
+				    result.state_machine.options, column_name, cur_error.error_message, cur_error.col_idx, borked_line,
+				    lines_per_batch,
 				    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl),
-				    line_pos.GetGlobalPosition(result.requested_size), result.parse_types[cur_error.chunk_idx].type_id,
-				    result.path);
+				    line_pos.GetGlobalPosition(result.requested_size), type_id, result.path);
 			}
-			break;
+		} break;
 		case MAXIMUM_LINE_SIZE:
 			csv_error = CSVError::LineSizeError(
 			    result.state_machine.options, lines_per_batch, borked_line,
@@ -964,7 +970,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr<CSV
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), scanner_idx_p) {
+             buffer_manager->GetFilePath(), scanner_idx_p),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 
@@ -976,7 +983,8 @@ StringValueScanner::StringValueScanner(const shared_ptr<CSVBufferManager> &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
            buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-             buffer_manager->GetFilePath(), 0) {
+             buffer_manager->GetFilePath(), 0),
+      start_pos(0) {
 	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
@@ -397,7 +397,13 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 			}
 		}
 	}
-	if (max_columns_found == num_cols && ignored_rows > min_ignored_rows) {
+	if (max_columns_found == num_cols && (ignored_rows > min_ignored_rows)) {
+		return;
+	}
+	if (max_columns_found > 1 && num_cols > max_columns_found && consistent_rows < best_consistent_rows / 2 &&
+	    options.null_padding) {
+		// When null_padding is true, we only give preference to a max number of columns if null padding is at least
+		// 50% as consistent as the best case scenario
 		return;
 	}
 	if (quoted && num_cols < max_columns_found) {
@@ -436,28 +442,19 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
 	    !require_more_padding && !invalid_padding && num_cols == max_columns_found && comments_are_acceptable) {
 		auto &sniffing_state_machine = scanner->GetStateMachine();
 
-		bool same_quote_is_candidate = false;
-		for (const auto &candidate : candidates) {
-			if (sniffing_state_machine.dialect_options.state_machine_options.quote ==
-			    candidate->GetStateMachine().dialect_options.state_machine_options.quote) {
-				same_quote_is_candidate = true;
-			}
-		}
-		if (!same_quote_is_candidate) {
-			if (options.dialect_options.skip_rows.IsSetByUser()) {
-				// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
-				// ignore_errors is set
-				if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
-					return;
-				}
-				sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
-			} else if (!options.null_padding) {
-				sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
+		if (options.dialect_options.skip_rows.IsSetByUser()) {
+			// If skip rows is set by user, and we found dirty notes, we only accept it if either null_padding or
+			// ignore_errors is set
+			if (dirty_notes != 0 && !options.null_padding && !options.ignore_errors.GetValue()) {
+				return;
 			}
-			sniffing_state_machine.dialect_options.num_cols = num_cols;
-			lines_sniffed = sniffed_column_counts.result_position;
-			candidates.emplace_back(std::move(scanner));
+			sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue();
+		} else if (!options.null_padding) {
+			sniffing_state_machine.dialect_options.skip_rows = dirty_notes;
 		}
+		sniffing_state_machine.dialect_options.num_cols = num_cols;
+		lines_sniffed = sniffed_column_counts.result_position;
+		candidates.emplace_back(std::move(scanner));
 	}
 }
 
@@ -491,7 +488,7 @@ void CSVSniffer::RefineCandidates() {
 
 	for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
 		vector<unique_ptr<ColumnCountScanner>> successful_candidates;
-		bool done = false;
+		bool done = candidates.empty();
 		for (auto &cur_candidate : candidates) {
			const bool finished_file = cur_candidate->FinishedFile();
			if (successful_candidates.empty()) {
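
The new null_padding rule in the first hunk, with illustrative numbers: a candidate that found more columns is kept only if it is at least half as consistent as the best candidate so far.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Illustrative values only; the sniffer derives these from the sample.
        const std::size_t best_consistent_rows = 100;
        const bool null_padding = true;

        for (std::size_t consistent_rows : {40u, 60u}) {
            // Mirrors: consistent_rows < best_consistent_rows / 2 -> reject.
            bool reject = null_padding && consistent_rows < best_consistent_rows / 2;
            std::printf("consistent_rows=%zu -> %s\n", consistent_rows,
                        reject ? "rejected" : "kept");
        }
    }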

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -2,7 +2,7 @@
 #include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"
 
 namespace duckdb {
-bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
+bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) const {
 	auto &sniffing_state_machine = best_candidate->GetStateMachine();
 	// try vector-cast from string to sql_type
 	Vector dummy_result(sql_type, size);

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -303,6 +303,7 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, const string &colu
 		           "correctly parse this column."
 		        << '\n';
 	}
+	how_to_fix_it << "* Check whether the null string value is set correctly (e.g., nullstr = 'N/A')" << '\n';
 
 	return CSVError(error.str(), CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, byte_position, options,
	                how_to_fix_it.str(), current_path);
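
The new suggestion points at read_csv's nullstr option. A hedged usage example through DuckDB's C++ API (data.csv is a hypothetical file whose missing values are written as N/A):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);

        // Without nullstr, an 'N/A' in an INTEGER column raises the cast error
        // that the hint above is attached to; with it, the value becomes NULL.
        auto result = con.Query("SELECT * FROM read_csv('data.csv', nullstr = 'N/A')");
        if (result->HasError()) {
            result->Print();
        }
    }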

package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
@@ -251,6 +251,10 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 			throw BinderException("Invalid value for MAX_LINE_SIZE parameter: it cannot be smaller than 0");
 		}
 		maximum_line_size.Set(NumericCast<idx_t>(line_size));
+		if (buffer_size_option.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		}
 	} else if (loption == "date_format" || loption == "dateformat") {
 		string format = ParseString(value, loption);
 		SetDateFormat(LogicalTypeId::DATE, format, true);
@@ -264,6 +268,12 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (buffer_size_option == 0) {
 			throw InvalidInputException("Buffer Size option must be higher than 0");
 		}
+		if (maximum_line_size.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) {
+			throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d",
+			                            buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		} else {
+			maximum_line_size.Set(buffer_size_option.GetValue(), false);
+		}
 	} else if (loption == "decimal_separator") {
 		decimal_separator = ParseString(value, loption);
 		if (decimal_separator != "." && decimal_separator != ",") {
@@ -298,6 +308,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("REJECTS_TABLE option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_table_name.Set(table_name);
 	} else if (loption == "rejects_scan") {
 		// skip, handled in SetRejectsOptions
@@ -305,6 +318,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (table_name.empty()) {
 			throw BinderException("rejects_scan option cannot be empty");
 		}
+		if (KeywordHelper::RequiresQuotes(table_name)) {
+			throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name);
+		}
 		rejects_scan_name.Set(table_name);
 	} else if (loption == "rejects_limit") {
		auto limit = ParseInteger(value, loption);
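
With this cross-check, contradictory options now fail at bind time rather than partway through a scan. A hedged example through the C++ API (file name hypothetical):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);

        // buffer_size (100 bytes) is smaller than max_line_size (1000 bytes),
        // so this is rejected up front with an InvalidInputException.
        auto result = con.Query(
            "SELECT * FROM read_csv('data.csv', buffer_size = 100, max_line_size = 1000)");
        if (result->HasError()) {
            result->Print(); // "Buffer Size of 100 must be a higher value than ..."
        }
    }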

package/src/duckdb/src/execution/operator/helper/physical_reservoir_sample.cpp
@@ -86,6 +86,7 @@ SourceResultType PhysicalReservoirSample::GetData(ExecutionContext &context, Dat
 		return SourceResultType::FINISHED;
 	}
 	auto sample_chunk = sink.sample->GetChunk();
+
 	if (!sample_chunk) {
 		return SourceResultType::FINISHED;
 	}

package/src/duckdb/src/execution/operator/helper/physical_streaming_sample.cpp
@@ -5,10 +5,11 @@
 
 namespace duckdb {
 
-PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types, SampleMethod method, double percentage,
-                                                 int64_t seed, idx_t estimated_cardinality)
-    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality), method(method),
-      percentage(percentage / 100), seed(seed) {
+PhysicalStreamingSample::PhysicalStreamingSample(vector<LogicalType> types, unique_ptr<SampleOptions> options,
+                                                 idx_t estimated_cardinality)
+    : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality),
+      sample_options(std::move(options)) {
+	percentage = sample_options->sample_size.GetValue<double>() / 100;
 }
 
 //===--------------------------------------------------------------------===//
@@ -49,13 +50,21 @@ void PhysicalStreamingSample::BernoulliSample(DataChunk &input, DataChunk &resul
 	}
 }
 
+bool PhysicalStreamingSample::ParallelOperator() const {
+	return !(sample_options->repeatable || sample_options->seed.IsValid());
+}
+
 unique_ptr<OperatorState> PhysicalStreamingSample::GetOperatorState(ExecutionContext &context) const {
-	return make_uniq<StreamingSampleOperatorState>(seed);
+	if (!ParallelOperator()) {
+		return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(sample_options->seed.GetIndex()));
+	}
+	RandomEngine random;
+	return make_uniq<StreamingSampleOperatorState>(static_cast<int64_t>(random.NextRandomInteger64()));
 }
 
 OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
                                                     GlobalOperatorState &gstate, OperatorState &state) const {
-	switch (method) {
+	switch (sample_options->method) {
 	case SampleMethod::BERNOULLI_SAMPLE:
 		BernoulliSample(input, chunk, state);
 		break;
@@ -70,7 +79,7 @@ OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, D
 
 InsertionOrderPreservingMap<string> PhysicalStreamingSample::ParamsToString() const {
 	InsertionOrderPreservingMap<string> result;
-	result["Sample Method"] = EnumUtil::ToString(method) + ": " + to_string(100 * percentage) + "%";
+	result["Sample Method"] = EnumUtil::ToString(sample_options->method) + ": " + to_string(100 * percentage) + "%";
 	return result;
 }
 
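The operator now runs in parallel only when no seed was requested: a repeatable or explicitly seeded sample reuses the given seed per operator state, while an unseeded one draws a fresh random seed per state. An illustrative SQL-level example through the C++ API (assuming USING SAMPLE's optional second argument as the seed):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        con.Query("CREATE TABLE t AS SELECT range AS i FROM range(100000)");

        // Unseeded: eligible for parallel execution, rows differ run to run.
        con.Query("SELECT count(*) FROM t USING SAMPLE 10% (bernoulli)")->Print();

        // Seeded: forces the single-seed path, so the sample is reproducible.
        con.Query("SELECT count(*) FROM t USING SAMPLE 10% (bernoulli, 42)")->Print();
    }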

package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp
@@ -215,7 +215,9 @@ public:
 		auto &gstate = gstate_p.Cast<BatchInsertGlobalState>();
 		auto &lstate = lstate_p.Cast<BatchInsertLocalState>();
 		// merge together the collections
-		D_ASSERT(lstate.writer);
+		if (!lstate.writer) {
+			lstate.writer = &gstate.table.GetStorage().CreateOptimisticWriter(context);
+		}
 		auto final_collection = gstate.MergeCollections(context, std::move(merge_collections), *lstate.writer);
 		// add the merged-together collection to the set of batch indexes
 		lock_guard<mutex> l(gstate.lock);

package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp
@@ -108,7 +108,17 @@ SourceResultType PhysicalTableScan::GetData(ExecutionContext &context, DataChunk
 	if (g_state.in_out_final) {
 		function.in_out_function_final(context, data, chunk);
 	}
-	function.in_out_function(context, data, g_state.input_chunk, chunk);
+	switch (function.in_out_function(context, data, g_state.input_chunk, chunk)) {
+	case OperatorResultType::BLOCKED: {
+		auto guard = g_state.Lock();
+		return g_state.BlockSource(guard, input.interrupt_state);
+	}
+	default:
+		// FIXME: Handling for other cases (such as NEED_MORE_INPUT) breaks current functionality and extensions that
+		// might be relying on current behaviour. Needs a rework that is not in scope
+		break;
+	}
+
 	if (chunk.size() == 0 && function.in_out_function_final) {
 		function.in_out_function_final(context, data, chunk);
 		g_state.in_out_final = true;