npm - duckdb - Versions diffs - 0.7.2-dev3294.0 → 0.7.2-dev3353.0 - Mend

duckdb 0.7.2-dev3294.0 → 0.7.2-dev3353.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/src/duckdb/src/function/table/read_csv.cpp CHANGED

@@ -14,6 +14,7 @@
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
 #include "duckdb/main/client_data.hpp"
+#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
 #include <limits>
@@ -262,19 +263,21 @@ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFun
 //===--------------------------------------------------------------------===//
 // Parallel CSV Reader CSV Global State
 //===--------------------------------------------------------------------===//
-//===--------------------------------------------------------------------===//
-// Read CSV Global State
-//===--------------------------------------------------------------------===//
 struct ParallelCSVGlobalState : public GlobalTableFunctionState {
 public:
 	ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
 	                       vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
-	                       idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p)
+	                       idx_t rows_to_skip, bool force_parallelism_p, vector<column_t> column_ids_p, bool has_header)
 	    : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
-	      force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)) {
+	      force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
+	      line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
 		file_handle->DisableReset();
 		current_file_path = files_path_p[0];
-		estimated_linenr = rows_to_skip;
+		line_info.lines_read[0] = rows_to_skip;
+		if (has_header) {
+			line_info.lines_read[0]++;
+		}
 		file_size = file_handle->FileSize();
 		first_file_size = file_size;
 		bytes_read = 0;
@@ -296,8 +299,16 @@ public:
 		next_buffer = shared_ptr<CSVBuffer>(
 		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		running_threads = MaxThreads();
+		// Initialize all the book-keeping variables
+		auto file_count = files_path_p.size();
+		line_info.current_batches.resize(file_count);
+		tuple_start.resize(file_count);
+		tuple_end.resize(file_count);
+		tuple_end_to_batch.resize(file_count);
+		batch_to_tuple_end.resize(file_count);
 	}
-	ParallelCSVGlobalState() {
+	ParallelCSVGlobalState() : line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
 		running_threads = MaxThreads();
 	}
@@ -316,7 +327,9 @@ public:
 	//! Verify if the CSV File was read correctly
 	void Verify();
-	void UpdateVerification(VerificationPositions positions, idx_t file_number);
+	void UpdateVerification(VerificationPositions positions, idx_t file_number, idx_t batch_idx);
+	void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
 	void IncrementThread();
@@ -354,8 +367,6 @@ private:
 	mutex main_mutex;
 	//! Byte set from for last thread
 	idx_t next_byte = 0;
-	//! The current estimated line number
-	idx_t estimated_linenr;
 	//! How many bytes we should execute per local state
 	idx_t bytes_per_local_state;
 	//! Size of first file
@@ -366,6 +377,8 @@ private:
 	idx_t buffer_size;
 	//! Current batch index
 	idx_t batch_index = 0;
+	idx_t local_batch_index = 0;
 	//! Forces parallelism for small CSV Files, should only be used for testing.
 	bool force_parallelism = false;
 	//! Current (Global) position of CSV
@@ -379,9 +392,15 @@ private:
 	//! positions where they started reading the first line.
 	vector<vector<idx_t>> tuple_end;
 	vector<set<idx_t>> tuple_start;
+	//! Tuple end to batch
+	vector<unordered_map<idx_t, idx_t>> tuple_end_to_batch;
+	//! Batch to Tuple End
+	vector<unordered_map<idx_t, idx_t>> batch_to_tuple_end;
 	idx_t running_threads = 0;
 	//! The column ids to read
 	vector<column_t> column_ids;
+	//! Line Info used in error messages
+	LineInfo line_info;
 };
 idx_t ParallelCSVGlobalState::MaxThreads() const {
@@ -425,32 +444,61 @@ void ParallelCSVGlobalState::Verify() {
 				return;
 			}
 			auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
-			for (auto &last_pos : current_tuple_end) {
+			for (idx_t tpl_idx = 0; tpl_idx < current_tuple_end.size(); tpl_idx++) {
+				auto last_pos = current_tuple_end[tpl_idx];
 				auto first_pos = current_tuple_start.find(last_pos);
 				if (first_pos == current_tuple_start.end()) {
 					// this might be necessary due to carriage returns outside buffer scopes.
 					first_pos = current_tuple_start.find(last_pos + 1);
 				}
 				if (first_pos == current_tuple_start.end() && last_pos != max_value) {
-					string error =
-					    "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
-					    " does not have a match\n";
-					error += "End Lines: \n";
-					for (auto &end_line : current_tuple_end) {
-						error += to_string(end_line) + "\n";
-					}
-					error += "Start Lines: \n";
-					for (auto &start_line : current_tuple_start) {
-						error += to_string(start_line) + "\n";
-					}
+					auto batch_idx = tuple_end_to_batch[i][last_pos];
+					auto problematic_line = line_info.GetLine(batch_idx);
 					throw InvalidInputException(
-					    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
+					    "CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
+					    "that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
+					    "please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
+					    problematic_line);
 				}
 			}
 		}
 	}
 }
+void LineInfo::Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos) {
+	auto &tuple_start_set = tuple_start[file_idx];
+	auto &processed_batches = batch_to_tuple_end[file_idx];
+	auto &tuple_end_vec = tuple_end[file_idx];
+	bool has_error = false;
+	idx_t problematic_line;
+	if (batch_idx == 0 || tuple_start_set.empty()) {
+		return;
+	}
+	for (idx_t cur_batch = 0; cur_batch < batch_idx - 1; cur_batch++) {
+		auto cur_end = tuple_end_vec[processed_batches[cur_batch]];
+		auto first_pos = tuple_start_set.find(cur_end);
+		if (first_pos == tuple_start_set.end()) {
+			has_error = true;
+			problematic_line = GetLine(cur_batch);
+			break;
+		}
+	}
+	if (!has_error) {
+		auto cur_end = tuple_end_vec[processed_batches[batch_idx - 1]];
+		if (cur_end != cur_first_pos) {
+			has_error = true;
+			problematic_line = GetLine(batch_idx);
+		}
+	}
+	if (has_error) {
+		throw InvalidInputException(
+		    "CSV File not supported for multithreading. This can be a problematic line in your CSV File or "
+		    "that this CSV can't be read in Parallel. Please, inspect if the line %llu is correct. If so, "
+		    "please run single-threaded CSV Reading by setting parallel=false in the read_csv call.",
+		    problematic_line);
+	}
+}
 bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bind_data,
                                   unique_ptr<ParallelCSVReader> &reader) {
 	lock_guard<mutex> parallel_lock(main_mutex);
@@ -461,6 +509,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 			file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
 			current_csv_position = 0;
 			file_number++;
+			local_batch_index = 0;
 			current_buffer =
 			    make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
 			next_buffer = shared_ptr<CSVBuffer>(
@@ -472,11 +521,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 		}
 	}
 	// set up the current buffer
+	line_info.current_batches.back().insert(local_batch_index);
 	auto result = make_uniq<CSVBufferRead>(current_buffer, next_buffer, next_byte, next_byte + bytes_per_local_state,
-	                                       batch_index++, estimated_linenr);
+	                                       batch_index++, local_batch_index++, &line_info);
 	// move the byte index of the CSV reader to the next buffer
 	next_byte += bytes_per_local_state;
-	estimated_linenr += bytes_per_local_state / (bind_data.csv_types.size() * 5); // estimate 5 bytes per column
 	if (next_byte >= current_buffer->GetBufferSize()) {
 		// We replace the current buffer with the next buffer
 		next_byte = 0;
@@ -495,17 +544,17 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 			// we are doing UNION BY NAME - fetch the options from the union reader for this file
 			auto &union_reader = *bind_data.union_readers[file_index - 1];
 			reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
-			                                      union_reader.GetTypes());
+			                                      union_reader.GetTypes(), file_index - 1);
 			reader->names = union_reader.GetNames();
 		} else if (file_index <= bind_data.column_info.size()) {
 			// Serialized Union By name
 			reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
-			                                      bind_data.column_info[file_index - 1].types);
+			                                      bind_data.column_info[file_index - 1].types, file_index - 1);
 			reader->names = bind_data.column_info[file_index - 1].names;
 		} else {
 			// regular file - use the standard options
 			reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
-			                                      bind_data.csv_types);
+			                                      bind_data.csv_types, file_index - 1);
 			reader->names = bind_data.csv_names;
 		}
 		reader->options.file_path = current_file_path;
@@ -518,23 +567,64 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	}
 	return true;
 }
-void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
+void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p, idx_t batch_idx) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	if (positions.beginning_of_first_line < positions.end_of_last_line) {
 		if (positions.end_of_last_line > max_tuple_end) {
 			max_tuple_end = positions.end_of_last_line;
 		}
-		while (file_number_p >= tuple_start.size()) {
-			vector<idx_t> empty_tuple_end;
-			set<idx_t> empty_set;
-			tuple_start.emplace_back(empty_set);
-			tuple_end.emplace_back(empty_tuple_end);
-		}
+		tuple_end_to_batch[file_number_p][positions.end_of_last_line] = batch_idx;
+		batch_to_tuple_end[file_number_p][batch_idx] = tuple_end[file_number_p].size();
 		tuple_start[file_number_p].insert(positions.beginning_of_first_line);
 		tuple_end[file_number_p].push_back(positions.end_of_last_line);
 	}
 }
+void ParallelCSVGlobalState::UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx) {
+	auto batch_idx = buffer_read.local_batch_index;
+	auto lines_read = buffer_read.lines_read;
+	lock_guard<mutex> parallel_lock(main_mutex);
+	line_info.current_batches[file_idx].erase(batch_idx);
+	line_info.lines_read[batch_idx] += lines_read;
+}
+bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	if (current_batches.empty() || done) {
+		return true;
+	}
+	auto min_value = *current_batches[file_idx].begin();
+	if (min_value >= batch_idx) {
+		return true;
+	}
+	return false;
+}
+idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify) {
+	unique_ptr<lock_guard<mutex>> parallel_lock;
+	if (!verify) {
+		parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+	}
+	idx_t line_count = 0;
+	if (done) {
+		return first_line;
+	}
+	for (idx_t i = 0; i <= batch_idx; i++) {
+		if (lines_read.find(i) == lines_read.end() && i != batch_idx) {
+			throw InternalException("Missing batch index on Parallel CSV Reader GetLine");
+		}
+		line_count += lines_read[i];
+	}
+	// before we are done, if this is not a call in Verify() we must check Verify up to this batch
+	if (!verify) {
+		Verify(file_idx, batch_idx, cur_start);
+	}
+	done = true;
+	first_line = line_count + line_error;
+	return first_line;
+}
 static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
                                                                   TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
@@ -548,7 +638,8 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
 	file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
 	return make_uniq<ParallelCSVGlobalState>(
 	    context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
-	    bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
+	    bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids,
+	    bind_data.options.header && bind_data.options.has_header);
 }
 //===--------------------------------------------------------------------===//
@@ -597,9 +688,15 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 			auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
 			if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
 				csv_global_state.UpdateVerification(verification_updates,
-				                                    csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
+				                                    csv_local_state.csv_reader->buffer->buffer->GetFileNumber(),
+				                                    csv_local_state.csv_reader->buffer->local_batch_index);
 			}
+			csv_global_state.UpdateLinesRead(*csv_local_state.csv_reader->buffer, csv_local_state.csv_reader->file_idx);
 			auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
+			if (csv_local_state.csv_reader) {
+				csv_local_state.csv_reader->linenr = 0;
+			}
 			if (!has_next) {
 				csv_global_state.DecrementThread();
 				break;
@@ -945,6 +1042,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
 	writer.WriteString(file_path);
 	writer.WriteString(decimal_separator);
 	writer.WriteField<bool>(null_padding);
+	writer.WriteField<idx_t>(buffer_size);
 	writer.WriteSerializable(file_options);
 	// write options
 	writer.WriteListNoReference<bool>(force_quote);
@@ -980,6 +1078,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
 	file_path = reader.ReadRequired<string>();
 	decimal_separator = reader.ReadRequired<string>();
 	null_padding = reader.ReadRequired<bool>();
+	buffer_size = reader.ReadRequired<idx_t>();
 	file_options = reader.ReadRequiredSerializable<MultiFileReaderOptions, MultiFileReaderOptions>();
 	// write options
 	force_quote = reader.ReadRequiredList<bool>();

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev3294"
+#define DUCKDB_VERSION "0.7.2-dev3353"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "f8eae1c85d"
+#define DUCKDB_SOURCE_ID "ec0ca94cdf"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp CHANGED

@@ -17,6 +17,8 @@
 #include "duckdb/common/queue.hpp"
 #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
 #include <sstream>
 namespace duckdb {
@@ -74,6 +76,10 @@ public:
 	const vector<LogicalType> &GetTypes() {
 		return return_types;
 	}
+	virtual idx_t GetLineError(idx_t line_error, idx_t buffer_idx) {
+		return line_error;
+	};
 	//! Initialize projection indices to select all columns
 	void InitializeProjection();
@@ -88,17 +94,18 @@ protected:
 	bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
 	//! Adds a value to the current row
-	void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
+	void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
+	              idx_t buffer_idx = 0);
 	//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
-	bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message);
+	bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx = 0);
 	//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
-	bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
+	bool Flush(DataChunk &insert_chunk, idx_t buffer_idx = 0, bool try_add_line = false);
 	unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
 	void VerifyUTF8(idx_t col_idx);
 	void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
-	static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
+	string GetLineNumberStr(idx_t linenr, bool linenr_estimated, idx_t buffer_idx = 0);
 	//! Sets the newline delimiter
 	void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);

package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp CHANGED

@@ -57,7 +57,8 @@ public:
 	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
 	BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
 	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
-	~BufferedCSVReader();
+	virtual ~BufferedCSVReader() {
+	}
 	unique_ptr<char[]> buffer;
 	idx_t buffer_size;

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp ADDED

@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/persistent/csv_line_info.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+namespace duckdb {
+struct LineInfo {
+public:
+	explicit LineInfo(mutex &main_mutex_p, vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end_p,
+	                  vector<set<idx_t>> &tuple_start_p, vector<vector<idx_t>> &tuple_end_p)
+	    : main_mutex(main_mutex_p), batch_to_tuple_end(batch_to_tuple_end_p), tuple_start(tuple_start_p),
+	      tuple_end(tuple_end_p) {};
+	bool CanItGetLine(idx_t file_idx, idx_t batch_idx);
+	idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true);
+	//! Verify if the CSV File was read correctly from [0,batch_idx] batches.
+	void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos);
+	//! Lines read per batch, <batch_index,count>
+	unordered_map<idx_t, idx_t> lines_read;
+	//! Set of batches that have been initialized but are not yet finished.
+	vector<set<idx_t>> current_batches;
+	//! Pointer to CSV Reader Mutex
+	mutex &main_mutex;
+	//! Pointer Batch to Tuple End
+	vector<unordered_map<idx_t, idx_t>> &batch_to_tuple_end;
+	//! Pointer Batch to Tuple Start
+	vector<set<idx_t>> &tuple_start;
+	//! Pointer Batch to Tuple End
+	vector<vector<idx_t>> &tuple_end;
+	//! If we already threw an exception on a previous thread.
+	bool done = false;
+	idx_t first_line = 0;
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp CHANGED

@@ -1,7 +1,7 @@
 //===----------------------------------------------------------------------===//
 //                         DuckDB
 //
-// duckdb/execution/operator/persistent/buffered_csv_reader.hpp
+// duckdb/execution/operator/persistent/parallel_csv_reader.hpp
 //
 //
 //===----------------------------------------------------------------------===//
@@ -12,6 +12,7 @@
 #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
 #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
 #include "duckdb/execution/operator/persistent/csv_buffer.hpp"
+#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
 #include <sstream>
 #include <utility>
@@ -20,9 +21,9 @@ namespace duckdb {
 struct CSVBufferRead {
 	CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
-	              idx_t estimated_linenr)
-	    : buffer(std::move(buffer_p)), buffer_start(buffer_start_p), buffer_end(buffer_end_p), batch_index(batch_index),
-	      estimated_linenr(estimated_linenr) {
+	              idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
+	    : buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
+	      batch_index(batch_index), local_batch_index(local_batch_index_p) {
 		if (buffer) {
 			if (buffer_end > buffer->GetBufferSize()) {
 				buffer_end = buffer->GetBufferSize();
@@ -34,8 +35,9 @@ struct CSVBufferRead {
 	}
 	CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
-	              idx_t buffer_end_p, idx_t batch_index, idx_t estimated_linenr)
-	    : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, estimated_linenr) {
+	              idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
+	    : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
+	                    line_info_p) {
 		next_buffer = std::move(nxt_buffer_p);
 	}
@@ -84,23 +86,27 @@ struct CSVBufferRead {
 	shared_ptr<CSVBuffer> buffer;
 	shared_ptr<CSVBuffer> next_buffer;
 	vector<unique_ptr<char[]>> intersections;
+	optional_ptr<LineInfo> line_info;
 	idx_t buffer_start;
 	idx_t buffer_end;
 	idx_t batch_index;
-	idx_t estimated_linenr;
+	idx_t local_batch_index;
+	idx_t lines_read = 0;
 };
 struct VerificationPositions {
 	idx_t beginning_of_first_line = 0;
 	idx_t end_of_last_line = 0;
 };
-//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
+//! CSV Reader for Parallel Reading
 class ParallelCSVReader : public BaseCSVReader {
 public:
 	ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
-	                  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
-	~ParallelCSVReader();
+	                  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
+	virtual ~ParallelCSVReader() {
+	}
 	//! Current Position (Relative to the Buffer)
 	idx_t position_buffer = 0;
@@ -118,13 +124,21 @@ public:
 	bool finished = false;
 	unique_ptr<CSVBufferRead> buffer;
+	idx_t file_idx;
 	VerificationPositions GetVerificationPositions();
+	//! Position of the first read line and last read line for verification purposes
+	VerificationPositions verification_positions;
 public:
 	void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
 	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
 	void ParseCSV(DataChunk &insert_chunk);
+	idx_t GetLineError(idx_t line_error, idx_t buffer_idx) override;
 private:
 	//! Initialize Parser
 	void Initialize(const vector<LogicalType> &requested_types);
@@ -135,7 +149,7 @@ private:
 	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
 	bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
 	//! Sets Position depending on the byte_start of this thread
-	bool SetPosition(DataChunk &insert_chunk);
+	bool SetPosition();
 	//! Called when scanning the 1st buffer, skips empty lines
 	void SkipEmptyLines();
 	//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
@@ -148,8 +162,9 @@ private:
 	//! Parses a CSV file with a one-byte delimiter, escape and quote character
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
-	//! Position of the first read line and last read line for verification purposes
-	VerificationPositions verification_positions;
+	//! Verifies that the line length did not go over a pre-defined limit.
+	void VerifyLineLength(idx_t line_size);
 	//! First Position of First Buffer
 	idx_t first_pos_first_buffer = 0;
 };

package/src/duckdb/src/main/extension/extension_load.cpp CHANGED

@@ -226,19 +226,20 @@ bool ExtensionHelper::IsFullPath(const string &extension) {
 	       StringUtil::Contains(extension, "\\");
 }
-string ExtensionHelper::GetExtensionName(const string &extension) {
+string ExtensionHelper::GetExtensionName(const string &original_name) {
+	auto extension = StringUtil::Lower(original_name);
 	if (!IsFullPath(extension)) {
-		return extension;
+		return ExtensionHelper::ApplyExtensionAlias(extension);
 	}
 	auto splits = StringUtil::Split(StringUtil::Replace(extension, "\\", "/"), '/');
 	if (splits.empty()) {
-		return extension;
+		return ExtensionHelper::ApplyExtensionAlias(extension);
 	}
 	splits = StringUtil::Split(splits.back(), '.');
 	if (splits.empty()) {
-		return extension;
+		return ExtensionHelper::ApplyExtensionAlias(extension);
 	}
-	return StringUtil::Lower(splits.front());
+	return ExtensionHelper::ApplyExtensionAlias(splits.front());
 }
 void ExtensionHelper::LoadExternalExtension(DatabaseInstance &db, FileOpener *opener, const string &extension) {

package/src/duckdb/src/optimizer/rule/move_constants.cpp CHANGED

@@ -55,6 +55,9 @@ unique_ptr<Expression> MoveConstantsRule::Apply(LogicalOperator &op, vector<refe
 		}
 		auto result_value = Value::HUGEINT(outer_value);
 		if (!result_value.DefaultTryCastAs(constant_type)) {
+			if (comparison.type != ExpressionType::COMPARE_EQUAL) {
+				return nullptr;
+			}
 			// if the cast is not possible then the comparison is not possible
 			// for example, if we have x + 5 = 3, where x is an unsigned number, we will get x = -2
 			// since this is not possible we can remove the entire branch here

package/src/duckdb/src/storage/statistics/string_stats.cpp CHANGED

@@ -140,7 +140,7 @@ void StringStats::Update(BaseStatistics &stats, const string_t &value) {
 		if (unicode == UnicodeType::UNICODE) {
 			string_data.has_unicode = true;
 		} else if (unicode == UnicodeType::INVALID) {
-			throw InternalException(
+			throw InvalidInputException(
 			    ErrorManager::InvalidUnicodeError(string((char *)data, size), "segment statistics update"));
 		}
 	}