duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +124 -58
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
- package/src/duckdb/src/main/settings/settings.cpp +3 -4
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp

@@ -25,8 +25,9 @@
 namespace duckdb {
 
 ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
-                                     unique_ptr<CSVBufferRead> buffer_p,
-
+                                     unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
+                                     const vector<LogicalType> &requested_types)
+    : BaseCSVReader(context, std::move(options_p), requested_types), first_pos_first_buffer(first_pos_first_buffer_p) {
 	Initialize(requested_types);
 	SetBufferRead(std::move(buffer_p));
 	if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {

@@ -52,9 +53,34 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
 
+void ParallelCSVReader::SkipEmptyLines() {
+	idx_t new_pos_buffer = position_buffer;
+	if (parse_chunk.data.size() == 1) {
+		// Empty lines are null data.
+		return;
+	}
+	for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
+		if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
+			bool carrier_return = (*buffer)[new_pos_buffer] == '\r';
+			new_pos_buffer++;
+			if (carrier_return && new_pos_buffer < buffer_size && (*buffer)[new_pos_buffer] == '\n') {
+				position_buffer++;
+			}
+			if (new_pos_buffer > end_buffer) {
+				return;
+			}
+			position_buffer = new_pos_buffer;
+		} else if ((*buffer)[new_pos_buffer] != ' ') {
+			return;
+		}
+	}
+}
+
 bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
-	    start_buffer ==
+	    start_buffer == first_pos_first_buffer) {
+		start_buffer = buffer->buffer->GetStart();
+		position_buffer = start_buffer;
 		verification_positions.beginning_of_first_line = position_buffer;
 		verification_positions.end_of_last_line = position_buffer;
 		// First buffer doesn't need any setting
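
Note: the new SkipEmptyLines pass above advances the parser past blank or space-only lines so a thread's chunk never starts in the middle of padding. A minimal standalone sketch of the same scan, with a plain std::string standing in for the reader's CSVBufferRead and the single-column special case omitted:

    #include <cstddef>
    #include <string>

    // Returns the position of the first character that starts a real line,
    // skipping \n, \r\n and space-only prefixes (illustrative only).
    static size_t SkipEmptyLines(const std::string &data, size_t pos, size_t end) {
        size_t probe = pos;
        while (probe < end) {
            char c = data[probe];
            if (c == '\n' || c == '\r') {
                probe++;
                if (c == '\r' && probe < end && data[probe] == '\n') {
                    probe++; // consume the \n of a \r\n pair
                }
                pos = probe; // a full empty line was consumed; commit the position
            } else if (c == ' ') {
                probe++; // scan past padding without committing
            } else {
                break; // first real character: the next line starts at `pos`
            }
        }
        return pos;
    }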
@@ -70,11 +96,23 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 			if (position_buffer > end_buffer) {
 				return false;
 			}
+			SkipEmptyLines();
+			if (verification_positions.beginning_of_first_line == 0) {
+				verification_positions.beginning_of_first_line = position_buffer;
+			}
+
+			verification_positions.end_of_last_line = position_buffer;
 			return true;
 		}
 		}
 		return false;
 	}
+	SkipEmptyLines();
+	if (verification_positions.beginning_of_first_line == 0) {
+		verification_positions.beginning_of_first_line = position_buffer;
+	}
+
+	verification_positions.end_of_last_line = position_buffer;
 	return true;
 }
 
@@ -102,6 +140,11 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 			}
 		}
 	}
+	SkipEmptyLines();
+
+	if (position_buffer > buffer_size) {
+		break;
+	}
 
 	if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
 		break;
@@ -113,18 +156,20 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	}
 	idx_t position_set = position_buffer;
 	start_buffer = position_buffer;
-
 	// We check if we can add this line
 	// disable the projection pushdown while reading the first line
 	// otherwise the first line parsing can be influenced by which columns we are reading
 	auto column_ids = std::move(reader_data.column_ids);
 	auto column_mapping = std::move(reader_data.column_mapping);
 	InitializeProjection();
-	successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	try {
+		successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
+	} catch (...) {
+		successfully_read_first_line = false;
+	}
 	// restore the projection pushdown
 	reader_data.column_ids = std::move(column_ids);
 	reader_data.column_mapping = std::move(column_mapping);
-
 	end_buffer = end_buffer_real;
 	start_buffer = position_set;
 	if (position_buffer >= end_buffer) {
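
Note: SetPosition above now probes a candidate line boundary with TryParseSimpleCSV inside a try/catch, so an exception thrown mid-probe is treated the same as a failed parse rather than aborting the scan. The pattern in isolation (stand-in names; ParseLineAt is a hypothetical placeholder for the real probe):

    #include <stdexcept>

    // Stand-in for the speculative parse: throws on malformed input.
    static bool ParseLineAt(int offset) {
        if (offset < 0) {
            throw std::runtime_error("malformed line");
        }
        return offset % 2 == 0; // arbitrary demo predicate
    }

    // Any exception during the probe is downgraded to "not a valid line start",
    // so the caller simply tries the next candidate boundary.
    static bool TryProbeLine(int offset) {
        try {
            return ParseLineAt(offset);
        } catch (...) {
            return false;
        }
    }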
@@ -190,27 +235,55 @@ bool ParallelCSVReader::BufferRemainder() {
 	return true;
 }
 
+void VerifyLineLength(idx_t line_size, idx_t max_line_size) {
+	if (line_size > max_line_size) {
+		// FIXME: this should also output the correct estimated linenumber where it broke
+		throw InvalidInputException("Maximum line size of %llu bytes exceeded!", max_line_size);
+	}
+}
+
+bool AllNewLine(string_t value, idx_t column_amount) {
+	auto value_str = value.GetString();
+	if (value_str.empty() && column_amount == 1) {
+		// This is a one column (empty)
+		return false;
+	}
+	for (idx_t i = 0; i < value.GetSize(); i++) {
+		if (!StringUtil::CharacterIsNewline(value_str[i])) {
+			return false;
+		}
+	}
+	return true;
+}
+
 bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
 	// used for parsing algorithm
+	if (start_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
 	D_ASSERT(end_buffer <= buffer_size);
 	bool finished_chunk = false;
 	idx_t column = 0;
 	idx_t offset = 0;
 	bool has_quotes = false;
+
 	vector<idx_t> escape_positions;
 	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
 		if (!SetPosition(insert_chunk)) {
-			// This means the buffer size does not contain a new line
-			if (position_buffer - start_buffer == options.buffer_size) {
-				error_message = "Line does not fit in one buffer. Increase the buffer size.";
-				return false;
-			}
 			finished = true;
 			return true;
 		}
 	}
-
+	if (position_buffer == buffer_size) {
+		// Nothing to read
+		finished = true;
+		return true;
+	}
+	// Keep track of line size
+	idx_t line_start = position_buffer;
 	// start parsing the first value
 	goto value_start;
 
@@ -242,11 +315,16 @@ normal : {
 	if (c == options.delimiter[0]) {
 		// delimiter: end the value and add it to the chunk
 		goto add_value;
+	} else if (c == options.quote[0] && try_add_line) {
+		return false;
 	} else if (StringUtil::CharacterIsNewline(c)) {
 		// newline: add row
-		if (column > 0 || try_add_line ||
+		if (column > 0 || try_add_line || parse_chunk.data.size() == 1) {
 			goto add_row;
 		}
+		if (column == 0 && position_buffer == start_buffer) {
+			start_buffer++;
+		}
 	}
 }
 if (!BufferRemainder()) {

@@ -285,12 +363,15 @@ add_row : {
 		parse_chunk.Reset();
 		return success;
 	} else {
+		VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+		line_start = position_buffer;
 		finished_chunk = AddRow(insert_chunk, column, error_message);
 	}
 	// increase position by 1 and move start to the new position
 	offset = 0;
 	has_quotes = false;
-	start_buffer = ++position_buffer;
+	position_buffer++;
+	start_buffer = position_buffer;
 	verification_positions.end_of_last_line = position_buffer;
 	if (reached_remainder_state) {
 		goto final_state;

@@ -309,7 +390,10 @@ add_row : {
 	// newline after carriage return: skip
 	// increase position by 1 and move start to the new position
 	start_buffer = ++position_buffer;
+
+	SkipEmptyLines();
 	verification_positions.end_of_last_line = position_buffer;
+	start_buffer = position_buffer;
 	if (reached_remainder_state) {
 		goto final_state;
 	}

@@ -331,6 +415,9 @@ add_row : {
 	error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
 	return false;
 }
+SkipEmptyLines();
+verification_positions.end_of_last_line = position_buffer;
+start_buffer = position_buffer;
 // \n newline, move to value start
 if (finished_chunk) {
 	goto final_state;

@@ -391,7 +478,7 @@ unquote : {
 } else if (StringUtil::CharacterIsNewline(c)) {
 	offset = 1;
 	// FIXME: should this be an assertion?
-	D_ASSERT(column == parse_chunk.ColumnCount() - 1);
+	D_ASSERT(try_add_line || (!try_add_line && column == parse_chunk.ColumnCount() - 1));
 	goto add_row;
 } else if (position_buffer >= end_buffer) {
 	// reached end of buffer

@@ -448,22 +535,27 @@ final_state : {
 	}
 	// If this is the last buffer, we have to read the last value
 	if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
-		if (column > 0 ||
+		if (column > 0 || start_buffer != position_buffer || try_add_line ||
+		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk
 			auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
-
-
-
-
+			if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) {
+				AddValue(str_value, column, escape_positions, has_quotes);
+				if (try_add_line) {
+					bool success = column == return_types.size();
+					if (success) {
+						AddRow(insert_chunk, column, error_message);
+						success = Flush(insert_chunk);
+					}
+					parse_chunk.Reset();
+					reached_remainder_state = false;
+					return success;
+				} else {
+					VerifyLineLength(position_buffer - line_start, options.maximum_line_size);
+					line_start = position_buffer;
 					AddRow(insert_chunk, column, error_message);
-
+					verification_positions.end_of_last_line = position_buffer;
 				}
-			parse_chunk.Reset();
-			reached_remainder_state = false;
-			return success;
-		} else {
-			AddRow(insert_chunk, column, error_message);
-			verification_positions.end_of_last_line = position_buffer;
 			}
 		}
 	}

@@ -471,11 +563,14 @@ final_state : {
 	if (mode == ParserMode::PARSING) {
 		Flush(insert_chunk);
 	}
-	if (position_buffer
-	    !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+	if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
 		error_message = "Line does not fit in one buffer. Increase the buffer size.";
 		return false;
 	}
+	end_buffer = buffer_size;
+	SkipEmptyLines();
+	end_buffer = buffer->buffer_end;
+	verification_positions.end_of_last_line = position_buffer;
 	if (position_buffer >= end_buffer) {
 		if (position_buffer >= end_buffer) {
 			if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp

@@ -103,7 +103,7 @@ idx_t PhysicalTableScan::GetBatchIndex(ExecutionContext &context, DataChunk &chu
 }
 
 string PhysicalTableScan::GetName() const {
-	return StringUtil::Upper(function.name);
+	return StringUtil::Upper(function.name + " " + function.extra_info);
 }
 
 string PhysicalTableScan::ParamsToString() const {
package/src/duckdb/src/function/table/read_csv.cpp

@@ -12,6 +12,7 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/main/client_data.hpp"
 
 #include <limits>
 
@@ -23,21 +24,22 @@ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const string &file_path, FileCompress
 	auto opener = FileSystem::GetFileOpener(context);
 	auto file_handle =
 	    fs.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression, opener);
+	if (file_handle->CanSeek()) {
+		file_handle->Reset();
+	}
 	return make_uniq<CSVFileHandle>(std::move(file_handle));
 }
 
 void ReadCSVData::FinalizeRead(ClientContext &context) {
 	BaseCSVData::Finalize();
-
-	single_threaded = !config.options.experimental_parallel_csv_reader;
-	if (options.has_parallel) {
-		// Override the option set in the config
-		single_threaded = !options.use_parallel;
-	}
+	// Here we identify if we can run this CSV file on parallel or not.
 	bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
 	                     options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
 	bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
-
+	bool not_supported_options = options.null_padding;
+
+	if (!options.run_parallel || null_or_empty || not_supported_options || complex_options ||
+	    options.new_line == NewLineIdentifier::MIX) {
 		// not supported for parallel CSV reading
 		single_threaded = true;
 	}
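
Note: FinalizeRead above now decides single- versus multi-threaded reading purely from the CSV options. The rule restated as one predicate (illustrative names; the real logic sets single_threaded inside ReadCSVData::FinalizeRead):

    #include <string>

    struct CsvOpts {
        std::string delimiter = ",", quote = "\"", escape = "\"";
        bool run_parallel = true;    // the new `parallel` option
        bool null_padding = false;   // not yet supported in parallel
        bool mixed_newlines = false; // stands in for NewLineIdentifier::MIX
    };

    static bool CanReadInParallel(const CsvOpts &o) {
        bool null_or_empty = o.delimiter.empty() || o.escape.empty() || o.quote.empty() ||
                             o.delimiter[0] == '\0' || o.escape[0] == '\0' || o.quote[0] == '\0';
        bool complex = o.delimiter.size() > 1 || o.escape.size() > 1 || o.quote.size() > 1;
        return o.run_parallel && !null_or_empty && !complex && !o.null_padding && !o.mixed_newlines;
    }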
@@ -175,6 +177,8 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 		options.all_varchar = BooleanValue::Get(kv.second);
 	} else if (loption == "normalize_names") {
 		options.normalize_names = BooleanValue::Get(kv.second);
+	} else if (loption == "parallel") {
+		options.run_parallel = BooleanValue::Get(kv.second);
 	} else {
 		options.SetReadOption(loption, kv.second, names);
 	}
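
Note: with the `parallel` option registered above, a scan can opt out of (or into) the parallel reader per call. A hypothetical end-to-end use through the C++ API (assuming the option is exposed through read_csv_auto the same way as through read_csv; file name is illustrative):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        // Force the single-threaded CSV reader for this one scan.
        auto result = con.Query("SELECT * FROM read_csv_auto('data.csv', parallel=false)");
        result->Print();
        return 0;
    }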
@@ -214,6 +218,13 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 	if (options.file_options.union_by_name) {
 		result->reader_bind =
 		    MultiFileReader::BindUnionReader<BufferedCSVReader>(context, return_types, names, *result, options);
+		if (result->union_readers.size() > 1) {
+			result->column_info.emplace_back(result->csv_names, result->csv_types);
+			for (idx_t i = 1; i < result->union_readers.size(); i++) {
+				result->column_info.emplace_back(result->union_readers[i]->names,
+				                                 result->union_readers[i]->return_types);
+			}
+		}
 		if (!options.sql_types_per_column.empty()) {
 			auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
 			if (!exception.empty()) {

@@ -253,17 +264,27 @@ public:
 		file_size = file_handle->FileSize();
 		first_file_size = file_size;
 		bytes_read = 0;
-		if (buffer_size < file_size) {
+		if (buffer_size < file_size || file_size == 0) {
 			bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
 		} else {
 			bytes_per_local_state = file_size / MaxThreads();
 		}
-
-
-
+		if (bytes_per_local_state == 0) {
+			// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
+			// this boy needs to be at least one.
+			bytes_per_local_state = 1;
+		}
+		for (idx_t i = 0; i < rows_to_skip; i++) {
+			file_handle->ReadLine();
+		}
+		first_position = current_csv_position;
+		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+		next_buffer = shared_ptr<CSVBuffer>(
+		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 		running_threads = MaxThreads();
 	}
 	ParallelCSVGlobalState() {
+		running_threads = MaxThreads();
 	}
 
 	~ParallelCSVGlobalState() override {
@@ -281,7 +302,7 @@ public:
 	//! Verify if the CSV File was read correctly
 	void Verify();
 
-	void UpdateVerification(VerificationPositions positions);
+	void UpdateVerification(VerificationPositions positions, idx_t file_number);
 
 	void IncrementThread();
 
@@ -332,14 +353,18 @@ private:
 	//! Current batch index
 	idx_t batch_index = 0;
 	//! Forces parallelism for small CSV Files, should only be used for testing.
-	bool force_parallelism;
+	bool force_parallelism = false;
 	//! Current (Global) position of CSV
 	idx_t current_csv_position = 0;
+	//! First Position of First Buffer
+	idx_t first_position = 0;
+	//! Current File Number
+	idx_t file_number = 0;
 	idx_t max_tuple_end = 0;
 	//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
 	//! positions where they started reading the first line.
-	vector<idx_t> tuple_end;
-	set<idx_t> tuple_start;
+	vector<vector<idx_t>> tuple_end;
+	vector<set<idx_t>> tuple_start;
 	idx_t running_threads = 0;
 	//! The column ids to read
 	vector<column_t> column_ids;

@@ -349,10 +374,9 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
 	if (force_parallelism) {
 		return system_threads;
 	}
-
 	idx_t one_mb = 1000000; // We initialize max one thread per Mb
 	idx_t threads_per_mb = first_file_size / one_mb + 1;
-	if (threads_per_mb < system_threads) {
+	if (threads_per_mb < system_threads || threads_per_mb == 1) {
 		return threads_per_mb;
 	}
 
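
Note: MaxThreads above caps parallelism at roughly one thread per MB of the first file; for example a 2.5 MB file yields 2500000 / 1000000 + 1 = 3 threads, and a 500 KB file yields 1. The heuristic in isolation, folded into a single clamp (a sketch, not the DuckDB code):

    #include <algorithm>
    #include <cstdint>

    static uint64_t MaxCsvThreads(uint64_t first_file_size, uint64_t system_threads) {
        const uint64_t one_mb = 1000000; // max one thread per MB
        uint64_t threads_per_mb = first_file_size / one_mb + 1;
        return std::min(threads_per_mb, system_threads);
    }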
@@ -378,25 +402,36 @@ bool ParallelCSVGlobalState::Finished() {
 void ParallelCSVGlobalState::Verify() {
 	// All threads are done, we run some magic sweet verification code
 	if (running_threads == 0) {
-
-
-
-
-
+		D_ASSERT(tuple_end.size() == tuple_start.size());
+		for (idx_t i = 0; i < tuple_start.size(); i++) {
+			auto &current_tuple_end = tuple_end[i];
+			auto &current_tuple_start = tuple_start[i];
+			// figure out max value of last_pos
+			if (current_tuple_end.empty()) {
+				return;
 			}
-
-
-
-
-
-
+			auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
+			for (auto &last_pos : current_tuple_end) {
+				auto first_pos = current_tuple_start.find(last_pos);
+				if (first_pos == current_tuple_start.end()) {
+					// this might be necessary due to carriage returns outside buffer scopes.
+					first_pos = current_tuple_start.find(last_pos + 1);
 				}
-
-
-
+				if (first_pos == current_tuple_start.end() && last_pos != max_value) {
+					string error =
+					    "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
+					    " does not have a match\n";
+					error += "End Lines: \n";
+					for (auto &end_line : current_tuple_end) {
+						error += to_string(end_line) + "\n";
+					}
+					error += "Start Lines: \n";
+					for (auto &start_line : current_tuple_start) {
+						error += to_string(start_line) + "\n";
+					}
+					throw InvalidInputException(
+					    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 				}
-		throw InvalidInputException(
-		    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
 		}
 	}
 }
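
Note: the invariant Verify() enforces per file is that every position where some thread finished its last line must appear as a position where another thread started its first line (allowing +1 for a carriage return split across buffers); only the largest end position may be unmatched. A compact restatement (illustrative, not the DuckDB code):

    #include <algorithm>
    #include <cstdint>
    #include <set>
    #include <vector>

    static bool BoundariesConsistent(const std::vector<uint64_t> &tuple_end,
                                     const std::set<uint64_t> &tuple_start) {
        if (tuple_end.empty()) {
            return true;
        }
        uint64_t max_end = *std::max_element(tuple_end.begin(), tuple_end.end());
        for (auto end : tuple_end) {
            if (end == max_end || tuple_start.count(end) || tuple_start.count(end + 1)) {
                continue; // this chunk boundary was picked up by another reader
            }
            return false; // a line was lost or read twice: unsafe parallel read
        }
        return true;
    }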
@@ -411,9 +446,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 		current_file_path = bind_data.files[file_index++];
 		file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
 		current_csv_position = 0;
-
-
-
+		file_number++;
+		current_buffer =
+		    make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+		next_buffer = shared_ptr<CSVBuffer>(
+		    current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 	} else {
 		// We are done scanning.
 		reader.reset();

@@ -433,8 +470,8 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	current_buffer = next_buffer;
 	if (next_buffer) {
 		// Next buffer gets the next-next buffer
-		next_buffer =
-
+		next_buffer = shared_ptr<CSVBuffer>(
+		    next_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
 	}
 }
 if (!reader || reader->options.file_path != current_file_path) {

@@ -443,13 +480,18 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	if (file_index > 0 && file_index <= bind_data.union_readers.size() && bind_data.union_readers[file_index - 1]) {
 		// we are doing UNION BY NAME - fetch the options from the union reader for this file
 		auto &union_reader = *bind_data.union_readers[file_index - 1];
-		reader =
-
+		reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
+		                                      union_reader.GetTypes());
 		reader->names = union_reader.GetNames();
+	} else if (file_index <= bind_data.column_info.size()) {
+		// Serialized Union By name
+		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+		                                      bind_data.column_info[file_index - 1].types);
+		reader->names = bind_data.column_info[file_index - 1].names;
 	} else {
 		// regular file - use the standard options
-		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result),
-
+		reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+		                                      bind_data.csv_types);
 		reader->names = bind_data.csv_names;
 	}
 	reader->options.file_path = current_file_path;
@@ -461,14 +503,20 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
 	}
 	return true;
 }
-void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
+void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	if (positions.beginning_of_first_line < positions.end_of_last_line) {
 		if (positions.end_of_last_line > max_tuple_end) {
 			max_tuple_end = positions.end_of_last_line;
 		}
-		tuple_start.
-
+		while (file_number_p >= tuple_start.size()) {
+			vector<idx_t> empty_tuple_end;
+			set<idx_t> empty_set;
+			tuple_start.emplace_back(empty_set);
+			tuple_end.emplace_back(empty_tuple_end);
+		}
+		tuple_start[file_number_p].insert(positions.beginning_of_first_line);
+		tuple_end[file_number_p].push_back(positions.end_of_last_line);
 	}
 }
 
@@ -483,11 +531,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
 
 	bind_data.options.file_path = bind_data.files[0];
 	file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
-
-
-
-	    context.db->NumberOfThreads(), bind_data.options.buffer_size, rows_to_skip,
-	    ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
+	return make_uniq<ParallelCSVGlobalState>(
+	    context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
+	    bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
 }
 
 //===--------------------------------------------------------------------===//

@@ -534,11 +580,10 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 	}
 	if (csv_local_state.csv_reader->finished) {
 		auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
-		if (
-
-
+		if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
+			csv_global_state.UpdateVerification(verification_updates,
+			                                    csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
 		}
-		csv_global_state.UpdateVerification(verification_updates);
 		auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
 		if (!has_next) {
 			csv_global_state.DecrementThread();
@@ -642,14 +687,17 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
 		TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
 	auto result = make_uniq<SingleThreadedCSVState>(bind_data.files.size());
-	if (bind_data.
-		result->initial_reader = std::move(bind_data.initial_reader);
-	} else if (bind_data.files.empty()) {
+	if (bind_data.files.empty()) {
 		// This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
 		return std::move(result);
 	} else {
 		bind_data.options.file_path = bind_data.files[0];
-
+		if (bind_data.initial_reader && !bind_data.file_exists) {
+			// If this is not an on disk file we gotta reuse the reader.
+			result->initial_reader = std::move(bind_data.initial_reader);
+		} else {
+			result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+		}
 		if (!bind_data.options.file_options.union_by_name) {
 			result->initial_reader->names = bind_data.csv_names;
 		}
@@ -741,6 +789,14 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
 //===--------------------------------------------------------------------===//
 static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
 	auto &bind_data = (ReadCSVData &)*input.bind_data;
+	auto &fs = FileSystem::GetFileSystem(context);
+	for (auto &file : bind_data.files) {
+		if (!fs.FileExists(file)) {
+			bind_data.file_exists = false;
+			break;
+		}
+	}
+	bind_data.single_threaded = bind_data.single_threaded || !bind_data.file_exists;
 	if (bind_data.single_threaded) {
 		return SingleThreadedCSVInit(context, input);
 	} else {
@@ -863,6 +919,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
 	writer.WriteField<idx_t>(buffer_sample_size);
 	writer.WriteString(null_str);
 	writer.WriteField<FileCompressionType>(compression);
+	writer.WriteField<NewLineIdentifier>(new_line);
 	// read options
 	writer.WriteField<idx_t>(skip_rows);
 	writer.WriteField<bool>(skip_rows_set);

@@ -896,6 +953,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
 	buffer_sample_size = reader.ReadRequired<idx_t>();
 	null_str = reader.ReadRequired<string>();
 	compression = reader.ReadRequired<FileCompressionType>();
+	new_line = reader.ReadRequired<NewLineIdentifier>();
 	// read options
 	skip_rows = reader.ReadRequired<idx_t>();
 	skip_rows_set = reader.ReadRequired<bool>();

@@ -926,6 +984,10 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
 	bind_data.options.Serialize(writer);
 	writer.WriteField<bool>(bind_data.single_threaded);
 	writer.WriteSerializable(bind_data.reader_bind);
+	writer.WriteField<uint32_t>(bind_data.column_info.size());
+	for (auto &col : bind_data.column_info) {
+		col.Serialize(writer);
+	}
 }
 
 static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,

@@ -941,6 +1003,10 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
 	result_data->options.Deserialize(reader);
 	result_data->single_threaded = reader.ReadField<bool>(true);
 	result_data->reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+	uint32_t file_number = reader.ReadRequired<uint32_t>();
+	for (idx_t i = 0; i < file_number; i++) {
+		result_data->column_info.emplace_back(ColumnInfo::Deserialize(reader));
+	}
 	return std::move(result_data);
 }
 
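
Note: the column_info round-trip above follows the usual length-prefixed pattern: write a count, then each element; on the way back, read the count and deserialize exactly that many. A generic sketch against toy writer/reader interfaces (FieldWriter/FieldReader stand-ins; illustrative only):

    #include <cstdint>
    #include <vector>

    template <class Writer, class T>
    void WriteList(Writer &writer, const std::vector<T> &items) {
        writer.template WriteField<uint32_t>(items.size()); // count first
        for (auto &item : items) {
            item.Serialize(writer); // then each element in order
        }
    }

    template <class Reader, class T>
    std::vector<T> ReadList(Reader &reader) {
        auto count = reader.template ReadRequired<uint32_t>();
        std::vector<T> items;
        for (uint32_t i = 0; i < count; i++) {
            items.push_back(T::Deserialize(reader)); // must mirror the write order
        }
        return items;
    }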
package/src/duckdb/src/function/table/version/pragma_version.cpp

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev2144"
+#define DUCKDB_VERSION "0.7.2-dev2233"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "c81600ed51"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"