npm - duckdb - Versions diffs - 0.6.2-dev1687.0 → 0.6.2-dev1736.0 - Mend

duckdb 0.6.2-dev1687.0 → 0.6.2-dev1736.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.6.2-dev1687.0",
+  "version": "0.6.2-dev1736.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp CHANGED Viewed

@@ -551,4 +551,32 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
 	parse_chunk.Reset();
 	return true;
 }
+void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
+	if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) ||
+	    options.new_line == NewLineIdentifier::NOT_SET) {
+		if (options.new_line == NewLineIdentifier::MIX) {
+			return;
+		}
+		NewLineIdentifier this_line_identifier;
+		if (carry) {
+			if (carry_followed_by_nl) {
+				this_line_identifier = NewLineIdentifier::CARRY_ON;
+			} else {
+				this_line_identifier = NewLineIdentifier::SINGLE;
+			}
+		} else {
+			this_line_identifier = NewLineIdentifier::SINGLE;
+		}
+		if (options.new_line == NewLineIdentifier::NOT_SET) {
+			options.new_line = this_line_identifier;
+			return;
+		}
+		if (options.new_line != this_line_identifier) {
+			options.new_line = NewLineIdentifier::MIX;
+			return;
+		}
+		options.new_line = this_line_identifier;
+	}
+}
 } // namespace duckdb

package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp CHANGED Viewed

@@ -482,6 +482,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
 					} else if ((more_values || single_column_before) && rows_consistent) {
 						sniff_info.skip_rows = start_row;
 						sniff_info.num_cols = num_cols;
+						sniff_info.new_line = options.new_line;
 						best_consistent_rows = consistent_rows;
 						best_num_cols = num_cols;
@@ -497,6 +498,7 @@ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types
 						if (!same_quote_is_candidate) {
 							sniff_info.skip_rows = start_row;
 							sniff_info.num_cols = num_cols;
+							sniff_info.new_line = options.new_line;
 							info_candidates.push_back(sniff_info);
 						}
 					}
@@ -1264,6 +1266,7 @@ add_row : {
 		// \r newline, go to special state that parses an optional \n afterwards
 		goto carriage_return;
 	} else {
+		SetNewLineDelimiter();
 		// \n newline, move to value start
 		if (finished_chunk) {
 			return true;
@@ -1342,6 +1345,7 @@ carriage_return:
 	/* state: carriage_return */
 	// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
 	if (buffer[position] == '\n') {
+		SetNewLineDelimiter(true, true);
 		// newline after carriage return: skip
 		// increase position by 1 and move start to the new position
 		start = ++position;
@@ -1349,6 +1353,8 @@ carriage_return:
 			// file ends right after delimiter, go to final state
 			goto final_state;
 		}
+	} else {
+		SetNewLineDelimiter(true, false);
 	}
 	if (finished_chunk) {
 		return true;

package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp CHANGED Viewed

@@ -3,12 +3,15 @@
 namespace duckdb {
-CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle)
+CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
+                     idx_t &global_csv_current_position)
     : context(context), first_buffer(true) {
 	this->handle = AllocateBuffer(buffer_size_p);
 	auto buffer = Ptr();
 	actual_size = file_handle.Read(buffer, buffer_size_p);
+	global_csv_start = global_csv_current_position;
+	global_csv_current_position += actual_size;
 	if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
 		start_position += 3;
 	}
@@ -16,21 +19,25 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
 }
 CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
-                     bool final_buffer)
-    : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
+                     bool final_buffer, idx_t global_csv_current_position)
+    : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer),
+      global_csv_start(global_csv_current_position) {
 }
-unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
+unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size,
+                                      idx_t &global_csv_current_position) {
 	if (file_handle.FinishedReading()) {
 		// this was the last buffer
 		return nullptr;
 	}
-	auto next_buffer = AllocateBuffer(set_buffer_size);
-	idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), set_buffer_size);
+	auto next_buffer = AllocateBuffer(buffer_size);
+	idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size);
-	return make_unique<CSVBuffer>(context, std::move(next_buffer), set_buffer_size, next_buffer_actual_size,
-	                              file_handle.FinishedReading());
+	auto next_csv_buffer = make_unique<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
+	                                              file_handle.FinishedReading(), global_csv_current_position);
+	global_csv_current_position += next_buffer_actual_size;
+	return next_csv_buffer;
 }
 BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
@@ -54,4 +61,8 @@ bool CSVBuffer::IsCSVFileFirstBuffer() {
 	return first_buffer;
 }
+idx_t CSVBuffer::GetCSVGlobalStart() {
+	return global_csv_start;
+}
 } // namespace duckdb

package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp CHANGED Viewed

@@ -117,6 +117,17 @@ void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
 	}
 }
+void BufferedCSVReaderOptions::SetNewline(const string &input) {
+	if (input == "\\n" || input == "\\r") {
+		new_line = NewLineIdentifier::SINGLE;
+	} else if (input == "\\r\\n") {
+		new_line = NewLineIdentifier::CARRY_ON;
+	} else {
+		throw InvalidInputException("This is not accepted as a newline: " + input);
+	}
+	has_newline = true;
+}
 void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
 	string error;
 	if (read_format) {
@@ -233,6 +244,8 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
 	} else if (loption == "quote") {
 		quote = ParseString(value, loption);
 		has_quote = true;
+	} else if (loption == "new_line") {
+		SetNewline(ParseString(value, loption));
 	} else if (loption == "escape") {
 		escape = ParseString(value, loption);
 		has_escape = true;

package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp CHANGED Viewed

@@ -43,15 +43,31 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
 	InitInsertChunkIdx(return_types.size());
 }
+bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
+	// Set the delimiter if not set yet.
+	SetNewLineDelimiter(carry, carry_followed_by_nl);
+	D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON);
+	if (options.new_line == NewLineIdentifier::SINGLE) {
+		return (!carry) || (carry && !carry_followed_by_nl);
+	}
+	return (carry && carry_followed_by_nl) || (!carry && first_char);
+}
 bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 	if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
 	    start_buffer == buffer->buffer->GetStart()) {
+		verification_positions.beginning_of_first_line = position_buffer;
+		verification_positions.end_of_last_line = position_buffer;
 		// First buffer doesn't need any setting
 		// Unless we have a header
 		if (options.header && options.auto_detect) {
 			for (; position_buffer < end_buffer; position_buffer++) {
 				if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
+					bool carrier_return = (*buffer)[position_buffer] == '\r';
 					position_buffer++;
+					if (carrier_return && position_buffer < end_buffer && (*buffer)[position_buffer] == '\n') {
+						position_buffer++;
+					}
 					return true;
 				}
 			}
@@ -70,26 +86,50 @@ bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
 		first_line_chunk.Initialize(allocator, insert_chunk.GetTypes());
 		for (; position_buffer < end_buffer; position_buffer++) {
 			if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
+				bool carriage_return = (*buffer)[position_buffer] == '\r';
+				bool carriage_return_followed = false;
 				position_buffer++;
-				break;
+				if (position_buffer < end_buffer) {
+					if (carriage_return && (*buffer)[position_buffer] == '\n') {
+						carriage_return_followed = true;
+						position_buffer++;
+					}
+				}
+				if (NewLineDelimiter(carriage_return, carriage_return_followed, position_buffer - 1 == start_buffer)) {
+					break;
+				}
 			}
 		}
-		D_ASSERT(position_buffer <= end_buffer);
-		if (position_buffer == end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+		if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+			break;
+		}
+		if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
+		    (*buffer)[position_buffer - 1] == '\n') {
 			break;
 		}
 		idx_t position_set = position_buffer;
 		start_buffer = position_buffer;
 		// We check if we can add this line
 		successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true);
-		start_buffer = position_set;
 		end_buffer = end_buffer_real;
-		position_buffer = position_set;
-		if (end_buffer == position_buffer) {
+		start_buffer = position_set;
+		if (position_buffer >= end_buffer) {
+			if (successfully_read_first_line) {
+				position_buffer = position_set;
+			}
 			break;
 		}
+		position_buffer = position_set;
+	}
+	if (verification_positions.beginning_of_first_line == 0) {
+		verification_positions.beginning_of_first_line = position_buffer;
 	}
+	verification_positions.end_of_last_line = position_buffer;
+	finished = false;
 	return successfully_read_first_line;
 }
@@ -110,9 +150,18 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
 	linenr_estimated = true;
 	reached_remainder_state = false;
+	verification_positions.beginning_of_first_line = 0;
+	verification_positions.end_of_last_line = 0;
+	finished = false;
 	D_ASSERT(end_buffer <= buffer_size);
 }
+VerificationPositions ParallelCSVReader::GetVerificationPositions() {
+	verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart();
+	verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart();
+	return verification_positions;
+}
 // If BufferRemainder returns false, it means we are done scanning this buffer and should go to the end_state
 bool ParallelCSVReader::BufferRemainder() {
 	if (position_buffer >= end_buffer && !reached_remainder_state) {
@@ -131,7 +180,6 @@ bool ParallelCSVReader::BufferRemainder() {
 }
 bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
 	// used for parsing algorithm
 	D_ASSERT(end_buffer <= buffer_size);
 	bool finished_chunk = false;
@@ -139,10 +187,15 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
 	idx_t offset = 0;
 	bool has_quotes = false;
 	vector<idx_t> escape_positions;
-	if (start_buffer == buffer->buffer_start && !try_add_line) {
+	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
 		if (!SetPosition(insert_chunk)) {
 			// This means the buffer size does not contain a new line
+			if (position_buffer - start_buffer == options.buffer_size) {
+				error_message = "Line does not fit in one buffer. Increase the buffer size.";
+				return false;
+			}
+			finished = true;
 			return true;
 		}
 	}
@@ -180,7 +233,7 @@ normal : {
 			goto add_value;
 		} else if (StringUtil::CharacterIsNewline(c)) {
 			// newline: add row
-			if (column > 0 || try_add_line) {
+			if (column > 0 || try_add_line || insert_chunk.data.size() == 1) {
 				goto add_row;
 			}
 		}
@@ -227,7 +280,8 @@ add_row : {
 	offset = 0;
 	has_quotes = false;
 	start_buffer = ++position_buffer;
-	if (reached_remainder_state || finished_chunk) {
+	verification_positions.end_of_last_line = position_buffer;
+	if (reached_remainder_state) {
 		goto final_state;
 	}
 	if (!BufferRemainder()) {
@@ -235,8 +289,37 @@ add_row : {
 	}
 	if (carriage_return) {
 		// \r newline, go to special state that parses an optional \n afterwards
-		goto carriage_return;
+		// optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
+		if ((*buffer)[position_buffer] == '\n') {
+			if (options.new_line == NewLineIdentifier::SINGLE) {
+				error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
+				return false;
+			}
+			// newline after carriage return: skip
+			// increase position by 1 and move start to the new position
+			start_buffer = ++position_buffer;
+			verification_positions.end_of_last_line = position_buffer;
+			if (reached_remainder_state) {
+				goto final_state;
+			}
+		} else {
+			if (options.new_line == NewLineIdentifier::CARRY_ON) {
+				error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
+				return false;
+			}
+		}
+		if (!BufferRemainder()) {
+			goto final_state;
+		}
+		if (reached_remainder_state || finished_chunk) {
+			goto final_state;
+		}
+		goto value_start;
 	} else {
+		if (options.new_line == NewLineIdentifier::CARRY_ON) {
+			error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
+			return false;
+		}
 		// \n newline, move to value start
 		if (finished_chunk) {
 			goto final_state;
@@ -332,33 +415,31 @@ handle_escape : {
 	// escape was followed by quote or escape, go back to quoted state
 	goto in_quotes;
 }
-carriage_return : {
-	/* state: carriage_return */
-	// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
-	if ((*buffer)[position_buffer] == '\n') {
-		// newline after carriage return: skip
-		// increase position by 1 and move start to the new position
-		start_buffer = ++position_buffer;
-		if (position_buffer >= buffer_size) {
-			// file ends right after delimiter, go to final state
-			goto final_state;
-		}
-	}
-	goto value_start;
-}
 final_state : {
 	/* state: final_stage reached after we finished reading the end_buffer of the csv buffer */
 	// reset end buffer
 	end_buffer = buffer->buffer_end;
+	if (position_buffer == end_buffer) {
+		reached_remainder_state = false;
+	}
 	if (finished_chunk) {
+		if (position_buffer >= end_buffer) {
+			if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
+			    position_buffer < buffer_size) {
+				// last position is a new line, we still have to go through one more line of this buffer
+				finished = false;
+			} else {
+				finished = true;
+			}
+		}
 		return true;
 	}
 	// If this is the last buffer, we have to read the last value
 	if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer->IsCSVFileLastBuffer())) {
-		if (column > 0 || try_add_line) {
+		if (column > 0 || try_add_line || (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk
-			AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
+			auto str_value = buffer->GetValue(start_buffer, position_buffer, offset);
+			AddValue(str_value, column, escape_positions, has_quotes);
 			if (try_add_line) {
 				bool success = column == return_types.size();
 				if (success) {
@@ -370,6 +451,7 @@ final_state : {
 				return success;
 			} else {
 				AddRow(insert_chunk, column, error_message);
+				verification_positions.end_of_last_line = position_buffer;
 			}
 		}
 	}
@@ -377,6 +459,22 @@ final_state : {
 	if (mode == ParserMode::PARSING) {
 		Flush(insert_chunk);
 	}
+	if (position_buffer != verification_positions.end_of_last_line &&
+	    !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+		error_message = "Line does not fit in one buffer. Increase the buffer size.";
+		return false;
+	}
+	if (position_buffer >= end_buffer) {
+		if (position_buffer >= end_buffer) {
+			if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) &&
+			    position_buffer < buffer_size) {
+				// last position is a new line, we still have to go through one more line of this buffer
+				finished = false;
+			} else {
+				finished = true;
+			}
+		}
+	}
 	return true;
 };
 }

package/src/duckdb/src/function/table/read_csv.cpp CHANGED Viewed

@@ -38,7 +38,10 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
 	BaseCSVData::Finalize();
 	auto &config = DBConfig::GetConfig(context);
 	single_threaded = !config.options.experimental_parallel_csv_reader;
-	if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
+	bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
+	                     options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
+	bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
+	if (null_or_empty || complex_options || options.new_line == NewLineIdentifier::MIX) {
 		// not supported for parallel CSV reading
 		single_threaded = true;
 	}
@@ -239,8 +242,9 @@ struct ParallelCSVGlobalState : public GlobalTableFunctionState {
 public:
 	ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
 	                       vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
-	                       idx_t rows_to_skip)
-	    : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
+	                       idx_t rows_to_skip, bool force_parallelism_p)
+	    : file_handle(std::move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p),
+	      force_parallelism(force_parallelism_p) {
 		for (idx_t i = 0; i < rows_to_skip; i++) {
 			file_handle->ReadLine();
 		}
@@ -253,23 +257,34 @@ public:
 		} else {
 			bytes_per_local_state = file_size / MaxThreads();
 		}
-		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
-		next_buffer = current_buffer->Next(*file_handle, buffer_size);
+		current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
+		next_buffer = current_buffer->Next(*file_handle, buffer_size, current_csv_position);
+		running_threads = MaxThreads();
 	}
 	ParallelCSVGlobalState() {
 	}
+	~ParallelCSVGlobalState() override {
+	}
 	idx_t MaxThreads() const override;
 	//! Returns buffer and index that caller thread should read.
 	unique_ptr<CSVBufferRead> Next(ClientContext &context, ReadCSVData &bind_data);
-	//! If we finished reading all the CSV Files
+	//! Verify if the CSV File was read correctly
+	void Verify();
+	void UpdateVerification(VerificationPositions positions);
+	void IncrementThread();
+	void DecrementThread();
 	bool Finished();
 	//! How many bytes were read up to this point
 	atomic<idx_t> bytes_read;
 	//! Size of current file
 	idx_t file_size;
-	//! The index of the next file to read (i.e. current file + 1)
-	idx_t file_index = 1;
 	double GetProgress(ReadCSVData &bind_data) const {
 		idx_t total_files = bind_data.files.size();
@@ -290,21 +305,20 @@ public:
 private:
 	//! File Handle for current file
 	unique_ptr<CSVFileHandle> file_handle;
 	shared_ptr<CSVBuffer> current_buffer;
 	shared_ptr<CSVBuffer> next_buffer;
+	//! The index of the next file to read (i.e. current file + 1)
+	idx_t file_index = 1;
 	//! Mutex to lock when getting next batch of bytes (Parallel Only)
 	mutex main_mutex;
 	//! Byte set from for last thread
 	idx_t next_byte = 0;
 	//! The current estimated line number
 	idx_t estimated_linenr;
 	//! How many bytes we should execute per local state
 	idx_t bytes_per_local_state;
 	//! Size of first file
 	idx_t first_file_size;
 	//! Basically max number of threads in DuckDB
@@ -313,20 +327,73 @@ private:
 	idx_t buffer_size;
 	//! Current batch index
 	idx_t batch_index = 0;
+	//! Forces parallelism for small CSV Files, should only be used for testing.
+	bool force_parallelism;
+	//! Current (Global) position of CSV
+	idx_t current_csv_position = 0;
+	idx_t max_tuple_end = 0;
+	//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
+	//! positions where they started reading the first line.
+	vector<idx_t> tuple_end;
+	set<idx_t> tuple_start;
+	idx_t running_threads = 0;
 };
 idx_t ParallelCSVGlobalState::MaxThreads() const {
-	//	idx_t one_mb = 1000000;
-	//	idx_t threads_per_mb = first_file_size / one_mb + 1;
-	//	if (threads_per_mb < system_threads) {
-	//		return threads_per_mb;
-	//	}
+	if (force_parallelism) {
+		return system_threads;
+	}
+	idx_t one_mb = 1000000; // We initialize max one thread per Mb
+	idx_t threads_per_mb = first_file_size / one_mb + 1;
+	if (threads_per_mb < system_threads) {
+		return threads_per_mb;
+	}
 	return system_threads;
 }
+void ParallelCSVGlobalState::IncrementThread() {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	running_threads++;
+}
+void ParallelCSVGlobalState::DecrementThread() {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	D_ASSERT(running_threads > 0);
+	running_threads--;
+}
 bool ParallelCSVGlobalState::Finished() {
 	lock_guard<mutex> parallel_lock(main_mutex);
-	return !current_buffer;
+	return running_threads == 0;
+}
+void ParallelCSVGlobalState::Verify() {
+	// All threads are done, we run some magic sweet verification code
+	if (running_threads == 0) {
+		for (auto &last_pos : tuple_end) {
+			auto first_pos = tuple_start.find(last_pos);
+			if (first_pos == tuple_start.end()) {
+				// this might be necessary due to carriage returns outside buffer scopes.
+				first_pos = tuple_start.find(last_pos + 1);
+			}
+			if (first_pos == tuple_start.end() && last_pos != max_tuple_end) {
+				string error = "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
+				               " does not have a match\n";
+				error += "End Lines: \n";
+				for (auto &end_line : tuple_end) {
+					error += to_string(end_line) + "\n";
+				}
+				error += "Start Lines: \n";
+				for (auto &start_line : tuple_start) {
+					error += to_string(start_line) + "\n";
+				}
+				throw InvalidInputException(
+				    "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
+			}
+		}
+	}
 }
 unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, ReadCSVData &bind_data) {
@@ -348,7 +415,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
 		current_buffer = next_buffer;
 		if (next_buffer) {
 			// Next buffer gets the next-next buffer
-			next_buffer = next_buffer->Next(*file_handle, buffer_size);
+			next_buffer = next_buffer->Next(*file_handle, buffer_size, current_csv_position);
 		}
 	}
 	if (current_buffer && !next_buffer) {
@@ -356,11 +423,26 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
 		if (file_index < bind_data.files.size()) {
 			bind_data.options.file_path = bind_data.files[file_index++];
 			file_handle = ReadCSV::OpenCSV(bind_data.options, context);
-			next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
+			current_csv_position = 0;
+			// FIXME: This will probably require some changes on the verification code
+			next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
 		}
 	}
 	return result;
 }
+void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	if (positions.beginning_of_first_line < positions.end_of_last_line) {
+		if (positions.end_of_last_line > max_tuple_end) {
+			max_tuple_end = positions.end_of_last_line;
+		}
+		tuple_start.insert(positions.beginning_of_first_line);
+		tuple_end.push_back(positions.end_of_last_line);
+	}
+}
+void SetNewLine() {
+}
 static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext &context,
                                                                   TableFunctionInitInput &input) {
@@ -373,10 +455,11 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
 	bind_data.options.file_path = bind_data.files[0];
 	file_handle = ReadCSV::OpenCSV(bind_data.options, context);
-	idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
+	idx_t rows_to_skip =
+	    bind_data.options.skip_rows + (bind_data.options.has_header && bind_data.options.header ? 1 : 0);
 	return make_unique<ParallelCSVGlobalState>(context, std::move(file_handle), bind_data.files,
 	                                           context.db->NumberOfThreads(), bind_data.options.buffer_size,
-	                                           rows_to_skip);
+	                                           rows_to_skip, ClientConfig::GetConfig(context).verify_parallelism);
 }
 //===--------------------------------------------------------------------===//
@@ -390,6 +473,7 @@ public:
 	//! The CSV reader
 	unique_ptr<ParallelCSVReader> csv_reader;
 	CSVBufferRead previous_buffer;
+	bool done = false;
 };
 unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
@@ -401,9 +485,10 @@ unique_ptr<LocalTableFunctionState> ParallelReadCSVInitLocal(ExecutionContext &c
 	if (next_local_buffer) {
 		csv_reader = make_unique<ParallelCSVReader>(context.client, csv_data.options, std::move(next_local_buffer),
 		                                            csv_data.sql_types);
+	} else {
+		global_state.DecrementThread();
 	}
-	auto new_local_state = make_unique<ParallelCSVLocalState>(std::move(csv_reader));
-	return std::move(new_local_state);
+	return make_unique<ParallelCSVLocalState>(std::move(csv_reader));
 }
 static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
@@ -417,13 +502,14 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 	}
 	do {
-		if (output.size() != 0 || (csv_global_state.Finished() && csv_local_state.csv_reader->position_buffer >=
-		                                                              csv_local_state.csv_reader->end_buffer)) {
+		if (output.size() != 0) {
 			break;
 		}
-		if (csv_local_state.csv_reader->position_buffer >= csv_local_state.csv_reader->end_buffer) {
+		if (csv_local_state.csv_reader->finished) {
+			csv_global_state.UpdateVerification(csv_local_state.csv_reader->GetVerificationPositions());
 			auto next_chunk = csv_global_state.Next(context, bind_data);
 			if (!next_chunk) {
+				csv_global_state.DecrementThread();
 				break;
 			}
 			csv_local_state.csv_reader->SetBufferRead(std::move(next_chunk));
@@ -431,7 +517,9 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
 		csv_local_state.csv_reader->ParseCSV(output);
 	} while (true);
+	if (csv_global_state.Finished()) {
+		csv_global_state.Verify();
+	}
 	if (bind_data.options.union_by_name) {
 		throw InternalException("FIXME: union by name");
 	}
@@ -678,6 +766,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
 	table_function.named_parameters["sep"] = LogicalType::VARCHAR;
 	table_function.named_parameters["delim"] = LogicalType::VARCHAR;
 	table_function.named_parameters["quote"] = LogicalType::VARCHAR;
+	table_function.named_parameters["new_line"] = LogicalType::VARCHAR;
 	table_function.named_parameters["escape"] = LogicalType::VARCHAR;
 	table_function.named_parameters["nullstr"] = LogicalType::VARCHAR;
 	table_function.named_parameters["columns"] = LogicalType::ANY;

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED Viewed

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.6.2-dev1687"
+#define DUCKDB_VERSION "0.6.2-dev1736"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "355d6ee967"
+#define DUCKDB_SOURCE_ID "424848838c"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/common/assert.hpp CHANGED Viewed

@@ -14,6 +14,10 @@
 #include <assert.h>
 #define D_ASSERT assert
+namespace duckdb {
+DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
+}
 #else
 namespace duckdb {
 DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);

package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp CHANGED Viewed

@@ -97,6 +97,9 @@ protected:
 	void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
 	static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
+	//! Sets the newline delimiter
+	void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false);
 protected:
 	//! Whether or not the current row's columns have overflown return_types.size()
 	bool error_column_overflow = false;

package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp CHANGED Viewed

@@ -75,7 +75,6 @@ public:
 public:
 	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
 	void ParseCSV(DataChunk &insert_chunk);
 	static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
 private:

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp CHANGED Viewed

@@ -20,13 +20,15 @@ public:
 	static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
 	//! Constructor for Initial Buffer
-	CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle);
+	CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
+	          idx_t &global_csv_current_position);
 	//! Constructor for `Next()` Buffers
-	CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
+	CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
+	          idx_t global_csv_current_position);
 	//! Creates a new buffer with the next part of the CSV File
-	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
+	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
 	//! Gets the buffer actual size
 	idx_t GetBufferSize();
@@ -40,6 +42,8 @@ public:
 	//! If this buffer is the first buffer of the CSV File
 	bool IsCSVFileFirstBuffer();
+	idx_t GetCSVGlobalStart();
 	BufferHandle AllocateBuffer(idx_t buffer_size);
 	char *Ptr() {
@@ -59,5 +63,7 @@ private:
 	bool last_buffer = false;
 	//! If this is the first buffer of the CSV File
 	bool first_buffer = false;
+	//! Global position from the CSV File where this buffer starts
+	idx_t global_csv_start = 0;
 };
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp CHANGED Viewed

@@ -17,6 +17,13 @@
 namespace duckdb {
+enum NewLineIdentifier {
+	SINGLE = 1,   // Either \r or \n
+	CARRY_ON = 2, // \r\n
+	MIX = 3,      // Hippie-Land, can't run it multithreaded
+	NOT_SET = 4
+};
 struct BufferedCSVReaderOptions {
 	//===--------------------------------------------------------------------===//
 	// CommonCSVOptions
@@ -26,7 +33,11 @@ struct BufferedCSVReaderOptions {
 	bool has_delimiter = false;
 	//! Delimiter to separate columns within each line
 	string delimiter = ",";
-	//! Whether or not a quote sign was defined by the user
+	//! Whether or not a new_line was defined by the user
+	bool has_newline = false;
+	//! New Line separator
+	NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
 	bool has_quote = false;
 	//! Quote used for columns that contain reserved characters, e.g., delimiter
 	string quote = "\"";
@@ -112,6 +123,8 @@ struct BufferedCSVReaderOptions {
 	void Deserialize(FieldReader &reader);
 	void SetDelimiter(const string &delimiter);
+	void SetNewline(const string &input);
 	//! Set an option that is supported by both reading and writing functions, called by
 	//! the SetReadOption and SetWriteOption methods
 	bool SetBaseOption(const string &loption, const Value &value);

package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp CHANGED Viewed

@@ -91,6 +91,10 @@ struct CSVBufferRead {
 	idx_t estimated_linenr;
 };
+struct VerificationPositions {
+	idx_t beginning_of_first_line = 0;
+	idx_t end_of_last_line = 0;
+};
 //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
 class ParallelCSVReader : public BaseCSVReader {
 public:
@@ -111,7 +115,10 @@ public:
 	//! If this flag is set, it means we are about to try to read our last row.
 	bool reached_remainder_state = false;
+	bool finished = false;
 	unique_ptr<CSVBufferRead> buffer;
+	VerificationPositions GetVerificationPositions();
 public:
 	void SetBufferRead(unique_ptr<CSVBufferRead> buffer);
@@ -134,8 +141,13 @@ private:
 	//! when changing the buffer end the first time.
 	//! It returns FALSE if the parser should jump to the final state of parsing or not
 	bool BufferRemainder();
+	bool NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char);
 	//! Parses a CSV file with a one-byte delimiter, escape and quote character
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
+	//! Position of the first read line and last read line for verification purposes
+	VerificationPositions verification_positions;
 };
 } // namespace duckdb