npm - duckdb - Versions diffs - 0.8.2-dev3458.0 → 0.8.2-dev3949.0 - Mend

duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp RENAMED Viewed

@@ -1,4 +1,4 @@
-#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
 #include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/vector_size.hpp"
 #include "duckdb/common/string_util.hpp"
@@ -60,60 +60,77 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
 	return value.GetValue<int64_t>();
 }
-void BufferedCSVReaderOptions::SetHeader(bool input) {
-	this->header = input;
+void CSVReaderOptions::SetHeader(bool input) { // Stores the header flag inside dialect_options
+	this->dialect_options.header = input;
 	this->has_header = true; // header now counts as set: ToString() then prints no "(auto detected)"/"(default)" suffix for it
 }
-void BufferedCSVReaderOptions::SetCompression(const string &compression_p) {
+void CSVReaderOptions::SetCompression(const string &compression_p) { // Sets `compression` from its textual name
 	this->compression = FileCompressionTypeFromString(compression_p); // conversion is delegated to this helper (not shown here)
 }
-void BufferedCSVReaderOptions::SetEscape(const string &input) {
-	this->escape = input;
+void CSVReaderOptions::SetEscape(const string &input) { // Sets the escape character; throws InvalidInputException if longer than one byte
+	auto escape_str = input;
+	if (escape_str.size() > 1) {
+		throw InvalidInputException("The escape option cannot exceed a size of 1 byte.");
+	}
+	if (escape_str.empty()) {
+		escape_str = string("\0", 1); // empty input is stored as a single NUL byte
+	}
+	this->dialect_options.state_machine_options.escape = escape_str[0]; // size is exactly 1 here, so [0] is the whole option
 	this->has_escape = true;
 }
-void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
-	this->delimiter = StringUtil::Replace(input, "\\t", "\t");
+void CSVReaderOptions::SetDelimiter(const string &input) { // Sets the delimiter character; throws InvalidInputException if longer than one byte
+	auto delim_str = StringUtil::Replace(input, "\\t", "\t"); // turn the two-character text \t into a real tab before measuring
+	if (delim_str.size() > 1) {
+		throw InvalidInputException("The delimiter option cannot exceed a size of 1 byte.");
+	}
 	this->has_delimiter = true;
 	if (input.empty()) {
-		this->delimiter = string("\0", 1);
+		delim_str = string("\0", 1); // empty input is stored as a single NUL byte
 	}
+	this->dialect_options.state_machine_options.delimiter = delim_str[0]; // size is exactly 1 here, so [0] is the whole option
 }
-void BufferedCSVReaderOptions::SetQuote(const string &quote_p) {
-	this->quote = quote_p;
+void CSVReaderOptions::SetQuote(const string &quote_p) { // Sets the quote character; throws InvalidInputException if longer than one byte
+	auto quote_str = quote_p;
+	if (quote_str.size() > 1) {
+		throw InvalidInputException("The quote option cannot exceed a size of 1 byte.");
+	}
+	if (quote_str.empty()) {
+		quote_str = string("\0", 1); // empty input is stored as a single NUL byte
+	}
+	this->dialect_options.state_machine_options.quote = quote_str[0]; // size is exactly 1 here, so [0] is the whole option
 	this->has_quote = true;
 }
-void BufferedCSVReaderOptions::SetNewline(const string &input) {
+void CSVReaderOptions::SetNewline(const string &input) { // Maps the escaped text forms \n, \r and \r\n to a NewLineIdentifier
 	if (input == "\\n" || input == "\\r") {
-		new_line = NewLineIdentifier::SINGLE;
+		dialect_options.new_line = NewLineIdentifier::SINGLE; // chosen for either single-character form
 	} else if (input == "\\r\\n") {
-		new_line = NewLineIdentifier::CARRY_ON;
+		dialect_options.new_line = NewLineIdentifier::CARRY_ON; // chosen for the \r\n pair
 	} else {
 		throw InvalidInputException("This is not accepted as a newline: " + input);
 	}
 	has_newline = true;
 }
-void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
+void CSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) { // Parses `format` for `type`; throws InvalidInputException if parsing reports an error
 	string error;
 	if (read_format) {
-		error = StrTimeFormat::ParseFormatSpecifier(format, date_format[type]);
-		date_format[type].format_specifier = format;
+		error = StrTimeFormat::ParseFormatSpecifier(format, dialect_options.date_format[type]);
+		dialect_options.date_format[type].format_specifier = format; // keep the original specifier text next to the parsed format
 	} else {
 		error = StrTimeFormat::ParseFormatSpecifier(format, write_date_format[type]); // the write path uses its own table
 	}
 	if (!error.empty()) {
 		throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
 	}
-	has_format[type] = true;
+	dialect_options.has_format[type] = true; // reached on both the read and the write path
 }
-void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
-                                             vector<string> &expected_names) {
+void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, vector<string> &expected_names) {
 	if (SetBaseOption(loption, value)) {
 		return;
 	}
@@ -135,7 +152,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
 			sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
 		}
 	} else if (loption == "skip") {
-		skip_rows = ParseInteger(value, loption);
+		dialect_options.skip_rows = ParseInteger(value, loption);
 		skip_rows_set = true;
 	} else if (loption == "max_line_size" || loption == "maximum_line_size") {
 		maximum_line_size = ParseInteger(value, loption);
@@ -204,7 +221,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
 	}
 }
-void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
+void CSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
 	if (loption == "new_line") {
 		// Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY)
 		write_newline = ParseString(value, loption);
@@ -236,7 +253,7 @@ void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value
 	}
 }
-bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
+bool CSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
 	// Make sure this function was only called after the option was turned into lowercase
 	D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
@@ -266,12 +283,14 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
 	return true;
 }
-std::string BufferedCSVReaderOptions::ToString() const {
-	return "  file=" + file_path + "\n  delimiter='" + delimiter +
-	       (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n  quote='" + quote +
-	       (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n  escape='" + escape +
+string CSVReaderOptions::ToString() const {
+	return "  file=" + file_path + "\n  delimiter='" + dialect_options.state_machine_options.delimiter +
+	       (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n  quote='" +
+	       dialect_options.state_machine_options.quote +
+	       (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n  escape='" +
+	       dialect_options.state_machine_options.escape +
 	       (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
-	       "\n  header=" + std::to_string(header) +
+	       "\n  header=" + std::to_string(dialect_options.header) +
 	       (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
 	       "\n  sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
 	       "\n  ignore_errors=" + std::to_string(ignore_errors) + "\n  all_varchar=" + std::to_string(all_varchar);

package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp ADDED Viewed

@@ -0,0 +1,35 @@
+#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
+#include "utf8proc_wrapper.hpp"
+#include "duckdb/main/error_manager.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
+namespace duckdb {
+CSVStateMachine::CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
+                                 shared_ptr<CSVBufferManager> buffer_manager_p,
+                                 CSVStateMachineCache &csv_state_machine_cache_p)
+    : csv_state_machine_cache(csv_state_machine_cache_p), options(options_p),
+      csv_buffer_iterator(std::move(buffer_manager_p)),
+      transition_array(csv_state_machine_cache.Get(state_machine_options)) { // NOTE(review): reads the member cache, so it must be declared before transition_array - confirm in the header
+	dialect_options.state_machine_options = state_machine_options; // the delimiter/quote/escape set comes from the argument, not from `options`
+	dialect_options.has_format = options.dialect_options.has_format; // the remaining dialect settings are copied from the reader options
+	dialect_options.date_format = options.dialect_options.date_format;
+	dialect_options.skip_rows = options.dialect_options.skip_rows;
+}
+void CSVStateMachine::Reset() { // Resets the buffer iterator; no other member is touched here
+	csv_buffer_iterator.Reset();
+}
+void CSVStateMachine::VerifyUTF8() { // Throws InvalidInputException when `value` is not valid UTF-8
+	auto utf_type = Utf8Proc::Analyze(value.c_str(), value.size());
+	if (utf_type == UnicodeType::INVALID) {
+		int64_t error_line = cur_rows; // NOTE(review): signed value formatted with %llu below - confirm the formatter accepts this
+		throw InvalidInputException("Error in file \"%s\" at line %llu: "
+		                            "%s. Parser options:\n%s",
+		                            options.file_path, error_line, ErrorManager::InvalidUnicodeError(value, "CSV file"),
+		                            options.ToString()); // the full option dump is appended to the message
+	}
+}
+} // namespace duckdb

package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp ADDED Viewed

@@ -0,0 +1,107 @@
+#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
+namespace duckdb {
+void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) { // Fills one row of a transition table with a single target state
+	for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) { // NUM_TRANSITIONS is declared elsewhere; the row must hold at least that many entries
+		transition_array[i] = state;
+	}
+}
+void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_options) { // Builds and caches the transition table for one delimiter/quote/escape combination
+	D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end()); // callers must only insert keys that are not cached yet
+	// Initialize transition array with default values to the Standard option
+	auto &transition_array = state_machine_cache[state_machine_options]; // operator[] creates the cache entry that is filled in below
+	const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
+	const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
+	const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
+	const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
+	const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
+	const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
+	const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
+	const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
+	const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
+	for (uint32_t i = 0; i < NUM_STATES; i++) { // pass 1: give every row its default target state
+		switch (i) {
+		case quoted_state:
+			InitializeTransitionArray(transition_array[i], quoted_state); // inside quotes any byte keeps the machine quoted by default
+			break;
+		case unquoted_state:
+			InitializeTransitionArray(transition_array[i], invalid_state); // after a closing quote any byte is invalid unless overridden below
+			break;
+		case escape_state:
+			InitializeTransitionArray(transition_array[i], invalid_state); // after an escape any byte is invalid unless overridden below
+			break;
+		default:
+			InitializeTransitionArray(transition_array[i], standard_state);
+			break;
+		}
+	}
+	// Now set values depending on configuration
+	// 1) Standard State
+	transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
+	transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
+	transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
+	transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	// 2) Field Separator State
+	transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
+	    field_separator_state;
+	transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
+	transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
+	transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	// 3) Record Separator State
+	transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
+	    field_separator_state;
+	transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
+	transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
+	transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	// 4) Carriage Return State
+	transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
+	transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
+	transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state; // NOTE(review): sections 1-3 have no escape transition; confirm this one is intended
+	// 5) Quoted State
+	transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
+	if (state_machine_options.quote != state_machine_options.escape) { // skipped when both are the same byte, so the quote transition above is kept
+		transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
+	}
+	// 6) Unquoted State
+	transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
+	transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
+	transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
+	if (state_machine_options.quote == state_machine_options.escape) { // same byte for both: seeing it again right after a closing quote re-enters QUOTED
+		transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+	}
+	// 7) Escaped State
+	transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+	// 8) Empty Line State
+	transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
+	transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
+}
+CSVStateMachineCache::CSVStateMachineCache() { // Pre-populates the cache with every combination taken from the default_* candidate tables (declared elsewhere)
+	for (auto quoterule : default_quote_rule) {
+		const auto &quote_candidates = default_quote[static_cast<uint8_t>(quoterule)]; // quote candidates are indexed by quote rule
+		for (const auto &quote : quote_candidates) {
+			for (const auto &delimiter : default_delimiter) {
+				const auto &escape_candidates = default_escape[static_cast<uint8_t>(quoterule)]; // escape candidates are indexed by the same quote rule
+				for (const auto &escape : escape_candidates) {
+					Insert({delimiter, quote, escape}); // Insert asserts that the key is new, so each combination must be unique
+				}
+			}
+		}
+	}
+}
+const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) { // Returns the cached transition table for these options, building it on first use
+	//! Custom State Machine, we need to create it and cache it first
+	if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
+		Insert(state_machine_options);
+	}
+	const auto &transition_array = state_machine_cache[state_machine_options]; // the entry exists at this point, so operator[] only looks it up
+	return transition_array; // reference into the cache container, not a copy
+}
+} // namespace duckdb

package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp RENAMED Viewed

@@ -1,4 +1,4 @@
-#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
+#include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
 #include "duckdb/common/file_system.hpp"
@@ -15,7 +15,7 @@
 #include "utf8proc.hpp"
 #include "duckdb/parser/keyword_helper.hpp"
 #include "duckdb/function/table/read_csv.hpp"
-#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
 #include <algorithm>
 #include <cctype>
@@ -24,16 +24,13 @@
 namespace duckdb {
-ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
+ParallelCSVReader::ParallelCSVReader(ClientContext &context, CSVReaderOptions options_p,
                                      unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
                                      const vector<LogicalType> &requested_types, idx_t file_idx_p)
     : BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p),
       first_pos_first_buffer(first_pos_first_buffer_p) {
 	Initialize(requested_types);
 	SetBufferRead(std::move(buffer_p)); // hands the buffer to the reader; the multi-byte option check removed below used to follow this call
-	if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
-		throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes");
-	}
 }
 void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
@@ -44,8 +41,9 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
 bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
 	// Set the delimiter if not set yet.
 	SetNewLineDelimiter(carry, carry_followed_by_nl);
-	D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON);
-	if (options.new_line == NewLineIdentifier::SINGLE) {
+	D_ASSERT(options.dialect_options.new_line == NewLineIdentifier::SINGLE ||
+	         options.dialect_options.new_line == NewLineIdentifier::CARRY_ON);
+	if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
 		return (!carry) || (carry && !carry_followed_by_nl);
 	}
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
@@ -75,15 +73,14 @@ void ParallelCSVReader::SkipEmptyLines() {
 }
 bool ParallelCSVReader::SetPosition() {
-	if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
-	    start_buffer == first_pos_first_buffer) {
-		start_buffer = buffer->buffer->GetStart();
+	if (buffer->buffer->is_first_buffer && start_buffer == position_buffer && start_buffer == first_pos_first_buffer) {
+		start_buffer = buffer->buffer->start_position;
 		position_buffer = start_buffer;
 		verification_positions.beginning_of_first_line = position_buffer;
 		verification_positions.end_of_last_line = position_buffer;
 		// First buffer doesn't need any setting
-		if (options.header) {
+		if (options.dialect_options.header) {
 			for (; position_buffer < end_buffer; position_buffer++) {
 				if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
 					bool carrier_return = (*buffer)[position_buffer] == '\r';
@@ -150,7 +147,7 @@ bool ParallelCSVReader::SetPosition() {
 			break;
 		}
-		if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
+		if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
 		    (*buffer)[position_buffer - 1] == '\n') {
 			break;
 		}
@@ -199,9 +196,9 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
 	start_buffer = buffer_read_p->buffer_start;
 	end_buffer = buffer_read_p->buffer_end;
 	if (buffer_read_p->next_buffer) {
-		buffer_size = buffer_read_p->buffer->GetBufferSize() + buffer_read_p->next_buffer->GetBufferSize();
+		buffer_size = buffer_read_p->buffer->actual_size + buffer_read_p->next_buffer->actual_size;
 	} else {
-		buffer_size = buffer_read_p->buffer->GetBufferSize();
+		buffer_size = buffer_read_p->buffer->actual_size;
 	}
 	buffer = std::move(buffer_read_p);
@@ -213,8 +210,8 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
 }
 VerificationPositions ParallelCSVReader::GetVerificationPositions() { // Returns the verification positions shifted by the buffer's csv_global_start
-	verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart();
-	verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart();
+	verification_positions.beginning_of_first_line += buffer->buffer->csv_global_start; // NOTE(review): += changes the stored value, so a second call would add the offset again - confirm callers call this once per buffer
+	verification_positions.end_of_last_line += buffer->buffer->csv_global_start;
 	return verification_positions;
 }
@@ -235,15 +232,6 @@ bool ParallelCSVReader::BufferRemainder() {
 	return true;
 }
-void ParallelCSVReader::VerifyLineLength(idx_t line_size) {
-	if (line_size > options.maximum_line_size) {
-		throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!",
-		                            options.file_path,
-		                            GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(),
-		                            options.maximum_line_size);
-	}
-}
 bool AllNewLine(string_t value, idx_t column_amount) {
 	auto value_str = value.GetString();
 	if (value_str.empty() && column_amount == 1) {
@@ -260,7 +248,7 @@ bool AllNewLine(string_t value, idx_t column_amount) {
 bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
 	// If line is not set, we have to figure it out, we assume whatever is in the first line
-	if (options.new_line == NewLineIdentifier::NOT_SET) {
+	if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
 		idx_t cur_pos = position_buffer;
 		// we can start in the middle of a new line, so move a bit forward.
 		while (cur_pos < end_buffer) {
@@ -324,7 +312,7 @@ value_start : {
 	offset = 0;
 	// this state parses the first character of a value
-	if ((*buffer)[position_buffer] == options.quote[0]) {
+	if ((*buffer)[position_buffer] == options.dialect_options.state_machine_options.quote) {
 		// quote: actual value starts in the next position
 		// move to in_quotes state
 		start_buffer = position_buffer + 1;
@@ -341,10 +329,10 @@ normal : {
 	// this state parses the remainder of a non-quoted value until we reach a delimiter or newline
 	for (; position_buffer < end_buffer; position_buffer++) {
 		auto c = (*buffer)[position_buffer];
-		if (c == options.delimiter[0]) {
+		if (c == options.dialect_options.state_machine_options.delimiter) {
 			// delimiter: end the value and add it to the chunk
 			goto add_value;
-		} else if (c == options.quote[0] && try_add_line) {
+		} else if (c == options.dialect_options.state_machine_options.quote && try_add_line) {
 			return false;
 		} else if (StringUtil::CharacterIsNewline(c)) {
 			// newline: add row
@@ -396,7 +384,7 @@ add_row : {
 		parse_chunk.Reset();
 		return success;
 	} else {
-		VerifyLineLength(position_buffer - line_start);
+		VerifyLineLength(position_buffer - line_start, buffer->batch_index);
 		line_start = position_buffer;
 		finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
 	}
@@ -413,7 +401,7 @@ add_row : {
 			goto final_state;
 		}
 		if ((*buffer)[position_buffer] == '\n') {
-			if (options.new_line == NewLineIdentifier::SINGLE) {
+			if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
 				error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
 				return false;
 			}
@@ -428,7 +416,7 @@ add_row : {
 				goto final_state;
 			}
 		} else {
-			if (options.new_line == NewLineIdentifier::CARRY_ON) {
+			if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
 				error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
 				return false;
 			}
@@ -441,7 +429,7 @@ add_row : {
 		}
 		goto value_start;
 	} else {
-		if (options.new_line == NewLineIdentifier::CARRY_ON) {
+		if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
 			error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
 			return false;
 		}
@@ -452,6 +440,10 @@ add_row : {
 			goto final_state;
 		}
 		SkipEmptyLines();
+		if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
+			error_message = "Line does not fit in one buffer. Increase the buffer size.";
+			return false;
+		}
 		verification_positions.end_of_last_line = position_buffer;
 		start_buffer = position_buffer;
 		// \n newline, move to value start
@@ -467,17 +459,17 @@ in_quotes:
 	position_buffer++;
 	for (; position_buffer < end_buffer; position_buffer++) {
 		auto c = (*buffer)[position_buffer];
-		if (c == options.quote[0]) {
+		if (c == options.dialect_options.state_machine_options.quote) {
 			// quote: move to unquoted state
 			goto unquote;
-		} else if (c == options.escape[0]) {
+		} else if (c == options.dialect_options.state_machine_options.escape) {
 			// escape: store the escaped position and move to handle_escape state
 			escape_positions.push_back(position_buffer - start_buffer);
 			goto handle_escape;
 		}
 	}
 	if (!BufferRemainder()) {
-		if (buffer->buffer->IsCSVFileLastBuffer()) {
+		if (buffer->buffer->is_last_buffer) {
 			if (try_add_line) {
 				return false;
 			}
@@ -504,11 +496,13 @@ unquote : {
 		goto final_state;
 	}
 	auto c = (*buffer)[position_buffer];
-	if (c == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) {
+	if (c == options.dialect_options.state_machine_options.quote &&
+	    (options.dialect_options.state_machine_options.escape == '\0' ||
+	     options.dialect_options.state_machine_options.escape == options.dialect_options.state_machine_options.quote)) {
 		// escaped quote, return to quoted state and store escape position
 		escape_positions.push_back(position_buffer - start_buffer);
 		goto in_quotes;
-	} else if (c == options.delimiter[0]) {
+	} else if (c == options.dialect_options.state_machine_options.delimiter) {
 		// delimiter, add value
 		offset = 1;
 		goto add_value;
@@ -537,13 +531,14 @@ handle_escape : {
 	if (!BufferRemainder()) {
 		goto final_state;
 	}
-	if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
+	if (position_buffer >= buffer_size && buffer->buffer->is_last_buffer) {
 		error_message = StringUtil::Format(
 		    "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
 		    GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
 		return false;
 	}
-	if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
+	if ((*buffer)[position_buffer] != options.dialect_options.state_machine_options.quote &&
+	    (*buffer)[position_buffer] != options.dialect_options.state_machine_options.escape) {
 		error_message = StringUtil::Format(
 		    "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
 		    GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
@@ -573,7 +568,8 @@ final_state : {
 		return true;
 	}
 	// If this is the last buffer, we have to read the last value
-	if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
+	if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
+	    (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
 		if (column > 0 || start_buffer != position_buffer || try_add_line ||
 		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk
@@ -592,9 +588,13 @@ final_state : {
 					reached_remainder_state = false;
 					return success;
 				} else {
-					VerifyLineLength(position_buffer - line_start);
+					VerifyLineLength(position_buffer - line_start, buffer->batch_index);
 					line_start = position_buffer;
 					AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
+					if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
+						error_message = "Line does not fit in one buffer. Increase the buffer size.";
+						return false;
+					}
 					verification_positions.end_of_last_line = position_buffer;
 				}
 			}
@@ -638,7 +638,7 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
 idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first) {
 	while (true) {
 		if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
-			auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart();
+			auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->csv_global_start;
 			return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false, stop_at_first);
 		}
 	}

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp ADDED Viewed

@@ -0,0 +1,52 @@
+#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
+namespace duckdb {
+CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
+                       CSVStateMachineCache &state_machine_cache_p)
+    : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
+	// Check if any type is BLOB
+	for (auto &type : options.sql_type_list) {
+		if (type.id() == LogicalTypeId::BLOB) {
+			throw InvalidInputException(
+			    "CSV auto-detect for blobs not supported: there may be invalid UTF-8 in the file");
+		}
+	}
+	// Initialize Format Candidates
+	for (const auto &format_template : format_template_candidates) {
+		auto &logical_type = format_template.first;
+		best_format_candidates[logical_type].clear(); // operator[] also creates the (empty) entry for each templated type
+	}
+}
+SnifferResult CSVSniffer::SniffCSV() { // Runs the five detection phases in order, writes the result into `options`, and returns the detected types and names
+	// 1. Dialect Detection
+	DetectDialect();
+	// 2. Type Detection
+	DetectTypes();
+	// 3. Header Detection
+	DetectHeader();
+	D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
+	// 4. Type Replacement
+	ReplaceTypes();
+	// 5. Type Refinement
+	RefineTypes();
+	// We are done, construct and return the result.
+	// Set the CSV Options in the reference
+	options.dialect_options = best_candidate->dialect_options;
+	options.has_header = best_candidate->dialect_options.header; // NOTE(review): has_header is assigned the detected header value itself - confirm this is the intended meaning of the flag
+	options.skip_rows_set = options.dialect_options.skip_rows > 0;
+	if (options.has_header) {
+		options.dialect_options.true_start = best_start_with_header;
+	} else {
+		options.dialect_options.true_start = best_start_without_header;
+	}
+	// Return the types and names
+	return SnifferResult(detected_types, names);
+}
+} // namespace duckdb