npm - duckdb - Versions diffs - 0.8.2-dev3458.0 → 0.8.2-dev3949.0 - Mend

duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

package/src/duckdb/src/execution/window_executor.cpp CHANGED Viewed

@@ -344,13 +344,14 @@ void WindowBoundariesState::Update(const idx_t row_idx, const WindowInputColumn
 		// when the partition changes, recompute the boundaries
 		if (!is_same_partition || is_jump) {
-			partition_start = row_idx;
-			peer_start = row_idx;
 			if (is_jump) {
-				//	Go back as far as the previous partition start
 				idx_t n = 1;
-				partition_start = FindPrevStart(partition_mask, partition_start, row_idx + 1, n);
+				partition_start = FindPrevStart(partition_mask, 0, row_idx + 1, n);
+				n = 1;
+				peer_start = FindPrevStart(order_mask, 0, row_idx + 1, n);
+			} else {
+				partition_start = row_idx;
+				peer_start = row_idx;
 			}
 			// find end of partition

package/src/duckdb/src/function/cast/cast_function_set.cpp CHANGED Viewed

@@ -1,3 +1,4 @@
 #include "duckdb/function/cast/cast_function_set.hpp"
 #include "duckdb/common/pair.hpp"

package/src/duckdb/src/function/scalar/strftime_format.cpp CHANGED Viewed

@@ -668,7 +668,7 @@ int StrpTimeFormat::NumericSpecifierWidth(StrTimeSpecifier specifier) {
 enum class TimeSpecifierAMOrPM : uint8_t { TIME_SPECIFIER_NONE = 0, TIME_SPECIFIER_AM = 1, TIME_SPECIFIER_PM = 2 };
 int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
-                                           idx_t collection_count) {
+                                           idx_t collection_count) const {
 	for (idx_t c = 0; c < collection_count; c++) {
 		auto &entry = collection[c];
 		auto entry_data = entry.GetData();
@@ -695,7 +695,7 @@ int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t s
 }
 //! Parses a timestamp using the given specifier
-bool StrpTimeFormat::Parse(string_t str, ParseResult &result) {
+bool StrpTimeFormat::Parse(string_t str, ParseResult &result) const {
 	auto &result_data = result.data;
 	auto &error_message = result.error_message;
 	auto &error_position = result.error_position;
@@ -1146,7 +1146,7 @@ string StrpTimeFormat::ParseResult::FormatError(string_t input, const string &fo
 	                          FormatStrpTimeError(input.GetString(), error_position), error_message);
 }
-bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) {
+bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) const {
 	ParseResult parse_result;
 	if (!Parse(input, parse_result)) {
 		error_message = parse_result.FormatError(input, format_specifier);
@@ -1155,7 +1155,7 @@ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_
 	return parse_result.TryToDate(result);
 }
-bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) {
+bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
 	ParseResult parse_result;
 	if (!Parse(input, parse_result)) {
 		error_message = parse_result.FormatError(input, format_specifier);

package/src/duckdb/src/function/table/copy_csv.cpp CHANGED Viewed

@@ -6,6 +6,7 @@
 #include "duckdb/common/types/column/column_data_collection.hpp"
 #include "duckdb/common/types/string_type.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/function/scalar/string_functions.hpp"
 #include "duckdb/function/table/read_csv.hpp"
@@ -15,11 +16,20 @@
 namespace duckdb {
-void SubstringDetection(string &str_1, string &str_2, const string &name_str_1, const string &name_str_2) {
-	if (str_1.empty() || str_2.empty()) {
+void AreOptionsEqual(char &str_1, char &str_2, const string &name_str_1, const string &name_str_2) {
+	if (str_1 == '\0' || str_2 == '\0') {
 		return;
 	}
-	if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos)) {
+	if (str_1 == str_2) {
+		throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
+	}
+}
+void SubstringDetection(char &str_1, string &str_2, const string &name_str_1, const string &name_str_2) {
+	if (str_1 == '\0' || str_2.empty()) {
+		return;
+	}
+	if (str_2.find(str_1) != string::npos) {
 		throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
 	}
 }
@@ -28,34 +38,46 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
 // Bind
 //===--------------------------------------------------------------------===//
+void WriteQuoteOrEscape(Serializer &serializer, char quote_or_escape) {
+	if (quote_or_escape != '\0') {
+		serializer.Write(quote_or_escape);
+	}
+}
 void BaseCSVData::Finalize() {
 	// verify that the options are correct in the final pass
-	if (options.escape.empty()) {
-		options.escape = options.quote;
+	if (options.dialect_options.state_machine_options.escape == '\0') {
+		options.dialect_options.state_machine_options.escape = options.dialect_options.state_machine_options.quote;
 	}
 	// escape and delimiter must not be substrings of each other
 	if (options.has_delimiter && options.has_escape) {
-		SubstringDetection(options.delimiter, options.escape, "DELIMITER", "ESCAPE");
+		AreOptionsEqual(options.dialect_options.state_machine_options.delimiter,
+		                options.dialect_options.state_machine_options.escape, "DELIMITER", "ESCAPE");
 	}
 	// delimiter and quote must not be substrings of each other
 	if (options.has_quote && options.has_delimiter) {
-		SubstringDetection(options.quote, options.delimiter, "DELIMITER", "QUOTE");
+		AreOptionsEqual(options.dialect_options.state_machine_options.quote,
+		                options.dialect_options.state_machine_options.delimiter, "DELIMITER", "QUOTE");
 	}
 	// escape and quote must not be substrings of each other (but can be the same)
-	if (options.quote != options.escape && options.has_quote && options.has_escape) {
-		SubstringDetection(options.quote, options.escape, "QUOTE", "ESCAPE");
+	if (options.dialect_options.state_machine_options.quote != options.dialect_options.state_machine_options.escape &&
+	    options.has_quote && options.has_escape) {
+		AreOptionsEqual(options.dialect_options.state_machine_options.quote,
+		                options.dialect_options.state_machine_options.escape, "QUOTE", "ESCAPE");
 	}
 	if (!options.null_str.empty()) {
 		// null string and delimiter must not be substrings of each other
 		if (options.has_delimiter) {
-			SubstringDetection(options.delimiter, options.null_str, "DELIMITER", "NULL");
+			SubstringDetection(options.dialect_options.state_machine_options.delimiter, options.null_str, "DELIMITER",
+			                   "NULL");
 		}
 		// quote/escape and nullstr must not be substrings of each other
 		if (options.has_quote) {
-			SubstringDetection(options.quote, options.null_str, "QUOTE", "NULL");
+			SubstringDetection(options.dialect_options.state_machine_options.quote, options.null_str, "QUOTE", "NULL");
 		}
 		if (options.has_escape) {
-			SubstringDetection(options.escape, options.null_str, "ESCAPE", "NULL");
+			SubstringDetection(options.dialect_options.state_machine_options.escape, options.null_str, "ESCAPE",
+			                   "NULL");
 		}
 	}
@@ -63,7 +85,7 @@ void BaseCSVData::Finalize() {
 		if (options.prefix.empty() || options.suffix.empty()) {
 			throw BinderException("COPY ... (FORMAT CSV) must have both PREFIX and SUFFIX, or none at all");
 		}
-		if (options.header) {
+		if (options.dialect_options.header) {
 			throw BinderException("COPY ... (FORMAT CSV)'s HEADER cannot be combined with PREFIX/SUFFIX");
 		}
 	}
@@ -85,16 +107,14 @@ static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &i
 		bind_data->options.force_quote.resize(names.size(), false);
 	}
 	bind_data->Finalize();
-	bind_data->is_simple = bind_data->options.delimiter.size() == 1 && bind_data->options.escape.size() == 1 &&
-	                       bind_data->options.quote.size() == 1;
-	if (bind_data->is_simple) {
-		bind_data->requires_quotes = make_unsafe_uniq_array<bool>(256);
-		memset(bind_data->requires_quotes.get(), 0, sizeof(bool) * 256);
-		bind_data->requires_quotes['\n'] = true;
-		bind_data->requires_quotes['\r'] = true;
-		bind_data->requires_quotes[bind_data->options.delimiter[0]] = true;
-		bind_data->requires_quotes[bind_data->options.quote[0]] = true;
-	}
+	bind_data->requires_quotes = make_unsafe_uniq_array<bool>(256);
+	memset(bind_data->requires_quotes.get(), 0, sizeof(bool) * 256);
+	bind_data->requires_quotes['\n'] = true;
+	bind_data->requires_quotes['\r'] = true;
+	bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.delimiter] = true;
+	bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.quote] = true;
 	if (!bind_data->options.write_newline.empty()) {
 		bind_data->newline = bind_data->options.write_newline;
 	}
@@ -129,13 +149,24 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
 	for (auto &option : info.options) {
 		options_map[option.first] = ConvertVectorToValue(std::move(option.second));
 	}
+	options.file_path = bind_data->files[0];
+	options.name_list = expected_names;
+	options.sql_type_list = expected_types;
+	for (idx_t i = 0; i < expected_types.size(); i++) {
+		options.sql_types_per_column[expected_names[i]] = i;
+	}
 	bind_data->FinalizeRead(context);
-	if (!bind_data->single_threaded && options.auto_detect) {
-		options.file_path = bind_data->files[0];
-		options.name_list = expected_names;
-		auto initial_reader = make_uniq<BufferedCSVReader>(context, options, expected_types);
-		options = initial_reader->options;
+	if (options.auto_detect) {
+		// We must run the sniffer.
+		auto file_handle = BaseCSVReader::OpenCSV(context, options);
+		auto buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
+		CSVSniffer sniffer(options, buffer_manager, bind_data->state_machine_cache);
+		auto sniffer_result = sniffer.SniffCSV();
+		bind_data->csv_types = sniffer_result.return_types;
+		bind_data->csv_names = sniffer_result.names;
+		bind_data->return_types = sniffer_result.return_types;
+		bind_data->return_names = sniffer_result.names;
 	}
 	return std::move(bind_data);
 }
@@ -143,7 +174,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
 //===--------------------------------------------------------------------===//
 // Helper writing functions
 //===--------------------------------------------------------------------===//
-static string AddEscapes(string &to_be_escaped, const string &escape, const string &val) {
+static string AddEscapes(char &to_be_escaped, const char &escape, const string &val) {
 	idx_t i = 0;
 	string new_val = "";
 	idx_t found = val.find(to_be_escaped);
@@ -153,8 +184,10 @@ static string AddEscapes(string &to_be_escaped, const string &escape, const stri
 			new_val += val[i];
 			i++;
 		}
-		new_val += escape;
-		found = val.find(to_be_escaped, found + escape.length());
+		if (escape != '\0') {
+			new_val += escape;
+			found = val.find(to_be_escaped, found + 1);
+		}
 	}
 	while (i < val.length()) {
 		new_val += val[i];
@@ -169,43 +202,16 @@ static bool RequiresQuotes(WriteCSVData &csv_data, const char *str, idx_t len) {
 	if (len == options.null_str.size() && memcmp(str, options.null_str.c_str(), len) == 0) {
 		return true;
 	}
-	if (csv_data.is_simple) {
-		// simple CSV: check for newlines, quotes and delimiter all at once
-		auto str_data = reinterpret_cast<const_data_ptr_t>(str);
-		for (idx_t i = 0; i < len; i++) {
-			if (csv_data.requires_quotes[str_data[i]]) {
-				// this byte requires quotes - write a quoted string
-				return true;
-			}
-		}
-		// no newline, quote or delimiter in the string
-		// no quoting or escaping necessary
-		return false;
-	} else {
-		// CSV with complex quotes/delimiter (multiple bytes)
-		// first check for \n, \r, \n\r in string
-		for (idx_t i = 0; i < len; i++) {
-			if (str[i] == '\n' || str[i] == '\r') {
-				// newline, write a quoted string
-				return true;
-			}
-		}
-		// check for delimiter
-		if (options.delimiter.length() != 0 &&
-		    ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.delimiter.c_str()),
-		                      options.delimiter.size()) != DConstants::INVALID_INDEX) {
+	auto str_data = reinterpret_cast<const_data_ptr_t>(str);
+	for (idx_t i = 0; i < len; i++) {
+		if (csv_data.requires_quotes[str_data[i]]) {
+			// this byte requires quotes - write a quoted string
 			return true;
 		}
-		// check for quote
-		if (options.quote.length() != 0 &&
-		    ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
-		                      options.quote.size()) != DConstants::INVALID_INDEX) {
-			return true;
-		}
-		return false;
 	}
+	// no newline, quote or delimiter in the string
+	// no quoting or escaping necessary
+	return false;
 }
 static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, const char *str, idx_t len,
@@ -218,46 +224,37 @@ static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, co
 	if (force_quote) {
 		// quoting is enabled: we might need to escape things in the string
 		bool requires_escape = false;
-		if (csv_data.is_simple) {
-			// simple CSV
-			// do a single loop to check for a quote or escape value
-			for (idx_t i = 0; i < len; i++) {
-				if (str[i] == options.quote[0] || str[i] == options.escape[0]) {
-					requires_escape = true;
-					break;
-				}
-			}
-		} else {
-			// complex CSV
-			// check for quote or escape separately
-			if (options.quote.length() != 0 &&
-			    ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
-			                      options.quote.size()) != DConstants::INVALID_INDEX) {
-				requires_escape = true;
-			} else if (options.escape.length() != 0 &&
-			           ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.escape.c_str()),
-			                             options.escape.size()) != DConstants::INVALID_INDEX) {
+		// simple CSV
+		// do a single loop to check for a quote or escape value
+		for (idx_t i = 0; i < len; i++) {
+			if (str[i] == options.dialect_options.state_machine_options.quote ||
+			    str[i] == options.dialect_options.state_machine_options.escape) {
 				requires_escape = true;
+				break;
 			}
 		}
 		if (!requires_escape) {
 			// fast path: no need to escape anything
-			serializer.WriteBufferData(options.quote);
+			WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
 			serializer.WriteData(const_data_ptr_cast(str), len);
-			serializer.WriteBufferData(options.quote);
+			WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
 			return;
 		}
 		// slow path: need to add escapes
 		string new_val(str, len);
-		new_val = AddEscapes(options.escape, options.escape, new_val);
-		if (options.escape != options.quote) {
+		new_val = AddEscapes(options.dialect_options.state_machine_options.escape,
+		                     options.dialect_options.state_machine_options.escape, new_val);
+		if (options.dialect_options.state_machine_options.escape !=
+		    options.dialect_options.state_machine_options.quote) {
 			// need to escape quotes separately
-			new_val = AddEscapes(options.quote, options.escape, new_val);
+			new_val = AddEscapes(options.dialect_options.state_machine_options.quote,
+			                     options.dialect_options.state_machine_options.escape, new_val);
 		}
-		serializer.WriteBufferData(options.quote);
+		WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
 		serializer.WriteBufferData(new_val);
-		serializer.WriteBufferData(options.quote);
+		WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
 	} else {
 		serializer.WriteData(const_data_ptr_cast(str), len);
 	}
@@ -335,12 +332,12 @@ static unique_ptr<GlobalFunctionData> WriteCSVInitializeGlobal(ClientContext &co
 		global_data->WriteData(options.prefix.c_str(), options.prefix.size());
 	}
-	if (options.header) {
+	if (options.dialect_options.header) {
 		BufferedSerializer serializer;
 		// write the header line to the file
 		for (idx_t i = 0; i < csv_data.options.name_list.size(); i++) {
 			if (i != 0) {
-				serializer.WriteBufferData(options.delimiter);
+				WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.delimiter);
 			}
 			WriteQuotedString(serializer, csv_data, csv_data.options.name_list[i].c_str(),
 			                  csv_data.options.name_list[i].size(), false);
@@ -365,11 +362,12 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
 		if (csv_data.sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
 			// VARCHAR, just reinterpret (cannot reference, because LogicalTypeId::VARCHAR is used by the JSON type too)
 			cast_chunk.data[col_idx].Reinterpret(input.data[col_idx]);
-		} else if (options.has_format[LogicalTypeId::DATE] && csv_data.sql_types[col_idx].id() == LogicalTypeId::DATE) {
+		} else if (options.dialect_options.has_format[LogicalTypeId::DATE] &&
+		           csv_data.sql_types[col_idx].id() == LogicalTypeId::DATE) {
 			// use the date format to cast the chunk
 			csv_data.options.write_date_format[LogicalTypeId::DATE].ConvertDateVector(
 			    input.data[col_idx], cast_chunk.data[col_idx], input.size());
-		} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
+		} else if (options.dialect_options.has_format[LogicalTypeId::TIMESTAMP] &&
 		           (csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP ||
 		            csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP_TZ)) {
 			// use the timestamp format to cast the chunk
@@ -392,7 +390,7 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
 		// write values
 		for (idx_t col_idx = 0; col_idx < cast_chunk.ColumnCount(); col_idx++) {
 			if (col_idx != 0) {
-				writer.WriteBufferData(options.delimiter);
+				WriteQuoteOrEscape(writer, options.dialect_options.state_machine_options.delimiter);
 			}
 			if (FlatVector::IsNull(cast_chunk.data[col_idx], row_idx)) {
 				// write null value