duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +124 -58
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
- package/src/duckdb/src/main/settings/settings.cpp +3 -4
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0

package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp
@@ -131,8 +131,8 @@ public:
 	//! Compare two column data collections to another. If they are equal according to result equality rules,
 	//! return true. That means null values are equal, and approx equality is used for floating point values.
 	//! If they are not equal, return false and fill in the error message.
-	static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right,
-	                         string &error_message);
+	static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right, string &error_message,
+	                         bool ordered = false);
 
 	//! Obtains the next scan index to scan from
 	bool NextScanIndex(ColumnDataScanState &state, idx_t &chunk_index, idx_t &segment_index, idx_t &row_index) const;

package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp
@@ -87,7 +87,7 @@ private:
 	//! Resets the steam
 	void ResetStream();
 	//! Reads a new buffer from the CSV file if the current one has been exhausted
-	bool ReadBuffer(idx_t &start);
+	bool ReadBuffer(idx_t &start, idx_t &line_start);
 	//! Jumps back to the beginning of input stream and resets necessary internal states
 	bool JumpToNextSample();
 	//! Initializes the TextSearchShiftArrays for complex parser
@@ -124,6 +124,9 @@ private:
 	                          const vector<LogicalType> &requested_types,
 	                          vector<vector<LogicalType>> &best_sql_types_candidates,
 	                          map<LogicalTypeId, vector<string>> &best_format_candidates);
+
+	//! Skip Empty lines for tables with over one column
+	void SkipEmptyLines();
 };
 
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp
@@ -21,14 +21,15 @@ public:
 
 	//! Constructor for Initial Buffer
 	CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-	          idx_t &global_csv_current_position);
+	          idx_t &global_csv_current_position, idx_t file_number);
 
 	//! Constructor for `Next()` Buffers
 	CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
-	          idx_t global_csv_current_position);
+	          idx_t global_csv_current_position, idx_t file_number);
 
 	//! Creates a new buffer with the next part of the CSV File
-	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
+	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
+	                           idx_t file_number);
 
 	//! Gets the buffer actual size
 	idx_t GetBufferSize();
@@ -44,6 +45,8 @@ public:
 
 	idx_t GetCSVGlobalStart();
 
+	idx_t GetFileNumber();
+
 	BufferHandle AllocateBuffer(idx_t buffer_size);
 
 	char *Ptr() {
@@ -65,5 +68,7 @@ private:
 	bool first_buffer = false;
 	//! Global position from the CSV File where this buffer starts
 	idx_t global_csv_start = 0;
+	//! Number of the file that is in this buffer
+	idx_t file_number = 0;
 };
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp
@@ -39,11 +39,6 @@ struct BufferedCSVReaderOptions {
 	bool has_newline = false;
 	//! New Line separator
 	NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
-
-	//! Whether or not an option was provided for parallel
-	bool has_parallel = false;
-	//! Whether or not the read will use the ParallelCSVReader
-	bool use_parallel = false;
 	//! Whether or not a quote was defined by the user
 	bool has_quote = false;
 	//! Quote used for columns that contain reserved characters, e.g., delimiter
@@ -114,8 +109,12 @@ struct BufferedCSVReaderOptions {
 	//! Decimal separator when reading as numeric
 	string decimal_separator = ".";
 	//! Whether or not to pad rows that do not have enough columns with NULL values
-	bool null_padding =
+	bool null_padding = false;
 
+	//! If we are running the parallel version of the CSV Reader. In general, the system should always auto-detect
+	//! When it can't execute a parallel run before execution. However, there are (rather specific) situations where
+	//! setting up this manually might be important
+	bool run_parallel = true;
 	//===--------------------------------------------------------------------===//
 	// WriteCSVOptions
 	//===--------------------------------------------------------------------===//
@@ -139,7 +138,6 @@ struct BufferedCSVReaderOptions {
 	void SetEscape(const string &escape);
 	void SetQuote(const string &quote);
 	void SetDelimiter(const string &delimiter);
-	void SetParallel(bool use_parallel);
 
 	void SetNewline(const string &input);
 	//! Set an option that is supported by both reading and writing functions, called by

package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp
@@ -99,7 +99,7 @@ struct VerificationPositions {
 class ParallelCSVReader : public BaseCSVReader {
 public:
 	ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
-	                  const vector<LogicalType> &requested_types);
+	                  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
 	~ParallelCSVReader();
 
 	//! Current Position (Relative to the Buffer)
@@ -136,6 +136,8 @@ private:
 	bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
 	//! Sets Position depending on the byte_start of this thread
 	bool SetPosition(DataChunk &insert_chunk);
+	//! Called when scanning the 1st buffer, skips empty lines
+	void SkipEmptyLines();
 	//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
 	//! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
 	//! when changing the buffer end the first time.
@@ -148,6 +150,8 @@ private:
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
 	//! Position of the first read line and last read line for verification purposes
 	VerificationPositions verification_positions;
+	//! First Position of First Buffer
+	idx_t first_pos_first_buffer = 0;
 };
 
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp
@@ -56,6 +56,28 @@ struct WriteCSVData : public BaseCSVData {
 	idx_t flush_size = 4096 * 8;
 };
 
+struct ColumnInfo {
+	ColumnInfo() {
+	}
+	ColumnInfo(vector<std::string> names_p, vector<LogicalType> types_p) {
+		names = std::move(names_p);
+		types = std::move(types_p);
+	}
+	void Serialize(FieldWriter &writer) {
+		writer.WriteList<string>(names);
+		writer.WriteRegularSerializableList<LogicalType>(types);
+	}
+
+	static ColumnInfo Deserialize(FieldReader &reader) {
+		ColumnInfo info;
+		info.names = reader.ReadRequiredList<string>();
+		info.types = reader.ReadRequiredSerializableList<LogicalType, LogicalType>();
+		return info;
+	}
+	vector<std::string> names;
+	vector<LogicalType> types;
+};
+
 struct ReadCSVData : public BaseCSVData {
 	//! The expected SQL types to read from the file
 	vector<LogicalType> csv_types;
@@ -75,6 +97,9 @@ struct ReadCSVData : public BaseCSVData {
 	bool single_threaded = false;
 	//! Reader bind data
 	MultiFileReaderBindData reader_bind;
+	//! If all files are On-Disk file (e.g., not a pipe)
+	bool file_exists = true;
+	vector<ColumnInfo> column_info;
 
 	void Initialize(unique_ptr<BufferedCSVReader> &reader) {
 		this->initial_reader = std::move(reader);
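
Note on the new ColumnInfo/column_info members above: they let the bind data keep one set of column names and types per input file and serialize it alongside the rest of ReadCSVData. A rough usage sketch against the DuckDB C++ API; the file names and the union_by_name flag are placeholders for illustration, not taken from this diff:

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// Assumption: when several CSV files are scanned in one call and their schemas differ,
	// the reader can track each file's names/types via ReadCSVData::column_info.
	auto result = con.Query(
	    "SELECT * FROM read_csv_auto(['part1.csv', 'part2.csv'], union_by_name=true);");
	result->Print();
	return 0;
}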

package/src/duckdb/src/include/duckdb/main/client_data.hpp
@@ -58,6 +58,9 @@ struct ClientData {
 	//! The file search path
 	string file_search_path;
 
+	//! The Max Line Length Size of Last Query Executed on a CSV File. (Only used for testing)
+	idx_t max_line_length = 0;
+
 public:
 	DUCKDB_API static ClientData &Get(ClientContext &context);
 };

package/src/duckdb/src/include/duckdb/main/config.hpp
@@ -143,8 +143,6 @@ struct DBConfigOptions {
 	bool allow_unsigned_extensions = false;
 	//! Enable emitting FSST Vectors
 	bool enable_fsst_vectors = false;
-	//! Experimental parallel CSV reader
-	bool experimental_parallel_csv_reader = false;
 	//! Start transactions immediately in all attached databases - instead of lazily when a database is referenced
 	bool immediate_transaction_mode = false;
 	//! The set of unrecognized (other) options

package/src/duckdb/src/main/settings/settings.cpp
@@ -512,16 +512,15 @@ Value EnableProgressBarPrintSetting::GetSetting(ClientContext &context) {
 // Experimental Parallel CSV
 //===--------------------------------------------------------------------===//
 void ExperimentalParallelCSVSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
-	config.options.experimental_parallel_csv_reader = input.GetValue<bool>();
+	Printer::Print("experimental_parallel_csv is deprecated and will be removed with the next release - the parallel "
+	               "CSV reader is now standard and does not need to be manually enabled anymore 1");
 }
 
 void ExperimentalParallelCSVSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) {
-	config.options.experimental_parallel_csv_reader = DBConfig().options.experimental_parallel_csv_reader;
 }
 
 Value ExperimentalParallelCSVSetting::GetSetting(ClientContext &context) {
-	auto &config = DBConfig::GetConfig(context);
-	return Value::BIGINT(config.options.experimental_parallel_csv_reader);
+	return Value();
 }
 
 //===--------------------------------------------------------------------===//
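
With experimental_parallel_csv_reader removed from DBConfigOptions, the setting is now a stub that only prints the deprecation notice. A minimal sketch of what that means for callers, using the public C++ API (the CSV file name is a placeholder):

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// No longer toggles anything; it only prints the deprecation message shown above.
	con.Query("SET experimental_parallel_csv=true;");
	// The parallel CSV reader is chosen automatically during binding.
	con.Query("SELECT * FROM read_csv_auto('data.csv');");
	return 0;
}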

package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp
@@ -21,8 +21,21 @@ static LogicalType ResolveInType(OperatorExpression &op, vector<BoundExpression
 	}
 	// get the maximum type from the children
 	LogicalType max_type = children[0]->expr->return_type;
+	bool any_varchar = children[0]->expr->return_type == LogicalType::VARCHAR;
+	bool any_enum = children[0]->expr->return_type.id() == LogicalTypeId::ENUM;
 	for (idx_t i = 1; i < children.size(); i++) {
 		max_type = LogicalType::MaxLogicalType(max_type, children[i]->expr->return_type);
+		if (children[i]->expr->return_type == LogicalType::VARCHAR) {
+			any_varchar = true;
+		}
+		if (children[i]->expr->return_type.id() == LogicalTypeId::ENUM) {
+			any_enum = true;
+		}
+	}
+	if (any_varchar && any_enum) {
+		// For the coalesce function, we must be sure we always upcast the parameters to VARCHAR, if there are at least
+		// one enum and one varchar
+		max_type = LogicalType::VARCHAR;
 	}
 
 	// cast all children to the same type
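
The two flags added above force the common type to VARCHAR whenever the children mix ENUM and VARCHAR, instead of relying on whatever MaxLogicalType would otherwise pick. A standalone sketch of the same rule with made-up type ids (not DuckDB code):

#include <cstddef>
#include <vector>

enum class TypeId { INTEGER, VARCHAR, ENUM };

// Assumes children is non-empty, as the original code does after its own check.
TypeId ResolveCommonType(const std::vector<TypeId> &children) {
	TypeId max_type = children[0]; // placeholder "max" rule stands in for LogicalType::MaxLogicalType
	bool any_varchar = children[0] == TypeId::VARCHAR;
	bool any_enum = children[0] == TypeId::ENUM;
	for (std::size_t i = 1; i < children.size(); i++) {
		if (children[i] > max_type) {
			max_type = children[i];
		}
		any_varchar = any_varchar || children[i] == TypeId::VARCHAR;
		any_enum = any_enum || children[i] == TypeId::ENUM;
	}
	if (any_varchar && any_enum) {
		// Mixing an enum with a plain string: upcast everything to VARCHAR.
		max_type = TypeId::VARCHAR;
	}
	return max_type;
}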

package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp
@@ -19,6 +19,7 @@
 #include "duckdb/function/function_binder.hpp"
 #include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
+#include "duckdb/function/table/read_csv.hpp"
 
 namespace duckdb {
 
@@ -143,6 +144,14 @@ Binder::BindTableFunctionInternal(TableFunction &table_function, const string &f
 			auto arrow_bind = (PyTableFunctionData *)bind_data.get();
 			arrow_bind->external_dependency = std::move(external_dependency);
 		}
+		if (table_function.name == "read_csv" || table_function.name == "read_csv_auto") {
+			auto &csv_bind = bind_data->Cast<ReadCSVData>();
+			if (csv_bind.single_threaded) {
+				table_function.extra_info = "(Single-Threaded)";
+			} else {
+				table_function.extra_info = "(Multi-Threaded)";
+			}
+		}
 	}
 	if (return_types.size() != return_names.size()) {
 		throw InternalException(
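
The extra_info strings set above are what label the CSV scan in query plans. A quick way to observe this from the C++ API, sketched here with a placeholder file name:

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// The read_csv_auto scan in the plan output should carry "(Single-Threaded)" or
	// "(Multi-Threaded)", depending on how the scan was bound.
	con.Query("EXPLAIN SELECT * FROM read_csv_auto('data.csv');")->Print();
	return 0;
}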
|