npm - duckdb - Versions diffs - 0.8.2-dev3458.0 → 0.8.2-dev3949.0 - Mend

duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp ADDED Viewed

@@ -0,0 +1,127 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/scan/csv/csv_sniffer.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
+#include "duckdb/common/vector.hpp"
+#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
+namespace duckdb {
+//! Struct to store the result of the Sniffer
+struct SnifferResult {
+	SnifferResult(vector<LogicalType> return_types_p, vector<string> names_p)
+	    : return_types(std::move(return_types_p)), names(std::move(names_p)) {
+	}
+	//! Return Types that were detected
+	vector<LogicalType> return_types;
+	//! Column Names that were detected
+	vector<string> names;
+};
+//! Sniffer that detects Header, Dialect and Types of CSV Files
+class CSVSniffer {
+public:
+	explicit CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
+	                    CSVStateMachineCache &state_machine_cache);
+	//! Main method that sniffs the CSV file, returns the types, names and options as a result
+	//! CSV Sniffing consists of five steps:
+	//! 1. Dialect Detection: Generate the CSV Options (delimiter, quote, escape, etc.)
+	//! 2. Type Detection: Figures out the types of the columns (For one chunk)
+	//! 3. Header Detection: Figures out if the CSV file has a header and produces the names of the columns
+	//! 4. Type Replacement: Replaces the types of the columns if the user specified them
+	//! 5. Type Refinement: Refines the types of the columns for the remaining chunks
+	SnifferResult SniffCSV();
+private:
+	//! CSV State Machine Cache
+	CSVStateMachineCache &state_machine_cache;
+	//! Highest number of columns found
+	idx_t max_columns_found = 0;
+	//! Current Candidates being considered
+	vector<unique_ptr<CSVStateMachine>> candidates;
+	//! Reference to original CSV Options, it will be modified as a result of the sniffer.
+	CSVReaderOptions &options;
+	//! Buffer being used on sniffer
+	shared_ptr<CSVBufferManager> buffer_manager;
+	//! ------------------------------------------------------//
+	//! ----------------- Dialect Detection ----------------- //
+	//! ------------------------------------------------------//
+	//! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
+	void DetectDialect();
+	//! Functions called in the main DetectDialect() function
+	//! 1. Generates the search space candidates for the dialect
+	void GenerateCandidateDetectionSearchSpace(vector<char> &delim_candidates, vector<QuoteRule> &quoterule_candidates,
+	                                           unordered_map<uint8_t, vector<char>> &quote_candidates_map,
+	                                           unordered_map<uint8_t, vector<char>> &escape_candidates_map);
+	//! 2. Generates the search space candidates for the state machines
+	void GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachine>> &csv_state_machines,
+	                                     const vector<char> &delimiter_candidates,
+	                                     const vector<QuoteRule> &quoterule_candidates,
+	                                     const unordered_map<uint8_t, vector<char>> &quote_candidates_map,
+	                                     const unordered_map<uint8_t, vector<char>> &escape_candidates_map);
+	//! 3. Analyzes if dialect candidate is a good candidate to be considered, if so, it adds it to the candidates
+	void AnalyzeDialectCandidate(unique_ptr<CSVStateMachine>, idx_t &rows_read, idx_t &best_consistent_rows,
+	                             idx_t &prev_padding_count);
+	//! 4. Refine Candidates over remaining chunks
+	void RefineCandidates();
+	//! Checks if candidate still produces good values for the next chunk
+	bool RefineCandidateNextChunk(CSVStateMachine &candidate);
+	//! ------------------------------------------------------//
+	//! ------------------- Type Detection ------------------ //
+	//! ------------------------------------------------------//
+	//! Second phase of auto detection: detect types, format template candidates
+	//! ordered by descending specificity (~ from high to low)
+	void DetectTypes();
+	//! TryCastValue: Try to cast a string value to the specified sql type
+	//! SetDateFormat: Set the date format of the given sql type to the given format specifier string
+	bool TryCastValue(CSVStateMachine &candidate, const Value &value, const LogicalType &sql_type);
+	void SetDateFormat(CSVStateMachine &candidate, const string &format_specifier, const LogicalTypeId &sql_type);
+	//! Function that performs detection for date and timestamp formats
+	void DetectDateAndTimeStampFormats(CSVStateMachine &candidate, map<LogicalTypeId, bool> &has_format_candidates,
+	                                   map<LogicalTypeId, vector<string>> &format_candidates,
+	                                   const LogicalType &sql_type, const string &separator, Value &dummy_val);
+	//! Variables for Type Detection
+	//! Format Candidates for Date and Timestamp Types
+	const map<LogicalTypeId, vector<const char *>> format_template_candidates = {
+	    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+	    {LogicalTypeId::TIMESTAMP,
+	     {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+	      "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
+	};
+	unordered_map<idx_t, vector<LogicalType>> best_sql_types_candidates_per_column_idx;
+	map<LogicalTypeId, vector<string>> best_format_candidates;
+	unique_ptr<CSVStateMachine> best_candidate;
+	idx_t best_start_with_header = 0;
+	idx_t best_start_without_header = 0;
+	vector<Value> best_header_row;
+	//! ------------------------------------------------------//
+	//! ------------------ Header Detection ----------------- //
+	//! ------------------------------------------------------//
+	void DetectHeader();
+	vector<string> names;
+	//! ------------------------------------------------------//
+	//! ------------------ Type Replacement ----------------- //
+	//! ------------------------------------------------------//
+	void ReplaceTypes();
+	//! ------------------------------------------------------//
+	//! ------------------ Type Refinement ------------------ //
+	//! ------------------------------------------------------//
+	void RefineTypes();
+	bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
+	vector<LogicalType> detected_types;
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp ADDED Viewed

@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/scan/csv/csv_state_machine.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
+namespace duckdb {
+//! All States of CSV Parsing
+enum class CSVState : uint8_t {
+	STANDARD = 0,         //! Regular unquoted field state
+	DELIMITER = 1,        //! State after encountering a field separator (e.g., ;)
+	RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
+	CARRIAGE_RETURN = 3,  //! State after encountering a carriage return(i.e., \r)
+	QUOTED = 4,           //! State when inside a quoted field
+	UNQUOTED = 5,         //! State when leaving a quoted field
+	ESCAPE = 6,           //! State when encountering an escape character (e.g., \)
+	EMPTY_LINE = 7,       //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
+	INVALID = 8           //! Got to an Invalid State, this should error.
+};
+//! The CSV State Machine comprises a state transition array (STA).
+//! The STA indicates the current state of parsing based on both the current and preceding characters.
+//! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
+//! The STA's creation depends on the provided quote, escape, and delimiter options for that state machine.
+//! The motivation behind implementing an STA is to remove branching in regular CSV Parsing by predicting and detecting
+//! the states. Note: The State Machine is currently utilized solely in the CSV Sniffer.
+class CSVStateMachine {
+public:
+	explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
+	                         shared_ptr<CSVBufferManager> buffer_manager_p,
+	                         CSVStateMachineCache &csv_state_machine_cache_p);
+	//! Resets the state machine, so it can be used again
+	void Reset();
+	//! Aux Function for string UTF8 Verification
+	void VerifyUTF8();
+	CSVStateMachineCache &csv_state_machine_cache;
+	const CSVReaderOptions &options;
+	CSVBufferIterator csv_buffer_iterator;
+	//! Stores identified start row for this file (e.g., a file can start with garbage like notes, before the header)
+	idx_t start_row = 0;
+	//! The Transition Array is a Finite State Machine
+	//! It holds the transitions of all states, on all 256 possible different characters
+	const state_machine_t &transition_array;
+	//! Both these variables are used for new line identifier detection
+	bool single_record_separator = false;
+	bool carry_on_separator = false;
+	//! Variables Used for Sniffing
+	CSVState state;
+	CSVState previous_state;
+	CSVState pre_previous_state;
+	idx_t cur_rows;
+	idx_t column_count;
+	string value;
+	idx_t rows_read;
+	idx_t line_start_pos = 0;
+	//! Dialect options resulting from sniffing
+	DialectOptions dialect_options;
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp ADDED Viewed

@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
+namespace duckdb {
+static constexpr uint32_t NUM_STATES = 8;
+static constexpr uint32_t NUM_TRANSITIONS = 256;
+typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];
+//! Hash function used in our state machine cache, it hashes and combines all options used to generate a state machine
+struct HashCSVStateMachineConfig {
+	size_t operator()(CSVStateMachineOptions const &config) const noexcept {
+		auto h_delimiter = Hash(config.delimiter);
+		auto h_quote = Hash(config.quote);
+		auto h_escape = Hash(config.escape);
+		return CombineHash(h_delimiter, CombineHash(h_quote, h_escape));
+	}
+};
+//! The CSVStateMachineCache caches state machines. Although they are small (~2kb), the actual creation of multiple
+//! State Machines can become a bottleneck on sniffing, when reading very small csv files.
+//! Hence the cache stores State Machines based on their different delimiter|quote|escape options.
+class CSVStateMachineCache {
+public:
+	CSVStateMachineCache();
+	~CSVStateMachineCache() {};
+	//! Gets a state machine from the cache. If it is not one of the default options,
+	//! it is first cached and then returned.
+	const state_machine_t &Get(const CSVStateMachineOptions &state_machine_options);
+private:
+	void Insert(const CSVStateMachineOptions &state_machine_options);
+	//! Cache on delimiter|quote|escape
+	unordered_map<CSVStateMachineOptions, state_machine_t, HashCSVStateMachineConfig> state_machine_cache;
+	//! Default value for options used to initialize CSV State Machine Cache
+	const vector<char> default_delimiter = {',', '|', ';', '\t'};
+	const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
+	const vector<QuoteRule> default_quote_rule = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES};
+	const vector<vector<char>> default_escape = {{'\0', '\"', '\''}, {'\\'}, {'\0'}};
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp RENAMED Viewed

@@ -1,18 +1,18 @@
 //===----------------------------------------------------------------------===//
 //                         DuckDB
 //
-// duckdb/execution/operator/persistent/parallel_csv_reader.hpp
+// duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp
 //
 //
 //===----------------------------------------------------------------------===//
 #pragma once
-#include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
-#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
-#include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
-#include "duckdb/execution/operator/persistent/csv_buffer.hpp"
-#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
+#include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
 #include <sstream>
 #include <utility>
@@ -20,21 +20,17 @@
 namespace duckdb {
 struct CSVBufferRead {
-	CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
+	CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
 	              idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
 	    : buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
 	      batch_index(batch_index), local_batch_index(local_batch_index_p) {
-		if (buffer) {
-			if (buffer_end > buffer->GetBufferSize()) {
-				buffer_end = buffer->GetBufferSize();
-			}
-		} else {
-			buffer_start = 0;
-			buffer_end = 0;
+		D_ASSERT(buffer);
+		if (buffer_end > buffer->actual_size) {
+			buffer_end = buffer->actual_size;
 		}
 	}
-	CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
+	CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, unique_ptr<CSVBufferHandle> nxt_buffer_p, idx_t buffer_start_p,
 	              idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
 	    : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
 	                    line_info_p) {
@@ -44,33 +40,33 @@ struct CSVBufferRead {
 	CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits<idx_t>::Maximum()) {};
 	const char &operator[](size_t i) const {
-		if (i < buffer->GetBufferSize()) {
+		if (i < buffer->actual_size) {
 			auto buffer_ptr = buffer->Ptr();
 			return buffer_ptr[i];
 		}
 		auto next_ptr = next_buffer->Ptr();
-		return next_ptr[i - buffer->GetBufferSize()];
+		return next_ptr[i - buffer->actual_size];
 	}
 	string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
 		idx_t length = position_buffer - start_buffer - offset;
 		// 1) It's all in the current buffer
-		if (start_buffer + length <= buffer->GetBufferSize()) {
+		if (start_buffer + length <= buffer->actual_size) {
 			auto buffer_ptr = buffer->Ptr();
 			return string_t(buffer_ptr + start_buffer, length);
-		} else if (start_buffer >= buffer->GetBufferSize()) {
+		} else if (start_buffer >= buffer->actual_size) {
 			// 2) It's all in the next buffer
 			D_ASSERT(next_buffer);
-			D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
+			D_ASSERT(next_buffer->actual_size >= length + (start_buffer - buffer->actual_size));
 			auto buffer_ptr = next_buffer->Ptr();
-			return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
+			return string_t(buffer_ptr + (start_buffer - buffer->actual_size), length);
 		} else {
 			// 3) It starts in the current buffer and ends in the next buffer
 			D_ASSERT(next_buffer);
 			auto intersection = make_unsafe_uniq_array<char>(length);
 			idx_t cur_pos = 0;
 			auto buffer_ptr = buffer->Ptr();
-			for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
+			for (idx_t i = start_buffer; i < buffer->actual_size; i++) {
 				intersection[cur_pos++] = buffer_ptr[i];
 			}
 			idx_t nxt_buffer_pos = 0;
@@ -83,8 +79,8 @@ struct CSVBufferRead {
 		}
 	}
-	shared_ptr<CSVBuffer> buffer;
-	shared_ptr<CSVBuffer> next_buffer;
+	unique_ptr<CSVBufferHandle> buffer;
+	unique_ptr<CSVBufferHandle> next_buffer;
 	vector<unsafe_unique_array<char>> intersections;
 	optional_ptr<LineInfo> line_info;
@@ -103,7 +99,7 @@ struct VerificationPositions {
 //! CSV Reader for Parallel Reading
 class ParallelCSVReader : public BaseCSVReader {
 public:
-	ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
+	ParallelCSVReader(ClientContext &context, CSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
 	                  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
 	virtual ~ParallelCSVReader() {
 	}
@@ -162,8 +158,6 @@ private:
 	//! Parses a CSV file with a one-byte delimiter, escape and quote character
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
-	//! Verifies that the line length did not go over a pre-defined limit.
-	void VerifyLineLength(idx_t line_size);
 	//! First Position of First Buffer
 	idx_t first_pos_first_buffer = 0;

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp ADDED Viewed

@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/scan/csv/quote_rules.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "duckdb/common/vector.hpp"
+namespace duckdb {
+//! Different Rules regarding possible combinations of Quote and Escape Values for CSV Dialects.
+//! Each rule has a comment on the possible combinations.
+enum class QuoteRule : uint8_t {
+	QUOTES_RFC = 0,   //! quote = " escape = (\0 || " || ')
+	QUOTES_OTHER = 1, //! quote = ( " || ' ) escape = '\\'
+	NO_QUOTES = 2     //! quote = \0 escape = \0
+};
+} // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp CHANGED Viewed

@@ -8,31 +8,27 @@
 #pragma once
+#include "duckdb/common/types/row/tuple_data_layout.hpp"
 #include "duckdb/execution/operator/aggregate/grouped_aggregate_data.hpp"
-#include "duckdb/execution/partitionable_hashtable.hpp"
-#include "duckdb/execution/physical_operator.hpp"
 #include "duckdb/parser/group_by_node.hpp"
 namespace duckdb {
-class BufferManager;
-class Executor;
-class PhysicalHashAggregate;
-class Pipeline;
-class Task;
+class GroupedAggregateHashTable;
+struct AggregatePartition;
 class RadixPartitionedHashTable {
 public:
 	RadixPartitionedHashTable(GroupingSet &grouping_set, const GroupedAggregateData &op);
+	unique_ptr<GroupedAggregateHashTable> CreateHT(ClientContext &context, const idx_t capacity,
+	                                               const idx_t radix_bits) const;
+public:
 	GroupingSet &grouping_set;
 	//! The indices specified in the groups_count that do not appear in the grouping_set
 	unsafe_vector<idx_t> null_groups;
 	const GroupedAggregateData &op;
 	vector<LogicalType> group_types;
-	//! how many groups can we have in the operator before we switch to radix partitioning
-	idx_t radix_limit;
 	//! The GROUPING values that belong to this hash table
 	vector<Value> grouping_values;
@@ -43,32 +39,27 @@ public:
 	void Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input, DataChunk &aggregate_input_chunk,
 	          const unsafe_vector<idx_t> &filter) const;
-	void Combine(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate) const;
-	bool Finalize(ClientContext &context, GlobalSinkState &gstate_p) const;
-	void ScheduleTasks(Executor &executor, const shared_ptr<Event> &event, GlobalSinkState &state,
-	                   vector<shared_ptr<Task>> &tasks) const;
+	void Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const;
+	void Finalize(ClientContext &context, GlobalSinkState &gstate) const;
+public:
 	//! Source interface
-	idx_t Size(GlobalSinkState &sink_state) const;
 	unique_ptr<GlobalSourceState> GetGlobalSourceState(ClientContext &context) const;
 	unique_ptr<LocalSourceState> GetLocalSourceState(ExecutionContext &context) const;
-	SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, GlobalSinkState &sink_state,
+	SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, GlobalSinkState &sink,
 	                         OperatorSourceInput &input) const;
-	static void SetMultiScan(GlobalSinkState &state);
-	static bool ForceSingleHT(GlobalSinkState &state);
-	static bool AnyPartitioned(GlobalSinkState &state);
-	static void GetRepartitionInfo(ClientContext &context, GlobalSinkState &state, idx_t &repartition_radix_bits,
-	                               idx_t &concurrent_repartitions, idx_t &tasks_per_partition);
+	const TupleDataLayout &GetLayout() const;
+	idx_t Count(GlobalSinkState &sink) const;
+	static void SetMultiScan(GlobalSinkState &sink);
 private:
 	void SetGroupingValues();
 	void PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const;
-	void InitializeFinalizedHTs(ClientContext &context, GlobalSinkState &state) const;
-	void ScheduleRepartitionTasks(Executor &executor, const shared_ptr<Event> &event, GlobalSinkState &state,
-	                              vector<shared_ptr<Task>> &tasks, const idx_t repartition_radix_bits,
-	                              const idx_t concurrent_repartitions, const idx_t tasks_per_partition) const;
+	idx_t CountInternal(GlobalSinkState &sink) const;
+	TupleDataLayout layout;
 };
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/function/function_serialization.hpp CHANGED Viewed

@@ -110,9 +110,8 @@ public:
 		bool has_serialize = function.format_serialize;
 		serializer.WriteProperty(503, "has_serialize", has_serialize);
 		if (has_serialize) {
-			serializer.BeginObject(504, "function_data");
-			function.format_serialize(serializer, bind_info, function);
-			serializer.EndObject();
+			serializer.WriteObject(504, "function_data",
+			                       [&](FormatSerializer &obj) { function.format_serialize(obj, bind_info, function); });
 			D_ASSERT(function.format_deserialize);
 		}
 	}
@@ -150,9 +149,9 @@ public:
 			throw SerializationException("Function requires deserialization but no deserialization function for %s",
 			                             function.name);
 		}
-		deserializer.BeginObject(504, "function_data");
-		auto result = function.format_deserialize(deserializer, function);
-		deserializer.EndObject();
+		unique_ptr<FunctionData> result;
+		deserializer.ReadObject(504, "function_data",
+		                        [&](FormatDeserializer &obj) { result = function.format_deserialize(obj, function); });
 		return result;
 	}

package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp CHANGED Viewed

@@ -142,10 +142,10 @@ public:
 public:
 	DUCKDB_API static ParseResult Parse(const string &format, const string &text);
-	DUCKDB_API bool Parse(string_t str, ParseResult &result);
+	DUCKDB_API bool Parse(string_t str, ParseResult &result) const;
-	DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message);
-	DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message);
+	DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message) const;
+	DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message) const;
 	date_t ParseDate(string_t str);
 	timestamp_t ParseTimestamp(string_t str);
@@ -158,7 +158,7 @@ protected:
 	DUCKDB_API void AddFormatSpecifier(string preceding_literal, StrTimeSpecifier specifier) override;
 	int NumericSpecifierWidth(StrTimeSpecifier specifier);
 	int32_t TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
-	                           idx_t collection_count);
+	                           idx_t collection_count) const;
 private:
 	explicit StrpTimeFormat(const string &format_string);

package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp CHANGED Viewed

@@ -8,14 +8,16 @@
 #pragma once
-#include "duckdb/function/table_function.hpp"
-#include "duckdb/function/scalar/strftime_format.hpp"
-#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
-#include "duckdb/execution/operator/persistent/buffered_csv_reader.hpp"
-#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
-#include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
-#include "duckdb/execution/operator/persistent/csv_buffer.hpp"
+#include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
+#include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
 #include "duckdb/function/built_in_functions.hpp"
+#include "duckdb/function/scalar/strftime_format.hpp"
+#include "duckdb/function/table_function.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
 namespace duckdb {
@@ -31,7 +33,7 @@ struct BaseCSVData : public TableFunctionData {
 	//! The file path of the CSV file to read or write
 	vector<string> files;
 	//! The CSV reader options
-	BufferedCSVReaderOptions options;
+	CSVReaderOptions options;
 	//! Offsets for generated columns
 	idx_t filename_col_idx;
 	idx_t hive_partition_col_idx;
@@ -50,8 +52,6 @@ struct WriteCSVData : public BaseCSVData {
 	vector<LogicalType> sql_types;
 	//! The newline string to write
 	string newline = "\n";
-	//! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
-	bool is_simple;
 	//! The size of the CSV file (in bytes) that we buffer before we flush it to disk
 	idx_t flush_size = 4096 * 8;
 	//! For each byte whether or not the CSV file requires quotes when containing the byte
@@ -93,8 +93,9 @@ struct ReadCSVData : public BaseCSVData {
 	vector<LogicalType> return_types;
 	//! The expected SQL names to be returned from the read - including added constants (e.g. filename, hive partitions)
 	vector<string> return_names;
-	//! The initial reader (if any): this is used when automatic detection is used during binding.
-	//! In this case, the CSV reader is already created and might as well be re-used.
+	//! The buffer manager (if any): this is used when automatic detection is used during binding.
+	//! In this case, some CSV buffers have already been read and can be reused.
+	shared_ptr<CSVBufferManager> buffer_manager;
 	unique_ptr<BufferedCSVReader> initial_reader;
 	//! The union readers are created (when csv union_by_name option is on) during binding
 	//! Those readers can be re-used during ReadCSVFunction
@@ -104,6 +105,10 @@ struct ReadCSVData : public BaseCSVData {
 	//! Reader bind data
 	MultiFileReaderBindData reader_bind;
 	vector<ColumnInfo> column_info;
+	//! The CSVStateMachineCache caches state machines created for sniffing and parsing csv files
+	//! We cache them because when reading very small csv files, the cost of creating all the possible
+	//! State machines for sniffing becomes a major bottleneck.
+	CSVStateMachineCache state_machine_cache;
 	void Initialize(unique_ptr<BufferedCSVReader> &reader) {
 		this->initial_reader = std::move(reader);

package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp CHANGED Viewed

@@ -21,6 +21,7 @@ public:
 	explicit ClientContextFileOpener(ClientContext &context_p) : context(context_p) {
 	}
+	bool TryGetCurrentSetting(const string &key, Value &result, FileOpenerInfo &info) override;
 	bool TryGetCurrentSetting(const string &key, Value &result) override;
 	ClientContext *TryGetClientContext() override {

package/src/duckdb/src/include/duckdb/main/client_data.hpp CHANGED Viewed

@@ -13,6 +13,7 @@
 #include "duckdb/common/types/value.hpp"
 #include "duckdb/common/case_insensitive_map.hpp"
 #include "duckdb/common/atomic.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
 namespace duckdb {
 class AttachedDatabase;
@@ -29,7 +30,7 @@ class SchemaCatalogEntry;
 struct RandomEngine;
 struct ClientData {
-	ClientData(ClientContext &context);
+	explicit ClientData(ClientContext &context);
 	~ClientData();
 	//! Query profiler

package/src/duckdb/src/include/duckdb/main/config.hpp CHANGED Viewed

@@ -248,6 +248,7 @@ public:
 	bool operator!=(const DBConfig &other);
 	DUCKDB_API CastFunctionSet &GetCastFunctions();
+	static idx_t GetSystemMaxThreads(FileSystem &fs);
 	void SetDefaultMaxThreads();
 	void SetDefaultMaxMemory();

package/src/duckdb/src/include/duckdb/main/connection.hpp CHANGED Viewed

@@ -30,7 +30,7 @@ class DatabaseInstance;
 class DuckDB;
 class LogicalOperator;
 class SelectStatement;
-struct BufferedCSVReaderOptions;
+struct CSVReaderOptions;
 typedef void (*warning_callback)(std::string);
@@ -131,7 +131,7 @@ public:
 	//! Reads CSV file
 	DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file);
-	DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, BufferedCSVReaderOptions &options);
+	DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, CSVReaderOptions &options);
 	DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, const vector<string> &columns);
 	//! Reads Parquet file