npm - duckdb - Versions diffs - 0.5.2-dev2006.0 → 0.5.2-dev2076.0 - Mend

duckdb 0.5.2-dev2006.0 → 0.5.2-dev2076.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/src/duckdb.cpp +1649 -786
package/src/duckdb.hpp +373 -93
package/src/parquet-amalgamation.cpp +37721 -37721

package/src/duckdb.hpp CHANGED Viewed

@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
 #pragma once
 #define DUCKDB_AMALGAMATION 1
 #define DUCKDB_AMALGAMATION_EXTENDED 1
-#define DUCKDB_SOURCE_ID "2fc0fa3788"
-#define DUCKDB_VERSION "v0.5.2-dev2006"
+#define DUCKDB_SOURCE_ID "80861d6b6f"
+#define DUCKDB_VERSION "v0.5.2-dev2076"
 //===----------------------------------------------------------------------===//
 //                         DuckDB
 //
@@ -6293,6 +6293,9 @@ public:
 	DUCKDB_API static bool CharacterIsNewline(char c) {
 		return c == '\n' || c == '\r';
 	}
+	DUCKDB_API static bool CharacterIsNullTerminator(char c) {
+		return c == '\0';
+	}
 	DUCKDB_API static bool CharacterIsDigit(char c) {
 		return c >= '0' && c <= '9';
 	}
@@ -16607,6 +16610,8 @@ struct DBConfigOptions {
 	bool allow_unsigned_extensions = false;
 	//! Enable emitting FSST Vectors
 	bool enable_fsst_vectors = false;
+	//! Experimental parallel CSV reader
+	bool experimental_parallel_csv_reader = false;
 	bool operator==(const DBConfigOptions &other) const;
 };
@@ -27400,7 +27405,17 @@ public:
 //===----------------------------------------------------------------------===//
 //                         DuckDB
 //
-// duckdb/execution/operator/persistent/buffered_csv_reader.hpp
+// duckdb/execution/operator/persistent/base_csv_reader.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/persistent/base_csv_reader.hpp
 //
 //
 //===----------------------------------------------------------------------===//
@@ -27728,44 +27743,242 @@ namespace duckdb {
 using std::queue;
 }
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/persistent/csv_reader_options.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/persistent/csv_buffer.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/execution/operator/persistent/csv_file_handle.hpp
+//
+//
+//===----------------------------------------------------------------------===//
-#include <sstream>
 namespace duckdb {
-struct CopyInfo;
-struct CSVFileHandle;
-struct FileHandle;
-struct StrpTimeFormat;
-class FileOpener;
-class FileSystem;
+struct CSVFileHandle {
+public:
+	explicit CSVFileHandle(unique_ptr<FileHandle> file_handle_p) : file_handle(move(file_handle_p)) {
+		can_seek = file_handle->CanSeek();
+		plain_file_source = file_handle->OnDiskFile() && can_seek;
+		file_size = file_handle->GetFileSize();
+	}
-//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
-//! position given that we encounter a byte with the given value.
-/*! For example, if we have a string "ABAC", the shifts array will have the following values:
- *  [0] --> ['A'] = 1, all others = 0
- *  [1] --> ['B'] = 2, ['A'] = 1, all others = 0
- *  [2] --> ['A'] = 3, all others = 0
- *  [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
- * Suppose we then search in the following string "ABABAC", our progression will be as follows:
- * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
- */
-struct TextSearchShiftArray {
-	TextSearchShiftArray();
-	explicit TextSearchShiftArray(string search_term);
+	bool CanSeek() {
+		return can_seek;
+	}
+	void Seek(idx_t position) {
+		if (!can_seek) {
+			throw InternalException("Cannot seek in this file");
+		}
+		file_handle->Seek(position);
+	}
+	idx_t SeekPosition() {
+		if (!can_seek) {
+			throw InternalException("Cannot seek in this file");
+		}
+		return file_handle->SeekPosition();
+	}
+	void Reset() {
+		if (plain_file_source) {
+			file_handle->Reset();
+		} else {
+			if (!reset_enabled) {
+				throw InternalException("Reset called but reset is not enabled for this CSV Handle");
+			}
+			read_position = 0;
+		}
+	}
+	bool PlainFileSource() {
+		return plain_file_source;
+	}
-	inline bool Match(uint8_t &position, uint8_t byte_value) {
-		if (position >= length) {
-			return false;
+	bool OnDiskFile() {
+		return file_handle->OnDiskFile();
+	}
+	idx_t FileSize() {
+		return file_size;
+	}
+	bool FinishedReading() {
+		return requested_bytes >= file_size;
+	}
+	idx_t Read(void *buffer, idx_t nr_bytes) {
+		requested_bytes += nr_bytes;
+		if (!plain_file_source) {
+			// not a plain file source: we need to do some bookkeeping around the reset functionality
+			idx_t result_offset = 0;
+			if (read_position < buffer_size) {
+				// we need to read from our cached buffer
+				auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
+				memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
+				result_offset += buffer_read_count;
+				read_position += buffer_read_count;
+				if (result_offset == nr_bytes) {
+					return nr_bytes;
+				}
+			} else if (!reset_enabled && cached_buffer) {
+				// reset is disabled, but we still have cached data
+				// we can remove any cached data
+				cached_buffer.reset();
+				buffer_size = 0;
+				buffer_capacity = 0;
+				read_position = 0;
+			}
+			// we have data left to read from the file
+			// read directly into the buffer
+			auto bytes_read = file_handle->Read((char *)buffer + result_offset, nr_bytes - result_offset);
+			read_position += bytes_read;
+			if (reset_enabled) {
+				// if reset caching is enabled, we need to cache the bytes that we have read
+				if (buffer_size + bytes_read >= buffer_capacity) {
+					// no space; first enlarge the buffer
+					buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
+					auto new_buffer = unique_ptr<data_t[]>(new data_t[buffer_capacity]);
+					if (buffer_size > 0) {
+						memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
+					}
+					cached_buffer = move(new_buffer);
+				}
+				memcpy(cached_buffer.get() + buffer_size, (char *)buffer + result_offset, bytes_read);
+				buffer_size += bytes_read;
+			}
+			return result_offset + bytes_read;
+		} else {
+			return file_handle->Read(buffer, nr_bytes);
 		}
-		position = shifts[position * 255 + byte_value];
-		return position == length;
 	}
-	idx_t length;
-	unique_ptr<uint8_t[]> shifts;
+	string ReadLine() {
+		bool carriage_return = false;
+		string result;
+		char buffer[1];
+		while (true) {
+			idx_t bytes_read = Read(buffer, 1);
+			if (bytes_read == 0) {
+				return result;
+			}
+			if (carriage_return) {
+				if (buffer[0] != '\n') {
+					if (!file_handle->CanSeek()) {
+						throw BinderException(
+						    "Carriage return newlines not supported when reading CSV files in which we cannot seek");
+					}
+					file_handle->Seek(file_handle->SeekPosition() - 1);
+					return result;
+				}
+			}
+			if (buffer[0] == '\n') {
+				return result;
+			}
+			if (buffer[0] != '\r') {
+				result += buffer[0];
+			} else {
+				carriage_return = true;
+			}
+		}
+	}
+	void DisableReset() {
+		this->reset_enabled = false;
+	}
+	mutex main_mutex;
+	idx_t count = 0;
+private:
+	unique_ptr<FileHandle> file_handle;
+	bool reset_enabled = true;
+	bool can_seek = false;
+	bool plain_file_source = false;
+	idx_t file_size = 0;
+	// reset support
+	unique_ptr<data_t[]> cached_buffer;
+	idx_t read_position = 0;
+	idx_t buffer_size = 0;
+	idx_t buffer_capacity = 0;
+	idx_t requested_bytes = 0;
 };
+} // namespace duckdb
+namespace duckdb {
+class CSVBuffer {
+public:
+	//! Colossal buffer size for multi-threading
+	static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
+	//! Constructor for Initial Buffer
+	CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle);
+	//! Constructor for `Next()` Buffers
+	CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
+	//! Creates a new buffer with the next part of the CSV File
+	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
+	//! Gets the buffer actual size
+	idx_t GetBufferSize();
+	//! Gets the start position of the buffer, only relevant for the first time it's scanned
+	idx_t GetStart();
+	//! If this buffer is the last buffer of the CSV File
+	bool IsCSVFileLastBuffer();
+	//! If this buffer is the first buffer of the CSV File
+	bool IsCSVFileFirstBuffer();
+	//! The actual buffer
+	unique_ptr<char[]> buffer;
+private:
+	//! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
+	idx_t actual_size;
+	//! We need to check for Byte Order Mark, to define the start position of this buffer
+	//! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
+	idx_t start_position = 0;
+	//! If this is the last buffer of the CSV File
+	bool last_buffer = false;
+	//! If this is the first buffer of the CSV File
+	bool first_buffer = false;
+};
+} // namespace duckdb
+namespace duckdb {
 struct BufferedCSVReaderOptions {
 	//===--------------------------------------------------------------------===//
 	// CommonCSVOptions
@@ -27792,7 +28005,7 @@ struct BufferedCSVReaderOptions {
 	//! Expected number of columns
 	idx_t num_cols = 0;
 	//! Number of samples to buffer
-	idx_t buffer_size = STANDARD_VECTOR_SIZE * 50;
+	idx_t buffer_sample_size = STANDARD_VECTOR_SIZE * 50;
 	//! Specifies the string that represents a null value
 	string null_str;
 	//! Whether file is compressed or not, and if so which compression type
@@ -27830,6 +28043,8 @@ struct BufferedCSVReaderOptions {
 	bool include_parsed_hive_partitions = false;
 	//! Whether or not to union files with different (but compatible) columns
 	bool union_by_name = false;
+	//! Buffer Size (Parallel Scan)
+	idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL;
 	//===--------------------------------------------------------------------===//
 	// WriteCSVOptions
@@ -27864,24 +28079,31 @@ struct BufferedCSVReaderOptions {
 	std::string ToString() const;
 };
+} // namespace duckdb
+#include <sstream>
+namespace duckdb {
+struct CopyInfo;
+struct CSVFileHandle;
+struct FileHandle;
+struct StrpTimeFormat;
+class FileOpener;
+class FileSystem;
 enum class ParserMode : uint8_t { PARSING = 0, SNIFFING_DIALECT = 1, SNIFFING_DATATYPES = 2, PARSING_HEADER = 3 };
 //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
-class BufferedCSVReader {
-	//! Initial buffer read size; can be extended for long lines
-	static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
-	//! Larger buffer size for non disk files
-	static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
-	ParserMode mode;
+class BaseCSVReader {
 public:
-	BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
-	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
+	BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
+	              const vector<LogicalType> &requested_types = vector<LogicalType>());
-	BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
-	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
-	~BufferedCSVReader();
+	BaseCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
+	              const vector<LogicalType> &requested_types = vector<LogicalType>());
+	~BaseCSVReader();
 	FileSystem &fs;
 	Allocator &allocator;
@@ -27895,17 +28117,9 @@ public:
 	vector<idx_t> insert_cols_idx;
 	vector<idx_t> insert_nulls_idx;
-	unique_ptr<CSVFileHandle> file_handle;
-	unique_ptr<char[]> buffer;
-	idx_t buffer_size;
-	idx_t position;
-	idx_t start = 0;
 	idx_t linenr = 0;
 	bool linenr_estimated = false;
-	vector<idx_t> sniffed_column_counts;
 	bool row_empty = false;
 	idx_t sample_chunk_idx = 0;
 	bool jumping_samples = false;
@@ -27915,72 +28129,145 @@ public:
 	idx_t bytes_in_chunk = 0;
 	double bytes_per_line_avg = 0;
-	vector<unique_ptr<char[]>> cached_buffers;
-	TextSearchShiftArray delimiter_search, escape_search, quote_search;
 	DataChunk parse_chunk;
 	std::queue<unique_ptr<DataChunk>> cached_chunks;
-public:
-	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
-	void ParseCSV(DataChunk &insert_chunk);
-	idx_t GetFileSize();
+	ParserMode mode;
+public:
 	//! Fill nulls into the cols that mismatch union names
 	void SetNullUnionCols(DataChunk &insert_chunk);
-private:
-	//! Initialize Parser
-	void Initialize(const vector<LogicalType> &requested_types);
+protected:
 	//! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
 	void InitParseChunk(idx_t num_cols);
 	//! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
 	void InitInsertChunkIdx(idx_t num_cols);
-	//! Initializes the TextSearchShiftArrays for complex parser
-	void PrepareComplexParser();
-	//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
-	void ParseCSV(ParserMode mode);
-	//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
-	bool TryParseCSV(ParserMode mode);
-	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
-	bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
-	//! Sniffs CSV dialect and determines skip rows, header row, column types and column names
-	vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
 	//! Change the date format for the type to the string
 	void SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type);
 	//! Try to cast a string value to the specified sql type
 	bool TryCastValue(const Value &value, const LogicalType &sql_type);
 	//! Try to cast a vector of values to the specified sql type
 	bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
+	//! Adds a value to the current row
+	void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
+	//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
+	bool AddRow(DataChunk &insert_chunk, idx_t &column);
+	//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
+	bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
+	unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
+	void VerifyUTF8(idx_t col_idx);
+	void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
+	static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
+protected:
+	//! Whether or not the current row's columns have overflown sql_types.size()
+	bool error_column_overflow = false;
+	//! Number of sniffed columns - only used when auto-detecting
+	vector<idx_t> sniffed_column_counts;
+};
+} // namespace duckdb
+namespace duckdb {
+struct CopyInfo;
+struct CSVFileHandle;
+struct FileHandle;
+struct StrpTimeFormat;
+class FileOpener;
+class FileSystem;
+//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
+//! position given that we encounter a byte with the given value.
+/*! For example, if we have a string "ABAC", the shifts array will have the following values:
+ *  [0] --> ['A'] = 1, all others = 0
+ *  [1] --> ['B'] = 2, ['A'] = 1, all others = 0
+ *  [2] --> ['A'] = 3, all others = 0
+ *  [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
+ * Suppose we then search in the following string "ABABAC", our progression will be as follows:
+ * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
+ */
+struct TextSearchShiftArray {
+	TextSearchShiftArray();
+	explicit TextSearchShiftArray(string search_term);
+	inline bool Match(uint8_t &position, uint8_t byte_value) {
+		if (position >= length) {
+			return false;
+		}
+		position = shifts[position * 255 + byte_value];
+		return position == length;
+	}
+	idx_t length;
+	unique_ptr<uint8_t[]> shifts;
+};
+//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
+class BufferedCSVReader : public BaseCSVReader {
+	//! Initial buffer read size; can be extended for long lines
+	static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
+	//! Larger buffer size for non disk files
+	static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
+public:
+	BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
+	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
+	BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
+	                  const vector<LogicalType> &requested_types = vector<LogicalType>());
+	~BufferedCSVReader();
+	unique_ptr<char[]> buffer;
+	idx_t buffer_size;
+	idx_t position;
+	idx_t start = 0;
+	vector<unique_ptr<char[]>> cached_buffers;
+	unique_ptr<CSVFileHandle> file_handle;
+	TextSearchShiftArray delimiter_search, escape_search, quote_search;
+public:
+	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
+	void ParseCSV(DataChunk &insert_chunk);
+private:
+	//! Initialize Parser
+	void Initialize(const vector<LogicalType> &requested_types);
 	//! Skips skip_rows, reads header row from input stream
 	void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header);
 	//! Jumps back to the beginning of input stream and resets necessary internal states
 	void JumpToBeginning(idx_t skip_rows, bool skip_header);
-	//! Jumps back to the beginning of input stream and resets necessary internal states
-	bool JumpToNextSample();
 	//! Resets the buffer
 	void ResetBuffer();
 	//! Resets the stream
 	void ResetStream();
+	//! Reads a new buffer from the CSV file if the current one has been exhausted
+	bool ReadBuffer(idx_t &start);
+	//! Jumps back to the beginning of input stream and resets necessary internal states
+	bool JumpToNextSample();
+	//! Initializes the TextSearchShiftArrays for complex parser
+	void PrepareComplexParser();
+	//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
+	void ParseCSV(ParserMode mode);
+	//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
+	bool TryParseCSV(ParserMode mode);
+	//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
+	bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
 	//! Parses a CSV file with a one-byte delimiter, escape and quote character
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message);
 	//! Parses more complex CSV files with multi-byte delimiters, escapes or quotes
 	bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
-	//! Adds a value to the current row
-	void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
-	//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
-	bool AddRow(DataChunk &insert_chunk, idx_t &column);
-	//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
-	void Flush(DataChunk &insert_chunk);
-	//! Reads a new buffer from the CSV file if the current one has been exhausted
-	bool ReadBuffer(idx_t &start);
-	unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
+	//! Sniffs CSV dialect and determines skip rows, header row, column types and column names
+	vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
 	//! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
 	void DetectDialect(const vector<LogicalType> &requested_types, BufferedCSVReaderOptions &original_options,
@@ -28000,13 +28287,6 @@ private:
 	                                        const vector<LogicalType> &requested_types,
 	                                        vector<vector<LogicalType>> &best_sql_types_candidates,
 	                                        map<LogicalTypeId, vector<string>> &best_format_candidates);
-	void VerifyUTF8(idx_t col_idx);
-	void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
-private:
-	//! Whether or not the current row's columns have overflown sql_types.size()
-	bool error_column_overflow = false;
 };
 } // namespace duckdb