npm - duckdb - Versions diffs - 0.9.1-dev0.0 → 0.9.1-dev143.0 - Mend

duckdb 0.9.1-dev0.0 → 0.9.1-dev143.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.9.1-dev0.0",
+  "version": "0.9.1-dev143.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/extension/parquet/column_reader.cpp CHANGED Viewed

@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
 void ColumnReader::PrepareRead(parquet_filter_t &filter) {
 	dict_decoder.reset();
 	defined_decoder.reset();
+	bss_decoder.reset();
 	block.reset();
 	PageHeader page_hdr;
 	page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
 		PrepareDeltaByteArray(*block);
 		break;
 	}
+	case Encoding::BYTE_STREAM_SPLIT: {
+		// Subtract 1 from length as the block is allocated with 1 extra byte,
+		// but the byte stream split decoder needs to know the correct data size.
+		bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+		block->inc(block->len);
+		break;
+	}
 	case Encoding::PLAIN:
 		// nothing to do here, will be read directly below
 		break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
 		idx_t null_count = 0;
-		if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
+		if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
 			// we need the null count because the dictionary offsets have no entries for nulls
 			for (idx_t i = 0; i < read_now; i++) {
 				if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
 		} else if (byte_array_data) {
 			// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
 			DeltaByteArray(define_out, read_now, filter, result_offset, result);
+		} else if (bss_decoder) {
+			auto read_buf = make_shared<ResizeableBuffer>();
+			switch (schema.type) {
+			case duckdb_parquet::format::Type::FLOAT:
+				read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
+				bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
+				break;
+			case duckdb_parquet::format::Type::DOUBLE:
+				read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
+				bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
+				break;
+			default:
+				throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+			}
+			Plain(read_buf, define_out, read_now, filter, result_offset, result);
 		} else {
 			PlainReference(block, result);
 			Plain(block, define_out, read_now, filter, result_offset, result);

package/src/duckdb/extension/parquet/include/column_reader.hpp CHANGED Viewed

@@ -9,6 +9,7 @@
 #pragma once
 #include "duckdb.hpp"
+#include "parquet_bss_decoder.hpp"
 #include "parquet_dbp_decoder.hpp"
 #include "parquet_rle_bp_decoder.hpp"
 #include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ private:
 	unique_ptr<RleBpDecoder> repeated_decoder;
 	unique_ptr<DbpDecoder> dbp_decoder;
 	unique_ptr<RleBpDecoder> rle_decoder;
+	unique_ptr<BssDecoder> bss_decoder;
 	// dummies for Skip()
 	parquet_filter_t none_filter;

package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp ADDED Viewed

@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// parquet_bss_decoder.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "parquet_types.h"
+#include "resizable_buffer.hpp"
+namespace duckdb {
+/// Decoder for the Byte Stream Split encoding: byte k of every value is read from the k-th contiguous byte stream.
+class BssDecoder {
+public:
+	/// Create a decoder object. buffer/buffer_len is the encoded data; decoding starts at value offset 0.
+	BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
+	}
+public:
+	template <typename T>
+	void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) { // writes batch_size * sizeof(T) bytes to values_target_ptr
+		if (buffer_.len % sizeof(T) != 0) { // the encoded data must hold a whole number of T values
+			std::stringstream error;
+			error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
+			      << ") should be a multiple of the type size (" << sizeof(T) << ")";
+			throw std::runtime_error(error.str());
+		}
+		uint32_t num_buffer_values = buffer_.len / sizeof(T); // also the length in bytes of each byte stream
+		buffer_.available((value_offset_ + batch_size) * sizeof(T)); // NOTE(review): presumably fails when fewer bytes are present - confirm in ByteBuffer
+		for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
+			data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_; // stream start, past values already decoded
+			for (uint32_t i = 0; i < batch_size; ++i) {
+				values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i); // byte `byte_offset` of the i-th output value
+			}
+		}
+		value_offset_ += batch_size; // the next call continues after the values decoded here
+	}
+private:
+	ByteBuffer buffer_;
+	uint32_t value_offset_; // number of values already handed out by GetBatch
+};
+} // namespace duckdb

package/src/duckdb/src/common/enum_util.cpp CHANGED Viewed

@@ -68,7 +68,7 @@
 #include "duckdb/execution/index/art/node.hpp"
 #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
 #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
-#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
 #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
 #include "duckdb/function/aggregate_state.hpp"
 #include "duckdb/function/function.hpp"

package/src/duckdb/src/common/serializer/binary_deserializer.cpp CHANGED Viewed

@@ -8,7 +8,8 @@ namespace duckdb {
 void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
 	auto field = NextField();
 	if (field != field_id) {
-		throw InternalException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id, field);
+		throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
+		                             field);
 	}
 }
@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
 void BinaryDeserializer::OnObjectEnd() {
 	auto next_field = NextField();
 	if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
-		throw InternalException("Failed to deserialize: expected end of object, but found field id: %d", next_field);
+		throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
+		                             next_field);
 	}
 	nesting_level--;
 }

package/src/duckdb/src/common/types/data_chunk.cpp CHANGED Viewed

@@ -64,7 +64,7 @@ void DataChunk::InitializeEmpty(vector<LogicalType>::const_iterator begin, vecto
 }
 void DataChunk::Reset() {
-	if (data.empty()) {
+	if (data.empty() || vector_caches.empty()) {
 		return;
 	}
 	if (vector_caches.size() != data.size()) {

package/src/duckdb/src/core_functions/scalar/map/map.cpp CHANGED Viewed

@@ -87,11 +87,24 @@ static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) {
 	return true;
 }
+static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) { // returns the list data of the argument with the larger list size
+	auto key_size = ListVector::GetListSize(key);
+	auto value_size = ListVector::GetListSize(value);
+	if (key_size > value_size) {
+		size = key_size; // out-parameter: receives the larger of the two list sizes
+		return ListVector::GetData(key);
+	}
+	size = value_size; // equal sizes also resolve to the value list
+	return ListVector::GetData(value);
+}
 static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) {
 	D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);
-	auto &key_vector = MapVector::GetKeys(result);
-	auto &value_vector = MapVector::GetValues(result);
+	auto count = args.size();
+	auto &map_key_vector = MapVector::GetKeys(result);
+	auto &map_value_vector = MapVector::GetValues(result);
 	auto result_data = ListVector::GetData(result);
 	result.SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -99,52 +112,73 @@ static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result)
 		ListVector::SetListSize(result, 0);
 		result_data->offset = 0;
 		result_data->length = 0;
-		result.Verify(args.size());
+		result.Verify(count);
 		return;
 	}
-	bool keys_are_const = args.data[0].GetVectorType() == VectorType::CONSTANT_VECTOR;
-	bool values_are_const = args.data[1].GetVectorType() == VectorType::CONSTANT_VECTOR;
-	if (!keys_are_const || !values_are_const) {
-		result.SetVectorType(VectorType::FLAT_VECTOR);
+	D_ASSERT(args.ColumnCount() == 2);
+	auto &key_vector = args.data[0];
+	auto &value_vector = args.data[1];
+	if (args.AllConstant()) {
+		auto key_data = ListVector::GetData(key_vector);
+		auto value_data = ListVector::GetData(value_vector);
+		auto key_entry = key_data[0];
+		auto value_entry = value_data[0];
+		if (key_entry != value_entry) {
+			throw BinderException("Key and value list sizes don't match");
+		}
+		result_data[0] = key_entry;
+		ListVector::SetListSize(result, ListVector::GetListSize(key_vector));
+		map_key_vector.Reference(ListVector::GetEntry(key_vector));
+		map_value_vector.Reference(ListVector::GetEntry(value_vector));
+		MapVector::MapConversionVerify(result, count);
+		result.Verify(count);
+		return;
 	}
-	auto key_count = ListVector::GetListSize(args.data[0]);
-	auto value_count = ListVector::GetListSize(args.data[1]);
-	auto key_data = ListVector::GetData(args.data[0]);
-	auto value_data = ListVector::GetData(args.data[1]);
-	auto src_data = key_data;
-	if (keys_are_const && !values_are_const) {
-		AlignVectorToReference(args.data[0], args.data[1], args.size(), key_vector);
-		src_data = value_data;
-	} else if (values_are_const && !keys_are_const) {
-		AlignVectorToReference(args.data[1], args.data[0], args.size(), value_vector);
+	result.SetVectorType(VectorType::FLAT_VECTOR);
+	if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+		D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+		Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count);
+		AlignVectorToReference(key_vector, value_vector, count, expanded_const);
+		map_key_vector.Reference(expanded_const);
+		value_vector.Flatten(count);
+		map_value_vector.Reference(ListVector::GetEntry(value_vector));
+	} else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+		D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+		Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count);
+		AlignVectorToReference(value_vector, key_vector, count, expanded_const);
+		map_value_vector.Reference(expanded_const);
+		key_vector.Flatten(count);
+		map_key_vector.Reference(ListVector::GetEntry(key_vector));
 	} else {
-		if (!ListEntriesEqual(args.data[0], args.data[1], args.size())) {
+		key_vector.Flatten(count);
+		value_vector.Flatten(count);
+		if (!ListEntriesEqual(key_vector, value_vector, count)) {
 			throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different "
 			                            "size or incompatible structure");
 		}
+		map_value_vector.Reference(ListVector::GetEntry(value_vector));
+		map_key_vector.Reference(ListVector::GetEntry(key_vector));
 	}
-	ListVector::SetListSize(result, MaxValue(key_count, value_count));
+	idx_t list_size;
+	auto src_data = GetBiggestList(key_vector, value_vector, list_size);
+	ListVector::SetListSize(result, list_size);
 	result_data = ListVector::GetData(result);
-	for (idx_t i = 0; i < args.size(); i++) {
+	for (idx_t i = 0; i < count; i++) {
 		result_data[i] = src_data[i];
 	}
-	// check whether one of the vectors has already been referenced to an expanded vector in the case of const/non-const
-	// combination. If not, then referencing is still necessary
-	if (!(keys_are_const && !values_are_const)) {
-		key_vector.Reference(ListVector::GetEntry(args.data[0]));
-	}
-	if (!(values_are_const && !keys_are_const)) {
-		value_vector.Reference(ListVector::GetEntry(args.data[1]));
-	}
-	MapVector::MapConversionVerify(result, args.size());
-	result.Verify(args.size());
+	MapVector::MapConversionVerify(result, count);
+	result.Verify(count);
 }
 static unique_ptr<FunctionData> MapBind(ClientContext &context, ScalarFunction &bound_function,

package/src/duckdb/src/execution/expression_executor/execute_reference.cpp CHANGED Viewed

@@ -6,7 +6,7 @@ namespace duckdb {
 unique_ptr<ExpressionState> ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr,
                                                                 ExpressionExecutorState &root) {
 	auto result = make_uniq<ExpressionState>(expr, root);
-	result->Finalize();
+	result->Finalize(true);
 	return result;
 }

package/src/duckdb/src/execution/expression_executor_state.cpp CHANGED Viewed

@@ -1,4 +1,5 @@
 #include "duckdb/execution/expression_executor_state.hpp"
 #include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/planner/expression.hpp"
 #include "duckdb/planner/expression/bound_function_expression.hpp"
@@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) {
 	child_states.push_back(ExpressionExecutor::InitializeState(*expr, root));
 }
-void ExpressionState::Finalize() {
-	if (!types.empty()) {
+void ExpressionState::Finalize(bool empty) {
+	if (types.empty()) {
+		return;
+	}
+	if (empty) {
+		intermediate_chunk.InitializeEmpty(types);
+	} else {
 		intermediate_chunk.Initialize(GetAllocator(), types);
 	}
 }

package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp CHANGED Viewed

@@ -3,8 +3,8 @@
 namespace duckdb {
-void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) {
-	for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
+void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
+	for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
 		transition_array[i] = state;
 	}
 }
@@ -13,72 +13,65 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 	D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
 	// Initialize transition array with default values to the Standard option
 	auto &transition_array = state_machine_cache[state_machine_options];
-	const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
-	const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
-	const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
-	const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
-	const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
-	const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
-	const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
-	const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
-	const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
-	for (uint32_t i = 0; i < NUM_STATES; i++) {
-		switch (i) {
-		case quoted_state:
-			InitializeTransitionArray(transition_array[i], quoted_state);
+	for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
+		CSVState cur_state = CSVState(i);
+		switch (cur_state) {
+		case CSVState::QUOTED:
+			InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
 			break;
-		case unquoted_state:
-		case invalid_state:
-		case escape_state:
-			InitializeTransitionArray(transition_array[i], invalid_state);
+		case CSVState::UNQUOTED:
+		case CSVState::INVALID:
+		case CSVState::ESCAPE:
+			InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
 			break;
 		default:
-			InitializeTransitionArray(transition_array[i], standard_state);
+			InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
 			break;
 		}
 	}
 	// Now set values depending on configuration
 	// 1) Standard State
-	transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
-	transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
-	transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-	transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+	transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+	transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+	transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
 	// 2) Field Separator State
-	transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
-	    field_separator_state;
-	transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
-	transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-	transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+	transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+	transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+	transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
 	// 3) Record Separator State
-	transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
-	    field_separator_state;
-	transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
-	transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
-	transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+	transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
+	    CSVState::DELIMITER;
+	transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+	transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+	transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
 	// 4) Carriage Return State
-	transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
-	transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
-	transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
+	transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+	transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+	transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
 	// 5) Quoted State
-	transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
+	transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
 	if (state_machine_options.quote != state_machine_options.escape) {
-		transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
+		transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
 	}
 	// 6) Unquoted State
-	transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
-	transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-	transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
+	transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+	transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+	transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
 	if (state_machine_options.quote == state_machine_options.escape) {
-		transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+		transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
 	}
 	// 7) Escaped State
-	transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
-	transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+	transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
+	transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
 	// 8) Empty Line State
-	transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
-	transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
+	transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+	transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+	transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+	transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
 }
 CSVStateMachineCache::CSVStateMachineCache() {
@@ -95,7 +88,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
 	}
 }
-const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
+const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
 	//! Custom State Machine, we need to create it and cache it first
 	if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
 		Insert(state_machine_options);

package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp CHANGED Viewed

@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
-void ParallelCSVReader::SkipEmptyLines() {
+bool ParallelCSVReader::SkipEmptyLines() {
+	const idx_t initial_position_buffer = position_buffer;
 	idx_t new_pos_buffer = position_buffer;
 	if (parse_chunk.data.size() == 1) {
 		// Empty lines are null data.
-		return;
+		return initial_position_buffer != position_buffer;
 	}
 	for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
 		if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
 				position_buffer++;
 			}
 			if (new_pos_buffer > end_buffer) {
-				return;
+				return initial_position_buffer != position_buffer;
 			}
 			position_buffer = new_pos_buffer;
 		} else if ((*buffer)[new_pos_buffer] != ' ') {
-			return;
+			return initial_position_buffer != position_buffer;
 		}
 	}
+	return initial_position_buffer != position_buffer;
 }
 bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
 	}
 	// Ensure that parse_chunk has no gunk when trying to figure new line
 	parse_chunk.Reset();
 	verification_positions.end_of_last_line = position_buffer;
 	finished = false;
 	return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
 	idx_t column = 0;
 	idx_t offset = 0;
 	bool has_quotes = false;
+	bool last_line_empty = false;
 	vector<idx_t> escape_positions;
 	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
 		if (!BufferRemainder()) {
 			goto final_state;
 		}
-		SkipEmptyLines();
+		if (SkipEmptyLines() && reached_remainder_state) {
+			last_line_empty = true;
+			goto final_state;
+		}
 		if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
 			error_message = "Line does not fit in one buffer. Increase the buffer size.";
 			return false;
@@ -583,8 +587,8 @@ final_state : {
 		return true;
 	}
 	// If this is the last buffer, we have to read the last value
-	if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
-	    (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
+	if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
+	                         (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
 		if (column > 0 || start_buffer != position_buffer || try_add_line ||
 		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp CHANGED Viewed

@@ -22,30 +22,9 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
 	}
 }
-SnifferResult CSVSniffer::SniffCSV() {
-	// 1. Dialect Detection
-	DetectDialect();
-	if (explicit_set_columns) {
-		if (!candidates.empty()) {
-			options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
-			options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
-		}
-		// We do not need to run type and header detection as these were defined by the user
-		return SnifferResult(detected_types, names);
-	}
-	// 2. Type Detection
-	DetectTypes();
-	// 3. Header Detection
-	DetectHeader();
-	D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
-	// 4. Type Replacement
-	ReplaceTypes();
-	// 5. Type Refinement
-	RefineTypes();
-	// We are done, construct and return the result.
-	// Set the CSV Options in the reference
+void CSVSniffer::SetResultOptions() {
 	options.dialect_options = best_candidate->dialect_options;
+	options.dialect_options.new_line = best_candidate->dialect_options.new_line;
 	options.has_header = best_candidate->dialect_options.header;
 	options.skip_rows_set = options.dialect_options.skip_rows > 0;
 	if (options.has_header) {
@@ -53,8 +32,27 @@ SnifferResult CSVSniffer::SniffCSV() {
 	} else {
 		options.dialect_options.true_start = best_start_without_header;
 	}
+}
-	// Return the types and names
+SnifferResult CSVSniffer::SniffCSV() {
+	// 1. Dialect Detection
+	DetectDialect();
+	// 2. Type Detection
+	DetectTypes();
+	// 3. Type Refinement
+	RefineTypes();
+	// 4. Header Detection
+	DetectHeader();
+	if (explicit_set_columns) {
+		SetResultOptions();
+		// We do not need to run type replacement (step 5), since the types have been given by the user
+		return SnifferResult({}, {}); // empty types and names are returned on this path
+	}
+	// 5. Type Replacement
+	ReplaceTypes();
+	D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
+	// We are done, Set the CSV Options in the reference. Construct and return the result.
+	SetResultOptions();
 	return SnifferResult(detected_types, names);
 }

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED Viewed

@@ -5,9 +5,9 @@ namespace duckdb {
 struct SniffDialect {
 	inline static void Initialize(CSVStateMachine &machine) {
-		machine.state = CSVState::STANDARD;
-		machine.previous_state = CSVState::STANDARD;
-		machine.pre_previous_state = CSVState::STANDARD;
+		machine.state = CSVState::EMPTY_LINE;
+		machine.previous_state = CSVState::EMPTY_LINE;
+		machine.pre_previous_state = CSVState::EMPTY_LINE;
 		machine.cur_rows = 0;
 		machine.column_count = 1;
 	}
@@ -21,17 +21,12 @@ struct SniffDialect {
 			sniffed_column_counts.clear();
 			return true;
 		}
-		machine.pre_previous_state = machine.previous_state;
-		machine.previous_state = machine.state;
-		machine.state = static_cast<CSVState>(
-		    machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
+		machine.Transition(current_char);
 		bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
 		machine.column_count += machine.previous_state == CSVState::DELIMITER;
 		sniffed_column_counts[machine.cur_rows] = machine.column_count;
-		machine.cur_rows +=
-		    machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+		machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
 		machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
 		// It means our carriage return is actually a record separator
@@ -304,7 +299,7 @@ void CSVSniffer::DetectDialect() {
 	unordered_map<uint8_t, vector<char>> quote_candidates_map;
 	// Candidates for the escape option
 	unordered_map<uint8_t, vector<char>> escape_candidates_map;
-	escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\0', '\"', '\''};
+	escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"', '\'', '\0'};
 	escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'};
 	escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
 	// Number of rows read