npm - duckdb - Versions diffs - 0.9.1-dev19.0 → 0.9.1-dev67.0 - Mend

duckdb 0.9.1-dev19.0 → 0.9.1-dev67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.9.1-dev19.0",
+  "version": "0.9.1-dev67.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp CHANGED Viewed

@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
 	return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
-void ParallelCSVReader::SkipEmptyLines() {
+bool ParallelCSVReader::SkipEmptyLines() {
+	const idx_t initial_position_buffer = position_buffer;
 	idx_t new_pos_buffer = position_buffer;
 	if (parse_chunk.data.size() == 1) {
 		// Empty lines are null data.
-		return;
+		return initial_position_buffer != position_buffer;
 	}
 	for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
 		if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
 				position_buffer++;
 			}
 			if (new_pos_buffer > end_buffer) {
-				return;
+				return initial_position_buffer != position_buffer;
 			}
 			position_buffer = new_pos_buffer;
 		} else if ((*buffer)[new_pos_buffer] != ' ') {
-			return;
+			return initial_position_buffer != position_buffer;
 		}
 	}
+	return initial_position_buffer != position_buffer;
 }
 bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
 	}
 	// Ensure that parse_chunk has no gunk when trying to figure new line
 	parse_chunk.Reset();
 	verification_positions.end_of_last_line = position_buffer;
 	finished = false;
 	return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
 	idx_t column = 0;
 	idx_t offset = 0;
 	bool has_quotes = false;
+	bool last_line_empty = false;
 	vector<idx_t> escape_positions;
 	if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
 		// First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
 		if (!BufferRemainder()) {
 			goto final_state;
 		}
-		SkipEmptyLines();
+		if (SkipEmptyLines() && reached_remainder_state) {
+			last_line_empty = true;
+			goto final_state;
+		}
 		if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
 			error_message = "Line does not fit in one buffer. Increase the buffer size.";
 			return false;
@@ -583,8 +587,8 @@ final_state : {
 		return true;
 	}
 	// If this is the last buffer, we have to read the last value
-	if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
-	    (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
+	if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
+	                         (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
 		if (column > 0 || start_buffer != position_buffer || try_add_line ||
 		    (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
 			// remaining values to be added to the chunk

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED Viewed

@@ -26,8 +26,7 @@ struct SniffDialect {
 		bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
 		machine.column_count += machine.previous_state == CSVState::DELIMITER;
 		sniffed_column_counts[machine.cur_rows] = machine.column_count;
-		machine.cur_rows +=
-		    machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+		machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
 		machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
 		// It means our carriage return is actually a record separator

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp CHANGED Viewed

@@ -143,7 +143,7 @@ struct SniffValue {
 			machine.rows_read++;
 		}
-		if ((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
+		if ((machine.previous_state == CSVState::RECORD_SEPARATOR) ||
 		    (machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) {
 			sniffed_values[machine.cur_rows].position = machine.line_start_pos;
 			sniffed_values[machine.cur_rows].set = true;
@@ -153,8 +153,7 @@ struct SniffValue {
 		machine.Transition(current_char);
 		bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
-		if (machine.previous_state == CSVState::DELIMITER ||
-		    (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
+		if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
 		    (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
 			// Started a new value
 			// Check if it's UTF-8
@@ -173,8 +172,7 @@ struct SniffValue {
 		    (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
 			machine.value += current_char;
 		}
-		machine.cur_rows +=
-		    machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+		machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
 		// It means our carriage return is actually a record separator
 		machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
 		if (machine.cur_rows >= sniffed_values.size()) {

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp CHANGED Viewed

@@ -3,9 +3,9 @@
 namespace duckdb {
 struct Parse {
 	inline static void Initialize(CSVStateMachine &machine) {
-		machine.state = CSVState::STANDARD;
-		machine.previous_state = CSVState::STANDARD;
-		machine.pre_previous_state = CSVState::STANDARD;
+		machine.state = CSVState::EMPTY_LINE;
+		machine.previous_state = CSVState::EMPTY_LINE;
+		machine.pre_previous_state = CSVState::EMPTY_LINE;
 		machine.cur_rows = 0;
 		machine.column_count = 0;
@@ -17,16 +17,15 @@ struct Parse {
 		machine.Transition(current_char);
 		bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
-		if (machine.previous_state == CSVState::DELIMITER ||
-		    (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
+		if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
 		    (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
 			// Started a new value
 			// Check if it's UTF-8 (Or not?)
 			machine.VerifyUTF8();
 			auto &v = parse_chunk.data[machine.column_count++];
 			auto parse_data = FlatVector::GetData<string_t>(v);
-			auto &validity_mask = FlatVector::Validity(v);
 			if (machine.value.empty()) {
+				auto &validity_mask = FlatVector::Validity(v);
 				validity_mask.SetInvalid(machine.cur_rows);
 			} else {
 				parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
@@ -47,12 +46,11 @@ struct Parse {
 		    (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
 			machine.value += current_char;
 		}
-		machine.cur_rows +=
-		    machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+		machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0;
 		machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);
 		// It means our carriage return is actually a record separator
-		machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
+		machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0;
 		machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
 		if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {

package/src/duckdb/src/function/table/read_csv.cpp CHANGED Viewed

@@ -38,7 +38,7 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
 	auto number_of_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
 	//! If we have many csv files, we run single-threaded on each file and parallelize on the number of files
 	bool many_csv_files = files.size() > 1 && int64_t(files.size() * 2) >= number_of_threads;
-	if (options.parallel_mode != ParallelMode::PARALLEL && many_csv_files) {
+	if (options.parallel_mode != ParallelMode::PARALLEL && (many_csv_files || number_of_threads == 1)) {
 		single_threaded = true;
 	}
 	if (options.parallel_mode == ParallelMode::SINGLE_THREADED || not_supported_options ||

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED Viewed

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.9.1-dev19"
+#define DUCKDB_VERSION "v0.9.1-dev67"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "1ea87567af"
+#define DUCKDB_SOURCE_ID "7512d7ff4f"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp CHANGED Viewed

@@ -148,7 +148,7 @@ private:
 	//! Sets Position depending on the byte_start of this thread
 	bool SetPosition();
 	//! Called when scanning the 1st buffer, skips empty lines
-	void SkipEmptyLines();
+	bool SkipEmptyLines();
 	//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
 	//! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
 	//! when changing the buffer end the first time.

package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp CHANGED Viewed

@@ -38,8 +38,8 @@ void CommonAggregateOptimizer::ExtractCommonAggregates(LogicalAggregate &aggr) {
 			// aggregate does not exist yet: add it to the map
 			aggregate_remap[*aggr.expressions[i]] = i;
 			if (i != original_index) {
-				// this aggregate is not erased, however an agregate BEFORE it has been erased
-				// so we need to remap this aggregaet
+				// this aggregate is not erased, however an aggregate BEFORE it has been erased
+				// so we need to remap this aggregate
 				ColumnBinding original_binding(aggr.aggregate_index, original_index);
 				ColumnBinding new_binding(aggr.aggregate_index, i);
 				aggregate_map[original_binding] = new_binding;