duckdb 0.8.2-dev4376.0 → 0.8.2-dev4424.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.8.2-dev4376.0",
+ "version": "0.8.2-dev4424.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {

  int64_t Date::ExtractJulianDay(date_t date) {
  // Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
- static const auto JULIAN_EPOCH = -2440588;
+ static const int64_t JULIAN_EPOCH = -2440588;
  return date.days - JULIAN_EPOCH;
  }

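The only change in this hunk is the type of the epoch constant. A minimal standalone sketch (not DuckDB code; `date_t`/`date.days` is simplified to a plain `int32_t`) of why pinning the constant to `int64_t` matters: with `auto`, the literal deduces to `int`, so the subtraction happens in 32-bit arithmetic before being widened to the `int64_t` return type, which can overflow for day values near the `int32` limit.

```cpp
// Minimal sketch, assuming date.days is an int32_t day count since 1970-01-01.
#include <cassert>
#include <cstdint>

int64_t ExtractJulianDay(int32_t days) {
	// Julian Day of the Unix epoch (1970-01-01) is 2440588.
	static const int64_t JULIAN_EPOCH = -2440588;
	// The subtraction is now performed in 64-bit arithmetic.
	return days - JULIAN_EPOCH;
}

int main() {
	assert(ExtractJulianDay(0) == 2440588);  // Unix epoch
	assert(ExtractJulianDay(-2440588) == 0); // Julian Day 0
	return 0;
}
```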
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {

  uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
- D_ASSERT(InMemory());
-
  // get the bitmask data
  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
  ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {

  uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
+ // this function calls Get() on the buffer
  D_ASSERT(InMemory());

  // finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
  }

  // there are no allocations in this buffer
- // FIXME: put this line back in and then fix the missing vacuum bug in
- // FIXME: test_index_large_aborted_append.test with force_restart
- // FIXME: test if we still have non-dirty buffer to serialize after fixing this
- // throw InternalException("tried to serialize empty buffer");
- return 0;
+ throw InternalException("tried to serialize empty buffer");
  }

  void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
  const idx_t offset, const idx_t bitmask_offset) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
+ // this function calls Get() on the buffer
  D_ASSERT(InMemory());

  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
  position_buffer++;
  }
  if (position_buffer > end_buffer) {
+ VerifyLineLength(position_buffer, buffer->batch_index);
  return false;
  }
  SkipEmptyLines();
  if (verification_positions.beginning_of_first_line == 0) {
  verification_positions.beginning_of_first_line = position_buffer;
  }
-
+ VerifyLineLength(position_buffer, buffer->batch_index);
  verification_positions.end_of_last_line = position_buffer;
  return true;
  }
  }
+ VerifyLineLength(position_buffer, buffer->batch_index);
  return false;
  }
  SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
  break;
  }

- if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+ auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
+ if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
  break;
  }

  if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
- (*buffer)[position_buffer - 1] == '\n') {
+ (*buffer)[pos_check] == '\n') {
  break;
  }
  idx_t position_set = position_buffer;
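The new `pos_check` guard looks minor, but it protects against unsigned wrap-around: `position_buffer` is an unsigned `idx_t`, so `position_buffer - 1` at zero wraps to a huge index instead of going negative. A small standalone sketch (assuming `idx_t` is `uint64_t`, as in DuckDB) of the hazard and the fix:

```cpp
#include <cstdint>
#include <iostream>

using idx_t = uint64_t;

int main() {
	idx_t position_buffer = 0;

	// Wraps around: 0 - 1 == 18446744073709551615 for an unsigned 64-bit value,
	// so using it as a buffer index would read far out of bounds.
	idx_t wrapped = position_buffer - 1;

	// The pattern introduced by the patch: clamp the probe index at 0.
	idx_t pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;

	std::cout << wrapped << " vs " << pos_check << std::endl;
	return 0;
}
```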
@@ -300,7 +300,7 @@ public:
  const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
  bool force_parallelism_p, vector<column_t> column_ids_p)
  : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
- buffer_size(options.buffer_size), force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
+ force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
  line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
  current_file_path = files_path_p[0];
  CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
  first_file_size = file_size;
  on_disk_file = file_handle_ptr->OnDiskFile();
  bytes_read = 0;
- if (buffer_size < file_size || file_size == 0) {
- bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
- } else {
- bytes_per_local_state = file_size / MaxThreads();
- }
- if (bytes_per_local_state == 0) {
- // In practice, I think this won't happen, it only happens because we are mocking up test scenarios
- // this boy needs to be at least one.
- bytes_per_local_state = 1;
- }
  running_threads = MaxThreads();

  // Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:

  void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);

- void IncrementThread();
-
  void DecrementThread();

  bool Finished();
@@ -402,16 +390,12 @@ private:
  mutex main_mutex;
  //! Byte set from for last thread
  idx_t next_byte = 0;
- //! How many bytes we should execute per local state
- idx_t bytes_per_local_state;
  //! Size of first file
  idx_t first_file_size = 0;
  //! Whether or not this is an on-disk file
  bool on_disk_file = true;
  //! Basically max number of threads in DuckDB
  idx_t system_threads;
- //! Size of the buffers
- idx_t buffer_size;
  //! Current batch index
  idx_t batch_index = 0;
  idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
  return system_threads;
  }

- void ParallelCSVGlobalState::IncrementThread() {
- lock_guard<mutex> parallel_lock(main_mutex);
- running_threads++;
- }
-
  void ParallelCSVGlobalState::DecrementThread() {
  lock_guard<mutex> parallel_lock(main_mutex);
  D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
  }
  // set up the current buffer
  line_info.current_batches[file_index - 1].insert(local_batch_index);
+ idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
  auto result = make_uniq<CSVBufferRead>(
  buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
  next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
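Rather than computing a fixed per-thread byte budget once in the constructor (the block removed above), each call to `Next` now derives it from the size of the buffer actually being handed out. A small sketch (plain C++, not DuckDB code) of the new arithmetic; the `+ 1` keeps the budget at least one byte, which is what the removed `bytes_per_local_state = 1` fallback used to guarantee:

```cpp
#include <cstdint>
#include <iostream>

using idx_t = uint64_t;

// Per-thread byte budget for scanning one CSV buffer.
idx_t BytesPerLocalState(idx_t buffer_actual_size, idx_t max_threads) {
	return buffer_actual_size / max_threads + 1;
}

int main() {
	std::cout << BytesPerLocalState(32 * 1024 * 1024, 8) << std::endl; // 4194305
	std::cout << BytesPerLocalState(3, 8) << std::endl;                // 1, never 0
	return 0;
}
```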
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
  if (StringUtil::EndsWith(lower_name, ".gz")) {
  lower_name = lower_name.substr(0, lower_name.size() - 3);
  } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+ if (!Catalog::TryAutoLoad(context, "parquet")) {
+ throw MissingExtensionException("parquet extension is required for reading zst compressed file");
+ }
  lower_name = lower_name.substr(0, lower_name.size() - 4);
  }
  if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.8.2-dev4376"
+ #define DUCKDB_VERSION "0.8.2-dev4424"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "312b995450"
+ #define DUCKDB_SOURCE_ID "b78b24ad26"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
@@ -40,7 +40,7 @@ public:
  vector<string> names;

  public:
- DUCKDB_API void ThrowError(const string &prepended_message = "") const;
+ [[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
  DUCKDB_API void SetError(PreservedError error);
  DUCKDB_API bool HasError() const;
  DUCKDB_API const ExceptionType &GetErrorType() const;
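Marking the always-throwing `ThrowError` as `[[noreturn]]` tells the compiler that control never comes back from the call, which improves diagnostics in callers (for example, no spurious "control reaches end of non-void function" warnings). A generic sketch of the effect, not the DuckDB class itself:

```cpp
#include <stdexcept>
#include <string>

// Always throws; [[noreturn]] documents and enforces that it never returns normally.
[[noreturn]] void ThrowError(const std::string &message) {
	throw std::runtime_error(message);
}

int ParsePositive(int value) {
	if (value > 0) {
		return value;
	}
	ThrowError("value must be positive");
	// No return statement needed here: the compiler knows the call above never returns.
}
```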
@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &

  void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {

- // Deserialize the index metadata
- auto info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
- auto &index_info = info->Cast<CreateIndexInfo>();
-
- // Create the index in the catalog
- auto &schema_catalog = catalog.GetSchema(context, info->schema);
- auto &table_catalog =
- catalog.GetEntry(context, CatalogType::TABLE_ENTRY, info->schema, index_info.table).Cast<DuckTableEntry>();
- auto &index_catalog = schema_catalog.CreateIndex(context, index_info, table_catalog)->Cast<DuckIndexEntry>();
- index_catalog.info = table_catalog.GetStorage().info;
-
- // We deserialize the index lazily, i.e., we do not need to load any node information
+ // deserialize the index create info
+ auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
+ auto &info = create_info->Cast<CreateIndexInfo>();
+
+ // create the index in the catalog
+ auto &schema = catalog.GetSchema(context, create_info->schema);
+ auto &table =
+ catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
+
+ auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
+
+ index.info = table.GetStorage().info;
+ // insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
+ // checkpoints
+ for (auto &parsed_expr : info.parsed_expressions) {
+ index.parsed_expressions.push_back(parsed_expr->Copy());
+ }
+
+ // we deserialize the index lazily, i.e., we do not need to load any node information
  // except the root block pointer
- auto index_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
+ auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");

- // obtain the expressions of the ART from the index metadata
- vector<unique_ptr<Expression>> unbound_expressions;
+ // obtain the parsed expressions of the ART from the index metadata
  vector<unique_ptr<ParsedExpression>> parsed_expressions;
- for (auto &p_exp : index_info.parsed_expressions) {
- parsed_expressions.push_back(p_exp->Copy());
+ for (auto &parsed_expr : info.parsed_expressions) {
+ parsed_expressions.push_back(parsed_expr->Copy());
  }
+ D_ASSERT(!parsed_expressions.empty());

- // bind the parsed expressions
- // add the table to the bind context
+ // add the table to the bind context to bind the parsed expressions
  auto binder = Binder::CreateBinder(context);
  vector<LogicalType> column_types;
  vector<string> column_names;
- for (auto &col : table_catalog.GetColumns().Logical()) {
+ for (auto &col : table.GetColumns().Logical()) {
  column_types.push_back(col.Type());
  column_names.push_back(col.Name());
  }
+
+ // create a binder to bind the parsed expressions
  vector<column_t> column_ids;
- binder->bind_context.AddBaseTable(0, index_info.table, column_names, column_types, column_ids, &table_catalog);
+ binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
  IndexBinder idx_binder(*binder, context);
+
+ // bind the parsed expressions to create unbound expressions
+ vector<unique_ptr<Expression>> unbound_expressions;
  unbound_expressions.reserve(parsed_expressions.size());
  for (auto &expr : parsed_expressions) {
  unbound_expressions.push_back(idx_binder.Bind(expr));
  }

- if (parsed_expressions.empty()) {
- // this is a PK/FK index: we create the necessary bound column ref expressions
- unbound_expressions.reserve(index_info.column_ids.size());
- for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
- auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
- unbound_expressions.push_back(
- make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
- }
- }
-
  // create the index and add it to the storage
- switch (index_info.index_type) {
+ switch (info.index_type) {
  case IndexType::ART: {
- auto &storage = table_catalog.GetStorage();
- auto art = make_uniq<ART>(index_info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
- index_info.constraint_type, storage.db, nullptr, index_block_pointer);
+ auto &storage = table.GetStorage();
+ auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
+ info.constraint_type, storage.db, nullptr, root_block_pointer);

- index_catalog.index = art.get();
+ index.index = art.get();
  storage.info->indexes.AddIndex(std::move(art));
  } break;
  default:
@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
  AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
  }
  if (error) {
- // need to revert the append
+ // need to revert all appended row ids
  row_t current_row = append_state.row_start;
  // remove the data from the indexes, if there are any indexes
  row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
  if (append_to_table) {
  table.RevertAppendInternal(append_state.row_start, append_count);
  }
+
+ // we need to vacuum the indexes to remove any buffers that are now empty
+ // due to reverting the appends
+ table.info->indexes.Scan([&](Index &index) {
+ index.Vacuum();
+ return false;
+ });
  error.Throw();
  }
  }
@@ -652,7 +652,16 @@ describe('prepare', function() {
  });
  it("should aggregate kurtosis(num)", function (done) {
  db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
- assert.equal(res[0].kurtosis, -1.1999999999999997);
+ // The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
+ // This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
+ // See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
+ const expected = -1.2;
+
+ // The calculated value can differ from the exact answer by small amounts on different platforms due
+ // to floating-point errors. This tolerance was determined experimentally.
+ const tolerance = Number.EPSILON * 10;
+
+ assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
  done(err);
  });
  });
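For reference, the -1.2 expected value cited in the new test comment follows from the standard moments of a uniform distribution (the test column holds a large evenly spaced range, which the continuous uniform closely approximates). A quick derivation for the continuous uniform on [a, b]:

```latex
\mu_4 = \frac{(b-a)^4}{80}, \qquad
\sigma^2 = \frac{(b-a)^2}{12} \;\Rightarrow\; \sigma^4 = \frac{(b-a)^4}{144},
\qquad
\gamma_2 = \frac{\mu_4}{\sigma^4} - 3 = \frac{144}{80} - 3 = -\frac{6}{5} = -1.2
```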
@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
  date_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T00:00:00.000Z"),
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
  timestamp_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T23:59:59.999Z"),
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
  timestamptz_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T23:59:59.999Z"),
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
  ],

  timestamp: [
- new Date("1990-01-01T00:00"),
+ new Date(Date.UTC(1990, 0, 1)),
  new Date("9999-12-31T23:59:59.000Z"),
  null,
  ],
  ],