npm - duckdb - Versions diffs - 0.7.2-dev1138.0 → 0.7.2-dev1188.0 - Mend

duckdb 0.7.2-dev1138.0 → 0.7.2-dev1188.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED

@@ -122,7 +122,8 @@ struct ParquetWriteGlobalState : public GlobalFunctionData {
 };
 struct ParquetWriteLocalState : public LocalFunctionData {
-	explicit ParquetWriteLocalState(ClientContext &context, const vector<LogicalType> &types) : buffer(context, types) {
+	explicit ParquetWriteLocalState(ClientContext &context, const vector<LogicalType> &types)
+	    : buffer(Allocator::Get(context), types) {
 	}
 	ColumnDataCollection buffer;

package/src/duckdb/src/common/local_file_system.cpp CHANGED

@@ -832,6 +832,46 @@ static bool HasGlob(const string &str) {
 	}
 	return false;
 }
+static bool IsCrawl(const string &glob) {
+	// glob must match exactly
+	return glob == "**";
+}
+static bool HasMultipleCrawl(const vector<string> &splits) {
+	return std::count(splits.begin(), splits.end(), "**") > 1;
+}
+static bool IsSymbolicLink(const string &path) {
+#ifndef _WIN32
+	struct stat status;
+	return (lstat(path.c_str(), &status) != -1 && S_ISLNK(status.st_mode));
+#else
+	auto attributes = WindowsGetFileAttributes(path);
+	if (attributes == INVALID_FILE_ATTRIBUTES)
+		return false;
+	return attributes & FILE_ATTRIBUTE_REPARSE_POINT;
+#endif
+}
+static void RecursiveGlobDirectories(FileSystem &fs, const string &path, vector<string> &result, bool match_directory,
+                                     bool join_path) {
+	fs.ListFiles(path, [&](const string &fname, bool is_directory) {
+		string concat;
+		if (join_path) {
+			concat = fs.JoinPath(path, fname);
+		} else {
+			concat = fname;
+		}
+		if (IsSymbolicLink(concat)) {
+			return;
+		}
+		if (is_directory == match_directory) {
+			result.push_back(concat);
+		}
+		if (is_directory) {
+			RecursiveGlobDirectories(fs, concat, result, match_directory, true);
+		}
+	});
+}
 static void GlobFilesInternal(FileSystem &fs, const string &path, const string &glob, bool match_directory,
                               vector<string> &result, bool join_path) {
@@ -933,6 +973,10 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
 		}
 	}
+	if (HasMultipleCrawl(splits)) {
+		throw IOException("Cannot use multiple \'**\' in one path");
+	}
 	for (idx_t i = absolute_path ? 1 : 0; i < splits.size(); i++) {
 		bool is_last_chunk = i + 1 == splits.size();
 		bool has_glob = HasGlob(splits[i]);
@@ -949,14 +993,27 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
 				}
 			}
 		} else {
-			if (previous_directories.empty()) {
-				// no previous directories: list in the current path
-				GlobFilesInternal(*this, ".", splits[i], !is_last_chunk, result, false);
+			if (IsCrawl(splits[i])) {
+				if (!is_last_chunk) {
+					result = previous_directories;
+				}
+				if (previous_directories.empty()) {
+					RecursiveGlobDirectories(*this, ".", result, !is_last_chunk, false);
+				} else {
+					for (auto &prev_dir : previous_directories) {
+						RecursiveGlobDirectories(*this, prev_dir, result, !is_last_chunk, true);
+					}
+				}
 			} else {
-				// previous directories
-				// we iterate over each of the previous directories, and apply the glob of the current directory
-				for (auto &prev_directory : previous_directories) {
-					GlobFilesInternal(*this, prev_directory, splits[i], !is_last_chunk, result, true);
+				if (previous_directories.empty()) {
+					// no previous directories: list in the current path
+					GlobFilesInternal(*this, ".", splits[i], !is_last_chunk, result, false);
+				} else {
+					// previous directories
+					// we iterate over each of the previous directories, and apply the glob of the current directory
+					for (auto &prev_directory : previous_directories) {
+						GlobFilesInternal(*this, prev_directory, splits[i], !is_last_chunk, result, true);
+					}
 				}
 			}
 		}

package/src/duckdb/src/common/types/column_data_collection_segment.cpp CHANGED

@@ -169,11 +169,8 @@ idx_t ColumnDataCollectionSegment::ReadVectorInternal(ChunkManagementState &stat
 		if (type_size > 0) {
 			memcpy(target_data + current_offset * type_size, base_ptr, current_vdata.count * type_size);
 		}
-		// FIXME: use bitwise operations here
 		ValidityMask current_validity(validity_data);
-		for (idx_t k = 0; k < current_vdata.count; k++) {
-			target_validity.Set(current_offset + k, current_validity.RowIsValid(k));
-		}
+		target_validity.SliceInPlace(current_validity, current_offset, 0, current_vdata.count);
 		current_offset += current_vdata.count;
 		next_index = current_vdata.next_data;
 	}

package/src/duckdb/src/common/types/validity_mask.cpp CHANGED

@@ -68,24 +68,41 @@ void ValidityMask::Resize(idx_t old_size, idx_t new_size) {
 	}
 }
-void ValidityMask::Slice(const ValidityMask &other, idx_t offset, idx_t end) {
+void ValidityMask::Slice(const ValidityMask &other, idx_t source_offset, idx_t count) {
 	if (other.AllValid()) {
 		validity_mask = nullptr;
 		validity_data.reset();
 		return;
 	}
-	if (offset == 0) {
+	if (source_offset == 0) {
 		Initialize(other);
 		return;
 	}
-	ValidityMask new_mask(end - offset);
+	ValidityMask new_mask(count);
+	new_mask.SliceInPlace(other, 0, source_offset, count);
+	Initialize(new_mask);
+}
-// FIXME THIS NEEDS FIXING!
+bool ValidityMask::IsAligned(idx_t count) {
+	return count % BITS_PER_VALUE == 0;
+}
+void ValidityMask::SliceInPlace(const ValidityMask &other, idx_t target_offset, idx_t source_offset, idx_t count) {
+	if (IsAligned(source_offset) && IsAligned(target_offset)) {
+		auto target_validity = GetData();
+		auto source_validity = other.GetData();
+		auto source_offset_entries = EntryCount(source_offset);
+		auto target_offset_entries = EntryCount(target_offset);
+		memcpy(target_validity + target_offset_entries, source_validity + source_offset_entries,
+		       sizeof(validity_t) * EntryCount(count));
+		return;
+	}
+	// FIXME: use bitwise operations here
 #if 1
-	for (idx_t i = offset; i < end; i++) {
-		new_mask.Set(i - offset, other.RowIsValid(i));
+	for (idx_t i = 0; i < count; i++) {
+		Set(target_offset + i, other.RowIsValid(source_offset + i));
 	}
-	Initialize(new_mask);
 #else
 	// first shift the "whole" units
 	idx_t entire_units = offset / BITS_PER_VALUE;

package/src/duckdb/src/common/types/vector.cpp CHANGED

@@ -136,17 +136,13 @@ void Vector::Slice(Vector &other, idx_t offset, idx_t end) {
 		for (idx_t i = 0; i < entries.size(); i++) {
 			entries[i]->Slice(*other_entries[i], offset, end);
 		}
-		if (offset > 0) {
-			new_vector.validity.Slice(other.validity, offset, end);
-		} else {
-			new_vector.validity = other.validity;
-		}
+		new_vector.validity.Slice(other.validity, offset, end - offset);
 		Reference(new_vector);
 	} else {
 		Reference(other);
 		if (offset > 0) {
 			data = data + GetTypeIdSize(internal_type) * offset;
-			validity.Slice(other.validity, offset, end);
+			validity.Slice(other.validity, offset, end - offset);
 		}
 	}
 }

package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp CHANGED

@@ -53,10 +53,13 @@ RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p
 // Sink
 //===--------------------------------------------------------------------===//
 class RadixHTGlobalState : public GlobalSinkState {
+	constexpr const static idx_t MAX_RADIX_PARTITIONS = 32;
 public:
 	explicit RadixHTGlobalState(ClientContext &context)
-	    : is_empty(true), multi_scan(true), total_groups(0),
-	      partition_info((idx_t)TaskScheduler::GetScheduler(context).NumberOfThreads()) {
+	    : is_empty(true), multi_scan(true), partitioned(false),
+	      partition_info(
+	          MinValue<idx_t>(MAX_RADIX_PARTITIONS, TaskScheduler::GetScheduler(context).NumberOfThreads())) {
 	}
 	vector<unique_ptr<PartitionableHashTable>> intermediate_hts;
@@ -68,8 +71,8 @@ public:
 	bool multi_scan;
 	//! The lock for updating the global aggregate state
 	mutex lock;
-	//! a counter to determine if we should switch over to partitioning
-	atomic<idx_t> total_groups;
+	//! Whether or not any thread has crossed the partitioning threshold
+	atomic<bool> partitioned;
 	bool is_finalized = false;
 	bool is_partitioned = false;
@@ -79,7 +82,7 @@ public:
 class RadixHTLocalState : public LocalSinkState {
 public:
-	explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : is_empty(true) {
+	explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : total_groups(0), is_empty(true) {
 		// if there are no groups we create a fake group so everything has the same group
 		group_chunk.InitializeEmpty(ht.group_types);
 		if (ht.grouping_set.empty()) {
@@ -90,6 +93,8 @@ public:
 	DataChunk group_chunk;
 	//! The aggregate HT
 	unique_ptr<PartitionableHashTable> ht;
+	//! The total number of groups found by this thread
+	idx_t total_groups;
 	//! Whether or not any tuples were added to the HT
 	bool is_empty;
@@ -146,7 +151,7 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
 		}
 		D_ASSERT(gstate.finalized_hts.size() == 1);
 		D_ASSERT(gstate.finalized_hts[0]);
-		gstate.total_groups += gstate.finalized_hts[0]->AddChunk(group_chunk, payload_input, filter);
+		llstate.total_groups += gstate.finalized_hts[0]->AddChunk(group_chunk, payload_input, filter);
 		return;
 	}
@@ -160,9 +165,11 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
 		                                        group_types, op.payload_types, op.bindings);
 	}
-	gstate.total_groups +=
-	    llstate.ht->AddChunk(group_chunk, payload_input,
-	                         gstate.total_groups > radix_limit && gstate.partition_info.n_partitions > 1, filter);
+	llstate.total_groups += llstate.ht->AddChunk(group_chunk, payload_input,
+	                                             gstate.partitioned && gstate.partition_info.n_partitions > 1, filter);
+	if (llstate.total_groups >= radix_limit) {
+		gstate.partitioned = true;
+	}
 }
 void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &state,
@@ -183,7 +190,7 @@ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkSta
 		return; // no data
 	}
-	if (!llstate.ht->IsPartitioned() && gstate.partition_info.n_partitions > 1 && gstate.total_groups > radix_limit) {
+	if (!llstate.ht->IsPartitioned() && gstate.partition_info.n_partitions > 1 && gstate.partitioned) {
 		llstate.ht->Partition();
 	}

package/src/duckdb/src/function/cast_rules.cpp CHANGED

@@ -207,6 +207,15 @@ int64_t CastRules::ImplicitCast(const LogicalType &from, const LogicalType &to)
 		// if aliases are different, an implicit cast is not possible
 		return -1;
 	}
+	if (from.id() == LogicalTypeId::LIST && to.id() == LogicalTypeId::LIST) {
+		// Lists can be cast if their child types can be cast
+		auto child_cost = ImplicitCast(ListType::GetChildType(from), ListType::GetChildType(to));
+		if (child_cost >= 100) {
+			// subtract one from the cost because we prefer LIST[X] -> LIST[VARCHAR] over LIST[X] -> VARCHAR
+			child_cost--;
+		}
+		return child_cost;
+	}
 	if (from.id() == to.id()) {
 		// arguments match: do nothing
 		return 0;
@@ -219,10 +228,6 @@ int64_t CastRules::ImplicitCast(const LogicalType &from, const LogicalType &to)
 		// everything can be cast to VARCHAR, but this cast has a high cost
 		return TargetTypeCost(to);
 	}
-	if (from.id() == LogicalTypeId::LIST && to.id() == LogicalTypeId::LIST) {
-		// Lists can be cast if their child types can be cast
-		return ImplicitCast(ListType::GetChildType(from), ListType::GetChildType(to));
-	}
 	if (from.id() == LogicalTypeId::UNION && to.id() == LogicalTypeId::UNION) {
 		// Unions can be cast if the source tags are a subset of the target tags

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev1138"
+#define DUCKDB_VERSION "0.7.2-dev1188"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "41104b611e"
+#define DUCKDB_SOURCE_ID "d1518bdfe8"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp CHANGED

@@ -323,9 +323,12 @@ public:
 public:
 	DUCKDB_API void Resize(idx_t old_size, idx_t new_size);
-	DUCKDB_API void Slice(const ValidityMask &other, idx_t offset, idx_t end);
+	DUCKDB_API void SliceInPlace(const ValidityMask &other, idx_t target_offset, idx_t source_offset, idx_t count);
+	DUCKDB_API void Slice(const ValidityMask &other, idx_t source_offset, idx_t count);
 	DUCKDB_API void Combine(const ValidityMask &other, idx_t count);
 	DUCKDB_API string ToString(idx_t count) const;
+	DUCKDB_API static bool IsAligned(idx_t count);
 };
 } // namespace duckdb