duckdb 0.8.2-dev3989.0 → 0.8.2-dev4126.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +8 -7
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +76 -74
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +35 -32
- package/src/duckdb/extension/json/include/json_scan.hpp +9 -6
- package/src/duckdb/extension/json/json_scan.cpp +124 -121
- package/src/duckdb/extension/parquet/parquet_extension.cpp +23 -13
- package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +5 -0
- package/src/duckdb/src/common/crypto/md5.cpp +2 -12
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +5 -1
- package/src/duckdb/src/core_functions/aggregate/holistic/mode.cpp +1 -1
- package/src/duckdb/src/core_functions/function_list.cpp +8 -0
- package/src/duckdb/src/core_functions/scalar/list/list_cosine_similarity.cpp +78 -0
- package/src/duckdb/src/core_functions/scalar/list/list_distance.cpp +72 -0
- package/src/duckdb/src/core_functions/scalar/list/list_inner_product.cpp +70 -0
- package/src/duckdb/src/core_functions/scalar/string/sha256.cpp +32 -0
- package/src/duckdb/src/execution/index/art/art.cpp +111 -92
- package/src/duckdb/src/execution/index/art/iterator.cpp +21 -27
- package/src/duckdb/src/execution/index/art/leaf.cpp +72 -153
- package/src/duckdb/src/execution/index/art/node.cpp +109 -203
- package/src/duckdb/src/execution/index/art/node16.cpp +32 -64
- package/src/duckdb/src/execution/index/art/node256.cpp +38 -53
- package/src/duckdb/src/execution/index/art/node4.cpp +31 -62
- package/src/duckdb/src/execution/index/art/node48.cpp +43 -65
- package/src/duckdb/src/execution/index/art/prefix.cpp +70 -141
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +345 -0
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +74 -0
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +1 -1
- package/src/duckdb/src/function/scalar/string/suffix.cpp +1 -1
- package/src/duckdb/src/function/table/system/duckdb_columns.cpp +3 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +1 -1
- package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +51 -0
- package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +9 -0
- package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +17 -7
- package/src/duckdb/src/include/duckdb/execution/index/art/iterator.hpp +5 -5
- package/src/duckdb/src/include/duckdb/execution/index/art/leaf.hpp +10 -16
- package/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +38 -116
- package/src/duckdb/src/include/duckdb/execution/index/art/node16.hpp +17 -18
- package/src/duckdb/src/include/duckdb/execution/index/art/node256.hpp +17 -23
- package/src/duckdb/src/include/duckdb/execution/index/art/node4.hpp +17 -18
- package/src/duckdb/src/include/duckdb/execution/index/art/node48.hpp +17 -24
- package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +16 -22
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +126 -0
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +79 -0
- package/src/duckdb/src/include/duckdb/execution/index/index_pointer.hpp +96 -0
- package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/index.hpp +10 -8
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_writer.hpp +3 -0
- package/src/duckdb/src/main/extension/extension_helper.cpp +15 -1
- package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +14 -5
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +2 -3
- package/src/duckdb/src/storage/checkpoint_manager.cpp +16 -21
- package/src/duckdb/src/storage/data_table.cpp +3 -3
- package/src/duckdb/src/storage/index.cpp +7 -1
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +21 -21
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +10 -16
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table_index_list.cpp +1 -1
- package/src/duckdb/src/transaction/commit_state.cpp +5 -1
- package/src/duckdb/third_party/mbedtls/include/mbedtls_wrapper.hpp +4 -1
- package/src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp +24 -2
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +5 -5
- package/src/duckdb/ub_src_core_functions_scalar_list.cpp +6 -0
- package/src/duckdb/ub_src_core_functions_scalar_string.cpp +2 -0
- package/src/duckdb/ub_src_execution_index.cpp +4 -0
- package/src/duckdb/ub_src_execution_index_art.cpp +0 -2
- package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +0 -238
- package/src/duckdb/src/include/duckdb/execution/index/art/fixed_size_allocator.hpp +0 -115
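Highlights of this range: the ART index's fixed-size allocator moves from src/execution/index/art/ into a shared src/execution/index/ home (joined by new fixed_size_buffer and index_pointer files), the JSON and Parquet scanners get parallel-scan reworks (shown in the hunks below), and four new scalar functions land: list_cosine_similarity, list_distance, list_inner_product, and sha256. A minimal sketch of exercising the new functions through the embedded C++ API; the function names come from the file list above, while their availability and exact semantics in this dev build are assumptions:

// Sketch: querying the new scalar functions through the embedded C++ API.
// Assumes this package version is built and linked; duckdb.hpp ships in the tree.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	// New list similarity / distance functions (names per the file list above)
	con.Query("SELECT list_cosine_similarity([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) AS cos_sim, "
	          "       list_distance([1.0, 2.0], [4.0, 6.0]) AS dist, "
	          "       list_inner_product([1.0, 2.0], [3.0, 4.0]) AS dot")
	    ->Print();

	// New sha256 scalar function
	con.Query("SELECT sha256('duckdb') AS digest")->Print();
	return 0;
}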
package/src/duckdb/extension/json/json_scan.cpp

@@ -2,11 +2,11 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 namespace duckdb {
 
@@ -39,16 +39,16 @@ void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
 			maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
 		} else if (loption == "format") {
 			auto arg = StringUtil::Lower(StringValue::Get(kv.second));
-			static const auto format_options =
+			static const auto FORMAT_OPTIONS =
 			    case_insensitive_map_t<JSONFormat> {{"auto", JSONFormat::AUTO_DETECT},
 			                                        {"unstructured", JSONFormat::UNSTRUCTURED},
 			                                        {"newline_delimited", JSONFormat::NEWLINE_DELIMITED},
 			                                        {"nd", JSONFormat::NEWLINE_DELIMITED},
 			                                        {"array", JSONFormat::ARRAY}};
-			auto lookup = format_options.find(arg);
-			if (lookup == format_options.end()) {
+			auto lookup = FORMAT_OPTIONS.find(arg);
+			if (lookup == FORMAT_OPTIONS.end()) {
 				vector<string> valid_options;
-				for (auto &pair : format_options) {
+				for (auto &pair : FORMAT_OPTIONS) {
 					valid_options.push_back(StringUtil::Format("'%s'", pair.first));
 				}
 				throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "),
@@ -198,7 +198,8 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data_p)
     : bind_data(bind_data_p), transform_options(bind_data.transform_options),
       allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
       buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
-      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
+      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
+      enable_parallel_scans(bind_data.files.size() < system_threads) {
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -275,7 +276,7 @@ idx_t JSONGlobalTableFunctionState::MaxThreads() const {
 		return state.system_threads;
 	}
 
-	if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+	if (!state.json_readers.empty() && state.json_readers[0]->HasFileHandle()) {
 		auto &reader = *state.json_readers[0];
 		if (reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { // Auto-detected NDJSON
 			return state.system_threads;
@@ -291,7 +292,7 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
 }
 
 unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
-                                                                      TableFunctionInitInput &input,
+                                                                      TableFunctionInitInput &,
                                                                       GlobalTableFunctionState *global_state) {
 	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
 	auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);
@@ -318,19 +319,24 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
 	allocator.Reset();
-
 	scan_count = 0;
-	if (buffer_offset == buffer_size) {
-		if (!ReadNextBuffer(gstate)) {
-			return scan_count;
-		}
-		if (current_buffer_handle->buffer_index != 0 &&
-		    current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-			ReconstructFirstObject(gstate);
-			scan_count++;
+
+	// We have to wrap this in a loop otherwise we stop scanning too early when there's an empty JSON file
+	while (scan_count == 0) {
+		if (buffer_offset == buffer_size) {
+			if (!ReadNextBuffer(gstate)) {
+				break;
+			}
+			D_ASSERT(buffer_size != 0);
+			if (current_buffer_handle->buffer_index != 0 &&
+			    current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+				ReconstructFirstObject();
+				scan_count++;
+			}
 		}
+
+		ParseNextChunk();
 	}
-	ParseNextChunk();
 
 	return scan_count;
 }
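The rewritten ReadNext above wraps buffer handling and parsing in a while (scan_count == 0) loop, per the new in-code comment: previously a single pass could yield zero rows for an empty JSON file and end the scan too early. A standalone sketch of the retry pattern, with illustrative names only:

// Illustrative retry loop: keep pulling buffers until a chunk yields rows or the
// input is exhausted, so a zero-row (empty) file cannot end the scan early.
#include <cstddef>
#include <vector>

struct Source {
	std::vector<std::size_t> chunk_sizes; // rows produced per parsed chunk
	std::size_t next = 0;
	bool NextBuffer() { return next < chunk_sizes.size(); }
	std::size_t ParseChunk() { return chunk_sizes[next++]; }
};

static std::size_t ReadNext(Source &src) {
	std::size_t scan_count = 0;
	while (scan_count == 0) { // re-enter on empty chunks (e.g., an empty file)
		if (!src.NextBuffer()) {
			break; // input exhausted
		}
		scan_count = src.ParseChunk();
	}
	return scan_count;
}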
@@ -349,7 +355,7 @@ static inline const char *PreviousNewline(const char *ptr) {
 	return ptr;
 }
 
-static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+static inline const char *NextJSONDefault(const char *ptr, const char *const end) {
 	idx_t parents = 0;
 	while (ptr != end) {
 		switch (*ptr++) {
@@ -393,7 +399,7 @@ static inline const char *NextJSON(const char *ptr, const idx_t size) {
 	case '{':
 	case '[':
 	case '"':
-		ptr = NextJSONDefault(ptr, size, end);
+		ptr = NextJSONDefault(ptr, end);
 		break;
 	default:
 		// Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
@@ -482,18 +488,21 @@ void JSONScanLocalState::ThrowInvalidAtEndError() {
 	throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
 }
 
-bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
-	if (bind_data.files.size() >= gstate.system_threads) {
-		// More files than threads, just parallelize over the files
-		return false;
+void JSONScanLocalState::TryIncrementFileIndex(JSONScanGlobalState &gstate) const {
+	lock_guard<mutex> guard(gstate.lock);
+	if (gstate.file_index < gstate.json_readers.size() &&
+	    current_reader.get() == gstate.json_readers[gstate.file_index].get()) {
+		gstate.file_index++;
 	}
+}
 
-	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
-		// NDJSON can be read in parallel
-		return true;
+bool JSONScanLocalState::IsParallel(JSONScanGlobalState &gstate) const {
+	if (bind_data.files.size() >= gstate.system_threads) {
+		return false; // More files than threads, just parallelize over the files
 	}
 
-	return false;
+	// NDJSON can be read in parallel
+	return current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED;
 }
 
 static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
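Two helpers are factored out here. TryIncrementFileIndex advances the shared file cursor only when this thread's reader is still the one at gstate.file_index, and IsParallel gates intra-file parallelism: with at least as many files as threads, each thread takes a whole file; otherwise threads may share one newline-delimited (NDJSON) file, the only format that splits safely at newlines. The decision in isolation (illustrative names, not the extension's API):

// Illustrative decision for intra-file parallelism, mirroring the logic above:
// share a file between threads only when files are scarce and the format is NDJSON.
#include <cstddef>

enum class Format { NewlineDelimited, Array, Unstructured };

static bool CanParallelizeWithinFile(std::size_t file_count, std::size_t thread_count, Format format) {
	if (file_count >= thread_count) {
		return false; // enough files to keep every thread busy on its own file
	}
	return format == Format::NewlineDelimited; // only NDJSON splits safely at newlines
}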
@@ -578,104 +587,107 @@ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
 }
 
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+	// First we make sure we have a buffer to read into
 	AllocatedData buffer;
+
+	// Try to re-use a buffer that was used before
 	if (current_reader) {
-		// Keep track of this for accurate errors
 		current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
-
-		// Try to re-use existing buffer
 		if (current_buffer_handle && --current_buffer_handle->readers == 0) {
 			buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
-		} else {
-			buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 		}
+	}
 
-		if (!is_last) {
-			if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
-				memcpy(buffer.get(), reconstruct_buffer.get(),
-				       prev_buffer_remainder); // Copy last bit of previous buffer
-			}
-		} else {
-			if (gstate.bind_data.type != JSONScanType::SAMPLE) {
-				current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
-			}
-			current_reader = nullptr;
-		}
-	} else {
+	// If we cannot re-use a buffer we create a new one
+	if (!buffer.IsSet()) {
 		buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 	}
+
 	buffer_ptr = char_ptr_cast(buffer.get());
 
-	idx_t buffer_index;
+	// Copy last bit of previous buffer
+	if (current_reader && current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED && !is_last) {
+		memcpy(buffer_ptr, reconstruct_buffer.get(), prev_buffer_remainder);
+	}
+
+	optional_idx buffer_index;
 	while (true) {
+		// Now we finish the current reader
 		if (current_reader) {
-			ReadNextBufferInternal(gstate, buffer_index);
-			if (buffer_size == 0) {
-				if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+			// If we performed the final read of this reader in the previous iteration, close it now
+			if (is_last) {
+				if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+					TryIncrementFileIndex(gstate);
 					current_reader->CloseJSONFile();
 				}
-				if (IsParallel(gstate)) {
-					// If this threads' current reader is still the one at gstate.file_index,
-					// this thread can end the parallel scan
-					lock_guard<mutex> guard(gstate.lock);
-					if (gstate.file_index < gstate.json_readers.size() &&
-					    current_reader == gstate.json_readers[gstate.file_index].get()) {
-						gstate.file_index++; // End parallel scan
-					}
-				}
 				current_reader = nullptr;
+				continue;
+			}
+
+			// Try to read
+			ReadNextBufferInternal(gstate, buffer_index);
+
+			// If this is the last read, end the parallel scan now so threads can move on
+			if (is_last && IsParallel(gstate)) {
+				TryIncrementFileIndex(gstate);
+			}
+
+			if (buffer_size == 0) {
+				// We didn't read anything, re-enter the loop
+				continue;
 			} else {
-				break; // We read something!
+				// We read something!
+				break;
 			}
 		}
 
-		// This thread needs a new reader
+		// If we got here, we don't have a reader (anymore). Try to get one
+		is_last = false;
 		{
 			lock_guard<mutex> guard(gstate.lock);
 			if (gstate.file_index == gstate.json_readers.size()) {
				return false; // No more files left
 			}
 
-			// Try the next reader
+			// Assign the next reader to this thread
 			current_reader = gstate.json_readers[gstate.file_index].get();
-			if (current_reader->IsOpen()) {
-				// Can only be open from auto detection, so these should be known
-				if (!IsParallel(gstate)) {
-					batch_index = gstate.batch_index++;
-					gstate.file_index++;
-				}
-				continue; // Re-enter the loop to start scanning the assigned file
-			}
 
-			current_reader->OpenJSONFile();
-			batch_index = gstate.batch_index++;
-			if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
-				if (!IsParallel(gstate)) {
-					gstate.file_index++;
-				}
-				continue;
+			// Open the file if it is not yet open
+			if (!current_reader->IsOpen()) {
+				current_reader->OpenJSONFile();
 			}
+			batch_index = gstate.batch_index++;
 
-			// Auto-detect format / record type
-			if (bind_data.files.size() < gstate.system_threads) {
-				// Auto-detect within the lock, so threads may join a parallel NDJSON scan
-				if (ReadAndAutoDetect(gstate, buffer_index, false)) {
-					continue;
+			// Auto-detect format / record type
+			if (gstate.enable_parallel_scans) {
+				// Auto-detect within the lock, so threads may join a parallel NDJSON scan
+				if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+					ReadAndAutoDetect(gstate, buffer_index);
 				}
-				break;
+			} else {
+				gstate.file_index++; // Increment the file index before dropping lock so other threads move on
 			}
+		}
 
-			// More files than threads: increment the file index, then auto-detect outside of the lock
-			gstate.file_index++;
+		// If we didn't auto-detect within the lock, do it now
+		if (current_reader->GetFormat() == JSONFormat::AUTO_DETECT) {
+			ReadAndAutoDetect(gstate, buffer_index);
 		}
 
-		// Auto-detect outside of the lock
-		if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+		// If we haven't already, increment the file index if non-parallel scan
+		if (gstate.enable_parallel_scans && !IsParallel(gstate)) {
+			TryIncrementFileIndex(gstate);
+		}
+
+		if (!buffer_index.IsValid() || buffer_size == 0) {
+			// If we didn't get a buffer index (because not auto-detecting), or the file was empty, just re-enter loop
 			continue;
 		}
+
 		break;
 	}
 	D_ASSERT(buffer_size != 0); // We should have read something if we got here
+	D_ASSERT(buffer_index.IsValid());
 
 	idx_t readers = 1;
 	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
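Note the type change threaded through this function: the buffer index becomes an optional_idx (the header duckdb/common/optional_idx.hpp is also touched above), so "no buffer assigned yet" is explicit and checkable via IsValid() rather than tracked out of band. A minimal sketch of the idea, not DuckDB's actual implementation:

// Minimal optional-index sketch: the invalid sentinel is encapsulated so callers
// must check IsValid() before GetIndex(), instead of passing magic values around.
#include <cassert>
#include <cstdint>

class OptionalIdx {
	static constexpr uint64_t INVALID = UINT64_MAX;
	uint64_t index = INVALID;

public:
	OptionalIdx() = default;
	explicit OptionalIdx(uint64_t idx) : index(idx) {}
	bool IsValid() const { return index != INVALID; }
	uint64_t GetIndex() const {
		assert(IsValid()); // mirrors D_ASSERT(buffer_index.IsValid()) above
		return index;
	}
};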
@@ -683,9 +695,10 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	}
 
 	// Create an entry and insert it into the map
-	auto json_buffer_handle = make_uniq<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
+	auto json_buffer_handle =
+	    make_uniq<JSONBufferHandle>(buffer_index.GetIndex(), readers, std::move(buffer), buffer_size);
 	current_buffer_handle = json_buffer_handle.get();
-	current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
+	current_reader->InsertBuffer(buffer_index.GetIndex(), std::move(json_buffer_handle));
 
 	prev_buffer_remainder = 0;
 	lines_or_objects_in_buffer = 0;
@@ -696,15 +709,11 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	return true;
 }
 
-bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
-                                           const bool already_incremented_file_idx) {
+void JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	// We have to detect the JSON format - hold the gstate lock while we do this
 	ReadNextBufferInternal(gstate, buffer_index);
 	if (buffer_size == 0) {
-		if (!already_incremented_file_idx) {
-			gstate.file_index++; // Empty file, move to the next one
-		}
-		return true;
+		return;
 	}
 
 	auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
@@ -721,13 +730,9 @@ bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
 		throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
 		                            current_reader->GetFileName());
 	}
-	if (!already_incremented_file_idx && !IsParallel(gstate)) {
-		gstate.file_index++;
-	}
-	return false;
 }
 
-void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	if (current_reader->GetFileHandle().CanSeek()) {
 		ReadNextBufferSeek(gstate, buffer_index);
 	} else {
@@ -735,12 +740,12 @@ void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 	}
 
 	buffer_offset = 0;
-	if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+	if (buffer_index.GetIndex() == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
 		SkipOverArrayStart();
 	}
 }
 
-void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	auto &file_handle = current_reader->GetFileHandle();
 
 	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
@@ -758,13 +763,13 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 			ThrowInvalidAtEndError();
 		}
 
-		if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+		if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 			batch_index = gstate.batch_index++;
 		}
 	}
 	buffer_size = prev_buffer_remainder + read_size;
 	if (buffer_size == 0) {
-		current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+		current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
 		return;
 	}
 
@@ -773,33 +778,33 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 	                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
 }
 
-void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, optional_idx &buffer_index) {
 	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
 	idx_t read_size;
 	{
 		lock_guard<mutex> reader_guard(current_reader->lock);
 		buffer_index = current_reader->GetBufferIndex();
 
-		if (current_reader->IsOpen()) {
+		if (current_reader->HasFileHandle() && current_reader->IsOpen()) {
 			read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
 			                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
 			is_last = read_size < request_size;
 		} else {
 			read_size = 0;
-			is_last =
+			is_last = true;
 		}
 
 		if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
 			ThrowInvalidAtEndError();
 		}
 
-		if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
+		if (read_size != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 			batch_index = gstate.batch_index++;
 		}
 	}
 	buffer_size = prev_buffer_remainder + read_size;
 	if (buffer_size == 0) {
-		current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
+		current_reader->SetBufferLineOrObjectCount(buffer_index.GetIndex(), 0);
 		return;
 	}
 }
@@ -833,7 +838,7 @@ void JSONScanLocalState::SkipOverArrayStart() {
 	}
 }
 
-void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
+void JSONScanLocalState::ReconstructFirstObject() {
 	D_ASSERT(current_buffer_handle->buffer_index != 0);
 	D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);
 
@@ -947,8 +952,7 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
 	current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }
 
-double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
-                              const GlobalTableFunctionState *global_state) {
+double JSONScan::ScanProgress(ClientContext &, const FunctionData *, const GlobalTableFunctionState *global_state) {
 	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
 	double progress = 0;
 	for (auto &reader : gstate.json_readers) {
@@ -957,16 +961,16 @@ double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
 	return progress / double(gstate.json_readers.size());
 }
 
-idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
-                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+idx_t JSONScan::GetBatchIndex(ClientContext &, const FunctionData *, LocalTableFunctionState *local_state,
+                              GlobalTableFunctionState *) {
 	auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
 	return lstate.GetBatchIndex();
 }
 
-unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &, const FunctionData *bind_data) {
 	auto &data = bind_data->Cast<JSONScanData>();
 	idx_t per_file_cardinality;
-	if (data.initial_reader && data.initial_reader->IsOpen()) {
+	if (data.initial_reader && data.initial_reader->HasFileHandle()) {
 		per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
 	} else {
 		per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
@@ -984,25 +988,24 @@ void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
 	}
 }
 
-void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &) {
 	auto &bind_data = bind_data_p->Cast<JSONScanData>();
 	bind_data.Serialize(writer);
 }
 
-unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader,
-                                               TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::Deserialize(PlanDeserializationState &state, FieldReader &reader, TableFunction &) {
 	auto result = make_uniq<JSONScanData>();
 	result->Deserialize(state.context, reader);
 	return std::move(result);
 }
 
 void JSONScan::FormatSerialize(FormatSerializer &serializer, const optional_ptr<FunctionData> bind_data_p,
-                               const TableFunction &function) {
+                               const TableFunction &) {
 	auto &bind_data = bind_data_p->Cast<JSONScanData>();
 	serializer.WriteProperty(100, "scan_data", &bind_data);
 }
 
-unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &function) {
+unique_ptr<FunctionData> JSONScan::FormatDeserialize(FormatDeserializer &deserializer, TableFunction &) {
 	unique_ptr<JSONScanData> result;
 	deserializer.ReadProperty(100, "scan_data", result);
 	return std::move(result);
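The signature churn in the JSON scan's serialization and statistics entry points above follows one pattern: parameters that are never read lose their names, the standard C++ way to suppress unused-parameter warnings while keeping the signature intact. In isolation (hypothetical stand-in types, not DuckDB's):

// Before: 'function' is named but unused, so -Wunused-parameter fires on it.
// After: unnamed parameter, same signature and override contract, no warning.
struct TableFunctionStub {}; // illustrative type

static int BatchIndexBefore(TableFunctionStub &function, int local) {
	return local;
}

static int BatchIndexAfter(TableFunctionStub &, int local) {
	return local;
}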
package/src/duckdb/extension/parquet/parquet_extension.cpp

@@ -21,6 +21,8 @@
 #include "duckdb/common/field_writer.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/format_deserializer.hpp"
+#include "duckdb/common/serializer/format_serializer.hpp"
 #include "duckdb/common/types/chunk_collection.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/function/table_function.hpp"
@@ -35,8 +37,6 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/storage/statistics/base_statistics.hpp"
 #include "duckdb/storage/table/row_group.hpp"
-#include "duckdb/common/serializer/format_serializer.hpp"
-#include "duckdb/common/serializer/format_deserializer.hpp"
 
 #endif
 
@@ -78,6 +78,8 @@ struct ParquetReadLocalState : public LocalTableFunctionState {
 	DataChunk all_columns;
 };
 
+enum class ParquetFileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };
+
 struct ParquetReadGlobalState : public GlobalTableFunctionState {
 	mutex lock;
 
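ParquetFileState replaces the old boolean "file is being opened" flag (see the following hunks): each file now carries an explicit lifecycle state, so threads can tell apart "nobody claimed this file yet", "another thread is opening it", "ready to scan", and "fully scanned and released". A sketch of the implied transitions (illustrative helper, not the extension's code):

// Lifecycle sketch: only forward transitions are legal.
#include <cstdint>

enum class ParquetFileState : uint8_t { UNOPENED, OPENING, OPEN, CLOSED };

static bool IsLegalTransition(ParquetFileState from, ParquetFileState to) {
	switch (from) {
	case ParquetFileState::UNOPENED:
		return to == ParquetFileState::OPENING; // a thread claims the file
	case ParquetFileState::OPENING:
		return to == ParquetFileState::OPEN;    // reader constructed successfully
	case ParquetFileState::OPEN:
		return to == ParquetFileState::CLOSED;  // all row groups handed out
	case ParquetFileState::CLOSED:
		return false;
	}
	return false;
}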
@@ -86,7 +88,7 @@ struct ParquetReadGlobalState : public GlobalTableFunctionState {
 	//! Currently opened readers
 	vector<shared_ptr<ParquetReader>> readers;
 	//! Flag to indicate a file is being opened
-	vector<bool> file_opening;
+	vector<ParquetFileState> file_states;
 	//! Mutexes to wait for a file that is currently being opened
 	unique_ptr<mutex[]> file_mutexes;
 	//! Signal to other threads that a file failed to open, letting every thread abort.
@@ -359,7 +361,7 @@ public:
 		auto &bind_data = input.bind_data->CastNoConst<ParquetReadBindData>();
 		auto result = make_uniq<ParquetReadGlobalState>();
 
-		result->file_opening = vector<bool>(bind_data.files.size(), false);
+		result->file_states = vector<ParquetFileState>(bind_data.files.size(), ParquetFileState::UNOPENED);
 		result->file_mutexes = unique_ptr<mutex[]>(new mutex[bind_data.files.size()]);
 		if (bind_data.files.empty()) {
 			result->initial_reader = nullptr;
@@ -367,6 +369,8 @@ public:
 			result->readers = std::move(bind_data.union_readers);
 			if (result->readers.size() != bind_data.files.size()) {
 				result->readers = vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
+			} else {
+				std::fill(result->file_states.begin(), result->file_states.end(), ParquetFileState::OPEN);
 			}
 			if (bind_data.initial_reader) {
 				result->initial_reader = std::move(bind_data.initial_reader);
@@ -378,6 +382,7 @@ public:
 				    make_shared<ParquetReader>(context, bind_data.files[0], bind_data.parquet_options);
 				result->readers[0] = result->initial_reader;
 			}
+			result->file_states[0] = ParquetFileState::OPEN;
 		}
 		for (auto &reader : result->readers) {
 			if (!reader) {
@@ -511,7 +516,7 @@ public:
 
 		D_ASSERT(parallel_state.initial_reader);
 
-		if (parallel_state.readers[parallel_state.file_index]) {
+		if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPEN) {
 			if (parallel_state.row_group_index <
 			    parallel_state.readers[parallel_state.file_index]->NumRowGroups()) {
 				// The current reader has rowgroups left to be scanned
@@ -523,12 +528,14 @@ public:
 				parallel_state.row_group_index++;
 				return true;
 			} else {
+				// Close current file
+				parallel_state.file_states[parallel_state.file_index] = ParquetFileState::CLOSED;
+				parallel_state.readers[parallel_state.file_index] = nullptr;
+
 				// Set state to the next file
 				parallel_state.file_index++;
 				parallel_state.row_group_index = 0;
 
-				parallel_state.readers[parallel_state.file_index - 1] = nullptr;
-
 				if (parallel_state.file_index >= bind_data.files.size()) {
 					return false;
 				}
@@ -541,8 +548,7 @@ public:
 		}
 
 		// Check if the current file is being opened, in that case we need to wait for it.
-		if (!parallel_state.readers[parallel_state.file_index] &&
-		    parallel_state.file_opening[parallel_state.file_index]) {
+		if (parallel_state.file_states[parallel_state.file_index] == ParquetFileState::OPENING) {
 			WaitForFile(parallel_state.file_index, parallel_state, parallel_lock);
 		}
 	}
@@ -573,7 +579,8 @@ public:
 			// - the thread opening the file has failed
 			// - the file was somehow scanned till the end while we were waiting
 			if (parallel_state.file_index >= parallel_state.readers.size() ||
-			    parallel_state.readers[parallel_state.file_index] || parallel_state.error_opening_file) {
+			    parallel_state.file_states[parallel_state.file_index] != ParquetFileState::OPENING ||
+			    parallel_state.error_opening_file) {
 				return;
 			}
 		}
@@ -583,10 +590,12 @@ public:
 	static bool TryOpenNextFile(ClientContext &context, const ParquetReadBindData &bind_data,
 	                            ParquetReadLocalState &scan_data, ParquetReadGlobalState &parallel_state,
 	                            unique_lock<mutex> &parallel_lock) {
-		for (idx_t i = parallel_state.file_index; i < bind_data.files.size(); i++) {
-			if (!parallel_state.readers[i] && !parallel_state.file_opening[i]) {
+		const auto num_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+		const auto file_index_limit = MinValue<idx_t>(parallel_state.file_index + num_threads, bind_data.files.size());
+		for (idx_t i = parallel_state.file_index; i < file_index_limit; i++) {
+			if (parallel_state.file_states[i] == ParquetFileState::UNOPENED) {
 				string file = bind_data.files[i];
-				parallel_state.file_opening[i] = true;
+				parallel_state.file_states[i] = ParquetFileState::OPENING;
 				auto pq_options = parallel_state.initial_reader->parquet_options;
 
 				// Now we switch which lock we are holding, instead of locking the global state, we grab the lock on
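TryOpenNextFile previously could probe every remaining file; it now bounds its look-ahead to file_index + num_threads, so at most roughly one speculatively opened reader per thread sits ahead of the scan front. The bound in isolation (illustrative):

// The look-ahead bound: with T threads and scan front f over N files, only
// files in [f, min(f + T, N)) may be opened early.
#include <algorithm>
#include <cstddef>

static std::size_t FileIndexLimit(std::size_t file_index, std::size_t num_threads, std::size_t num_files) {
	return std::min(file_index + num_threads, num_files);
}
// e.g. f = 3, T = 8, N = 100 -> only files 3..10 are eligible, not all 97 remaining.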
@@ -611,6 +620,7 @@ public:
 			// Now re-lock the state and add the reader
 			parallel_lock.lock();
 			parallel_state.readers[i] = reader;
+			parallel_state.file_states[i] = ParquetFileState::OPEN;
 
 			return true;
 		}
package/src/duckdb/src/common/crypto/md5.cpp

@@ -20,6 +20,7 @@
 * will fill a supplied 16-byte array with the digest.
 */
 #include "duckdb/common/crypto/md5.hpp"
+#include "mbedtls_wrapper.hpp"
 
 namespace duckdb {
 
@@ -236,21 +237,10 @@ void MD5Context::Finish(data_ptr_t out_digest) {
 	memcpy(out_digest, buf, 16);
 }
 
-void MD5Context::DigestToBase16(const_data_ptr_t digest, char *zbuf) {
-	static char const HEX_CODES[] = "0123456789abcdef";
-	int i, j;
-
-	for (j = i = 0; i < 16; i++) {
-		int a = digest[i];
-		zbuf[j++] = HEX_CODES[(a >> 4) & 0xf];
-		zbuf[j++] = HEX_CODES[a & 0xf];
-	}
-}
-
 void MD5Context::FinishHex(char *out_digest) {
 	data_t digest[MD5_HASH_LENGTH_BINARY];
 	Finish(digest);
-	DigestToBase16(digest, out_digest);
+	duckdb_mbedtls::MbedTlsWrapper::ToBase16(reinterpret_cast<char *>(digest), out_digest, MD5_HASH_LENGTH_BINARY);
 }
 
 string MD5Context::FinishHex() {
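The hand-rolled DigestToBase16 is dropped in favor of a hex encoder shared through the mbedtls wrapper (see the mbedtls_wrapper.hpp/.cpp entries in the file list), so MD5 and the new sha256 format digests the same way. For reference, a minimal stand-in with the removed function's behavior; the wrapper call's signature is as it appears in the diff:

// Minimal equivalent of the removed DigestToBase16: lowercase hex encoding of
// `len` bytes into a caller-provided buffer of at least 2 * len chars.
#include <cstddef>

static void ToBase16(const unsigned char *in, char *out, std::size_t len) {
	static const char HEX_CODES[] = "0123456789abcdef";
	for (std::size_t i = 0; i < len; i++) {
		out[2 * i] = HEX_CODES[(in[i] >> 4) & 0xF];
		out[2 * i + 1] = HEX_CODES[in[i] & 0xF];
	}
}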
package/src/duckdb/src/common/radix_partitioning.cpp

@@ -26,7 +26,7 @@ public:
 };
 
 template <class OP, class RETURN_TYPE, typename... ARGS>
-RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&...args) {
+RETURN_TYPE RadixBitsSwitch(idx_t radix_bits, ARGS &&... args) {
 	D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
 	switch (radix_bits) {
 	case 0: