duckdb 0.7.1-dev16.0 → 0.7.1-dev187.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/binding.gyp +7 -7
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +29 -5
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +5 -1
  5. package/src/duckdb/extension/json/include/json_scan.hpp +17 -2
  6. package/src/duckdb/extension/json/json_functions/json_transform.cpp +19 -0
  7. package/src/duckdb/extension/json/json_functions/read_json.cpp +30 -28
  8. package/src/duckdb/extension/json/json_functions.cpp +6 -0
  9. package/src/duckdb/extension/json/json_scan.cpp +111 -23
  10. package/src/duckdb/extension/parquet/parquet-extension.cpp +3 -2
  11. package/src/duckdb/src/common/enums/logical_operator_type.cpp +2 -0
  12. package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
  13. package/src/duckdb/src/common/enums/statement_type.cpp +2 -0
  14. package/src/duckdb/src/common/file_system.cpp +14 -0
  15. package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
  16. package/src/duckdb/src/common/operator/cast_operators.cpp +14 -8
  17. package/src/duckdb/src/common/printer.cpp +1 -1
  18. package/src/duckdb/src/common/types/time.cpp +1 -1
  19. package/src/duckdb/src/common/types/timestamp.cpp +35 -4
  20. package/src/duckdb/src/common/types.cpp +36 -10
  21. package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
  22. package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
  23. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +6 -11
  24. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +13 -13
  25. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +1 -1
  26. package/src/duckdb/src/execution/operator/schema/physical_detach.cpp +37 -0
  27. package/src/duckdb/src/execution/operator/schema/physical_drop.cpp +0 -5
  28. package/src/duckdb/src/execution/physical_plan/plan_simple.cpp +4 -0
  29. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -0
  30. package/src/duckdb/src/function/pragma/pragma_queries.cpp +36 -9
  31. package/src/duckdb/src/function/table/read_csv.cpp +15 -4
  32. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  33. package/src/duckdb/src/include/duckdb/common/enums/logical_operator_type.hpp +1 -0
  34. package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
  35. package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +2 -1
  36. package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
  37. package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -0
  38. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
  39. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
  40. package/src/duckdb/src/include/duckdb/common/types/timestamp.hpp +5 -1
  41. package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +1 -3
  42. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -2
  43. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
  44. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_detach.hpp +32 -0
  45. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
  46. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -3
  47. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_database_info.hpp +0 -4
  48. package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +32 -0
  49. package/src/duckdb/src/include/duckdb/parser/statement/detach_statement.hpp +29 -0
  50. package/src/duckdb/src/include/duckdb/parser/statement/list.hpp +1 -0
  51. package/src/duckdb/src/include/duckdb/parser/tokens.hpp +1 -0
  52. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +1 -0
  53. package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -0
  54. package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
  55. package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
  56. package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +7 -0
  57. package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -0
  58. package/src/duckdb/src/main/client_context.cpp +2 -0
  59. package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
  60. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
  61. package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
  62. package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
  63. package/src/duckdb/src/parser/statement/detach_statement.cpp +15 -0
  64. package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
  65. package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
  66. package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
  67. package/src/duckdb/src/parser/transform/statement/transform_create_database.cpp +0 -1
  68. package/src/duckdb/src/parser/transform/statement/transform_detach.cpp +19 -0
  69. package/src/duckdb/src/parser/transformer.cpp +2 -0
  70. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +6 -3
  71. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +16 -14
  72. package/src/duckdb/src/planner/binder/statement/bind_detach.cpp +19 -0
  73. package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +29 -4
  74. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
  75. package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +2 -1
  76. package/src/duckdb/src/planner/binder.cpp +2 -0
  77. package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +21 -5
  78. package/src/duckdb/src/planner/logical_operator.cpp +4 -0
  79. package/src/duckdb/src/planner/planner.cpp +1 -0
  80. package/src/duckdb/src/storage/storage_info.cpp +2 -1
  81. package/src/duckdb/src/storage/table/column_data.cpp +4 -2
  82. package/src/duckdb/src/storage/table/update_segment.cpp +15 -0
  83. package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
  84. package/src/duckdb/third_party/libpg_query/include/nodes/nodes.hpp +1 -0
  85. package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +14 -0
  86. package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +530 -1006
  87. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +17659 -17626
  88. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
  89. package/src/duckdb/ub_src_execution_operator_schema.cpp +2 -0
  90. package/src/duckdb/ub_src_parser_statement.cpp +2 -0
  91. package/src/duckdb/ub_src_parser_transform_statement.cpp +2 -0
  92. package/src/duckdb/ub_src_planner_binder_statement.cpp +2 -0
  93. package/src/duckdb/src/include/duckdb/function/create_database_extension.hpp +0 -37
package/binding.gyp CHANGED
@@ -222,16 +222,16 @@
   "src/duckdb/third_party/zstd/compress/zstd_lazy.cpp",
   "src/duckdb/third_party/zstd/compress/zstd_ldm.cpp",
   "src/duckdb/third_party/zstd/compress/zstd_opt.cpp",
-  "src/duckdb/extension/icu/./icu-timezone.cpp",
-  "src/duckdb/extension/icu/./icu-makedate.cpp",
-  "src/duckdb/extension/icu/./icu-datepart.cpp",
-  "src/duckdb/extension/icu/./icu-datesub.cpp",
+  "src/duckdb/extension/icu/./icu-dateadd.cpp",
   "src/duckdb/extension/icu/./icu-datetrunc.cpp",
-  "src/duckdb/extension/icu/./icu-timebucket.cpp",
   "src/duckdb/extension/icu/./icu-strptime.cpp",
-  "src/duckdb/extension/icu/./icu-extension.cpp",
-  "src/duckdb/extension/icu/./icu-dateadd.cpp",
   "src/duckdb/extension/icu/./icu-datefunc.cpp",
+  "src/duckdb/extension/icu/./icu-extension.cpp",
+  "src/duckdb/extension/icu/./icu-makedate.cpp",
+  "src/duckdb/extension/icu/./icu-timezone.cpp",
+  "src/duckdb/extension/icu/./icu-datesub.cpp",
+  "src/duckdb/extension/icu/./icu-timebucket.cpp",
+  "src/duckdb/extension/icu/./icu-datepart.cpp",
   "src/duckdb/ub_extension_icu_third_party_icu_common.cpp",
   "src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp",
   "src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp",
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.7.1-dev16.0",
+  "version": "0.7.1-dev187.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
package/src/duckdb/extension/json/buffered_json_reader.cpp CHANGED
@@ -25,7 +25,12 @@ JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, Alloca
 JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
     : file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
       plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
-      cached_size(0) {
+      requested_reads(0), actual_reads(0), cached_size(0) {
+}
+
+void JSONFileHandle::Close() {
+    file_handle->Close();
+    cached_buffers.clear();
 }
 
 idx_t JSONFileHandle::FileSize() const {
@@ -36,10 +41,6 @@ idx_t JSONFileHandle::Remaining() const {
     return file_size - read_position;
 }
 
-bool JSONFileHandle::PlainFileSource() const {
-    return plain_file_source;
-}
-
 bool JSONFileHandle::CanSeek() const {
     return can_seek;
 }
@@ -53,6 +54,9 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
     position = read_position;
     auto actual_size = MinValue<idx_t>(requested_size, Remaining());
     read_position += actual_size;
+    if (actual_size != 0) {
+        requested_reads++;
+    }
     return actual_size;
 }
 
@@ -60,11 +64,13 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
     D_ASSERT(size != 0);
     if (plain_file_source) {
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
         return;
     }
 
     if (sample_run) { // Cache the buffer
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
         cached_buffers.emplace_back(allocator.Allocate(size));
         memcpy(cached_buffers.back().get(), pointer, size);
         cached_size += size;
@@ -73,9 +79,11 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
 
     if (!cached_buffers.empty() || position < cached_size) {
         ReadFromCache(pointer, size, position);
+        actual_reads++;
     }
     if (size != 0) {
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
     }
 }
 
@@ -143,6 +151,16 @@ void BufferedJSONReader::OpenJSONFile() {
     file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
 }
 
+void BufferedJSONReader::CloseJSONFile() {
+    while (true) {
+        lock_guard<mutex> guard(lock);
+        if (file_handle->RequestedReadsComplete()) {
+            file_handle->Close();
+            break;
+        }
+    }
+}
+
 bool BufferedJSONReader::IsOpen() {
     return file_handle != nullptr;
 }
@@ -246,9 +264,15 @@ void BufferedJSONReader::Reset() {
 
 void JSONFileHandle::Reset() {
     read_position = 0;
+    requested_reads = 0;
+    actual_reads = 0;
     if (plain_file_source) {
         file_handle->Reset();
    }
 }
 
+bool JSONFileHandle::RequestedReadsComplete() {
+    return requested_reads == actual_reads;
+}
+
 } // namespace duckdb
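
Note: the requested_reads/actual_reads pair added above is a small read-completion handshake: GetPositionAndSize hands out byte ranges under the reader's lock and counts each request, reader threads bump the atomic actual_reads when their read finishes, and CloseJSONFile spins until both counters agree before closing the handle. A minimal standalone sketch of the same pattern (types and names here are stand-ins, not DuckDB's):

#include <atomic>
#include <cstdint>
#include <mutex>

// A handle that may only be closed once every read that was handed out
// has actually completed, mirroring requested_reads/actual_reads above.
struct TrackedHandle {
    std::mutex lock;                       // serializes handing out read ranges
    uint64_t requested_reads = 0;          // only modified while holding `lock`
    std::atomic<uint64_t> actual_reads{0}; // bumped by any reader thread

    void ReserveRead() { // call under `lock` when a read range is handed out
        requested_reads++;
    }

    void FinishRead() { // call from the reader thread once its read is done
        actual_reads++;
    }

    void CloseWhenDone() { // spin until all reserved reads have finished
        while (true) {
            std::lock_guard<std::mutex> guard(lock);
            if (actual_reads == requested_reads) {
                // safe to close the underlying file handle here
                break;
            }
        }
    }
};

Keeping requested_reads a plain integer while actual_reads is atomic works because requests are only ever issued under the lock, whereas completions arrive from arbitrary threads.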
package/src/duckdb/extension/json/include/buffered_json_reader.hpp CHANGED
@@ -58,11 +58,11 @@ public:
 struct JSONFileHandle {
 public:
     JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);
+    void Close();
 
     idx_t FileSize() const;
     idx_t Remaining() const;
 
-    bool PlainFileSource() const;
     bool CanSeek() const;
     void Seek(idx_t position);
 
@@ -71,6 +71,7 @@ public:
     idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);
 
     void Reset();
+    bool RequestedReadsComplete();
 
 private:
     idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);
@@ -87,6 +88,8 @@ private:
 
     //! Read properties
     idx_t read_position;
+    idx_t requested_reads;
+    atomic<idx_t> actual_reads;
 
     //! Cached buffers for resetting when reading stream
     vector<AllocatedData> cached_buffers;
@@ -98,6 +101,7 @@ public:
     BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);
 
     void OpenJSONFile();
+    void CloseJSONFile();
     bool IsOpen();
 
     BufferedJSONReaderOptions &GetOptions();
package/src/duckdb/extension/json/include/json_scan.hpp CHANGED
@@ -26,6 +26,16 @@ enum class JSONScanType : uint8_t {
     SAMPLE = 3,
 };
 
+enum class JSONScanTopLevelType : uint8_t {
+    INVALID = 0,
+    //! Sequential objects, e.g., NDJSON
+    OBJECTS = 1,
+    //! Top-level array containing objects
+    ARRAY_OF_OBJECTS = 2,
+    //! Other, e.g., array of integer, or just strings
+    OTHER = 3
+};
+
 //! Even though LogicalTypeId is just a uint8_t, this is still needed ...
 struct LogicalTypeIdHash {
     inline std::size_t operator()(const LogicalTypeId &id) const {
@@ -105,7 +115,7 @@ public:
     //! Max depth we go to detect nested JSON schema (defaults to unlimited)
     idx_t max_depth = NumericLimits<idx_t>::Maximum();
     //! Whether we're parsing objects (usually), or something else like arrays
-    bool objects = true;
+    JSONScanTopLevelType top_level_type = JSONScanTopLevelType::OBJECTS;
     //! Forced date/timestamp formats
     string date_format;
     string timestamp_format;
@@ -181,9 +191,14 @@ public:
     yyjson_alc *GetAllocator();
     void ThrowTransformError(idx_t count, idx_t object_index, const string &error_message);
 
+    idx_t scan_count;
     JSONLine lines[STANDARD_VECTOR_SIZE];
     yyjson_val *objects[STANDARD_VECTOR_SIZE];
 
+    idx_t array_idx;
+    idx_t array_offset;
+    yyjson_val *array_objects[STANDARD_VECTOR_SIZE];
+
     idx_t batch_index;
 
     //! Options when transforming the JSON to columnar data
@@ -192,6 +207,7 @@
 
 private:
     yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
+    idx_t GetObjectsFromArray();
 
 private:
     //! Bind data
@@ -300,7 +316,6 @@ public:
     table_function.serialize = JSONScanSerialize;
     table_function.deserialize = JSONScanDeserialize;
 
-    // TODO: might be able to do some of these
     table_function.projection_pushdown = false;
     table_function.filter_pushdown = false;
     table_function.filter_prune = false;
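
Note: the new JSONScanTopLevelType, together with scan_count/array_idx/array_offset, lets a scan re-consume one batch of parsed top-level values element by element when the input is a single top-level array. How auto-detection picks a variant is shown in the read_json.cpp hunks later in this diff; the decision reduces to roughly this sketch (Shape is a stand-in for the detected LogicalType, not a DuckDB type):

// Hedged sketch of the top-level classification performed during auto-detection.
enum class TopLevelType { OBJECTS, ARRAY_OF_OBJECTS, OTHER };

struct Shape {
    bool is_struct;         // sampled values look like JSON objects
    bool is_list_of_struct; // a sampled value is an array of objects
};

TopLevelType Classify(const Shape &shape, bool more_than_one_value) {
    if (shape.is_struct) {
        return TopLevelType::OBJECTS; // NDJSON-style: one object per value
    }
    if (!more_than_one_value && shape.is_list_of_struct) {
        // exactly one top-level array of objects: unwrap it and emit
        // its elements as rows (read single-threaded as UNSTRUCTURED)
        return TopLevelType::ARRAY_OF_OBJECTS;
    }
    return TopLevelType::OTHER; // e.g., array of integers: one "json" column
}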
package/src/duckdb/extension/json/json_functions/json_transform.cpp CHANGED
@@ -523,6 +523,21 @@ static bool TransformArray(yyjson_val *arrays[], yyjson_alc *alc, Vector &result
     return success;
 }
 
+bool TransformToJSON(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count) {
+    auto data = (string_t *)FlatVector::GetData(result);
+    auto &validity = FlatVector::Validity(result);
+    for (idx_t i = 0; i < count; i++) {
+        const auto &val = vals[i];
+        if (!val) {
+            validity.SetInvalid(i);
+        } else {
+            data[i] = JSONCommon::WriteVal(val, alc);
+        }
+    }
+    // Can always transform to JSON
+    return true;
+}
+
 bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
                               JSONTransformOptions &options) {
     auto result_type = result.GetType();
@@ -531,6 +546,10 @@ bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &resul
         return TransformFromStringWithFormat(vals, result, count, options);
     }
 
+    if (JSONCommon::LogicalTypeIsJSON(result_type)) {
+        return TransformToJSON(vals, alc, result, count);
+    }
+
     switch (result_type.id()) {
     case LogicalTypeId::SQLNULL:
         return true;
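
Note: TransformToJSON covers the case where the target column type is JSON itself: instead of converting, it re-serializes the parsed yyjson value, which can never fail. Outside DuckDB the same step looks roughly like this with yyjson's public writer API (JSONCommon::WriteVal wraps something similar, using an arena allocator rather than malloc/free):

#include <cstdio>
#include <cstdlib>
#include "yyjson.h"

// Round-trip a parsed value back to a JSON string, as TransformToJSON does
// for JSON-typed result columns.
int main() {
    const char json[] = "{\"a\": [1, 2, 3], \"b\": null}";
    yyjson_doc *doc = yyjson_read(json, sizeof(json) - 1, 0);
    if (!doc) {
        return 1;
    }
    yyjson_val *root = yyjson_doc_get_root(doc);

    size_t len;
    char *serialized = yyjson_val_write(root, 0, &len); // malloc'd C string
    if (serialized) {
        printf("%.*s\n", (int)len, serialized);
        free(serialized);
    }
    yyjson_doc_free(doc);
    return 0;
}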
package/src/duckdb/extension/json/json_functions/read_json.cpp CHANGED
@@ -13,32 +13,17 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
     JSONScanLocalState lstate(context, gstate);
     ArenaAllocator allocator(BufferAllocator::Get(context));
 
-    static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
-        {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
-        {LogicalTypeId::TIMESTAMP,
-         {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
-          "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
-    };
-
-    // Populate possible date/timestamp formats, assume this is consistent across columns
-    for (auto &kv : FORMAT_TEMPLATES) {
-        const auto &type = kv.first;
-        if (bind_data.date_format_map.HasFormats(type)) {
-            continue; // Already populated
-        }
-        const auto &format_strings = kv.second;
-        for (auto &format_string : format_strings) {
-            bind_data.date_format_map.AddFormat(type, format_string);
-        }
-    }
-
     // Read for the specified sample size
     JSONStructureNode node;
+    bool more_than_one = false;
     Vector string_vector(LogicalType::VARCHAR);
     idx_t remaining = bind_data.sample_size;
     while (remaining != 0) {
         allocator.Reset();
         auto read_count = lstate.ReadNext(gstate);
+        if (read_count > 1) {
+            more_than_one = true;
+        }
         if (read_count == 0) {
             break;
         }
@@ -54,15 +39,29 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
         node.InitializeCandidateTypes(bind_data.max_depth);
         node.RefineCandidateTypes(lstate.objects, next, string_vector, allocator, bind_data.date_format_map);
         remaining -= next;
+
+        if (gstate.file_index == 10) {
+            // We really shouldn't open more than 10 files when sampling
+            break;
+        }
     }
     bind_data.type = original_scan_type;
     bind_data.transform_options.date_format_map = &bind_data.date_format_map;
 
-    const auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+    auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+    if (type.id() == LogicalTypeId::STRUCT) {
+        bind_data.top_level_type = JSONScanTopLevelType::OBJECTS;
+    } else if (!more_than_one && type.id() == LogicalTypeId::LIST &&
+               ListType::GetChildType(type).id() == LogicalTypeId::STRUCT) {
+        bind_data.top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
+        bind_data.options.format = JSONFormat::UNSTRUCTURED;
+        type = ListType::GetChildType(type);
+    }
+
     if (type.id() != LogicalTypeId::STRUCT) {
         return_types.emplace_back(type);
         names.emplace_back("json");
-        bind_data.objects = false;
+        bind_data.top_level_type = JSONScanTopLevelType::OTHER;
     } else {
         const auto &child_types = StructType::GetChildTypes(type);
         return_types.reserve(child_types.size());
@@ -189,9 +188,11 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
     auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
     auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
 
-    // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
-    const auto objects = lstate.objects;
+    const auto objects = gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS
+                             ? lstate.array_objects
+                             : lstate.objects;
+    output.SetCardinality(count);
 
     vector<Vector *> result_vectors;
     result_vectors.reserve(output.ColumnCount());
@@ -202,13 +203,14 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
 
     // Pass current reader to transform options so we can get line number information if an error occurs
     bool success;
-    if (gstate.bind_data.objects) {
-        success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
-                                                 result_vectors, lstate.transform_options);
-    } else {
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::OTHER) {
         success = JSONTransform::Transform(objects, lstate.GetAllocator(), *result_vectors[0], count,
                                            lstate.transform_options);
+    } else {
+        success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
+                                                 result_vectors, lstate.transform_options);
     }
+
     if (!success) {
         string hint = gstate.bind_data.auto_detect
                           ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns' manually, "
@@ -217,7 +219,6 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
         lstate.ThrowTransformError(count, lstate.transform_options.object_index,
                                    lstate.transform_options.error_message + hint);
     }
-    output.SetCardinality(count);
 }
 
 TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -235,6 +236,7 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, share
     table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
 
     table_function.projection_pushdown = true;
+    // TODO: might be able to do filter pushdown/prune too
 
     table_function.function_info = std::move(function_info);
 
package/src/duckdb/extension/json/json_functions.cpp CHANGED
@@ -166,6 +166,12 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
 unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                         ReplacementScanData *data) {
     auto lower_name = StringUtil::Lower(table_name);
+    // remove any compression
+    if (StringUtil::EndsWith(lower_name, ".gz")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 3);
+    } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 4);
+    }
     if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
         !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
         return nullptr;
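
Note: stripping the compression suffix before the extension check means replacement scans now also match names like data.json.gz or logs.ndjson.zst, so querying such a path directly resolves to the JSON reader. A standalone sketch of the matching rule (helper names are illustrative; the query-string cases like ".json?" are omitted):

#include <string>

static bool EndsWith(const std::string &s, const std::string &suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

// Decide whether a (lower-cased) file name should be routed to read_json,
// mirroring the suffix logic in the hunk above.
bool LooksLikeJSONFile(std::string name) {
    // remove any compression suffix first
    if (EndsWith(name, ".gz")) {
        name = name.substr(0, name.size() - 3);
    } else if (EndsWith(name, ".zst")) {
        name = name.substr(0, name.size() - 4);
    }
    return EndsWith(name, ".json") || EndsWith(name, ".ndjson");
}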
package/src/duckdb/extension/json/json_scan.cpp CHANGED
@@ -3,6 +3,7 @@
 #include "duckdb/main/database.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
+#include "duckdb/main/extension_helper.hpp"
 
 namespace duckdb {
 
@@ -47,8 +48,11 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
             options.format = JSONFormat::UNSTRUCTURED;
         } else if (format == "newline_delimited") {
             options.format = JSONFormat::NEWLINE_DELIMITED;
+        } else if (format == "array_of_objects") {
+            result->top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
         } else {
-            throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
+            throw BinderException(
+                "format must be one of ['auto', 'unstructured', 'newline_delimited', 'array_of_objects']");
         }
     } else if (loption == "compression") {
         auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -66,6 +70,10 @@
         }
     }
 
+    if (result->top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+        result->options.format = JSONFormat::UNSTRUCTURED;
+    }
+
     return std::move(result);
 }
 
@@ -75,7 +83,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
     for (auto &file_pattern : patterns) {
         auto found_files = fs.Glob(file_pattern, context);
         if (found_files.empty()) {
-            throw IOException("No files found that match the pattern \"%s\"", file_pattern);
+            throw FileSystem::MissingFileException(file_pattern, context);
         }
         file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
     }
@@ -97,6 +105,27 @@ void JSONScanData::InitializeFormats() {
     if (!timestamp_format.empty()) {
         date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
     }
+
+    if (auto_detect) {
+        static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+            {LogicalTypeId::TIMESTAMP,
+             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+        };
+
+        // Populate possible date/timestamp formats, assume this is consistent across columns
+        for (auto &kv : FORMAT_TEMPLATES) {
+            const auto &type = kv.first;
+            if (date_format_map.HasFormats(type)) {
+                continue; // Already populated
+            }
+            const auto &format_strings = kv.second;
+            for (auto &format_string : format_strings) {
+                date_format_map.AddFormat(type, format_string);
+            }
+        }
+    }
 }
 
 void JSONScanData::Serialize(FieldWriter &writer) {
@@ -111,9 +140,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
     writer.WriteList<string>(names);
     writer.WriteList<idx_t>(valid_cols);
     writer.WriteField<idx_t>(max_depth);
-    writer.WriteField<bool>(objects);
-    writer.WriteString(date_format);
-    writer.WriteString(timestamp_format);
+    writer.WriteField<JSONScanTopLevelType>(top_level_type);
+    if (!date_format.empty()) {
+        writer.WriteString(date_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+    }
+    if (!timestamp_format.empty()) {
+        writer.WriteString(timestamp_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+    }
 }
 
 void JSONScanData::Deserialize(FieldReader &reader) {
@@ -128,9 +165,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
     names = reader.ReadRequiredList<string>();
     valid_cols = reader.ReadRequiredList<idx_t>();
     max_depth = reader.ReadRequired<idx_t>();
-    objects = reader.ReadRequired<bool>();
+    top_level_type = reader.ReadRequired<JSONScanTopLevelType>();
    date_format = reader.ReadRequired<string>();
     timestamp_format = reader.ReadRequired<string>();
+
+    InitializeFormats();
+    transform_options.date_format_map = &date_format_map;
 }
 
 JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -149,9 +189,9 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+    : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
       json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-      buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
 
     // Buffer to reconstruct JSON objects when they cross a buffer boundary
     reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
@@ -173,11 +213,6 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
     // Perform projection pushdown
     if (bind_data.type == JSONScanType::READ_JSON) {
        D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
-        if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
-            // If we are auto-detecting, but don't need all columns present in the file,
-            // then we don't need to throw an error if we encounter an unseen column
-            bind_data.transform_options.error_unknown_key = false;
-        }
         vector<string> names;
         names.reserve(input.column_ids.size());
         for (idx_t i = 0; i < input.column_ids.size(); i++) {
@@ -188,6 +223,11 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
             names.push_back(std::move(bind_data.names[id]));
             bind_data.valid_cols.push_back(i);
         }
+        if (names.size() < bind_data.names.size()) {
+            // If we are auto-detecting, but don't need all columns present in the file,
+            // then we don't need to throw an error if we encounter an unseen column
+            bind_data.transform_options.error_unknown_key = false;
+        }
         bind_data.names = std::move(names);
     }
     return result;
@@ -230,6 +270,10 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     json_allocator.Reset();
 
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS && array_idx < scan_count) {
+        return GetObjectsFromArray();
+    }
+
     idx_t count = 0;
     if (buffer_offset == buffer_size) {
        if (!ReadNextBuffer(gstate)) {
@@ -253,10 +297,20 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     default:
         throw InternalException("Unknown JSON format");
     }
+    scan_count = count;
 
     // Skip over any remaining whitespace for the next scan
     SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+        if (scan_count > 1) {
+            throw InvalidInputException("File must have exactly one array of objects when format='array_of_objects'");
+        }
+        array_idx = 0;
+        array_offset = 0;
+        return GetObjectsFromArray();
+    }
+
     return count;
 }
 
@@ -331,10 +385,39 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
     }
 }
 
+idx_t JSONScanLocalState::GetObjectsFromArray() {
+    idx_t arr_count = 0;
+
+    size_t idx, max;
+    yyjson_val *val;
+    for (; array_idx < scan_count; array_idx++, array_offset = 0) {
+        if (objects[array_idx]) {
+            yyjson_arr_foreach(objects[array_idx], idx, max, val) {
+                if (idx < array_offset) {
+                    continue;
+                }
+                array_objects[arr_count++] = val;
+                if (arr_count == STANDARD_VECTOR_SIZE) {
+                    break;
+                }
+            }
+            array_offset = idx + 1;
+            if (arr_count == STANDARD_VECTOR_SIZE) {
+                break;
+            }
+        }
+    }
+    return arr_count;
+}
+
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     if (current_reader) {
         D_ASSERT(current_buffer_handle);
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+        if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // Close files that are done if we're not sampling
+            current_reader->CloseJSONFile();
+        }
     }
 
     AllocatedData buffer;
@@ -395,7 +478,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
         // Unopened file
         current_reader->OpenJSONFile();
         batch_index = gstate.batch_index++;
-        if (options.format == JSONFormat::UNSTRUCTURED) {
+        if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+                                                           options.compression != FileCompressionType::UNCOMPRESSED &&
+                                                           gstate.file_index < gstate.json_readers.size())) {
             gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
         }
         if (options.format != JSONFormat::AUTO_DETECT) {
@@ -449,9 +534,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
     current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
-    if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
-        // TODO: store buffer
-    }
 
     buffer_offset = 0;
     prev_buffer_remainder = 0;
@@ -507,16 +589,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 }
 
 void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
-    auto &file_handle = current_reader->GetFileHandle();
-
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-        read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
-                                     gstate.bind_data.type == JSONScanType::SAMPLE);
+        if (current_reader->IsOpen()) {
+            read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+                                                             gstate.bind_data.type == JSONScanType::SAMPLE);
+        } else {
+            read_size = 0;
+        }
         is_last = read_size < request_size;
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -582,6 +666,11 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 }
 
 void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+    // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+    // if a different error code happens within the last 50 bytes
+    // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+    static constexpr idx_t END_BOUND = 50;
+
     const auto max_obj_size = reconstruct_buffer.GetSize();
     yyjson_read_err error;
     for (; count < STANDARD_VECTOR_SIZE; count++) {
@@ -607,8 +696,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else if (error.pos > max_obj_size) {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
                                             "Try increasing \"maximum_object_size\".");
-
-        } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+        } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
             // Copy remaining to reconstruct_buffer
             const auto reconstruct_ptr = reconstruct_buffer.get();
             memcpy(reconstruct_ptr, obj_copy_start, remaining);
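
Note: GetObjectsFromArray is what makes format='array_of_objects' stream in vector-sized chunks: ReadNext parses the single top-level array once, then each call emits the next STANDARD_VECTOR_SIZE elements, remembering array_offset because yyjson_arr_foreach can only restart from the front of the array. A self-contained sketch of that resumable chunking (CHUNK_SIZE stands in for STANDARD_VECTOR_SIZE):

#include <cstddef>
#include "yyjson.h"

static constexpr size_t CHUNK_SIZE = 2048;

// Emits elements of a parsed yyjson array CHUNK_SIZE at a time; `offset`
// records how many elements earlier calls already produced, and the foreach
// below skips past them, since yyjson iteration cannot resume mid-array.
struct ArrayCursor {
    yyjson_val *arr;   // parsed top-level array
    size_t offset = 0; // elements already emitted

    size_t Next(yyjson_val *out[]) {
        if (!arr || !yyjson_is_arr(arr)) {
            return 0;
        }
        size_t count = 0;
        size_t idx, max;
        yyjson_val *val;
        yyjson_arr_foreach(arr, idx, max, val) {
            if (idx < offset) {
                continue; // emitted by a previous call
            }
            out[count++] = val;
            if (count == CHUNK_SIZE) {
                break;
            }
        }
        offset += count;
        return count;
    }
};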
package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED
@@ -223,7 +223,7 @@ public:
     FileSystem &fs = FileSystem::GetFileSystem(context);
     auto files = fs.Glob(info.file_path, context);
     if (files.empty()) {
-        throw IOException("No files found that match the pattern \"%s\"", info.file_path);
+        throw FileSystem::MissingFileException(info.file_path, context);
     }
 
     // The most likely path (Parquet read without union by name option)
@@ -363,8 +363,9 @@
 
 static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
     auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
+
     if (files.empty()) {
-        throw IOException("No files found that match the pattern \"%s\"", glob);
+        throw FileSystem::MissingFileException(glob, context);
     }
     return files;
 }