duckdb 0.7.1-dev2.0 → 0.7.1-dev240.0
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- package/binding.gyp +7 -7
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +50 -9
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +7 -2
- package/src/duckdb/extension/json/include/json_common.hpp +2 -2
- package/src/duckdb/extension/json/include/json_scan.hpp +29 -10
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +35 -22
- package/src/duckdb/extension/json/json_functions/json_create.cpp +8 -8
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +47 -8
- package/src/duckdb/extension/json/json_functions/read_json.cpp +104 -49
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +5 -3
- package/src/duckdb/extension/json/json_functions.cpp +6 -0
- package/src/duckdb/extension/json/json_scan.cpp +144 -34
- package/src/duckdb/extension/parquet/parquet-extension.cpp +3 -2
- package/src/duckdb/src/common/enums/logical_operator_type.cpp +2 -0
- package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
- package/src/duckdb/src/common/enums/statement_type.cpp +2 -0
- package/src/duckdb/src/common/file_system.cpp +14 -0
- package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
- package/src/duckdb/src/common/operator/cast_operators.cpp +14 -8
- package/src/duckdb/src/common/printer.cpp +1 -1
- package/src/duckdb/src/common/types/time.cpp +1 -1
- package/src/duckdb/src/common/types/timestamp.cpp +35 -4
- package/src/duckdb/src/common/types.cpp +36 -10
- package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
- package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +6 -11
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +13 -13
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_detach.cpp +37 -0
- package/src/duckdb/src/execution/operator/schema/physical_drop.cpp +0 -5
- package/src/duckdb/src/execution/physical_plan/plan_simple.cpp +4 -0
- package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -0
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +36 -9
- package/src/duckdb/src/function/table/read_csv.cpp +15 -4
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/logical_operator_type.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
- package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
- package/src/duckdb/src/include/duckdb/common/types/timestamp.hpp +5 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +1 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_detach.hpp +32 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -3
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_database_info.hpp +0 -4
- package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +32 -0
- package/src/duckdb/src/include/duckdb/parser/query_node/select_node.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/statement/copy_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/statement/detach_statement.hpp +29 -0
- package/src/duckdb/src/include/duckdb/parser/statement/list.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +3 -3
- package/src/duckdb/src/include/duckdb/parser/tableref/subqueryref.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/tokens.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/transformer.hpp +1 -0
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
- package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
- package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +7 -0
- package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -0
- package/src/duckdb/src/main/client_context.cpp +2 -0
- package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
- package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
- package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
- package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
- package/src/duckdb/src/parser/statement/detach_statement.cpp +15 -0
- package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
- package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
- package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
- package/src/duckdb/src/parser/transform/statement/transform_create_database.cpp +0 -1
- package/src/duckdb/src/parser/transform/statement/transform_detach.cpp +19 -0
- package/src/duckdb/src/parser/transformer.cpp +2 -0
- package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +3 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +6 -3
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +16 -14
- package/src/duckdb/src/planner/binder/statement/bind_detach.cpp +19 -0
- package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +29 -4
- package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
- package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +2 -1
- package/src/duckdb/src/planner/binder.cpp +2 -0
- package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +21 -5
- package/src/duckdb/src/planner/logical_operator.cpp +4 -0
- package/src/duckdb/src/planner/planner.cpp +1 -0
- package/src/duckdb/src/storage/storage_info.cpp +2 -1
- package/src/duckdb/src/storage/table/column_data.cpp +4 -2
- package/src/duckdb/src/storage/table/update_segment.cpp +15 -0
- package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
- package/src/duckdb/third_party/libpg_query/include/nodes/nodes.hpp +1 -0
- package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +14 -0
- package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +530 -1006
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +17659 -17626
- package/src/duckdb/third_party/thrift/thrift/Thrift.h +8 -2
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
- package/src/duckdb/ub_src_execution_operator_schema.cpp +2 -0
- package/src/duckdb/ub_src_parser_statement.cpp +2 -0
- package/src/duckdb/ub_src_parser_transform_statement.cpp +2 -0
- package/src/duckdb/ub_src_planner_binder_statement.cpp +2 -0
- package/src/duckdb/src/include/duckdb/function/create_database_extension.hpp +0 -37
@@ -13,63 +13,88 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
     JSONScanLocalState lstate(context, gstate);
     ArenaAllocator allocator(BufferAllocator::Get(context));
 
-    static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
-        {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
-        {LogicalTypeId::TIMESTAMP,
-         {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
-          "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
-    };
-
-    // Populate possible date/timestamp formats, assume this is consistent across columns
-    for (auto &kv : FORMAT_TEMPLATES) {
-        const auto &type = kv.first;
-        if (bind_data.date_format_map.HasFormats(type)) {
-            continue; // Already populated
-        }
-        const auto &format_strings = kv.second;
-        for (auto &format_string : format_strings) {
-            bind_data.date_format_map.AddFormat(type, format_string);
-        }
-    }
-
     // Read for the specified sample size
     JSONStructureNode node;
+    bool more_than_one = false;
     Vector string_vector(LogicalType::VARCHAR);
     idx_t remaining = bind_data.sample_size;
     while (remaining != 0) {
         allocator.Reset();
         auto read_count = lstate.ReadNext(gstate);
+        if (lstate.scan_count > 1) {
+            more_than_one = true;
+        }
         if (read_count == 0) {
             break;
         }
         idx_t next = MinValue<idx_t>(read_count, remaining);
+        yyjson_val **values;
+        if (bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+            bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+            values = lstate.array_values;
+        } else {
+            values = lstate.values;
+        }
         for (idx_t i = 0; i < next; i++) {
-            if (
-                JSONStructure::ExtractStructure(
+            if (values[i]) {
+                JSONStructure::ExtractStructure(values[i], node);
             }
         }
         if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
             continue;
         }
         node.InitializeCandidateTypes(bind_data.max_depth);
-        node.RefineCandidateTypes(
+        node.RefineCandidateTypes(values, next, string_vector, allocator, bind_data.date_format_map);
         remaining -= next;
+
+        if (gstate.file_index == 10) {
+            // We really shouldn't open more than 10 files when sampling
+            break;
+        }
     }
     bind_data.type = original_scan_type;
-    bind_data.transform_options.date_format_map = &bind_data.date_format_map;
 
-
-
-
-
-
-
-
-
-
-
-
-
+    // Convert structure to logical type
+    auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+
+    // Detect record type
+    if (bind_data.record_type == JSONRecordType::AUTO) {
+        switch (type.id()) {
+        case LogicalTypeId::STRUCT:
+            bind_data.record_type = JSONRecordType::RECORDS;
+            break;
+        case LogicalTypeId::LIST: {
+            if (more_than_one) {
+                bind_data.record_type = JSONRecordType::JSON;
+            } else {
+                type = ListType::GetChildType(type);
+                if (type.id() == LogicalTypeId::STRUCT) {
+                    bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
+                } else {
+                    bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
+                }
+            }
+            break;
+        }
+        default:
+            bind_data.record_type = JSONRecordType::JSON;
+        }
+    }
+
+    // Detect return type
+    if (bind_data.auto_detect) {
+        bind_data.transform_options.date_format_map = &bind_data.date_format_map;
+        if (type.id() != LogicalTypeId::STRUCT) {
+            return_types.emplace_back(type);
+            names.emplace_back("json");
+        } else {
+            const auto &child_types = StructType::GetChildTypes(type);
+            return_types.reserve(child_types.size());
+            names.reserve(child_types.size());
+            for (auto &child_type : child_types) {
+                return_types.emplace_back(child_type.second);
+                names.emplace_back(child_type.first);
+            }
+        }
+    }
 
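The new record-type detection above is the heart of this change: after sampling, the merged JSONStructureNode is converted to a LogicalType and the input is classified as a stream of records (STRUCT), a single top-level array (LIST, split further by element type), or plain JSON values. As a rough standalone sketch of the same classification against one document, using only the public yyjson API (the enum and function names here are illustrative, not DuckDB's):

#include <cstdio>
#include <cstring>
#include "yyjson.h"

// Illustrative mirror of the outcomes the diff selects between; DuckDB's
// JSONRecordType also has AUTO and RECORDS-vs-JSON variants.
enum class RecordType { RECORDS, ARRAY_OF_RECORDS, ARRAY_OF_JSON, JSON };

static RecordType ClassifyRoot(const char *json, size_t len) {
    yyjson_doc *doc = yyjson_read(json, len, 0);
    if (!doc) {
        return RecordType::JSON; // unparseable input: leave it to error handling
    }
    yyjson_val *root = yyjson_doc_get_root(doc);
    RecordType result;
    if (yyjson_is_obj(root)) {
        result = RecordType::RECORDS; // one object per value -> one row per value
    } else if (yyjson_is_arr(root)) {
        // a top-level array is unnested; the element type decides the column shape
        yyjson_val *first = yyjson_arr_get_first(root);
        result = (first && yyjson_is_obj(first)) ? RecordType::ARRAY_OF_RECORDS : RecordType::ARRAY_OF_JSON;
    } else {
        result = RecordType::JSON; // scalars etc. come back as a single "json" column
    }
    yyjson_doc_free(doc);
    return result;
}

int main() {
    const char *input = "[{\"a\": 1}, {\"a\": 2}]";
    printf("%d\n", (int)ClassifyRoot(input, strlen(input))); // prints 1 (ARRAY_OF_RECORDS)
}

DuckDB's real detection works on the merged structure of up to sample_size values and falls back to plain JSON when more than one top-level array is seen, but the decision tree is the same.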
@@ -150,6 +175,22 @@ void JSONScan::InitializeBindData(ClientContext &context, JSONScanData &bind_dat
             if (!error.empty()) {
                 throw InvalidInputException("Could not parse TIMESTAMPFORMAT: %s", error.c_str());
             }
+        } else if (loption == "json_format") {
+            auto arg = StringValue::Get(kv.second);
+            if (arg == "records") {
+                bind_data.record_type = JSONRecordType::RECORDS;
+            } else if (arg == "array_of_records") {
+                bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
+            } else if (arg == "values") {
+                bind_data.record_type = JSONRecordType::JSON;
+            } else if (arg == "array_of_values") {
+                bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
+            } else if (arg == "auto") {
+                bind_data.record_type = JSONRecordType::AUTO;
+            } else {
+                throw InvalidInputException("\"json_format\" must be one of ['records', 'array_of_records', 'json', "
+                                            "'array_of_json', 'auto']");
+            }
         }
     }
 }
@@ -170,7 +211,7 @@ unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindI
 
     bind_data.InitializeFormats();
 
-    if (bind_data.auto_detect) {
+    if (bind_data.auto_detect || bind_data.record_type == JSONRecordType::AUTO) {
         JSONScan::AutoDetect(context, bind_data, return_types, names);
         bind_data.names = names;
     }
@@ -189,9 +230,16 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
     auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
     auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
 
-    // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
-
+    yyjson_val **values;
+    if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+        values = lstate.array_values;
+    } else {
+        D_ASSERT(gstate.bind_data.record_type != JSONRecordType::AUTO);
+        values = lstate.values;
+    }
+    output.SetCardinality(count);
 
     vector<Vector *> result_vectors;
     result_vectors.reserve(output.ColumnCount());
@@ -202,22 +250,23 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
 
     // Pass current reader to transform options so we can get line number information if an error occurs
     bool success;
-    if (gstate.bind_data.
-
+    if (gstate.bind_data.record_type == JSONRecordType::RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS) {
+        success = JSONTransform::TransformObject(values, lstate.GetAllocator(), count, gstate.bind_data.names,
                                                  result_vectors, lstate.transform_options);
     } else {
-        success = JSONTransform::Transform(
+        success = JSONTransform::Transform(values, lstate.GetAllocator(), *result_vectors[0], count,
                                            lstate.transform_options);
     }
+
     if (!success) {
         string hint = gstate.bind_data.auto_detect
                           ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns' manually, "
-                            "or setting 'ignore_errors' to true."
-                          : "";
-        lstate.ThrowTransformError(
+                            "specifying 'lines' or 'json_format' manually, or setting 'ignore_errors' to true."
+                          : "\n Try specifying 'lines' or 'json_format' manually, or setting 'ignore_errors' to true.";
+        lstate.ThrowTransformError(lstate.transform_options.object_index,
                                    lstate.transform_options.error_message + hint);
     }
-    output.SetCardinality(count);
 }
 
 TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -233,8 +282,10 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, share
     table_function.named_parameters["date_format"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
+    table_function.named_parameters["json_format"] = LogicalType::VARCHAR;
 
     table_function.projection_pushdown = true;
+    // TODO: might be able to do filter pushdown/prune too
 
     table_function.function_info = std::move(function_info);
 
@@ -249,7 +300,8 @@ TableFunction GetReadJSONAutoTableFunction(bool list_parameter, shared_ptr<JSONS
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
     TableFunctionSet function_set("read_json");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS, false);
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(false, function_info));
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -257,7 +309,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
     TableFunctionSet function_set("read_ndjson");
-    auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
+    auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
+                                                   JSONRecordType::RECORDS, false);
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(false, function_info));
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -265,7 +318,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
     TableFunctionSet function_set("read_json_auto");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::AUTO, true);
     function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -273,7 +327,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONAutoFunction() {
     TableFunctionSet function_set("read_ndjson_auto");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::AUTO, true);
     function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -20,7 +20,7 @@ static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &
     // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
     const auto lines = lstate.lines;
-    const auto objects = lstate.
+    const auto objects = lstate.values;
 
     // Create the strings without copying them
     auto strings = FlatVector::GetData<string_t>(output.data[0]);
@@ -48,7 +48,8 @@ TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JS
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
     TableFunctionSet function_set("read_json_objects");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::UNSTRUCTURED, JSONRecordType::JSON);
     function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -56,7 +57,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONObjectsFunction() {
     TableFunctionSet function_set("read_ndjson_objects");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::JSON);
     function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -166,6 +166,12 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
 unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                         ReplacementScanData *data) {
     auto lower_name = StringUtil::Lower(table_name);
+    // remove any compression
+    if (StringUtil::EndsWith(lower_name, ".gz")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 3);
+    } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 4);
+    }
     if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
         !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
         return nullptr;
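The replacement scan (what lets a query reference 'data.json' directly as a table) now strips a trailing .gz or .zst before testing for the .json/.ndjson suffixes, so compressed files are picked up too. The suffix handling itself amounts to this, sketched as a hypothetical standalone helper:

#include <string>

// Hypothetical helper mirroring the hunk above: drop a known compression
// extension so "data.json.gz" is matched on "data.json".
static std::string StripCompressionSuffix(std::string name) {
    auto ends_with = [&](const char *suffix) {
        const std::size_t n = std::char_traits<char>::length(suffix);
        return name.size() >= n && name.compare(name.size() - n, n, suffix) == 0;
    };
    if (ends_with(".gz")) {
        name.resize(name.size() - 3);
    } else if (ends_with(".zst")) {
        name.resize(name.size() - 4);
    }
    return name;
}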
@@ -1,6 +1,7 @@
 #include "json_scan.hpp"
 
 #include "duckdb/main/database.hpp"
+#include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
 
@@ -19,8 +20,9 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
     auto &options = result->options;
 
     auto &info = (JSONScanInfo &)*input.info;
-    options.format = info.format;
     result->type = info.type;
+    options.format = info.format;
+    result->record_type = info.record_type;
     result->auto_detect = info.auto_detect;
 
     vector<string> patterns;
@@ -39,16 +41,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
             result->ignore_errors = BooleanValue::Get(kv.second);
         } else if (loption == "maximum_object_size") {
             result->maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), result->maximum_object_size);
-        } else if (loption == "
+        } else if (loption == "lines") {
             auto format = StringUtil::Lower(StringValue::Get(kv.second));
             if (format == "auto") {
                 options.format = JSONFormat::AUTO_DETECT;
-            } else if (format == "
+            } else if (format == "false") {
                 options.format = JSONFormat::UNSTRUCTURED;
-            } else if (format == "
+            } else if (format == "true") {
                 options.format = JSONFormat::NEWLINE_DELIMITED;
             } else {
-                throw BinderException("
+                throw BinderException("\"lines\" must be one of ['auto', 'true', 'false']");
             }
         } else if (loption == "compression") {
             auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -75,7 +77,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
     for (auto &file_pattern : patterns) {
         auto found_files = fs.Glob(file_pattern, context);
         if (found_files.empty()) {
-            throw
+            throw FileSystem::MissingFileException(file_pattern, context);
         }
         file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
     }
@@ -97,6 +99,27 @@ void JSONScanData::InitializeFormats() {
     if (!timestamp_format.empty()) {
         date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
     }
+
+    if (auto_detect) {
+        static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+            {LogicalTypeId::TIMESTAMP,
+             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+        };
+
+        // Populate possible date/timestamp formats, assume this is consistent across columns
+        for (auto &kv : FORMAT_TEMPLATES) {
+            const auto &type = kv.first;
+            if (date_format_map.HasFormats(type)) {
+                continue; // Already populated
+            }
+            const auto &format_strings = kv.second;
+            for (auto &format_string : format_strings) {
+                date_format_map.AddFormat(type, format_string);
+            }
+        }
+    }
 }
 
 void JSONScanData::Serialize(FieldWriter &writer) {
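The candidate date/timestamp templates move from AutoDetect into InitializeFormats (now gated on auto_detect, and with an added ISO-8601-style "%Y-%m-%dT%H:%M:%SZ" template), so they are also repopulated after Deserialize. The idea behind refining candidates is to keep only the formats that parse every sampled string; a minimal sketch of that elimination using std::get_time (DuckDB uses its own StrpTimeFormat, so this is the concept only):

#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

// Keep the candidate formats that successfully parse all sample strings.
static std::vector<std::string> RefineFormats(const std::vector<std::string> &candidates,
                                              const std::vector<std::string> &samples) {
    std::vector<std::string> kept;
    for (const auto &fmt : candidates) {
        bool ok = true;
        for (const auto &s : samples) {
            std::tm tm {};
            std::istringstream ss(s);
            ss >> std::get_time(&tm, fmt.c_str());
            if (ss.fail()) {
                ok = false;
                break;
            }
        }
        if (ok) {
            kept.push_back(fmt);
        }
    }
    return kept;
}

// RefineFormats({"%m-%d-%Y", "%d-%m-%Y", "%Y-%m-%d"}, {"2023-03-01"})
// typically keeps only "%Y-%m-%d".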
@@ -111,9 +134,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
     writer.WriteList<string>(names);
     writer.WriteList<idx_t>(valid_cols);
     writer.WriteField<idx_t>(max_depth);
-    writer.WriteField<
-
-
+    writer.WriteField<JSONRecordType>(record_type);
+    if (!date_format.empty()) {
+        writer.WriteString(date_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+    }
+    if (!timestamp_format.empty()) {
+        writer.WriteString(timestamp_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+    }
 }
 
 void JSONScanData::Deserialize(FieldReader &reader) {
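Serialize now materializes the formats at write time: if no explicit date_format/timestamp_format was given, the current winner from date_format_map is written instead, so a deserialized scan sees the detected formats as if they had been specified, and Deserialize (next hunk) reads the same fields in the same order before rebuilding the map via InitializeFormats. A toy illustration of that order-sensitive, write-a-default pattern, with plain streams rather than DuckDB's FieldWriter/FieldReader:

#include <iostream>
#include <sstream>
#include <string>

// Write fields in a fixed order, substituting the detected value when the
// user gave none; the reader must consume the exact same sequence.
static void WriteScanData(std::ostream &out, int record_type, const std::string &explicit_fmt,
                          const std::string &detected_fmt) {
    out << record_type << '\n';
    out << (explicit_fmt.empty() ? detected_fmt : explicit_fmt) << '\n';
}

static void ReadScanData(std::istream &in, int &record_type, std::string &fmt) {
    in >> record_type;
    in.ignore(); // skip the newline after the integer
    std::getline(in, fmt); // always present: the writer filled in a default
}

int main() {
    std::stringstream buf;
    WriteScanData(buf, 1, "", "%Y-%m-%d");
    int rt;
    std::string fmt;
    ReadScanData(buf, rt, fmt);
    std::cout << rt << " " << fmt << "\n"; // 1 %Y-%m-%d
}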
@@ -128,9 +159,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
     names = reader.ReadRequiredList<string>();
     valid_cols = reader.ReadRequiredList<idx_t>();
     max_depth = reader.ReadRequired<idx_t>();
-
+    record_type = reader.ReadRequired<JSONRecordType>();
     date_format = reader.ReadRequired<string>();
     timestamp_format = reader.ReadRequired<string>();
+
+    InitializeFormats();
+    transform_options.date_format_map = &date_format_map;
 }
 
 JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -149,11 +183,11 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+    : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
       json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-      buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
 
-    // Buffer to reconstruct JSON
+    // Buffer to reconstruct JSON values when they cross a buffer boundary
     reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
 
     // This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
|
|
|
173
207
|
// Perform projection pushdown
|
|
174
208
|
if (bind_data.type == JSONScanType::READ_JSON) {
|
|
175
209
|
D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
|
|
176
|
-
if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
|
|
177
|
-
// If we are auto-detecting, but don't need all columns present in the file,
|
|
178
|
-
// then we don't need to throw an error if we encounter an unseen column
|
|
179
|
-
bind_data.transform_options.error_unknown_key = false;
|
|
180
|
-
}
|
|
181
210
|
vector<string> names;
|
|
182
211
|
names.reserve(input.column_ids.size());
|
|
183
212
|
for (idx_t i = 0; i < input.column_ids.size(); i++) {
|
|
@@ -188,13 +217,37 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
             names.push_back(std::move(bind_data.names[id]));
             bind_data.valid_cols.push_back(i);
         }
+        if (names.size() < bind_data.names.size()) {
+            // If we are auto-detecting, but don't need all columns present in the file,
+            // then we don't need to throw an error if we encounter an unseen column
+            bind_data.transform_options.error_unknown_key = false;
+        }
         bind_data.names = std::move(names);
     }
     return result;
 }
 
 idx_t JSONGlobalTableFunctionState::MaxThreads() const {
-
+    auto &bind_data = state.bind_data;
+
+    auto num_files = bind_data.file_paths.size();
+    idx_t readers_per_file;
+    if (bind_data.options.format == JSONFormat::UNSTRUCTURED) {
+        // Unstructured necessitates single thread
+        readers_per_file = 1;
+    } else if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+        auto &reader = *state.json_readers[0];
+        const auto &options = reader.GetOptions();
+        if (options.format == JSONFormat::UNSTRUCTURED || options.compression != FileCompressionType::UNCOMPRESSED) {
+            // Auto-detected unstructured - same story, compression also really limits parallelism
+            readers_per_file = 1;
+        } else {
+            return state.system_threads;
+        }
+    } else {
+        return state.system_threads;
+    }
+    return num_files * readers_per_file;
 }
 
 JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -230,6 +283,12 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     json_allocator.Reset();
 
+    if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+         gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
+        array_idx < scan_count) {
+        return GetObjectsFromArray(gstate);
+    }
+
     idx_t count = 0;
     if (buffer_offset == buffer_size) {
         if (!ReadNextBuffer(gstate)) {
@@ -253,10 +312,18 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     default:
         throw InternalException("Unknown JSON format");
     }
+    scan_count = count;
 
     // Skip over any remaining whitespace for the next scan
     SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 
+    if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+        array_idx = 0;
+        array_offset = 0;
+        return GetObjectsFromArray(gstate);
+    }
+
     return count;
 }
 
@@ -331,10 +398,48 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
     }
 }
 
+idx_t JSONScanLocalState::GetObjectsFromArray(JSONScanGlobalState &gstate) {
+    idx_t arr_count = 0;
+
+    size_t idx, max;
+    yyjson_val *val;
+    for (; array_idx < scan_count; array_idx++, array_offset = 0) {
+        auto &value = values[array_idx];
+        if (!value) {
+            continue;
+        }
+        if (unsafe_yyjson_is_arr(value)) {
+            yyjson_arr_foreach(value, idx, max, val) {
+                if (idx < array_offset) {
+                    continue;
+                }
+                array_values[arr_count++] = val;
+                if (arr_count == STANDARD_VECTOR_SIZE) {
+                    break;
+                }
+            }
+            array_offset = idx + 1;
+            if (arr_count == STANDARD_VECTOR_SIZE) {
+                break;
+            }
+        } else if (!gstate.bind_data.ignore_errors) {
+            ThrowTransformError(
+                array_idx,
+                StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
+                                   JSONCommon::ValTypeToString(value), JSONCommon::ValToString(value, 50)));
+        }
+    }
+    return arr_count;
+}
+
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     if (current_reader) {
         D_ASSERT(current_buffer_handle);
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+        if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // Close files that are done if we're not sampling
+            current_reader->CloseJSONFile();
+        }
     }
 
     AllocatedData buffer;
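GetObjectsFromArray is what makes the new ARRAY_OF_* record types streamable: array_idx remembers which top-level array the scan is in and array_offset how far into it, so a single huge array can be emitted STANDARD_VECTOR_SIZE elements at a time across calls. The resumable iteration boils down to this standalone sketch over one yyjson array (capacity passed in, no error handling):

#include <cstddef>
#include "yyjson.h"

// Emit up to `capacity` elements per call, resuming at *offset.
// Returns how many element pointers were written to `out`.
static size_t NextArrayChunk(yyjson_val *arr, size_t *offset, yyjson_val **out, size_t capacity) {
    size_t count = 0;
    size_t idx, max;
    yyjson_val *val;
    yyjson_arr_foreach(arr, idx, max, val) {
        if (idx < *offset) {
            continue; // already emitted by a previous call
        }
        out[count++] = val;
        if (count == capacity) {
            break;
        }
    }
    *offset += count;
    return count;
}

Calling this in a loop until it returns 0 yields the whole array in fixed-size batches, which is the contract ReadNext needs from GetObjectsFromArray.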
@@ -395,7 +500,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
         // Unopened file
         current_reader->OpenJSONFile();
         batch_index = gstate.batch_index++;
-        if (options.format == JSONFormat::UNSTRUCTURED
+        if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+                                                           options.compression != FileCompressionType::UNCOMPRESSED &&
+                                                           gstate.file_index < gstate.json_readers.size())) {
             gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
         }
         if (options.format != JSONFormat::AUTO_DETECT) {
@@ -449,9 +556,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
     current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
-    if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
-        // TODO: store buffer
-    }
 
     buffer_offset = 0;
     prev_buffer_remainder = 0;
@@ -507,16 +611,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 }
 
 void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
-    auto &file_handle = current_reader->GetFileHandle();
-
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-
-
+        if (current_reader->IsOpen()) {
+            read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+                                                             gstate.bind_data.type == JSONScanType::SAMPLE);
+        } else {
+            read_size = 0;
+        }
         is_last = read_size < request_size;
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -578,10 +684,15 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
         current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
     }
 
-
+    values[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
 }
 
 void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+    // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+    // if a different error code happens within the last 50 bytes
+    // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+    static constexpr idx_t END_BOUND = 50;
+
     const auto max_obj_size = reconstruct_buffer.GetSize();
     yyjson_read_err error;
     for (; count < STANDARD_VECTOR_SIZE; count++) {
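The END_BOUND constant works around yyjson not always reporting YYJSON_READ_ERROR_UNEXPECTED_END for a value that is simply cut off at the end of a read buffer: any parse error within the last 50 bytes is now treated as a probable truncation and the tail is carried over for reconstruction. Roughly, the decision looks like this against the public yyjson error API (simplified; the real code reuses its allocator and reads in situ):

#include <cstdio>
#include "yyjson.h"

// Parse one value from buf; distinguish "real error" from "probably
// truncated, retry with more data". END_BOUND mirrors the diff's heuristic.
static yyjson_doc *TryParseValue(char *buf, size_t len, bool is_last_buffer, bool *retry) {
    static const size_t END_BOUND = 50;
    yyjson_read_err err;
    yyjson_doc *doc = yyjson_read_opts(buf, len, YYJSON_READ_STOP_WHEN_DONE, NULL, &err);
    *retry = false;
    if (doc) {
        return doc;
    }
    if (!is_last_buffer && (err.code == YYJSON_READ_ERROR_UNEXPECTED_END || len - err.pos < END_BOUND)) {
        *retry = true; // caller copies the tail into a reconstruct buffer and reads on
        return NULL;
    }
    fprintf(stderr, "JSON parse error at byte %zu: %s\n", err.pos, err.msg);
    return NULL;
}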
@@ -607,8 +718,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else if (error.pos > max_obj_size) {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
                                             "Try increasing \"maximum_object_size\".");
-
-        } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+        } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
             // Copy remaining to reconstruct_buffer
             const auto reconstruct_ptr = reconstruct_buffer.get();
             memcpy(reconstruct_ptr, obj_copy_start, remaining);
@@ -618,7 +728,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
         }
-
+        values[count] = read_doc->root;
     }
 }
 
@@ -644,7 +754,7 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
         }
         idx_t line_size = line_end - line_start;
 
-
+        values[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
 
         buffer_offset += line_size;
         SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
|
|
|
655
765
|
return json_allocator.GetYYJSONAllocator();
|
|
656
766
|
}
|
|
657
767
|
|
|
658
|
-
void JSONScanLocalState::ThrowTransformError(idx_t
|
|
768
|
+
void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
|
|
659
769
|
D_ASSERT(current_reader);
|
|
660
770
|
D_ASSERT(current_buffer_handle);
|
|
661
771
|
D_ASSERT(object_index != DConstants::INVALID_INDEX);
|
|
662
|
-
auto line_or_object_in_buffer = lines_or_objects_in_buffer -
|
|
772
|
+
auto line_or_object_in_buffer = lines_or_objects_in_buffer - scan_count + object_index;
|
|
663
773
|
current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
|
|
664
774
|
}
|
|
665
775
|
|
|
@@ -223,7 +223,7 @@ public:
     FileSystem &fs = FileSystem::GetFileSystem(context);
     auto files = fs.Glob(info.file_path, context);
     if (files.empty()) {
-        throw
+        throw FileSystem::MissingFileException(info.file_path, context);
     }
 
     // The most likely path (Parquet read without union by name option)
@@ -363,8 +363,9 @@ public:
 
 static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
     auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
+
     if (files.empty()) {
-        throw
+        throw FileSystem::MissingFileException(glob, context);
     }
     return files;
 }