npm - duckdb - Versions diffs - 0.7.1-dev90.0 → 0.7.1 - Mend

duckdb 0.7.1-dev90.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

package/src/duckdb/extension/json/json_scan.cpp CHANGED Viewed

@@ -1,9 +1,9 @@
 #include "json_scan.hpp"
 #include "duckdb/main/database.hpp"
+#include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/main/extension_helper.hpp"
 namespace duckdb {
@@ -20,8 +20,9 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
 	auto &options = result->options;
 	auto &info = (JSONScanInfo &)*input.info;
-	options.format = info.format;
 	result->type = info.type;
+	options.format = info.format;
+	result->record_type = info.record_type;
 	result->auto_detect = info.auto_detect;
 	vector<string> patterns;
@@ -40,16 +41,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
 			result->ignore_errors = BooleanValue::Get(kv.second);
 		} else if (loption == "maximum_object_size") {
 			result->maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), result->maximum_object_size);
-		} else if (loption == "format") {
+		} else if (loption == "lines") {
 			auto format = StringUtil::Lower(StringValue::Get(kv.second));
 			if (format == "auto") {
 				options.format = JSONFormat::AUTO_DETECT;
-			} else if (format == "unstructured") {
+			} else if (format == "false") {
 				options.format = JSONFormat::UNSTRUCTURED;
-			} else if (format == "newline_delimited") {
+			} else if (format == "true") {
 				options.format = JSONFormat::NEWLINE_DELIMITED;
 			} else {
-				throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
+				throw BinderException("\"lines\" must be one of ['auto', 'true', 'false']");
 			}
 		} else if (loption == "compression") {
 			auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -74,10 +75,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
                                        vector<string> &file_paths) {
 	auto &fs = FileSystem::GetFileSystem(context);
 	for (auto &file_pattern : patterns) {
-		auto found_files = fs.Glob(file_pattern, context);
-		if (found_files.empty()) {
-			throw FileSystem::MissingFileException(file_pattern, context);
-		}
+		auto found_files = fs.GlobFiles(file_pattern, context);
 		file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
 	}
 }
@@ -98,6 +96,27 @@ void JSONScanData::InitializeFormats() {
 	if (!timestamp_format.empty()) {
 		date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
 	}
+	if (auto_detect) {
+		static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+		    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+		    {LogicalTypeId::TIMESTAMP,
+		     {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+		      "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+		};
+		// Populate possible date/timestamp formats, assume this is consistent across columns
+		for (auto &kv : FORMAT_TEMPLATES) {
+			const auto &type = kv.first;
+			if (date_format_map.HasFormats(type)) {
+				continue; // Already populated
+			}
+			const auto &format_strings = kv.second;
+			for (auto &format_string : format_strings) {
+				date_format_map.AddFormat(type, format_string);
+			}
+		}
+	}
 }
 void JSONScanData::Serialize(FieldWriter &writer) {
@@ -112,9 +131,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
 	writer.WriteList<string>(names);
 	writer.WriteList<idx_t>(valid_cols);
 	writer.WriteField<idx_t>(max_depth);
-	writer.WriteField<bool>(objects);
-	writer.WriteString(date_format);
-	writer.WriteString(timestamp_format);
+	writer.WriteField<JSONRecordType>(record_type);
+	if (!date_format.empty()) {
+		writer.WriteString(date_format);
+	} else {
+		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+	}
+	if (!timestamp_format.empty()) {
+		writer.WriteString(timestamp_format);
+	} else {
+		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+	}
 }
 void JSONScanData::Deserialize(FieldReader &reader) {
@@ -129,9 +156,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
 	names = reader.ReadRequiredList<string>();
 	valid_cols = reader.ReadRequiredList<idx_t>();
 	max_depth = reader.ReadRequired<idx_t>();
-	objects = reader.ReadRequired<bool>();
+	record_type = reader.ReadRequired<JSONRecordType>();
 	date_format = reader.ReadRequired<string>();
 	timestamp_format = reader.ReadRequired<string>();
+	InitializeFormats();
+	transform_options.date_format_map = &date_format_map;
 }
 JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -150,11 +180,11 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
 }
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+    : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
       json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-      buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
-	// Buffer to reconstruct JSON objects when they cross a buffer boundary
+	// Buffer to reconstruct JSON values when they cross a buffer boundary
 	reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
 	// This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
@@ -174,11 +204,6 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
 	// Perform projection pushdown
 	if (bind_data.type == JSONScanType::READ_JSON) {
 		D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
-		if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
-			// If we are auto-detecting, but don't need all columns present in the file,
-			// then we don't need to throw an error if we encounter an unseen column
-			bind_data.transform_options.error_unknown_key = false;
-		}
 		vector<string> names;
 		names.reserve(input.column_ids.size());
 		for (idx_t i = 0; i < input.column_ids.size(); i++) {
@@ -189,13 +214,37 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
 			names.push_back(std::move(bind_data.names[id]));
 			bind_data.valid_cols.push_back(i);
 		}
+		if (names.size() < bind_data.names.size()) {
+			// If the projection doesn't need all columns present in the file,
+			// then we don't need to throw an error if we encounter an unseen column
+			bind_data.transform_options.error_unknown_key = false;
+		}
 		bind_data.names = std::move(names);
 	}
 	return result;
 }
 idx_t JSONGlobalTableFunctionState::MaxThreads() const {
-	return state.system_threads;
+	// Upper bound on scan threads: the number of files when each file can only be
+	// read by a single reader, otherwise the system thread count
+	const auto &scan_data = state.bind_data;
+	// Unstructured necessitates single thread
+	bool single_reader_per_file = scan_data.options.format == JSONFormat::UNSTRUCTURED;
+	if (!single_reader_per_file && !state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+		// Auto-detected unstructured - same story, compression also really limits parallelism
+		const auto &reader_options = state.json_readers[0]->GetOptions();
+		single_reader_per_file = reader_options.format == JSONFormat::UNSTRUCTURED ||
+		                         reader_options.compression != FileCompressionType::UNCOMPRESSED;
+	}
+	if (!single_reader_per_file) {
+		return state.system_threads;
+	}
+	// Exactly one reader per file
+	return scan_data.file_paths.size();
 }
 JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -231,6 +280,12 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
 	json_allocator.Reset();
+	if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+	     gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
+	    array_idx < scan_count) {
+		return GetObjectsFromArray(gstate);
+	}
 	idx_t count = 0;
 	if (buffer_offset == buffer_size) {
 		if (!ReadNextBuffer(gstate)) {
@@ -254,10 +309,18 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
 	default:
 		throw InternalException("Unknown JSON format");
 	}
+	scan_count = count;
 	// Skip over any remaining whitespace for the next scan
 	SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+	if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+	    gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+		array_idx = 0;
+		array_offset = 0;
+		return GetObjectsFromArray(gstate);
+	}
 	return count;
 }
@@ -332,10 +395,48 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
 	}
 }
+idx_t JSONScanLocalState::GetObjectsFromArray(JSONScanGlobalState &gstate) {
+	// Unpacks the elements of the scanned top-level arrays (values[0 .. scan_count)) into array_values,
+	// at most STANDARD_VECTOR_SIZE per call. array_idx / array_offset remember where to resume.
+	idx_t emitted = 0;
+	size_t elem_idx, elem_count;
+	yyjson_val *elem;
+	while (array_idx < scan_count) {
+		auto &root = values[array_idx];
+		if (root) {
+			if (unsafe_yyjson_is_arr(root)) {
+				yyjson_arr_foreach(root, elem_idx, elem_count, elem) {
+					// Elements before array_offset were already emitted by an earlier call
+					if (elem_idx >= array_offset) {
+						array_values[emitted++] = elem;
+						if (emitted == STANDARD_VECTOR_SIZE) {
+							break;
+						}
+					}
+				}
+				array_offset = elem_idx + 1;
+				if (emitted == STANDARD_VECTOR_SIZE) {
+					// Output is full: keep array_idx / array_offset so the next call resumes here
+					return emitted;
+				}
+			} else if (!gstate.bind_data.ignore_errors) {
+				ThrowTransformError(
+				    array_idx,
+				    StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
+				                       JSONCommon::ValTypeToString(root), JSONCommon::ValToString(root, 50)));
+			}
+		}
+		// Done with this value (or nothing to read from it): move on to the next one
+		array_idx++;
+		array_offset = 0;
+	}
+	return emitted;
+}
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	if (current_reader) {
 		D_ASSERT(current_buffer_handle);
 		current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+		if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+			// Close files that are done if we're not sampling
+			current_reader->CloseJSONFile();
+		}
 	}
 	AllocatedData buffer;
@@ -396,7 +497,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 		// Unopened file
 		current_reader->OpenJSONFile();
 		batch_index = gstate.batch_index++;
-		if (options.format == JSONFormat::UNSTRUCTURED) {
+		if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+		                                                   options.compression != FileCompressionType::UNCOMPRESSED &&
+		                                                   gstate.file_index < gstate.json_readers.size())) {
 			gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
 		}
 		if (options.format != JSONFormat::AUTO_DETECT) {
@@ -450,9 +553,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
 	current_buffer_handle = json_buffer_handle.get();
 	current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
-	if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
-		// TODO: store buffer
-	}
 	buffer_offset = 0;
 	prev_buffer_remainder = 0;
@@ -508,16 +608,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 }
 void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
-	auto &file_handle = current_reader->GetFileHandle();
 	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
 	idx_t read_size;
 	{
 		lock_guard<mutex> reader_guard(current_reader->lock);
 		buffer_index = current_reader->GetBufferIndex();
-		read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
-		                             gstate.bind_data.type == JSONScanType::SAMPLE);
+		if (current_reader->IsOpen()) {
+			read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+			                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
+		} else {
+			read_size = 0;
+		}
 		is_last = read_size < request_size;
 		if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -579,10 +681,15 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 		current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
 	}
-	objects[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
+	values[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
 }
 void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+	// yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+	// if a different error code happens within the last 50 bytes
+	// we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+	static constexpr idx_t END_BOUND = 50;
 	const auto max_obj_size = reconstruct_buffer.GetSize();
 	yyjson_read_err error;
 	for (; count < STANDARD_VECTOR_SIZE; count++) {
@@ -608,8 +715,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
 		} else if (error.pos > max_obj_size) {
 			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
 			                                "Try increasing \"maximum_object_size\".");
-		} else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+		} else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
 			// Copy remaining to reconstruct_buffer
 			const auto reconstruct_ptr = reconstruct_buffer.get();
 			memcpy(reconstruct_ptr, obj_copy_start, remaining);
@@ -619,7 +725,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
 		} else {
 			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
 		}
-		objects[count] = read_doc->root;
+		values[count] = read_doc->root;
 	}
 }
@@ -645,7 +751,7 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
 		}
 		idx_t line_size = line_end - line_start;
-		objects[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
+		values[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
 		buffer_offset += line_size;
 		SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
@@ -656,11 +762,11 @@ yyjson_alc *JSONScanLocalState::GetAllocator() {
 	return json_allocator.GetYYJSONAllocator();
 }
-void JSONScanLocalState::ThrowTransformError(idx_t count, idx_t object_index, const string &error_message) {
+void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) { // forwards error_message for the value at object_index to the current reader
 	D_ASSERT(current_reader);
 	D_ASSERT(current_buffer_handle);
 	D_ASSERT(object_index != DConstants::INVALID_INDEX);
-	auto line_or_object_in_buffer = lines_or_objects_in_buffer - count + object_index;
+	auto line_or_object_in_buffer = lines_or_objects_in_buffer - scan_count + object_index; // the scan_count member takes the place of the removed count parameter
 	current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }

package/src/duckdb/extension/parquet/column_reader.cpp CHANGED Viewed

@@ -589,6 +589,7 @@ void StringColumnReader::PrepareDeltaLengthByteArray(ResizeableBuffer &buffer) {
 	}
 	auto length_data = (uint32_t *)length_buffer->ptr;
 	byte_array_data = make_unique<Vector>(LogicalType::VARCHAR, value_count);
+	byte_array_count = value_count;
 	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
 	for (idx_t i = 0; i < value_count; i++) {
 		auto str_len = length_data[i];
@@ -615,6 +616,7 @@ void StringColumnReader::PrepareDeltaByteArray(ResizeableBuffer &buffer) {
 	auto prefix_data = (uint32_t *)prefix_buffer->ptr;
 	auto suffix_data = (uint32_t *)suffix_buffer->ptr;
 	byte_array_data = make_unique<Vector>(LogicalType::VARCHAR, prefix_count);
+	byte_array_count = prefix_count;
 	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
 	for (idx_t i = 0; i < prefix_count; i++) {
 		auto str_len = prefix_data[i] + suffix_data[i];
@@ -646,6 +648,11 @@ void StringColumnReader::DeltaByteArray(uint8_t *defines, idx_t num_values, parq
 			continue;
 		}
 		if (filter[row_idx + result_offset]) {
+			if (delta_offset >= byte_array_count) {
+				throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
+				                  "read of %d from %d entries) - corrupt file?",
+				                  delta_offset + 1, byte_array_count);
+			}
 			result_ptr[row_idx + result_offset] = string_data[delta_offset++];
 		} else {
 			delta_offset++;

package/src/duckdb/extension/parquet/include/column_reader.hpp CHANGED Viewed

@@ -131,6 +131,7 @@ protected:
 	ParquetReader &reader;
 	LogicalType type;
 	unique_ptr<Vector> byte_array_data;
+	idx_t byte_array_count = 0;
 	idx_t pending_skips = 0;

package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED Viewed

@@ -221,10 +221,7 @@ public:
 		}
 		FileSystem &fs = FileSystem::GetFileSystem(context);
-		auto files = fs.Glob(info.file_path, context);
-		if (files.empty()) {
-			throw FileSystem::MissingFileException(info.file_path, context);
-		}
+		auto files = fs.GlobFiles(info.file_path, context);
 		// The most likely path (Parquet read without union by name option)
 		if (!parquet_options.union_by_name) {
@@ -362,12 +359,7 @@ public:
 	}
 	static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
-		auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
-		if (files.empty()) {
-			throw FileSystem::MissingFileException(glob, context);
-		}
-		return files;
+		return fs.GlobFiles(glob, context); // replaces the removed Glob + empty-result check; NOTE(review): presumably GlobFiles raises the missing-file error itself - confirm
 	}
 	static unique_ptr<FunctionData> ParquetScanBind(ClientContext &context, TableFunctionBindInput &input,

package/src/duckdb/src/catalog/catalog.cpp CHANGED Viewed

@@ -2,6 +2,7 @@
 #include "duckdb/catalog/catalog_search_path.hpp"
 #include "duckdb/catalog/catalog_entry/list.hpp"
+#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_set.hpp"
 #include "duckdb/catalog/default/default_schemas.hpp"
 #include "duckdb/catalog/catalog_entry/type_catalog_entry.hpp"
@@ -26,7 +27,7 @@
 #include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
 #include "duckdb/planner/binder.hpp"
 #include "duckdb/catalog/default/default_types.hpp"
-#include "duckdb/main/extension_functions.hpp"
+#include "duckdb/main/extension_entries.hpp"
 #include "duckdb/main/connection.hpp"
 #include "duckdb/main/attached_database.hpp"
 #include "duckdb/main/database_manager.hpp"
@@ -251,6 +252,20 @@ CatalogEntry *Catalog::CreateCollation(CatalogTransaction transaction, SchemaCat
 	return schema->CreateCollation(transaction, info);
 }
+//===--------------------------------------------------------------------===//
+// Index
+//===--------------------------------------------------------------------===//
+CatalogEntry *Catalog::CreateIndex(CatalogTransaction transaction, CreateIndexInfo *info) {
+	// Delegate to the ClientContext overload using the context attached to the transaction
+	return CreateIndex(transaction.GetContext(), info);
+}
+CatalogEntry *Catalog::CreateIndex(ClientContext &context, CreateIndexInfo *info) {
+	// Resolve the schema named in the index info, then the table the index is built on
+	auto target_schema = GetSchema(context, info->schema);
+	const auto &indexed_table_name = info->table->table_name;
+	auto indexed_table = GetEntry<TableCatalogEntry>(context, target_schema->name, indexed_table_name);
+	// The schema entry performs the actual index creation
+	return target_schema->CreateIndex(context, info, indexed_table);
+}
 //===--------------------------------------------------------------------===//
 // Lookup Structures
 //===--------------------------------------------------------------------===//
@@ -317,17 +332,26 @@ SimilarCatalogEntry Catalog::SimilarEntryInSchemas(ClientContext &context, const
 	return result;
 }
-string FindExtension(const string &function_name) {
-	auto size = sizeof(EXTENSION_FUNCTIONS) / sizeof(ExtensionFunction);
-	auto it = std::lower_bound(
-	    EXTENSION_FUNCTIONS, EXTENSION_FUNCTIONS + size, function_name,
-	    [](const ExtensionFunction &element, const string &value) { return element.function < value; });
-	if (it != EXTENSION_FUNCTIONS + size && it->function == function_name) {
+string FindExtensionGeneric(const string &name, const ExtensionEntry entries[], idx_t size) {
+	// Binary search for the lower-cased name in entries (std::lower_bound needs them ordered by name);
+	// yields the owning extension, or "" when there is no exact match
+	const auto needle = StringUtil::Lower(name);
+	const auto entries_end = entries + size;
+	auto it = std::lower_bound(entries, entries_end, needle,
+	                           [](const ExtensionEntry &candidate, const string &key) { return candidate.name < key; });
+	if (it != entries_end && it->name == needle) {
 		return it->extension;
 	}
 	return "";
 }
+string FindExtensionForFunction(const string &name) {
+	// Look the name up in the EXTENSION_FUNCTIONS table; the entry count is derived from the array size
+	return FindExtensionGeneric(name, EXTENSION_FUNCTIONS, sizeof(EXTENSION_FUNCTIONS) / sizeof(ExtensionEntry));
+}
+string FindExtensionForSetting(const string &name) {
+	// Look the name up in the EXTENSION_SETTINGS table; the entry count is derived from the array size
+	return FindExtensionGeneric(name, EXTENSION_SETTINGS, sizeof(EXTENSION_SETTINGS) / sizeof(ExtensionEntry));
+}
 vector<CatalogSearchEntry> GetCatalogEntries(ClientContext &context, const string &catalog, const string &schema) {
 	vector<CatalogSearchEntry> entries;
 	auto &search_path = *context.client_data->catalog_search_path;
@@ -392,6 +416,26 @@ void FindMinimalQualification(ClientContext &context, const string &catalog_name
 	qualify_schema = true;
 }
+//! Builds the exception for an unknown configuration parameter "name".
+//! The exception is returned on every path; the caller is expected to throw it.
+CatalogException Catalog::UnrecognizedConfigurationError(ClientContext &context, const string &name) {
+	// check if the setting exists in any extensions
+	auto extension_name = FindExtensionForSetting(name);
+	if (!extension_name.empty()) {
+		return CatalogException(
+		    "Setting with name \"%s\" is not in the catalog, but it exists in the %s extension.\n\nTo "
+		    "install and load the extension, run:\nINSTALL %s;\nLOAD %s;",
+		    name, extension_name, extension_name, extension_name);
+	}
+	// the setting is not in an extension
+	// get a list of all options
+	vector<string> potential_names = DBConfig::GetOptionNames();
+	for (auto &entry : DBConfig::GetConfig(context).extension_parameters) {
+		potential_names.push_back(entry.first);
+	}
+	// return instead of throwing here, so both paths behave the way the return type promises
+	return CatalogException("unrecognized configuration parameter \"%s\"\n%s", name,
+	                        StringUtil::CandidatesErrorMessage(potential_names, name, "Did you mean"));
+}
 CatalogException Catalog::CreateMissingEntryException(ClientContext &context, const string &entry_name,
                                                       CatalogType type,
                                                       const unordered_set<SchemaCatalogEntry *> &schemas,
@@ -408,13 +452,18 @@ CatalogException Catalog::CreateMissingEntryException(ClientContext &context, co
 			unseen_schemas.insert(current_schema);
 		}
 	}
-	auto unseen_entry = SimilarEntryInSchemas(context, entry_name, type, unseen_schemas);
-	auto extension_name = FindExtension(entry_name);
-	if (!extension_name.empty()) {
-		return CatalogException("Function with name %s is not on the catalog, but it exists in the %s extension. To "
-		                        "Install and Load the extension, run: INSTALL %s; LOAD %s;",
-		                        entry_name, extension_name, extension_name, extension_name);
+	// check if the entry exists in any extension
+	if (type == CatalogType::TABLE_FUNCTION_ENTRY || type == CatalogType::SCALAR_FUNCTION_ENTRY ||
+	    type == CatalogType::AGGREGATE_FUNCTION_ENTRY) {
+		auto extension_name = FindExtensionForFunction(entry_name);
+		if (!extension_name.empty()) {
+			return CatalogException(
+			    "Function with name \"%s\" is not in the catalog, but it exists in the %s extension.\n\nTo "
+			    "install and load the extension, run:\nINSTALL %s;\nLOAD %s;",
+			    entry_name, extension_name, extension_name, extension_name);
+		}
 	}
+	auto unseen_entry = SimilarEntryInSchemas(context, entry_name, type, unseen_schemas);
 	string did_you_mean;
 	if (unseen_entry.Found() && unseen_entry.distance < entry.distance) {
 		// the closest matching entry requires qualification as it is not in the default search path

package/src/duckdb/src/catalog/catalog_entry/index_catalog_entry.cpp CHANGED Viewed

@@ -19,10 +19,11 @@ string IndexCatalogEntry::ToSQL() {
 	return sql;
 }
-void IndexCatalogEntry::Serialize(duckdb::MetaBlockWriter &serializer) {
-	// Here we serialize the index metadata in the following order:
-	// schema name, table name, index name, sql, index type, index constraint type, expression list.
-	// column_ids, unbound_expression
+void IndexCatalogEntry::Serialize(Serializer &serializer) {
+	// here we serialize the index metadata in the following order:
+	// schema name, table name, index name, sql, index type, index constraint type, expression list, parsed expressions,
+	// column IDs
 	FieldWriter writer(serializer);
 	writer.WriteString(GetSchemaName());
 	writer.WriteString(GetTableName());
@@ -37,9 +38,9 @@ void IndexCatalogEntry::Serialize(duckdb::MetaBlockWriter &serializer) {
 }
 unique_ptr<CreateIndexInfo> IndexCatalogEntry::Deserialize(Deserializer &source, ClientContext &context) {
-	// Here we deserialize the index metadata in the following order:
-	// root block, root offset, schema name, table name, index name, sql, index type, index constraint type, expression
-	// list.
+	// here we deserialize the index metadata in the following order:
+	// schema name, table schema name, table name, index name, sql, index type, index constraint type, expression list,
+	// parsed expression list, column IDs
 	auto create_index_info = make_unique<CreateIndexInfo>();

package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp CHANGED Viewed

@@ -24,7 +24,7 @@ SimilarCatalogEntry SchemaCatalogEntry::GetSimilarEntry(CatalogTransaction trans
                                                         const string &name) {
 	SimilarCatalogEntry result;
 	Scan(transaction.GetContext(), type, [&](CatalogEntry *entry) {
-		auto ldist = StringUtil::LevenshteinDistance(entry->name, name);
+		auto ldist = StringUtil::SimilarityScore(entry->name, name);
 		if (ldist < result.distance) {
 			result.distance = ldist;
 			result.name = entry->name;

package/src/duckdb/src/catalog/catalog_set.cpp CHANGED Viewed

@@ -460,7 +460,7 @@ SimilarCatalogEntry CatalogSet::SimilarEntry(CatalogTransaction transaction, con
 	for (auto &kv : mapping) {
 		auto mapping_value = GetMapping(transaction, kv.first);
 		if (mapping_value && !mapping_value->deleted) {
-			auto ldist = StringUtil::LevenshteinDistance(kv.first, name);
+			auto ldist = StringUtil::SimilarityScore(kv.first, name);
 			if (ldist < result.distance) {
 				result.distance = ldist;
 				result.name = kv.first;

package/src/duckdb/src/catalog/default/default_functions.cpp CHANGED Viewed

@@ -93,6 +93,7 @@ static DefaultMacro internal_macros[] = {
 	{DEFAULT_SCHEMA, "fdiv", {"x", "y", nullptr}, "floor(x/y)"},
 	{DEFAULT_SCHEMA, "fmod", {"x", "y", nullptr}, "(x-y*floor(x/y))"},
 	{DEFAULT_SCHEMA, "count_if", {"l", nullptr}, "sum(if(l, 1, 0))"},
+	{DEFAULT_SCHEMA, "split_part", {"string", "delimiter", "position", nullptr}, "coalesce(string_split(string, delimiter)[position],'')"},
 	// algebraic list aggregates
 	{DEFAULT_SCHEMA, "list_avg", {"l", nullptr}, "list_aggr(l, 'avg')"},

package/src/duckdb/src/catalog/default/default_views.cpp CHANGED Viewed

@@ -48,7 +48,7 @@ static DefaultView internal_views[] = {
     {"pg_catalog", "pg_views", "SELECT schema_name schemaname, view_name viewname, 'duckdb' viewowner, sql definition FROM duckdb_views()"},
     {"information_schema", "columns", "SELECT database_name table_catalog, schema_name table_schema, table_name, column_name, column_index ordinal_position, column_default, CASE WHEN is_nullable THEN 'YES' ELSE 'NO' END is_nullable, data_type, character_maximum_length, NULL character_octet_length, numeric_precision, numeric_precision_radix, numeric_scale, NULL datetime_precision, NULL interval_type, NULL interval_precision, NULL character_set_catalog, NULL character_set_schema, NULL character_set_name, NULL collation_catalog, NULL collation_schema, NULL collation_name, NULL domain_catalog, NULL domain_schema, NULL domain_name, NULL udt_catalog, NULL udt_schema, NULL udt_name, NULL scope_catalog, NULL scope_schema, NULL scope_name, NULL maximum_cardinality, NULL dtd_identifier, NULL is_self_referencing, NULL is_identity, NULL identity_generation, NULL identity_start, NULL identity_increment, NULL identity_maximum, NULL identity_minimum, NULL identity_cycle, NULL is_generated, NULL generation_expression, NULL is_updatable FROM duckdb_columns;"},
     {"information_schema", "schemata", "SELECT database_name catalog_name, schema_name, 'duckdb' schema_owner, NULL default_character_set_catalog, NULL default_character_set_schema, NULL default_character_set_name, sql sql_path FROM duckdb_schemas()"},
-    {"information_schema", "tables", "SELECT database_name table_catalog, schema_name table_schema, table_name, CASE WHEN temporary THEN 'LOCAL TEMPORARY' ELSE 'BASE TABLE' END table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'YES' is_insertable_into, 'NO' is_typed, CASE WHEN temporary THEN 'PRESERVE' ELSE NULL END commit_action FROM duckdb_tables() UNION ALL SELECT NULL table_catalog, schema_name table_schema, view_name table_name, 'VIEW' table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'NO' is_insertable_into, 'NO' is_typed, NULL commit_action FROM duckdb_views;"},
+    {"information_schema", "tables", "SELECT database_name table_catalog, schema_name table_schema, table_name, CASE WHEN temporary THEN 'LOCAL TEMPORARY' ELSE 'BASE TABLE' END table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'YES' is_insertable_into, 'NO' is_typed, CASE WHEN temporary THEN 'PRESERVE' ELSE NULL END commit_action FROM duckdb_tables() UNION ALL SELECT database_name table_catalog, schema_name table_schema, view_name table_name, 'VIEW' table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'NO' is_insertable_into, 'NO' is_typed, NULL commit_action FROM duckdb_views;"},
     {nullptr, nullptr, nullptr}};
 static unique_ptr<CreateViewInfo> GetDefaultView(ClientContext &context, const string &input_schema, const string &input_name) {