duckdb 1.4.0 → 1.4.1-dev2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/scalar/generic/current_setting.cpp +1 -4
- package/src/duckdb/extension/icu/icu-strptime.cpp +2 -1
- package/src/duckdb/extension/json/include/json_common.hpp +2 -4
- package/src/duckdb/extension/json/json_functions.cpp +5 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +31 -21
- package/src/duckdb/extension/parquet/geo_parquet.cpp +21 -6
- package/src/duckdb/extension/parquet/include/column_writer.hpp +2 -2
- package/src/duckdb/extension/parquet/include/geo_parquet.hpp +28 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +7 -2
- package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp +13 -0
- package/src/duckdb/extension/parquet/include/writer/array_column_writer.hpp +4 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +56 -1
- package/src/duckdb/extension/parquet/parquet_reader.cpp +4 -1
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +5 -7
- package/src/duckdb/extension/parquet/parquet_writer.cpp +15 -8
- package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +17 -4
- package/src/duckdb/extension/parquet/writer/array_column_writer.cpp +22 -28
- package/src/duckdb/extension/parquet/writer/primitive_column_writer.cpp +17 -5
- package/src/duckdb/extension/parquet/writer/struct_column_writer.cpp +3 -2
- package/src/duckdb/src/catalog/catalog_search_path.cpp +2 -2
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -2
- package/src/duckdb/src/common/enum_util.cpp +20 -0
- package/src/duckdb/src/common/file_system.cpp +0 -30
- package/src/duckdb/src/common/sorting/sort.cpp +25 -6
- package/src/duckdb/src/common/sorting/sorted_run_merger.cpp +1 -0
- package/src/duckdb/src/common/string_util.cpp +24 -0
- package/src/duckdb/src/common/virtual_file_system.cpp +59 -10
- package/src/duckdb/src/execution/index/art/art_merger.cpp +0 -3
- package/src/duckdb/src/execution/index/art/prefix.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +1 -1
- package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +2 -2
- package/src/duckdb/src/execution/operator/schema/physical_attach.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_asof_join.cpp +3 -3
- package/src/duckdb/src/function/table/system/duckdb_connection_count.cpp +45 -0
- package/src/duckdb/src/function/table/system/duckdb_settings.cpp +11 -1
- package/src/duckdb/src/function/table/system_functions.cpp +1 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/virtual_file_system.hpp +4 -1
- package/src/duckdb/src/include/duckdb/function/scalar/variant_functions.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
- package/src/duckdb/src/include/duckdb/logging/log_storage.hpp +6 -6
- package/src/duckdb/src/include/duckdb/logging/log_type.hpp +26 -3
- package/src/duckdb/src/include/duckdb/main/attached_database.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +0 -1
- package/src/duckdb/src/include/duckdb/main/connection_manager.hpp +0 -1
- package/src/duckdb/src/include/duckdb/main/database_file_path_manager.hpp +12 -1
- package/src/duckdb/src/include/duckdb/main/database_manager.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/relation/create_table_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/create_view_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/delete_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/explain_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/insert_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/query_relation.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/relation/update_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/write_csv_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/write_parquet_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +3 -1
- package/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp +3 -2
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +62 -3
- package/src/duckdb/src/include/duckdb/planner/expression_binder/lateral_binder.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/operator/logical_cte.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_dependent_join.hpp +3 -3
- package/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/subquery/has_correlated_expressions.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/tableref/bound_joinref.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/compression/alp/alp_analyze.hpp +6 -1
- package/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_analyze.hpp +5 -1
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +9 -7
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/table/array_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +6 -6
- package/src/duckdb/src/include/duckdb/storage/table/list_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +5 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_id_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/standard_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/struct_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -2
- package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +2 -1
- package/src/duckdb/src/include/duckdb/transaction/update_info.hpp +4 -1
- package/src/duckdb/src/include/duckdb/transaction/wal_write_state.hpp +1 -1
- package/src/duckdb/src/logging/log_types.cpp +63 -0
- package/src/duckdb/src/main/attached_database.cpp +16 -3
- package/src/duckdb/src/main/client_context.cpp +27 -19
- package/src/duckdb/src/main/connection.cpp +2 -5
- package/src/duckdb/src/main/database_file_path_manager.cpp +23 -6
- package/src/duckdb/src/main/database_manager.cpp +18 -3
- package/src/duckdb/src/main/http/http_util.cpp +3 -1
- package/src/duckdb/src/main/relation/create_table_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/create_view_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/delete_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/explain_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/insert_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/query_relation.cpp +4 -0
- package/src/duckdb/src/main/relation/update_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/write_csv_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/write_parquet_relation.cpp +8 -0
- package/src/duckdb/src/main/relation.cpp +2 -2
- package/src/duckdb/src/optimizer/filter_combiner.cpp +7 -0
- package/src/duckdb/src/optimizer/filter_pushdown.cpp +9 -3
- package/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp +4 -1
- package/src/duckdb/src/optimizer/rule/comparison_simplification.cpp +3 -7
- package/src/duckdb/src/parser/statement/relation_statement.cpp +1 -4
- package/src/duckdb/src/parser/transform/statement/transform_create_function.cpp +2 -0
- package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +8 -6
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +1 -5
- package/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp +10 -2
- package/src/duckdb/src/planner/binder/statement/bind_pragma.cpp +20 -3
- package/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp +8 -3
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -2
- package/src/duckdb/src/planner/binder.cpp +2 -2
- package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +9 -13
- package/src/duckdb/src/planner/expression_binder/table_function_binder.cpp +4 -0
- package/src/duckdb/src/planner/expression_binder.cpp +3 -1
- package/src/duckdb/src/planner/operator/logical_dependent_join.cpp +2 -2
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +12 -14
- package/src/duckdb/src/planner/subquery/has_correlated_expressions.cpp +1 -1
- package/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +2 -2
- package/src/duckdb/src/storage/compression/bitpacking.cpp +1 -2
- package/src/duckdb/src/storage/data_table.cpp +2 -2
- package/src/duckdb/src/storage/local_storage.cpp +1 -1
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +67 -25
- package/src/duckdb/src/storage/statistics/string_stats.cpp +8 -0
- package/src/duckdb/src/storage/table/array_column_data.cpp +6 -5
- package/src/duckdb/src/storage/table/column_data.cpp +23 -9
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +15 -1
- package/src/duckdb/src/storage/table/list_column_data.cpp +5 -4
- package/src/duckdb/src/storage/table/row_group.cpp +8 -8
- package/src/duckdb/src/storage/table/row_group_collection.cpp +12 -8
- package/src/duckdb/src/storage/table/row_id_column_data.cpp +5 -4
- package/src/duckdb/src/storage/table/standard_column_data.cpp +9 -8
- package/src/duckdb/src/storage/table/struct_column_data.cpp +10 -9
- package/src/duckdb/src/storage/table/update_segment.cpp +12 -10
- package/src/duckdb/src/transaction/commit_state.cpp +18 -0
- package/src/duckdb/src/transaction/duck_transaction.cpp +2 -10
- package/src/duckdb/src/transaction/wal_write_state.cpp +5 -5
- package/src/duckdb/third_party/httplib/httplib.hpp +6 -1
- package/src/duckdb/ub_src_function_table_system.cpp +2 -0
package/package.json
CHANGED
-  "version": "1.4.0",
+  "version": "1.4.1-dev2.0",

package/src/duckdb/extension/core_functions/scalar/generic/current_setting.cpp
CHANGED
@@ -53,10 +53,7 @@ unique_ptr<FunctionData> CurrentSettingBind(ClientContext &context, ScalarFuncti
     if (!context.TryGetCurrentSetting(key, val)) {
         auto extension_name = Catalog::AutoloadExtensionByConfigName(context, key);
         // If autoloader didn't throw, the config is now available
-        if (!context.TryGetCurrentSetting(key, val)) {
-            throw InternalException("Extension %s did not provide the '%s' config setting",
-                                    extension_name.ToStdString(), key);
-        }
+        context.TryGetCurrentSetting(key, val);
     }
 
     bound_function.return_type = val.type();
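The net effect: if an autoloaded extension still does not register the requested setting, binding no longer throws an InternalException; the lookup is simply retried and the value stays as-is. A minimal sketch of how this bind path is reached (not part of the diff; assumes the DuckDB C++ API from duckdb.hpp is linked):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // Binding current_setting() runs CurrentSettingBind above; an unknown key
    // may first trigger extension autoloading by config name.
    auto result = con.Query("SELECT current_setting('threads');");
    result->Print();
    return 0;
}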
package/src/duckdb/extension/icu/icu-strptime.cpp
CHANGED
@@ -221,8 +221,9 @@ struct ICUStrptime : public ICUDateFunc {
             if (!error.empty()) {
                 throw InvalidInputException("Failed to parse format specifier %s: %s", format_string, error);
             }
-            // If any format has UTC offsets, then we have to produce TSTZ
+            // If any format has UTC offsets or names, then we have to produce TSTZ
             has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::TZ_NAME);
+            has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::UTC_OFFSET);
             formats.emplace_back(format);
         }
         if (has_tz) {
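The added line means a %z (UTC offset) specifier now forces a TIMESTAMP WITH TIME ZONE result, as %Z (timezone name) already did. A hedged usage sketch (assumes the ICU extension is available, since strptime's TSTZ handling lives there):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // With this diff, the %z offset alone is enough to produce TIMESTAMPTZ.
    auto result = con.Query("SELECT strptime('2024-06-01 10:00:00+0200', '%Y-%m-%d %H:%M:%S%z');");
    result->Print();
    return 0;
}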
package/src/duckdb/extension/json/include/json_common.hpp
CHANGED
@@ -13,6 +13,7 @@
 #include "duckdb/common/operator/string_cast.hpp"
 #include "duckdb/planner/expression/bound_function_expression.hpp"
 #include "yyjson.hpp"
+#include "duckdb/common/types/blob.hpp"
 
 using namespace duckdb_yyjson; // NOLINT
 
@@ -228,11 +229,8 @@ public:
 
     static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
         D_ASSERT(error.code != YYJSON_READ_SUCCESS);
-        // Go to blob so we can have a better error message for weird strings
-        auto blob = Value::BLOB(string(data, length));
         // Truncate, so we don't print megabytes worth of JSON
-        auto input = blob.ToString();
-        input = input.length() > 50 ? string(input.c_str(), 47) + "..." : input;
+        auto input = length > 50 ? string(data, 47) + "..." : string(data, length);
         // Have to replace \r, otherwise output is unreadable
         input = StringUtil::Replace(input, "\r", "\\r");
         return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
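The rewrite drops the intermediate BLOB value and truncates the raw bytes directly: inputs longer than 50 bytes are clipped to 47 bytes plus "...". A standalone illustration of the same rule using plain std::string (not DuckDB internals):

#include <iostream>
#include <string>

int main() {
    std::string data(60, 'x'); // 60 bytes of malformed "JSON"
    // Mirror of the new single-line truncation in FormatParseError.
    auto input = data.length() > 50 ? data.substr(0, 47) + "..." : data;
    std::cout << input << " (" << input.size() << " bytes)\n"; // prints 50 bytes
    return 0;
}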
package/src/duckdb/extension/json/json_functions.cpp
CHANGED
@@ -394,7 +394,11 @@ void JSONFunctions::RegisterSimpleCastFunctions(ExtensionLoader &loader) {
     loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalTypeId::VARCHAR, CastJSONListToVarchar,
                                 json_list_to_varchar_cost);
 
-    // VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
+    // JSON[] to JSON is allowed implicitly
+    loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalType::JSON(), CastJSONListToVarchar,
+                                100);
+
+    // VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
     const auto varchar_to_json_list_cost =
         CastFunctionSet::ImplicitCastCost(db, LogicalType::VARCHAR, LogicalType::LIST(LogicalType::JSON())) - 1;
     BoundCastInfo varchar_to_json_list_info(CastVarcharToJSONList, nullptr, JSONFunctionLocalState::InitCastLocalState);
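The new registration lets a JSON[] value be used where a single JSON is expected, serialized through the same list-to-VARCHAR cast path at implicit cast cost 100. A usage sketch (assumes the bundled JSON extension is loaded, e.g. via autoloading):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // A list of JSON values can now be cast to a single JSON value.
    auto result = con.Query("SELECT ['1', '{\"a\": 2}']::JSON[]::JSON;");
    result->Print();
    return 0;
}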
package/src/duckdb/extension/parquet/column_writer.cpp
CHANGED
@@ -187,9 +187,12 @@ void ColumnWriter::HandleRepeatLevels(ColumnWriterState &state, ColumnWriterStat
         // no repeat levels without a parent node
         return;
     }
-    while (state.repetition_levels.size() < parent->repetition_levels.size()) {
-        state.repetition_levels.push_back(parent->repetition_levels[state.repetition_levels.size()]);
+    if (state.repetition_levels.size() >= parent->repetition_levels.size()) {
+        return;
     }
+    state.repetition_levels.insert(state.repetition_levels.end(),
+                                   parent->repetition_levels.begin() + state.repetition_levels.size(),
+                                   parent->repetition_levels.end());
 }
 
 void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
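The rewritten HandleRepeatLevels replaces a per-element push_back loop with a single bulk insert of the parent's trailing repetition levels. The same pattern in isolation (standalone sketch, not DuckDB code):

#include <cstdint>
#include <vector>

int main() {
    std::vector<uint16_t> parent_levels {0, 1, 1, 0, 1};
    std::vector<uint16_t> state_levels {0, 1};
    // Append everything the child state is missing in one insert() call.
    if (state_levels.size() < parent_levels.size()) {
        state_levels.insert(state_levels.end(), parent_levels.begin() + state_levels.size(), parent_levels.end());
    }
    return state_levels.size() == parent_levels.size() ? 0 : 1;
}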
@@ -200,36 +203,41 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
         while (state.definition_levels.size() < parent->definition_levels.size()) {
             idx_t current_index = state.definition_levels.size();
             if (parent->definition_levels[current_index] != PARQUET_DEFINE_VALID) {
+                //! Inherit nulls from parent
                 state.definition_levels.push_back(parent->definition_levels[current_index]);
                 state.parent_null_count++;
             } else if (validity.RowIsValid(vector_index)) {
+                //! Produce a non-null define
                 state.definition_levels.push_back(define_value);
             } else {
+                //! Produce a null define
                 if (!can_have_nulls) {
                     throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
                 }
                 state.null_count++;
                 state.definition_levels.push_back(null_value);
             }
+            D_ASSERT(parent->is_empty.empty() || current_index < parent->is_empty.size());
             if (parent->is_empty.empty() || !parent->is_empty[current_index]) {
                 vector_index++;
             }
         }
+        return;
+    }
+
+    // no parent: set definition levels only from this validity mask
+    if (validity.AllValid()) {
+        state.definition_levels.insert(state.definition_levels.end(), count, define_value);
     } else {
-        // no parent: set definition levels only from this validity mask
-        if (validity.AllValid()) {
-            state.definition_levels.insert(state.definition_levels.end(), count, define_value);
-        } else {
-            for (idx_t i = 0; i < count; i++) {
-                const auto is_null = !validity.RowIsValid(i);
-                state.definition_levels.emplace_back(is_null ? null_value : define_value);
-                state.null_count += is_null;
-            }
-        }
-        if (!can_have_nulls && state.null_count != 0) {
-            throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
+        for (idx_t i = 0; i < count; i++) {
+            const auto is_null = !validity.RowIsValid(i);
+            state.definition_levels.emplace_back(is_null ? null_value : define_value);
+            state.null_count += is_null;
         }
     }
+    if (!can_have_nulls && state.null_count != 0) {
+        throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
+    }
 }
 
 //===--------------------------------------------------------------------===//
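Besides the comments and the D_ASSERT, the restructuring turns the if/else into an early return and gives the no-parent case an all-valid fast path that bulk-inserts count copies of define_value. That fast path in isolation (standalone sketch):

#include <cstdint>
#include <vector>

int main() {
    std::vector<uint16_t> definition_levels;
    const uint16_t define_value = 1;
    const std::size_t count = 2048;
    // All-valid fast path: one bulk insert instead of a per-row loop.
    definition_levels.insert(definition_levels.end(), count, define_value);
    return definition_levels.size() == count ? 0 : 1;
}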
@@ -237,7 +245,7 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
 //===--------------------------------------------------------------------===//
 
 ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
-                                                    const LogicalType &type, const string &name,
+                                                    const LogicalType &type, const string &name, bool allow_geometry,
                                                     optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat,
                                                     idx_t max_define, bool can_have_nulls) {
     auto null_type = can_have_nulls ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
@@ -277,7 +285,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         struct_column.children.reserve(child_types.size());
         for (auto &child_type : child_types) {
             struct_column.children.emplace_back(FillParquetSchema(schemas, child_type.second, child_type.first,
-                                                                  child_field_ids, max_repeat, max_define + 1));
+                                                                  allow_geometry, child_field_ids, max_repeat,
+                                                                  max_define + 1));
         }
         return struct_column;
     }
@@ -313,8 +322,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         schemas.push_back(std::move(repeated_element));
 
         ParquetColumnSchema list_column(name, type, max_define, max_repeat, schema_idx, 0);
-        list_column.children.push_back(
-            FillParquetSchema(schemas, child_type, "element", child_field_ids, max_repeat + 1, max_define + 2));
+        list_column.children.push_back(FillParquetSchema(schemas, child_type, "element", allow_geometry,
+                                                         child_field_ids, max_repeat + 1, max_define + 2));
         return list_column;
     }
     if (type.id() == LogicalTypeId::MAP) {
@@ -361,13 +370,14 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         for (idx_t i = 0; i < 2; i++) {
             // key needs to be marked as REQUIRED
             bool is_key = i == 0;
-            auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], child_field_ids, max_repeat + 1,
-                                                  max_define + 2, !is_key);
+            auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], allow_geometry, child_field_ids,
+                                                  max_repeat + 1, max_define + 2, !is_key);
 
             map_column.children.push_back(std::move(child_schema));
         }
         return map_column;
     }
+
     duckdb_parquet::SchemaElement schema_element;
     schema_element.type = ParquetWriter::DuckDBTypeToParquetType(type);
     schema_element.repetition_type = null_type;
@@ -379,7 +389,7 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         schema_element.__isset.field_id = true;
         schema_element.field_id = field_id->field_id;
     }
-    ParquetWriter::SetSchemaProperties(type, schema_element);
+    ParquetWriter::SetSchemaProperties(type, schema_element, allow_geometry);
     schemas.push_back(std::move(schema_element));
     return ParquetColumnSchema(name, type, max_define, max_repeat, schema_idx, 0);
 }
package/src/duckdb/extension/parquet/geo_parquet.cpp
CHANGED
@@ -208,17 +208,19 @@ unique_ptr<GeoParquetFileMetadata> GeoParquetFileMetadata::TryRead(const duckdb_
         throw InvalidInputException("Geoparquet metadata is not an object");
     }
 
-    auto result = make_uniq<GeoParquetFileMetadata>();
+    // We dont actually care about the version for now, as we only support V1+native
+    auto result = make_uniq<GeoParquetFileMetadata>(GeoParquetVersion::BOTH);
 
     // Check and parse the version
     const auto version_val = yyjson_obj_get(root, "version");
     if (!yyjson_is_str(version_val)) {
         throw InvalidInputException("Geoparquet metadata does not have a version");
     }
-    result->version = yyjson_get_str(version_val);
-    if (StringUtil::StartsWith(result->version, "2")) {
-        // Guard against a breaking future 2.0 version
-        throw InvalidInputException("Geoparquet version %s is not supported", result->version);
+
+    auto version = yyjson_get_str(version_val);
+    if (StringUtil::StartsWith(version, "3")) {
+        // Guard against a breaking future 3.0 version
+        throw InvalidInputException("Geoparquet version %s is not supported", version);
     }
 
     // Check and parse the geometry columns
@@ -344,7 +346,20 @@ void GeoParquetFileMetadata::Write(duckdb_parquet::FileMetaData &file_meta_data)
     yyjson_mut_doc_set_root(doc, root);
 
     // Add the version
-    yyjson_mut_obj_add_strncpy(doc, root, "version", version.c_str(), version.size());
+    switch (version) {
+    case GeoParquetVersion::V1:
+    case GeoParquetVersion::BOTH:
+        yyjson_mut_obj_add_strcpy(doc, root, "version", "1.0.0");
+        break;
+    case GeoParquetVersion::V2:
+        yyjson_mut_obj_add_strcpy(doc, root, "version", "2.0.0");
+        break;
+    case GeoParquetVersion::NONE:
+    default:
+        // Should never happen, we should not be writing anything
+        yyjson_mut_doc_free(doc);
+        throw InternalException("GeoParquetVersion::NONE should not write metadata");
+    }
 
     // Add the primary column
     yyjson_mut_obj_add_strncpy(doc, root, "primary_column", primary_geometry_column.c_str(),
package/src/duckdb/extension/parquet/include/column_writer.hpp
CHANGED
@@ -27,7 +27,7 @@ public:
 
     unsafe_vector<uint16_t> definition_levels;
     unsafe_vector<uint16_t> repetition_levels;
-    vector<bool> is_empty;
+    unsafe_vector<uint8_t> is_empty;
     idx_t parent_null_count = 0;
     idx_t null_count = 0;
 
@@ -94,7 +94,7 @@ public:
     }
 
     static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
-                                                 const LogicalType &type, const string &name,
+                                                 const LogicalType &type, const string &name, bool allow_geometry,
                                                  optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat = 0,
                                                  idx_t max_define = 1, bool can_have_nulls = true);
     //! Create the column writer for a specific type recursively
package/src/duckdb/extension/parquet/include/geo_parquet.hpp
CHANGED
@@ -199,6 +199,31 @@ enum class GeoParquetColumnEncoding : uint8_t {
     MULTIPOLYGON,
 };
 
+enum class GeoParquetVersion : uint8_t {
+    // Write GeoParquet 1.0 metadata
+    // GeoParquet 1.0 has the widest support among readers and writers
+    V1,
+
+    // Write GeoParquet 2.0
+    // The GeoParquet 2.0 options is identical to GeoParquet 1.0 except the underlying storage
+    // of spatial columns is Parquet native geometry, where the Parquet writer will include
+    // native statistics according to the underlying Parquet options. Compared to 'BOTH', this will
+    // actually write the metadata as containing GeoParquet version 2.0.0
+    // However, V2 isnt standardized yet, so this option is still a bit experimental
+    V2,
+
+    // Write GeoParquet 1.0 metadata, with native Parquet geometry types
+    // This is a bit of a hold-over option for compatibility with systems that
+    // reject GeoParquet 2.0 metadata, but can read Parquet native geometry types as they simply ignore the extra
+    // logical type. DuckDB v1.4.0 falls into this category.
+    BOTH,
+
+    // Do not write GeoParquet metadata
+    // This option suppresses GeoParquet metadata; however, spatial types will be written as
+    // Parquet native Geometry/Geography.
+    NONE,
+};
+
 struct GeoParquetColumnMetadata {
     // The encoding of the geometry column
     GeoParquetColumnEncoding geometry_encoding;
@@ -215,6 +240,8 @@ struct GeoParquetColumnMetadata {
 
 class GeoParquetFileMetadata {
 public:
+    GeoParquetFileMetadata(GeoParquetVersion geo_parquet_version) : version(geo_parquet_version) {
+    }
     void AddGeoParquetStats(const string &column_name, const LogicalType &type, const GeometryStats &stats);
     void Write(duckdb_parquet::FileMetaData &file_meta_data);
 
@@ -234,8 +261,8 @@ public:
 
 private:
     mutex write_lock;
-    string version = "1.1.0";
     unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
+    GeoParquetVersion version;
 };
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/include/parquet_writer.hpp
CHANGED
@@ -85,7 +85,7 @@ public:
                   shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
                   idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
                   double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
-                  ParquetVersion parquet_version);
+                  ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
     ~ParquetWriter();
 
 public:
@@ -95,7 +95,8 @@ public:
     void Finalize();
 
     static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
-    static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele);
+    static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
+                                    bool allow_geometry);
 
     ClientContext &GetContext() {
         return context;
@@ -139,6 +140,9 @@ public:
     ParquetVersion GetParquetVersion() const {
         return parquet_version;
     }
+    GeoParquetVersion GetGeoParquetVersion() const {
+        return geoparquet_version;
+    }
     const string &GetFileName() const {
         return file_name;
     }
@@ -175,6 +179,7 @@ private:
     bool debug_use_openssl;
     shared_ptr<EncryptionUtil> encryption_util;
     ParquetVersion parquet_version;
+    GeoParquetVersion geoparquet_version;
     vector<ParquetColumnSchema> column_schemas;
 
     unique_ptr<BufferedFileWriter> writer;
package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp
CHANGED
@@ -14,12 +14,25 @@
 namespace duckdb {
 
 class StringColumnReader : public ColumnReader {
+    enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };
+
+    static StringColumnType GetStringColumnType(const LogicalType &type) {
+        if (type.IsJSONType()) {
+            return StringColumnType::JSON;
+        }
+        if (type.id() == LogicalTypeId::VARCHAR) {
+            return StringColumnType::VARCHAR;
+        }
+        return StringColumnType::OTHER;
+    }
+
 public:
     static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
 
 public:
     StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
     idx_t fixed_width_string_length;
+    const StringColumnType string_column_type;
 
 public:
     static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
package/src/duckdb/extension/parquet/include/writer/array_column_writer.hpp
CHANGED
@@ -25,6 +25,10 @@ public:
     void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                  bool vector_can_span_multiple_pages) override;
     void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
+
+protected:
+    void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
+                         idx_t define_value, const bool is_empty = false);
 };
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/parquet_extension.cpp
CHANGED
@@ -238,6 +238,9 @@ struct ParquetWriteBindData : public TableFunctionData {
 
     //! Which encodings to include when writing
     ParquetVersion parquet_version = ParquetVersion::V1;
+
+    //! Which geo-parquet version to use when writing
+    GeoParquetVersion geoparquet_version = GeoParquetVersion::V1;
 };
 
 struct ParquetWriteGlobalState : public GlobalFunctionData {
@@ -291,6 +294,7 @@ static void ParquetListCopyOptions(ClientContext &context, CopyOptionsInput &inp
     copy_options["binary_as_string"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
     copy_options["file_row_number"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
     copy_options["can_have_nan"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
+    copy_options["geoparquet_version"] = CopyOption(LogicalType::VARCHAR, CopyOptionMode::WRITE_ONLY);
 }
 
 static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFunctionBindInput &input,
@@ -426,6 +430,19 @@ static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFun
         } else {
             throw BinderException("Expected parquet_version 'V1' or 'V2'");
         }
+    } else if (loption == "geoparquet_version") {
+        const auto roption = StringUtil::Upper(option.second[0].ToString());
+        if (roption == "NONE") {
+            bind_data->geoparquet_version = GeoParquetVersion::NONE;
+        } else if (roption == "V1") {
+            bind_data->geoparquet_version = GeoParquetVersion::V1;
+        } else if (roption == "V2") {
+            bind_data->geoparquet_version = GeoParquetVersion::V2;
+        } else if (roption == "BOTH") {
+            bind_data->geoparquet_version = GeoParquetVersion::BOTH;
+        } else {
+            throw BinderException("Expected geoparquet_version 'NONE', 'V1' or 'BOTH'");
+        }
     } else {
         throw InternalException("Unrecognized option for PARQUET: %s", option.first.c_str());
     }
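Together with the WRITE_ONLY copy option registered earlier, this parsing block lets users pick the GeoParquet flavor per COPY statement. A hedged usage sketch (assumes the spatial extension can be installed and loaded to create a geometry column):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    con.Query("INSTALL spatial; LOAD spatial;");
    con.Query("CREATE TABLE pts AS SELECT ST_Point(1, 2) AS geom;");
    // New option from this diff: 'NONE', 'V1' (default), 'V2' or 'BOTH'.
    con.Query("COPY pts TO 'pts.parquet' (FORMAT parquet, GEOPARQUET_VERSION 'V2');");
    return 0;
}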
@@ -457,7 +474,8 @@ static unique_ptr<GlobalFunctionData> ParquetWriteInitializeGlobal(ClientContext
         parquet_bind.field_ids.Copy(), parquet_bind.kv_metadata, parquet_bind.encryption_config,
         parquet_bind.dictionary_size_limit, parquet_bind.string_dictionary_page_size_limit,
         parquet_bind.enable_bloom_filters, parquet_bind.bloom_filter_false_positive_ratio,
-        parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version);
+        parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version,
+        parquet_bind.geoparquet_version);
     return std::move(global_state);
 }
 
@@ -626,6 +644,39 @@ ParquetVersion EnumUtil::FromString<ParquetVersion>(const char *value) {
     throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
 }
 
+template <>
+const char *EnumUtil::ToChars<GeoParquetVersion>(GeoParquetVersion value) {
+    switch (value) {
+    case GeoParquetVersion::NONE:
+        return "NONE";
+    case GeoParquetVersion::V1:
+        return "V1";
+    case GeoParquetVersion::V2:
+        return "V2";
+    case GeoParquetVersion::BOTH:
+        return "BOTH";
+    default:
+        throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+    }
+}
+
+template <>
+GeoParquetVersion EnumUtil::FromString<GeoParquetVersion>(const char *value) {
+    if (StringUtil::Equals(value, "NONE")) {
+        return GeoParquetVersion::NONE;
+    }
+    if (StringUtil::Equals(value, "V1")) {
+        return GeoParquetVersion::V1;
+    }
+    if (StringUtil::Equals(value, "V2")) {
+        return GeoParquetVersion::V2;
+    }
+    if (StringUtil::Equals(value, "BOTH")) {
+        return GeoParquetVersion::BOTH;
+    }
+    throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+}
+
 static optional_idx SerializeCompressionLevel(const int64_t compression_level) {
     return compression_level < 0 ? NumericLimits<idx_t>::Maximum() - NumericCast<idx_t>(AbsValue(compression_level))
                                  : NumericCast<idx_t>(compression_level);
@@ -679,6 +730,8 @@ static void ParquetCopySerialize(Serializer &serializer, const FunctionData &bin
     serializer.WritePropertyWithDefault(115, "string_dictionary_page_size_limit",
                                         bind_data.string_dictionary_page_size_limit,
                                         default_value.string_dictionary_page_size_limit);
+    serializer.WritePropertyWithDefault(116, "geoparquet_version", bind_data.geoparquet_version,
+                                        default_value.geoparquet_version);
 }
 
 static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserializer, CopyFunction &function) {
@@ -711,6 +764,8 @@ static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserialize
         deserializer.ReadPropertyWithExplicitDefault(114, "parquet_version", default_value.parquet_version);
     data->string_dictionary_page_size_limit = deserializer.ReadPropertyWithExplicitDefault(
         115, "string_dictionary_page_size_limit", default_value.string_dictionary_page_size_limit);
+    data->geoparquet_version =
+        deserializer.ReadPropertyWithExplicitDefault(116, "geoparquet_version", default_value.geoparquet_version);
 
     return std::move(data);
 }
package/src/duckdb/extension/parquet/parquet_reader.cpp
CHANGED
@@ -570,7 +570,10 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
 
     auto file_meta_data = GetFileMetadata();
     D_ASSERT(file_meta_data);
-    D_ASSERT(next_schema_idx < file_meta_data->schema.size());
+    if (next_schema_idx >= file_meta_data->schema.size()) {
+        throw InvalidInputException("Malformed Parquet schema in file \"%s\": invalid schema index %d", file.path,
+                                    next_schema_idx);
+    }
     auto &s_ele = file_meta_data->schema[next_schema_idx];
     auto this_idx = next_schema_idx;
 
package/src/duckdb/extension/parquet/parquet_statistics.cpp
CHANGED
@@ -395,23 +395,21 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
         }
         break;
     case LogicalTypeId::VARCHAR: {
-        auto string_stats = StringStats::CreateEmpty(type);
+        auto string_stats = StringStats::CreateUnknown(type);
         if (parquet_stats.__isset.min_value) {
             StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
-            StringStats::Update(string_stats, parquet_stats.min_value);
+            StringStats::SetMin(string_stats, parquet_stats.min_value);
         } else if (parquet_stats.__isset.min) {
             StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
-            StringStats::Update(string_stats, parquet_stats.min);
+            StringStats::SetMin(string_stats, parquet_stats.min);
         }
         if (parquet_stats.__isset.max_value) {
             StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
-            StringStats::Update(string_stats, parquet_stats.max_value);
+            StringStats::SetMax(string_stats, parquet_stats.max_value);
         } else if (parquet_stats.__isset.max) {
             StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
-            StringStats::Update(string_stats, parquet_stats.max);
+            StringStats::SetMax(string_stats, parquet_stats.max);
         }
-        StringStats::SetContainsUnicode(string_stats);
-        StringStats::ResetMaxStringLength(string_stats);
         row_group_stats = string_stats.ToUnique();
         break;
     }
package/src/duckdb/extension/parquet/parquet_writer.cpp
CHANGED
@@ -166,7 +166,8 @@ Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type
     throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
 }
 
-void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele) {
+void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
+                                        bool allow_geometry) {
     if (duckdb_type.IsJSONType()) {
         schema_ele.converted_type = ConvertedType::JSON;
         schema_ele.__isset.converted_type = true;
@@ -174,7 +175,7 @@ void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_p
         schema_ele.logicalType.__set_JSON(duckdb_parquet::JsonType());
         return;
     }
-    if (duckdb_type.GetAlias() == "WKB_BLOB") {
+    if (duckdb_type.GetAlias() == "WKB_BLOB" && allow_geometry) {
         schema_ele.__isset.logicalType = true;
         schema_ele.logicalType.__isset.GEOMETRY = true;
         // TODO: Set CRS in the future
@@ -356,14 +357,16 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
                              shared_ptr<ParquetEncryptionConfig> encryption_config_p,
                              optional_idx dictionary_size_limit_p, idx_t string_dictionary_page_size_limit_p,
                              bool enable_bloom_filters_p, double bloom_filter_false_positive_ratio_p,
-                             int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version)
+                             int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version,
+                             GeoParquetVersion geoparquet_version)
     : context(context), file_name(std::move(file_name_p)), sql_types(std::move(types_p)),
       column_names(std::move(names_p)), codec(codec), field_ids(std::move(field_ids_p)),
       encryption_config(std::move(encryption_config_p)), dictionary_size_limit(dictionary_size_limit_p),
       string_dictionary_page_size_limit(string_dictionary_page_size_limit_p),
       enable_bloom_filters(enable_bloom_filters_p),
       bloom_filter_false_positive_ratio(bloom_filter_false_positive_ratio_p), compression_level(compression_level_p),
-      debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), total_written(0), num_row_groups(0) {
+      debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), geoparquet_version(geoparquet_version),
+      total_written(0), num_row_groups(0) {
 
     // initialize the file writer
     writer = make_uniq<BufferedFileWriter>(fs, file_name.c_str(),
@@ -416,10 +419,13 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
     auto &unique_names = column_names;
     VerifyUniqueNames(unique_names);
 
+    // V1 GeoParquet stores geometries as blobs, no logical type
+    auto allow_geometry = geoparquet_version != GeoParquetVersion::V1;
+
     // construct the child schemas
     for (idx_t i = 0; i < sql_types.size(); i++) {
-        auto child_schema =
-            ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i], &field_ids);
+        auto child_schema = ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i],
+                                                            allow_geometry, &field_ids);
         column_schemas.push_back(std::move(child_schema));
     }
     // now construct the writers based on the schemas
@@ -975,7 +981,8 @@ void ParquetWriter::Finalize() {
     }
 
     // Add geoparquet metadata to the file metadata
-    if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context)) {
+    if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context) &&
+        geoparquet_version != GeoParquetVersion::NONE) {
         geoparquet_data->Write(file_meta_data);
     }
 
|
|
1005
1012
|
|
1006
1013
|
GeoParquetFileMetadata &ParquetWriter::GetGeoParquetData() {
|
1007
1014
|
if (!geoparquet_data) {
|
1008
|
-
geoparquet_data = make_uniq<GeoParquetFileMetadata>();
|
1015
|
+
geoparquet_data = make_uniq<GeoParquetFileMetadata>(geoparquet_version);
|
1009
1016
|
}
|
1010
1017
|
return *geoparquet_data;
|
1011
1018
|
}
|
package/src/duckdb/extension/parquet/reader/string_column_reader.cpp
CHANGED
@@ -9,7 +9,7 @@ namespace duckdb {
 // String Column Reader
 //===--------------------------------------------------------------------===//
 StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
-    : ColumnReader(reader, schema) {
+    : ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) {
     fixed_width_string_length = 0;
     if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         fixed_width_string_length = schema.type_length;
@@ -26,13 +26,26 @@ void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, co
     size_t pos;
     auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
     if (utf_type == UnicodeType::INVALID) {
-        throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" +
-                                    Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!");
+        throw InvalidInputException("Invalid string encoding found in Parquet file: value \"%s\" is not valid UTF8!",
+                                    Blob::ToString(string_t(str_data, str_len)));
     }
 }
 
 void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
-    VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR);
+    switch (string_column_type) {
+    case StringColumnType::VARCHAR:
+        VerifyString(str_data, str_len, true);
+        break;
+    case StringColumnType::JSON: {
+        const auto error = StringUtil::ValidateJSON(str_data, str_len);
+        if (!error.empty()) {
+            throw InvalidInputException("Invalid JSON found in Parquet file: %s", error);
+        }
+        break;
+    }
+    default:
+        break;
+    }
 }
 
 class ParquetStringVectorBuffer : public VectorBuffer {
|