duckdb 1.3.1-dev6.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/aggregate/distributive/arg_min_max.cpp +27 -39
- package/src/duckdb/extension/core_functions/aggregate/holistic/quantile.cpp +2 -3
- package/src/duckdb/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp +1 -1
- package/src/duckdb/extension/core_functions/lambda_functions.cpp +16 -14
- package/src/duckdb/extension/core_functions/scalar/list/list_filter.cpp +3 -2
- package/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp +46 -10
- package/src/duckdb/extension/core_functions/scalar/list/list_transform.cpp +3 -2
- package/src/duckdb/extension/core_functions/scalar/random/random.cpp +3 -1
- package/src/duckdb/extension/icu/icu-datefunc.cpp +5 -3
- package/src/duckdb/extension/icu/icu-strptime.cpp +6 -1
- package/src/duckdb/extension/icu/icu-timezone.cpp +4 -0
- package/src/duckdb/extension/icu/icu_extension.cpp +7 -2
- package/src/duckdb/extension/icu/include/icu-datefunc.hpp +1 -1
- package/src/duckdb/extension/icu/include/icu-helpers.hpp +1 -1
- package/src/duckdb/extension/icu/third_party/icu/common/uloc.cpp +5 -5
- package/src/duckdb/extension/json/include/json_common.hpp +19 -0
- package/src/duckdb/extension/json/include/json_deserializer.hpp +1 -4
- package/src/duckdb/extension/json/include/json_functions.hpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +38 -17
- package/src/duckdb/extension/json/json_functions/json_table_in_out.cpp +11 -7
- package/src/duckdb/extension/json/json_functions.cpp +4 -4
- package/src/duckdb/extension/json/json_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +7 -1
- package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_dbp_encoder.hpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +2 -1
- package/src/duckdb/extension/parquet/include/parquet_statistics.hpp +1 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +3 -0
- package/src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp +3 -1
- package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_crypto.cpp +9 -5
- package/src/duckdb/extension/parquet/parquet_extension.cpp +26 -0
- package/src/duckdb/extension/parquet/parquet_float16.cpp +4 -2
- package/src/duckdb/extension/parquet/parquet_metadata.cpp +3 -3
- package/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +12 -0
- package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -4
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +13 -3
- package/src/duckdb/extension/parquet/parquet_writer.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/decimal_column_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/struct_column_reader.cpp +13 -4
- package/src/duckdb/extension/parquet/serialize_parquet.cpp +2 -0
- package/src/duckdb/src/catalog/catalog.cpp +10 -4
- package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +4 -10
- package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -2
- package/src/duckdb/src/catalog/catalog_entry/sequence_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp +2 -2
- package/src/duckdb/src/catalog/catalog_entry/type_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_search_path.cpp +7 -1
- package/src/duckdb/src/catalog/catalog_set.cpp +21 -1
- package/src/duckdb/src/common/adbc/adbc.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +17 -5
- package/src/duckdb/src/common/arrow/arrow_converter.cpp +23 -15
- package/src/duckdb/src/common/box_renderer.cpp +1 -2
- package/src/duckdb/src/common/enum_util.cpp +4 -3
- package/src/duckdb/src/common/local_file_system.cpp +13 -12
- package/src/duckdb/src/common/multi_file/multi_file_column_mapper.cpp +35 -12
- package/src/duckdb/src/common/multi_file/multi_file_reader.cpp +13 -3
- package/src/duckdb/src/common/string_util.cpp +7 -5
- package/src/duckdb/src/common/tree_renderer/graphviz_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/html_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/json_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +1 -1
- package/src/duckdb/src/common/types/uuid.cpp +5 -1
- package/src/duckdb/src/common/types.cpp +28 -0
- package/src/duckdb/src/common/virtual_file_system.cpp +5 -0
- package/src/duckdb/src/execution/column_binding_resolver.cpp +49 -30
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +4 -0
- package/src/duckdb/src/execution/join_hashtable.cpp +10 -7
- package/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +3 -3
- package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +1 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +2 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/skip_scanner.cpp +1 -4
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +53 -1
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +58 -59
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +10 -5
- package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +4 -0
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +18 -8
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_attach.cpp +1 -0
- package/src/duckdb/src/execution/physical_plan_generator.cpp +5 -5
- package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +2 -1
- package/src/duckdb/src/function/function.cpp +4 -0
- package/src/duckdb/src/function/scalar/operator/arithmetic.cpp +6 -0
- package/src/duckdb/src/function/scalar/struct/remap_struct.cpp +10 -1
- package/src/duckdb/src/function/table/copy_csv.cpp +1 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +1 -0
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/file_buffer.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/helper.hpp +9 -9
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_column_mapper.hpp +3 -5
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_reader.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/shadow_forbidden_functions.hpp +40 -0
- package/src/duckdb/src/include/duckdb/common/string.hpp +25 -2
- package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +20 -24
- package/src/duckdb/src/include/duckdb/common/types/uhugeint.hpp +20 -24
- package/src/duckdb/src/include/duckdb/common/types.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/unique_ptr.hpp +34 -8
- package/src/duckdb/src/include/duckdb/execution/column_binding_resolver.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +3 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +15 -3
- package/src/duckdb/src/include/duckdb/function/cast/vector_cast_helpers.hpp +2 -2
- package/src/duckdb/src/include/duckdb/function/copy_function.hpp +7 -3
- package/src/duckdb/src/include/duckdb/function/function.hpp +1 -0
- package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +20 -12
- package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +4 -3
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +3 -1
- package/src/duckdb/src/include/duckdb/logging/log_type.hpp +17 -0
- package/src/duckdb/src/include/duckdb/main/attached_database.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_properties.hpp +22 -6
- package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/database_manager.hpp +4 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +27 -13
- package/src/duckdb/src/include/duckdb/main/secret/secret_manager.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/settings.hpp +11 -0
- package/src/duckdb/src/include/duckdb/optimizer/topn_optimizer.hpp +7 -1
- package/src/duckdb/src/include/duckdb/original/std/locale.hpp +10 -0
- package/src/duckdb/src/include/duckdb/original/std/memory.hpp +12 -0
- package/src/duckdb/src/include/duckdb/original/std/sstream.hpp +11 -0
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +5 -3
- package/src/duckdb/src/include/duckdb/storage/buffer/buffer_pool.hpp +4 -2
- package/src/duckdb/src/logging/log_manager.cpp +1 -0
- package/src/duckdb/src/logging/log_types.cpp +40 -0
- package/src/duckdb/src/main/attached_database.cpp +4 -0
- package/src/duckdb/src/main/client_context.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +1 -0
- package/src/duckdb/src/main/database.cpp +1 -0
- package/src/duckdb/src/main/database_manager.cpp +19 -2
- package/src/duckdb/src/main/extension/extension_helper.cpp +4 -3
- package/src/duckdb/src/main/query_profiler.cpp +2 -2
- package/src/duckdb/src/main/query_result.cpp +1 -1
- package/src/duckdb/src/main/secret/secret_manager.cpp +2 -0
- package/src/duckdb/src/main/settings/autogenerated_settings.cpp +7 -0
- package/src/duckdb/src/main/settings/custom_settings.cpp +106 -34
- package/src/duckdb/src/optimizer/optimizer.cpp +1 -1
- package/src/duckdb/src/optimizer/topn_optimizer.cpp +18 -8
- package/src/duckdb/src/parallel/executor.cpp +5 -0
- package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +1 -1
- package/src/duckdb/src/parser/transform/expression/transform_interval.cpp +5 -1
- package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +21 -24
- package/src/duckdb/src/planner/binder/expression/bind_lambda.cpp +10 -8
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +3 -2
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +0 -4
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +3 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +3 -0
- package/src/duckdb/src/planner/expression/bound_cast_expression.cpp +3 -0
- package/src/duckdb/src/planner/expression/bound_columnref_expression.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +0 -1
- package/src/duckdb/src/planner/expression/bound_reference_expression.cpp +1 -1
- package/src/duckdb/src/planner/expression_binder.cpp +4 -2
- package/src/duckdb/src/planner/logical_operator.cpp +2 -1
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +4 -1
- package/src/duckdb/src/storage/buffer/block_handle.cpp +8 -0
- package/src/duckdb/src/storage/buffer/buffer_pool.cpp +44 -18
- package/src/duckdb/src/storage/caching_file_system.cpp +7 -7
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +4 -3
- package/src/duckdb/src/storage/storage_info.cpp +2 -0
- package/src/duckdb/src/storage/wal_replay.cpp +9 -4
- package/src/duckdb/third_party/fmt/include/fmt/format.h +8 -1
- package/src/duckdb/third_party/fsst/libfsst.cpp +4 -3
- package/src/duckdb/third_party/httplib/httplib.hpp +25 -22
- package/src/duckdb/third_party/hyperloglog/sds.cpp +7 -3
- package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +8 -1
- package/src/duckdb/third_party/re2/re2/filtered_re2.h +8 -2
- package/src/duckdb/third_party/re2/re2/pod_array.h +7 -1
- package/src/duckdb/third_party/re2/re2/re2.cc +6 -2
- package/src/duckdb/third_party/re2/re2/set.cc +1 -1
- package/src/duckdb/third_party/re2/re2/set.h +7 -1
- package/src/duckdb/ub_src_logging.cpp +4 -4
package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp

@@ -171,8 +171,8 @@ ScalarFunctionSet JSONFunctions::GetSerializeSqlFunction() {
 //----------------------------------------------------------------------
 // JSON DESERIALIZE
 //----------------------------------------------------------------------
-static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
-    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
+static vector<unique_ptr<SelectStatement>> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
+    auto doc = yyjson_doc_ptr(JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc));
     if (!doc) {
         throw ParserException("Could not parse json");
     }
@@ -196,16 +196,22 @@ static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
     if (size == 0) {
         throw ParserException("Error parsing json: no statements");
     }
-    if (size > 1) {
-        throw ParserException("Error parsing json: more than one statement");
-    }
-    auto stmt_json = yyjson_arr_get_first(statements);
-    JsonDeserializer deserializer(stmt_json, doc);
-    auto stmt = SelectStatement::Deserialize(deserializer);
-    if (!stmt->node) {
-        throw ParserException("Error parsing json: no select node found in json");
+
+    vector<unique_ptr<SelectStatement>> result;
+
+    idx_t idx;
+    idx_t max;
+    yyjson_val *stmt_json;
+    yyjson_arr_foreach(statements, idx, max, stmt_json) {
+        JsonDeserializer deserializer(stmt_json, doc);
+        auto stmt = SelectStatement::Deserialize(deserializer);
+        if (!stmt->node) {
+            throw ParserException("Error parsing json: no select node found in json");
+        }
+        result.push_back(std::move(stmt));
     }
-    return stmt;
+
+    return result;
 }
 
 //----------------------------------------------------------------------
@@ -217,8 +223,17 @@ static void JsonDeserializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
     auto &inputs = args.data[0];
 
     UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
-        auto stmt = DeserializeSelectStatement(input, alc);
-        return StringVector::AddString(result, stmt->ToString());
+        auto stmts = DeserializeSelectStatement(input, alc);
+        // Combine all statements into a single semicolon separated string
+        string str;
+        for (idx_t i = 0; i < stmts.size(); i++) {
+            if (i > 0) {
+                str += "; ";
+            }
+            str += stmts[i]->ToString();
+        }
+
+        return StringVector::AddString(result, str);
     });
 }
 
@@ -237,8 +252,11 @@ static string ExecuteJsonSerializedSqlPragmaFunction(ClientContext &context, const FunctionParameters &parameters) {
     auto alc = local_state.json_allocator->GetYYAlc();
 
     auto input = parameters.values[0].GetValueUnsafe<string_t>();
-    auto stmt = DeserializeSelectStatement(input, alc);
-    return stmt->ToString();
+    auto stmts = DeserializeSelectStatement(input, alc);
+    if (stmts.size() != 1) {
+        throw BinderException("json_execute_serialized_sql pragma expects exactly one statement");
+    }
+    return stmts[0]->ToString();
 }
 
 PragmaFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlPragmaFunction() {
@@ -268,8 +286,11 @@ struct ExecuteSqlTableFunction {
             throw BinderException("json_execute_serialized_sql cannot execute NULL plan");
         }
         auto serialized = input.inputs[0].GetValueUnsafe<string>();
-        auto stmt = DeserializeSelectStatement(serialized, alc);
-        result->plan = result->con->RelationFromQuery(std::move(stmt));
+        auto stmts = DeserializeSelectStatement(serialized, alc);
+        if (stmts.size() != 1) {
+            throw BinderException("json_execute_serialized_sql expects exactly one statement");
+        }
+        result->plan = result->con->RelationFromQuery(std::move(stmts[0]));
 
         for (auto &col : result->plan->Columns()) {
             return_types.emplace_back(col.Type());
package/src/duckdb/extension/json/json_functions/json_table_in_out.cpp

@@ -124,17 +124,21 @@ struct JSONTableInOutLocalState : LocalTableFunctionState {
         return result;
     }
 
-    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey) {
-
-
-
+    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey, const optional_idx arr_index) {
+        string str;
+        if (vkey) {
+            str = "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
+        } else if (arr_index.IsValid()) {
+            str = "[" + to_string(arr_index.GetIndex()) + "]";
+        }
+        recursion_nodes.emplace_back(str, val);
     }
 
     JSONAllocator json_allocator;
     yyjson_alc *alc;
 
     string path;
-    size_t len;
+    idx_t len;
     yyjson_doc *doc;
     bool initialized;
 
@@ -269,7 +273,7 @@ static void InitializeLocalState(JSONTableInOutLocalState &lstate, DataChunk &input, ...
         result.AddRow<TYPE>(lstate, nullptr, root);
     }
     if (is_container) {
-        lstate.AddRecursionNode(root, nullptr);
+        lstate.AddRecursionNode(root, nullptr, optional_idx());
     }
 }
 
@@ -283,7 +287,7 @@ static bool JSONTableInOutHandleValue(JSONTableInOutLocalState &lstate, ...
     result.AddRow<TYPE>(lstate, child_key, child_val);
     child_index++; // We finished processing the array element
     if (TYPE == JSONTableInOutType::TREE && (unsafe_yyjson_is_arr(child_val) || unsafe_yyjson_is_obj(child_val))) {
-        lstate.AddRecursionNode(child_val, child_key);
+        lstate.AddRecursionNode(child_val, child_key, idx);
         return true; // Break: We added a recursion node, go depth-first
     }
     if (result.count == STANDARD_VECTOR_SIZE) {
package/src/duckdb/extension/json/json_functions.cpp

@@ -14,7 +14,7 @@ namespace duckdb {
 
 using JSONPathType = JSONCommon::JSONPathType;
 
-JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, size_t &len) {
+JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, idx_t &len) {
     if (path_val.IsNull()) {
         throw BinderException("JSON path cannot be NULL");
     }
@@ -60,7 +60,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function, ...
     D_ASSERT(bound_function.arguments.size() == 2);
     bool constant = false;
     string path;
-    size_t len = 0;
+    idx_t len = 0;
     JSONPathType path_type = JSONPathType::REGULAR;
     if (arguments[1]->IsFoldable()) {
         const auto path_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
@@ -80,7 +80,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function, ...
     return make_uniq<JSONReadFunctionData>(constant, std::move(path), len, path_type);
 }
 
-JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<size_t> lens_p)
+JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p)
     : paths(std::move(paths_p)), lens(std::move(lens_p)) {
     for (const auto &path : paths) {
         ptrs.push_back(path.c_str());
@@ -107,7 +107,7 @@ unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context, ...
     }
 
     vector<string> paths;
-    vector<size_t> lens;
+    vector<idx_t> lens;
     auto paths_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
 
     for (auto &path_val : ListValue::GetChildren(paths_val)) {
package/src/duckdb/extension/json/json_reader.cpp

@@ -737,7 +737,7 @@ bool JSONReader::CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state) {
     idx_t prev_buffer_size = previous_buffer_handle->buffer_size - previous_buffer_handle->buffer_start;
     auto prev_buffer_ptr = char_ptr_cast(previous_buffer_handle->buffer.get()) + previous_buffer_handle->buffer_size;
     auto prev_object_start = PreviousNewline(prev_buffer_ptr, prev_buffer_size);
-    auto prev_object_size = prev_buffer_ptr - prev_object_start;
+    auto prev_object_size = NumericCast<idx_t>(prev_buffer_ptr - prev_object_start);
 
     D_ASSERT(scan_state.buffer_offset == options.maximum_object_size);
     if (prev_object_size > scan_state.buffer_offset) {
package/src/duckdb/extension/parquet/column_reader.cpp

@@ -412,7 +412,7 @@ void ColumnReader::DecompressInternal(CompressionCodec::type codec, const_data_ptr_t src, ...
     }
 
     default: {
-        std::stringstream codec_name;
+        duckdb::stringstream codec_name;
         codec_name << codec;
         throw std::runtime_error("Unsupported compression codec \"" + codec_name.str() +
                                  "\". Supported options are uncompressed, brotli, gzip, lz4_raw, snappy or zstd");
@@ -713,6 +713,12 @@ void ColumnReader::ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) {
 
     while (to_skip > 0) {
         auto skip_now = ReadPageHeaders(to_skip);
+        if (page_is_filtered_out) {
+            // the page has been filtered out entirely - skip
+            page_rows_available -= skip_now;
+            to_skip -= skip_now;
+            continue;
+        }
         const auto all_valid = PrepareRead(skip_now, define_out, repeat_out, 0);
 
         const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp

@@ -23,7 +23,7 @@ public:
     template <typename T>
     void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
@@ -44,7 +44,7 @@ public:
     template <typename T>
     void Skip(uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
package/src/duckdb/extension/parquet/include/parquet_dbp_encoder.hpp

@@ -155,8 +155,8 @@ private:
     int64_t verification_data[NUMBER_OF_VALUES_IN_A_MINIBLOCK];
     ByteBuffer byte_buffer(data_ptr_cast(data_packed), write_size);
     bitpacking_width_t bitpack_pos = 0;
-    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, verification_data,
-                                  width);
+    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, reinterpret_cast<uint64_t *>(verification_data),
+                                  NUMBER_OF_VALUES_IN_A_MINIBLOCK, width);
     for (idx_t i = 0; i < NUMBER_OF_VALUES_IN_A_MINIBLOCK; i++) {
         D_ASSERT(src[i] == verification_data[i]);
     }
package/src/duckdb/extension/parquet/include/parquet_reader.hpp

@@ -62,7 +62,7 @@ struct ParquetReaderScanState {
     idx_t group_offset;
     unique_ptr<CachingFileHandle> file_handle;
     unique_ptr<ColumnReader> root_reader;
-    std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
+    duckdb_base_std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
 
     bool finished;
     SelectionVector sel;
@@ -108,6 +108,7 @@ struct ParquetOptions {
 
     vector<ParquetColumnDefinition> schema;
     idx_t explicit_cardinality = 0;
+    bool can_have_nan = false; // if floats or doubles can contain NaN values
 };
 
 struct ParquetOptionsSerialization {
package/src/duckdb/extension/parquet/include/parquet_statistics.hpp

@@ -27,7 +27,7 @@ class ResizeableBuffer;
 struct ParquetStatisticsUtils {
 
     static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
-                                                                const vector<ColumnChunk> &columns);
+                                                                const vector<ColumnChunk> &columns, bool can_have_nan);
 
     static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);
 
package/src/duckdb/extension/parquet/include/parquet_writer.hpp

@@ -134,6 +134,9 @@ public:
     ParquetVersion GetParquetVersion() const {
         return parquet_version;
     }
+    const string &GetFileName() const {
+        return file_name;
+    }
 
     uint32_t Write(const duckdb_apache::thrift::TBase &object);
     uint32_t WriteData(const const_data_ptr_t buffer, const uint32_t buffer_size);
package/src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp

@@ -138,7 +138,9 @@ struct ParquetBaseStringOperator : public BaseParquetOperator {
 
     template <class SRC, class TGT>
     static idx_t GetRowSize(const Vector &vector, idx_t index) {
-        return FlatVector::GetData<string_t>(vector)[index].GetSize();
+        // This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
+        // This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
+        return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
     }
 };
 
package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp

@@ -403,7 +403,7 @@ private:
         break;
     }
     case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
-        if (page_state.bss_initialized) {
+        if (!page_state.bss_initialized) {
             page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
             page_state.bss_initialized = true;
         }
package/src/duckdb/extension/parquet/parquet_crypto.cpp

@@ -300,14 +300,15 @@
 uint32_t ParquetCrypto::Read(TBase &object, TProtocol &iprot, const string &key,
                              const EncryptionUtil &encryption_util_p) {
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // We have to read the whole thing otherwise thrift throws an error before we realize we're decryption is wrong
     auto all = dtrans.ReadAll();
     TCompactProtocolFactoryT<SimpleReadTransport> tsimple_proto_factory;
     auto simple_prot =
-        tsimple_proto_factory.getProtocol(std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
+        tsimple_proto_factory.getProtocol(duckdb_base_std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
 
     // Read the object
     object.read(simple_prot.get());
@@ -319,7 +320,8 @@ uint32_t ParquetCrypto::Write(const TBase &object, TProtocol &oprot, const string &key,
                               const EncryptionUtil &encryption_util_p) {
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the object in memory
@@ -333,7 +335,8 @@ uint32_t ParquetCrypto::ReadData(TProtocol &iprot, const data_ptr_t buffer, const uint32_t buffer_size,
                                  const string &key, const EncryptionUtil &encryption_util_p) {
     // Create decryption protocol
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // Read buffer
@@ -348,7 +351,8 @@ uint32_t ParquetCrypto::WriteData(TProtocol &oprot, const const_data_ptr_t buffer, const uint32_t buffer_size,
     // FIXME: we know the size upfront so we could do a streaming write instead of this
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the data in memory
package/src/duckdb/extension/parquet/parquet_extension.cpp

@@ -243,6 +243,18 @@ struct ParquetWriteBindData : public TableFunctionData {
 
 struct ParquetWriteGlobalState : public GlobalFunctionData {
     unique_ptr<ParquetWriter> writer;
+    optional_ptr<const PhysicalOperator> op;
+
+    void LogFlushingRowGroup(const ColumnDataCollection &buffer, const string &reason) {
+        if (!op) {
+            return;
+        }
+        DUCKDB_LOG(writer->GetContext(), PhysicalOperatorLogType, *op, "ParquetWriter", "FlushRowGroup",
+                   {{"file", writer->GetFileName()},
+                    {"rows", to_string(buffer.Count())},
+                    {"size", to_string(buffer.SizeInBytes())},
+                    {"reason", reason}});
+    }
 
     mutex lock;
     unique_ptr<ColumnDataCollection> combine_buffer;
@@ -446,6 +458,9 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
 
     if (local_state.buffer.Count() >= bind_data.row_group_size ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes) {
+        const string reason =
+            local_state.buffer.Count() >= bind_data.row_group_size ? "ROW_GROUP_SIZE" : "ROW_GROUP_SIZE_BYTES";
+        global_state.LogFlushingRowGroup(local_state.buffer, reason);
         // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
         local_state.append_state.current_chunk_state.handles.clear();
         global_state.writer->Flush(local_state.buffer);
@@ -462,6 +477,7 @@ void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
     if (local_state.buffer.Count() >= bind_data.row_group_size / 2 ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes / 2) {
         // local state buffer is more than half of the row_group_size(_bytes), just flush it
+        global_state.LogFlushingRowGroup(local_state.buffer, "Combine");
         global_state.writer->Flush(local_state.buffer);
         return;
     }
@@ -475,6 +491,7 @@ void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
     // After combining, the combine buffer is more than half of the row_group_size(_bytes), so we flush
     auto owned_combine_buffer = std::move(global_state.combine_buffer);
     guard.unlock();
+    global_state.LogFlushingRowGroup(*owned_combine_buffer, "Combine");
     // Lock free, of course
     global_state.writer->Flush(*owned_combine_buffer);
 }
@@ -489,6 +506,7 @@ void ParquetWriteFinalize(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate) {
     auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
     // flush the combine buffer (if it's there)
     if (global_state.combine_buffer) {
+        global_state.LogFlushingRowGroup(*global_state.combine_buffer, "Finalize");
         global_state.writer->Flush(*global_state.combine_buffer);
     }
 
@@ -691,6 +709,13 @@ CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_order, ...
     return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
 }
 //===--------------------------------------------------------------------===//
+// Initialize Logger
+//===--------------------------------------------------------------------===//
+void ParquetWriteInitializeOperator(GlobalFunctionData &gstate, const PhysicalOperator &op) {
+    auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
+    global_state.op = &op;
+}
+//===--------------------------------------------------------------------===//
 // Prepare Batch
 //===--------------------------------------------------------------------===//
 struct ParquetWriteBatchData : public PreparedBatchData {
@@ -889,6 +914,7 @@ void ParquetExtension::Load(DuckDB &db) {
     function.copy_to_combine = ParquetWriteCombine;
     function.copy_to_finalize = ParquetWriteFinalize;
     function.execution_mode = ParquetWriteExecutionMode;
+    function.initialize_operator = ParquetWriteInitializeOperator;
     function.copy_from_bind = MultiFileFunction<ParquetMultiFileInfo>::MultiFileBindCopy;
     function.copy_from_function = scan_fun.functions[0];
     function.prepare_batch = ParquetWritePrepareBatch;
package/src/duckdb/extension/parquet/parquet_float16.cpp

@@ -11,7 +11,9 @@ float Float16ToFloat32(const uint16_t &float16_value) {
     uint32_t sign = float16_value >> 15;
     uint32_t exponent = (float16_value >> 10) & 0x1F;
     uint32_t fraction = (float16_value & 0x3FF);
-    uint32_t float32_value;
+    // Avoid strict aliasing issues and compiler warnings
+    uint32_t float32_value = 0;
+
     if (exponent == 0) {
         if (fraction == 0) {
             // zero
@@ -39,7 +41,7 @@ float Float16ToFloat32(const uint16_t &float16_value) {
         float32_value = (sign << 31) | ((exponent + (127 - 15)) << 23) | (fraction << 13);
     }
 
-    return *reinterpret_cast<float *>(&float32_value);
+    return Load<float>(const_data_ptr_cast(&float32_value));
 }
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/parquet_metadata.cpp

@@ -63,14 +63,14 @@ public:
 
     template <class T>
     string ConvertParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         ss << entry;
         return ss.str();
     }
 
     template <class T>
     string PrintParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         entry.printTo(ss);
         return ss.str();
     }
@@ -652,7 +652,7 @@ void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, ...
     }
 
     auto &allocator = Allocator::DefaultAllocator();
-    auto transport = std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
     auto protocol =
         make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 
package/src/duckdb/extension/parquet/parquet_multi_file_info.cpp

@@ -318,6 +318,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() {
     table_function.named_parameters["schema"] = LogicalTypeId::ANY;
     table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY;
     table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR;
+    table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN;
     table_function.statistics = MultiFileFunction<ParquetMultiFileInfo>::MultiFileScanStats;
     table_function.serialize = ParquetScanSerialize;
     table_function.deserialize = ParquetScanDeserialize;
@@ -365,6 +366,13 @@ bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, ...
         options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]);
         return true;
     }
+    if (key == "can_have_nan") {
+        if (values.size() != 1) {
+            throw BinderException("Parquet can_have_nan cannot be empty!");
+        }
+        options.can_have_nan = GetBooleanArgument(key, values);
+        return true;
+    }
     return false;
 }
 
@@ -393,6 +401,10 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &original_key, ...
         options.debug_use_openssl = BooleanValue::Get(val);
         return true;
     }
+    if (key == "can_have_nan") {
+        options.can_have_nan = BooleanValue::Get(val);
+        return true;
+    }
     if (key == "schema") {
         // Argument is a map that defines the schema
         const auto &schema_value = val;
package/src/duckdb/extension/parquet/parquet_reader.cpp

@@ -48,7 +48,7 @@ using duckdb_parquet::Type;
 
 static unique_ptr<duckdb_apache::thrift::protocol::TProtocol> CreateThriftFileProtocol(CachingFileHandle &file_handle,
                                                                                        bool prefetch_mode) {
-    auto transport = std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
     return make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 }
 
@@ -501,7 +501,7 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(ParquetReader &reader, ...
         stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
         return stats.ToUnique();
     }
-    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns);
+    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, reader.parquet_options.can_have_nan);
 }
 
 ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
@@ -1052,7 +1052,8 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, ...
             *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
     } else if (!is_generated_column && has_min_max &&
                (column_reader.Type().id() == LogicalTypeId::FLOAT ||
-                column_reader.Type().id() == LogicalTypeId::DOUBLE)) {
+                column_reader.Type().id() == LogicalTypeId::DOUBLE) &&
+               parquet_options.can_have_nan) {
         // floating point columns can have NaN values in addition to the min/max bounds defined in the file
         // in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning
         // based on nan
@@ -1116,7 +1117,7 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanState &state, ...
         state.prefetch_mode = false;
     }
 
-        state.file_handle = fs.OpenFile( ...
+        state.file_handle = fs.OpenFile(file, flags);
     }
     state.adaptive_filter.reset();
     state.scan_filters.clear();
package/src/duckdb/extension/parquet/parquet_statistics.cpp

@@ -304,7 +304,8 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, ...
     }
 
 unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
-                                                                             const vector<ColumnChunk> &columns) {
+                                                                             const vector<ColumnChunk> &columns,
+                                                                             bool can_have_nan) {
 
     // Not supported types
     auto &type = schema.type;
@@ -320,7 +321,7 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, ...
     // Recurse into child readers
     for (idx_t i = 0; i < schema.children.size(); i++) {
         auto &child_schema = schema.children[i];
-        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns);
+        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
         StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
     }
     row_group_stats = struct_stats.ToUnique();
@@ -363,7 +364,16 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, ...
         break;
     case LogicalTypeId::FLOAT:
     case LogicalTypeId::DOUBLE:
-        row_group_stats = CreateNumericStats(type, schema, parquet_stats);
+        if (can_have_nan) {
+            // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it
+            // does, we create stats without an upper max value, as NaN compares larger than anything else.
+            row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
+        } else {
+            // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains
+            // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is
+            // probably the best we can do for now.
+            row_group_stats = CreateNumericStats(type, schema, parquet_stats);
+        }
         break;
     case LogicalTypeId::VARCHAR: {
         auto string_stats = StringStats::CreateEmpty(type);
package/src/duckdb/extension/parquet/parquet_writer.cpp

@@ -376,7 +376,7 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file_name_p, ...
     }
 
     TCompactProtocolFactoryT<MyTransport> tproto_factory;
-    protocol = tproto_factory.getProtocol(std::make_shared<MyTransport>(*writer));
+    protocol = tproto_factory.getProtocol(duckdb_base_std::make_shared<MyTransport>(*writer));
 
     file_meta_data.num_rows = 0;
     file_meta_data.version = 1;
package/src/duckdb/extension/parquet/reader/decimal_column_reader.cpp

@@ -46,7 +46,7 @@ double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size, ...
 }
 
 unique_ptr<ColumnReader> ParquetDecimalUtils::CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
-    if (schema. ...
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         return CreateDecimalReaderInternal<true>(reader, schema);
     } else {
         return CreateDecimalReaderInternal<false>(reader, schema);
package/src/duckdb/extension/parquet/reader/string_column_reader.cpp

@@ -11,7 +11,7 @@ namespace duckdb {
 StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
     : ColumnReader(reader, schema) {
     fixed_width_string_length = 0;
-    if (schema. ...
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         fixed_width_string_length = schema.type_length;
     }
 }
package/src/duckdb/extension/parquet/reader/struct_column_reader.cpp

@@ -118,12 +118,21 @@ static bool TypeHasExactRowCount(const LogicalType &type) {
 }
 
 idx_t StructColumnReader::GroupRowsAvailable() {
-    for (auto &child : child_readers) {
-        if (TypeHasExactRowCount(child->Type())) {
-            return child->GroupRowsAvailable();
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
+        }
+        if (TypeHasExactRowCount(child->Type())) {
+            return child->GroupRowsAvailable();
+        }
+    }
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
         }
+        return child->GroupRowsAvailable();
     }
-    return child_readers[0]->GroupRowsAvailable();
+    throw InternalException("No projected columns in struct?");
 }
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/serialize_parquet.cpp

@@ -73,6 +73,7 @@ void ParquetOptionsSerialization::Serialize(Serializer &serializer) const {
     serializer.WritePropertyWithDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", parquet_options.encryption_config, nullptr);
     serializer.WritePropertyWithDefault<bool>(105, "debug_use_openssl", parquet_options.debug_use_openssl, true);
    serializer.WritePropertyWithDefault<idx_t>(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0);
+    serializer.WritePropertyWithDefault<bool>(107, "can_have_nan", parquet_options.can_have_nan, false);
 }
 
 ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
@@ -84,6 +85,7 @@ ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
     deserializer.ReadPropertyWithExplicitDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", result.parquet_options.encryption_config, nullptr);
     deserializer.ReadPropertyWithExplicitDefault<bool>(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true);
     deserializer.ReadPropertyWithExplicitDefault<idx_t>(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0);
+    deserializer.ReadPropertyWithExplicitDefault<bool>(107, "can_have_nan", result.parquet_options.can_have_nan, false);
     return result;
 }