onnxruntime-directml 1.20.0 cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. onnxruntime/LICENSE +21 -0
  2. onnxruntime/Privacy.md +21 -0
  3. onnxruntime/ThirdPartyNotices.txt +6508 -0
  4. onnxruntime/__init__.py +78 -0
  5. onnxruntime/backend/__init__.py +6 -0
  6. onnxruntime/backend/backend.py +174 -0
  7. onnxruntime/backend/backend_rep.py +53 -0
  8. onnxruntime/capi/DirectML.dll +0 -0
  9. onnxruntime/capi/__init__.py +4 -0
  10. onnxruntime/capi/_ld_preload.py +7 -0
  11. onnxruntime/capi/_pybind_state.py +33 -0
  12. onnxruntime/capi/convert_npz_to_onnx_adapter.py +48 -0
  13. onnxruntime/capi/onnxruntime.dll +0 -0
  14. onnxruntime/capi/onnxruntime_collect_build_info.py +47 -0
  15. onnxruntime/capi/onnxruntime_inference_collection.py +1108 -0
  16. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  17. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  18. onnxruntime/capi/onnxruntime_validation.py +150 -0
  19. onnxruntime/capi/version_info.py +2 -0
  20. onnxruntime/datasets/__init__.py +17 -0
  21. onnxruntime/datasets/logreg_iris.onnx +0 -0
  22. onnxruntime/datasets/mul_1.onnx +0 -0
  23. onnxruntime/datasets/sigmoid.onnx +13 -0
  24. onnxruntime/quantization/CalTableFlatBuffers/KeyValue.py +78 -0
  25. onnxruntime/quantization/CalTableFlatBuffers/TrtTable.py +90 -0
  26. onnxruntime/quantization/CalTableFlatBuffers/__init__.py +0 -0
  27. onnxruntime/quantization/__init__.py +16 -0
  28. onnxruntime/quantization/base_quantizer.py +532 -0
  29. onnxruntime/quantization/calibrate.py +1245 -0
  30. onnxruntime/quantization/execution_providers/qnn/__init__.py +2 -0
  31. onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py +132 -0
  32. onnxruntime/quantization/execution_providers/qnn/mixed_precision_overrides_utils.py +413 -0
  33. onnxruntime/quantization/execution_providers/qnn/preprocess.py +307 -0
  34. onnxruntime/quantization/execution_providers/qnn/quant_config.py +387 -0
  35. onnxruntime/quantization/fusions/__init__.py +3 -0
  36. onnxruntime/quantization/fusions/fusion.py +311 -0
  37. onnxruntime/quantization/fusions/fusion_gelu.py +272 -0
  38. onnxruntime/quantization/fusions/fusion_layernorm.py +135 -0
  39. onnxruntime/quantization/matmul_4bits_quantizer.py +1480 -0
  40. onnxruntime/quantization/matmul_bnb4_quantizer.py +240 -0
  41. onnxruntime/quantization/onnx_model.py +580 -0
  42. onnxruntime/quantization/onnx_quantizer.py +1008 -0
  43. onnxruntime/quantization/operators/__init__.py +2 -0
  44. onnxruntime/quantization/operators/activation.py +119 -0
  45. onnxruntime/quantization/operators/argmax.py +18 -0
  46. onnxruntime/quantization/operators/attention.py +73 -0
  47. onnxruntime/quantization/operators/base_operator.py +26 -0
  48. onnxruntime/quantization/operators/binary_op.py +72 -0
  49. onnxruntime/quantization/operators/concat.py +62 -0
  50. onnxruntime/quantization/operators/conv.py +258 -0
  51. onnxruntime/quantization/operators/direct_q8.py +78 -0
  52. onnxruntime/quantization/operators/embed_layernorm.py +121 -0
  53. onnxruntime/quantization/operators/gather.py +64 -0
  54. onnxruntime/quantization/operators/gavgpool.py +62 -0
  55. onnxruntime/quantization/operators/gemm.py +166 -0
  56. onnxruntime/quantization/operators/lstm.py +117 -0
  57. onnxruntime/quantization/operators/matmul.py +231 -0
  58. onnxruntime/quantization/operators/maxpool.py +34 -0
  59. onnxruntime/quantization/operators/norm.py +40 -0
  60. onnxruntime/quantization/operators/pad.py +100 -0
  61. onnxruntime/quantization/operators/pooling.py +67 -0
  62. onnxruntime/quantization/operators/qdq_base_operator.py +22 -0
  63. onnxruntime/quantization/operators/resize.py +34 -0
  64. onnxruntime/quantization/operators/softmax.py +74 -0
  65. onnxruntime/quantization/operators/split.py +63 -0
  66. onnxruntime/quantization/operators/where.py +87 -0
  67. onnxruntime/quantization/preprocess.py +141 -0
  68. onnxruntime/quantization/qdq_loss_debug.py +389 -0
  69. onnxruntime/quantization/qdq_quantizer.py +1187 -0
  70. onnxruntime/quantization/quant_utils.py +891 -0
  71. onnxruntime/quantization/quantize.py +748 -0
  72. onnxruntime/quantization/registry.py +106 -0
  73. onnxruntime/quantization/shape_inference.py +187 -0
  74. onnxruntime/quantization/tensor_quant_overrides.py +516 -0
  75. onnxruntime/tools/__init__.py +10 -0
  76. onnxruntime/tools/check_onnx_model_mobile_usability.py +47 -0
  77. onnxruntime/tools/convert_onnx_models_to_ort.py +377 -0
  78. onnxruntime/tools/file_utils.py +46 -0
  79. onnxruntime/tools/logger.py +11 -0
  80. onnxruntime/tools/make_dynamic_shape_fixed.py +72 -0
  81. onnxruntime/tools/mobile_helpers/__init__.py +0 -0
  82. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +33 -0
  83. onnxruntime/tools/mobile_helpers/coreml_supported_neuralnetwork_ops.md +43 -0
  84. onnxruntime/tools/mobile_helpers/nnapi_supported_ops.md +58 -0
  85. onnxruntime/tools/mobile_helpers/usability_checker.py +739 -0
  86. onnxruntime/tools/offline_tuning.py +169 -0
  87. onnxruntime/tools/onnx_model_utils.py +413 -0
  88. onnxruntime/tools/onnx_randomizer.py +85 -0
  89. onnxruntime/tools/onnxruntime_test.py +164 -0
  90. onnxruntime/tools/optimize_onnx_model.py +55 -0
  91. onnxruntime/tools/ort_format_model/__init__.py +25 -0
  92. onnxruntime/tools/ort_format_model/operator_type_usage_processors.py +663 -0
  93. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/__init__.py +0 -0
  94. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgType.py +7 -0
  95. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgTypeAndIndex.py +67 -0
  96. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Attribute.py +337 -0
  97. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/AttributeType.py +18 -0
  98. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Checkpoint.py +125 -0
  99. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedKernelCreateInfos.py +120 -0
  100. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedNodeIndexAndKernelDefHash.py +68 -0
  101. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSessionState.py +96 -0
  102. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSubGraphSessionState.py +72 -0
  103. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Dimension.py +71 -0
  104. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValue.py +80 -0
  105. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValueType.py +8 -0
  106. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/EdgeEnd.py +32 -0
  107. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/FloatProperty.py +67 -0
  108. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Graph.py +320 -0
  109. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/InferenceSession.py +88 -0
  110. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/IntProperty.py +67 -0
  111. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrArgsEntry.py +91 -0
  112. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrResolver.py +78 -0
  113. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/MapType.py +71 -0
  114. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Model.py +223 -0
  115. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ModuleState.py +141 -0
  116. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Node.py +317 -0
  117. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeEdge.py +126 -0
  118. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeType.py +7 -0
  119. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodesToOptimizeIndices.py +160 -0
  120. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OpIdKernelTypeStrArgsEntry.py +91 -0
  121. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OperatorSetId.py +67 -0
  122. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OptimizerGroup.py +117 -0
  123. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ParameterOptimizerState.py +91 -0
  124. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/PropertyBag.py +152 -0
  125. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecord.py +105 -0
  126. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecordContainerEntry.py +91 -0
  127. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizations.py +79 -0
  128. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SequenceType.py +58 -0
  129. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Shape.py +78 -0
  130. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SparseTensor.py +114 -0
  131. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringProperty.py +67 -0
  132. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringStringEntry.py +67 -0
  133. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Tensor.py +203 -0
  134. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorDataType.py +26 -0
  135. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorTypeAndShape.py +71 -0
  136. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfo.py +83 -0
  137. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfoValue.py +9 -0
  138. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ValueInfo.py +84 -0
  139. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/__init__.py +6 -0
  140. onnxruntime/tools/ort_format_model/ort_model_processor.py +86 -0
  141. onnxruntime/tools/ort_format_model/types.py +84 -0
  142. onnxruntime/tools/ort_format_model/utils.py +62 -0
  143. onnxruntime/tools/pytorch_export_contrib_ops.py +108 -0
  144. onnxruntime/tools/pytorch_export_helpers.py +131 -0
  145. onnxruntime/tools/qdq_helpers/__init__.py +0 -0
  146. onnxruntime/tools/qdq_helpers/optimize_qdq_model.py +37 -0
  147. onnxruntime/tools/reduced_build_config_parser.py +202 -0
  148. onnxruntime/tools/symbolic_shape_infer.py +3016 -0
  149. onnxruntime/tools/update_onnx_opset.py +31 -0
  150. onnxruntime/transformers/__init__.py +8 -0
  151. onnxruntime/transformers/affinity_helper.py +40 -0
  152. onnxruntime/transformers/benchmark.py +944 -0
  153. onnxruntime/transformers/benchmark_helper.py +646 -0
  154. onnxruntime/transformers/bert_perf_test.py +634 -0
  155. onnxruntime/transformers/bert_test_data.py +642 -0
  156. onnxruntime/transformers/compare_bert_results.py +246 -0
  157. onnxruntime/transformers/constants.py +47 -0
  158. onnxruntime/transformers/convert_generation.py +3124 -0
  159. onnxruntime/transformers/convert_tf_models_to_pytorch.py +205 -0
  160. onnxruntime/transformers/convert_to_packing_mode.py +387 -0
  161. onnxruntime/transformers/dynamo_onnx_helper.py +104 -0
  162. onnxruntime/transformers/float16.py +501 -0
  163. onnxruntime/transformers/fusion_attention.py +1235 -0
  164. onnxruntime/transformers/fusion_attention_clip.py +257 -0
  165. onnxruntime/transformers/fusion_attention_sam2.py +534 -0
  166. onnxruntime/transformers/fusion_attention_unet.py +1304 -0
  167. onnxruntime/transformers/fusion_attention_vae.py +301 -0
  168. onnxruntime/transformers/fusion_bart_attention.py +640 -0
  169. onnxruntime/transformers/fusion_base.py +137 -0
  170. onnxruntime/transformers/fusion_bias_add.py +58 -0
  171. onnxruntime/transformers/fusion_biasgelu.py +66 -0
  172. onnxruntime/transformers/fusion_biassplitgelu.py +111 -0
  173. onnxruntime/transformers/fusion_conformer_attention.py +143 -0
  174. onnxruntime/transformers/fusion_embedlayer.py +811 -0
  175. onnxruntime/transformers/fusion_fastgelu.py +360 -0
  176. onnxruntime/transformers/fusion_gelu.py +259 -0
  177. onnxruntime/transformers/fusion_gelu_approximation.py +25 -0
  178. onnxruntime/transformers/fusion_gemmfastgelu.py +122 -0
  179. onnxruntime/transformers/fusion_gpt_attention.py +546 -0
  180. onnxruntime/transformers/fusion_gpt_attention_megatron.py +355 -0
  181. onnxruntime/transformers/fusion_gpt_attention_no_past.py +260 -0
  182. onnxruntime/transformers/fusion_group_norm.py +179 -0
  183. onnxruntime/transformers/fusion_layernorm.py +465 -0
  184. onnxruntime/transformers/fusion_nhwc_conv.py +100 -0
  185. onnxruntime/transformers/fusion_options.py +340 -0
  186. onnxruntime/transformers/fusion_qordered_attention.py +421 -0
  187. onnxruntime/transformers/fusion_qordered_gelu.py +119 -0
  188. onnxruntime/transformers/fusion_qordered_layernorm.py +123 -0
  189. onnxruntime/transformers/fusion_qordered_matmul.py +217 -0
  190. onnxruntime/transformers/fusion_quickgelu.py +74 -0
  191. onnxruntime/transformers/fusion_reshape.py +173 -0
  192. onnxruntime/transformers/fusion_rotary_attention.py +1592 -0
  193. onnxruntime/transformers/fusion_shape.py +110 -0
  194. onnxruntime/transformers/fusion_simplified_layernorm.py +159 -0
  195. onnxruntime/transformers/fusion_skip_group_norm.py +255 -0
  196. onnxruntime/transformers/fusion_skiplayernorm.py +209 -0
  197. onnxruntime/transformers/fusion_transpose.py +168 -0
  198. onnxruntime/transformers/fusion_utils.py +307 -0
  199. onnxruntime/transformers/huggingface_models.py +167 -0
  200. onnxruntime/transformers/import_utils.py +20 -0
  201. onnxruntime/transformers/io_binding_helper.py +442 -0
  202. onnxruntime/transformers/large_model_exporter.py +395 -0
  203. onnxruntime/transformers/machine_info.py +221 -0
  204. onnxruntime/transformers/metrics.py +164 -0
  205. onnxruntime/transformers/models/bart/__init__.py +12 -0
  206. onnxruntime/transformers/models/bart/export.py +98 -0
  207. onnxruntime/transformers/models/bert/__init__.py +12 -0
  208. onnxruntime/transformers/models/bert/eval_squad.py +329 -0
  209. onnxruntime/transformers/models/gpt2/__init__.py +12 -0
  210. onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +413 -0
  211. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +561 -0
  212. onnxruntime/transformers/models/gpt2/gpt2_helper.py +1032 -0
  213. onnxruntime/transformers/models/gpt2/gpt2_parity.py +513 -0
  214. onnxruntime/transformers/models/gpt2/gpt2_tester.py +501 -0
  215. onnxruntime/transformers/models/gpt2/parity_check_helper.py +146 -0
  216. onnxruntime/transformers/models/llama/__init__.py +12 -0
  217. onnxruntime/transformers/models/llama/benchmark.py +703 -0
  218. onnxruntime/transformers/models/llama/benchmark_all.py +488 -0
  219. onnxruntime/transformers/models/llama/benchmark_e2e.py +606 -0
  220. onnxruntime/transformers/models/llama/convert_to_onnx.py +1027 -0
  221. onnxruntime/transformers/models/llama/dist_settings.py +57 -0
  222. onnxruntime/transformers/models/llama/llama_inputs.py +503 -0
  223. onnxruntime/transformers/models/llama/llama_parity.py +309 -0
  224. onnxruntime/transformers/models/llama/llama_torch.py +47 -0
  225. onnxruntime/transformers/models/llama/quant_kv_dataloader.py +108 -0
  226. onnxruntime/transformers/models/longformer/__init__.py +12 -0
  227. onnxruntime/transformers/models/longformer/benchmark_longformer.py +821 -0
  228. onnxruntime/transformers/models/longformer/convert_to_onnx.py +413 -0
  229. onnxruntime/transformers/models/longformer/generate_test_data.py +347 -0
  230. onnxruntime/transformers/models/longformer/longformer_helper.py +77 -0
  231. onnxruntime/transformers/models/phi2/__init__.py +12 -0
  232. onnxruntime/transformers/models/phi2/convert_to_onnx.py +576 -0
  233. onnxruntime/transformers/models/phi2/inference_example.py +414 -0
  234. onnxruntime/transformers/models/sam2/__init__.py +12 -0
  235. onnxruntime/transformers/models/sam2/benchmark_sam2.py +625 -0
  236. onnxruntime/transformers/models/sam2/convert_to_onnx.py +260 -0
  237. onnxruntime/transformers/models/sam2/image_decoder.py +273 -0
  238. onnxruntime/transformers/models/sam2/image_encoder.py +186 -0
  239. onnxruntime/transformers/models/sam2/mask_decoder.py +208 -0
  240. onnxruntime/transformers/models/sam2/nvtx_helper.py +33 -0
  241. onnxruntime/transformers/models/sam2/prompt_encoder.py +189 -0
  242. onnxruntime/transformers/models/sam2/sam2_demo.py +322 -0
  243. onnxruntime/transformers/models/sam2/sam2_image_onnx_predictor.py +280 -0
  244. onnxruntime/transformers/models/sam2/sam2_utils.py +147 -0
  245. onnxruntime/transformers/models/stable_diffusion/__init__.py +12 -0
  246. onnxruntime/transformers/models/stable_diffusion/benchmark.py +1429 -0
  247. onnxruntime/transformers/models/stable_diffusion/benchmark_controlnet.py +426 -0
  248. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +102 -0
  249. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +268 -0
  250. onnxruntime/transformers/models/stable_diffusion/demo_utils.py +778 -0
  251. onnxruntime/transformers/models/stable_diffusion/diffusion_models.py +1319 -0
  252. onnxruntime/transformers/models/stable_diffusion/diffusion_schedulers.py +1181 -0
  253. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +296 -0
  254. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +388 -0
  255. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_trt.py +288 -0
  256. onnxruntime/transformers/models/stable_diffusion/engine_builder_tensorrt.py +395 -0
  257. onnxruntime/transformers/models/stable_diffusion/engine_builder_torch.py +108 -0
  258. onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +350 -0
  259. onnxruntime/transformers/models/stable_diffusion/ort_optimizer.py +136 -0
  260. onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +831 -0
  261. onnxruntime/transformers/models/stable_diffusion/trt_utilities.py +12 -0
  262. onnxruntime/transformers/models/t5/__init__.py +12 -0
  263. onnxruntime/transformers/models/t5/convert_to_onnx.py +278 -0
  264. onnxruntime/transformers/models/t5/past_helper.py +150 -0
  265. onnxruntime/transformers/models/t5/t5_decoder.py +438 -0
  266. onnxruntime/transformers/models/t5/t5_encoder.py +171 -0
  267. onnxruntime/transformers/models/t5/t5_encoder_decoder_init.py +299 -0
  268. onnxruntime/transformers/models/t5/t5_helper.py +272 -0
  269. onnxruntime/transformers/models/whisper/__init__.py +12 -0
  270. onnxruntime/transformers/models/whisper/benchmark.py +610 -0
  271. onnxruntime/transformers/models/whisper/benchmark_all.py +528 -0
  272. onnxruntime/transformers/models/whisper/convert_to_onnx.py +536 -0
  273. onnxruntime/transformers/models/whisper/whisper_chain.py +329 -0
  274. onnxruntime/transformers/models/whisper/whisper_decoder.py +402 -0
  275. onnxruntime/transformers/models/whisper/whisper_encoder.py +164 -0
  276. onnxruntime/transformers/models/whisper/whisper_encoder_decoder_init.py +306 -0
  277. onnxruntime/transformers/models/whisper/whisper_helper.py +524 -0
  278. onnxruntime/transformers/models/whisper/whisper_openai_helper.py +84 -0
  279. onnxruntime/transformers/onnx_exporter.py +717 -0
  280. onnxruntime/transformers/onnx_model.py +1569 -0
  281. onnxruntime/transformers/onnx_model_bart.py +142 -0
  282. onnxruntime/transformers/onnx_model_bert.py +481 -0
  283. onnxruntime/transformers/onnx_model_bert_keras.py +475 -0
  284. onnxruntime/transformers/onnx_model_bert_tf.py +589 -0
  285. onnxruntime/transformers/onnx_model_clip.py +40 -0
  286. onnxruntime/transformers/onnx_model_conformer.py +33 -0
  287. onnxruntime/transformers/onnx_model_gpt2.py +101 -0
  288. onnxruntime/transformers/onnx_model_phi.py +930 -0
  289. onnxruntime/transformers/onnx_model_sam2.py +138 -0
  290. onnxruntime/transformers/onnx_model_t5.py +791 -0
  291. onnxruntime/transformers/onnx_model_tnlr.py +227 -0
  292. onnxruntime/transformers/onnx_model_unet.py +259 -0
  293. onnxruntime/transformers/onnx_model_vae.py +43 -0
  294. onnxruntime/transformers/onnx_utils.py +55 -0
  295. onnxruntime/transformers/optimizer.py +612 -0
  296. onnxruntime/transformers/profiler.py +725 -0
  297. onnxruntime/transformers/quantize_helper.py +76 -0
  298. onnxruntime/transformers/shape_infer_helper.py +122 -0
  299. onnxruntime/transformers/shape_optimizer.py +401 -0
  300. onnxruntime/transformers/torch_onnx_export_helper.py +74 -0
  301. onnxruntime_directml-1.20.0.dist-info/METADATA +187 -0
  302. onnxruntime_directml-1.20.0.dist-info/RECORD +305 -0
  303. onnxruntime_directml-1.20.0.dist-info/WHEEL +5 -0
  304. onnxruntime_directml-1.20.0.dist-info/entry_points.txt +2 -0
  305. onnxruntime_directml-1.20.0.dist-info/top_level.txt +1 -0
onnxruntime/transformers/models/stable_diffusion/benchmark.py
@@ -0,0 +1,1429 @@
1
+ # -------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License.
4
+ # --------------------------------------------------------------------------
5
+
6
+ import argparse
7
+ import csv
8
+ import os
9
+ import statistics
10
+ import sys
11
+ import time
12
+
13
+ import __init__ # noqa: F401. Workaround to allow running this script directly
14
+ import coloredlogs
15
+
16
+ # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package.
17
+ import torch
18
+ from benchmark_helper import measure_memory
19
+
20
+ SD_MODELS = {
21
+ "1.5": "runwayml/stable-diffusion-v1-5",
22
+ "2.0": "stabilityai/stable-diffusion-2",
23
+ "2.1": "stabilityai/stable-diffusion-2-1",
24
+ "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
25
+ }
26
+
27
+ PROVIDERS = {
28
+ "cuda": "CUDAExecutionProvider",
29
+ "rocm": "ROCMExecutionProvider",
30
+ "migraphx": "MIGraphXExecutionProvider",
31
+ "tensorrt": "TensorrtExecutionProvider",
32
+ }
33
+
34
+
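These two tables map the short names accepted on the command line to a Hugging Face model id and an ONNX Runtime execution provider name. As a minimal sketch of the lookup (the variable names below are illustrative; the real wiring happens in main() further down):

    model_name = SD_MODELS["1.5"]   # "runwayml/stable-diffusion-v1-5"
    provider = PROVIDERS["cuda"]    # "CUDAExecutionProvider"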
35
+ def example_prompts():
36
+ prompts = [
37
+ "a photo of an astronaut riding a horse on mars",
38
+ "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
39
+ "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting",
40
+ "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery",
41
+ "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product",
42
+ "background texture of stones, masterpiece, artistic, stunning photo, award winner photo",
43
+ "new international organic style house, tropical surroundings, architecture, 8k, hdr",
44
+ "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
45
+ "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
46
+ "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k",
47
+ ]
48
+
49
+ negative_prompt = "bad composition, ugly, abnormal, malformed"
50
+
51
+ return prompts, negative_prompt
52
+
53
+
54
+ def measure_gpu_memory(monitor_type, func, start_memory=None):
55
+ return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
56
+
57
+
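measure_memory itself is defined in benchmark_helper.py, a separate file in this package that is not part of this hunk. Assuming it runs the given callable and reports peak GPU memory in MB, a minimal torch-only stand-in (illustrative only, not the actual helper) could look like this:

    # Illustrative stand-in for the memory probe; the real measure_memory may poll the GPU
    # (e.g. via NVML/nvidia-smi) and therefore also capture memory allocated outside PyTorch.
    import torch

    def measure_gpu_memory_torch(func):
        torch.cuda.reset_peak_memory_stats()
        func()
        torch.cuda.synchronize()
        # Peak memory allocated by this process through PyTorch, in MB.
        return torch.cuda.max_memory_allocated() / (1024 * 1024)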
58
+ def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_checker: bool):
59
+ from diffusers import DDIMScheduler, OnnxStableDiffusionPipeline
60
+
61
+ import onnxruntime
62
+
63
+ if directory is not None:
64
+ assert os.path.exists(directory)
65
+ session_options = onnxruntime.SessionOptions()
66
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
67
+ directory,
68
+ provider=provider,
69
+ sess_options=session_options,
70
+ )
71
+ else:
72
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
73
+ model_name,
74
+ revision="onnx",
75
+ provider=provider,
76
+ use_auth_token=True,
77
+ )
78
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
79
+ pipe.set_progress_bar_config(disable=True)
80
+
81
+ if disable_safety_checker:
82
+ pipe.safety_checker = None
83
+ pipe.feature_extractor = None
84
+
85
+ return pipe
86
+
87
+
88
+ def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool):
89
+ from diffusers import DDIMScheduler, StableDiffusionPipeline
90
+ from torch import channels_last, float16
91
+
92
+ pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=float16).to("cuda")
93
+
94
+ pipe.unet.to(memory_format=channels_last) # in-place operation
95
+
96
+ if use_xformers:
97
+ pipe.enable_xformers_memory_efficient_attention()
98
+
99
+ if enable_torch_compile:
100
+ pipe.unet = torch.compile(pipe.unet)
101
+ pipe.vae = torch.compile(pipe.vae)
102
+ pipe.text_encoder = torch.compile(pipe.text_encoder)
103
+ print("Torch compiled unet, vae and text_encoder")
104
+
105
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
106
+ pipe.set_progress_bar_config(disable=True)
107
+
108
+ if disable_safety_checker:
109
+ pipe.safety_checker = None
110
+ pipe.feature_extractor = None
111
+
112
+ return pipe
113
+
114
+
115
+ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, disable_safety_checker: bool):
116
+ short_model_name = model_name.split("/")[-1].replace("stable-diffusion-", "sd")
117
+ return f"{engine}_{short_model_name}_b{batch_size}" + ("" if disable_safety_checker else "_safe")
118
+
119
+
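A quick worked example of the naming scheme, using a model id from the SD_MODELS table above:

    # "stable-diffusion-v1-5" -> "sdv1-5"; batch size 1; safety checker disabled -> no "_safe" suffix
    get_image_filename_prefix("ort", "runwayml/stable-diffusion-v1-5", 1, True)   # "ort_sdv1-5_b1"
    get_image_filename_prefix("ort", "runwayml/stable-diffusion-v1-5", 1, False)  # "ort_sdv1-5_b1_safe"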
120
+ def run_ort_pipeline(
121
+ pipe,
122
+ batch_size: int,
123
+ image_filename_prefix: str,
124
+ height,
125
+ width,
126
+ steps,
127
+ num_prompts,
128
+ batch_count,
129
+ start_memory,
130
+ memory_monitor_type,
131
+ ):
132
+ from diffusers import OnnxStableDiffusionPipeline
133
+
134
+ assert isinstance(pipe, OnnxStableDiffusionPipeline)
135
+
136
+ prompts, negative_prompt = example_prompts()
137
+
138
+ def warmup():
139
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
140
+
141
+ # Run warm up, and measure GPU memory of two runs
142
+ # The first run has cuDNN/MIOpen algo search, so it might need more memory.
143
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
144
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
145
+
146
+ warmup()
147
+
148
+ latency_list = []
149
+ for i, prompt in enumerate(prompts):
150
+ if i >= num_prompts:
151
+ break
152
+ for j in range(batch_count):
153
+ inference_start = time.time()
154
+ images = pipe(
155
+ [prompt] * batch_size,
156
+ height,
157
+ width,
158
+ num_inference_steps=steps,
159
+ negative_prompt=[negative_prompt] * batch_size,
160
+ guidance_scale=7.5,
161
+ ).images
162
+ inference_end = time.time()
163
+ latency = inference_end - inference_start
164
+ latency_list.append(latency)
165
+ print(f"Inference took {latency:.3f} seconds")
166
+ for k, image in enumerate(images):
167
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
168
+
169
+ from onnxruntime import __version__ as ort_version
170
+
171
+ return {
172
+ "engine": "onnxruntime",
173
+ "version": ort_version,
174
+ "height": height,
175
+ "width": width,
176
+ "steps": steps,
177
+ "batch_size": batch_size,
178
+ "batch_count": batch_count,
179
+ "num_prompts": num_prompts,
180
+ "average_latency": sum(latency_list) / len(latency_list),
181
+ "median_latency": statistics.median(latency_list),
182
+ "first_run_memory_MB": first_run_memory,
183
+ "second_run_memory_MB": second_run_memory,
184
+ }
185
+
186
+
187
+ def run_torch_pipeline(
188
+ pipe,
189
+ batch_size: int,
190
+ image_filename_prefix: str,
191
+ height,
192
+ width,
193
+ steps,
194
+ num_prompts,
195
+ batch_count,
196
+ start_memory,
197
+ memory_monitor_type,
198
+ ):
199
+ prompts, negative_prompt = example_prompts()
200
+
201
+ # total 2 runs of warm up, and measure GPU memory for CUDA EP
202
+ def warmup():
203
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
204
+
205
+ # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
206
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
207
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
208
+
209
+ warmup()
210
+
211
+ torch.set_grad_enabled(False)
212
+
213
+ latency_list = []
214
+ for i, prompt in enumerate(prompts):
215
+ if i >= num_prompts:
216
+ break
217
+ torch.cuda.synchronize()
218
+ for j in range(batch_count):
219
+ inference_start = time.time()
220
+ images = pipe(
221
+ prompt=[prompt] * batch_size,
222
+ height=height,
223
+ width=width,
224
+ num_inference_steps=steps,
225
+ guidance_scale=7.5,
226
+ negative_prompt=[negative_prompt] * batch_size,
227
+ generator=None, # torch.Generator
228
+ ).images
229
+
230
+ torch.cuda.synchronize()
231
+ inference_end = time.time()
232
+ latency = inference_end - inference_start
233
+ latency_list.append(latency)
234
+ print(f"Inference took {latency:.3f} seconds")
235
+ for k, image in enumerate(images):
236
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
237
+
238
+ return {
239
+ "engine": "torch",
240
+ "version": torch.__version__,
241
+ "height": height,
242
+ "width": width,
243
+ "steps": steps,
244
+ "batch_size": batch_size,
245
+ "batch_count": batch_count,
246
+ "num_prompts": num_prompts,
247
+ "average_latency": sum(latency_list) / len(latency_list),
248
+ "median_latency": statistics.median(latency_list),
249
+ "first_run_memory_MB": first_run_memory,
250
+ "second_run_memory_MB": second_run_memory,
251
+ }
252
+
253
+
254
+ def run_ort(
255
+ model_name: str,
256
+ directory: str,
257
+ provider: str,
258
+ batch_size: int,
259
+ disable_safety_checker: bool,
260
+ height: int,
261
+ width: int,
262
+ steps: int,
263
+ num_prompts: int,
264
+ batch_count: int,
265
+ start_memory,
266
+ memory_monitor_type,
267
+ tuning: bool,
268
+ ):
269
+ provider_and_options = provider
270
+ if tuning and provider in ["CUDAExecutionProvider", "ROCMExecutionProvider"]:
271
+ provider_and_options = (provider, {"tunable_op_enable": 1, "tunable_op_tuning_enable": 1})
272
+
273
+ load_start = time.time()
274
+ pipe = get_ort_pipeline(model_name, directory, provider_and_options, disable_safety_checker)
275
+ load_end = time.time()
276
+ print(f"Model loading took {load_end - load_start} seconds")
277
+
278
+ image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker)
279
+ result = run_ort_pipeline(
280
+ pipe,
281
+ batch_size,
282
+ image_filename_prefix,
283
+ height,
284
+ width,
285
+ steps,
286
+ num_prompts,
287
+ batch_count,
288
+ start_memory,
289
+ memory_monitor_type,
290
+ )
291
+
292
+ result.update(
293
+ {
294
+ "model_name": model_name,
295
+ "directory": directory,
296
+ "provider": provider.replace("ExecutionProvider", ""),
297
+ "disable_safety_checker": disable_safety_checker,
298
+ "enable_cuda_graph": False,
299
+ }
300
+ )
301
+ return result
302
+
303
+
304
+ def get_optimum_ort_pipeline(
305
+ model_name: str,
306
+ directory: str,
307
+ provider="CUDAExecutionProvider",
308
+ disable_safety_checker: bool = True,
309
+ ):
310
+ from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
311
+
312
+ if directory is not None and os.path.exists(directory):
313
+ if "xl" in model_name:
314
+ pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
315
+ directory,
316
+ provider=provider,
317
+ session_options=None,
318
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
319
+ )
320
+ else:
321
+ pipeline = ORTStableDiffusionPipeline.from_pretrained(
322
+ directory,
323
+ provider=provider,
324
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
325
+ )
326
+ elif "xl" in model_name:
327
+ pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
328
+ model_name,
329
+ export=True,
330
+ provider=provider,
331
+ session_options=None,
332
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
333
+ )
334
+ pipeline.save_pretrained(directory)
335
+ else:
336
+ pipeline = ORTStableDiffusionPipeline.from_pretrained(
337
+ model_name,
338
+ export=True,
339
+ provider=provider,
340
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
341
+ )
342
+ pipeline.save_pretrained(directory)
343
+
344
+ if disable_safety_checker:
345
+ pipeline.safety_checker = None
346
+ pipeline.feature_extractor = None
347
+
348
+ return pipeline
349
+
350
+
351
+ def run_optimum_ort_pipeline(
352
+ pipe,
353
+ batch_size: int,
354
+ image_filename_prefix: str,
355
+ height,
356
+ width,
357
+ steps,
358
+ num_prompts,
359
+ batch_count,
360
+ start_memory,
361
+ memory_monitor_type,
362
+ ):
363
+ from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
364
+
365
+ assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
366
+
367
+ prompts, _ = example_prompts() # example_prompts() also returns a negative prompt, which is unused in this path
368
+
369
+ def warmup():
370
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
371
+
372
+ # Run warm up, and measure GPU memory of two runs.
373
+ # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
374
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
375
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
376
+
377
+ warmup()
378
+
379
+ latency_list = []
380
+ for i, prompt in enumerate(prompts):
381
+ if i >= num_prompts:
382
+ break
383
+ for j in range(batch_count):
384
+ inference_start = time.time()
385
+ images = pipe(
386
+ prompt,
387
+ height,
388
+ width,
389
+ num_inference_steps=steps,
390
+ negative_prompt=None,
391
+ guidance_scale=0.0, # 7.5
392
+ num_images_per_prompt=batch_size,
393
+ ).images
394
+ inference_end = time.time()
395
+ latency = inference_end - inference_start
396
+ latency_list.append(latency)
397
+ print(f"Inference took {latency:.3f} seconds")
398
+ for k, image in enumerate(images):
399
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
400
+
401
+ from onnxruntime import __version__ as ort_version
402
+
403
+ return {
404
+ "engine": "optimum_ort",
405
+ "version": ort_version,
406
+ "height": height,
407
+ "width": width,
408
+ "steps": steps,
409
+ "batch_size": batch_size,
410
+ "batch_count": batch_count,
411
+ "num_prompts": num_prompts,
412
+ "average_latency": sum(latency_list) / len(latency_list),
413
+ "median_latency": statistics.median(latency_list),
414
+ "first_run_memory_MB": first_run_memory,
415
+ "second_run_memory_MB": second_run_memory,
416
+ }
417
+
418
+
419
+ def run_optimum_ort(
420
+ model_name: str,
421
+ directory: str,
422
+ provider: str,
423
+ batch_size: int,
424
+ disable_safety_checker: bool,
425
+ height: int,
426
+ width: int,
427
+ steps: int,
428
+ num_prompts: int,
429
+ batch_count: int,
430
+ start_memory,
431
+ memory_monitor_type,
432
+ ):
433
+ load_start = time.time()
434
+ pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
435
+ load_end = time.time()
436
+ print(f"Model loading took {load_end - load_start} seconds")
437
+
438
+ image_filename_prefix = get_image_filename_prefix("optimum", model_name, batch_size, disable_safety_checker)
439
+ result = run_optimum_ort_pipeline(
440
+ pipe,
441
+ batch_size,
442
+ image_filename_prefix,
443
+ height,
444
+ width,
445
+ steps,
446
+ num_prompts,
447
+ batch_count,
448
+ start_memory,
449
+ memory_monitor_type,
450
+ )
451
+
452
+ result.update(
453
+ {
454
+ "model_name": model_name,
455
+ "directory": directory,
456
+ "provider": provider.replace("ExecutionProvider", ""),
457
+ "disable_safety_checker": disable_safety_checker,
458
+ "enable_cuda_graph": False,
459
+ }
460
+ )
461
+ return result
462
+
463
+
464
+ def run_ort_trt_static(
465
+ work_dir: str,
466
+ version: str,
467
+ batch_size: int,
468
+ disable_safety_checker: bool,
469
+ height: int,
470
+ width: int,
471
+ steps: int,
472
+ num_prompts: int,
473
+ batch_count: int,
474
+ start_memory,
475
+ memory_monitor_type,
476
+ max_batch_size: int,
477
+ nvtx_profile: bool = False,
478
+ use_cuda_graph: bool = True,
479
+ ):
480
+ print("[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)")
481
+
482
+ # Register TensorRT plugins
483
+ from trt_utilities import init_trt_plugins
484
+
485
+ init_trt_plugins()
486
+
487
+ assert batch_size <= max_batch_size
488
+
489
+ from diffusion_models import PipelineInfo
490
+
491
+ pipeline_info = PipelineInfo(version)
492
+ short_name = pipeline_info.short_name()
493
+
494
+ from engine_builder import EngineType, get_engine_paths
495
+ from pipeline_stable_diffusion import StableDiffusionPipeline
496
+
497
+ engine_type = EngineType.ORT_TRT
498
+ onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths(work_dir, pipeline_info, engine_type)
499
+
500
+ # Initialize pipeline
501
+ pipeline = StableDiffusionPipeline(
502
+ pipeline_info,
503
+ scheduler="DDIM",
504
+ output_dir=output_dir,
505
+ verbose=False,
506
+ nvtx_profile=nvtx_profile,
507
+ max_batch_size=max_batch_size,
508
+ use_cuda_graph=use_cuda_graph,
509
+ framework_model_dir=framework_model_dir,
510
+ engine_type=engine_type,
511
+ )
512
+
513
+ # Load TensorRT engines and pytorch modules
514
+ pipeline.backend.build_engines(
515
+ engine_dir,
516
+ framework_model_dir,
517
+ onnx_dir,
518
+ 17,
519
+ opt_image_height=height,
520
+ opt_image_width=width,
521
+ opt_batch_size=batch_size,
522
+ static_batch=True,
523
+ static_image_shape=True,
524
+ max_workspace_size=0,
525
+ device_id=torch.cuda.current_device(),
526
+ )
527
+
528
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
529
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
530
+ pipeline.load_resources(height, width, batch_size)
531
+
532
+ def warmup():
533
+ pipeline.run(
534
+ ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
535
+ )
536
+
537
+ # Run warm up, and measure GPU memory of two runs
538
+ # The first run has algo search so it might need more memory
539
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
540
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
541
+
542
+ warmup()
543
+
544
+ image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, disable_safety_checker)
545
+
546
+ latency_list = []
547
+ prompts, negative_prompt = example_prompts()
548
+ for i, prompt in enumerate(prompts):
549
+ if i >= num_prompts:
550
+ break
551
+ for j in range(batch_count):
552
+ inference_start = time.time()
553
+ # Use warmup mode here since non-warmup mode will save image to disk.
554
+ images, pipeline_time = pipeline.run(
555
+ [prompt] * batch_size,
556
+ [negative_prompt] * batch_size,
557
+ height,
558
+ width,
559
+ denoising_steps=steps,
560
+ guidance=7.5,
561
+ seed=123,
562
+ )
563
+ inference_end = time.time()
564
+ latency = inference_end - inference_start
565
+ latency_list.append(latency)
566
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
567
+ for k, image in enumerate(images):
568
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
569
+
570
+ pipeline.teardown()
571
+
572
+ from tensorrt import __version__ as trt_version
573
+
574
+ from onnxruntime import __version__ as ort_version
575
+
576
+ return {
577
+ "model_name": pipeline_info.name(),
578
+ "engine": "onnxruntime",
579
+ "version": ort_version,
580
+ "provider": f"tensorrt({trt_version})",
581
+ "directory": engine_dir,
582
+ "height": height,
583
+ "width": width,
584
+ "steps": steps,
585
+ "batch_size": batch_size,
586
+ "batch_count": batch_count,
587
+ "num_prompts": num_prompts,
588
+ "average_latency": sum(latency_list) / len(latency_list),
589
+ "median_latency": statistics.median(latency_list),
590
+ "first_run_memory_MB": first_run_memory,
591
+ "second_run_memory_MB": second_run_memory,
592
+ "disable_safety_checker": disable_safety_checker,
593
+ "enable_cuda_graph": use_cuda_graph,
594
+ }
595
+
596
+
597
+ def run_tensorrt_static(
598
+ work_dir: str,
599
+ version: str,
600
+ model_name: str,
601
+ batch_size: int,
602
+ disable_safety_checker: bool,
603
+ height: int,
604
+ width: int,
605
+ steps: int,
606
+ num_prompts: int,
607
+ batch_count: int,
608
+ start_memory,
609
+ memory_monitor_type,
610
+ max_batch_size: int,
611
+ nvtx_profile: bool = False,
612
+ use_cuda_graph: bool = True,
613
+ ):
614
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
615
+
616
+ from cuda import cudart
617
+
618
+ # Register TensorRT plugins
619
+ from trt_utilities import init_trt_plugins
620
+
621
+ init_trt_plugins()
622
+
623
+ assert batch_size <= max_batch_size
624
+
625
+ from diffusion_models import PipelineInfo
626
+
627
+ pipeline_info = PipelineInfo(version)
628
+
629
+ from engine_builder import EngineType, get_engine_paths
630
+ from pipeline_stable_diffusion import StableDiffusionPipeline
631
+
632
+ engine_type = EngineType.TRT
633
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
634
+ work_dir, pipeline_info, engine_type
635
+ )
636
+
637
+ # Initialize pipeline
638
+ pipeline = StableDiffusionPipeline(
639
+ pipeline_info,
640
+ scheduler="DDIM",
641
+ output_dir=output_dir,
642
+ verbose=False,
643
+ nvtx_profile=nvtx_profile,
644
+ max_batch_size=max_batch_size,
645
+ use_cuda_graph=True,
646
+ engine_type=engine_type,
647
+ )
648
+
649
+ # Load TensorRT engines and pytorch modules
650
+ pipeline.backend.load_engines(
651
+ engine_dir=engine_dir,
652
+ framework_model_dir=framework_model_dir,
653
+ onnx_dir=onnx_dir,
654
+ onnx_opset=17,
655
+ opt_batch_size=batch_size,
656
+ opt_image_height=height,
657
+ opt_image_width=width,
658
+ static_batch=True,
659
+ static_shape=True,
660
+ enable_all_tactics=False,
661
+ timing_cache=timing_cache,
662
+ )
663
+
664
+ # activate engines
665
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
666
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
667
+ pipeline.backend.activate_engines(shared_device_memory)
668
+
669
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
670
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
671
+ pipeline.load_resources(height, width, batch_size)
672
+
673
+ def warmup():
674
+ pipeline.run(
675
+ ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
676
+ )
677
+
678
+ # Run warm up, and measure GPU memory of two runs
679
+ # The first run has algo search so it might need more memory
680
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
681
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
682
+
683
+ warmup()
684
+
685
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)
686
+
687
+ latency_list = []
688
+ prompts, negative_prompt = example_prompts()
689
+ for i, prompt in enumerate(prompts):
690
+ if i >= num_prompts:
691
+ break
692
+ for j in range(batch_count):
693
+ inference_start = time.time()
694
+ # Use warmup mode here since non-warmup mode will save image to disk.
695
+ images, pipeline_time = pipeline.run(
696
+ [prompt] * batch_size,
697
+ [negative_prompt] * batch_size,
698
+ height,
699
+ width,
700
+ denoising_steps=steps,
701
+ guidance=7.5,
702
+ seed=123,
703
+ )
704
+ inference_end = time.time()
705
+ latency = inference_end - inference_start
706
+ latency_list.append(latency)
707
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
708
+ for k, image in enumerate(images):
709
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
710
+
711
+ pipeline.teardown()
712
+
713
+ import tensorrt as trt
714
+
715
+ return {
716
+ "engine": "tensorrt",
717
+ "version": trt.__version__,
718
+ "provider": "default",
719
+ "height": height,
720
+ "width": width,
721
+ "steps": steps,
722
+ "batch_size": batch_size,
723
+ "batch_count": batch_count,
724
+ "num_prompts": num_prompts,
725
+ "average_latency": sum(latency_list) / len(latency_list),
726
+ "median_latency": statistics.median(latency_list),
727
+ "first_run_memory_MB": first_run_memory,
728
+ "second_run_memory_MB": second_run_memory,
729
+ "enable_cuda_graph": use_cuda_graph,
730
+ }
731
+
732
+
733
+ def run_tensorrt_static_xl(
734
+ work_dir: str,
735
+ version: str,
736
+ batch_size: int,
737
+ disable_safety_checker: bool,
738
+ height: int,
739
+ width: int,
740
+ steps: int,
741
+ num_prompts: int,
742
+ batch_count: int,
743
+ start_memory,
744
+ memory_monitor_type,
745
+ max_batch_size: int,
746
+ nvtx_profile: bool = False,
747
+ use_cuda_graph=True,
748
+ ):
749
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
750
+
751
+ import tensorrt as trt
752
+ from cuda import cudart
753
+ from trt_utilities import init_trt_plugins
754
+
755
+ # Validate image dimensions
756
+ image_height = height
757
+ image_width = width
758
+ if image_height % 8 != 0 or image_width % 8 != 0:
759
+ raise ValueError(
760
+ f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}."
761
+ )
762
+
763
+ # Register TensorRT plugins
764
+ init_trt_plugins()
765
+
766
+ assert batch_size <= max_batch_size
767
+
768
+ from diffusion_models import PipelineInfo
769
+ from engine_builder import EngineType, get_engine_paths
770
+
771
+ def init_pipeline(pipeline_class, pipeline_info):
772
+ engine_type = EngineType.TRT
773
+
774
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
775
+ work_dir, pipeline_info, engine_type
776
+ )
777
+
778
+ # Initialize pipeline
779
+ pipeline = pipeline_class(
780
+ pipeline_info,
781
+ scheduler="DDIM",
782
+ output_dir=output_dir,
783
+ verbose=False,
784
+ nvtx_profile=nvtx_profile,
785
+ max_batch_size=max_batch_size,
786
+ use_cuda_graph=use_cuda_graph,
787
+ framework_model_dir=framework_model_dir,
788
+ engine_type=engine_type,
789
+ )
790
+
791
+ pipeline.backend.load_engines(
792
+ engine_dir=engine_dir,
793
+ framework_model_dir=framework_model_dir,
794
+ onnx_dir=onnx_dir,
795
+ onnx_opset=17,
796
+ opt_batch_size=batch_size,
797
+ opt_image_height=height,
798
+ opt_image_width=width,
799
+ static_batch=True,
800
+ static_shape=True,
801
+ enable_all_tactics=False,
802
+ timing_cache=timing_cache,
803
+ )
804
+ return pipeline
805
+
806
+ from pipeline_stable_diffusion import StableDiffusionPipeline
807
+
808
+ pipeline_info = PipelineInfo(version)
809
+ pipeline = init_pipeline(StableDiffusionPipeline, pipeline_info)
810
+
811
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
812
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
813
+ pipeline.backend.activate_engines(shared_device_memory)
814
+
815
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
816
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
817
+ pipeline.load_resources(image_height, image_width, batch_size)
818
+
819
+ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
820
+ return pipeline.run(
821
+ prompt,
822
+ negative_prompt,
823
+ image_height,
824
+ image_width,
825
+ denoising_steps=steps,
826
+ guidance=5.0,
827
+ seed=seed,
828
+ )
829
+
830
+ def warmup():
831
+ run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
832
+
833
+ # Run warm up, and measure GPU memory of two runs
834
+ # The first run has algo search so it might need more memory
835
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
836
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
837
+
838
+ warmup()
839
+
840
+ model_name = pipeline_info.name()
841
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)
842
+
843
+ latency_list = []
844
+ prompts, negative_prompt = example_prompts()
845
+ for i, prompt in enumerate(prompts):
846
+ if i >= num_prompts:
847
+ break
848
+ for j in range(batch_count):
849
+ inference_start = time.time()
850
+ # Use warmup mode here since non-warmup mode will save image to disk.
851
+ if nvtx_profile:
852
+ cudart.cudaProfilerStart()
853
+ images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
854
+ if nvtx_profile:
855
+ cudart.cudaProfilerStop()
856
+ inference_end = time.time()
857
+ latency = inference_end - inference_start
858
+ latency_list.append(latency)
859
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
860
+ for k, image in enumerate(images):
861
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
862
+
863
+ pipeline.teardown()
864
+
865
+ return {
866
+ "model_name": model_name,
867
+ "engine": "tensorrt",
868
+ "version": trt.__version__,
869
+ "provider": "default",
870
+ "height": height,
871
+ "width": width,
872
+ "steps": steps,
873
+ "batch_size": batch_size,
874
+ "batch_count": batch_count,
875
+ "num_prompts": num_prompts,
876
+ "average_latency": sum(latency_list) / len(latency_list),
877
+ "median_latency": statistics.median(latency_list),
878
+ "first_run_memory_MB": first_run_memory,
879
+ "second_run_memory_MB": second_run_memory,
880
+ "enable_cuda_graph": use_cuda_graph,
881
+ }
882
+
883
+
884
+ def run_ort_trt_xl(
885
+ work_dir: str,
886
+ version: str,
887
+ batch_size: int,
888
+ disable_safety_checker: bool,
889
+ height: int,
890
+ width: int,
891
+ steps: int,
892
+ num_prompts: int,
893
+ batch_count: int,
894
+ start_memory,
895
+ memory_monitor_type,
896
+ max_batch_size: int,
897
+ nvtx_profile: bool = False,
898
+ use_cuda_graph=True,
899
+ ):
900
+ from demo_utils import initialize_pipeline
901
+ from engine_builder import EngineType
902
+
903
+ pipeline = initialize_pipeline(
904
+ version=version,
905
+ engine_type=EngineType.ORT_TRT,
906
+ work_dir=work_dir,
907
+ height=height,
908
+ width=width,
909
+ use_cuda_graph=use_cuda_graph,
910
+ max_batch_size=max_batch_size,
911
+ opt_batch_size=batch_size,
912
+ )
913
+
914
+ from cuda import cudart
915
+
916
+ assert batch_size <= max_batch_size
917
+
918
+ pipeline.load_resources(height, width, batch_size)
919
+
920
+ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
921
+ return pipeline.run(
922
+ prompt,
923
+ negative_prompt,
924
+ height,
925
+ width,
926
+ denoising_steps=steps,
927
+ guidance=5.0,
928
+ seed=seed,
929
+ )
930
+
931
+ def warmup():
932
+ run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
933
+
934
+ # Run warm up, and measure GPU memory of two runs
935
+ # The first run has algo search so it might need more memory
936
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
937
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
938
+
939
+ warmup()
940
+
941
+ model_name = pipeline.pipeline_info.name()
942
+ image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker)
943
+
944
+ latency_list = []
945
+ prompts, negative_prompt = example_prompts()
946
+ for i, prompt in enumerate(prompts):
947
+ if i >= num_prompts:
948
+ break
949
+ for j in range(batch_count):
950
+ inference_start = time.time()
951
+ # Use warmup mode here since non-warmup mode will save image to disk.
952
+ if nvtx_profile:
953
+ cudart.cudaProfilerStart()
954
+ images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
955
+ if nvtx_profile:
956
+ cudart.cudaProfilerStop()
957
+ inference_end = time.time()
958
+ latency = inference_end - inference_start
959
+ latency_list.append(latency)
960
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
961
+ for k, image in enumerate(images):
962
+ filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
963
+ image.save(filename)
964
+ print("Image saved to", filename)
965
+
966
+ pipeline.teardown()
967
+
968
+ from tensorrt import __version__ as trt_version
969
+
970
+ from onnxruntime import __version__ as ort_version
971
+
972
+ return {
973
+ "model_name": model_name,
974
+ "engine": "onnxruntime",
975
+ "version": ort_version,
976
+ "provider": f"tensorrt({trt_version})",
977
+ "height": height,
978
+ "width": width,
979
+ "steps": steps,
980
+ "batch_size": batch_size,
981
+ "batch_count": batch_count,
982
+ "num_prompts": num_prompts,
983
+ "average_latency": sum(latency_list) / len(latency_list),
984
+ "median_latency": statistics.median(latency_list),
985
+ "first_run_memory_MB": first_run_memory,
986
+ "second_run_memory_MB": second_run_memory,
987
+ "enable_cuda_graph": use_cuda_graph,
988
+ }
989
+
990
+
991
+ def run_torch(
992
+ model_name: str,
993
+ batch_size: int,
994
+ disable_safety_checker: bool,
995
+ enable_torch_compile: bool,
996
+ use_xformers: bool,
997
+ height: int,
998
+ width: int,
999
+ steps: int,
1000
+ num_prompts: int,
1001
+ batch_count: int,
1002
+ start_memory,
1003
+ memory_monitor_type,
1004
+ ):
1005
+ torch.backends.cudnn.enabled = True
1006
+ torch.backends.cudnn.benchmark = True
1007
+
1008
+ torch.set_grad_enabled(False)
1009
+
1010
+ load_start = time.time()
1011
+ pipe = get_torch_pipeline(model_name, disable_safety_checker, enable_torch_compile, use_xformers)
1012
+ load_end = time.time()
1013
+ print(f"Model loading took {load_end - load_start} seconds")
1014
+
1015
+ image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker)
1016
+
1017
+ if not enable_torch_compile:
1018
+ with torch.inference_mode():
1019
+ result = run_torch_pipeline(
1020
+ pipe,
1021
+ batch_size,
1022
+ image_filename_prefix,
1023
+ height,
1024
+ width,
1025
+ steps,
1026
+ num_prompts,
1027
+ batch_count,
1028
+ start_memory,
1029
+ memory_monitor_type,
1030
+ )
1031
+ else:
1032
+ result = run_torch_pipeline(
1033
+ pipe,
1034
+ batch_size,
1035
+ image_filename_prefix,
1036
+ height,
1037
+ width,
1038
+ steps,
1039
+ num_prompts,
1040
+ batch_count,
1041
+ start_memory,
1042
+ memory_monitor_type,
1043
+ )
1044
+
1045
+ result.update(
1046
+ {
1047
+ "model_name": model_name,
1048
+ "directory": None,
1049
+ "provider": "compile" if enable_torch_compile else "xformers" if use_xformers else "default",
1050
+ "disable_safety_checker": disable_safety_checker,
1051
+ "enable_cuda_graph": False,
1052
+ }
1053
+ )
1054
+ return result
1055
+
1056
+
1057
+ def parse_arguments():
1058
+ parser = argparse.ArgumentParser()
1059
+
1060
+ parser.add_argument(
1061
+ "-e",
1062
+ "--engine",
1063
+ required=False,
1064
+ type=str,
1065
+ default="onnxruntime",
1066
+ choices=["onnxruntime", "optimum", "torch", "tensorrt"],
1067
+ help="Engines to benchmark. Default is onnxruntime.",
1068
+ )
1069
+
1070
+ parser.add_argument(
1071
+ "-r",
1072
+ "--provider",
1073
+ required=False,
1074
+ type=str,
1075
+ default="cuda",
1076
+ choices=list(PROVIDERS.keys()),
1077
+ help="Execution provider to benchmark. Default is cuda (CUDAExecutionProvider).",
1078
+ )
1079
+
1080
+ parser.add_argument(
1081
+ "-t",
1082
+ "--tuning",
1083
+ action="store_true",
1084
+ help="Enable TunableOp and tuning. "
1085
+ "This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.",
1086
+ )
1087
+
1088
+ parser.add_argument(
1089
+ "-v",
1090
+ "--version",
1091
+ required=False,
1092
+ type=str,
1093
+ choices=list(SD_MODELS.keys()),
1094
+ default="1.5",
1095
+ help="Stable Diffusion version: 1.5, 2.0, 2.1 or xl-1.0. Default is 1.5.",
1096
+ )
1097
+
1098
+ parser.add_argument(
1099
+ "-p",
1100
+ "--pipeline",
1101
+ required=False,
1102
+ type=str,
1103
+ default=None,
1104
+ help="Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.",
1105
+ )
1106
+
1107
+ parser.add_argument(
1108
+ "-w",
1109
+ "--work_dir",
1110
+ required=False,
1111
+ type=str,
1112
+ default=".",
1113
+ help="Root directory to save exported onnx models, built engines etc.",
1114
+ )
1115
+
1116
+ parser.add_argument(
1117
+ "--enable_safety_checker",
1118
+ required=False,
1119
+ action="store_true",
1120
+ help="Enable safety checker",
1121
+ )
1122
+ parser.set_defaults(enable_safety_checker=False)
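+ # Note: action="store_true" already defaults to False, so the set_defaults calls for these boolean flags are redundant but harmless.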
1123
+
1124
+ parser.add_argument(
1125
+ "--enable_torch_compile",
1126
+ required=False,
1127
+ action="store_true",
1128
+ help="Enable compile unet for PyTorch 2.0",
1129
+ )
1130
+ parser.set_defaults(enable_torch_compile=False)
1131
+
1132
+ parser.add_argument(
1133
+ "--use_xformers",
1134
+ required=False,
1135
+ action="store_true",
1136
+ help="Use xformers for PyTorch",
1137
+ )
1138
+ parser.set_defaults(use_xformers=False)
1139
+
1140
+ parser.add_argument(
1141
+ "-b",
1142
+ "--batch_size",
1143
+ type=int,
1144
+ default=1,
1145
+ choices=[1, 2, 3, 4, 8, 10, 16, 32],
1146
+ help="Number of images per batch. Default is 1.",
1147
+ )
1148
+
1149
+ parser.add_argument(
1150
+ "--height",
1151
+ required=False,
1152
+ type=int,
1153
+ default=512,
1154
+ help="Output image height. Default is 512.",
1155
+ )
1156
+
1157
+ parser.add_argument(
1158
+ "--width",
1159
+ required=False,
1160
+ type=int,
1161
+ default=512,
1162
+ help="Output image width. Default is 512.",
1163
+ )
1164
+
1165
+ parser.add_argument(
1166
+ "-s",
1167
+ "--steps",
1168
+ required=False,
1169
+ type=int,
1170
+ default=50,
1171
+ help="Number of steps. Default is 50.",
1172
+ )
1173
+
1174
+ parser.add_argument(
1175
+ "-n",
1176
+ "--num_prompts",
1177
+ required=False,
1178
+ type=int,
1179
+ default=1,
1180
+ help="Number of prompts. Default is 1.",
1181
+ )
1182
+
1183
+ parser.add_argument(
1184
+ "-c",
1185
+ "--batch_count",
1186
+ required=False,
1187
+ type=int,
1188
+ choices=range(1, 11),
1189
+ default=5,
1190
+ help="Number of batches to test. Default is 5.",
1191
+ )
1192
+
1193
+ parser.add_argument(
1194
+ "-m",
1195
+ "--max_trt_batch_size",
1196
+ required=False,
1197
+ type=int,
1198
+ choices=range(1, 16),
1199
+ default=4,
1200
+ help="Maximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.",
1201
+ )
1202
+
1203
+ parser.add_argument(
1204
+ "-g",
1205
+ "--enable_cuda_graph",
1206
+ required=False,
1207
+ action="store_true",
1208
+ help="Enable Cuda Graph. Requires onnxruntime >= 1.16",
1209
+ )
1210
+ parser.set_defaults(enable_cuda_graph=False)
1211
+
1212
+ args = parser.parse_args()
1213
+
1214
+ return args
1215
+
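+ # Example invocation (script name and pipeline directory are illustrative):
+ #   python benchmark.py -e onnxruntime -r cuda -v 1.5 -p ./sd_onnx_optimized -b 1 -s 50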
1216
+
1217
+ def print_loaded_libraries(cuda_related_only=True):
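+ # Print the shared libraries mapped into the current process; by default only CUDA/NVIDIA/TensorRT related ones are shown.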
1218
+ import psutil
1219
+
1220
+ p = psutil.Process(os.getpid())
1221
+ for lib in p.memory_maps():
1222
+ if (not cuda_related_only) or any(x in lib.path for x in ("libcu", "libnv", "tensorrt")):
1223
+ print(lib.path)
1224
+
1225
+
1226
+ def main():
1227
+ args = parse_arguments()
1228
+ print(args)
1229
+
1230
+ if args.engine == "onnxruntime":
1231
+ if args.version in ["2.1"]:
1232
+ # Set a flag to avoid overflow in attention, which causes black image output in the SD 2.1 model.
1233
+ # The environment variable must be set before the first run of the Attention or MultiHeadAttention operator.
1234
+ os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"
1235
+
1236
+ from packaging import version
1237
+
1238
+ from onnxruntime import __version__ as ort_version
1239
+
1240
+ if version.parse(ort_version) == version.parse("1.16.0"):
1241
+ # ORT 1.16 has a bug that might trigger an Attention RuntimeError when the latest fusion script is applied to the CLIP model.
1242
+ # The workaround is to enable fused causal attention, or to disable Attention fusion for the CLIP model.
1243
+ os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
1244
+
1245
+ if args.enable_cuda_graph:
1246
+ if not (args.engine == "onnxruntime" and args.provider in ["cuda", "tensorrt"] and args.pipeline is None):
1247
+ raise ValueError("The stable diffusion pipeline does not support CUDA graph.")
1248
+
1249
+ if version.parse(ort_version) < version.parse("1.16"):
1250
+ raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later")
1251
+
1252
+ coloredlogs.install(fmt="%(funcName)20s: %(message)s")
1253
+
1254
+ memory_monitor_type = "rocm" if args.provider == "rocm" else "cuda"
1255
+
1256
+ start_memory = measure_gpu_memory(memory_monitor_type, None)
1257
+ print("GPU memory used before loading models:", start_memory)
1258
+
1259
+ sd_model = SD_MODELS[args.version]
1260
+ provider = PROVIDERS[args.provider]
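+ # Dispatch to the requested engine / provider combination.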
1261
+ if args.engine == "onnxruntime" and args.provider == "tensorrt":
1262
+ if "xl" in args.version:
1263
+ print("Testing Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.")
1264
+ result = run_ort_trt_xl(
1265
+ work_dir=args.work_dir,
1266
+ version=args.version,
1267
+ batch_size=args.batch_size,
1268
+ disable_safety_checker=True,
1269
+ height=args.height,
1270
+ width=args.width,
1271
+ steps=args.steps,
1272
+ num_prompts=args.num_prompts,
1273
+ batch_count=args.batch_count,
1274
+ start_memory=start_memory,
1275
+ memory_monitor_type=memory_monitor_type,
1276
+ max_batch_size=args.max_trt_batch_size,
1277
+ nvtx_profile=False,
1278
+ use_cuda_graph=args.enable_cuda_graph,
1279
+ )
1280
+ else:
1281
+ print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.")
1282
+ result = run_ort_trt_static(
1283
+ work_dir=args.work_dir,
1284
+ version=args.version,
1285
+ batch_size=args.batch_size,
1286
+ disable_safety_checker=not args.enable_safety_checker,
1287
+ height=args.height,
1288
+ width=args.width,
1289
+ steps=args.steps,
1290
+ num_prompts=args.num_prompts,
1291
+ batch_count=args.batch_count,
1292
+ start_memory=start_memory,
1293
+ memory_monitor_type=memory_monitor_type,
1294
+ max_batch_size=args.max_trt_batch_size,
1295
+ nvtx_profile=False,
1296
+ use_cuda_graph=args.enable_cuda_graph,
1297
+ )
1298
+ elif args.engine == "optimum" and provider == "CUDAExecutionProvider":
1299
+ if "xl" in args.version:
1300
+ os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
1301
+
1302
+ result = run_optimum_ort(
1303
+ model_name=sd_model,
1304
+ directory=args.pipeline,
1305
+ provider=provider,
1306
+ batch_size=args.batch_size,
1307
+ disable_safety_checker=not args.enable_safety_checker,
1308
+ height=args.height,
1309
+ width=args.width,
1310
+ steps=args.steps,
1311
+ num_prompts=args.num_prompts,
1312
+ batch_count=args.batch_count,
1313
+ start_memory=start_memory,
1314
+ memory_monitor_type=memory_monitor_type,
1315
+ )
1316
+ elif args.engine == "onnxruntime":
1317
+ assert args.pipeline and os.path.isdir(
1318
+ args.pipeline
1319
+ ), "--pipeline should be specified for the directory of ONNX models"
1320
+ print(f"Testing diffusers StableDiffusionPipeline with {provider} provider and tuning={args.tuning}")
1321
+ result = run_ort(
1322
+ model_name=sd_model,
1323
+ directory=args.pipeline,
1324
+ provider=provider,
1325
+ batch_size=args.batch_size,
1326
+ disable_safety_checker=not args.enable_safety_checker,
1327
+ height=args.height,
1328
+ width=args.width,
1329
+ steps=args.steps,
1330
+ num_prompts=args.num_prompts,
1331
+ batch_count=args.batch_count,
1332
+ start_memory=start_memory,
1333
+ memory_monitor_type=memory_monitor_type,
1334
+ tuning=args.tuning,
1335
+ )
1336
+ elif args.engine == "tensorrt" and "xl" in args.version:
1337
+ print("Testing Txt2ImgXLPipeline with static input shape. Backend is TensorRT.")
1338
+ result = run_tensorrt_static_xl(
1339
+ work_dir=args.work_dir,
1340
+ version=args.version,
1341
+ batch_size=args.batch_size,
1342
+ disable_safety_checker=True,
1343
+ height=args.height,
1344
+ width=args.width,
1345
+ steps=args.steps,
1346
+ num_prompts=args.num_prompts,
1347
+ batch_count=args.batch_count,
1348
+ start_memory=start_memory,
1349
+ memory_monitor_type=memory_monitor_type,
1350
+ max_batch_size=args.max_trt_batch_size,
1351
+ nvtx_profile=False,
1352
+ use_cuda_graph=args.enable_cuda_graph,
1353
+ )
1354
+ elif args.engine == "tensorrt":
1355
+ print("Testing Txt2ImgPipeline with static input shape. Backend is TensorRT.")
1356
+ result = run_tensorrt_static(
1357
+ work_dir=args.work_dir,
1358
+ version=args.version,
1359
+ model_name=sd_model,
1360
+ batch_size=args.batch_size,
1361
+ disable_safety_checker=True,
1362
+ height=args.height,
1363
+ width=args.width,
1364
+ steps=args.steps,
1365
+ num_prompts=args.num_prompts,
1366
+ batch_count=args.batch_count,
1367
+ start_memory=start_memory,
1368
+ memory_monitor_type=memory_monitor_type,
1369
+ max_batch_size=args.max_trt_batch_size,
1370
+ nvtx_profile=False,
1371
+ use_cuda_graph=args.enable_cuda_graph,
1372
+ )
1373
+ else:
1374
+ print(
1375
+ f"Testing Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile={args.enable_torch_compile}, xformers={args.use_xformers}."
1376
+ )
1377
+ result = run_torch(
1378
+ model_name=sd_model,
1379
+ batch_size=args.batch_size,
1380
+ disable_safety_checker=not args.enable_safety_checker,
1381
+ enable_torch_compile=args.enable_torch_compile,
1382
+ use_xformers=args.use_xformers,
1383
+ height=args.height,
1384
+ width=args.width,
1385
+ steps=args.steps,
1386
+ num_prompts=args.num_prompts,
1387
+ batch_count=args.batch_count,
1388
+ start_memory=start_memory,
1389
+ memory_monitor_type=memory_monitor_type,
1390
+ )
1391
+
1392
+ print(result)
1393
+
1394
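+ # Results are appended to benchmark_result.csv; a header row is written on every run, so the file will contain repeated headers across invocations.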
+ with open("benchmark_result.csv", mode="a", newline="") as csv_file:
1395
+ column_names = [
1396
+ "model_name",
1397
+ "directory",
1398
+ "engine",
1399
+ "version",
1400
+ "provider",
1401
+ "disable_safety_checker",
1402
+ "height",
1403
+ "width",
1404
+ "steps",
1405
+ "batch_size",
1406
+ "batch_count",
1407
+ "num_prompts",
1408
+ "average_latency",
1409
+ "median_latency",
1410
+ "first_run_memory_MB",
1411
+ "second_run_memory_MB",
1412
+ "enable_cuda_graph",
1413
+ ]
1414
+ csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
1415
+ csv_writer.writeheader()
1416
+ csv_writer.writerow(result)
1417
+
1418
+ # Show loaded DLLs when steps == 1 for debugging purposes.
1419
+ if args.steps == 1:
1420
+ print_loaded_libraries(args.provider in ["cuda", "tensorrt"])
1421
+
1422
+
1423
+ if __name__ == "__main__":
1424
+ import traceback
1425
+
1426
+ try:
1427
+ main()
1428
+ except Exception:
1429
+ traceback.print_exception(*sys.exc_info())