onnxruntime-directml 1.24.1__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
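As a quick orientation to what the files listed below provide at runtime, here is a minimal sketch (not part of the diff) of loading an ONNX model with this wheel's DirectML execution provider. "model.onnx" is a placeholder path, and the float32 dummy input is an assumption about the model's first input; the provider name "DmlExecutionProvider" is the one registered by onnxruntime-directml builds.

import numpy as np
import onnxruntime as ort

# The DirectML build registers "DmlExecutionProvider"; CPU is listed as a fallback.
print(ort.get_available_providers())

session = ort.InferenceSession(
    "model.onnx",  # placeholder path to any ONNX model
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)

# Build a dummy input matching the model's first input (float32 is assumed here),
# substituting 1 for any symbolic/unknown dimensions, then run inference.
input_meta = session.get_inputs()[0]
shape = [d if isinstance(d, int) else 1 for d in input_meta.shape]
dummy = np.zeros(shape, dtype=np.float32)
outputs = session.run(None, {input_meta.name: dummy})
print([o.shape for o in outputs])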
Files changed (322)
  1. onnxruntime/LICENSE +21 -0
  2. onnxruntime/Privacy.md +21 -0
  3. onnxruntime/ThirdPartyNotices.txt +6121 -0
  4. onnxruntime/__init__.py +418 -0
  5. onnxruntime/backend/__init__.py +6 -0
  6. onnxruntime/backend/backend.py +175 -0
  7. onnxruntime/backend/backend_rep.py +52 -0
  8. onnxruntime/capi/DirectML.dll +0 -0
  9. onnxruntime/capi/__init__.py +4 -0
  10. onnxruntime/capi/_ld_preload.py +7 -0
  11. onnxruntime/capi/_pybind_state.py +33 -0
  12. onnxruntime/capi/build_and_package_info.py +2 -0
  13. onnxruntime/capi/convert_npz_to_onnx_adapter.py +48 -0
  14. onnxruntime/capi/onnxruntime.dll +0 -0
  15. onnxruntime/capi/onnxruntime_collect_build_info.py +47 -0
  16. onnxruntime/capi/onnxruntime_inference_collection.py +1440 -0
  17. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  18. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  19. onnxruntime/capi/onnxruntime_validation.py +154 -0
  20. onnxruntime/capi/version_info.py +2 -0
  21. onnxruntime/datasets/__init__.py +18 -0
  22. onnxruntime/datasets/logreg_iris.onnx +0 -0
  23. onnxruntime/datasets/mul_1.onnx +0 -0
  24. onnxruntime/datasets/sigmoid.onnx +13 -0
  25. onnxruntime/quantization/CalTableFlatBuffers/KeyValue.py +78 -0
  26. onnxruntime/quantization/CalTableFlatBuffers/TrtTable.py +90 -0
  27. onnxruntime/quantization/CalTableFlatBuffers/__init__.py +0 -0
  28. onnxruntime/quantization/__init__.py +19 -0
  29. onnxruntime/quantization/base_quantizer.py +529 -0
  30. onnxruntime/quantization/calibrate.py +1267 -0
  31. onnxruntime/quantization/execution_providers/qnn/__init__.py +2 -0
  32. onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py +132 -0
  33. onnxruntime/quantization/execution_providers/qnn/fusion_spacetodepth.py +162 -0
  34. onnxruntime/quantization/execution_providers/qnn/mixed_precision_overrides_utils.py +413 -0
  35. onnxruntime/quantization/execution_providers/qnn/preprocess.py +353 -0
  36. onnxruntime/quantization/execution_providers/qnn/quant_config.py +389 -0
  37. onnxruntime/quantization/fusions/__init__.py +4 -0
  38. onnxruntime/quantization/fusions/fusion.py +311 -0
  39. onnxruntime/quantization/fusions/fusion_gelu.py +272 -0
  40. onnxruntime/quantization/fusions/fusion_layernorm.py +146 -0
  41. onnxruntime/quantization/fusions/replace_upsample_with_resize.py +96 -0
  42. onnxruntime/quantization/matmul_bnb4_quantizer.py +239 -0
  43. onnxruntime/quantization/matmul_nbits_quantizer.py +1638 -0
  44. onnxruntime/quantization/neural_compressor/__init__.py +1 -0
  45. onnxruntime/quantization/neural_compressor/onnx_model.py +1251 -0
  46. onnxruntime/quantization/neural_compressor/util.py +80 -0
  47. onnxruntime/quantization/neural_compressor/weight_only.py +932 -0
  48. onnxruntime/quantization/onnx_model.py +600 -0
  49. onnxruntime/quantization/onnx_quantizer.py +1163 -0
  50. onnxruntime/quantization/operators/__init__.py +2 -0
  51. onnxruntime/quantization/operators/activation.py +119 -0
  52. onnxruntime/quantization/operators/argmax.py +18 -0
  53. onnxruntime/quantization/operators/attention.py +73 -0
  54. onnxruntime/quantization/operators/base_operator.py +26 -0
  55. onnxruntime/quantization/operators/binary_op.py +72 -0
  56. onnxruntime/quantization/operators/concat.py +62 -0
  57. onnxruntime/quantization/operators/conv.py +260 -0
  58. onnxruntime/quantization/operators/direct_q8.py +78 -0
  59. onnxruntime/quantization/operators/embed_layernorm.py +121 -0
  60. onnxruntime/quantization/operators/gather.py +64 -0
  61. onnxruntime/quantization/operators/gavgpool.py +62 -0
  62. onnxruntime/quantization/operators/gemm.py +172 -0
  63. onnxruntime/quantization/operators/lstm.py +121 -0
  64. onnxruntime/quantization/operators/matmul.py +231 -0
  65. onnxruntime/quantization/operators/maxpool.py +34 -0
  66. onnxruntime/quantization/operators/norm.py +40 -0
  67. onnxruntime/quantization/operators/pad.py +172 -0
  68. onnxruntime/quantization/operators/pooling.py +67 -0
  69. onnxruntime/quantization/operators/qdq_base_operator.py +22 -0
  70. onnxruntime/quantization/operators/resize.py +34 -0
  71. onnxruntime/quantization/operators/softmax.py +74 -0
  72. onnxruntime/quantization/operators/split.py +63 -0
  73. onnxruntime/quantization/operators/where.py +87 -0
  74. onnxruntime/quantization/preprocess.py +141 -0
  75. onnxruntime/quantization/qdq_loss_debug.py +389 -0
  76. onnxruntime/quantization/qdq_quantizer.py +1477 -0
  77. onnxruntime/quantization/quant_utils.py +1051 -0
  78. onnxruntime/quantization/quantize.py +953 -0
  79. onnxruntime/quantization/registry.py +110 -0
  80. onnxruntime/quantization/shape_inference.py +204 -0
  81. onnxruntime/quantization/static_quantize_runner.py +256 -0
  82. onnxruntime/quantization/tensor_quant_overrides.py +520 -0
  83. onnxruntime/tools/__init__.py +10 -0
  84. onnxruntime/tools/check_onnx_model_mobile_usability.py +47 -0
  85. onnxruntime/tools/convert_onnx_models_to_ort.py +380 -0
  86. onnxruntime/tools/file_utils.py +47 -0
  87. onnxruntime/tools/logger.py +11 -0
  88. onnxruntime/tools/make_dynamic_shape_fixed.py +73 -0
  89. onnxruntime/tools/mobile_helpers/__init__.py +0 -0
  90. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +53 -0
  91. onnxruntime/tools/mobile_helpers/coreml_supported_neuralnetwork_ops.md +43 -0
  92. onnxruntime/tools/mobile_helpers/nnapi_supported_ops.md +58 -0
  93. onnxruntime/tools/mobile_helpers/usability_checker.py +738 -0
  94. onnxruntime/tools/offline_tuning.py +169 -0
  95. onnxruntime/tools/onnx_model_utils.py +416 -0
  96. onnxruntime/tools/onnx_randomizer.py +85 -0
  97. onnxruntime/tools/onnxruntime_test.py +164 -0
  98. onnxruntime/tools/optimize_onnx_model.py +56 -0
  99. onnxruntime/tools/ort_format_model/__init__.py +27 -0
  100. onnxruntime/tools/ort_format_model/operator_type_usage_processors.py +653 -0
  101. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/__init__.py +0 -0
  102. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgType.py +7 -0
  103. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgTypeAndIndex.py +67 -0
  104. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Attribute.py +337 -0
  105. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/AttributeType.py +18 -0
  106. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Checkpoint.py +125 -0
  107. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedKernelCreateInfos.py +120 -0
  108. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedNodeIndexAndKernelDefHash.py +68 -0
  109. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSessionState.py +96 -0
  110. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSubGraphSessionState.py +72 -0
  111. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Dimension.py +71 -0
  112. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValue.py +80 -0
  113. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValueType.py +8 -0
  114. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/EdgeEnd.py +32 -0
  115. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/FloatProperty.py +67 -0
  116. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Graph.py +320 -0
  117. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/InferenceSession.py +88 -0
  118. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/IntProperty.py +67 -0
  119. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrArgsEntry.py +91 -0
  120. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrResolver.py +78 -0
  121. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/MapType.py +71 -0
  122. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Model.py +223 -0
  123. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ModuleState.py +141 -0
  124. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Node.py +317 -0
  125. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeEdge.py +126 -0
  126. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeType.py +7 -0
  127. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodesToOptimizeIndices.py +160 -0
  128. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OpIdKernelTypeStrArgsEntry.py +91 -0
  129. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OperatorSetId.py +67 -0
  130. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OptimizerGroup.py +117 -0
  131. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ParameterOptimizerState.py +91 -0
  132. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/PropertyBag.py +152 -0
  133. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecord.py +105 -0
  134. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecordContainerEntry.py +91 -0
  135. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizations.py +79 -0
  136. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SequenceType.py +58 -0
  137. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Shape.py +78 -0
  138. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SparseTensor.py +114 -0
  139. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringProperty.py +67 -0
  140. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringStringEntry.py +67 -0
  141. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Tensor.py +203 -0
  142. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorDataType.py +26 -0
  143. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorTypeAndShape.py +71 -0
  144. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfo.py +83 -0
  145. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfoValue.py +9 -0
  146. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ValueInfo.py +84 -0
  147. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/__init__.py +6 -0
  148. onnxruntime/tools/ort_format_model/ort_model_processor.py +86 -0
  149. onnxruntime/tools/ort_format_model/types.py +85 -0
  150. onnxruntime/tools/ort_format_model/utils.py +61 -0
  151. onnxruntime/tools/pytorch_export_contrib_ops.py +129 -0
  152. onnxruntime/tools/pytorch_export_helpers.py +131 -0
  153. onnxruntime/tools/qdq_helpers/__init__.py +0 -0
  154. onnxruntime/tools/qdq_helpers/optimize_qdq_model.py +37 -0
  155. onnxruntime/tools/qnn/add_trans_cast.py +292 -0
  156. onnxruntime/tools/qnn/gen_qnn_ctx_onnx_model.py +364 -0
  157. onnxruntime/tools/qnn/preprocess.py +165 -0
  158. onnxruntime/tools/reduced_build_config_parser.py +203 -0
  159. onnxruntime/tools/remove_initializer_from_input.py +37 -0
  160. onnxruntime/tools/symbolic_shape_infer.py +3094 -0
  161. onnxruntime/tools/update_onnx_opset.py +31 -0
  162. onnxruntime/transformers/__init__.py +8 -0
  163. onnxruntime/transformers/affinity_helper.py +40 -0
  164. onnxruntime/transformers/benchmark.py +942 -0
  165. onnxruntime/transformers/benchmark_helper.py +643 -0
  166. onnxruntime/transformers/bert_perf_test.py +629 -0
  167. onnxruntime/transformers/bert_test_data.py +641 -0
  168. onnxruntime/transformers/compare_bert_results.py +256 -0
  169. onnxruntime/transformers/constants.py +47 -0
  170. onnxruntime/transformers/convert_generation.py +3605 -0
  171. onnxruntime/transformers/convert_tf_models_to_pytorch.py +205 -0
  172. onnxruntime/transformers/convert_to_packing_mode.py +385 -0
  173. onnxruntime/transformers/dynamo_onnx_helper.py +205 -0
  174. onnxruntime/transformers/float16.py +501 -0
  175. onnxruntime/transformers/fusion_attention.py +1189 -0
  176. onnxruntime/transformers/fusion_attention_clip.py +340 -0
  177. onnxruntime/transformers/fusion_attention_sam2.py +533 -0
  178. onnxruntime/transformers/fusion_attention_unet.py +1307 -0
  179. onnxruntime/transformers/fusion_attention_vae.py +300 -0
  180. onnxruntime/transformers/fusion_bart_attention.py +435 -0
  181. onnxruntime/transformers/fusion_base.py +141 -0
  182. onnxruntime/transformers/fusion_bias_add.py +57 -0
  183. onnxruntime/transformers/fusion_biasgelu.py +66 -0
  184. onnxruntime/transformers/fusion_biassplitgelu.py +110 -0
  185. onnxruntime/transformers/fusion_conformer_attention.py +222 -0
  186. onnxruntime/transformers/fusion_constant_fold.py +144 -0
  187. onnxruntime/transformers/fusion_embedlayer.py +810 -0
  188. onnxruntime/transformers/fusion_fastgelu.py +492 -0
  189. onnxruntime/transformers/fusion_gelu.py +258 -0
  190. onnxruntime/transformers/fusion_gelu_approximation.py +25 -0
  191. onnxruntime/transformers/fusion_gemmfastgelu.py +121 -0
  192. onnxruntime/transformers/fusion_gpt_attention.py +546 -0
  193. onnxruntime/transformers/fusion_gpt_attention_megatron.py +355 -0
  194. onnxruntime/transformers/fusion_gpt_attention_no_past.py +260 -0
  195. onnxruntime/transformers/fusion_group_norm.py +180 -0
  196. onnxruntime/transformers/fusion_layernorm.py +489 -0
  197. onnxruntime/transformers/fusion_mha_mmdit.py +667 -0
  198. onnxruntime/transformers/fusion_nhwc_conv.py +99 -0
  199. onnxruntime/transformers/fusion_options.py +340 -0
  200. onnxruntime/transformers/fusion_qordered_attention.py +420 -0
  201. onnxruntime/transformers/fusion_qordered_gelu.py +118 -0
  202. onnxruntime/transformers/fusion_qordered_layernorm.py +122 -0
  203. onnxruntime/transformers/fusion_qordered_matmul.py +216 -0
  204. onnxruntime/transformers/fusion_quickgelu.py +74 -0
  205. onnxruntime/transformers/fusion_reshape.py +173 -0
  206. onnxruntime/transformers/fusion_rotary_attention.py +1591 -0
  207. onnxruntime/transformers/fusion_shape.py +109 -0
  208. onnxruntime/transformers/fusion_simplified_layernorm.py +165 -0
  209. onnxruntime/transformers/fusion_skip_group_norm.py +254 -0
  210. onnxruntime/transformers/fusion_skiplayernorm.py +209 -0
  211. onnxruntime/transformers/fusion_transpose.py +167 -0
  212. onnxruntime/transformers/fusion_utils.py +321 -0
  213. onnxruntime/transformers/huggingface_models.py +74 -0
  214. onnxruntime/transformers/import_utils.py +20 -0
  215. onnxruntime/transformers/io_binding_helper.py +487 -0
  216. onnxruntime/transformers/large_model_exporter.py +395 -0
  217. onnxruntime/transformers/machine_info.py +230 -0
  218. onnxruntime/transformers/metrics.py +163 -0
  219. onnxruntime/transformers/models/bart/__init__.py +12 -0
  220. onnxruntime/transformers/models/bart/export.py +98 -0
  221. onnxruntime/transformers/models/bert/__init__.py +12 -0
  222. onnxruntime/transformers/models/bert/eval_squad.py +329 -0
  223. onnxruntime/transformers/models/gpt2/__init__.py +12 -0
  224. onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +413 -0
  225. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +566 -0
  226. onnxruntime/transformers/models/gpt2/gpt2_helper.py +1031 -0
  227. onnxruntime/transformers/models/gpt2/gpt2_parity.py +513 -0
  228. onnxruntime/transformers/models/gpt2/gpt2_tester.py +501 -0
  229. onnxruntime/transformers/models/gpt2/parity_check_helper.py +146 -0
  230. onnxruntime/transformers/models/llama/__init__.py +12 -0
  231. onnxruntime/transformers/models/llama/benchmark.py +700 -0
  232. onnxruntime/transformers/models/llama/benchmark_all.py +488 -0
  233. onnxruntime/transformers/models/llama/benchmark_e2e.py +608 -0
  234. onnxruntime/transformers/models/llama/convert_to_onnx.py +1064 -0
  235. onnxruntime/transformers/models/llama/dist_settings.py +57 -0
  236. onnxruntime/transformers/models/llama/llama_inputs.py +504 -0
  237. onnxruntime/transformers/models/llama/llama_parity.py +343 -0
  238. onnxruntime/transformers/models/llama/llama_torch.py +47 -0
  239. onnxruntime/transformers/models/llama/quant_kv_dataloader.py +108 -0
  240. onnxruntime/transformers/models/longformer/__init__.py +12 -0
  241. onnxruntime/transformers/models/longformer/benchmark_longformer.py +821 -0
  242. onnxruntime/transformers/models/longformer/convert_to_onnx.py +413 -0
  243. onnxruntime/transformers/models/longformer/generate_test_data.py +347 -0
  244. onnxruntime/transformers/models/longformer/longformer_helper.py +76 -0
  245. onnxruntime/transformers/models/phi2/__init__.py +12 -0
  246. onnxruntime/transformers/models/phi2/convert_to_onnx.py +590 -0
  247. onnxruntime/transformers/models/phi2/inference_example.py +414 -0
  248. onnxruntime/transformers/models/sam2/__init__.py +12 -0
  249. onnxruntime/transformers/models/sam2/benchmark_sam2.py +638 -0
  250. onnxruntime/transformers/models/sam2/convert_to_onnx.py +270 -0
  251. onnxruntime/transformers/models/sam2/image_decoder.py +272 -0
  252. onnxruntime/transformers/models/sam2/image_encoder.py +236 -0
  253. onnxruntime/transformers/models/sam2/mask_decoder.py +208 -0
  254. onnxruntime/transformers/models/sam2/nvtx_helper.py +33 -0
  255. onnxruntime/transformers/models/sam2/prompt_encoder.py +189 -0
  256. onnxruntime/transformers/models/sam2/sam2_demo.py +321 -0
  257. onnxruntime/transformers/models/sam2/sam2_image_onnx_predictor.py +279 -0
  258. onnxruntime/transformers/models/sam2/sam2_utils.py +147 -0
  259. onnxruntime/transformers/models/stable_diffusion/__init__.py +12 -0
  260. onnxruntime/transformers/models/stable_diffusion/benchmark.py +1519 -0
  261. onnxruntime/transformers/models/stable_diffusion/benchmark_controlnet.py +426 -0
  262. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +103 -0
  263. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +269 -0
  264. onnxruntime/transformers/models/stable_diffusion/demo_utils.py +778 -0
  265. onnxruntime/transformers/models/stable_diffusion/diffusion_models.py +1318 -0
  266. onnxruntime/transformers/models/stable_diffusion/diffusion_schedulers.py +1179 -0
  267. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +295 -0
  268. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +387 -0
  269. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_trt.py +288 -0
  270. onnxruntime/transformers/models/stable_diffusion/engine_builder_tensorrt.py +395 -0
  271. onnxruntime/transformers/models/stable_diffusion/engine_builder_torch.py +108 -0
  272. onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +590 -0
  273. onnxruntime/transformers/models/stable_diffusion/ort_optimizer.py +136 -0
  274. onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +831 -0
  275. onnxruntime/transformers/models/stable_diffusion/trt_utilities.py +12 -0
  276. onnxruntime/transformers/models/t5/__init__.py +12 -0
  277. onnxruntime/transformers/models/t5/convert_to_onnx.py +318 -0
  278. onnxruntime/transformers/models/t5/t5_decoder.py +437 -0
  279. onnxruntime/transformers/models/t5/t5_encoder.py +70 -0
  280. onnxruntime/transformers/models/t5/t5_encoder_decoder_init.py +361 -0
  281. onnxruntime/transformers/models/t5/t5_helper.py +302 -0
  282. onnxruntime/transformers/models/whisper/__init__.py +12 -0
  283. onnxruntime/transformers/models/whisper/benchmark.py +585 -0
  284. onnxruntime/transformers/models/whisper/benchmark_all.py +526 -0
  285. onnxruntime/transformers/models/whisper/convert_to_onnx.py +609 -0
  286. onnxruntime/transformers/models/whisper/whisper_chain.py +334 -0
  287. onnxruntime/transformers/models/whisper/whisper_decoder.py +464 -0
  288. onnxruntime/transformers/models/whisper/whisper_encoder.py +164 -0
  289. onnxruntime/transformers/models/whisper/whisper_encoder_decoder_init.py +371 -0
  290. onnxruntime/transformers/models/whisper/whisper_helper.py +1035 -0
  291. onnxruntime/transformers/models/whisper/whisper_inputs.py +380 -0
  292. onnxruntime/transformers/models/whisper/whisper_jump_times.py +477 -0
  293. onnxruntime/transformers/onnx_exporter.py +719 -0
  294. onnxruntime/transformers/onnx_model.py +1636 -0
  295. onnxruntime/transformers/onnx_model_bart.py +141 -0
  296. onnxruntime/transformers/onnx_model_bert.py +488 -0
  297. onnxruntime/transformers/onnx_model_bert_keras.py +474 -0
  298. onnxruntime/transformers/onnx_model_bert_tf.py +588 -0
  299. onnxruntime/transformers/onnx_model_clip.py +42 -0
  300. onnxruntime/transformers/onnx_model_conformer.py +32 -0
  301. onnxruntime/transformers/onnx_model_gpt2.py +101 -0
  302. onnxruntime/transformers/onnx_model_mmdit.py +112 -0
  303. onnxruntime/transformers/onnx_model_phi.py +929 -0
  304. onnxruntime/transformers/onnx_model_sam2.py +137 -0
  305. onnxruntime/transformers/onnx_model_t5.py +985 -0
  306. onnxruntime/transformers/onnx_model_tnlr.py +226 -0
  307. onnxruntime/transformers/onnx_model_unet.py +258 -0
  308. onnxruntime/transformers/onnx_model_vae.py +42 -0
  309. onnxruntime/transformers/onnx_utils.py +55 -0
  310. onnxruntime/transformers/optimizer.py +620 -0
  311. onnxruntime/transformers/past_helper.py +149 -0
  312. onnxruntime/transformers/profile_result_processor.py +358 -0
  313. onnxruntime/transformers/profiler.py +434 -0
  314. onnxruntime/transformers/quantize_helper.py +76 -0
  315. onnxruntime/transformers/shape_infer_helper.py +121 -0
  316. onnxruntime/transformers/shape_optimizer.py +400 -0
  317. onnxruntime/transformers/torch_onnx_export_helper.py +74 -0
  318. onnxruntime_directml-1.24.1.dist-info/METADATA +216 -0
  319. onnxruntime_directml-1.24.1.dist-info/RECORD +322 -0
  320. onnxruntime_directml-1.24.1.dist-info/WHEEL +5 -0
  321. onnxruntime_directml-1.24.1.dist-info/entry_points.txt +2 -0
  322. onnxruntime_directml-1.24.1.dist-info/top_level.txt +1 -0
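The largest single addition in this release is the new Stable Diffusion benchmark script; the +1519-line hunk below matches onnxruntime/transformers/models/stable_diffusion/benchmark.py from the list above. Its core measurement loop times each pipeline call and reports average and median latency (with GPU memory measured around warm-up runs). A condensed sketch of that pattern, with a hypothetical `pipe` callable standing in for a diffusers pipeline, is:

import statistics
import time

def benchmark_pipeline(pipe, prompts, batch_size=1, steps=30, height=512, width=512):
    # Hypothetical helper mirroring the timing loop in the hunk below:
    # call the pipeline once per prompt, record wall-clock latency,
    # and summarize the results with average and median.
    latencies = []
    for prompt in prompts:
        start = time.time()
        pipe(prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps)
        latencies.append(time.time() - start)
    return {
        "average_latency": sum(latencies) / len(latencies),
        "median_latency": statistics.median(latencies),
    }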
@@ -0,0 +1,1519 @@
1
+ # -------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License.
4
+ # --------------------------------------------------------------------------
5
+
6
+ import argparse
7
+ import csv
8
+ import logging
9
+ import os
10
+ import statistics
11
+ import sys
12
+ import time
13
+ from pathlib import Path
14
+
15
+ # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package.
16
+ import torch
17
+ from benchmark_helper import measure_memory
18
+
19
+ SD_MODELS = {
20
+ "1.5": "runwayml/stable-diffusion-v1-5",
21
+ "2.0": "stabilityai/stable-diffusion-2",
22
+ "2.1": "stabilityai/stable-diffusion-2-1",
23
+ "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
24
+ "3.0M": "stabilityai/stable-diffusion-3-medium-diffusers",
25
+ "3.5M": "stabilityai/stable-diffusion-3.5-medium",
26
+ "3.5L": "stabilityai/stable-diffusion-3.5-large",
27
+ "Flux.1S": "black-forest-labs/FLUX.1-schnell",
28
+ "Flux.1D": "black-forest-labs/FLUX.1-dev",
29
+ }
30
+
31
+ PROVIDERS = {
32
+ "cuda": "CUDAExecutionProvider",
33
+ "migraphx": "MIGraphXExecutionProvider",
34
+ "tensorrt": "TensorrtExecutionProvider",
35
+ }
36
+
37
+
38
+ def example_prompts():
39
+ prompts = [
40
+ "a photo of an astronaut riding a horse on mars",
41
+ "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
42
+ "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting",
43
+ "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery",
44
+ "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product",
45
+ "background texture of stones, masterpiece, artistic, stunning photo, award winner photo",
46
+ "new international organic style house, tropical surroundings, architecture, 8k, hdr",
47
+ "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
48
+ "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
49
+ "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k",
50
+ ]
51
+
52
+ negative_prompt = "bad composition, ugly, abnormal, malformed"
53
+
54
+ return prompts, negative_prompt
55
+
56
+
57
+ def warmup_prompts():
58
+ return "warm up", "bad"
59
+
60
+
61
+ def measure_gpu_memory(monitor_type, func, start_memory=None):
62
+ return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
63
+
64
+
65
+ def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_checker: bool):
66
+ from diffusers import DDIMScheduler, OnnxStableDiffusionPipeline # noqa: PLC0415
67
+
68
+ import onnxruntime # noqa: PLC0415
69
+
70
+ if directory is not None:
71
+ assert os.path.exists(directory)
72
+ session_options = onnxruntime.SessionOptions()
73
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
74
+ directory,
75
+ provider=provider,
76
+ sess_options=session_options,
77
+ )
78
+ else:
79
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
80
+ model_name,
81
+ revision="onnx",
82
+ provider=provider,
83
+ use_auth_token=True,
84
+ )
85
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
86
+ pipe.set_progress_bar_config(disable=True)
87
+
88
+ if disable_safety_checker:
89
+ pipe.safety_checker = None
90
+ pipe.feature_extractor = None
91
+
92
+ return pipe
93
+
94
+
95
+ def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool):
96
+ if "FLUX" in model_name:
97
+ from diffusers import FluxPipeline # noqa: PLC0415
98
+
99
+ pipe = FluxPipeline.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")
100
+ if enable_torch_compile:
101
+ pipe.transformer.to(memory_format=torch.channels_last)
102
+ pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
103
+ return pipe
104
+
105
+ if "stable-diffusion-3" in model_name:
106
+ from diffusers import StableDiffusion3Pipeline # noqa: PLC0415
107
+
108
+ pipe = StableDiffusion3Pipeline.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")
109
+ if enable_torch_compile:
110
+ pipe.transformer.to(memory_format=torch.channels_last)
111
+ pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
112
+ return pipe
113
+
114
+ from diffusers import DDIMScheduler, StableDiffusionPipeline # noqa: PLC0415
115
+ from torch import channels_last, float16 # noqa: PLC0415
116
+
117
+ pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=float16).to("cuda")
118
+
119
+ pipe.unet.to(memory_format=channels_last) # in-place operation
120
+
121
+ if use_xformers:
122
+ pipe.enable_xformers_memory_efficient_attention()
123
+
124
+ if enable_torch_compile:
125
+ pipe.unet = torch.compile(pipe.unet)
126
+ pipe.vae = torch.compile(pipe.vae)
127
+ pipe.text_encoder = torch.compile(pipe.text_encoder)
128
+ print("Torch compiled unet, vae and text_encoder")
129
+
130
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
131
+ pipe.set_progress_bar_config(disable=True)
132
+
133
+ if disable_safety_checker:
134
+ pipe.safety_checker = None
135
+ pipe.feature_extractor = None
136
+
137
+ return pipe
138
+
139
+
140
+ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, steps: int, disable_safety_checker: bool):
141
+ short_model_name = model_name.split("/")[-1].replace("stable-diffusion-", "sd")
142
+ return f"{engine}_{short_model_name}_b{batch_size}_s{steps}" + ("" if disable_safety_checker else "_safe")
143
+
144
+
145
+ def run_ort_pipeline(
146
+ pipe,
147
+ batch_size: int,
148
+ image_filename_prefix: str,
149
+ height,
150
+ width,
151
+ steps,
152
+ num_prompts,
153
+ batch_count,
154
+ start_memory,
155
+ memory_monitor_type,
156
+ skip_warmup: bool = False,
157
+ ):
158
+ from diffusers import OnnxStableDiffusionPipeline # noqa: PLC0415
159
+
160
+ assert isinstance(pipe, OnnxStableDiffusionPipeline)
161
+
162
+ prompts, negative_prompt = example_prompts()
163
+
164
+ def warmup():
165
+ if skip_warmup:
166
+ return
167
+ prompt, negative = warmup_prompts()
168
+ pipe(
169
+ prompt=[prompt] * batch_size,
170
+ height=height,
171
+ width=width,
172
+ num_inference_steps=steps,
173
+ negative_prompt=[negative] * batch_size,
174
+ )
175
+
176
+ # Run warm up, and measure GPU memory of two runs
177
+ # (The first run has cuDNN/MIOpen algo search, so it might need more memory.)
178
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
179
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
180
+
181
+ warmup()
182
+
183
+ latency_list = []
184
+ for i, prompt in enumerate(prompts):
185
+ if i >= num_prompts:
186
+ break
187
+ inference_start = time.time()
188
+ images = pipe(
189
+ prompt=[prompt] * batch_size,
190
+ height=height,
191
+ width=width,
192
+ num_inference_steps=steps,
193
+ negative_prompt=[negative_prompt] * batch_size,
194
+ ).images
195
+ inference_end = time.time()
196
+ latency = inference_end - inference_start
197
+ latency_list.append(latency)
198
+ print(f"Inference took {latency:.3f} seconds")
199
+ for k, image in enumerate(images):
200
+ image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
201
+
202
+ from onnxruntime import __version__ as ort_version # noqa: PLC0415
203
+
204
+ return {
205
+ "engine": "onnxruntime",
206
+ "version": ort_version,
207
+ "height": height,
208
+ "width": width,
209
+ "steps": steps,
210
+ "batch_size": batch_size,
211
+ "batch_count": batch_count,
212
+ "num_prompts": num_prompts,
213
+ "average_latency": sum(latency_list) / len(latency_list),
214
+ "median_latency": statistics.median(latency_list),
215
+ "first_run_memory_MB": first_run_memory,
216
+ "second_run_memory_MB": second_run_memory,
217
+ }
218
+
219
+
220
+ def get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size) -> dict:
221
+ # Flux does not support negative prompt
222
+ kwargs = (
223
+ (
224
+ {"negative_prompt": negative_prompt}
225
+ if use_num_images_per_prompt
226
+ else {"negative_prompt": [negative_prompt] * batch_size}
227
+ )
228
+ if not is_flux
229
+ else {}
230
+ )
231
+
232
+ # Fix the random seed so that we can inspect the output quality easily.
233
+ if torch.cuda.is_available():
234
+ kwargs["generator"] = torch.Generator(device="cuda").manual_seed(123)
235
+
236
+ return kwargs
237
+
238
+
239
+ def run_torch_pipeline(
240
+ pipe,
241
+ batch_size: int,
242
+ image_filename_prefix: str,
243
+ height,
244
+ width,
245
+ steps,
246
+ num_prompts,
247
+ batch_count,
248
+ start_memory,
249
+ memory_monitor_type,
250
+ skip_warmup=False,
251
+ ):
252
+ prompts, negative_prompt = example_prompts()
253
+
254
+ import diffusers # noqa: PLC0415
255
+
256
+ is_flux = isinstance(pipe, diffusers.FluxPipeline)
257
+
258
+ def warmup():
259
+ if skip_warmup:
260
+ return
261
+ prompt, negative = warmup_prompts()
262
+ extra_kwargs = get_negative_prompt_kwargs(negative, False, is_flux, batch_size)
263
+ pipe(prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs)
264
+
265
+ # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
266
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
267
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
268
+
269
+ warmup()
270
+
271
+ torch.set_grad_enabled(False)
272
+
273
+ latency_list = []
274
+ for i, prompt in enumerate(prompts):
275
+ if i >= num_prompts:
276
+ break
277
+ torch.cuda.synchronize()
278
+ inference_start = time.time()
279
+ extra_kwargs = get_negative_prompt_kwargs(negative_prompt, False, is_flux, batch_size)
280
+ images = pipe(
281
+ prompt=[prompt] * batch_size,
282
+ height=height,
283
+ width=width,
284
+ num_inference_steps=steps,
285
+ **extra_kwargs,
286
+ ).images
287
+
288
+ torch.cuda.synchronize()
289
+ inference_end = time.time()
290
+ latency = inference_end - inference_start
291
+ latency_list.append(latency)
292
+ print(f"Inference took {latency:.3f} seconds")
293
+ for k, image in enumerate(images):
294
+ image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
295
+
296
+ return {
297
+ "engine": "torch",
298
+ "version": torch.__version__,
299
+ "height": height,
300
+ "width": width,
301
+ "steps": steps,
302
+ "batch_size": batch_size,
303
+ "batch_count": batch_count,
304
+ "num_prompts": num_prompts,
305
+ "average_latency": sum(latency_list) / len(latency_list),
306
+ "median_latency": statistics.median(latency_list),
307
+ "first_run_memory_MB": first_run_memory,
308
+ "second_run_memory_MB": second_run_memory,
309
+ }
310
+
311
+
312
+ def run_ort(
313
+ model_name: str,
314
+ directory: str,
315
+ provider: str,
316
+ batch_size: int,
317
+ disable_safety_checker: bool,
318
+ height: int,
319
+ width: int,
320
+ steps: int,
321
+ num_prompts: int,
322
+ batch_count: int,
323
+ start_memory,
324
+ memory_monitor_type,
325
+ tuning: bool,
326
+ skip_warmup: bool = False,
327
+ ):
328
+ provider_and_options = provider
329
+ if tuning and provider in ["CUDAExecutionProvider"]:
330
+ provider_and_options = (provider, {"tunable_op_enable": 1, "tunable_op_tuning_enable": 1})
331
+
332
+ load_start = time.time()
333
+ pipe = get_ort_pipeline(model_name, directory, provider_and_options, disable_safety_checker)
334
+ load_end = time.time()
335
+ print(f"Model loading took {load_end - load_start} seconds")
336
+
337
+ image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, steps, disable_safety_checker)
338
+ result = run_ort_pipeline(
339
+ pipe,
340
+ batch_size,
341
+ image_filename_prefix,
342
+ height,
343
+ width,
344
+ steps,
345
+ num_prompts,
346
+ batch_count,
347
+ start_memory,
348
+ memory_monitor_type,
349
+ skip_warmup=skip_warmup,
350
+ )
351
+
352
+ result.update(
353
+ {
354
+ "model_name": model_name,
355
+ "directory": directory,
356
+ "provider": provider.replace("ExecutionProvider", ""),
357
+ "disable_safety_checker": disable_safety_checker,
358
+ "enable_cuda_graph": False,
359
+ }
360
+ )
361
+ return result
362
+
363
+
364
+ def get_optimum_ort_pipeline(
365
+ model_name: str,
366
+ directory: str,
367
+ provider="CUDAExecutionProvider",
368
+ disable_safety_checker: bool = True,
369
+ use_io_binding: bool = False,
370
+ ):
371
+ from optimum.onnxruntime import ORTPipelineForText2Image # noqa: PLC0415
372
+
373
+ if directory is not None and os.path.exists(directory):
374
+ pipeline = ORTPipelineForText2Image.from_pretrained(directory, provider=provider, use_io_binding=use_io_binding)
375
+ else:
376
+ pipeline = ORTPipelineForText2Image.from_pretrained(
377
+ model_name,
378
+ export=True,
379
+ provider=provider,
380
+ use_io_binding=use_io_binding,
381
+ )
382
+ pipeline.save_pretrained(directory)
383
+
384
+ if disable_safety_checker:
385
+ pipeline.safety_checker = None
386
+ pipeline.feature_extractor = None
387
+
388
+ return pipeline
389
+
390
+
391
+ def run_optimum_ort_pipeline(
392
+ pipe,
393
+ batch_size: int,
394
+ image_filename_prefix: str,
395
+ height,
396
+ width,
397
+ steps,
398
+ num_prompts,
399
+ batch_count,
400
+ start_memory,
401
+ memory_monitor_type,
402
+ use_num_images_per_prompt=False,
403
+ skip_warmup=False,
404
+ ):
405
+ print("Pipeline type", type(pipe))
406
+ from optimum.onnxruntime.modeling_diffusion import ORTFluxPipeline # noqa: PLC0415
407
+
408
+ is_flux = isinstance(pipe, ORTFluxPipeline)
409
+
410
+ prompts, negative_prompt = example_prompts()
411
+
412
+ def warmup():
413
+ if skip_warmup:
414
+ return
415
+ prompt, negative = warmup_prompts()
416
+ extra_kwargs = get_negative_prompt_kwargs(negative, use_num_images_per_prompt, is_flux, batch_size)
417
+ if use_num_images_per_prompt:
418
+ pipe(
419
+ prompt=prompt,
420
+ height=height,
421
+ width=width,
422
+ num_inference_steps=steps,
423
+ num_images_per_prompt=batch_count,
424
+ **extra_kwargs,
425
+ )
426
+ else:
427
+ pipe(prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs)
428
+
429
+ # Run warm up, and measure GPU memory of two runs.
430
+ # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
431
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
432
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
433
+
434
+ warmup()
435
+
436
+ extra_kwargs = get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size)
437
+
438
+ latency_list = []
439
+ for i, prompt in enumerate(prompts):
440
+ if i >= num_prompts:
441
+ break
442
+ inference_start = time.time()
443
+ if use_num_images_per_prompt:
444
+ images = pipe(
445
+ prompt=prompt,
446
+ height=height,
447
+ width=width,
448
+ num_inference_steps=steps,
449
+ num_images_per_prompt=batch_size,
450
+ **extra_kwargs,
451
+ ).images
452
+ else:
453
+ images = pipe(
454
+ prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs
455
+ ).images
456
+ inference_end = time.time()
457
+ latency = inference_end - inference_start
458
+ latency_list.append(latency)
459
+ print(f"Inference took {latency:.3f} seconds")
460
+ for k, image in enumerate(images):
461
+ image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
462
+
463
+ from onnxruntime import __version__ as ort_version # noqa: PLC0415
464
+
465
+ return {
466
+ "engine": "optimum_ort",
467
+ "version": ort_version,
468
+ "height": height,
469
+ "width": width,
470
+ "steps": steps,
471
+ "batch_size": batch_size,
472
+ "batch_count": batch_count,
473
+ "num_prompts": num_prompts,
474
+ "average_latency": sum(latency_list) / len(latency_list),
475
+ "median_latency": statistics.median(latency_list),
476
+ "first_run_memory_MB": first_run_memory,
477
+ "second_run_memory_MB": second_run_memory,
478
+ }
479
+
480
+
481
+ def run_optimum_ort(
482
+ model_name: str,
483
+ directory: str,
484
+ provider: str,
485
+ batch_size: int,
486
+ disable_safety_checker: bool,
487
+ height: int,
488
+ width: int,
489
+ steps: int,
490
+ num_prompts: int,
491
+ batch_count: int,
492
+ start_memory,
493
+ memory_monitor_type,
494
+ use_io_binding: bool = False,
495
+ skip_warmup: bool = False,
496
+ ):
497
+ load_start = time.time()
498
+ pipe = get_optimum_ort_pipeline(
499
+ model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
500
+ )
501
+ load_end = time.time()
502
+ print(f"Model loading took {load_end - load_start} seconds")
503
+
504
+ full_model_name = model_name + "_" + Path(directory).name if directory else model_name
505
+ image_filename_prefix = get_image_filename_prefix(
506
+ "optimum", full_model_name, batch_size, steps, disable_safety_checker
507
+ )
508
+ result = run_optimum_ort_pipeline(
509
+ pipe,
510
+ batch_size,
511
+ image_filename_prefix,
512
+ height,
513
+ width,
514
+ steps,
515
+ num_prompts,
516
+ batch_count,
517
+ start_memory,
518
+ memory_monitor_type,
519
+ skip_warmup=skip_warmup,
520
+ )
521
+
522
+ result.update(
523
+ {
524
+ "model_name": model_name,
525
+ "directory": directory,
526
+ "provider": provider.replace("ExecutionProvider", ""),
527
+ "disable_safety_checker": disable_safety_checker,
528
+ "enable_cuda_graph": False,
529
+ }
530
+ )
531
+ return result
532
+
533
+
534
+ def run_ort_trt_static(
535
+ work_dir: str,
536
+ version: str,
537
+ batch_size: int,
538
+ disable_safety_checker: bool,
539
+ height: int,
540
+ width: int,
541
+ steps: int,
542
+ num_prompts: int,
543
+ batch_count: int,
544
+ start_memory,
545
+ memory_monitor_type,
546
+ max_batch_size: int,
547
+ nvtx_profile: bool = False,
548
+ use_cuda_graph: bool = True,
549
+ ):
550
+ print("[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)")
551
+
552
+ # Register TensorRT plugins
553
+ from trt_utilities import init_trt_plugins # noqa: PLC0415
554
+
555
+ init_trt_plugins()
556
+
557
+ assert batch_size <= max_batch_size
558
+
559
+ from diffusion_models import PipelineInfo # noqa: PLC0415
560
+
561
+ pipeline_info = PipelineInfo(version)
562
+ short_name = pipeline_info.short_name()
563
+
564
+ from engine_builder import EngineType, get_engine_paths # noqa: PLC0415
565
+ from pipeline_stable_diffusion import StableDiffusionPipeline # noqa: PLC0415
566
+
567
+ engine_type = EngineType.ORT_TRT
568
+ onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths(work_dir, pipeline_info, engine_type)
569
+
570
+ # Initialize pipeline
571
+ pipeline = StableDiffusionPipeline(
572
+ pipeline_info,
573
+ scheduler="DDIM",
574
+ output_dir=output_dir,
575
+ verbose=False,
576
+ nvtx_profile=nvtx_profile,
577
+ max_batch_size=max_batch_size,
578
+ use_cuda_graph=use_cuda_graph,
579
+ framework_model_dir=framework_model_dir,
580
+ engine_type=engine_type,
581
+ )
582
+
583
+ # Load TensorRT engines and pytorch modules
584
+ pipeline.backend.build_engines(
585
+ engine_dir,
586
+ framework_model_dir,
587
+ onnx_dir,
588
+ 17,
589
+ opt_image_height=height,
590
+ opt_image_width=width,
591
+ opt_batch_size=batch_size,
592
+ static_batch=True,
593
+ static_image_shape=True,
594
+ max_workspace_size=0,
595
+ device_id=torch.cuda.current_device(),
596
+ )
597
+
598
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
599
+ # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
600
+ pipeline.load_resources(height, width, batch_size)
601
+
602
+ def warmup():
603
+ prompt, negative = warmup_prompts()
604
+ pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
605
+
606
+ # Run warm up, and measure GPU memory of two runs
607
+ # The first run has algo search so it might need more memory
608
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
609
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
610
+
611
+ warmup()
612
+
613
+ image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, steps, disable_safety_checker)
614
+
615
+ latency_list = []
616
+ prompts, negative_prompt = example_prompts()
617
+ for i, prompt in enumerate(prompts):
618
+ if i >= num_prompts:
619
+ break
620
+ inference_start = time.time()
621
+ # Use warmup mode here since non-warmup mode will save image to disk.
622
+ images, pipeline_time = pipeline.run(
623
+ [prompt] * batch_size,
624
+ [negative_prompt] * batch_size,
625
+ height,
626
+ width,
627
+ denoising_steps=steps,
628
+ guidance=7.5,
629
+ seed=123,
630
+ )
631
+ inference_end = time.time()
632
+ latency = inference_end - inference_start
633
+ latency_list.append(latency)
634
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
635
+ for k, image in enumerate(images):
636
+ image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
637
+
638
+ pipeline.teardown()
639
+
640
+ from tensorrt import __version__ as trt_version # noqa: PLC0415
641
+
642
+ from onnxruntime import __version__ as ort_version # noqa: PLC0415
643
+
644
+ return {
645
+ "model_name": pipeline_info.name(),
646
+ "engine": "onnxruntime",
647
+ "version": ort_version,
648
+ "provider": f"tensorrt({trt_version})",
649
+ "directory": engine_dir,
650
+ "height": height,
651
+ "width": width,
652
+ "steps": steps,
653
+ "batch_size": batch_size,
654
+ "batch_count": batch_count,
655
+ "num_prompts": num_prompts,
656
+ "average_latency": sum(latency_list) / len(latency_list),
657
+ "median_latency": statistics.median(latency_list),
658
+ "first_run_memory_MB": first_run_memory,
659
+ "second_run_memory_MB": second_run_memory,
660
+ "disable_safety_checker": disable_safety_checker,
661
+ "enable_cuda_graph": use_cuda_graph,
662
+ }
663
+
664
+
665
+ def run_tensorrt_static(
666
+ work_dir: str,
667
+ version: str,
668
+ model_name: str,
669
+ batch_size: int,
670
+ disable_safety_checker: bool,
671
+ height: int,
672
+ width: int,
673
+ steps: int,
674
+ num_prompts: int,
675
+ batch_count: int,
676
+ start_memory,
677
+ memory_monitor_type,
678
+ max_batch_size: int,
679
+ nvtx_profile: bool = False,
680
+ use_cuda_graph: bool = True,
681
+ skip_warmup: bool = False,
682
+ ):
683
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
684
+
685
+ from cuda import cudart # noqa: PLC0415
686
+
687
+ # Register TensorRT plugins
688
+ from trt_utilities import init_trt_plugins # noqa: PLC0415
689
+
690
+ init_trt_plugins()
691
+
692
+ assert batch_size <= max_batch_size
693
+
694
+ from diffusion_models import PipelineInfo # noqa: PLC0415
695
+
696
+ pipeline_info = PipelineInfo(version)
697
+
698
+ from engine_builder import EngineType, get_engine_paths # noqa: PLC0415
699
+ from pipeline_stable_diffusion import StableDiffusionPipeline # noqa: PLC0415
700
+
701
+ engine_type = EngineType.TRT
702
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
703
+ work_dir, pipeline_info, engine_type
704
+ )
705
+
706
+ # Initialize pipeline
707
+ pipeline = StableDiffusionPipeline(
708
+ pipeline_info,
709
+ scheduler="DDIM",
710
+ output_dir=output_dir,
711
+ verbose=False,
712
+ nvtx_profile=nvtx_profile,
713
+ max_batch_size=max_batch_size,
714
+ use_cuda_graph=True,
715
+ engine_type=engine_type,
716
+ )
717
+
718
+ # Load TensorRT engines and pytorch modules
719
+ pipeline.backend.load_engines(
720
+ engine_dir=engine_dir,
721
+ framework_model_dir=framework_model_dir,
722
+ onnx_dir=onnx_dir,
723
+ onnx_opset=17,
724
+ opt_batch_size=batch_size,
725
+ opt_image_height=height,
726
+ opt_image_width=width,
727
+ static_batch=True,
728
+ static_shape=True,
729
+ enable_all_tactics=False,
730
+ timing_cache=timing_cache,
731
+ )
732
+
733
+ # activate engines
734
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
735
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
736
+ pipeline.backend.activate_engines(shared_device_memory)
737
+
738
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
739
+ # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
740
+ pipeline.load_resources(height, width, batch_size)
741
+
742
+ def warmup():
743
+ if skip_warmup:
744
+ return
745
+ prompt, negative = warmup_prompts()
746
+ pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
747
+
748
+ # Run warm up, and measure GPU memory of two runs
749
+ # The first run has algo search so it might need more memory
750
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
751
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
752
+
753
+ warmup()
754
+
755
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker)
756
+
757
+ latency_list = []
758
+ prompts, negative_prompt = example_prompts()
759
+ for i, prompt in enumerate(prompts):
760
+ if i >= num_prompts:
761
+ break
762
+ inference_start = time.time()
763
+ # Use warmup mode here since non-warmup mode will save image to disk.
764
+ images, pipeline_time = pipeline.run(
765
+ [prompt] * batch_size,
766
+ [negative_prompt] * batch_size,
767
+ height,
768
+ width,
769
+ denoising_steps=steps,
770
+ seed=123,
771
+ )
772
+ inference_end = time.time()
773
+ latency = inference_end - inference_start
774
+ latency_list.append(latency)
775
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
776
+ for k, image in enumerate(images):
777
+ image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
778
+
779
+ pipeline.teardown()
780
+
781
+ import tensorrt as trt # noqa: PLC0415
782
+
783
+ return {
784
+ "engine": "tensorrt",
785
+ "version": trt.__version__,
786
+ "provider": "default",
787
+ "height": height,
788
+ "width": width,
789
+ "steps": steps,
790
+ "batch_size": batch_size,
791
+ "batch_count": batch_count,
792
+ "num_prompts": num_prompts,
793
+ "average_latency": sum(latency_list) / len(latency_list),
794
+ "median_latency": statistics.median(latency_list),
795
+ "first_run_memory_MB": first_run_memory,
796
+ "second_run_memory_MB": second_run_memory,
797
+ "enable_cuda_graph": use_cuda_graph,
798
+ }
799
+
800
+
801
+ def run_tensorrt_static_xl(
802
+ work_dir: str,
803
+ version: str,
804
+ batch_size: int,
805
+ disable_safety_checker: bool,
806
+ height: int,
807
+ width: int,
808
+ steps: int,
809
+ num_prompts: int,
810
+ batch_count: int,
811
+ start_memory,
812
+ memory_monitor_type,
813
+ max_batch_size: int,
814
+ nvtx_profile: bool = False,
815
+ use_cuda_graph=True,
816
+ skip_warmup: bool = False,
817
+ ):
818
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
819
+
820
+ import tensorrt as trt # noqa: PLC0415
821
+ from cuda import cudart # noqa: PLC0415
822
+ from trt_utilities import init_trt_plugins # noqa: PLC0415
823
+
824
+ # Validate image dimensions
825
+ image_height = height
826
+ image_width = width
827
+ if image_height % 8 != 0 or image_width % 8 != 0:
828
+ raise ValueError(
829
+ f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}."
830
+ )
831
+
832
+ # Register TensorRT plugins
833
+ init_trt_plugins()
834
+
835
+ assert batch_size <= max_batch_size
836
+
837
+ from diffusion_models import PipelineInfo # noqa: PLC0415
838
+ from engine_builder import EngineType, get_engine_paths # noqa: PLC0415
839
+
840
+ def init_pipeline(pipeline_class, pipeline_info):
841
+ engine_type = EngineType.TRT
842
+
843
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
844
+ work_dir, pipeline_info, engine_type
845
+ )
846
+
847
+ # Initialize pipeline
848
+ pipeline = pipeline_class(
849
+ pipeline_info,
850
+ scheduler="DDIM",
851
+ output_dir=output_dir,
852
+ verbose=False,
853
+ nvtx_profile=nvtx_profile,
854
+ max_batch_size=max_batch_size,
855
+ use_cuda_graph=use_cuda_graph,
856
+ framework_model_dir=framework_model_dir,
857
+ engine_type=engine_type,
858
+ )
859
+
860
+ pipeline.backend.load_engines(
861
+ engine_dir=engine_dir,
862
+ framework_model_dir=framework_model_dir,
863
+ onnx_dir=onnx_dir,
864
+ onnx_opset=17,
865
+ opt_batch_size=batch_size,
866
+ opt_image_height=height,
867
+ opt_image_width=width,
868
+ static_batch=True,
869
+ static_shape=True,
870
+ enable_all_tactics=False,
871
+ timing_cache=timing_cache,
872
+ )
873
+ return pipeline
874
+
875
+ from pipeline_stable_diffusion import StableDiffusionPipeline # noqa: PLC0415
876
+
877
+ pipeline_info = PipelineInfo(version)
878
+ pipeline = init_pipeline(StableDiffusionPipeline, pipeline_info)
879
+
880
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
881
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
882
+ pipeline.backend.activate_engines(shared_device_memory)
883
+
884
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
885
+ # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
886
+ pipeline.load_resources(image_height, image_width, batch_size)
887
+
888
+ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
889
+ return pipeline.run(
890
+ prompt,
891
+ negative_prompt,
892
+ image_height,
893
+ image_width,
894
+ denoising_steps=steps,
895
+ guidance=5.0,
896
+ seed=seed,
897
+ )
898
+
899
+ def warmup():
900
+ if skip_warmup:
901
+ return
902
+ prompt, negative = warmup_prompts()
903
+ run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
904
+
905
+ # Run warm up, and measure GPU memory of two runs
906
+ # The first run has algo search so it might need more memory
907
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
908
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
909
+
910
+ warmup()
911
+
912
+ model_name = pipeline_info.name()
913
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker)
914
+
915
+ latency_list = []
916
+ prompts, negative_prompt = example_prompts()
917
+ for i, prompt in enumerate(prompts):
918
+ if i >= num_prompts:
919
+ break
920
+ inference_start = time.time()
921
+ # Use warmup mode here since non-warmup mode will save image to disk.
922
+ images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
923
+ inference_end = time.time()
924
+ latency = inference_end - inference_start
925
+ latency_list.append(latency)
926
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
927
+ for k, image in enumerate(images):
928
+ image.save(f"{image_filename_prefix}_{i}_{k}.png")
929
+
930
+ pipeline.teardown()
931
+
932
+ return {
933
+ "model_name": model_name,
934
+ "engine": "tensorrt",
935
+ "version": trt.__version__,
936
+ "provider": "default",
937
+ "height": height,
938
+ "width": width,
939
+ "steps": steps,
940
+ "batch_size": batch_size,
941
+ "batch_count": batch_count,
942
+ "num_prompts": num_prompts,
943
+ "average_latency": sum(latency_list) / len(latency_list),
944
+ "median_latency": statistics.median(latency_list),
945
+ "first_run_memory_MB": first_run_memory,
946
+ "second_run_memory_MB": second_run_memory,
947
+ "enable_cuda_graph": use_cuda_graph,
948
+ }
949
+
950
+
951
+ def run_ort_trt_xl(
+     work_dir: str,
+     version: str,
+     batch_size: int,
+     disable_safety_checker: bool,
+     height: int,
+     width: int,
+     steps: int,
+     num_prompts: int,
+     batch_count: int,
+     start_memory,
+     memory_monitor_type,
+     max_batch_size: int,
+     nvtx_profile: bool = False,
+     use_cuda_graph=True,
+     skip_warmup: bool = False,
+ ):
+     from demo_utils import initialize_pipeline  # noqa: PLC0415
+     from engine_builder import EngineType  # noqa: PLC0415
+
+     pipeline = initialize_pipeline(
+         version=version,
+         engine_type=EngineType.ORT_TRT,
+         work_dir=work_dir,
+         height=height,
+         width=width,
+         use_cuda_graph=use_cuda_graph,
+         max_batch_size=max_batch_size,
+         opt_batch_size=batch_size,
+     )
+
+     assert batch_size <= max_batch_size
+
+     pipeline.load_resources(height, width, batch_size)
+
+     def run_sd_xl_inference(prompt, negative_prompt, seed=None):
+         return pipeline.run(
+             prompt,
+             negative_prompt,
+             height,
+             width,
+             denoising_steps=steps,
+             guidance=5.0,
+             seed=seed,
+         )
+
+     def warmup():
+         if skip_warmup:
+             return
+         prompt, negative = warmup_prompts()
+         run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
+
+     # Run warm up, and measure GPU memory of two runs.
+     # The first run has algo search so it might need more memory.
+     first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
+     second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
+
+     warmup()
+
+     model_name = pipeline.pipeline_info.name()
+     image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, steps, disable_safety_checker)
+
+     latency_list = []
+     prompts, negative_prompt = example_prompts()
+     for i, prompt in enumerate(prompts):
+         if i >= num_prompts:
+             break
+         inference_start = time.time()
+         # Run inference for this prompt; generated images are saved to disk below.
+         images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+         inference_end = time.time()
+         latency = inference_end - inference_start
+         latency_list.append(latency)
+         print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+         for k, image in enumerate(images):
+             filename = f"{image_filename_prefix}_{i}_{k}.png"
+             image.save(filename)
+             print("Image saved to", filename)
+
+     pipeline.teardown()
+
+     from tensorrt import __version__ as trt_version  # noqa: PLC0415
+
+     from onnxruntime import __version__ as ort_version  # noqa: PLC0415
+
+     return {
+         "model_name": model_name,
+         "engine": "onnxruntime",
+         "version": ort_version,
+         "provider": f"tensorrt({trt_version})",
+         "height": height,
+         "width": width,
+         "steps": steps,
+         "batch_size": batch_size,
+         "batch_count": batch_count,
+         "num_prompts": num_prompts,
+         "average_latency": sum(latency_list) / len(latency_list),
+         "median_latency": statistics.median(latency_list),
+         "first_run_memory_MB": first_run_memory,
+         "second_run_memory_MB": second_run_memory,
+         "enable_cuda_graph": use_cuda_graph,
+     }
+
+
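The two back-to-back warmup measurements above follow a common pattern: the first run carries one-time costs such as algorithm search and engine setup, so its peak memory is reported separately from a steady-state second run. The snippet below is only a minimal sketch of that measure-around-a-callable idea using plain PyTorch APIs; it is not this module's measure_gpu_memory helper (whose implementation is not shown in this section), and it assumes a CUDA-enabled PyTorch build.

    import torch

    def peak_cuda_memory_mb(func) -> float:
        """Run func and return the peak CUDA memory allocated during the call, in MB."""
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()
        func()
        torch.cuda.synchronize()
        return torch.cuda.max_memory_allocated() / (1024 * 1024)

    # First call typically includes algorithm search / lazy initialization:
    # first_mb = peak_cuda_memory_mb(warmup)
    # second_mb = peak_cuda_memory_mb(warmup)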
+ def run_torch(
+     model_name: str,
+     batch_size: int,
+     disable_safety_checker: bool,
+     enable_torch_compile: bool,
+     use_xformers: bool,
+     height: int,
+     width: int,
+     steps: int,
+     num_prompts: int,
+     batch_count: int,
+     start_memory,
+     memory_monitor_type,
+     skip_warmup: bool = True,
+ ):
+     torch.backends.cudnn.enabled = True
+     torch.backends.cudnn.benchmark = True
+
+     torch.set_grad_enabled(False)
+
+     load_start = time.time()
+     pipe = get_torch_pipeline(model_name, disable_safety_checker, enable_torch_compile, use_xformers)
+     load_end = time.time()
+     print(f"Model loading took {load_end - load_start} seconds")
+
+     image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, steps, disable_safety_checker)
+
+     if not enable_torch_compile:
+         with torch.inference_mode():
+             result = run_torch_pipeline(
+                 pipe,
+                 batch_size,
+                 image_filename_prefix,
+                 height,
+                 width,
+                 steps,
+                 num_prompts,
+                 batch_count,
+                 start_memory,
+                 memory_monitor_type,
+                 skip_warmup=skip_warmup,
+             )
+     else:
+         result = run_torch_pipeline(
+             pipe,
+             batch_size,
+             image_filename_prefix,
+             height,
+             width,
+             steps,
+             num_prompts,
+             batch_count,
+             start_memory,
+             memory_monitor_type,
+             skip_warmup=skip_warmup,
+         )
+
+     result.update(
+         {
+             "model_name": model_name,
+             "directory": None,
+             "provider": "compile" if enable_torch_compile else "xformers" if use_xformers else "default",
+             "disable_safety_checker": disable_safety_checker,
+             "enable_cuda_graph": False,
+         }
+     )
+     return result
+
+
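run_torch disables autograd in two ways: globally via torch.set_grad_enabled(False), and, for the non-compiled branch only, with the stricter torch.inference_mode() context manager. The snippet below is just a minimal, standalone illustration of these two standard PyTorch no-grad patterns, independent of this benchmark script.

    import torch

    torch.set_grad_enabled(False)      # global switch: newly created tensors do not track gradients

    with torch.inference_mode():       # stricter: also skips view/version-counter bookkeeping
        y = torch.matmul(torch.rand(2, 3), torch.rand(3, 2))

    assert not y.requires_grad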
+ def parse_arguments():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "-e",
+         "--engine",
+         required=False,
+         type=str,
+         default="onnxruntime",
+         choices=["onnxruntime", "optimum", "torch", "tensorrt"],
+         help="Engines to benchmark. Default is onnxruntime.",
+     )
+
+     parser.add_argument(
+         "-r",
+         "--provider",
+         required=False,
+         type=str,
+         default="cuda",
+         choices=list(PROVIDERS.keys()),
+         help="Provider to benchmark. Default is CUDAExecutionProvider.",
+     )
+
+     parser.add_argument(
+         "-t",
+         "--tuning",
+         action="store_true",
+         help="Enable TunableOp and tuning. This will incur longer warmup latency.",
+     )
+
+     parser.add_argument(
+         "-v",
+         "--version",
+         required=False,
+         type=str,
+         choices=list(SD_MODELS.keys()),
+         default="1.5",
+         help="Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.",
+     )
+
+     parser.add_argument(
+         "-p",
+         "--pipeline",
+         required=False,
+         type=str,
+         default=None,
+         help="Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.",
+     )
+
+     parser.add_argument(
+         "-w",
+         "--work_dir",
+         required=False,
+         type=str,
+         default=".",
+         help="Root directory to save exported onnx models, built engines etc.",
+     )
+
+     parser.add_argument(
+         "--enable_safety_checker",
+         required=False,
+         action="store_true",
+         help="Enable safety checker",
+     )
+     parser.set_defaults(enable_safety_checker=False)
+
+     parser.add_argument(
+         "--enable_torch_compile",
+         required=False,
+         action="store_true",
+         help="Enable torch.compile for UNet (PyTorch 2.0)",
+     )
+     parser.set_defaults(enable_torch_compile=False)
+
+     parser.add_argument(
+         "--use_xformers",
+         required=False,
+         action="store_true",
+         help="Use xformers for PyTorch",
+     )
+     parser.set_defaults(use_xformers=False)
+
+     parser.add_argument(
+         "--use_io_binding",
+         required=False,
+         action="store_true",
+         help="Use I/O Binding for Optimum.",
+     )
+     parser.set_defaults(use_io_binding=False)
+
+     parser.add_argument(
+         "--skip_warmup",
+         required=False,
+         action="store_true",
+         help="No warmup.",
+     )
+     parser.set_defaults(skip_warmup=False)
+
+     parser.add_argument(
+         "-b",
+         "--batch_size",
+         type=int,
+         default=1,
+         choices=[1, 2, 3, 4, 8, 10, 16, 32],
+         help="Number of images per batch. Default is 1.",
+     )
+
+     parser.add_argument(
+         "--height",
+         required=False,
+         type=int,
+         default=512,
+         help="Output image height. Default is 512.",
+     )
+
+     parser.add_argument(
+         "--width",
+         required=False,
+         type=int,
+         default=512,
+         help="Output image width. Default is 512.",
+     )
+
+     parser.add_argument(
+         "-s",
+         "--steps",
+         required=False,
+         type=int,
+         default=50,
+         help="Number of steps. Default is 50.",
+     )
+
+     parser.add_argument(
+         "-n",
+         "--num_prompts",
+         required=False,
+         type=int,
+         default=10,
+         help="Number of prompts. Default is 10.",
+     )
+
+     parser.add_argument(
+         "-c",
+         "--batch_count",
+         required=False,
+         type=int,
+         choices=range(1, 11),
+         default=5,
+         help="Number of batches to test. Default is 5.",
+     )
+
+     parser.add_argument(
+         "-m",
+         "--max_trt_batch_size",
+         required=False,
+         type=int,
+         choices=range(1, 16),
+         default=4,
+         help="Maximum batch size for TensorRT. Changing the value may trigger a TensorRT engine rebuild. Default is 4.",
+     )
+
+     parser.add_argument(
+         "-g",
+         "--enable_cuda_graph",
+         required=False,
+         action="store_true",
+         help="Enable CUDA graph. Requires onnxruntime >= 1.16",
+     )
+     parser.set_defaults(enable_cuda_graph=False)
+
+     args = parser.parse_args()
+
+     return args
+
+
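The flags above feed the run_* dispatch in main() below. For illustration only, and assuming the script is invoked as benchmark.py (the file name is not visible in this excerpt) with a placeholder ./sd_onnx_pipeline directory, typical invocations might look like:

    # Benchmark a saved ONNX pipeline with the CUDA execution provider:
    python benchmark.py -e onnxruntime -r cuda -v 1.5 -p ./sd_onnx_pipeline -b 1 -s 50

    # Benchmark the ORT TensorRT EP path with CUDA graph (requires onnxruntime >= 1.16):
    python benchmark.py -e onnxruntime -r tensorrt -v 1.5 -w ./trt_work_dir -g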
+ def print_loaded_libraries(cuda_related_only=True):
+     import psutil  # noqa: PLC0415
+
+     p = psutil.Process(os.getpid())
+     for lib in p.memory_maps():
+         if (not cuda_related_only) or any(x in lib.path for x in ("libcu", "libnv", "tensorrt")):
+             print(lib.path)
+
+
+ def main():
+     args = parse_arguments()
+     print(args)
+
+     if args.engine == "onnxruntime":
+         if args.version in ["2.1"]:
+             # Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model.
+             # The environment variables shall be set before the first run of Attention or MultiHeadAttention operator.
+             os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"
+
+         from packaging import version  # noqa: PLC0415
+
+         from onnxruntime import __version__ as ort_version  # noqa: PLC0415
+
+         if version.parse(ort_version) == version.parse("1.16.0"):
+             # ORT 1.16 has a bug that might trigger Attention RuntimeError when latest fusion script is applied on clip model.
+             # The workaround is to enable fused causal attention, or disable Attention fusion for clip model.
+             os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
+
+     if args.enable_cuda_graph:
+         if not (args.engine == "onnxruntime" and args.provider in ["cuda", "tensorrt"] and args.pipeline is None):
+             raise ValueError("The stable diffusion pipeline does not support CUDA graph.")
+
+         if version.parse(ort_version) < version.parse("1.16"):
+             raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later")
+
+     logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO, force=True)
+
+     memory_monitor_type = "cuda"
+
+     start_memory = measure_gpu_memory(memory_monitor_type, None)
+     print("GPU memory used before loading models:", start_memory)
+
+     sd_model = SD_MODELS[args.version]
+     provider = PROVIDERS[args.provider]
+     if args.engine == "onnxruntime" and args.provider == "tensorrt":
+         if "xl" in args.version:
+             print("Testing Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.")
+             result = run_ort_trt_xl(
+                 work_dir=args.work_dir,
+                 version=args.version,
+                 batch_size=args.batch_size,
+                 disable_safety_checker=True,
+                 height=args.height,
+                 width=args.width,
+                 steps=args.steps,
+                 num_prompts=args.num_prompts,
+                 batch_count=args.batch_count,
+                 start_memory=start_memory,
+                 memory_monitor_type=memory_monitor_type,
+                 max_batch_size=args.max_trt_batch_size,
+                 nvtx_profile=False,
+                 use_cuda_graph=args.enable_cuda_graph,
+                 skip_warmup=args.skip_warmup,
+             )
+         else:
+             print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.")
+             result = run_ort_trt_static(
+                 work_dir=args.work_dir,
+                 version=args.version,
+                 batch_size=args.batch_size,
+                 disable_safety_checker=not args.enable_safety_checker,
+                 height=args.height,
+                 width=args.width,
+                 steps=args.steps,
+                 num_prompts=args.num_prompts,
+                 batch_count=args.batch_count,
+                 start_memory=start_memory,
+                 memory_monitor_type=memory_monitor_type,
+                 max_batch_size=args.max_trt_batch_size,
+                 nvtx_profile=False,
+                 use_cuda_graph=args.enable_cuda_graph,
+                 skip_warmup=args.skip_warmup,
+             )
+     elif args.engine == "optimum" and provider == "CUDAExecutionProvider":
+         if "xl" in args.version:
+             os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
+
+         result = run_optimum_ort(
+             model_name=sd_model,
+             directory=args.pipeline,
+             provider=provider,
+             batch_size=args.batch_size,
+             disable_safety_checker=not args.enable_safety_checker,
+             height=args.height,
+             width=args.width,
+             steps=args.steps,
+             num_prompts=args.num_prompts,
+             batch_count=args.batch_count,
+             start_memory=start_memory,
+             memory_monitor_type=memory_monitor_type,
+             use_io_binding=args.use_io_binding,
+             skip_warmup=args.skip_warmup,
+         )
+     elif args.engine == "onnxruntime":
+         assert args.pipeline and os.path.isdir(args.pipeline), (
+             "--pipeline should be specified for the directory of ONNX models"
+         )
+         print(f"Testing diffusers StableDiffusionPipeline with {provider} provider and tuning={args.tuning}")
+         result = run_ort(
+             model_name=sd_model,
+             directory=args.pipeline,
+             provider=provider,
+             batch_size=args.batch_size,
+             disable_safety_checker=not args.enable_safety_checker,
+             height=args.height,
+             width=args.width,
+             steps=args.steps,
+             num_prompts=args.num_prompts,
+             batch_count=args.batch_count,
+             start_memory=start_memory,
+             memory_monitor_type=memory_monitor_type,
+             tuning=args.tuning,
+             skip_warmup=args.skip_warmup,
+         )
+     elif args.engine == "tensorrt" and "xl" in args.version:
+         print("Testing Txt2ImgXLPipeline with static input shape. Backend is TensorRT.")
+         result = run_tensorrt_static_xl(
+             work_dir=args.work_dir,
+             version=args.version,
+             batch_size=args.batch_size,
+             disable_safety_checker=True,
+             height=args.height,
+             width=args.width,
+             steps=args.steps,
+             num_prompts=args.num_prompts,
+             batch_count=args.batch_count,
+             start_memory=start_memory,
+             memory_monitor_type=memory_monitor_type,
+             max_batch_size=args.max_trt_batch_size,
+             nvtx_profile=False,
+             use_cuda_graph=args.enable_cuda_graph,
+             skip_warmup=args.skip_warmup,
+         )
+     elif args.engine == "tensorrt":
+         print("Testing Txt2ImgPipeline with static input shape. Backend is TensorRT.")
+         result = run_tensorrt_static(
+             work_dir=args.work_dir,
+             version=args.version,
+             model_name=sd_model,
+             batch_size=args.batch_size,
+             disable_safety_checker=True,
+             height=args.height,
+             width=args.width,
+             steps=args.steps,
+             num_prompts=args.num_prompts,
+             batch_count=args.batch_count,
+             start_memory=start_memory,
+             memory_monitor_type=memory_monitor_type,
+             max_batch_size=args.max_trt_batch_size,
+             nvtx_profile=False,
+             use_cuda_graph=args.enable_cuda_graph,
+             skip_warmup=args.skip_warmup,
+         )
+     else:
+         print(
+             f"Testing Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile={args.enable_torch_compile}, xformers={args.use_xformers}."
+         )
+         result = run_torch(
+             model_name=sd_model,
+             batch_size=args.batch_size,
+             disable_safety_checker=not args.enable_safety_checker,
+             enable_torch_compile=args.enable_torch_compile,
+             use_xformers=args.use_xformers,
+             height=args.height,
+             width=args.width,
+             steps=args.steps,
+             num_prompts=args.num_prompts,
+             batch_count=args.batch_count,
+             start_memory=start_memory,
+             memory_monitor_type=memory_monitor_type,
+             skip_warmup=args.skip_warmup,
+         )
+
+     print(result)
+
+     with open("benchmark_result.csv", mode="a", newline="") as csv_file:
+         column_names = [
+             "model_name",
+             "directory",
+             "engine",
+             "version",
+             "provider",
+             "disable_safety_checker",
+             "height",
+             "width",
+             "steps",
+             "batch_size",
+             "batch_count",
+             "num_prompts",
+             "average_latency",
+             "median_latency",
+             "first_run_memory_MB",
+             "second_run_memory_MB",
+             "enable_cuda_graph",
+         ]
+         csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
+         csv_writer.writeheader()
+         csv_writer.writerow(result)
+
+     # Show loaded DLLs when steps == 1 for debugging purposes.
+     if args.steps == 1:
+         print_loaded_libraries(args.provider in ["cuda", "tensorrt"])
+
+
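Note that main() opens benchmark_result.csv in append mode and writes a header row on every invocation, so repeated runs accumulate duplicate header lines; also, csv.DictWriter fills missing keys with blanks but raises on extra keys unless extrasaction="ignore" is set. The sketch below is only an illustration of writing the header once per file; it is not what the shipped script does.

    import csv
    import os

    def append_result(path: str, column_names: list, result: dict) -> None:
        """Append one result row, writing the header only if the file is new or empty."""
        write_header = not os.path.exists(path) or os.path.getsize(path) == 0
        with open(path, mode="a", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=column_names, extrasaction="ignore")
            if write_header:
                writer.writeheader()
            writer.writerow(result)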
+ if __name__ == "__main__":
+     import traceback
+
+     try:
+         main()
+     except Exception:
+         traceback.print_exception(*sys.exc_info())