onnxruntime-directml 1.20.0 cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. onnxruntime/LICENSE +21 -0
  2. onnxruntime/Privacy.md +21 -0
  3. onnxruntime/ThirdPartyNotices.txt +6508 -0
  4. onnxruntime/__init__.py +78 -0
  5. onnxruntime/backend/__init__.py +6 -0
  6. onnxruntime/backend/backend.py +174 -0
  7. onnxruntime/backend/backend_rep.py +53 -0
  8. onnxruntime/capi/DirectML.dll +0 -0
  9. onnxruntime/capi/__init__.py +4 -0
  10. onnxruntime/capi/_ld_preload.py +7 -0
  11. onnxruntime/capi/_pybind_state.py +33 -0
  12. onnxruntime/capi/convert_npz_to_onnx_adapter.py +48 -0
  13. onnxruntime/capi/onnxruntime.dll +0 -0
  14. onnxruntime/capi/onnxruntime_collect_build_info.py +47 -0
  15. onnxruntime/capi/onnxruntime_inference_collection.py +1108 -0
  16. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  17. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  18. onnxruntime/capi/onnxruntime_validation.py +150 -0
  19. onnxruntime/capi/version_info.py +2 -0
  20. onnxruntime/datasets/__init__.py +17 -0
  21. onnxruntime/datasets/logreg_iris.onnx +0 -0
  22. onnxruntime/datasets/mul_1.onnx +0 -0
  23. onnxruntime/datasets/sigmoid.onnx +13 -0
  24. onnxruntime/quantization/CalTableFlatBuffers/KeyValue.py +78 -0
  25. onnxruntime/quantization/CalTableFlatBuffers/TrtTable.py +90 -0
  26. onnxruntime/quantization/CalTableFlatBuffers/__init__.py +0 -0
  27. onnxruntime/quantization/__init__.py +16 -0
  28. onnxruntime/quantization/base_quantizer.py +532 -0
  29. onnxruntime/quantization/calibrate.py +1245 -0
  30. onnxruntime/quantization/execution_providers/qnn/__init__.py +2 -0
  31. onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py +132 -0
  32. onnxruntime/quantization/execution_providers/qnn/mixed_precision_overrides_utils.py +413 -0
  33. onnxruntime/quantization/execution_providers/qnn/preprocess.py +307 -0
  34. onnxruntime/quantization/execution_providers/qnn/quant_config.py +387 -0
  35. onnxruntime/quantization/fusions/__init__.py +3 -0
  36. onnxruntime/quantization/fusions/fusion.py +311 -0
  37. onnxruntime/quantization/fusions/fusion_gelu.py +272 -0
  38. onnxruntime/quantization/fusions/fusion_layernorm.py +135 -0
  39. onnxruntime/quantization/matmul_4bits_quantizer.py +1480 -0
  40. onnxruntime/quantization/matmul_bnb4_quantizer.py +240 -0
  41. onnxruntime/quantization/onnx_model.py +580 -0
  42. onnxruntime/quantization/onnx_quantizer.py +1008 -0
  43. onnxruntime/quantization/operators/__init__.py +2 -0
  44. onnxruntime/quantization/operators/activation.py +119 -0
  45. onnxruntime/quantization/operators/argmax.py +18 -0
  46. onnxruntime/quantization/operators/attention.py +73 -0
  47. onnxruntime/quantization/operators/base_operator.py +26 -0
  48. onnxruntime/quantization/operators/binary_op.py +72 -0
  49. onnxruntime/quantization/operators/concat.py +62 -0
  50. onnxruntime/quantization/operators/conv.py +258 -0
  51. onnxruntime/quantization/operators/direct_q8.py +78 -0
  52. onnxruntime/quantization/operators/embed_layernorm.py +121 -0
  53. onnxruntime/quantization/operators/gather.py +64 -0
  54. onnxruntime/quantization/operators/gavgpool.py +62 -0
  55. onnxruntime/quantization/operators/gemm.py +166 -0
  56. onnxruntime/quantization/operators/lstm.py +117 -0
  57. onnxruntime/quantization/operators/matmul.py +231 -0
  58. onnxruntime/quantization/operators/maxpool.py +34 -0
  59. onnxruntime/quantization/operators/norm.py +40 -0
  60. onnxruntime/quantization/operators/pad.py +100 -0
  61. onnxruntime/quantization/operators/pooling.py +67 -0
  62. onnxruntime/quantization/operators/qdq_base_operator.py +22 -0
  63. onnxruntime/quantization/operators/resize.py +34 -0
  64. onnxruntime/quantization/operators/softmax.py +74 -0
  65. onnxruntime/quantization/operators/split.py +63 -0
  66. onnxruntime/quantization/operators/where.py +87 -0
  67. onnxruntime/quantization/preprocess.py +141 -0
  68. onnxruntime/quantization/qdq_loss_debug.py +389 -0
  69. onnxruntime/quantization/qdq_quantizer.py +1187 -0
  70. onnxruntime/quantization/quant_utils.py +891 -0
  71. onnxruntime/quantization/quantize.py +748 -0
  72. onnxruntime/quantization/registry.py +106 -0
  73. onnxruntime/quantization/shape_inference.py +187 -0
  74. onnxruntime/quantization/tensor_quant_overrides.py +516 -0
  75. onnxruntime/tools/__init__.py +10 -0
  76. onnxruntime/tools/check_onnx_model_mobile_usability.py +47 -0
  77. onnxruntime/tools/convert_onnx_models_to_ort.py +377 -0
  78. onnxruntime/tools/file_utils.py +46 -0
  79. onnxruntime/tools/logger.py +11 -0
  80. onnxruntime/tools/make_dynamic_shape_fixed.py +72 -0
  81. onnxruntime/tools/mobile_helpers/__init__.py +0 -0
  82. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +33 -0
  83. onnxruntime/tools/mobile_helpers/coreml_supported_neuralnetwork_ops.md +43 -0
  84. onnxruntime/tools/mobile_helpers/nnapi_supported_ops.md +58 -0
  85. onnxruntime/tools/mobile_helpers/usability_checker.py +739 -0
  86. onnxruntime/tools/offline_tuning.py +169 -0
  87. onnxruntime/tools/onnx_model_utils.py +413 -0
  88. onnxruntime/tools/onnx_randomizer.py +85 -0
  89. onnxruntime/tools/onnxruntime_test.py +164 -0
  90. onnxruntime/tools/optimize_onnx_model.py +55 -0
  91. onnxruntime/tools/ort_format_model/__init__.py +25 -0
  92. onnxruntime/tools/ort_format_model/operator_type_usage_processors.py +663 -0
  93. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/__init__.py +0 -0
  94. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgType.py +7 -0
  95. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgTypeAndIndex.py +67 -0
  96. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Attribute.py +337 -0
  97. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/AttributeType.py +18 -0
  98. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Checkpoint.py +125 -0
  99. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedKernelCreateInfos.py +120 -0
  100. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedNodeIndexAndKernelDefHash.py +68 -0
  101. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSessionState.py +96 -0
  102. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSubGraphSessionState.py +72 -0
  103. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Dimension.py +71 -0
  104. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValue.py +80 -0
  105. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValueType.py +8 -0
  106. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/EdgeEnd.py +32 -0
  107. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/FloatProperty.py +67 -0
  108. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Graph.py +320 -0
  109. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/InferenceSession.py +88 -0
  110. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/IntProperty.py +67 -0
  111. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrArgsEntry.py +91 -0
  112. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrResolver.py +78 -0
  113. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/MapType.py +71 -0
  114. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Model.py +223 -0
  115. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ModuleState.py +141 -0
  116. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Node.py +317 -0
  117. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeEdge.py +126 -0
  118. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeType.py +7 -0
  119. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodesToOptimizeIndices.py +160 -0
  120. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OpIdKernelTypeStrArgsEntry.py +91 -0
  121. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OperatorSetId.py +67 -0
  122. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OptimizerGroup.py +117 -0
  123. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ParameterOptimizerState.py +91 -0
  124. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/PropertyBag.py +152 -0
  125. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecord.py +105 -0
  126. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecordContainerEntry.py +91 -0
  127. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizations.py +79 -0
  128. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SequenceType.py +58 -0
  129. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Shape.py +78 -0
  130. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SparseTensor.py +114 -0
  131. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringProperty.py +67 -0
  132. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringStringEntry.py +67 -0
  133. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Tensor.py +203 -0
  134. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorDataType.py +26 -0
  135. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorTypeAndShape.py +71 -0
  136. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfo.py +83 -0
  137. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfoValue.py +9 -0
  138. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ValueInfo.py +84 -0
  139. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/__init__.py +6 -0
  140. onnxruntime/tools/ort_format_model/ort_model_processor.py +86 -0
  141. onnxruntime/tools/ort_format_model/types.py +84 -0
  142. onnxruntime/tools/ort_format_model/utils.py +62 -0
  143. onnxruntime/tools/pytorch_export_contrib_ops.py +108 -0
  144. onnxruntime/tools/pytorch_export_helpers.py +131 -0
  145. onnxruntime/tools/qdq_helpers/__init__.py +0 -0
  146. onnxruntime/tools/qdq_helpers/optimize_qdq_model.py +37 -0
  147. onnxruntime/tools/reduced_build_config_parser.py +202 -0
  148. onnxruntime/tools/symbolic_shape_infer.py +3016 -0
  149. onnxruntime/tools/update_onnx_opset.py +31 -0
  150. onnxruntime/transformers/__init__.py +8 -0
  151. onnxruntime/transformers/affinity_helper.py +40 -0
  152. onnxruntime/transformers/benchmark.py +944 -0
  153. onnxruntime/transformers/benchmark_helper.py +646 -0
  154. onnxruntime/transformers/bert_perf_test.py +634 -0
  155. onnxruntime/transformers/bert_test_data.py +642 -0
  156. onnxruntime/transformers/compare_bert_results.py +246 -0
  157. onnxruntime/transformers/constants.py +47 -0
  158. onnxruntime/transformers/convert_generation.py +3124 -0
  159. onnxruntime/transformers/convert_tf_models_to_pytorch.py +205 -0
  160. onnxruntime/transformers/convert_to_packing_mode.py +387 -0
  161. onnxruntime/transformers/dynamo_onnx_helper.py +104 -0
  162. onnxruntime/transformers/float16.py +501 -0
  163. onnxruntime/transformers/fusion_attention.py +1235 -0
  164. onnxruntime/transformers/fusion_attention_clip.py +257 -0
  165. onnxruntime/transformers/fusion_attention_sam2.py +534 -0
  166. onnxruntime/transformers/fusion_attention_unet.py +1304 -0
  167. onnxruntime/transformers/fusion_attention_vae.py +301 -0
  168. onnxruntime/transformers/fusion_bart_attention.py +640 -0
  169. onnxruntime/transformers/fusion_base.py +137 -0
  170. onnxruntime/transformers/fusion_bias_add.py +58 -0
  171. onnxruntime/transformers/fusion_biasgelu.py +66 -0
  172. onnxruntime/transformers/fusion_biassplitgelu.py +111 -0
  173. onnxruntime/transformers/fusion_conformer_attention.py +143 -0
  174. onnxruntime/transformers/fusion_embedlayer.py +811 -0
  175. onnxruntime/transformers/fusion_fastgelu.py +360 -0
  176. onnxruntime/transformers/fusion_gelu.py +259 -0
  177. onnxruntime/transformers/fusion_gelu_approximation.py +25 -0
  178. onnxruntime/transformers/fusion_gemmfastgelu.py +122 -0
  179. onnxruntime/transformers/fusion_gpt_attention.py +546 -0
  180. onnxruntime/transformers/fusion_gpt_attention_megatron.py +355 -0
  181. onnxruntime/transformers/fusion_gpt_attention_no_past.py +260 -0
  182. onnxruntime/transformers/fusion_group_norm.py +179 -0
  183. onnxruntime/transformers/fusion_layernorm.py +465 -0
  184. onnxruntime/transformers/fusion_nhwc_conv.py +100 -0
  185. onnxruntime/transformers/fusion_options.py +340 -0
  186. onnxruntime/transformers/fusion_qordered_attention.py +421 -0
  187. onnxruntime/transformers/fusion_qordered_gelu.py +119 -0
  188. onnxruntime/transformers/fusion_qordered_layernorm.py +123 -0
  189. onnxruntime/transformers/fusion_qordered_matmul.py +217 -0
  190. onnxruntime/transformers/fusion_quickgelu.py +74 -0
  191. onnxruntime/transformers/fusion_reshape.py +173 -0
  192. onnxruntime/transformers/fusion_rotary_attention.py +1592 -0
  193. onnxruntime/transformers/fusion_shape.py +110 -0
  194. onnxruntime/transformers/fusion_simplified_layernorm.py +159 -0
  195. onnxruntime/transformers/fusion_skip_group_norm.py +255 -0
  196. onnxruntime/transformers/fusion_skiplayernorm.py +209 -0
  197. onnxruntime/transformers/fusion_transpose.py +168 -0
  198. onnxruntime/transformers/fusion_utils.py +307 -0
  199. onnxruntime/transformers/huggingface_models.py +167 -0
  200. onnxruntime/transformers/import_utils.py +20 -0
  201. onnxruntime/transformers/io_binding_helper.py +442 -0
  202. onnxruntime/transformers/large_model_exporter.py +395 -0
  203. onnxruntime/transformers/machine_info.py +221 -0
  204. onnxruntime/transformers/metrics.py +164 -0
  205. onnxruntime/transformers/models/bart/__init__.py +12 -0
  206. onnxruntime/transformers/models/bart/export.py +98 -0
  207. onnxruntime/transformers/models/bert/__init__.py +12 -0
  208. onnxruntime/transformers/models/bert/eval_squad.py +329 -0
  209. onnxruntime/transformers/models/gpt2/__init__.py +12 -0
  210. onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +413 -0
  211. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +561 -0
  212. onnxruntime/transformers/models/gpt2/gpt2_helper.py +1032 -0
  213. onnxruntime/transformers/models/gpt2/gpt2_parity.py +513 -0
  214. onnxruntime/transformers/models/gpt2/gpt2_tester.py +501 -0
  215. onnxruntime/transformers/models/gpt2/parity_check_helper.py +146 -0
  216. onnxruntime/transformers/models/llama/__init__.py +12 -0
  217. onnxruntime/transformers/models/llama/benchmark.py +703 -0
  218. onnxruntime/transformers/models/llama/benchmark_all.py +488 -0
  219. onnxruntime/transformers/models/llama/benchmark_e2e.py +606 -0
  220. onnxruntime/transformers/models/llama/convert_to_onnx.py +1027 -0
  221. onnxruntime/transformers/models/llama/dist_settings.py +57 -0
  222. onnxruntime/transformers/models/llama/llama_inputs.py +503 -0
  223. onnxruntime/transformers/models/llama/llama_parity.py +309 -0
  224. onnxruntime/transformers/models/llama/llama_torch.py +47 -0
  225. onnxruntime/transformers/models/llama/quant_kv_dataloader.py +108 -0
  226. onnxruntime/transformers/models/longformer/__init__.py +12 -0
  227. onnxruntime/transformers/models/longformer/benchmark_longformer.py +821 -0
  228. onnxruntime/transformers/models/longformer/convert_to_onnx.py +413 -0
  229. onnxruntime/transformers/models/longformer/generate_test_data.py +347 -0
  230. onnxruntime/transformers/models/longformer/longformer_helper.py +77 -0
  231. onnxruntime/transformers/models/phi2/__init__.py +12 -0
  232. onnxruntime/transformers/models/phi2/convert_to_onnx.py +576 -0
  233. onnxruntime/transformers/models/phi2/inference_example.py +414 -0
  234. onnxruntime/transformers/models/sam2/__init__.py +12 -0
  235. onnxruntime/transformers/models/sam2/benchmark_sam2.py +625 -0
  236. onnxruntime/transformers/models/sam2/convert_to_onnx.py +260 -0
  237. onnxruntime/transformers/models/sam2/image_decoder.py +273 -0
  238. onnxruntime/transformers/models/sam2/image_encoder.py +186 -0
  239. onnxruntime/transformers/models/sam2/mask_decoder.py +208 -0
  240. onnxruntime/transformers/models/sam2/nvtx_helper.py +33 -0
  241. onnxruntime/transformers/models/sam2/prompt_encoder.py +189 -0
  242. onnxruntime/transformers/models/sam2/sam2_demo.py +322 -0
  243. onnxruntime/transformers/models/sam2/sam2_image_onnx_predictor.py +280 -0
  244. onnxruntime/transformers/models/sam2/sam2_utils.py +147 -0
  245. onnxruntime/transformers/models/stable_diffusion/__init__.py +12 -0
  246. onnxruntime/transformers/models/stable_diffusion/benchmark.py +1429 -0
  247. onnxruntime/transformers/models/stable_diffusion/benchmark_controlnet.py +426 -0
  248. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +102 -0
  249. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +268 -0
  250. onnxruntime/transformers/models/stable_diffusion/demo_utils.py +778 -0
  251. onnxruntime/transformers/models/stable_diffusion/diffusion_models.py +1319 -0
  252. onnxruntime/transformers/models/stable_diffusion/diffusion_schedulers.py +1181 -0
  253. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +296 -0
  254. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +388 -0
  255. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_trt.py +288 -0
  256. onnxruntime/transformers/models/stable_diffusion/engine_builder_tensorrt.py +395 -0
  257. onnxruntime/transformers/models/stable_diffusion/engine_builder_torch.py +108 -0
  258. onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +350 -0
  259. onnxruntime/transformers/models/stable_diffusion/ort_optimizer.py +136 -0
  260. onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +831 -0
  261. onnxruntime/transformers/models/stable_diffusion/trt_utilities.py +12 -0
  262. onnxruntime/transformers/models/t5/__init__.py +12 -0
  263. onnxruntime/transformers/models/t5/convert_to_onnx.py +278 -0
  264. onnxruntime/transformers/models/t5/past_helper.py +150 -0
  265. onnxruntime/transformers/models/t5/t5_decoder.py +438 -0
  266. onnxruntime/transformers/models/t5/t5_encoder.py +171 -0
  267. onnxruntime/transformers/models/t5/t5_encoder_decoder_init.py +299 -0
  268. onnxruntime/transformers/models/t5/t5_helper.py +272 -0
  269. onnxruntime/transformers/models/whisper/__init__.py +12 -0
  270. onnxruntime/transformers/models/whisper/benchmark.py +610 -0
  271. onnxruntime/transformers/models/whisper/benchmark_all.py +528 -0
  272. onnxruntime/transformers/models/whisper/convert_to_onnx.py +536 -0
  273. onnxruntime/transformers/models/whisper/whisper_chain.py +329 -0
  274. onnxruntime/transformers/models/whisper/whisper_decoder.py +402 -0
  275. onnxruntime/transformers/models/whisper/whisper_encoder.py +164 -0
  276. onnxruntime/transformers/models/whisper/whisper_encoder_decoder_init.py +306 -0
  277. onnxruntime/transformers/models/whisper/whisper_helper.py +524 -0
  278. onnxruntime/transformers/models/whisper/whisper_openai_helper.py +84 -0
  279. onnxruntime/transformers/onnx_exporter.py +717 -0
  280. onnxruntime/transformers/onnx_model.py +1569 -0
  281. onnxruntime/transformers/onnx_model_bart.py +142 -0
  282. onnxruntime/transformers/onnx_model_bert.py +481 -0
  283. onnxruntime/transformers/onnx_model_bert_keras.py +475 -0
  284. onnxruntime/transformers/onnx_model_bert_tf.py +589 -0
  285. onnxruntime/transformers/onnx_model_clip.py +40 -0
  286. onnxruntime/transformers/onnx_model_conformer.py +33 -0
  287. onnxruntime/transformers/onnx_model_gpt2.py +101 -0
  288. onnxruntime/transformers/onnx_model_phi.py +930 -0
  289. onnxruntime/transformers/onnx_model_sam2.py +138 -0
  290. onnxruntime/transformers/onnx_model_t5.py +791 -0
  291. onnxruntime/transformers/onnx_model_tnlr.py +227 -0
  292. onnxruntime/transformers/onnx_model_unet.py +259 -0
  293. onnxruntime/transformers/onnx_model_vae.py +43 -0
  294. onnxruntime/transformers/onnx_utils.py +55 -0
  295. onnxruntime/transformers/optimizer.py +612 -0
  296. onnxruntime/transformers/profiler.py +725 -0
  297. onnxruntime/transformers/quantize_helper.py +76 -0
  298. onnxruntime/transformers/shape_infer_helper.py +122 -0
  299. onnxruntime/transformers/shape_optimizer.py +401 -0
  300. onnxruntime/transformers/torch_onnx_export_helper.py +74 -0
  301. onnxruntime_directml-1.20.0.dist-info/METADATA +187 -0
  302. onnxruntime_directml-1.20.0.dist-info/RECORD +305 -0
  303. onnxruntime_directml-1.20.0.dist-info/WHEEL +5 -0
  304. onnxruntime_directml-1.20.0.dist-info/entry_points.txt +2 -0
  305. onnxruntime_directml-1.20.0.dist-info/top_level.txt +1 -0
onnxruntime/transformers/models/stable_diffusion/benchmark.py
@@ -0,0 +1,1429 @@
1
+ # -------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License.
4
+ # --------------------------------------------------------------------------
5
+
6
+ import argparse
7
+ import csv
8
+ import os
9
+ import statistics
10
+ import sys
11
+ import time
12
+
13
+ import __init__ # noqa: F401. Workaround to allow running this script directly
14
+ import coloredlogs
15
+
16
+ # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package.
17
+ import torch
18
+ from benchmark_helper import measure_memory
19
+
20
+ SD_MODELS = {
21
+ "1.5": "runwayml/stable-diffusion-v1-5",
22
+ "2.0": "stabilityai/stable-diffusion-2",
23
+ "2.1": "stabilityai/stable-diffusion-2-1",
24
+ "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
25
+ }
26
+
27
+ PROVIDERS = {
28
+ "cuda": "CUDAExecutionProvider",
29
+ "rocm": "ROCMExecutionProvider",
30
+ "migraphx": "MIGraphXExecutionProvider",
31
+ "tensorrt": "TensorrtExecutionProvider",
32
+ }
33
+
34
+
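These two tables map the short names accepted on the command line to a Hugging Face model id and an ONNX Runtime execution provider name. As a minimal sketch of the lookup (the variable names below are illustrative; the real wiring happens in main() further down):

    model_name = SD_MODELS["1.5"]   # "runwayml/stable-diffusion-v1-5"
    provider = PROVIDERS["cuda"]    # "CUDAExecutionProvider"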
35
+ def example_prompts():
36
+ prompts = [
37
+ "a photo of an astronaut riding a horse on mars",
38
+ "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
39
+ "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting",
40
+ "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery",
41
+ "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product",
42
+ "background texture of stones, masterpiece, artistic, stunning photo, award winner photo",
43
+ "new international organic style house, tropical surroundings, architecture, 8k, hdr",
44
+ "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
45
+ "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
46
+ "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k",
47
+ ]
48
+
49
+ negative_prompt = "bad composition, ugly, abnormal, malformed"
50
+
51
+ return prompts, negative_prompt
52
+
53
+
54
+ def measure_gpu_memory(monitor_type, func, start_memory=None):
55
+ return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
56
+
57
+
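measure_memory itself is defined in benchmark_helper.py, a separate file in this package that is not part of this hunk. Assuming it runs the given callable and reports peak GPU memory in MB, a minimal torch-only stand-in (illustrative only, not the actual helper) could look like this:

    # Illustrative stand-in for the memory probe; the real measure_memory may poll the GPU
    # (e.g. via NVML/nvidia-smi) and therefore also capture memory allocated outside PyTorch.
    import torch

    def measure_gpu_memory_torch(func):
        torch.cuda.reset_peak_memory_stats()
        func()
        torch.cuda.synchronize()
        # Peak memory allocated by this process through PyTorch, in MB.
        return torch.cuda.max_memory_allocated() / (1024 * 1024)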
58
+ def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_checker: bool):
59
+ from diffusers import DDIMScheduler, OnnxStableDiffusionPipeline
60
+
61
+ import onnxruntime
62
+
63
+ if directory is not None:
64
+ assert os.path.exists(directory)
65
+ session_options = onnxruntime.SessionOptions()
66
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
67
+ directory,
68
+ provider=provider,
69
+ sess_options=session_options,
70
+ )
71
+ else:
72
+ pipe = OnnxStableDiffusionPipeline.from_pretrained(
73
+ model_name,
74
+ revision="onnx",
75
+ provider=provider,
76
+ use_auth_token=True,
77
+ )
78
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
79
+ pipe.set_progress_bar_config(disable=True)
80
+
81
+ if disable_safety_checker:
82
+ pipe.safety_checker = None
83
+ pipe.feature_extractor = None
84
+
85
+ return pipe
86
+
87
+
88
+ def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool):
89
+ from diffusers import DDIMScheduler, StableDiffusionPipeline
90
+ from torch import channels_last, float16
91
+
92
+ pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=float16).to("cuda")
93
+
94
+ pipe.unet.to(memory_format=channels_last) # in-place operation
95
+
96
+ if use_xformers:
97
+ pipe.enable_xformers_memory_efficient_attention()
98
+
99
+ if enable_torch_compile:
100
+ pipe.unet = torch.compile(pipe.unet)
101
+ pipe.vae = torch.compile(pipe.vae)
102
+ pipe.text_encoder = torch.compile(pipe.text_encoder)
103
+ print("Torch compiled unet, vae and text_encoder")
104
+
105
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
106
+ pipe.set_progress_bar_config(disable=True)
107
+
108
+ if disable_safety_checker:
109
+ pipe.safety_checker = None
110
+ pipe.feature_extractor = None
111
+
112
+ return pipe
113
+
114
+
115
+ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, disable_safety_checker: bool):
116
+ short_model_name = model_name.split("/")[-1].replace("stable-diffusion-", "sd")
117
+ return f"{engine}_{short_model_name}_b{batch_size}" + ("" if disable_safety_checker else "_safe")
118
+
119
+
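A quick worked example of the naming scheme, using a model id from the SD_MODELS table above:

    # "stable-diffusion-v1-5" -> "sdv1-5"; batch size 1; safety checker disabled -> no "_safe" suffix
    get_image_filename_prefix("ort", "runwayml/stable-diffusion-v1-5", 1, True)   # "ort_sdv1-5_b1"
    get_image_filename_prefix("ort", "runwayml/stable-diffusion-v1-5", 1, False)  # "ort_sdv1-5_b1_safe"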
120
+ def run_ort_pipeline(
121
+ pipe,
122
+ batch_size: int,
123
+ image_filename_prefix: str,
124
+ height,
125
+ width,
126
+ steps,
127
+ num_prompts,
128
+ batch_count,
129
+ start_memory,
130
+ memory_monitor_type,
131
+ ):
132
+ from diffusers import OnnxStableDiffusionPipeline
133
+
134
+ assert isinstance(pipe, OnnxStableDiffusionPipeline)
135
+
136
+ prompts, negative_prompt = example_prompts()
137
+
138
+ def warmup():
139
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
140
+
141
+ # Run warm up, and measure GPU memory of two runs
142
+ # The first run has cuDNN/MIOpen algo search, so it might need more memory.
143
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
144
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
145
+
146
+ warmup()
147
+
148
+ latency_list = []
149
+ for i, prompt in enumerate(prompts):
150
+ if i >= num_prompts:
151
+ break
152
+ for j in range(batch_count):
153
+ inference_start = time.time()
154
+ images = pipe(
155
+ [prompt] * batch_size,
156
+ height,
157
+ width,
158
+ num_inference_steps=steps,
159
+ negative_prompt=[negative_prompt] * batch_size,
160
+ guidance_scale=7.5,
161
+ ).images
162
+ inference_end = time.time()
163
+ latency = inference_end - inference_start
164
+ latency_list.append(latency)
165
+ print(f"Inference took {latency:.3f} seconds")
166
+ for k, image in enumerate(images):
167
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
168
+
169
+ from onnxruntime import __version__ as ort_version
170
+
171
+ return {
172
+ "engine": "onnxruntime",
173
+ "version": ort_version,
174
+ "height": height,
175
+ "width": width,
176
+ "steps": steps,
177
+ "batch_size": batch_size,
178
+ "batch_count": batch_count,
179
+ "num_prompts": num_prompts,
180
+ "average_latency": sum(latency_list) / len(latency_list),
181
+ "median_latency": statistics.median(latency_list),
182
+ "first_run_memory_MB": first_run_memory,
183
+ "second_run_memory_MB": second_run_memory,
184
+ }
185
+
186
+
187
+ def run_torch_pipeline(
188
+ pipe,
189
+ batch_size: int,
190
+ image_filename_prefix: str,
191
+ height,
192
+ width,
193
+ steps,
194
+ num_prompts,
195
+ batch_count,
196
+ start_memory,
197
+ memory_monitor_type,
198
+ ):
199
+ prompts, negative_prompt = example_prompts()
200
+
201
+ # total 2 runs of warm up, and measure GPU memory for CUDA EP
202
+ def warmup():
203
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
204
+
205
+ # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
206
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
207
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
208
+
209
+ warmup()
210
+
211
+ torch.set_grad_enabled(False)
212
+
213
+ latency_list = []
214
+ for i, prompt in enumerate(prompts):
215
+ if i >= num_prompts:
216
+ break
217
+ torch.cuda.synchronize()
218
+ for j in range(batch_count):
219
+ inference_start = time.time()
220
+ images = pipe(
221
+ prompt=[prompt] * batch_size,
222
+ height=height,
223
+ width=width,
224
+ num_inference_steps=steps,
225
+ guidance_scale=7.5,
226
+ negative_prompt=[negative_prompt] * batch_size,
227
+ generator=None, # torch.Generator
228
+ ).images
229
+
230
+ torch.cuda.synchronize()
231
+ inference_end = time.time()
232
+ latency = inference_end - inference_start
233
+ latency_list.append(latency)
234
+ print(f"Inference took {latency:.3f} seconds")
235
+ for k, image in enumerate(images):
236
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
237
+
238
+ return {
239
+ "engine": "torch",
240
+ "version": torch.__version__,
241
+ "height": height,
242
+ "width": width,
243
+ "steps": steps,
244
+ "batch_size": batch_size,
245
+ "batch_count": batch_count,
246
+ "num_prompts": num_prompts,
247
+ "average_latency": sum(latency_list) / len(latency_list),
248
+ "median_latency": statistics.median(latency_list),
249
+ "first_run_memory_MB": first_run_memory,
250
+ "second_run_memory_MB": second_run_memory,
251
+ }
252
+
253
+
254
+ def run_ort(
255
+ model_name: str,
256
+ directory: str,
257
+ provider: str,
258
+ batch_size: int,
259
+ disable_safety_checker: bool,
260
+ height: int,
261
+ width: int,
262
+ steps: int,
263
+ num_prompts: int,
264
+ batch_count: int,
265
+ start_memory,
266
+ memory_monitor_type,
267
+ tuning: bool,
268
+ ):
269
+ provider_and_options = provider
270
+ if tuning and provider in ["CUDAExecutionProvider", "ROCMExecutionProvider"]:
271
+ provider_and_options = (provider, {"tunable_op_enable": 1, "tunable_op_tuning_enable": 1})
272
+
273
+ load_start = time.time()
274
+ pipe = get_ort_pipeline(model_name, directory, provider_and_options, disable_safety_checker)
275
+ load_end = time.time()
276
+ print(f"Model loading took {load_end - load_start} seconds")
277
+
278
+ image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker)
279
+ result = run_ort_pipeline(
280
+ pipe,
281
+ batch_size,
282
+ image_filename_prefix,
283
+ height,
284
+ width,
285
+ steps,
286
+ num_prompts,
287
+ batch_count,
288
+ start_memory,
289
+ memory_monitor_type,
290
+ )
291
+
292
+ result.update(
293
+ {
294
+ "model_name": model_name,
295
+ "directory": directory,
296
+ "provider": provider.replace("ExecutionProvider", ""),
297
+ "disable_safety_checker": disable_safety_checker,
298
+ "enable_cuda_graph": False,
299
+ }
300
+ )
301
+ return result
302
+
303
+
304
+ def get_optimum_ort_pipeline(
305
+ model_name: str,
306
+ directory: str,
307
+ provider="CUDAExecutionProvider",
308
+ disable_safety_checker: bool = True,
309
+ ):
310
+ from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
311
+
312
+ if directory is not None and os.path.exists(directory):
313
+ if "xl" in model_name:
314
+ pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
315
+ directory,
316
+ provider=provider,
317
+ session_options=None,
318
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
319
+ )
320
+ else:
321
+ pipeline = ORTStableDiffusionPipeline.from_pretrained(
322
+ directory,
323
+ provider=provider,
324
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
325
+ )
326
+ elif "xl" in model_name:
327
+ pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
328
+ model_name,
329
+ export=True,
330
+ provider=provider,
331
+ session_options=None,
332
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
333
+ )
334
+ pipeline.save_pretrained(directory)
335
+ else:
336
+ pipeline = ORTStableDiffusionPipeline.from_pretrained(
337
+ model_name,
338
+ export=True,
339
+ provider=provider,
340
+ use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
341
+ )
342
+ pipeline.save_pretrained(directory)
343
+
344
+ if disable_safety_checker:
345
+ pipeline.safety_checker = None
346
+ pipeline.feature_extractor = None
347
+
348
+ return pipeline
349
+
350
+
351
+ def run_optimum_ort_pipeline(
352
+ pipe,
353
+ batch_size: int,
354
+ image_filename_prefix: str,
355
+ height,
356
+ width,
357
+ steps,
358
+ num_prompts,
359
+ batch_count,
360
+ start_memory,
361
+ memory_monitor_type,
362
+ ):
363
+ from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
364
+
365
+ assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
366
+
367
+ prompts, _ = example_prompts() # example_prompts() also returns a negative prompt, which is unused in this path
368
+
369
+ def warmup():
370
+ pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
371
+
372
+ # Run warm up, and measure GPU memory of two runs.
373
+ # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
374
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
375
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
376
+
377
+ warmup()
378
+
379
+ latency_list = []
380
+ for i, prompt in enumerate(prompts):
381
+ if i >= num_prompts:
382
+ break
383
+ for j in range(batch_count):
384
+ inference_start = time.time()
385
+ images = pipe(
386
+ prompt,
387
+ height,
388
+ width,
389
+ num_inference_steps=steps,
390
+ negative_prompt=None,
391
+ guidance_scale=0.0, # 7.5
392
+ num_images_per_prompt=batch_size,
393
+ ).images
394
+ inference_end = time.time()
395
+ latency = inference_end - inference_start
396
+ latency_list.append(latency)
397
+ print(f"Inference took {latency:.3f} seconds")
398
+ for k, image in enumerate(images):
399
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
400
+
401
+ from onnxruntime import __version__ as ort_version
402
+
403
+ return {
404
+ "engine": "optimum_ort",
405
+ "version": ort_version,
406
+ "height": height,
407
+ "width": width,
408
+ "steps": steps,
409
+ "batch_size": batch_size,
410
+ "batch_count": batch_count,
411
+ "num_prompts": num_prompts,
412
+ "average_latency": sum(latency_list) / len(latency_list),
413
+ "median_latency": statistics.median(latency_list),
414
+ "first_run_memory_MB": first_run_memory,
415
+ "second_run_memory_MB": second_run_memory,
416
+ }
417
+
418
+
419
+ def run_optimum_ort(
420
+ model_name: str,
421
+ directory: str,
422
+ provider: str,
423
+ batch_size: int,
424
+ disable_safety_checker: bool,
425
+ height: int,
426
+ width: int,
427
+ steps: int,
428
+ num_prompts: int,
429
+ batch_count: int,
430
+ start_memory,
431
+ memory_monitor_type,
432
+ ):
433
+ load_start = time.time()
434
+ pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
435
+ load_end = time.time()
436
+ print(f"Model loading took {load_end - load_start} seconds")
437
+
438
+ image_filename_prefix = get_image_filename_prefix("optimum", model_name, batch_size, disable_safety_checker)
439
+ result = run_optimum_ort_pipeline(
440
+ pipe,
441
+ batch_size,
442
+ image_filename_prefix,
443
+ height,
444
+ width,
445
+ steps,
446
+ num_prompts,
447
+ batch_count,
448
+ start_memory,
449
+ memory_monitor_type,
450
+ )
451
+
452
+ result.update(
453
+ {
454
+ "model_name": model_name,
455
+ "directory": directory,
456
+ "provider": provider.replace("ExecutionProvider", ""),
457
+ "disable_safety_checker": disable_safety_checker,
458
+ "enable_cuda_graph": False,
459
+ }
460
+ )
461
+ return result
462
+
463
+
464
+ def run_ort_trt_static(
465
+ work_dir: str,
466
+ version: str,
467
+ batch_size: int,
468
+ disable_safety_checker: bool,
469
+ height: int,
470
+ width: int,
471
+ steps: int,
472
+ num_prompts: int,
473
+ batch_count: int,
474
+ start_memory,
475
+ memory_monitor_type,
476
+ max_batch_size: int,
477
+ nvtx_profile: bool = False,
478
+ use_cuda_graph: bool = True,
479
+ ):
480
+ print("[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)")
481
+
482
+ # Register TensorRT plugins
483
+ from trt_utilities import init_trt_plugins
484
+
485
+ init_trt_plugins()
486
+
487
+ assert batch_size <= max_batch_size
488
+
489
+ from diffusion_models import PipelineInfo
490
+
491
+ pipeline_info = PipelineInfo(version)
492
+ short_name = pipeline_info.short_name()
493
+
494
+ from engine_builder import EngineType, get_engine_paths
495
+ from pipeline_stable_diffusion import StableDiffusionPipeline
496
+
497
+ engine_type = EngineType.ORT_TRT
498
+ onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths(work_dir, pipeline_info, engine_type)
499
+
500
+ # Initialize pipeline
501
+ pipeline = StableDiffusionPipeline(
502
+ pipeline_info,
503
+ scheduler="DDIM",
504
+ output_dir=output_dir,
505
+ verbose=False,
506
+ nvtx_profile=nvtx_profile,
507
+ max_batch_size=max_batch_size,
508
+ use_cuda_graph=use_cuda_graph,
509
+ framework_model_dir=framework_model_dir,
510
+ engine_type=engine_type,
511
+ )
512
+
513
+ # Load TensorRT engines and pytorch modules
514
+ pipeline.backend.build_engines(
515
+ engine_dir,
516
+ framework_model_dir,
517
+ onnx_dir,
518
+ 17,
519
+ opt_image_height=height,
520
+ opt_image_width=width,
521
+ opt_batch_size=batch_size,
522
+ static_batch=True,
523
+ static_image_shape=True,
524
+ max_workspace_size=0,
525
+ device_id=torch.cuda.current_device(),
526
+ )
527
+
528
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
529
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
530
+ pipeline.load_resources(height, width, batch_size)
531
+
532
+ def warmup():
533
+ pipeline.run(
534
+ ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
535
+ )
536
+
537
+ # Run warm up, and measure GPU memory of two runs
538
+ # The first run has algo search so it might need more memory
539
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
540
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
541
+
542
+ warmup()
543
+
544
+ image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, disable_safety_checker)
545
+
546
+ latency_list = []
547
+ prompts, negative_prompt = example_prompts()
548
+ for i, prompt in enumerate(prompts):
549
+ if i >= num_prompts:
550
+ break
551
+ for j in range(batch_count):
552
+ inference_start = time.time()
553
+ # Use warmup mode here since non-warmup mode will save image to disk.
554
+ images, pipeline_time = pipeline.run(
555
+ [prompt] * batch_size,
556
+ [negative_prompt] * batch_size,
557
+ height,
558
+ width,
559
+ denoising_steps=steps,
560
+ guidance=7.5,
561
+ seed=123,
562
+ )
563
+ inference_end = time.time()
564
+ latency = inference_end - inference_start
565
+ latency_list.append(latency)
566
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
567
+ for k, image in enumerate(images):
568
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
569
+
570
+ pipeline.teardown()
571
+
572
+ from tensorrt import __version__ as trt_version
573
+
574
+ from onnxruntime import __version__ as ort_version
575
+
576
+ return {
577
+ "model_name": pipeline_info.name(),
578
+ "engine": "onnxruntime",
579
+ "version": ort_version,
580
+ "provider": f"tensorrt({trt_version})",
581
+ "directory": engine_dir,
582
+ "height": height,
583
+ "width": width,
584
+ "steps": steps,
585
+ "batch_size": batch_size,
586
+ "batch_count": batch_count,
587
+ "num_prompts": num_prompts,
588
+ "average_latency": sum(latency_list) / len(latency_list),
589
+ "median_latency": statistics.median(latency_list),
590
+ "first_run_memory_MB": first_run_memory,
591
+ "second_run_memory_MB": second_run_memory,
592
+ "disable_safety_checker": disable_safety_checker,
593
+ "enable_cuda_graph": use_cuda_graph,
594
+ }
595
+
596
+
597
+ def run_tensorrt_static(
598
+ work_dir: str,
599
+ version: str,
600
+ model_name: str,
601
+ batch_size: int,
602
+ disable_safety_checker: bool,
603
+ height: int,
604
+ width: int,
605
+ steps: int,
606
+ num_prompts: int,
607
+ batch_count: int,
608
+ start_memory,
609
+ memory_monitor_type,
610
+ max_batch_size: int,
611
+ nvtx_profile: bool = False,
612
+ use_cuda_graph: bool = True,
613
+ ):
614
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
615
+
616
+ from cuda import cudart
617
+
618
+ # Register TensorRT plugins
619
+ from trt_utilities import init_trt_plugins
620
+
621
+ init_trt_plugins()
622
+
623
+ assert batch_size <= max_batch_size
624
+
625
+ from diffusion_models import PipelineInfo
626
+
627
+ pipeline_info = PipelineInfo(version)
628
+
629
+ from engine_builder import EngineType, get_engine_paths
630
+ from pipeline_stable_diffusion import StableDiffusionPipeline
631
+
632
+ engine_type = EngineType.TRT
633
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
634
+ work_dir, pipeline_info, engine_type
635
+ )
636
+
637
+ # Initialize pipeline
638
+ pipeline = StableDiffusionPipeline(
639
+ pipeline_info,
640
+ scheduler="DDIM",
641
+ output_dir=output_dir,
642
+ verbose=False,
643
+ nvtx_profile=nvtx_profile,
644
+ max_batch_size=max_batch_size,
645
+ use_cuda_graph=True,
646
+ engine_type=engine_type,
647
+ )
648
+
649
+ # Load TensorRT engines and pytorch modules
650
+ pipeline.backend.load_engines(
651
+ engine_dir=engine_dir,
652
+ framework_model_dir=framework_model_dir,
653
+ onnx_dir=onnx_dir,
654
+ onnx_opset=17,
655
+ opt_batch_size=batch_size,
656
+ opt_image_height=height,
657
+ opt_image_width=width,
658
+ static_batch=True,
659
+ static_shape=True,
660
+ enable_all_tactics=False,
661
+ timing_cache=timing_cache,
662
+ )
663
+
664
+ # activate engines
665
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
666
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
667
+ pipeline.backend.activate_engines(shared_device_memory)
668
+
669
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
670
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
671
+ pipeline.load_resources(height, width, batch_size)
672
+
673
+ def warmup():
674
+ pipeline.run(
675
+ ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
676
+ )
677
+
678
+ # Run warm up, and measure GPU memory of two runs
679
+ # The first run has algo search so it might need more memory
680
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
681
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
682
+
683
+ warmup()
684
+
685
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)
686
+
687
+ latency_list = []
688
+ prompts, negative_prompt = example_prompts()
689
+ for i, prompt in enumerate(prompts):
690
+ if i >= num_prompts:
691
+ break
692
+ for j in range(batch_count):
693
+ inference_start = time.time()
694
+ # Use warmup mode here since non-warmup mode will save image to disk.
695
+ images, pipeline_time = pipeline.run(
696
+ [prompt] * batch_size,
697
+ [negative_prompt] * batch_size,
698
+ height,
699
+ width,
700
+ denoising_steps=steps,
701
+ guidance=7.5,
702
+ seed=123,
703
+ )
704
+ inference_end = time.time()
705
+ latency = inference_end - inference_start
706
+ latency_list.append(latency)
707
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
708
+ for k, image in enumerate(images):
709
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
710
+
711
+ pipeline.teardown()
712
+
713
+ import tensorrt as trt
714
+
715
+ return {
716
+ "engine": "tensorrt",
717
+ "version": trt.__version__,
718
+ "provider": "default",
719
+ "height": height,
720
+ "width": width,
721
+ "steps": steps,
722
+ "batch_size": batch_size,
723
+ "batch_count": batch_count,
724
+ "num_prompts": num_prompts,
725
+ "average_latency": sum(latency_list) / len(latency_list),
726
+ "median_latency": statistics.median(latency_list),
727
+ "first_run_memory_MB": first_run_memory,
728
+ "second_run_memory_MB": second_run_memory,
729
+ "enable_cuda_graph": use_cuda_graph,
730
+ }
731
+
732
+
733
+ def run_tensorrt_static_xl(
734
+ work_dir: str,
735
+ version: str,
736
+ batch_size: int,
737
+ disable_safety_checker: bool,
738
+ height: int,
739
+ width: int,
740
+ steps: int,
741
+ num_prompts: int,
742
+ batch_count: int,
743
+ start_memory,
744
+ memory_monitor_type,
745
+ max_batch_size: int,
746
+ nvtx_profile: bool = False,
747
+ use_cuda_graph=True,
748
+ ):
749
+ print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")
750
+
751
+ import tensorrt as trt
752
+ from cuda import cudart
753
+ from trt_utilities import init_trt_plugins
754
+
755
+ # Validate image dimensions
756
+ image_height = height
757
+ image_width = width
758
+ if image_height % 8 != 0 or image_width % 8 != 0:
759
+ raise ValueError(
760
+ f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}."
761
+ )
762
+
763
+ # Register TensorRT plugins
764
+ init_trt_plugins()
765
+
766
+ assert batch_size <= max_batch_size
767
+
768
+ from diffusion_models import PipelineInfo
769
+ from engine_builder import EngineType, get_engine_paths
770
+
771
+ def init_pipeline(pipeline_class, pipeline_info):
772
+ engine_type = EngineType.TRT
773
+
774
+ onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
775
+ work_dir, pipeline_info, engine_type
776
+ )
777
+
778
+ # Initialize pipeline
779
+ pipeline = pipeline_class(
780
+ pipeline_info,
781
+ scheduler="DDIM",
782
+ output_dir=output_dir,
783
+ verbose=False,
784
+ nvtx_profile=nvtx_profile,
785
+ max_batch_size=max_batch_size,
786
+ use_cuda_graph=use_cuda_graph,
787
+ framework_model_dir=framework_model_dir,
788
+ engine_type=engine_type,
789
+ )
790
+
791
+ pipeline.backend.load_engines(
792
+ engine_dir=engine_dir,
793
+ framework_model_dir=framework_model_dir,
794
+ onnx_dir=onnx_dir,
795
+ onnx_opset=17,
796
+ opt_batch_size=batch_size,
797
+ opt_image_height=height,
798
+ opt_image_width=width,
799
+ static_batch=True,
800
+ static_shape=True,
801
+ enable_all_tactics=False,
802
+ timing_cache=timing_cache,
803
+ )
804
+ return pipeline
805
+
806
+ from pipeline_stable_diffusion import StableDiffusionPipeline
807
+
808
+ pipeline_info = PipelineInfo(version)
809
+ pipeline = init_pipeline(StableDiffusionPipeline, pipeline_info)
810
+
811
+ max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
812
+ _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
813
+ pipeline.backend.activate_engines(shared_device_memory)
814
+
815
+ # Here we use static batch and image size, so the resource allocation only needs to be done once.
816
+ # For dynamic batch and image size, some cost (like memory allocation) should be included in the latency.
817
+ pipeline.load_resources(image_height, image_width, batch_size)
818
+
819
+ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
820
+ return pipeline.run(
821
+ prompt,
822
+ negative_prompt,
823
+ image_height,
824
+ image_width,
825
+ denoising_steps=steps,
826
+ guidance=5.0,
827
+ seed=seed,
828
+ )
829
+
830
+ def warmup():
831
+ run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
832
+
833
+ # Run warm up, and measure GPU memory of two runs
834
+ # The first run has algo search so it might need more memory
835
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
836
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
837
+
838
+ warmup()
839
+
840
+ model_name = pipeline_info.name()
841
+ image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)
842
+
843
+ latency_list = []
844
+ prompts, negative_prompt = example_prompts()
845
+ for i, prompt in enumerate(prompts):
846
+ if i >= num_prompts:
847
+ break
848
+ for j in range(batch_count):
849
+ inference_start = time.time()
850
+ # Use warmup mode here since non-warmup mode will save image to disk.
851
+ if nvtx_profile:
852
+ cudart.cudaProfilerStart()
853
+ images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
854
+ if nvtx_profile:
855
+ cudart.cudaProfilerStop()
856
+ inference_end = time.time()
857
+ latency = inference_end - inference_start
858
+ latency_list.append(latency)
859
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
860
+ for k, image in enumerate(images):
861
+ image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
862
+
863
+ pipeline.teardown()
864
+
865
+ return {
866
+ "model_name": model_name,
867
+ "engine": "tensorrt",
868
+ "version": trt.__version__,
869
+ "provider": "default",
870
+ "height": height,
871
+ "width": width,
872
+ "steps": steps,
873
+ "batch_size": batch_size,
874
+ "batch_count": batch_count,
875
+ "num_prompts": num_prompts,
876
+ "average_latency": sum(latency_list) / len(latency_list),
877
+ "median_latency": statistics.median(latency_list),
878
+ "first_run_memory_MB": first_run_memory,
879
+ "second_run_memory_MB": second_run_memory,
880
+ "enable_cuda_graph": use_cuda_graph,
881
+ }
882
+
883
+
884
+ def run_ort_trt_xl(
885
+ work_dir: str,
886
+ version: str,
887
+ batch_size: int,
888
+ disable_safety_checker: bool,
889
+ height: int,
890
+ width: int,
891
+ steps: int,
892
+ num_prompts: int,
893
+ batch_count: int,
894
+ start_memory,
895
+ memory_monitor_type,
896
+ max_batch_size: int,
897
+ nvtx_profile: bool = False,
898
+ use_cuda_graph=True,
899
+ ):
900
+ from demo_utils import initialize_pipeline
901
+ from engine_builder import EngineType
902
+
903
+ pipeline = initialize_pipeline(
904
+ version=version,
905
+ engine_type=EngineType.ORT_TRT,
906
+ work_dir=work_dir,
907
+ height=height,
908
+ width=width,
909
+ use_cuda_graph=use_cuda_graph,
910
+ max_batch_size=max_batch_size,
911
+ opt_batch_size=batch_size,
912
+ )
913
+
914
+ from cuda import cudart
915
+
916
+ assert batch_size <= max_batch_size
917
+
918
+ pipeline.load_resources(height, width, batch_size)
919
+
920
+ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
921
+ return pipeline.run(
922
+ prompt,
923
+ negative_prompt,
924
+ height,
925
+ width,
926
+ denoising_steps=steps,
927
+ guidance=5.0,
928
+ seed=seed,
929
+ )
930
+
931
+ def warmup():
932
+ run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
933
+
934
+ # Run warm up, and measure GPU memory of two runs
935
+ # The first run has algo search so it might need more memory
936
+ first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
937
+ second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
938
+
939
+ warmup()
940
+
941
+ model_name = pipeline.pipeline_info.name()
942
+ image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker)
943
+
944
+ latency_list = []
945
+ prompts, negative_prompt = example_prompts()
946
+ for i, prompt in enumerate(prompts):
947
+ if i >= num_prompts:
948
+ break
949
+ for j in range(batch_count):
950
+ inference_start = time.time()
951
+ # Use warmup mode here since non-warmup mode will save image to disk.
952
+ if nvtx_profile:
953
+ cudart.cudaProfilerStart()
954
+ images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
955
+ if nvtx_profile:
956
+ cudart.cudaProfilerStop()
957
+ inference_end = time.time()
958
+ latency = inference_end - inference_start
959
+ latency_list.append(latency)
960
+ print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
961
+ for k, image in enumerate(images):
962
+ filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
963
+ image.save(filename)
964
+ print("Image saved to", filename)
965
+
966
+ pipeline.teardown()
967
+
968
+ from tensorrt import __version__ as trt_version
969
+
970
+ from onnxruntime import __version__ as ort_version
971
+
972
+ return {
973
+ "model_name": model_name,
974
+ "engine": "onnxruntime",
975
+ "version": ort_version,
976
+ "provider": f"tensorrt({trt_version})",
977
+ "height": height,
978
+ "width": width,
979
+ "steps": steps,
980
+ "batch_size": batch_size,
981
+ "batch_count": batch_count,
982
+ "num_prompts": num_prompts,
983
+ "average_latency": sum(latency_list) / len(latency_list),
984
+ "median_latency": statistics.median(latency_list),
985
+ "first_run_memory_MB": first_run_memory,
986
+ "second_run_memory_MB": second_run_memory,
987
+ "enable_cuda_graph": use_cuda_graph,
988
+ }
989
+
990
+
991
+ def run_torch(
992
+ model_name: str,
993
+ batch_size: int,
994
+ disable_safety_checker: bool,
995
+ enable_torch_compile: bool,
996
+ use_xformers: bool,
997
+ height: int,
998
+ width: int,
999
+ steps: int,
1000
+ num_prompts: int,
1001
+ batch_count: int,
1002
+ start_memory,
1003
+ memory_monitor_type,
1004
+ ):
1005
+ torch.backends.cudnn.enabled = True
1006
+ torch.backends.cudnn.benchmark = True
1007
+
1008
+ torch.set_grad_enabled(False)
1009
+
1010
+ load_start = time.time()
1011
+ pipe = get_torch_pipeline(model_name, disable_safety_checker, enable_torch_compile, use_xformers)
1012
+ load_end = time.time()
1013
+ print(f"Model loading took {load_end - load_start} seconds")
1014
+
1015
+ image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker)
1016
+
1017
+ if not enable_torch_compile:
1018
+ with torch.inference_mode():
1019
+ result = run_torch_pipeline(
1020
+ pipe,
1021
+ batch_size,
1022
+ image_filename_prefix,
1023
+ height,
1024
+ width,
1025
+ steps,
1026
+ num_prompts,
1027
+ batch_count,
1028
+ start_memory,
1029
+ memory_monitor_type,
1030
+ )
1031
+ else:
1032
+ result = run_torch_pipeline(
1033
+ pipe,
1034
+ batch_size,
1035
+ image_filename_prefix,
1036
+ height,
1037
+ width,
1038
+ steps,
1039
+ num_prompts,
1040
+ batch_count,
1041
+ start_memory,
1042
+ memory_monitor_type,
1043
+ )
1044
+
1045
+ result.update(
1046
+ {
1047
+ "model_name": model_name,
1048
+ "directory": None,
1049
+ "provider": "compile" if enable_torch_compile else "xformers" if use_xformers else "default",
1050
+ "disable_safety_checker": disable_safety_checker,
1051
+ "enable_cuda_graph": False,
1052
+ }
1053
+ )
1054
+ return result
1055
+
1056
+
1057
+ def parse_arguments():
1058
+ parser = argparse.ArgumentParser()
1059
+
1060
+ parser.add_argument(
1061
+ "-e",
1062
+ "--engine",
1063
+ required=False,
1064
+ type=str,
1065
+ default="onnxruntime",
1066
+ choices=["onnxruntime", "optimum", "torch", "tensorrt"],
1067
+ help="Engines to benchmark. Default is onnxruntime.",
1068
+ )
1069
+
1070
+ parser.add_argument(
1071
+ "-r",
1072
+ "--provider",
1073
+ required=False,
1074
+ type=str,
1075
+ default="cuda",
1076
+ choices=list(PROVIDERS.keys()),
1077
+ help="Execution provider to benchmark. Default is cuda (CUDAExecutionProvider).",
1078
+ )
1079
+
1080
+ parser.add_argument(
1081
+ "-t",
1082
+ "--tuning",
1083
+ action="store_true",
1084
+ help="Enable TunableOp and tuning. "
1085
+ "This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.",
1086
+ )
1087
+
1088
+ parser.add_argument(
1089
+ "-v",
1090
+ "--version",
1091
+ required=False,
1092
+ type=str,
1093
+ choices=list(SD_MODELS.keys()),
1094
+ default="1.5",
1095
+ help="Stable Diffusion version: 1.5, 2.0, 2.1 or xl-1.0. Default is 1.5.",
1096
+ )
1097
+
1098
+ parser.add_argument(
1099
+ "-p",
1100
+ "--pipeline",
1101
+ required=False,
1102
+ type=str,
1103
+ default=None,
1104
+ help="Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.",
1105
+ )
1106
+
1107
+ parser.add_argument(
1108
+ "-w",
1109
+ "--work_dir",
1110
+ required=False,
1111
+ type=str,
1112
+ default=".",
1113
+ help="Root directory to save exported onnx models, built engines etc.",
1114
+ )
1115
+
1116
+ parser.add_argument(
1117
+ "--enable_safety_checker",
1118
+ required=False,
1119
+ action="store_true",
1120
+ help="Enable safety checker",
1121
+ )
1122
+ parser.set_defaults(enable_safety_checker=False)
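+ # Note: action="store_true" already defaults to False, so the set_defaults calls for these boolean flags are redundant but harmless.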
1123
+
1124
+ parser.add_argument(
1125
+ "--enable_torch_compile",
1126
+ required=False,
1127
+ action="store_true",
1128
+ help="Enable compile unet for PyTorch 2.0",
1129
+ )
1130
+ parser.set_defaults(enable_torch_compile=False)
1131
+
1132
+ parser.add_argument(
1133
+ "--use_xformers",
1134
+ required=False,
1135
+ action="store_true",
1136
+ help="Use xformers for PyTorch",
1137
+ )
1138
+ parser.set_defaults(use_xformers=False)
1139
+
1140
+ parser.add_argument(
1141
+ "-b",
1142
+ "--batch_size",
1143
+ type=int,
1144
+ default=1,
1145
+ choices=[1, 2, 3, 4, 8, 10, 16, 32],
1146
+ help="Number of images per batch. Default is 1.",
1147
+ )
1148
+
1149
+ parser.add_argument(
1150
+ "--height",
1151
+ required=False,
1152
+ type=int,
1153
+ default=512,
1154
+ help="Output image height. Default is 512.",
1155
+ )
1156
+
1157
+ parser.add_argument(
1158
+ "--width",
1159
+ required=False,
1160
+ type=int,
1161
+ default=512,
1162
+ help="Output image width. Default is 512.",
1163
+ )
1164
+
1165
+ parser.add_argument(
1166
+ "-s",
1167
+ "--steps",
1168
+ required=False,
1169
+ type=int,
1170
+ default=50,
1171
+ help="Number of steps. Default is 50.",
1172
+ )
1173
+
1174
+ parser.add_argument(
1175
+ "-n",
1176
+ "--num_prompts",
1177
+ required=False,
1178
+ type=int,
1179
+ default=1,
1180
+ help="Number of prompts. Default is 1.",
1181
+ )
1182
+
1183
+ parser.add_argument(
1184
+ "-c",
1185
+ "--batch_count",
1186
+ required=False,
1187
+ type=int,
1188
+ choices=range(1, 11),
1189
+ default=5,
1190
+ help="Number of batches to test. Default is 5.",
1191
+ )
1192
+
1193
+ parser.add_argument(
1194
+ "-m",
1195
+ "--max_trt_batch_size",
1196
+ required=False,
1197
+ type=int,
1198
+ choices=range(1, 16),
1199
+ default=4,
1200
+ help="Maximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.",
1201
+ )
1202
+
1203
+ parser.add_argument(
1204
+ "-g",
1205
+ "--enable_cuda_graph",
1206
+ required=False,
1207
+ action="store_true",
1208
+ help="Enable Cuda Graph. Requires onnxruntime >= 1.16",
1209
+ )
1210
+ parser.set_defaults(enable_cuda_graph=False)
1211
+
1212
+ args = parser.parse_args()
1213
+
1214
+ return args
1215
+
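+ # Example invocation (script name and pipeline directory are illustrative):
+ #   python benchmark.py -e onnxruntime -r cuda -v 1.5 -p ./sd_onnx_optimized -b 1 -s 50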
1216
+
1217
+ def print_loaded_libraries(cuda_related_only=True):
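+ # Print the shared libraries mapped into the current process; by default only CUDA/NVIDIA/TensorRT related ones are shown.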
1218
+ import psutil
1219
+
1220
+ p = psutil.Process(os.getpid())
1221
+ for lib in p.memory_maps():
1222
+ if (not cuda_related_only) or any(x in lib.path for x in ("libcu", "libnv", "tensorrt")):
1223
+ print(lib.path)
1224
+
1225
+
1226
+ def main():
1227
+ args = parse_arguments()
1228
+ print(args)
1229
+
1230
+ if args.engine == "onnxruntime":
1231
+ if args.version in ["2.1"]:
1232
+ # Set a flag to avoid overflow in attention, which causes black image output in the SD 2.1 model.
1233
+ # The environment variable must be set before the first run of the Attention or MultiHeadAttention operator.
1234
+ os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"
1235
+
1236
+ from packaging import version
1237
+
1238
+ from onnxruntime import __version__ as ort_version
1239
+
1240
+ if version.parse(ort_version) == version.parse("1.16.0"):
1241
+ # ORT 1.16 has a bug that might trigger an Attention RuntimeError when the latest fusion script is applied to the CLIP model.
1242
+ # The workaround is to enable fused causal attention, or to disable Attention fusion for the CLIP model.
1243
+ os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
1244
+
1245
+ if args.enable_cuda_graph:
1246
+ if not (args.engine == "onnxruntime" and args.provider in ["cuda", "tensorrt"] and args.pipeline is None):
1247
+ raise ValueError("The stable diffusion pipeline does not support CUDA graph.")
1248
+
1249
+ if version.parse(ort_version) < version.parse("1.16"):
1250
+ raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later")
1251
+
1252
+ coloredlogs.install(fmt="%(funcName)20s: %(message)s")
1253
+
1254
+ memory_monitor_type = "rocm" if args.provider == "rocm" else "cuda"
1255
+
1256
+ start_memory = measure_gpu_memory(memory_monitor_type, None)
1257
+ print("GPU memory used before loading models:", start_memory)
1258
+
1259
+ sd_model = SD_MODELS[args.version]
1260
+ provider = PROVIDERS[args.provider]
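+ # Dispatch to the requested engine / provider combination.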
1261
+ if args.engine == "onnxruntime" and args.provider == "tensorrt":
1262
+ if "xl" in args.version:
1263
+ print("Testing Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.")
1264
+ result = run_ort_trt_xl(
1265
+ work_dir=args.work_dir,
1266
+ version=args.version,
1267
+ batch_size=args.batch_size,
1268
+ disable_safety_checker=True,
1269
+ height=args.height,
1270
+ width=args.width,
1271
+ steps=args.steps,
1272
+ num_prompts=args.num_prompts,
1273
+ batch_count=args.batch_count,
1274
+ start_memory=start_memory,
1275
+ memory_monitor_type=memory_monitor_type,
1276
+ max_batch_size=args.max_trt_batch_size,
1277
+ nvtx_profile=False,
1278
+ use_cuda_graph=args.enable_cuda_graph,
1279
+ )
1280
+ else:
1281
+ print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.")
1282
+ result = run_ort_trt_static(
1283
+ work_dir=args.work_dir,
1284
+ version=args.version,
1285
+ batch_size=args.batch_size,
1286
+ disable_safety_checker=not args.enable_safety_checker,
1287
+ height=args.height,
1288
+ width=args.width,
1289
+ steps=args.steps,
1290
+ num_prompts=args.num_prompts,
1291
+ batch_count=args.batch_count,
1292
+ start_memory=start_memory,
1293
+ memory_monitor_type=memory_monitor_type,
1294
+ max_batch_size=args.max_trt_batch_size,
1295
+ nvtx_profile=False,
1296
+ use_cuda_graph=args.enable_cuda_graph,
1297
+ )
1298
+ elif args.engine == "optimum" and provider == "CUDAExecutionProvider":
1299
+ if "xl" in args.version:
1300
+ os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"
1301
+
1302
+ result = run_optimum_ort(
1303
+ model_name=sd_model,
1304
+ directory=args.pipeline,
1305
+ provider=provider,
1306
+ batch_size=args.batch_size,
1307
+ disable_safety_checker=not args.enable_safety_checker,
1308
+ height=args.height,
1309
+ width=args.width,
1310
+ steps=args.steps,
1311
+ num_prompts=args.num_prompts,
1312
+ batch_count=args.batch_count,
1313
+ start_memory=start_memory,
1314
+ memory_monitor_type=memory_monitor_type,
1315
+ )
1316
+ elif args.engine == "onnxruntime":
1317
+ assert args.pipeline and os.path.isdir(
1318
+ args.pipeline
1319
+ ), "--pipeline should be specified for the directory of ONNX models"
1320
+ print(f"Testing diffusers StableDiffusionPipeline with {provider} provider and tuning={args.tuning}")
1321
+ result = run_ort(
1322
+ model_name=sd_model,
1323
+ directory=args.pipeline,
1324
+ provider=provider,
1325
+ batch_size=args.batch_size,
1326
+ disable_safety_checker=not args.enable_safety_checker,
1327
+ height=args.height,
1328
+ width=args.width,
1329
+ steps=args.steps,
1330
+ num_prompts=args.num_prompts,
1331
+ batch_count=args.batch_count,
1332
+ start_memory=start_memory,
1333
+ memory_monitor_type=memory_monitor_type,
1334
+ tuning=args.tuning,
1335
+ )
1336
+ elif args.engine == "tensorrt" and "xl" in args.version:
1337
+ print("Testing Txt2ImgXLPipeline with static input shape. Backend is TensorRT.")
1338
+ result = run_tensorrt_static_xl(
1339
+ work_dir=args.work_dir,
1340
+ version=args.version,
1341
+ batch_size=args.batch_size,
1342
+ disable_safety_checker=True,
1343
+ height=args.height,
1344
+ width=args.width,
1345
+ steps=args.steps,
1346
+ num_prompts=args.num_prompts,
1347
+ batch_count=args.batch_count,
1348
+ start_memory=start_memory,
1349
+ memory_monitor_type=memory_monitor_type,
1350
+ max_batch_size=args.max_trt_batch_size,
1351
+ nvtx_profile=False,
1352
+ use_cuda_graph=args.enable_cuda_graph,
1353
+ )
1354
+ elif args.engine == "tensorrt":
1355
+ print("Testing Txt2ImgPipeline with static input shape. Backend is TensorRT.")
1356
+ result = run_tensorrt_static(
1357
+ work_dir=args.work_dir,
1358
+ version=args.version,
1359
+ model_name=sd_model,
1360
+ batch_size=args.batch_size,
1361
+ disable_safety_checker=True,
1362
+ height=args.height,
1363
+ width=args.width,
1364
+ steps=args.steps,
1365
+ num_prompts=args.num_prompts,
1366
+ batch_count=args.batch_count,
1367
+ start_memory=start_memory,
1368
+ memory_monitor_type=memory_monitor_type,
1369
+ max_batch_size=args.max_trt_batch_size,
1370
+ nvtx_profile=False,
1371
+ use_cuda_graph=args.enable_cuda_graph,
1372
+ )
1373
+ else:
1374
+ print(
1375
+ f"Testing Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile={args.enable_torch_compile}, xformers={args.use_xformers}."
1376
+ )
1377
+ result = run_torch(
1378
+ model_name=sd_model,
1379
+ batch_size=args.batch_size,
1380
+ disable_safety_checker=not args.enable_safety_checker,
1381
+ enable_torch_compile=args.enable_torch_compile,
1382
+ use_xformers=args.use_xformers,
1383
+ height=args.height,
1384
+ width=args.width,
1385
+ steps=args.steps,
1386
+ num_prompts=args.num_prompts,
1387
+ batch_count=args.batch_count,
1388
+ start_memory=start_memory,
1389
+ memory_monitor_type=memory_monitor_type,
1390
+ )
1391
+
1392
+ print(result)
1393
+
1394
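+ # Results are appended to benchmark_result.csv; a header row is written on every run, so the file will contain repeated headers across invocations.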
+ with open("benchmark_result.csv", mode="a", newline="") as csv_file:
1395
+ column_names = [
1396
+ "model_name",
1397
+ "directory",
1398
+ "engine",
1399
+ "version",
1400
+ "provider",
1401
+ "disable_safety_checker",
1402
+ "height",
1403
+ "width",
1404
+ "steps",
1405
+ "batch_size",
1406
+ "batch_count",
1407
+ "num_prompts",
1408
+ "average_latency",
1409
+ "median_latency",
1410
+ "first_run_memory_MB",
1411
+ "second_run_memory_MB",
1412
+ "enable_cuda_graph",
1413
+ ]
1414
+ csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
1415
+ csv_writer.writeheader()
1416
+ csv_writer.writerow(result)
1417
+
1418
+ # Show loaded DLLs when steps == 1 for debugging purposes.
1419
+ if args.steps == 1:
1420
+ print_loaded_libraries(args.provider in ["cuda", "tensorrt"])
1421
+
1422
+
1423
+ if __name__ == "__main__":
1424
+ import traceback
1425
+
1426
+ try:
1427
+ main()
1428
+ except Exception:
1429
+ traceback.print_exception(*sys.exc_info())