onnxruntime-directml 1.23.0__cp313-cp313-win_amd64.whl → 1.24.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. onnxruntime/ThirdPartyNotices.txt +0 -35
  2. onnxruntime/__init__.py +96 -34
  3. onnxruntime/capi/DirectML.dll +0 -0
  4. onnxruntime/capi/build_and_package_info.py +1 -1
  5. onnxruntime/capi/onnxruntime.dll +0 -0
  6. onnxruntime/capi/onnxruntime_inference_collection.py +74 -17
  7. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  8. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  9. onnxruntime/capi/onnxruntime_validation.py +2 -2
  10. onnxruntime/quantization/calibrate.py +17 -2
  11. onnxruntime/quantization/execution_providers/qnn/preprocess.py +21 -3
  12. onnxruntime/quantization/execution_providers/qnn/quant_config.py +0 -17
  13. onnxruntime/quantization/fusions/fusion_layernorm.py +18 -7
  14. onnxruntime/quantization/matmul_nbits_quantizer.py +32 -12
  15. onnxruntime/quantization/qdq_quantizer.py +0 -1
  16. onnxruntime/quantization/quant_utils.py +12 -27
  17. onnxruntime/quantization/registry.py +1 -0
  18. onnxruntime/quantization/shape_inference.py +13 -18
  19. onnxruntime/quantization/static_quantize_runner.py +1 -1
  20. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +3 -0
  21. onnxruntime/transformers/benchmark.py +1 -4
  22. onnxruntime/transformers/benchmark_helper.py +6 -10
  23. onnxruntime/transformers/bert_perf_test.py +0 -6
  24. onnxruntime/transformers/convert_to_packing_mode.py +4 -5
  25. onnxruntime/transformers/fusion_attention_clip.py +0 -1
  26. onnxruntime/transformers/fusion_base.py +2 -2
  27. onnxruntime/transformers/fusion_utils.py +9 -5
  28. onnxruntime/transformers/io_binding_helper.py +60 -21
  29. onnxruntime/transformers/machine_info.py +8 -6
  30. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +10 -2
  31. onnxruntime/transformers/models/llama/benchmark.py +1 -4
  32. onnxruntime/transformers/models/llama/benchmark_all.py +1 -1
  33. onnxruntime/transformers/models/llama/convert_to_onnx.py +11 -1
  34. onnxruntime/transformers/models/llama/llama_parity.py +1 -1
  35. onnxruntime/transformers/models/longformer/benchmark_longformer.py +1 -1
  36. onnxruntime/transformers/models/longformer/convert_to_onnx.py +1 -1
  37. onnxruntime/transformers/models/phi2/convert_to_onnx.py +8 -0
  38. onnxruntime/transformers/models/stable_diffusion/benchmark.py +5 -8
  39. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +3 -2
  40. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +3 -2
  41. onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +8 -2
  42. onnxruntime/transformers/models/whisper/benchmark.py +3 -28
  43. onnxruntime/transformers/models/whisper/benchmark_all.py +2 -2
  44. onnxruntime/transformers/models/whisper/convert_to_onnx.py +75 -39
  45. onnxruntime/transformers/models/whisper/whisper_chain.py +10 -7
  46. onnxruntime/transformers/models/whisper/whisper_helper.py +1 -1
  47. onnxruntime/transformers/optimizer.py +5 -10
  48. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/METADATA +7 -3
  49. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/RECORD +52 -52
  50. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/WHEEL +1 -1
  51. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/entry_points.txt +0 -0
  52. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/top_level.txt +0 -0
onnxruntime/ThirdPartyNotices.txt CHANGED
@@ -5806,41 +5806,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  _____
 
- composable_kernel
-
- https://github.com/ROCmSoftwarePlatform/composable_kernel
-
- Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
- Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
- Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
- Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
- Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
- Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
- Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
-
- SPDX-License-Identifier: MIT
- Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- _____
-
  neural-speed
 
  https://github.com/intel/neural-speed
onnxruntime/__init__.py CHANGED
@@ -8,7 +8,9 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
  or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
  """
 
- __version__ = "1.23.0"
+ import contextlib
+
+ __version__ = "1.24.1"
  __author__ = "Microsoft"
 
  # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
@@ -31,14 +33,19 @@ try:
  OrtAllocatorType, # noqa: F401
  OrtArenaCfg, # noqa: F401
  OrtCompileApiFlags, # noqa: F401
+ OrtDeviceMemoryType, # noqa: F401
+ OrtEpAssignedNode, # noqa: F401
+ OrtEpAssignedSubgraph, # noqa: F401
  OrtEpDevice, # noqa: F401
  OrtExecutionProviderDevicePolicy, # noqa: F401
  OrtExternalInitializerInfo, # noqa: F401
  OrtHardwareDevice, # noqa: F401
  OrtHardwareDeviceType, # noqa: F401
  OrtMemoryInfo, # noqa: F401
+ OrtMemoryInfoDeviceType, # noqa: F401
  OrtMemType, # noqa: F401
  OrtSparseFormat, # noqa: F401
+ OrtSyncStream, # noqa: F401
  RunOptions, # noqa: F401
  SessionIOBinding, # noqa: F401
  SessionOptions, # noqa: F401
@@ -78,6 +85,7 @@ from onnxruntime.capi.onnxruntime_inference_collection import (
  OrtDevice, # noqa: F401
  OrtValue, # noqa: F401
  SparseTensor, # noqa: F401
+ copy_tensors, # noqa: F401
  )
 
  # TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
@@ -129,14 +137,43 @@ def _get_package_root(package_name: str, directory_name: str | None = None):
  return None
 
 
+ def _extract_cuda_major_version(version_str: str) -> str:
+ """Extract CUDA major version from version string (e.g., '12.1' -> '12').
+
+ Args:
+ version_str: CUDA version string to parse
+
+ Returns:
+ Major version as string, or "12" if parsing fails
+ """
+ return version_str.split(".")[0] if version_str else "12"
+
+
+ def _get_cufft_version(cuda_major: str) -> str:
+ """Get cufft library version based on CUDA major version.
+
+ Args:
+ cuda_major: CUDA major version as string (e.g., "12", "13")
+
+ Returns:
+ cufft version as string
+ """
+ # cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12
+ return "12" if cuda_major == "13" else "11"
+
+
  def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
+ # Dynamically determine CUDA major version from build info
+ cuda_major_version = _extract_cuda_major_version(cuda_version)
+ cufft_version = _get_cufft_version(cuda_major_version)
+
  if is_windows:
  # Path is relative to site-packages directory.
  cuda_dll_paths = [
- ("nvidia", "cublas", "bin", "cublasLt64_12.dll"),
- ("nvidia", "cublas", "bin", "cublas64_12.dll"),
- ("nvidia", "cufft", "bin", "cufft64_11.dll"),
- ("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"),
+ ("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"),
+ ("nvidia", "cublas", "bin", f"cublas64_{cuda_major_version}.dll"),
+ ("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"),
+ ("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"),
  ]
  cudnn_dll_paths = [
  ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
@@ -150,12 +187,12 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru
  else: # Linux
  # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
  cuda_dll_paths = [
- ("nvidia", "cublas", "lib", "libcublasLt.so.12"),
- ("nvidia", "cublas", "lib", "libcublas.so.12"),
- ("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"),
+ ("nvidia", "cublas", "lib", f"libcublasLt.so.{cuda_major_version}"),
+ ("nvidia", "cublas", "lib", f"libcublas.so.{cuda_major_version}"),
+ ("nvidia", "cuda_nvrtc", "lib", f"libnvrtc.so.{cuda_major_version}"),
  ("nvidia", "curand", "lib", "libcurand.so.10"),
- ("nvidia", "cufft", "lib", "libcufft.so.11"),
- ("nvidia", "cuda_runtime", "lib", "libcudart.so.12"),
+ ("nvidia", "cufft", "lib", f"libcufft.so.{cufft_version}"),
+ ("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"),
  ]
 
  # Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
@@ -197,15 +234,17 @@ def print_debug_info():
 
  if cuda_version:
  # Print version of installed packages that is related to CUDA or cuDNN DLLs.
+ cuda_major = _extract_cuda_major_version(cuda_version)
+
  packages = [
  "torch",
- "nvidia-cuda-runtime-cu12",
- "nvidia-cudnn-cu12",
- "nvidia-cublas-cu12",
- "nvidia-cufft-cu12",
- "nvidia-curand-cu12",
- "nvidia-cuda-nvrtc-cu12",
- "nvidia-nvjitlink-cu12",
+ f"nvidia-cuda-runtime-cu{cuda_major}",
+ f"nvidia-cudnn-cu{cuda_major}",
+ f"nvidia-cublas-cu{cuda_major}",
+ f"nvidia-cufft-cu{cuda_major}",
+ f"nvidia-curand-cu{cuda_major}",
+ f"nvidia-cuda-nvrtc-cu{cuda_major}",
+ f"nvidia-nvjitlink-cu{cuda_major}",
  ]
  for package in packages:
  directory_name = "nvidia" if package.startswith("nvidia-") else None
@@ -216,9 +255,9 @@ def print_debug_info():
  print(f"{package} not installed")
 
  if platform.system() == "Windows":
- print(f"\nEnvironment variable:\nPATH={os.environ['PATH']}")
+ print(f"\nEnvironment variable:\nPATH={os.environ.get('PATH', '(unset)')}")
  elif platform.system() == "Linux":
- print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ['LD_LIBRARY_PATH']}")
+ print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ.get('LD_LIBRARY_PATH', '(unset)')}")
 
  if importlib.util.find_spec("psutil"):
 
@@ -250,7 +289,7 @@ def print_debug_info():
 
 
  def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, directory=None):
- """Preload CUDA 12.x and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
+ """Preload CUDA 12.x+ and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
 
  When the installed PyTorch is compatible (using same major version of CUDA and cuDNN),
  there is no need to call this function if `import torch` is done before `import onnxruntime`.
@@ -285,30 +324,53 @@ def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, direc
  print("Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.")
  print("It can be downloaded at https://aka.ms/vs/17/release/vc_redist.x64.exe.")
 
- if not (cuda_version and cuda_version.startswith("12.")) and (cuda or cudnn):
- print(
- f"\033[33mWARNING: {package_name} is not built with CUDA 12.x support. "
- "Please install a version that supports CUDA 12.x, or call preload_dlls with cuda=False and cudnn=False.\033[0m"
- )
- return
-
- if not (cuda_version and cuda_version.startswith("12.") and (cuda or cudnn)):
+ # Check if CUDA version is supported (12.x or 13.x+)
+ ort_cuda_major = None
+ if cuda_version:
+ try:
+ ort_cuda_major = int(cuda_version.split(".")[0])
+ if ort_cuda_major < 12 and (cuda or cudnn):
+ print(
+ f"\033[33mWARNING: {package_name} is built with CUDA {cuda_version}, which is not supported for preloading. "
+ f"CUDA 12.x or newer is required. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+ )
+ return
+ except ValueError:
+ print(
+ f"\033[33mWARNING: Unable to parse CUDA version '{cuda_version}'. "
+ "Skipping DLL preloading. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+ )
+ return
+ elif cuda or cudnn:
+ # No CUDA version info available but CUDA/cuDNN preloading requested
  return
 
  is_cuda_cudnn_imported_by_torch = False
 
  if is_windows:
  torch_version = _get_package_version("torch")
- is_torch_for_cuda_12 = torch_version and "+cu12" in torch_version
+ # Check if torch CUDA version matches onnxruntime CUDA version
+ torch_cuda_major = None
+ if torch_version and "+cu" in torch_version:
+ with contextlib.suppress(ValueError):
+ # Extract CUDA version from torch (e.g., "2.0.0+cu121" -> 12)
+ cu_part = torch_version.split("+cu")[1]
+ torch_cuda_major = int(cu_part[:2]) # First 2 digits are major version
+
+ is_torch_cuda_compatible = (
+ torch_cuda_major == ort_cuda_major if (torch_cuda_major and ort_cuda_major) else False
+ )
+
  if "torch" in sys.modules:
- is_cuda_cudnn_imported_by_torch = is_torch_for_cuda_12
- if (torch_version and "+cu" in torch_version) and not is_torch_for_cuda_12:
+ is_cuda_cudnn_imported_by_torch = is_torch_cuda_compatible
+ if torch_cuda_major and ort_cuda_major and torch_cuda_major != ort_cuda_major:
  print(
- f"\033[33mWARNING: The installed PyTorch {torch_version} does not support CUDA 12.x. "
- f"Please install PyTorch for CUDA 12.x to be compatible with {package_name}.\033[0m"
+ f"\033[33mWARNING: The installed PyTorch {torch_version} uses CUDA {torch_cuda_major}.x, "
+ f"but {package_name} is built with CUDA {ort_cuda_major}.x. "
+ f"Please install PyTorch for CUDA {ort_cuda_major}.x to be compatible.\033[0m"
  )
 
- if is_torch_for_cuda_12 and directory is None:
+ if is_torch_cuda_compatible and directory is None:
  torch_root = _get_package_root("torch", "torch")
  if torch_root:
  directory = os.path.join(torch_root, "lib")
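Taken together, the __init__.py changes stop hard-coding CUDA 12 library names: the CUDA major version is parsed from the package's build info and mapped to the matching cuFFT major before the DLL/.so names are built. A minimal standalone sketch of that mapping (illustrative helper names, not the package's internals):

    def cuda_major(version: str) -> str:
        # "12.1" -> "12"; the package falls back to "12" when build info is empty
        return version.split(".")[0] if version else "12"

    def cufft_major(major: str) -> str:
        # cuFFT ships as major 11 for CUDA 12.x and major 12 for CUDA 13.x
        return "12" if major == "13" else "11"

    m = cuda_major("13.0")
    print(f"cudart64_{m}.dll", f"cufft64_{cufft_major(m)}.dll")  # cudart64_13.dll cufft64_12.dll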
Binary file
onnxruntime/capi/build_and_package_info.py CHANGED
@@ -1,2 +1,2 @@
  package_name = 'onnxruntime-directml'
- __version__ = '1.23.0'
+ __version__ = '1.24.1'
Binary file
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -199,6 +199,18 @@ class Session:
  "Return the metadata. See :class:`onnxruntime.ModelMetadata`."
  return self._model_meta
 
+ def get_input_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+ "Return the memory info for the inputs."
+ return self._input_meminfos
+
+ def get_output_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+ "Return the memory info for the outputs."
+ return self._output_meminfos
+
+ def get_input_epdevices(self) -> Sequence[onnxruntime.OrtEpDevice]:
+ "Return the execution providers for the inputs."
+ return self._input_epdevices
+
  def get_providers(self) -> Sequence[str]:
  "Return list of registered execution providers."
  return self._providers
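The three new getters expose where each model input and output is expected to live, plus the EP device associated with each input. A hedged usage sketch (assumes a local model.onnx; the exact repr of the returned objects depends on the build):

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    for inp, mem in zip(sess.get_inputs(), sess.get_input_memory_infos()):
        print(inp.name, mem)               # memory location the session expects for this input
    print(sess.get_output_memory_infos())  # same information for the outputs
    print(sess.get_input_epdevices())      # OrtEpDevice associated with each input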
@@ -207,6 +219,15 @@ class Session:
  "Return registered execution providers' configurations."
  return self._provider_options
 
+ def get_provider_graph_assignment_info(self) -> Sequence[onnxruntime.OrtEpAssignedSubgraph]:
+ """
+ Get information about the subgraphs assigned to each execution provider and the nodes within.
+
+ Application must enable the recording of graph assignment information by setting the session configuration
+ for the key "session.record_ep_graph_assignment_info" to "1".
+ """
+ return self._sess.get_provider_graph_assignment_info()
+
  def set_providers(self, providers=None, provider_options=None) -> None:
  """
  Register the input list of execution providers. The underlying session is re-created.
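Per the docstring, the assignment data is only recorded when the session config key "session.record_ep_graph_assignment_info" is set to "1" before the session is created. A sketch, assuming a local model.onnx:

    import onnxruntime as ort

    so = ort.SessionOptions()
    so.add_session_config_entry("session.record_ep_graph_assignment_info", "1")
    sess = ort.InferenceSession("model.onnx", sess_options=so, providers=["CPUExecutionProvider"])
    for subgraph in sess.get_provider_graph_assignment_info():
        print(subgraph)  # one OrtEpAssignedSubgraph per partition, with its assigned nodes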
@@ -385,6 +406,16 @@ class Session:
  """
  self._sess.run_with_iobinding(iobinding._iobinding, run_options)
 
+ def set_ep_dynamic_options(self, options: dict[str, str]):
+ """
+ Set dynamic options for execution providers.
+
+ :param options: Dictionary of key-value pairs where both keys and values are strings.
+ These options will be passed to the execution providers to modify
+ their runtime behavior.
+ """
+ self._sess.set_ep_dynamic_options(options)
+
  def get_tuning_results(self):
  return self._sess.get_tuning_results()
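The accepted keys and values are execution-provider specific, so the pair below is a placeholder rather than a documented option; consult the target EP's documentation for the real set. Sketch only:

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    # Keys and values must both be strings; they are forwarded to the registered EPs at runtime.
    sess.set_ep_dynamic_options({"example.option": "value"})  # placeholder key/value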
 
@@ -490,8 +521,25 @@ class InferenceSession(Session):
  def _create_inference_session(self, providers, provider_options, disabled_optimizers=None):
  available_providers = C.get_available_providers()
 
- # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
- if "TensorrtExecutionProvider" in available_providers:
+ # Validate that TensorrtExecutionProvider and NvTensorRTRTXExecutionProvider are not both specified
+ if providers:
+ has_tensorrt = any(
+ provider == "TensorrtExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+ for provider in providers
+ )
+ has_tensorrt_rtx = any(
+ provider == "NvTensorRTRTXExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
+ for provider in providers
+ )
+ if has_tensorrt and has_tensorrt_rtx:
+ raise ValueError(
+ "Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' "
+ "in the same session."
+ )
+ # Tensorrt and TensorRT RTX can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
+ if "NvTensorRTRTXExecutionProvider" in available_providers:
  if (
  providers
  and any(
@@ -500,15 +548,15 @@ class InferenceSession(Session):
  for provider in providers
  )
  and any(
- provider == "TensorrtExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+ provider == "NvTensorRTRTXExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
  for provider in providers
  )
  ):
  self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
- if "NvTensorRTRTXExecutionProvider" in available_providers:
+ elif "TensorrtExecutionProvider" in available_providers:
  if (
  providers
  and any(
@@ -517,24 +565,14 @@ class InferenceSession(Session):
  for provider in providers
  )
  and any(
- provider == "NvTensorRTRTXExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "NvExecutionProvider")
+ provider == "TensorrtExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
  for provider in providers
  )
  ):
  self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
- # MIGraphX can fall back to ROCM if it's explicitly assigned. All others fall back to CPU.
- elif "MIGraphXExecutionProvider" in available_providers:
- if providers and any(
- provider == "ROCMExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "ROCMExecutionProvider")
- for provider in providers
- ):
- self._fallback_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
- else:
- self._fallback_providers = ["CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
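With the validation above, asking for both TensorRT execution providers in one session now fails fast with a ValueError instead of silently preferring one. Illustrative only (assumes a local model.onnx and a build where session creation reaches this check):

    import onnxruntime as ort

    try:
        ort.InferenceSession(
            "model.onnx",
            providers=["TensorrtExecutionProvider", "NvTensorRTRTXExecutionProvider"],
        )
    except ValueError as err:
        print(err)  # Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' ...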
 
@@ -576,6 +614,9 @@ class InferenceSession(Session):
  self._inputs_meta = self._sess.inputs_meta
  self._outputs_meta = self._sess.outputs_meta
  self._overridable_initializers = self._sess.overridable_initializers
+ self._input_meminfos = self._sess.input_meminfos
+ self._output_meminfos = self._sess.output_meminfos
+ self._input_epdevices = self._sess.input_epdevices
  self._model_meta = self._sess.model_meta
  self._providers = self._sess.get_providers()
  self._provider_options = self._sess.get_provider_options()
@@ -589,6 +630,9 @@ class InferenceSession(Session):
  self._inputs_meta = None
  self._outputs_meta = None
  self._overridable_initializers = None
+ self._input_meminfos = None
+ self._output_meminfos = None
+ self._input_epdevices = None
  self._model_meta = None
  self._providers = None
  self._provider_options = None
@@ -1134,6 +1178,15 @@ class OrtValue:
  self._ortvalue.update_inplace(np_arr)
 
 
+ def copy_tensors(src: Sequence[OrtValue], dst: Sequence[OrtValue], stream=None) -> None:
+ """
+ Copy tensor data from source OrtValue sequence to destination OrtValue sequence.
+ """
+ c_sources = [s._get_c_value() for s in src]
+ c_dsts = [d._get_c_value() for d in dst]
+ C.copy_tensors(c_sources, c_dsts, stream)
+
+
  class OrtDevice:
  """
  A data structure that exposes the underlying C++ OrtDevice
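copy_tensors is also re-exported from the top-level package (see the copy_tensors import added in __init__.py above). A minimal CPU-to-CPU sketch; passing a stream only makes sense for device-side copies:

    import numpy as np
    import onnxruntime as ort

    src = [ort.OrtValue.ortvalue_from_numpy(np.arange(4, dtype=np.float32))]
    dst = [ort.OrtValue.ortvalue_from_numpy(np.zeros(4, dtype=np.float32))]
    ort.copy_tensors(src, dst)    # stream=None -> synchronous copy
    print(dst[0].numpy())         # [0. 1. 2. 3.]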
@@ -1146,6 +1199,7 @@ class OrtDevice:
  if isinstance(c_ort_device, C.OrtDevice):
  self._ort_device = c_ort_device
  else:
+ # An end user won't hit this error
  raise ValueError(
  "`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`"
  )
@@ -1188,6 +1242,9 @@ class OrtDevice:
  def device_vendor_id(self):
  return self._ort_device.vendor_id()
 
+ def device_mem_type(self):
+ return self._ort_device.mem_type()
+
 
  class SparseTensor:
  """
onnxruntime/capi/onnxruntime_validation.py CHANGED
@@ -23,9 +23,9 @@ def check_distro_info():
  __my_distro__ = __my_system__
  __my_distro_ver__ = platform.release().lower()
 
- if __my_distro_ver__ not in ["10", "11"]:
+ if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]:
  warnings.warn(
- f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
+ f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above."
  )
  elif __my_system__ == "linux":
  """Although the 'platform' python module for getting Distro information works well on standard OS images
onnxruntime/quantization/calibrate.py CHANGED
@@ -353,6 +353,14 @@ class MinMaxCalibrater(CalibraterBase):
  return opset_import.version
  raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
 
+ def insert_nodes(tensor_name, new_nodes):
+ index = next(
+ (i for i, x in enumerate(self.model.graph.node) if tensor_name in x.input), len(self.model.graph.node)
+ )
+ for node in new_nodes:
+ self.model.graph.node.insert(index, node)
+ index += 1
+
  def add_reduce_min_max(tensor_name, reduce_op_name):
  # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
  # To make the code simple, we always let keepdims to be 1.
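The new insert_nodes helper places the augmentation nodes immediately before the first consumer of the tensor instead of appending them at the end of the graph, which keeps the node list topologically ordered. The same insertion pattern in isolation, over a plain list of stand-in nodes (hypothetical names, not the calibrator's code):

    from dataclasses import dataclass, field

    @dataclass
    class Node:
        name: str
        input: list = field(default_factory=list)

    def insert_before_first_consumer(nodes, tensor_name, new_nodes):
        # Index of the first node consuming tensor_name; fall back to the end of the list.
        index = next((i for i, n in enumerate(nodes) if tensor_name in n.input), len(nodes))
        for n in new_nodes:
            nodes.insert(index, n)
            index += 1

    graph = [Node("a", ["x"]), Node("b", ["y"]), Node("c", ["y"])]
    insert_before_first_consumer(graph, "y", [Node("y_ReduceMin", ["y"])])
    print([n.name for n in graph])  # ['a', 'y_ReduceMin', 'b', 'c']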
@@ -396,7 +404,7 @@ class MinMaxCalibrater(CalibraterBase):
  reduce_node.input.append(reduce_axes_name)
  self.model.graph.initializer.append(reduce_axes)
 
- self.model.graph.node.extend([reduce_node, reshape_node])
+ insert_nodes(tensor_name, [reduce_node, reshape_node])
  self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
  for tensor in tensors:
@@ -417,7 +425,14 @@ class MinMaxCalibrater(CalibraterBase):
  inputs = data_reader.get_next()
  if not inputs:
  break
- self.intermediate_outputs.append(self.infer_session.run(None, inputs))
+ self.intermediate_outputs.append(
+ [
+ value if sess_o.name not in self.model_original_outputs else None
+ for sess_o, value in zip(
+ self.infer_session.get_outputs(), self.infer_session.run(None, inputs), strict=False
+ )
+ ]
+ )
  if (
  self.max_intermediate_outputs is not None
  and len(self.intermediate_outputs) == self.max_intermediate_outputs
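The collection loop now zips the session outputs with the run results and stores None for the model's original outputs, keeping only the augmented ReduceMin/ReduceMax values and trimming memory during calibration. The filtering pattern on its own (hypothetical names as stand-ins):

    original_outputs = {"logits"}
    output_names = ["logits", "conv1_output_ReduceMin", "conv1_output_ReduceMax"]
    run_values = ["<large tensor>", 0.01, 0.98]  # stand-ins for session.run results
    kept = [v if name not in original_outputs else None for name, v in zip(output_names, run_values)]
    print(kept)  # [None, 0.01, 0.98]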
onnxruntime/quantization/execution_providers/qnn/preprocess.py CHANGED
@@ -6,15 +6,15 @@
  from __future__ import annotations
 
  import logging
+ import tempfile
  from pathlib import Path
 
  import onnx
 
- from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
+ from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed, optimize_model
  from ....tools.remove_initializer_from_input import remove_initializer_from_input
  from ...fusions import FusionGelu, FusionLayerNormalization
  from ...onnx_model import ONNXModel
- from ...quant_utils import save_and_reload_model_with_shape_infer
  from .fusion_lpnorm import FusionLpNormalization
  from .fusion_spacetodepth import FusionSpaceToDepth
 
@@ -93,7 +93,7 @@ def qnn_preprocess_model(
  """
  modified = False
  model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
- model = save_and_reload_model_with_shape_infer(model)
+ model = save_and_reload_optimize_model(model, shape_infer=True)
  onnx_model = ONNXModel(model)
 
  # Optionally, fix the dynamic input shapes.
@@ -178,6 +178,24 @@ def qnn_preprocess_model(
  return modified
 
 
+ def save_and_reload_optimize_model(model: onnx.ModelProto, shape_infer: bool) -> onnx.ModelProto:
+ with tempfile.TemporaryDirectory(prefix="ort.qnn_preproc.") as qnn_preproc_tmp_dir:
+ model_in_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_input.onnx")
+ onnx.save_model(model, model_in_path, save_as_external_data=True)
+ if shape_infer:
+ model_infer_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_infer.onnx")
+ onnx.shape_inference.infer_shapes_path(str(model_in_path), str(model_infer_path))
+ model_in_path = model_infer_path
+ model_out_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_output.onnx")
+ optimize_model(model_in_path, model_out_path)
+ ret_model = onnx.load_model(model_out_path)
+ ret_metaprops = {"onnx.infer": "onnxruntime.tools.qnn.preprocess"}
+ if ret_model.metadata_props:
+ ret_metaprops.update(ret_model.metadata_props)
+ onnx.helper.set_model_props(ret_model, ret_metaprops)
+ return ret_model
+
+
  class InputOutputNameMap:
  def __init__(
  self,
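For context, this helper is invoked from qnn_preprocess_model, which now round-trips the model through a temporary directory for shape inference plus ONNX Runtime's basic optimization before the QNN-oriented fusions run. Hedged usage, assuming local model paths:

    from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model

    # Returns True when preprocessing changed the model; the output file is written in that case.
    modified = qnn_preprocess_model("model.onnx", "model.qnn_preprocessed.onnx")
    print("preprocessed model saved" if modified else "no changes were needed")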
onnxruntime/quantization/execution_providers/qnn/quant_config.py CHANGED
@@ -331,23 +331,6 @@ class QnnCompatibilityOverrides:
 
  if not self.per_channel:
  self._make_static_inputs_use_default_weight_type(node)
- return
-
- has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
- has_bias_no_overrides = (
- len(node.input) > 2
- and node.input[2]
- and node.input[2] in self.initializers
- and node.input[2] not in self.overrides
- )
-
- if has_weight_no_overrides or has_bias_no_overrides:
- # TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
- # tries to makes it per-channel if the weight is also per-channel.
- raise ValueError(
- "get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
- " Please try using custom overrides that make bias per-tensor quantized."
- )
 
  def _process_sigmoid(self, node: onnx.NodeProto):
  """
onnxruntime/quantization/fusions/fusion_layernorm.py CHANGED
@@ -33,6 +33,16 @@ class FusionLayerNormalization(Fusion):
  | |
  +-------------------------------------------------+
 
+ Or, using Mul instead of Pow:
+
+ +----------------------+
+ | |
+ | v
+ [Root] --> ReduceMean --> Sub --> Mul --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+ (axis=2 or -1) | (in0=in1) (axis=2 or -1) (E-6 or E-12 or 0) ^
+ | |
+ +-------------------------------------------------+
+
  It also handles cases of duplicated sub nodes exported from older version of PyTorch:
 
  +----------------------+
@@ -40,7 +50,7 @@ class FusionLayerNormalization(Fusion):
  | +-------> Sub-----------------------------------------------+
  | | |
  | | v
- [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+ [Root] --> ReduceMean --> Sub --> (Pow or Mul) --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
  | ^
  | |
  +----------------------+
@@ -70,10 +80,9 @@ class FusionLayerNormalization(Fusion):
  div_node,
  [
  (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
- (
- ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
- [1, 0, 0, 0, 0, 0],
- ),
+ (["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
+ (["Sqrt", "Add", "ReduceMean", "Mul", "Sub"], [1, 0, 0, 0, 0]),
+ (["Sqrt", "Add", "ReduceMean", "Mul", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
  ],
  output_name_to_node,
  )
@@ -90,8 +99,10 @@ class FusionLayerNormalization(Fusion):
  # Skip fusion since epsilon value is not expected.
  return
 
- pow_node = parent_nodes[3]
- if self.find_constant_input(pow_node, 2.0) != 1:
+ pow_or_mul_node = parent_nodes[3]
+ if pow_or_mul_node.op_type == "Pow" and self.find_constant_input(pow_or_mul_node, 2.0) != 1:
+ return
+ elif pow_or_mul_node.op_type == "Mul" and pow_or_mul_node.input[0] != pow_or_mul_node.input[1]:
  return
 
  mul_node = input_name_to_nodes[div_node.output[0]][0]
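The fusion now accepts Mul(x, x) wherever it previously required Pow(x, 2), since some exporters emit the square of the centered input as a self-multiplication. A quick numpy check of the equivalence the new pattern relies on (illustrative, not part of the fusion code):

    import numpy as np

    x = np.random.rand(2, 4, 8).astype(np.float32)
    d = x - x.mean(axis=-1, keepdims=True)
    var_pow = np.mean(d ** 2, axis=-1, keepdims=True)  # Pow(Sub, 2) branch
    var_mul = np.mean(d * d, axis=-1, keepdims=True)   # Mul(Sub, Sub) branch
    assert np.allclose(var_pow, var_mul)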