onnxruntime-directml 1.23.0-cp313-cp313-win_amd64.whl → 1.24.1-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime/ThirdPartyNotices.txt +0 -35
- onnxruntime/__init__.py +96 -34
- onnxruntime/capi/DirectML.dll +0 -0
- onnxruntime/capi/build_and_package_info.py +1 -1
- onnxruntime/capi/onnxruntime.dll +0 -0
- onnxruntime/capi/onnxruntime_inference_collection.py +74 -17
- onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
- onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
- onnxruntime/capi/onnxruntime_validation.py +2 -2
- onnxruntime/quantization/calibrate.py +17 -2
- onnxruntime/quantization/execution_providers/qnn/preprocess.py +21 -3
- onnxruntime/quantization/execution_providers/qnn/quant_config.py +0 -17
- onnxruntime/quantization/fusions/fusion_layernorm.py +18 -7
- onnxruntime/quantization/matmul_nbits_quantizer.py +32 -12
- onnxruntime/quantization/qdq_quantizer.py +0 -1
- onnxruntime/quantization/quant_utils.py +12 -27
- onnxruntime/quantization/registry.py +1 -0
- onnxruntime/quantization/shape_inference.py +13 -18
- onnxruntime/quantization/static_quantize_runner.py +1 -1
- onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +3 -0
- onnxruntime/transformers/benchmark.py +1 -4
- onnxruntime/transformers/benchmark_helper.py +6 -10
- onnxruntime/transformers/bert_perf_test.py +0 -6
- onnxruntime/transformers/convert_to_packing_mode.py +4 -5
- onnxruntime/transformers/fusion_attention_clip.py +0 -1
- onnxruntime/transformers/fusion_base.py +2 -2
- onnxruntime/transformers/fusion_utils.py +9 -5
- onnxruntime/transformers/io_binding_helper.py +60 -21
- onnxruntime/transformers/machine_info.py +8 -6
- onnxruntime/transformers/models/gpt2/convert_to_onnx.py +10 -2
- onnxruntime/transformers/models/llama/benchmark.py +1 -4
- onnxruntime/transformers/models/llama/benchmark_all.py +1 -1
- onnxruntime/transformers/models/llama/convert_to_onnx.py +11 -1
- onnxruntime/transformers/models/llama/llama_parity.py +1 -1
- onnxruntime/transformers/models/longformer/benchmark_longformer.py +1 -1
- onnxruntime/transformers/models/longformer/convert_to_onnx.py +1 -1
- onnxruntime/transformers/models/phi2/convert_to_onnx.py +8 -0
- onnxruntime/transformers/models/stable_diffusion/benchmark.py +5 -8
- onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +3 -2
- onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +3 -2
- onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +8 -2
- onnxruntime/transformers/models/whisper/benchmark.py +3 -28
- onnxruntime/transformers/models/whisper/benchmark_all.py +2 -2
- onnxruntime/transformers/models/whisper/convert_to_onnx.py +75 -39
- onnxruntime/transformers/models/whisper/whisper_chain.py +10 -7
- onnxruntime/transformers/models/whisper/whisper_helper.py +1 -1
- onnxruntime/transformers/optimizer.py +5 -10
- {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/METADATA +7 -3
- {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/RECORD +52 -52
- {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/WHEEL +1 -1
- {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/entry_points.txt +0 -0
- {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/top_level.txt +0 -0
onnxruntime/ThirdPartyNotices.txt
CHANGED
@@ -5806,41 +5806,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 _____
 
-composable_kernel
-
-https://github.com/ROCmSoftwarePlatform/composable_kernel
-
-Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
-Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
-Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
-Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
-Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
-Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
-Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
-
-SPDX-License-Identifier: MIT
-Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-_____
-
 neural-speed
 
 https://github.com/intel/neural-speed

onnxruntime/__init__.py
CHANGED
@@ -8,7 +8,9 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
 
-
+import contextlib
+
+__version__ = "1.24.1"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
@@ -31,14 +33,19 @@ try:
 OrtAllocatorType, # noqa: F401
 OrtArenaCfg, # noqa: F401
 OrtCompileApiFlags, # noqa: F401
+OrtDeviceMemoryType, # noqa: F401
+OrtEpAssignedNode, # noqa: F401
+OrtEpAssignedSubgraph, # noqa: F401
 OrtEpDevice, # noqa: F401
 OrtExecutionProviderDevicePolicy, # noqa: F401
 OrtExternalInitializerInfo, # noqa: F401
 OrtHardwareDevice, # noqa: F401
 OrtHardwareDeviceType, # noqa: F401
 OrtMemoryInfo, # noqa: F401
+OrtMemoryInfoDeviceType, # noqa: F401
 OrtMemType, # noqa: F401
 OrtSparseFormat, # noqa: F401
+OrtSyncStream, # noqa: F401
 RunOptions, # noqa: F401
 SessionIOBinding, # noqa: F401
 SessionOptions, # noqa: F401
@@ -78,6 +85,7 @@ from onnxruntime.capi.onnxruntime_inference_collection import (
 OrtDevice, # noqa: F401
 OrtValue, # noqa: F401
 SparseTensor, # noqa: F401
+copy_tensors, # noqa: F401
 )
 
 # TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
@@ -129,14 +137,43 @@ def _get_package_root(package_name: str, directory_name: str | None = None):
 return None
 
 
+def _extract_cuda_major_version(version_str: str) -> str:
+"""Extract CUDA major version from version string (e.g., '12.1' -> '12').
+
+Args:
+version_str: CUDA version string to parse
+
+Returns:
+Major version as string, or "12" if parsing fails
+"""
+return version_str.split(".")[0] if version_str else "12"
+
+
+def _get_cufft_version(cuda_major: str) -> str:
+"""Get cufft library version based on CUDA major version.
+
+Args:
+cuda_major: CUDA major version as string (e.g., "12", "13")
+
+Returns:
+cufft version as string
+"""
+# cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12
+return "12" if cuda_major == "13" else "11"
+
+
 def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
+# Dynamically determine CUDA major version from build info
+cuda_major_version = _extract_cuda_major_version(cuda_version)
+cufft_version = _get_cufft_version(cuda_major_version)
+
 if is_windows:
 # Path is relative to site-packages directory.
 cuda_dll_paths = [
-("nvidia", "cublas", "bin", "
-("nvidia", "cublas", "bin", "
-("nvidia", "cufft", "bin", "
-("nvidia", "cuda_runtime", "bin", "
+("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"),
+("nvidia", "cublas", "bin", f"cublas64_{cuda_major_version}.dll"),
+("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"),
+("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"),
 ]
 cudnn_dll_paths = [
 ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
@@ -150,12 +187,12 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru
 else: # Linux
 # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
 cuda_dll_paths = [
-("nvidia", "cublas", "lib", "libcublasLt.so.
-("nvidia", "cublas", "lib", "libcublas.so.
-("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.
+("nvidia", "cublas", "lib", f"libcublasLt.so.{cuda_major_version}"),
+("nvidia", "cublas", "lib", f"libcublas.so.{cuda_major_version}"),
+("nvidia", "cuda_nvrtc", "lib", f"libnvrtc.so.{cuda_major_version}"),
 ("nvidia", "curand", "lib", "libcurand.so.10"),
-("nvidia", "cufft", "lib", "libcufft.so.
-("nvidia", "cuda_runtime", "lib", "libcudart.so.
+("nvidia", "cufft", "lib", f"libcufft.so.{cufft_version}"),
+("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"),
 ]
 
 # Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
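Note: the two private helpers introduced above encode a fixed mapping from the CUDA major version in the build info to DLL name suffixes. A quick sanity check of that mapping (illustrative only; these are internal names in onnxruntime/__init__.py, not public API, and may change between releases):

    import onnxruntime

    # Private helpers; values follow the code shown in the hunk above.
    assert onnxruntime._extract_cuda_major_version("12.8") == "12"
    assert onnxruntime._extract_cuda_major_version("") == "12"   # falls back to "12" when parsing fails
    assert onnxruntime._get_cufft_version("12") == "11"          # CUDA 12.x -> cufft64_11.dll / libcufft.so.11
    assert onnxruntime._get_cufft_version("13") == "12"          # CUDA 13.x -> cufft64_12.dll / libcufft.so.12
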
@@ -197,15 +234,17 @@ def print_debug_info():
 
 if cuda_version:
 # Print version of installed packages that is related to CUDA or cuDNN DLLs.
+cuda_major = _extract_cuda_major_version(cuda_version)
+
 packages = [
 "torch",
-"nvidia-cuda-runtime-
-"nvidia-cudnn-
-"nvidia-cublas-
-"nvidia-cufft-
-"nvidia-curand-
-"nvidia-cuda-nvrtc-
-"nvidia-nvjitlink-
+f"nvidia-cuda-runtime-cu{cuda_major}",
+f"nvidia-cudnn-cu{cuda_major}",
+f"nvidia-cublas-cu{cuda_major}",
+f"nvidia-cufft-cu{cuda_major}",
+f"nvidia-curand-cu{cuda_major}",
+f"nvidia-cuda-nvrtc-cu{cuda_major}",
+f"nvidia-nvjitlink-cu{cuda_major}",
 ]
 for package in packages:
 directory_name = "nvidia" if package.startswith("nvidia-") else None
@@ -216,9 +255,9 @@ def print_debug_info():
 print(f"{package} not installed")
 
 if platform.system() == "Windows":
-print(f"\nEnvironment variable:\nPATH={os.environ
+print(f"\nEnvironment variable:\nPATH={os.environ.get('PATH', '(unset)')}")
 elif platform.system() == "Linux":
-print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ
+print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ.get('LD_LIBRARY_PATH', '(unset)')}")
 
 
 if importlib.util.find_spec("psutil"):
@@ -250,7 +289,7 @@ def print_debug_info():
 
 
 def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, directory=None):
-"""Preload CUDA 12.x and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
+"""Preload CUDA 12.x+ and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
 
 When the installed PyTorch is compatible (using same major version of CUDA and cuDNN),
 there is no need to call this function if `import torch` is done before `import onnxruntime`.
@@ -285,30 +324,53 @@ def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, direc
 print("Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.")
 print("It can be downloaded at https://aka.ms/vs/17/release/vc_redist.x64.exe.")
 
-if
-
-
-
-
-
-
-
+# Check if CUDA version is supported (12.x or 13.x+)
+ort_cuda_major = None
+if cuda_version:
+try:
+ort_cuda_major = int(cuda_version.split(".")[0])
+if ort_cuda_major < 12 and (cuda or cudnn):
+print(
+f"\033[33mWARNING: {package_name} is built with CUDA {cuda_version}, which is not supported for preloading. "
+f"CUDA 12.x or newer is required. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+)
+return
+except ValueError:
+print(
+f"\033[33mWARNING: Unable to parse CUDA version '{cuda_version}'. "
+"Skipping DLL preloading. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+)
+return
+elif cuda or cudnn:
+# No CUDA version info available but CUDA/cuDNN preloading requested
 return
 
 is_cuda_cudnn_imported_by_torch = False
 
 if is_windows:
 torch_version = _get_package_version("torch")
-
+# Check if torch CUDA version matches onnxruntime CUDA version
+torch_cuda_major = None
+if torch_version and "+cu" in torch_version:
+with contextlib.suppress(ValueError):
+# Extract CUDA version from torch (e.g., "2.0.0+cu121" -> 12)
+cu_part = torch_version.split("+cu")[1]
+torch_cuda_major = int(cu_part[:2]) # First 2 digits are major version
+
+is_torch_cuda_compatible = (
+torch_cuda_major == ort_cuda_major if (torch_cuda_major and ort_cuda_major) else False
+)
+
 if "torch" in sys.modules:
-is_cuda_cudnn_imported_by_torch =
-if
+is_cuda_cudnn_imported_by_torch = is_torch_cuda_compatible
+if torch_cuda_major and ort_cuda_major and torch_cuda_major != ort_cuda_major:
 print(
-f"\033[33mWARNING: The installed PyTorch {torch_version}
-f"
+f"\033[33mWARNING: The installed PyTorch {torch_version} uses CUDA {torch_cuda_major}.x, "
+f"but {package_name} is built with CUDA {ort_cuda_major}.x. "
+f"Please install PyTorch for CUDA {ort_cuda_major}.x to be compatible.\033[0m"
 )
 
-if
+if is_torch_cuda_compatible and directory is None:
 torch_root = _get_package_root("torch", "torch")
 if torch_root:
 directory = os.path.join(torch_root, "lib")
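Note: a minimal sketch of how the preloading and diagnostics entry points shown above are typically called from user code. On a build without a CUDA version in its build info (such as this DirectML wheel) the CUDA/cuDNN branches return early:

    import onnxruntime

    # Preload CUDA/cuDNN and MSVC runtime DLLs before the first InferenceSession is created.
    onnxruntime.preload_dlls(cuda=True, cudnn=True, msvc=True)

    # Print build info, related NVIDIA package versions, and the PATH/LD_LIBRARY_PATH used for loading.
    onnxruntime.print_debug_info()
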
onnxruntime/capi/DirectML.dll
CHANGED
Binary file

onnxruntime/capi/build_and_package_info.py
CHANGED
@@ -1,2 +1,2 @@
 package_name = 'onnxruntime-directml'
-__version__ = '1.
+__version__ = '1.24.1'

onnxruntime/capi/onnxruntime.dll
CHANGED
Binary file

onnxruntime/capi/onnxruntime_inference_collection.py
CHANGED
@@ -199,6 +199,18 @@ class Session:
 "Return the metadata. See :class:`onnxruntime.ModelMetadata`."
 return self._model_meta
 
+def get_input_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+"Return the memory info for the inputs."
+return self._input_meminfos
+
+def get_output_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+"Return the memory info for the outputs."
+return self._output_meminfos
+
+def get_input_epdevices(self) -> Sequence[onnxruntime.OrtEpDevice]:
+"Return the execution providers for the inputs."
+return self._input_epdevices
+
 def get_providers(self) -> Sequence[str]:
 "Return list of registered execution providers."
 return self._providers
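Note: a usage sketch for the new per-input/output metadata getters; the model path and provider list below are placeholders:

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])  # placeholder model
    for inp, mem_info in zip(sess.get_inputs(), sess.get_input_memory_infos()):
        print(inp.name, mem_info)           # memory location each input is expected to live in
    print(sess.get_output_memory_infos())   # same information for the outputs
    print(sess.get_input_epdevices())       # OrtEpDevice entries associated with the inputs
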
@@ -207,6 +219,15 @@ class Session:
 "Return registered execution providers' configurations."
 return self._provider_options
 
+def get_provider_graph_assignment_info(self) -> Sequence[onnxruntime.OrtEpAssignedSubgraph]:
+"""
+Get information about the subgraphs assigned to each execution provider and the nodes within.
+
+Application must enable the recording of graph assignment information by setting the session configuration
+for the key "session.record_ep_graph_assignment_info" to "1".
+"""
+return self._sess.get_provider_graph_assignment_info()
+
 def set_providers(self, providers=None, provider_options=None) -> None:
 """
 Register the input list of execution providers. The underlying session is re-created.
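Note: recording is opt-in, so a sketch of the session configuration required before the new getter returns anything (placeholder model path):

    import onnxruntime as ort

    so = ort.SessionOptions()
    so.add_session_config_entry("session.record_ep_graph_assignment_info", "1")

    sess = ort.InferenceSession("model.onnx", sess_options=so, providers=["CPUExecutionProvider"])
    for subgraph in sess.get_provider_graph_assignment_info():
        print(subgraph)  # OrtEpAssignedSubgraph describing which EP was assigned which nodes
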
@@ -385,6 +406,16 @@
 """
 self._sess.run_with_iobinding(iobinding._iobinding, run_options)
 
+def set_ep_dynamic_options(self, options: dict[str, str]):
+"""
+Set dynamic options for execution providers.
+
+:param options: Dictionary of key-value pairs where both keys and values are strings.
+These options will be passed to the execution providers to modify
+their runtime behavior.
+"""
+self._sess.set_ep_dynamic_options(options)
+
 def get_tuning_results(self):
 return self._sess.get_tuning_results()
 
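Note: a sketch of the new call, reusing the `sess` object from the previous sketch; the option key below is illustrative and the set of accepted keys depends on the loaded execution providers:

    # Adjust execution provider behavior at runtime (keys and values are provider-specific strings).
    sess.set_ep_dynamic_options({"ep.dynamic.workload_type": "Efficient"})
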
@@ -490,8 +521,25 @@ class InferenceSession(Session):
 def _create_inference_session(self, providers, provider_options, disabled_optimizers=None):
 available_providers = C.get_available_providers()
 
-#
-if
+# Validate that TensorrtExecutionProvider and NvTensorRTRTXExecutionProvider are not both specified
+if providers:
+has_tensorrt = any(
+provider == "TensorrtExecutionProvider"
+or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+for provider in providers
+)
+has_tensorrt_rtx = any(
+provider == "NvTensorRTRTXExecutionProvider"
+or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
+for provider in providers
+)
+if has_tensorrt and has_tensorrt_rtx:
+raise ValueError(
+"Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' "
+"in the same session."
+)
+# Tensorrt and TensorRT RTX can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
+if "NvTensorRTRTXExecutionProvider" in available_providers:
 if (
 providers
 and any(
@@ -500,15 +548,15 @@
 for provider in providers
 )
 and any(
-provider == "
-or (isinstance(provider, tuple) and provider[0] == "
+provider == "NvTensorRTRTXExecutionProvider"
+or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
 for provider in providers
 )
 ):
 self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
 else:
 self._fallback_providers = ["CPUExecutionProvider"]
-
+elif "TensorrtExecutionProvider" in available_providers:
 if (
 providers
 and any(
@@ -517,24 +565,14 @@
 for provider in providers
 )
 and any(
-provider == "
-or (isinstance(provider, tuple) and provider[0] == "
+provider == "TensorrtExecutionProvider"
+or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
 for provider in providers
 )
 ):
 self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
 else:
 self._fallback_providers = ["CPUExecutionProvider"]
-# MIGraphX can fall back to ROCM if it's explicitly assigned. All others fall back to CPU.
-elif "MIGraphXExecutionProvider" in available_providers:
-if providers and any(
-provider == "ROCMExecutionProvider"
-or (isinstance(provider, tuple) and provider[0] == "ROCMExecutionProvider")
-for provider in providers
-):
-self._fallback_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
-else:
-self._fallback_providers = ["CPUExecutionProvider"]
 else:
 self._fallback_providers = ["CPUExecutionProvider"]
 
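Note: a sketch of the effect of the new mutual-exclusion check (placeholder model path; the ValueError is raised by the Python wrapper shown above):

    import onnxruntime as ort

    providers = ["TensorrtExecutionProvider", "NvTensorRTRTXExecutionProvider", "CPUExecutionProvider"]
    try:
        ort.InferenceSession("model.onnx", providers=providers)  # placeholder model
    except ValueError as err:
        print(err)  # "Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' ..."
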
@@ -576,6 +614,9 @@ class InferenceSession(Session):
 self._inputs_meta = self._sess.inputs_meta
 self._outputs_meta = self._sess.outputs_meta
 self._overridable_initializers = self._sess.overridable_initializers
+self._input_meminfos = self._sess.input_meminfos
+self._output_meminfos = self._sess.output_meminfos
+self._input_epdevices = self._sess.input_epdevices
 self._model_meta = self._sess.model_meta
 self._providers = self._sess.get_providers()
 self._provider_options = self._sess.get_provider_options()
@@ -589,6 +630,9 @@ class InferenceSession(Session):
 self._inputs_meta = None
 self._outputs_meta = None
 self._overridable_initializers = None
+self._input_meminfos = None
+self._output_meminfos = None
+self._input_epdevices = None
 self._model_meta = None
 self._providers = None
 self._provider_options = None
@@ -1134,6 +1178,15 @@ class OrtValue:
 self._ortvalue.update_inplace(np_arr)
 
 
+def copy_tensors(src: Sequence[OrtValue], dst: Sequence[OrtValue], stream=None) -> None:
+"""
+Copy tensor data from source OrtValue sequence to destination OrtValue sequence.
+"""
+c_sources = [s._get_c_value() for s in src]
+c_dsts = [d._get_c_value() for d in dst]
+C.copy_tensors(c_sources, c_dsts, stream)
+
+
 class OrtDevice:
 """
 A data structure that exposes the underlying C++ OrtDevice
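Note: a minimal CPU-only sketch of the helper exported as onnxruntime.copy_tensors; the optional third argument is a stream for device copies:

    import numpy as np
    import onnxruntime as ort

    src = [ort.OrtValue.ortvalue_from_numpy(np.arange(4, dtype=np.float32))]
    dst = [ort.OrtValue.ortvalue_from_numpy(np.zeros(4, dtype=np.float32))]
    ort.copy_tensors(src, dst)   # copies element data from each source OrtValue to the matching destination
    print(dst[0].numpy())        # [0. 1. 2. 3.]
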
@@ -1146,6 +1199,7 @@ class OrtDevice:
 if isinstance(c_ort_device, C.OrtDevice):
 self._ort_device = c_ort_device
 else:
+# An end user won't hit this error
 raise ValueError(
 "`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`"
 )
@@ -1188,6 +1242,9 @@
 def device_vendor_id(self):
 return self._ort_device.vendor_id()
 
+def device_mem_type(self):
+return self._ort_device.mem_type()
+
 
 class SparseTensor:
 """

onnxruntime/capi/onnxruntime_providers_shared.dll
CHANGED
Binary file

onnxruntime/capi/onnxruntime_pybind11_state.pyd
CHANGED
Binary file

onnxruntime/capi/onnxruntime_validation.py
CHANGED
@@ -23,9 +23,9 @@ def check_distro_info():
 __my_distro__ = __my_system__
 __my_distro_ver__ = platform.release().lower()
 
-if __my_distro_ver__ not in ["10", "11"]:
+if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]:
 warnings.warn(
-f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above,
+f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above."
 )
 elif __my_system__ == "linux":
 """Although the 'platform' python module for getting Distro information works well on standard OS images
onnxruntime/quantization/calibrate.py
CHANGED
@@ -353,6 +353,14 @@ class MinMaxCalibrater(CalibraterBase):
 return opset_import.version
 raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
 
+def insert_nodes(tensor_name, new_nodes):
+index = next(
+(i for i, x in enumerate(self.model.graph.node) if tensor_name in x.input), len(self.model.graph.node)
+)
+for node in new_nodes:
+self.model.graph.node.insert(index, node)
+index += 1
+
 def add_reduce_min_max(tensor_name, reduce_op_name):
 # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
 # To make the code simple, we always let keepdims to be 1.
@@ -396,7 +404,7 @@ class MinMaxCalibrater(CalibraterBase):
 reduce_node.input.append(reduce_axes_name)
 self.model.graph.initializer.append(reduce_axes)
 
-
+insert_nodes(tensor_name, [reduce_node, reshape_node])
 self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
 for tensor in tensors:
@@ -417,7 +425,14 @@ class MinMaxCalibrater(CalibraterBase):
 inputs = data_reader.get_next()
 if not inputs:
 break
-self.intermediate_outputs.append(
+self.intermediate_outputs.append(
+[
+value if sess_o.name not in self.model_original_outputs else None
+for sess_o, value in zip(
+self.infer_session.get_outputs(), self.infer_session.run(None, inputs), strict=False
+)
+]
+)
 if (
 self.max_intermediate_outputs is not None
 and len(self.intermediate_outputs) == self.max_intermediate_outputs
onnxruntime/quantization/execution_providers/qnn/preprocess.py
CHANGED
@@ -6,15 +6,15 @@
 from __future__ import annotations
 
 import logging
+import tempfile
 from pathlib import Path
 
 import onnx
 
-from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
+from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed, optimize_model
 from ....tools.remove_initializer_from_input import remove_initializer_from_input
 from ...fusions import FusionGelu, FusionLayerNormalization
 from ...onnx_model import ONNXModel
-from ...quant_utils import save_and_reload_model_with_shape_infer
 from .fusion_lpnorm import FusionLpNormalization
 from .fusion_spacetodepth import FusionSpaceToDepth
 
@@ -93,7 +93,7 @@ def qnn_preprocess_model(
 """
 modified = False
 model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
-model =
+model = save_and_reload_optimize_model(model, shape_infer=True)
 onnx_model = ONNXModel(model)
 
 # Optionally, fix the dynamic input shapes.
@@ -178,6 +178,24 @@ def qnn_preprocess_model(
 return modified
 
 
+def save_and_reload_optimize_model(model: onnx.ModelProto, shape_infer: bool) -> onnx.ModelProto:
+with tempfile.TemporaryDirectory(prefix="ort.qnn_preproc.") as qnn_preproc_tmp_dir:
+model_in_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_input.onnx")
+onnx.save_model(model, model_in_path, save_as_external_data=True)
+if shape_infer:
+model_infer_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_infer.onnx")
+onnx.shape_inference.infer_shapes_path(str(model_in_path), str(model_infer_path))
+model_in_path = model_infer_path
+model_out_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_output.onnx")
+optimize_model(model_in_path, model_out_path)
+ret_model = onnx.load_model(model_out_path)
+ret_metaprops = {"onnx.infer": "onnxruntime.tools.qnn.preprocess"}
+if ret_model.metadata_props:
+ret_metaprops.update(ret_model.metadata_props)
+onnx.helper.set_model_props(ret_model, ret_metaprops)
+return ret_model
+
+
 class InputOutputNameMap:
 def __init__(
 self,
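Note: a usage sketch for the preprocessing entry point affected by this change (file names are placeholders); the offline optimizer pass now runs through a temporary directory as shown in the hunk above:

    from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model

    # Returns True if the graph was modified; the preprocessed model is written to the output path.
    modified = qnn_preprocess_model("model.onnx", "model.qnn_preprocessed.onnx", fuse_layernorm=True)
    print("graph modified:", modified)
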
onnxruntime/quantization/execution_providers/qnn/quant_config.py
CHANGED
@@ -331,23 +331,6 @@ class QnnCompatibilityOverrides:
 
 if not self.per_channel:
 self._make_static_inputs_use_default_weight_type(node)
-return
-
-has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
-has_bias_no_overrides = (
-len(node.input) > 2
-and node.input[2]
-and node.input[2] in self.initializers
-and node.input[2] not in self.overrides
-)
-
-if has_weight_no_overrides or has_bias_no_overrides:
-# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
-# tries to makes it per-channel if the weight is also per-channel.
-raise ValueError(
-"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
-" Please try using custom overrides that make bias per-tensor quantized."
-)
 
 def _process_sigmoid(self, node: onnx.NodeProto):
 """
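Note: with the restriction above removed, a global per_channel configuration is accepted for models containing LayerNormalization. A sketch (the data reader below is a stub and the model path is a placeholder; a real CalibrationDataReader must yield actual calibration inputs):

    from onnxruntime.quantization import CalibrationDataReader
    from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config

    class StubDataReader(CalibrationDataReader):
        def get_next(self):
            return None  # placeholder; supply real calibration batches here

    qdq_config = get_qnn_qdq_config("model.onnx", StubDataReader(), per_channel=True)
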
onnxruntime/quantization/fusions/fusion_layernorm.py
CHANGED
@@ -33,6 +33,16 @@ class FusionLayerNormalization(Fusion):
 | |
 +-------------------------------------------------+
 
+Or, using Mul instead of Pow:
+
++----------------------+
+| |
+| v
+[Root] --> ReduceMean --> Sub --> Mul --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+(axis=2 or -1) | (in0=in1) (axis=2 or -1) (E-6 or E-12 or 0) ^
+| |
++-------------------------------------------------+
+
 It also handles cases of duplicated sub nodes exported from older version of PyTorch:
 
 +----------------------+
@@ -40,7 +50,7 @@ class FusionLayerNormalization(Fusion):
 | +-------> Sub-----------------------------------------------+
 | | |
 | | v
-[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+[Root] --> ReduceMean --> Sub --> (Pow or Mul) --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
 | ^
 | |
 +----------------------+
@@ -70,10 +80,9 @@ class FusionLayerNormalization(Fusion):
 div_node,
 [
 (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
-(
-
-
-),
+(["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
+(["Sqrt", "Add", "ReduceMean", "Mul", "Sub"], [1, 0, 0, 0, 0]),
+(["Sqrt", "Add", "ReduceMean", "Mul", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
 ],
 output_name_to_node,
 )
@@ -90,8 +99,10 @@ class FusionLayerNormalization(Fusion):
 # Skip fusion since epsilon value is not expected.
 return
 
-
-if self.find_constant_input(
+pow_or_mul_node = parent_nodes[3]
+if pow_or_mul_node.op_type == "Pow" and self.find_constant_input(pow_or_mul_node, 2.0) != 1:
+return
+elif pow_or_mul_node.op_type == "Mul" and pow_or_mul_node.input[0] != pow_or_mul_node.input[1]:
 return
 
 mul_node = input_name_to_nodes[div_node.output[0]][0]