onnxruntime-directml 1.23.0__cp313-cp313-win_amd64.whl → 1.24.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. onnxruntime/ThirdPartyNotices.txt +0 -35
  2. onnxruntime/__init__.py +96 -34
  3. onnxruntime/capi/DirectML.dll +0 -0
  4. onnxruntime/capi/build_and_package_info.py +1 -1
  5. onnxruntime/capi/onnxruntime.dll +0 -0
  6. onnxruntime/capi/onnxruntime_inference_collection.py +74 -17
  7. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  8. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  9. onnxruntime/capi/onnxruntime_validation.py +2 -2
  10. onnxruntime/quantization/calibrate.py +17 -2
  11. onnxruntime/quantization/execution_providers/qnn/preprocess.py +21 -3
  12. onnxruntime/quantization/execution_providers/qnn/quant_config.py +0 -17
  13. onnxruntime/quantization/fusions/fusion_layernorm.py +18 -7
  14. onnxruntime/quantization/matmul_nbits_quantizer.py +32 -12
  15. onnxruntime/quantization/qdq_quantizer.py +0 -1
  16. onnxruntime/quantization/quant_utils.py +12 -27
  17. onnxruntime/quantization/registry.py +1 -0
  18. onnxruntime/quantization/shape_inference.py +13 -18
  19. onnxruntime/quantization/static_quantize_runner.py +1 -1
  20. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +3 -0
  21. onnxruntime/transformers/benchmark.py +1 -4
  22. onnxruntime/transformers/benchmark_helper.py +6 -10
  23. onnxruntime/transformers/bert_perf_test.py +0 -6
  24. onnxruntime/transformers/convert_to_packing_mode.py +4 -5
  25. onnxruntime/transformers/fusion_attention_clip.py +0 -1
  26. onnxruntime/transformers/fusion_base.py +2 -2
  27. onnxruntime/transformers/fusion_utils.py +9 -5
  28. onnxruntime/transformers/io_binding_helper.py +60 -21
  29. onnxruntime/transformers/machine_info.py +8 -6
  30. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +10 -2
  31. onnxruntime/transformers/models/llama/benchmark.py +1 -4
  32. onnxruntime/transformers/models/llama/benchmark_all.py +1 -1
  33. onnxruntime/transformers/models/llama/convert_to_onnx.py +11 -1
  34. onnxruntime/transformers/models/llama/llama_parity.py +1 -1
  35. onnxruntime/transformers/models/longformer/benchmark_longformer.py +1 -1
  36. onnxruntime/transformers/models/longformer/convert_to_onnx.py +1 -1
  37. onnxruntime/transformers/models/phi2/convert_to_onnx.py +8 -0
  38. onnxruntime/transformers/models/stable_diffusion/benchmark.py +5 -8
  39. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +3 -2
  40. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +3 -2
  41. onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +8 -2
  42. onnxruntime/transformers/models/whisper/benchmark.py +3 -28
  43. onnxruntime/transformers/models/whisper/benchmark_all.py +2 -2
  44. onnxruntime/transformers/models/whisper/convert_to_onnx.py +75 -39
  45. onnxruntime/transformers/models/whisper/whisper_chain.py +10 -7
  46. onnxruntime/transformers/models/whisper/whisper_helper.py +1 -1
  47. onnxruntime/transformers/optimizer.py +5 -10
  48. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/METADATA +7 -3
  49. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/RECORD +52 -52
  50. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/WHEEL +1 -1
  51. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/entry_points.txt +0 -0
  52. {onnxruntime_directml-1.23.0.dist-info → onnxruntime_directml-1.24.1.dist-info}/top_level.txt +0 -0
onnxruntime/ThirdPartyNotices.txt CHANGED
@@ -5806,41 +5806,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  _____
 
- composable_kernel
-
- https://github.com/ROCmSoftwarePlatform/composable_kernel
-
- Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
- Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
- Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
- Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
- Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
- Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
- Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
-
- SPDX-License-Identifier: MIT
- Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- _____
-
  neural-speed
 
  https://github.com/intel/neural-speed
onnxruntime/__init__.py CHANGED
@@ -8,7 +8,9 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
  or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
  """
 
- __version__ = "1.23.0"
+ import contextlib
+
+ __version__ = "1.24.1"
  __author__ = "Microsoft"
 
  # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
@@ -31,14 +33,19 @@ try:
  OrtAllocatorType, # noqa: F401
  OrtArenaCfg, # noqa: F401
  OrtCompileApiFlags, # noqa: F401
+ OrtDeviceMemoryType, # noqa: F401
+ OrtEpAssignedNode, # noqa: F401
+ OrtEpAssignedSubgraph, # noqa: F401
  OrtEpDevice, # noqa: F401
  OrtExecutionProviderDevicePolicy, # noqa: F401
  OrtExternalInitializerInfo, # noqa: F401
  OrtHardwareDevice, # noqa: F401
  OrtHardwareDeviceType, # noqa: F401
  OrtMemoryInfo, # noqa: F401
+ OrtMemoryInfoDeviceType, # noqa: F401
  OrtMemType, # noqa: F401
  OrtSparseFormat, # noqa: F401
+ OrtSyncStream, # noqa: F401
  RunOptions, # noqa: F401
  SessionIOBinding, # noqa: F401
  SessionOptions, # noqa: F401
@@ -78,6 +85,7 @@ from onnxruntime.capi.onnxruntime_inference_collection import (
  OrtDevice, # noqa: F401
  OrtValue, # noqa: F401
  SparseTensor, # noqa: F401
+ copy_tensors, # noqa: F401
  )
 
  # TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
@@ -129,14 +137,43 @@ def _get_package_root(package_name: str, directory_name: str | None = None):
  return None
 
 
+ def _extract_cuda_major_version(version_str: str) -> str:
+ """Extract CUDA major version from version string (e.g., '12.1' -> '12').
+
+ Args:
+ version_str: CUDA version string to parse
+
+ Returns:
+ Major version as string, or "12" if parsing fails
+ """
+ return version_str.split(".")[0] if version_str else "12"
+
+
+ def _get_cufft_version(cuda_major: str) -> str:
+ """Get cufft library version based on CUDA major version.
+
+ Args:
+ cuda_major: CUDA major version as string (e.g., "12", "13")
+
+ Returns:
+ cufft version as string
+ """
+ # cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12
+ return "12" if cuda_major == "13" else "11"
+
+
  def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
+ # Dynamically determine CUDA major version from build info
+ cuda_major_version = _extract_cuda_major_version(cuda_version)
+ cufft_version = _get_cufft_version(cuda_major_version)
+
  if is_windows:
  # Path is relative to site-packages directory.
  cuda_dll_paths = [
- ("nvidia", "cublas", "bin", "cublasLt64_12.dll"),
- ("nvidia", "cublas", "bin", "cublas64_12.dll"),
- ("nvidia", "cufft", "bin", "cufft64_11.dll"),
- ("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"),
+ ("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"),
+ ("nvidia", "cublas", "bin", f"cublas64_{cuda_major_version}.dll"),
+ ("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"),
+ ("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"),
  ]
  cudnn_dll_paths = [
  ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
@@ -150,12 +187,12 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru
  else: # Linux
  # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
  cuda_dll_paths = [
- ("nvidia", "cublas", "lib", "libcublasLt.so.12"),
- ("nvidia", "cublas", "lib", "libcublas.so.12"),
- ("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"),
+ ("nvidia", "cublas", "lib", f"libcublasLt.so.{cuda_major_version}"),
+ ("nvidia", "cublas", "lib", f"libcublas.so.{cuda_major_version}"),
+ ("nvidia", "cuda_nvrtc", "lib", f"libnvrtc.so.{cuda_major_version}"),
  ("nvidia", "curand", "lib", "libcurand.so.10"),
- ("nvidia", "cufft", "lib", "libcufft.so.11"),
- ("nvidia", "cuda_runtime", "lib", "libcudart.so.12"),
+ ("nvidia", "cufft", "lib", f"libcufft.so.{cufft_version}"),
+ ("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"),
  ]
 
  # Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
@@ -197,15 +234,17 @@ def print_debug_info():
 
  if cuda_version:
  # Print version of installed packages that is related to CUDA or cuDNN DLLs.
+ cuda_major = _extract_cuda_major_version(cuda_version)
+
  packages = [
  "torch",
- "nvidia-cuda-runtime-cu12",
- "nvidia-cudnn-cu12",
- "nvidia-cublas-cu12",
- "nvidia-cufft-cu12",
- "nvidia-curand-cu12",
- "nvidia-cuda-nvrtc-cu12",
- "nvidia-nvjitlink-cu12",
+ f"nvidia-cuda-runtime-cu{cuda_major}",
+ f"nvidia-cudnn-cu{cuda_major}",
+ f"nvidia-cublas-cu{cuda_major}",
+ f"nvidia-cufft-cu{cuda_major}",
+ f"nvidia-curand-cu{cuda_major}",
+ f"nvidia-cuda-nvrtc-cu{cuda_major}",
+ f"nvidia-nvjitlink-cu{cuda_major}",
  ]
  for package in packages:
  directory_name = "nvidia" if package.startswith("nvidia-") else None
@@ -216,9 +255,9 @@ def print_debug_info():
  print(f"{package} not installed")
 
  if platform.system() == "Windows":
- print(f"\nEnvironment variable:\nPATH={os.environ['PATH']}")
+ print(f"\nEnvironment variable:\nPATH={os.environ.get('PATH', '(unset)')}")
  elif platform.system() == "Linux":
- print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ['LD_LIBRARY_PATH']}")
+ print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ.get('LD_LIBRARY_PATH', '(unset)')}")
 
  if importlib.util.find_spec("psutil"):
 
@@ -250,7 +289,7 @@ def print_debug_info():
 
 
  def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, directory=None):
- """Preload CUDA 12.x and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
+ """Preload CUDA 12.x+ and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
 
  When the installed PyTorch is compatible (using same major version of CUDA and cuDNN),
  there is no need to call this function if `import torch` is done before `import onnxruntime`.
@@ -285,30 +324,53 @@ def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, direc
  print("Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.")
  print("It can be downloaded at https://aka.ms/vs/17/release/vc_redist.x64.exe.")
 
- if not (cuda_version and cuda_version.startswith("12.")) and (cuda or cudnn):
- print(
- f"\033[33mWARNING: {package_name} is not built with CUDA 12.x support. "
- "Please install a version that supports CUDA 12.x, or call preload_dlls with cuda=False and cudnn=False.\033[0m"
- )
- return
-
- if not (cuda_version and cuda_version.startswith("12.") and (cuda or cudnn)):
+ # Check if CUDA version is supported (12.x or 13.x+)
+ ort_cuda_major = None
+ if cuda_version:
+ try:
+ ort_cuda_major = int(cuda_version.split(".")[0])
+ if ort_cuda_major < 12 and (cuda or cudnn):
+ print(
+ f"\033[33mWARNING: {package_name} is built with CUDA {cuda_version}, which is not supported for preloading. "
+ f"CUDA 12.x or newer is required. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+ )
+ return
+ except ValueError:
+ print(
+ f"\033[33mWARNING: Unable to parse CUDA version '{cuda_version}'. "
+ "Skipping DLL preloading. Call preload_dlls with cuda=False and cudnn=False.\033[0m"
+ )
+ return
+ elif cuda or cudnn:
+ # No CUDA version info available but CUDA/cuDNN preloading requested
  return
 
  is_cuda_cudnn_imported_by_torch = False
 
  if is_windows:
  torch_version = _get_package_version("torch")
- is_torch_for_cuda_12 = torch_version and "+cu12" in torch_version
+ # Check if torch CUDA version matches onnxruntime CUDA version
+ torch_cuda_major = None
+ if torch_version and "+cu" in torch_version:
+ with contextlib.suppress(ValueError):
+ # Extract CUDA version from torch (e.g., "2.0.0+cu121" -> 12)
+ cu_part = torch_version.split("+cu")[1]
+ torch_cuda_major = int(cu_part[:2]) # First 2 digits are major version
+
+ is_torch_cuda_compatible = (
+ torch_cuda_major == ort_cuda_major if (torch_cuda_major and ort_cuda_major) else False
+ )
+
  if "torch" in sys.modules:
- is_cuda_cudnn_imported_by_torch = is_torch_for_cuda_12
- if (torch_version and "+cu" in torch_version) and not is_torch_for_cuda_12:
+ is_cuda_cudnn_imported_by_torch = is_torch_cuda_compatible
+ if torch_cuda_major and ort_cuda_major and torch_cuda_major != ort_cuda_major:
  print(
- f"\033[33mWARNING: The installed PyTorch {torch_version} does not support CUDA 12.x. "
- f"Please install PyTorch for CUDA 12.x to be compatible with {package_name}.\033[0m"
+ f"\033[33mWARNING: The installed PyTorch {torch_version} uses CUDA {torch_cuda_major}.x, "
+ f"but {package_name} is built with CUDA {ort_cuda_major}.x. "
+ f"Please install PyTorch for CUDA {ort_cuda_major}.x to be compatible.\033[0m"
  )
 
- if is_torch_for_cuda_12 and directory is None:
+ if is_torch_cuda_compatible and directory is None:
  torch_root = _get_package_root("torch", "torch")
  if torch_root:
  directory = os.path.join(torch_root, "lib")
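Taken together, the __init__.py changes stop hard-coding CUDA 12 library names: the CUDA major version is parsed from the package's build info and mapped to the matching cuFFT major before the DLL/.so names are built. A minimal standalone sketch of that mapping (illustrative helper names, not the package's internals):

    def cuda_major(version: str) -> str:
        # "12.1" -> "12"; the package falls back to "12" when build info is empty
        return version.split(".")[0] if version else "12"

    def cufft_major(major: str) -> str:
        # cuFFT ships as major 11 for CUDA 12.x and major 12 for CUDA 13.x
        return "12" if major == "13" else "11"

    m = cuda_major("13.0")
    print(f"cudart64_{m}.dll", f"cufft64_{cufft_major(m)}.dll")  # cudart64_13.dll cufft64_12.dll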
Binary file
onnxruntime/capi/build_and_package_info.py CHANGED
@@ -1,2 +1,2 @@
  package_name = 'onnxruntime-directml'
- __version__ = '1.23.0'
+ __version__ = '1.24.1'
Binary file
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -199,6 +199,18 @@ class Session:
  "Return the metadata. See :class:`onnxruntime.ModelMetadata`."
  return self._model_meta
 
+ def get_input_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+ "Return the memory info for the inputs."
+ return self._input_meminfos
+
+ def get_output_memory_infos(self) -> Sequence[onnxruntime.MemoryInfo]:
+ "Return the memory info for the outputs."
+ return self._output_meminfos
+
+ def get_input_epdevices(self) -> Sequence[onnxruntime.OrtEpDevice]:
+ "Return the execution providers for the inputs."
+ return self._input_epdevices
+
  def get_providers(self) -> Sequence[str]:
  "Return list of registered execution providers."
  return self._providers
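The three new getters expose where each model input and output is expected to live, plus the EP device associated with each input. A hedged usage sketch (assumes a local model.onnx; the exact repr of the returned objects depends on the build):

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    for inp, mem in zip(sess.get_inputs(), sess.get_input_memory_infos()):
        print(inp.name, mem)               # memory location the session expects for this input
    print(sess.get_output_memory_infos())  # same information for the outputs
    print(sess.get_input_epdevices())      # OrtEpDevice associated with each input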
@@ -207,6 +219,15 @@ class Session:
  "Return registered execution providers' configurations."
  return self._provider_options
 
+ def get_provider_graph_assignment_info(self) -> Sequence[onnxruntime.OrtEpAssignedSubgraph]:
+ """
+ Get information about the subgraphs assigned to each execution provider and the nodes within.
+
+ Application must enable the recording of graph assignment information by setting the session configuration
+ for the key "session.record_ep_graph_assignment_info" to "1".
+ """
+ return self._sess.get_provider_graph_assignment_info()
+
  def set_providers(self, providers=None, provider_options=None) -> None:
  """
  Register the input list of execution providers. The underlying session is re-created.
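Per the docstring, the assignment data is only recorded when the session config key "session.record_ep_graph_assignment_info" is set to "1" before the session is created. A sketch, assuming a local model.onnx:

    import onnxruntime as ort

    so = ort.SessionOptions()
    so.add_session_config_entry("session.record_ep_graph_assignment_info", "1")
    sess = ort.InferenceSession("model.onnx", sess_options=so, providers=["CPUExecutionProvider"])
    for subgraph in sess.get_provider_graph_assignment_info():
        print(subgraph)  # one OrtEpAssignedSubgraph per partition, with its assigned nodes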
@@ -385,6 +406,16 @@ class Session:
  """
  self._sess.run_with_iobinding(iobinding._iobinding, run_options)
 
+ def set_ep_dynamic_options(self, options: dict[str, str]):
+ """
+ Set dynamic options for execution providers.
+
+ :param options: Dictionary of key-value pairs where both keys and values are strings.
+ These options will be passed to the execution providers to modify
+ their runtime behavior.
+ """
+ self._sess.set_ep_dynamic_options(options)
+
  def get_tuning_results(self):
  return self._sess.get_tuning_results()
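The accepted keys and values are execution-provider specific, so the pair below is a placeholder rather than a documented option; consult the target EP's documentation for the real set. Sketch only:

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    # Keys and values must both be strings; they are forwarded to the registered EPs at runtime.
    sess.set_ep_dynamic_options({"example.option": "value"})  # placeholder key/value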
 
@@ -490,8 +521,25 @@ class InferenceSession(Session):
  def _create_inference_session(self, providers, provider_options, disabled_optimizers=None):
  available_providers = C.get_available_providers()
 
- # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
- if "TensorrtExecutionProvider" in available_providers:
+ # Validate that TensorrtExecutionProvider and NvTensorRTRTXExecutionProvider are not both specified
+ if providers:
+ has_tensorrt = any(
+ provider == "TensorrtExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+ for provider in providers
+ )
+ has_tensorrt_rtx = any(
+ provider == "NvTensorRTRTXExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
+ for provider in providers
+ )
+ if has_tensorrt and has_tensorrt_rtx:
+ raise ValueError(
+ "Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' "
+ "in the same session."
+ )
+ # Tensorrt and TensorRT RTX can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
+ if "NvTensorRTRTXExecutionProvider" in available_providers:
  if (
  providers
  and any(
@@ -500,15 +548,15 @@ class InferenceSession(Session):
  for provider in providers
  )
  and any(
- provider == "TensorrtExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+ provider == "NvTensorRTRTXExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
  for provider in providers
  )
  ):
  self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
- if "NvTensorRTRTXExecutionProvider" in available_providers:
+ elif "TensorrtExecutionProvider" in available_providers:
  if (
  providers
  and any(
@@ -517,24 +565,14 @@ class InferenceSession(Session):
  for provider in providers
  )
  and any(
- provider == "NvTensorRTRTXExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "NvExecutionProvider")
+ provider == "TensorrtExecutionProvider"
+ or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
  for provider in providers
  )
  ):
  self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
- # MIGraphX can fall back to ROCM if it's explicitly assigned. All others fall back to CPU.
- elif "MIGraphXExecutionProvider" in available_providers:
- if providers and any(
- provider == "ROCMExecutionProvider"
- or (isinstance(provider, tuple) and provider[0] == "ROCMExecutionProvider")
- for provider in providers
- ):
- self._fallback_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
- else:
- self._fallback_providers = ["CPUExecutionProvider"]
  else:
  self._fallback_providers = ["CPUExecutionProvider"]
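With the validation above, asking for both TensorRT execution providers in one session now fails fast with a ValueError instead of silently preferring one. Illustrative only (assumes a local model.onnx and a build where session creation reaches this check):

    import onnxruntime as ort

    try:
        ort.InferenceSession(
            "model.onnx",
            providers=["TensorrtExecutionProvider", "NvTensorRTRTXExecutionProvider"],
        )
    except ValueError as err:
        print(err)  # Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' ...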
 
@@ -576,6 +614,9 @@ class InferenceSession(Session):
  self._inputs_meta = self._sess.inputs_meta
  self._outputs_meta = self._sess.outputs_meta
  self._overridable_initializers = self._sess.overridable_initializers
+ self._input_meminfos = self._sess.input_meminfos
+ self._output_meminfos = self._sess.output_meminfos
+ self._input_epdevices = self._sess.input_epdevices
  self._model_meta = self._sess.model_meta
  self._providers = self._sess.get_providers()
  self._provider_options = self._sess.get_provider_options()
@@ -589,6 +630,9 @@ class InferenceSession(Session):
  self._inputs_meta = None
  self._outputs_meta = None
  self._overridable_initializers = None
+ self._input_meminfos = None
+ self._output_meminfos = None
+ self._input_epdevices = None
  self._model_meta = None
  self._providers = None
  self._provider_options = None
@@ -1134,6 +1178,15 @@ class OrtValue:
  self._ortvalue.update_inplace(np_arr)
 
 
+ def copy_tensors(src: Sequence[OrtValue], dst: Sequence[OrtValue], stream=None) -> None:
+ """
+ Copy tensor data from source OrtValue sequence to destination OrtValue sequence.
+ """
+ c_sources = [s._get_c_value() for s in src]
+ c_dsts = [d._get_c_value() for d in dst]
+ C.copy_tensors(c_sources, c_dsts, stream)
+
+
  class OrtDevice:
  """
  A data structure that exposes the underlying C++ OrtDevice
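copy_tensors is also re-exported from the top-level package (see the copy_tensors import added in __init__.py above). A minimal CPU-to-CPU sketch; passing a stream only makes sense for device-side copies:

    import numpy as np
    import onnxruntime as ort

    src = [ort.OrtValue.ortvalue_from_numpy(np.arange(4, dtype=np.float32))]
    dst = [ort.OrtValue.ortvalue_from_numpy(np.zeros(4, dtype=np.float32))]
    ort.copy_tensors(src, dst)    # stream=None -> synchronous copy
    print(dst[0].numpy())         # [0. 1. 2. 3.]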
@@ -1146,6 +1199,7 @@ class OrtDevice:
  if isinstance(c_ort_device, C.OrtDevice):
  self._ort_device = c_ort_device
  else:
+ # An end user won't hit this error
  raise ValueError(
  "`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`"
  )
@@ -1188,6 +1242,9 @@ class OrtDevice:
  def device_vendor_id(self):
  return self._ort_device.vendor_id()
 
+ def device_mem_type(self):
+ return self._ort_device.mem_type()
+
 
  class SparseTensor:
  """
onnxruntime/capi/onnxruntime_validation.py CHANGED
@@ -23,9 +23,9 @@ def check_distro_info():
  __my_distro__ = __my_system__
  __my_distro_ver__ = platform.release().lower()
 
- if __my_distro_ver__ not in ["10", "11"]:
+ if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]:
  warnings.warn(
- f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
+ f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above."
  )
  elif __my_system__ == "linux":
  """Although the 'platform' python module for getting Distro information works well on standard OS images
onnxruntime/quantization/calibrate.py CHANGED
@@ -353,6 +353,14 @@ class MinMaxCalibrater(CalibraterBase):
  return opset_import.version
  raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
 
+ def insert_nodes(tensor_name, new_nodes):
+ index = next(
+ (i for i, x in enumerate(self.model.graph.node) if tensor_name in x.input), len(self.model.graph.node)
+ )
+ for node in new_nodes:
+ self.model.graph.node.insert(index, node)
+ index += 1
+
  def add_reduce_min_max(tensor_name, reduce_op_name):
  # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
  # To make the code simple, we always let keepdims to be 1.
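The new insert_nodes helper places the augmentation nodes immediately before the first consumer of the tensor instead of appending them at the end of the graph, which keeps the node list topologically ordered. The same insertion pattern in isolation, over a plain list of stand-in nodes (hypothetical names, not the calibrator's code):

    from dataclasses import dataclass, field

    @dataclass
    class Node:
        name: str
        input: list = field(default_factory=list)

    def insert_before_first_consumer(nodes, tensor_name, new_nodes):
        # Index of the first node consuming tensor_name; fall back to the end of the list.
        index = next((i for i, n in enumerate(nodes) if tensor_name in n.input), len(nodes))
        for n in new_nodes:
            nodes.insert(index, n)
            index += 1

    graph = [Node("a", ["x"]), Node("b", ["y"]), Node("c", ["y"])]
    insert_before_first_consumer(graph, "y", [Node("y_ReduceMin", ["y"])])
    print([n.name for n in graph])  # ['a', 'y_ReduceMin', 'b', 'c']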
@@ -396,7 +404,7 @@ class MinMaxCalibrater(CalibraterBase):
  reduce_node.input.append(reduce_axes_name)
  self.model.graph.initializer.append(reduce_axes)
 
- self.model.graph.node.extend([reduce_node, reshape_node])
+ insert_nodes(tensor_name, [reduce_node, reshape_node])
  self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
  for tensor in tensors:
@@ -417,7 +425,14 @@ class MinMaxCalibrater(CalibraterBase):
  inputs = data_reader.get_next()
  if not inputs:
  break
- self.intermediate_outputs.append(self.infer_session.run(None, inputs))
+ self.intermediate_outputs.append(
+ [
+ value if sess_o.name not in self.model_original_outputs else None
+ for sess_o, value in zip(
+ self.infer_session.get_outputs(), self.infer_session.run(None, inputs), strict=False
+ )
+ ]
+ )
  if (
  self.max_intermediate_outputs is not None
  and len(self.intermediate_outputs) == self.max_intermediate_outputs
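The collection loop now zips the session outputs with the run results and stores None for the model's original outputs, keeping only the augmented ReduceMin/ReduceMax values and trimming memory during calibration. The filtering pattern on its own (hypothetical names as stand-ins):

    original_outputs = {"logits"}
    output_names = ["logits", "conv1_output_ReduceMin", "conv1_output_ReduceMax"]
    run_values = ["<large tensor>", 0.01, 0.98]  # stand-ins for session.run results
    kept = [v if name not in original_outputs else None for name, v in zip(output_names, run_values)]
    print(kept)  # [None, 0.01, 0.98]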
onnxruntime/quantization/execution_providers/qnn/preprocess.py CHANGED
@@ -6,15 +6,15 @@
  from __future__ import annotations
 
  import logging
+ import tempfile
  from pathlib import Path
 
  import onnx
 
- from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
+ from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed, optimize_model
  from ....tools.remove_initializer_from_input import remove_initializer_from_input
  from ...fusions import FusionGelu, FusionLayerNormalization
  from ...onnx_model import ONNXModel
- from ...quant_utils import save_and_reload_model_with_shape_infer
  from .fusion_lpnorm import FusionLpNormalization
  from .fusion_spacetodepth import FusionSpaceToDepth
 
@@ -93,7 +93,7 @@ def qnn_preprocess_model(
  """
  modified = False
  model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
- model = save_and_reload_model_with_shape_infer(model)
+ model = save_and_reload_optimize_model(model, shape_infer=True)
  onnx_model = ONNXModel(model)
 
  # Optionally, fix the dynamic input shapes.
@@ -178,6 +178,24 @@ def qnn_preprocess_model(
  return modified
 
 
+ def save_and_reload_optimize_model(model: onnx.ModelProto, shape_infer: bool) -> onnx.ModelProto:
+ with tempfile.TemporaryDirectory(prefix="ort.qnn_preproc.") as qnn_preproc_tmp_dir:
+ model_in_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_input.onnx")
+ onnx.save_model(model, model_in_path, save_as_external_data=True)
+ if shape_infer:
+ model_infer_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_infer.onnx")
+ onnx.shape_inference.infer_shapes_path(str(model_in_path), str(model_infer_path))
+ model_in_path = model_infer_path
+ model_out_path = Path(qnn_preproc_tmp_dir).joinpath("qnn_proc_output.onnx")
+ optimize_model(model_in_path, model_out_path)
+ ret_model = onnx.load_model(model_out_path)
+ ret_metaprops = {"onnx.infer": "onnxruntime.tools.qnn.preprocess"}
+ if ret_model.metadata_props:
+ ret_metaprops.update(ret_model.metadata_props)
+ onnx.helper.set_model_props(ret_model, ret_metaprops)
+ return ret_model
+
+
  class InputOutputNameMap:
  def __init__(
  self,
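For context, this helper is invoked from qnn_preprocess_model, which now round-trips the model through a temporary directory for shape inference plus ONNX Runtime's basic optimization before the QNN-oriented fusions run. Hedged usage, assuming local model paths:

    from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model

    # Returns True when preprocessing changed the model; the output file is written in that case.
    modified = qnn_preprocess_model("model.onnx", "model.qnn_preprocessed.onnx")
    print("preprocessed model saved" if modified else "no changes were needed")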
onnxruntime/quantization/execution_providers/qnn/quant_config.py CHANGED
@@ -331,23 +331,6 @@ class QnnCompatibilityOverrides:
 
  if not self.per_channel:
  self._make_static_inputs_use_default_weight_type(node)
- return
-
- has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
- has_bias_no_overrides = (
- len(node.input) > 2
- and node.input[2]
- and node.input[2] in self.initializers
- and node.input[2] not in self.overrides
- )
-
- if has_weight_no_overrides or has_bias_no_overrides:
- # TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
- # tries to makes it per-channel if the weight is also per-channel.
- raise ValueError(
- "get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
- " Please try using custom overrides that make bias per-tensor quantized."
- )
 
  def _process_sigmoid(self, node: onnx.NodeProto):
  """
onnxruntime/quantization/fusions/fusion_layernorm.py CHANGED
@@ -33,6 +33,16 @@ class FusionLayerNormalization(Fusion):
  | |
  +-------------------------------------------------+
 
+ Or, using Mul instead of Pow:
+
+ +----------------------+
+ | |
+ | v
+ [Root] --> ReduceMean --> Sub --> Mul --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+ (axis=2 or -1) | (in0=in1) (axis=2 or -1) (E-6 or E-12 or 0) ^
+ | |
+ +-------------------------------------------------+
+
  It also handles cases of duplicated sub nodes exported from older version of PyTorch:
 
  +----------------------+
@@ -40,7 +50,7 @@ class FusionLayerNormalization(Fusion):
  | +-------> Sub-----------------------------------------------+
  | | |
  | | v
- [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+ [Root] --> ReduceMean --> Sub --> (Pow or Mul) --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
  | ^
  | |
  +----------------------+
@@ -70,10 +80,9 @@ class FusionLayerNormalization(Fusion):
  div_node,
  [
  (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
- (
- ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
- [1, 0, 0, 0, 0, 0],
- ),
+ (["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
+ (["Sqrt", "Add", "ReduceMean", "Mul", "Sub"], [1, 0, 0, 0, 0]),
+ (["Sqrt", "Add", "ReduceMean", "Mul", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
  ],
  output_name_to_node,
  )
@@ -90,8 +99,10 @@ class FusionLayerNormalization(Fusion):
  # Skip fusion since epsilon value is not expected.
  return
 
- pow_node = parent_nodes[3]
- if self.find_constant_input(pow_node, 2.0) != 1:
+ pow_or_mul_node = parent_nodes[3]
+ if pow_or_mul_node.op_type == "Pow" and self.find_constant_input(pow_or_mul_node, 2.0) != 1:
+ return
+ elif pow_or_mul_node.op_type == "Mul" and pow_or_mul_node.input[0] != pow_or_mul_node.input[1]:
  return
 
  mul_node = input_name_to_nodes[div_node.output[0]][0]
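The fusion now accepts Mul(x, x) wherever it previously required Pow(x, 2), since some exporters emit the square of the centered input as a self-multiplication. A quick numpy check of the equivalence the new pattern relies on (illustrative, not part of the fusion code):

    import numpy as np

    x = np.random.rand(2, 4, 8).astype(np.float32)
    d = x - x.mean(axis=-1, keepdims=True)
    var_pow = np.mean(d ** 2, axis=-1, keepdims=True)  # Pow(Sub, 2) branch
    var_mul = np.mean(d * d, axis=-1, keepdims=True)   # Mul(Sub, Sub) branch
    assert np.allclose(var_pow, var_mul)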