onnxruntime-directml 1.17.1-cp39-cp39-win_amd64.whl → 1.17.3-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime/__init__.py +1 -1
- onnxruntime/capi/DirectML.dll +0 -0
- onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
- onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
- onnxruntime/capi/onnxruntime_validation.py +1 -1
- onnxruntime/transformers/convert_generation.py +143 -11
- onnxruntime/transformers/models/llama/benchmark.py +20 -18
- onnxruntime/transformers/models/llama/benchmark_all.py +22 -0
- onnxruntime/transformers/models/llama/benchmark_e2e.py +581 -0
- onnxruntime/transformers/models/llama/convert_to_onnx.py +5 -0
- onnxruntime/transformers/models/llama/dist_settings.py +5 -0
- onnxruntime/transformers/models/llama/llama_inputs.py +200 -4
- onnxruntime/transformers/models/llama/llama_parity.py +8 -3
- onnxruntime/transformers/models/llama/llama_torch.py +5 -0
- onnxruntime/transformers/models/llama/quant_kv_dataloader.py +5 -0
- onnxruntime/transformers/models/whisper/convert_to_onnx.py +7 -1
- onnxruntime/transformers/models/whisper/whisper_chain.py +2 -2
- onnxruntime/transformers/models/whisper/whisper_decoder.py +3 -1
- onnxruntime/transformers/models/whisper/whisper_encoder_decoder_init.py +5 -5
- onnxruntime/transformers/models/whisper/whisper_helper.py +140 -52
- onnxruntime/transformers/models/whisper/whisper_openai_helper.py +9 -1
- {onnxruntime_directml-1.17.1.dist-info → onnxruntime_directml-1.17.3.dist-info}/METADATA +11 -1
- {onnxruntime_directml-1.17.1.dist-info → onnxruntime_directml-1.17.3.dist-info}/RECORD +26 -25
- {onnxruntime_directml-1.17.1.dist-info → onnxruntime_directml-1.17.3.dist-info}/WHEEL +1 -1
- {onnxruntime_directml-1.17.1.dist-info → onnxruntime_directml-1.17.3.dist-info}/entry_points.txt +0 -0
- {onnxruntime_directml-1.17.1.dist-info → onnxruntime_directml-1.17.3.dist-info}/top_level.txt +0 -0
onnxruntime/__init__.py
CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exchange
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.17.1"
+__version__ = "1.17.3"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
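The only functional change in onnxruntime/__init__.py is the version bump. A quick way to confirm which build is active after upgrading (a minimal sketch, not part of the diff itself):

# Sanity check after installing the new wheel: the reported version should now be 1.17.3.
import onnxruntime as ort

print(ort.__version__)                 # expected: "1.17.3"
print(ort.get_available_providers())   # DirectML builds should list "DmlExecutionProvider"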
onnxruntime/capi/DirectML.dll
CHANGED
Binary file
onnxruntime/capi/onnxruntime_providers_shared.dll
CHANGED
Binary file
onnxruntime/capi/onnxruntime_pybind11_state.pyd
CHANGED
Binary file
onnxruntime/capi/onnxruntime_validation.py
CHANGED
@@ -22,7 +22,7 @@ def check_distro_info():
         __my_distro__ = __my_system__
         __my_distro_ver__ = platform.release().lower()
 
-        if __my_distro_ver__
+        if __my_distro_ver__ not in ["10", "11"]:
             warnings.warn(
                 "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
                 % __my_distro_ver__
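The check above now accepts both Windows 10 and Windows 11 release strings before warning. A minimal way to exercise it (a sketch; the import path assumes the module layout shown in the file list above):

# Sketch: on Windows 10 or 11 the updated check_distro_info() should emit no warning.
import warnings
from onnxruntime.capi.onnxruntime_validation import check_distro_info

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    check_distro_info()

print([str(w.message) for w in caught])  # expected: [] on Windows 10/11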
onnxruntime/transformers/convert_generation.py
CHANGED
@@ -1273,7 +1273,7 @@ def find_past_seq_len_usage(subg: GraphProto):
 
 
 def replace_mha_with_gqa(
-    model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1, window_size: int =
+    model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1, window_size: int = -1
 ):
     # Insert attention_mask subgraph to calculate shared inputs for all GroupQueryAttention nodes
     #
@@ -1339,31 +1339,163 @@ def replace_mha_with_gqa(
     )
 
     # Replace MultiHeadAttention with GroupQueryAttention
+    #
+    # When replacing, fuse the following subgraph:
+    #
+    #               root_input
+    #              /     |     \
+    #        MatMul   MatMul   MatMul
+    #          |         |        |
+    #         Add       Add      Add     (optional Adds)
+    #          |         |        |
+    #       RotEmb    RotEmb      |
+    #              \     |      /
+    #          MultiHeadAttention
+    #
+    # to this new subgraph:
+    #
+    #               root_input
+    #                   |
+    #             PackedMatMul (if possible)
+    #                   |
+    #              PackedAdd (if possible)
+    #                   |
+    #         GroupQueryAttention
+    #
+
     mha_nodes = list(filter(lambda node: node.op_type == "MultiHeadAttention", model.model.graph.node))
-    for node in mha_nodes:
-
+    for idx, node in enumerate(mha_nodes):
+        # Detect Q path to MHA
+        q_path_1 = model.match_parent_path(node, ["RotaryEmbedding", "Add", "MatMul"], [0, 0, 0])
+        q_path_2 = model.match_parent_path(node, ["RotaryEmbedding", "MatMul"], [0, 0])
+
+        q_rotary, q_add, q_matmul = None, None, None
+        if q_path_1 is not None:
+            q_rotary, q_add, q_matmul = q_path_1
+        elif q_path_2 is not None:
+            q_rotary, q_matmul = q_path_2
+
+        # Detect K path to MHA
+        k_path_1 = model.match_parent_path(node, ["RotaryEmbedding", "Add", "MatMul"], [1, 0, 0])
+        k_path_2 = model.match_parent_path(node, ["RotaryEmbedding", "MatMul"], [1, 0])
+
+        k_rotary, k_add, k_matmul = None, None, None
+        if k_path_1 is not None:
+            k_rotary, k_add, k_matmul = k_path_1
+        elif k_path_2 is not None:
+            k_rotary, k_matmul = k_path_2
+
+        # Detect V path to MHA
+        v_path_1 = model.match_parent_path(node, ["Add", "MatMul"], [2, 0])
+        v_path_2 = model.match_parent_path(node, ["MatMul"], [2])
+
+        v_add, v_matmul = None, None
+        if v_path_1 is not None:
+            v_add, v_matmul = v_path_1
+        elif v_path_2 is not None:
+            v_matmul = v_path_2[0]
+
+        # Get `interleaved` attribute from RotaryEmbedding
+        interleaved = 0
+        if q_rotary is not None and k_rotary is not None:
+            for att in q_rotary.attribute:
+                if att.name == "interleaved":
+                    interleaved = att.i
+
+        # Get `num_heads` attribute from MHA
+        num_heads = 0
         for att in node.attribute:
             if att.name == "num_heads":
-
+                num_heads = att.i
+
+        # Check if root_input to Q/K/V paths is the same
+        root_input_is_same = q_matmul.input[0] == k_matmul.input[0] and k_matmul.input[0] == v_matmul.input[0]
+
+        # Check if Q/K/V paths all have bias or all don't have bias
+        all_paths_have_bias = q_add is not None and k_add is not None and v_add is not None
+        all_paths_have_no_bias = q_add is None and k_add is None and v_add is None
+
+        # Make PackedMatMul node if possible
+        q_input_to_attention, k_input_to_attention, v_input_to_attention = "", "", ""
+        if root_input_is_same and (all_paths_have_bias or all_paths_have_no_bias):
+            qw = NumpyHelper.to_array(model.get_initializer(q_matmul.input[1]))
+            kw = NumpyHelper.to_array(model.get_initializer(k_matmul.input[1]))
+            vw = NumpyHelper.to_array(model.get_initializer(v_matmul.input[1]))
+
+            dim = qw.shape[-1]
+            qkv_weight = np.stack((qw, kw, vw), axis=1).reshape(dim, 3 * dim)
+            qkv_weight = onnx.numpy_helper.from_array(qkv_weight, name=f"QKV_Weight_{idx}")
+            model.add_initializer(qkv_weight)
+
+            packed_matmul_node = onnx.helper.make_node(
+                "MatMul",
+                inputs=[q_matmul.input[0], qkv_weight.name],
+                outputs=[f"{qkv_weight.name}_output"],
+                name=model.create_node_name("MatMul"),
+            )
+            model.model.graph.node.extend([packed_matmul_node])
+            model.model.graph.node.remove(q_matmul)
+            model.model.graph.node.remove(k_matmul)
+            model.model.graph.node.remove(v_matmul)
+            q_input_to_attention = packed_matmul_node.output[0]
+
+            # Make PackedAdd node if possible
+            if all_paths_have_bias:
+                qb = NumpyHelper.to_array(model.get_initializer(q_add.input[1]))
+                kb = NumpyHelper.to_array(model.get_initializer(k_add.input[1]))
+                vb = NumpyHelper.to_array(model.get_initializer(v_add.input[1]))
+
+                dim = qb.shape[-1]
+                qkv_bias = np.stack((qb, kb, vb), axis=0).reshape(3 * dim)
+                qkv_bias = onnx.numpy_helper.from_array(qkv_bias, name=f"QKV_Bias_{idx}")
+                model.add_initializer(qkv_bias)
+                packed_add_node = onnx.helper.make_node(
+                    "Add",
+                    inputs=[packed_matmul_node.output[0], qkv_bias.name],
+                    outputs=[f"{qkv_bias.name}_output"],
+                )
+                model.model.graph.node.extend([packed_add_node])
+                model.model.graph.node.remove(q_add)
+                model.model.graph.node.remove(k_add)
+                model.model.graph.node.remove(v_add)
+                q_input_to_attention = packed_add_node.output[0]
+
+        else:
+            q_input_to_attention = q_matmul.output[0]
+            k_input_to_attention = k_matmul.output[0]
+            v_input_to_attention = v_matmul.output[0]
+
+        # Make GQA node
         gqa_node = onnx.helper.make_node(
             "GroupQueryAttention",
             inputs=[
-
-
-
+                q_input_to_attention,  # query
+                k_input_to_attention,  # key
+                v_input_to_attention,  # value
                 node.input[6],  # past_key
                 node.input[7],  # past_value
-
-
+                seqlen_k_cast_node.output[0],  # seqlens_k (for attention mask)
+                total_seqlen_cast_node.output[0],  # total_seq_len (for attention mask)
+                q_rotary.input[2] if q_rotary is not None else "",  # cos_cache (for rotary embeddings)
+                q_rotary.input[3] if q_rotary is not None else "",  # sin_cache (for rotary embeddings)
             ],
             outputs=node.output,
             name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"),
             domain="com.microsoft",
-            num_heads=
-            kv_num_heads=
+            num_heads=num_heads // world_size,
+            kv_num_heads=num_heads // world_size if kv_num_heads == 0 else kv_num_heads // world_size,
+            local_window_size=window_size,
+            do_rotary=int(q_rotary is not None and k_rotary is not None),
+            rotary_interleaved=interleaved,
         )
         model.model.graph.node.remove(node)
         model.model.graph.node.extend([gqa_node])
+
+        if q_rotary is not None:
+            model.model.graph.node.remove(q_rotary)
+        if k_rotary is not None:
+            model.model.graph.node.remove(k_rotary)
+
     return model
 
 
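Taken together, the new code packs the Q/K/V projections into a single MatMul (and Add) where the graph shape allows it, absorbs the RotaryEmbedding nodes into GroupQueryAttention via do_rotary/rotary_interleaved, and exposes the new window_size parameter with a default of -1 (no local windowing). A minimal sketch of how the updated helper might be driven; the imports, model path, and attention-mask name below are assumptions, not part of this diff:

# Sketch only: fuse MultiHeadAttention into GroupQueryAttention in an exported decoder model.
# Assumes the onnxruntime/transformers scripts directory is on sys.path (these modules use
# flat imports internally); file names are placeholders.
import onnx
from convert_generation import replace_mha_with_gqa
from onnx_model import OnnxModel

model = OnnxModel(onnx.load("decoder_model.onnx"))   # hypothetical model path
model = replace_mha_with_gqa(
    model,
    attn_mask="attention_mask",   # graph input used to build seqlens_k / total_seq_len
    kv_num_heads=0,               # 0 -> keep num_heads from the original MHA nodes
    world_size=1,
    window_size=-1,               # new in this release; -1 disables local (sliding-window) attention
)
model.save_model_to_file("decoder_model_gqa.onnx")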
onnxruntime/transformers/models/llama/benchmark.py
CHANGED
@@ -1,3 +1,8 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
 import argparse
 import datetime
 import gc
@@ -14,11 +19,12 @@ import torch
 from benchmark_helper import measure_memory, setup_logger
 from dist_settings import get_rank, get_size
 from llama_inputs import (
-
+    add_io_bindings_as_ortvalues,
     get_merged_sample_with_past_kv_inputs,
     get_msft_sample_inputs,
     get_sample_inputs,
     get_sample_with_past_kv_inputs,
+    verify_ort_inputs,
 )
 from optimum.onnxruntime import ORTModelForCausalLM
 from torch.profiler import ProfilerActivity, profile, record_function
@@ -203,6 +209,7 @@ def get_model(args: argparse.Namespace):
         torch_dtype=torch.float16 if args.use_fp16 else torch.float32,
         use_auth_token=args.auth,
         use_cache=True,
+        cache_dir=args.cache_dir,
     ).to(args.target_device)
     end_time = time.time()
 
@@ -444,24 +451,12 @@ def run_hf_inference(args, init_inputs, iter_inputs, model):
 
 def run_ort_inference(args, init_inputs, iter_inputs, model):
     def prepare_ort_inputs(inputs, kv_cache_ortvalues):
-        #
-
-        user_inputs = set(inputs.keys())
-        missing_inputs = model_inputs - user_inputs
-        if len(missing_inputs):
-            logger.error(f"The following model inputs are missing: {missing_inputs}")
-            raise Exception("There are missing inputs to the model. Please add them and try again.")
-
-        # Remove unnecessary inputs from model inputs
-        unnecessary_inputs = user_inputs - model_inputs
-        if len(unnecessary_inputs):
-            for unnecessary_input in unnecessary_inputs:
-                logger.info(f"Removing unnecessary input '{unnecessary_input}' from user provided inputs")
-                del inputs[unnecessary_input]
+        # Verify model inputs
+        inputs = verify_ort_inputs(model, inputs)
 
         # Add IO bindings for non-CPU execution providers
         if args.device != "cpu":
-            io_binding, kv_cache_ortvalues =
+            io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues(
                 model, inputs, args.device, int(args.rank), args.use_gqa, kv_cache_ortvalues
             )
             setattr(args, "io_binding", io_binding)  # noqa: B010
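The inline input checks removed above are now delegated to verify_ort_inputs, one of the helpers this release adds to llama_inputs.py. Based on the deleted code, its behavior is roughly the following sketch (assuming model is an onnxruntime.InferenceSession); the actual implementation in llama_inputs.py may differ in details:

# Rough sketch of the input verification that prepare_ort_inputs now delegates.
def verify_ort_inputs(model, ort_inputs):
    # Input names the ONNX Runtime session expects
    model_inputs = {node_arg.name for node_arg in model.get_inputs()}
    user_inputs = set(ort_inputs.keys())

    # Fail fast when a required model input was not provided
    missing_inputs = model_inputs - user_inputs
    if missing_inputs:
        raise Exception(f"The following model inputs are missing: {missing_inputs}")

    # Drop extra user-provided inputs that the model does not consume
    for unnecessary_input in user_inputs - model_inputs:
        del ort_inputs[unnecessary_input]

    return ort_inputs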
@@ -612,6 +607,13 @@ def get_args(rank=0):
     parser.add_argument("--pt-num-rows", type=int, default=1000, help="Number of rows for PyTorch profiler to display")
     parser.add_argument("--verbose", default=False, action="store_true")
     parser.add_argument("--log-folder", type=str, default=os.path.join("."), help="Folder to cache log files")
+    parser.add_argument(
+        "--cache-dir",
+        type=str,
+        required=True,
+        default="./model_cache",
+        help="Cache dir where Hugging Face files are stored",
+    )
 
     args = parser.parse_args()
 
@@ -662,8 +664,8 @@ def main():
 
     args.rank = rank
     args.world_size = world_size
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
-    config = AutoConfig.from_pretrained(args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.cache_dir)
+    config = AutoConfig.from_pretrained(args.model_name, cache_dir=args.cache_dir)
     target_device = f"cuda:{args.rank}" if args.device != "cpu" else args.device
     use_fp16 = args.precision == "fp16"
 
onnxruntime/transformers/models/llama/benchmark_all.py
CHANGED
@@ -1,3 +1,8 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
 import argparse
 import datetime
 import json
@@ -78,6 +83,13 @@ def get_args():
         help="Path to ONNX model from convert_to_onnx",
     )
 
+    parser.add_argument(
+        "--cache-dir",
+        type=str,
+        default="./model_cache",
+        help="Cache dir where Hugging Face files are stored",
+    )
+
     parser.add_argument(
         "--model-name",
         type=str,
@@ -332,6 +344,8 @@ def main():
         str(args.num_runs),
         "--log-folder",
         args.log_folder,
+        "--cache-dir",
+        args.cache_dir,
         "--auth",
     ]
     logger.info("Benchmark PyTorch without torch.compile")
@@ -362,6 +376,8 @@
         str(args.num_runs),
         "--log-folder",
         args.log_folder,
+        "--cache-dir",
+        args.cache_dir,
         "--auth",
     ]
     logger.info("Benchmark PyTorch with torch.compile")
@@ -394,6 +410,8 @@
         str(args.num_runs),
         "--log-folder",
         args.log_folder,
+        "--cache-dir",
+        args.cache_dir,
         "--auth",
     ]
     logger.info("Benchmark Optimum + ONNX Runtime")
@@ -426,6 +444,8 @@
         str(args.num_runs),
         "--log-folder",
         args.log_folder,
+        "--cache-dir",
+        args.cache_dir,
     ]
     logger.info("Benchmark Microsoft model in ONNX Runtime")
     results = benchmark(args, benchmark_cmd, "ort-msft")
@@ -457,6 +477,8 @@
         str(args.num_runs),
         "--log-folder",
         args.log_folder,
+        "--cache-dir",
+        args.cache_dir,
     ]
     logger.info("Benchmark convert_to_onnx model in ONNX Runtime")
     results = benchmark(args, benchmark_cmd, "onnxruntime")