onnx-diagnostic 0.7.15__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -168,7 +168,33 @@ if pv.Version(transformers.__version__) > pv.Version("4.49.99999"):
              ]
          )
          print(string_type(past_key_values, with_shape=True))
+
+     The function fully supports ``FakeTensor`` inputs with dynamic dimensions if
+     ``transformers>=4.56``. Before that version, only ``FakeTensor`` inputs with
+     static dimensions are supported.
      """
+     if (
+         key_value_pairs
+         and isinstance(key_value_pairs[0][0], torch._subclasses.fake_tensor.FakeTensor)
+         and pv.Version(transformers.__version__) >= pv.Version("4.56")
+     ):
+         cache = transformers.cache_utils.DynamicCache()
+         cache.layers.extend(
+             [transformers.cache_utils.DynamicLayer() for _ in key_value_pairs]
+         )
+         for i, layer in enumerate(cache.layers):
+             k, v = key_value_pairs[i][0], key_value_pairs[i][1]
+             layer.dtype = k.dtype
+             layer.device = k.device
+             layer.keys = k
+             layer.values = v
+             layer.is_initialized = True
+         assert not hasattr(cache, "layers") or len(key_value_pairs) == len(cache.layers), (
+             f"Unexpected number of layers in the cache ({len(cache.layers)}), "
+             f"{len(key_value_pairs)} expected."
+         )
+         return finalize_cache(cache)
+
      cache = transformers.cache_utils.DynamicCache(key_value_pairs)
      if hasattr(cache, "layers") and len(key_value_pairs) < len(cache.layers):
          # The cache constructor contains the two following lines
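
Note on the hunk above: with ``transformers>=4.56`` the patched function (presumably
``onnx_diagnostic.helpers.cache_helper.make_dynamic_cache``, the import used in the
``make_fake`` example later in this diff) can be fed ``FakeTensor`` pairs directly.
A minimal, hypothetical sketch under those assumptions, not part of the package:

    import torch
    from torch.fx.experimental.symbolic_shapes import ShapeEnv
    from torch._subclasses.fake_tensor import FakeTensorMode
    from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache

    # FakeTensors with symbolic (dynamic) dimensions
    fake_mode = FakeTensorMode(shape_env=ShapeEnv())
    k = fake_mode.from_tensor(torch.rand((2, 8, 16, 64)), static_shapes=False)
    v = fake_mode.from_tensor(torch.rand((2, 8, 16, 64)), static_shapes=False)

    # With transformers>=4.56 the new branch fills DynamicLayer objects directly.
    cache = make_dynamic_cache([(k, v)])
    print(type(cache).__name__, len(cache.layers))
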
@@ -494,51 +520,51 @@ def make_hybrid_cache(

      .. code-block:: python

-         self.max_cache_len = (
-             max_cache_len if max_cache_len is not None else config.max_position_embeddings)
+         self.max_cache_len = (
+             max_cache_len if max_cache_len is not None else config.max_position_embeddings)

-         # Sliding layers can't be larger than the overall max cache len
-         self.sliding_window_len = min(config.sliding_window, self.max_cache_len)
-         self.max_batch_size = max_batch_size
+         # Sliding layers can't be larger than the overall max cache len
+         self.sliding_window_len = min(config.sliding_window, self.max_cache_len)
+         self.max_batch_size = max_batch_size

-         self.head_dim = (
-             config.head_dim if hasattr(config, "head_dim")
-             else config.hidden_size // config.num_attention_heads
-         )
+         self.head_dim = (
+             config.head_dim if hasattr(config, "head_dim")
+             else config.hidden_size // config.num_attention_heads
+         )

-         self._dtype = dtype
-         self.num_key_value_heads = (
-             config.num_attention_heads
-             if getattr(config, "num_key_value_heads", None) is None
-             else config.num_key_value_heads
-         )
+         self._dtype = dtype
+         self.num_key_value_heads = (
+             config.num_attention_heads
+             if getattr(config, "num_key_value_heads", None) is None
+             else config.num_key_value_heads
+         )

-         # If the attribute does not exist in the config, fallback to a simple StaticCache
-         if hasattr(config, "layer_types"):
-             self.is_sliding = [
-                 layer_type != "full_attention" for layer_type in config.layer_types]
-         else:
-             self.is_sliding = [False] * config.num_hidden_layers
-
-         self.key_cache: list[torch.Tensor] = []
-         self.value_cache: list[torch.Tensor] = []
-         global_cache_shape = (self.max_batch_size, self.num_key_value_heads,
-                               self.max_cache_len, self.head_dim)
-         sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads,
-                                self.sliding_window_len, self.head_dim)
-         self.sliding_window = min(config.sliding_window, max_cache_len)
-         device = torch.device(device) if device is not None else None
-         for i in range(config.num_hidden_layers):
-             layer_device = layer_device_map[i] if layer_device_map is not None else device
-             cache_shape = sliding_cache_shape if self.is_sliding[i] else global_cache_shape
-             new_layer_key_cache = torch.zeros(
-                 cache_shape, dtype=self._dtype, device=layer_device)
-             new_layer_value_cache = torch.zeros(
-                 cache_shape, dtype=self._dtype, device=layer_device)
-             torch._dynamo.mark_static_address(new_layer_key_cache)
-             torch._dynamo.mark_static_address(new_layer_value_cache)
-             self.key_cache.append(new_layer_key_cache)
-             self.value_cache.append(new_layer_value_cache)
+         # If the attribute does not exist in the config, fallback to a simple StaticCache
+         if hasattr(config, "layer_types"):
+             self.is_sliding = [
+                 layer_type != "full_attention" for layer_type in config.layer_types]
+         else:
+             self.is_sliding = [False] * config.num_hidden_layers
+
+         self.key_cache: list[torch.Tensor] = []
+         self.value_cache: list[torch.Tensor] = []
+         global_cache_shape = (self.max_batch_size, self.num_key_value_heads,
+                               self.max_cache_len, self.head_dim)
+         sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads,
+                                self.sliding_window_len, self.head_dim)
+         self.sliding_window = min(config.sliding_window, max_cache_len)
+         device = torch.device(device) if device is not None else None
+         for i in range(config.num_hidden_layers):
+             layer_device = layer_device_map[i] if layer_device_map is not None else device
+             cache_shape = sliding_cache_shape if self.is_sliding[i] else global_cache_shape
+             new_layer_key_cache = torch.zeros(
+                 cache_shape, dtype=self._dtype, device=layer_device)
+             new_layer_value_cache = torch.zeros(
+                 cache_shape, dtype=self._dtype, device=layer_device)
+             torch._dynamo.mark_static_address(new_layer_key_cache)
+             torch._dynamo.mark_static_address(new_layer_value_cache)
+             self.key_cache.append(new_layer_key_cache)
+             self.value_cache.append(new_layer_value_cache)
      """
      layer_types = None
      if key_value_pairs:
@@ -0,0 +1,153 @@
+ from typing import Any, Dict, Optional, Tuple
+
+
+ _UNIQUE = set()
+
+
+ def _unique():
+     i = 129 + 1
+     while i in _UNIQUE:
+         i += 1
+     _UNIQUE.add(i)
+     return i
+
+
+ def fake_reshape(
+     true_tensor: "torch.Tensor", # noqa: F821
+     sh: Dict[int, Any], # noqa: F821
+     fake_tensor: Optional["FakeTensor"] = None, # noqa: F821
+     fake_mode: Optional["FakeTensorMode"] = None, # noqa: F821
+ ) -> "FakeTensor": # noqa: F821
+     """
+     Changes the shape of a true tensor to make it dynamic.
+
+     :param true_tensor: true tensor
+     :param sh: dynamic shape
+     :param fake_tensor: fake tensor, if None, make a fake one
+     :param fake_mode: fake tensor mode
+     :return: fake tensor
+     """
+     import torch
+
+     # deal with 0/1
+     for i in sh:
+         if true_tensor.shape[i] <= 1:
+             expanded_shape = list(true_tensor.shape)
+             expanded_shape[i] = _unique()
+             true_tensor = torch.empty(
+                 tuple(expanded_shape), dtype=true_tensor.dtype, device=true_tensor.device
+             )
+
+     # deal with equivalent dimension
+     new_shape = list(true_tensor.shape)
+     mapping = {}
+     for i, s in sh.items():
+         d = true_tensor.shape[i]
+         if d not in mapping:
+             mapping[d] = s
+         elif mapping[d] != s:
+             d = _unique()
+             mapping[d] = s
+             new_shape[i] = d
+             true_tensor = torch.empty(
+                 tuple(new_shape), dtype=true_tensor.dtype, device=true_tensor.device
+             )
+
+     # now switch to FakeTensor
+     if fake_mode is None:
+         from torch.fx.experimental.symbolic_shapes import ShapeEnv
+         from torch._subclasses.fake_tensor import FakeTensorMode
+
+         shape_env = ShapeEnv()
+         fake_mode = FakeTensorMode(shape_env=shape_env)
+     if fake_tensor is None:
+         fake_tensor = fake_mode.from_tensor(true_tensor, static_shapes=False)
+     assert fake_mode is not None, "fake_mode must be provided"
+
+     new_shape = list(true_tensor.shape)
+     for i in sh:
+         new_shape[i] = fake_tensor.shape[i]
+
+     reduced_tensor = fake_mode.from_tensor(true_tensor, static_shapes=True).sum(
+         axis=tuple(sorted(sh)), keepdim=True
+     )
+     return reduced_tensor.expand(*new_shape)
+
+
+ def make_fake(
+     x: Any, fake_mode: Optional["FakeTensorMode"] = None # noqa: F821
+ ) -> Tuple[Optional["FakeTensor"], Optional["FakeTensorMode"]]: # noqa: F821
+     """
+     Replaces all tensors by fake tensors.
+     This modification happens in place for caches.
+     This function is only implemented for caches with
+     ``transformers>=4.55``.
+
+     .. runpython::
+         :showcode:
+
+         import pprint
+         import torch
+         from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
+         from onnx_diagnostic.helpers.fake_tensor_helper import make_fake
+
+         inputs, _ = make_fake(
+             dict(
+                 input_ids=torch.randint(30360, size=(2, 3), dtype=torch.int64),
+                 attention_mask=torch.randint(1, size=(2, 33), dtype=torch.int64),
+                 position_ids=torch.randint(32, size=(2, 3), dtype=torch.int64),
+                 past_key_values=make_dynamic_cache(
+                     [
+                         (
+                             torch.rand((2, 32, 30, 96), dtype=torch.float16),
+                             torch.rand((2, 32, 30, 96), dtype=torch.float16),
+                         ),
+                         (
+                             torch.rand((2, 32, 30, 96), dtype=torch.float16),
+                             torch.rand((2, 32, 30, 96), dtype=torch.float16),
+                         ),
+                     ]
+                 ),
+             )
+         )
+         pprint.pprint(inputs)
+     """
+     if x is None:
+         return None, None
+     if fake_mode is None:
+         from torch.fx.experimental.symbolic_shapes import ShapeEnv
+         from torch._subclasses.fake_tensor import FakeTensorMode
+
+         shape_env = ShapeEnv()
+         fake_mode = FakeTensorMode(shape_env=shape_env)
+
+     if isinstance(x, (list, tuple)):
+         return x.__class__([make_fake(i, fake_mode=fake_mode)[0] for i in x]), fake_mode
+     if isinstance(x, dict):
+         return {k: make_fake(v, fake_mode=fake_mode)[0] for k, v in x.items()}, fake_mode
+
+     if x.__class__.__name__ in {"DynamicCache", "StaticCache", "HybridCache"}:
+         assert hasattr(x, "layers"), (
+             f"Use a more recent version of transformers (>=4.55), "
+             f"'layers' not found in class {type(x)}"
+         )
+         for layer in x.layers:
+             assert hasattr(layer, "keys") and hasattr(layer, "values"), (
+                 f"Use a more recent version of transformers (>=4.55), 'keys' or 'values' "
+                 f"not found in class {type(layer)} ({dir(layer)})"
+             )
+             layer.keys = make_fake(layer.keys, fake_mode=fake_mode)[0]
+             layer.values = make_fake(layer.values, fake_mode=fake_mode)[0]
+         return x, fake_mode
+     if x.__class__.__name__ == "EncoderDecoderCache":
+         make_fake(x.self_attention_cache, fake_mode=fake_mode)
+         make_fake(x.cross_attention_cache, fake_mode=fake_mode)
+         return x, fake_mode
+     if hasattr(x, "shape"):
+         t = fake_mode.from_tensor(x, static_shapes=False)
+         return t, fake_mode
+     from . import string_type
+
+     raise TypeError(
+         f"Unexpected type {type(x)} for x, content is {string_type(x, with_shape=True)}"
+     )
@@ -463,6 +463,7 @@ def string_type(
          if verbose:
              print(f"[string_type] F2:{type(obj)}")
          return f"{prefix}F{i}s{'x'.join(map(str, obj.shape))}"
+
      if isinstance(obj, torch.Tensor):
          from .torch_helper import torch_dtype_to_onnx_dtype

@@ -783,6 +784,8 @@ def string_type(
              obj, ultralytics.engine.results.Results
          ), f"Unexpected type={type(obj)}"
          return f"ultralytics.{obj.__class__.__name__}(...)"
+     if obj.__class__.__name__ == "FakeTensorMode":
+         return f"{obj}"

      if verbose:
          print(f"[string_type] END:{type(obj)}")
@@ -271,7 +271,7 @@ def get_inputs_default(
          "input_ids": {0: batch, 1: seq_length},
          "token_type_ids": {0: batch, 1: seq_length},
          "attention_mask": {0: batch, 1: "cache+seq"},
-         "position_ids": {0: batch, 1: "cache+seq"},
+         "position_ids": {0: batch, 1: seq_length},
          "past_key_values": [
              [{0: batch} for _ in range(num_hidden_layers)],
              [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
@@ -220,10 +220,7 @@ def get_inputs(
              0: batch,
              1: "cache+seq", # cache_length + seq_length
          },
-         "position_ids": {
-             0: batch,
-             1: "cache+seq", # cache_length + seq_length
-         },
+         "position_ids": {0: batch, 1: seq_length},
          "past_key_values": [
              [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
              [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
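
Note: both hunks above tie ``position_ids`` to the same dynamic dimension as ``input_ids``
(the sequence length) instead of ``cache+seq``. A hedged sketch of how dictionaries of this
shape are typically fed to ``torch.export.export`` (the model and inputs are illustrative,
not part of the package):

    import torch

    batch = torch.export.Dim("batch")
    seq_length = torch.export.Dim("seq_length")
    dynamic_shapes = {
        "input_ids": {0: batch, 1: seq_length},
        # position_ids now shares seq_length with input_ids
        "position_ids": {0: batch, 1: seq_length},
    }
    # ep = torch.export.export(model, (), kwargs=inputs, dynamic_shapes=dynamic_shapes)
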
@@ -2,7 +2,7 @@ import functools
  import importlib
  import contextlib
  import re
- from typing import Any, Callable, Dict, List, Optional, Tuple
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  from .onnx_export_serialization import (
      register_cache_serialization,
      unregister_cache_serialization,
@@ -160,7 +160,7 @@ def register_additional_serialization_functions(
  @contextlib.contextmanager
  def torch_export_patches(
      patch_sympy: bool = True,
-     patch_torch: bool = True,
+     patch_torch: Union[bool, int] = True,
      patch_transformers: bool = False,
      patch_diffusers: bool = False,
      catch_constraints: bool = True,
@@ -349,6 +349,7 @@ def torch_export_patches(
          _catch_produce_guards_and_solve_constraints,
          patch__check_input_constraints_for_graph,
          patched__broadcast_in_dim_meta,
+         patched__broadcast_in_dim_meta_level_2,
          patched__maybe_broadcast,
          patched_ShapeEnv,
      )
@@ -390,8 +391,13 @@ def torch_export_patches(
          # torch._prims._broadcast_in_dim_meta
          f_broadcast_in_dim = torch._prims.broadcast_in_dim
          f__broadcast_in_dim_meta = torch._prims._broadcast_in_dim_meta
-         torch._prims._broadcast_in_dim_meta = patched__broadcast_in_dim_meta
-         torch._prims.broadcast_in_dim = patched__broadcast_in_dim_meta
+         _patched_dim_f = (
+             patched__broadcast_in_dim_meta_level_2
+             if patch_torch == 2
+             else patched__broadcast_in_dim_meta
+         )
+         torch._prims._broadcast_in_dim_meta = _patched_dim_f
+         torch._prims.broadcast_in_dim = _patched_dim_f

          # torch._refs._maybe_broadcast
          f__maybe_broadcast = torch._refs._maybe_broadcast
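
Note: ``patch_torch`` now also accepts ``2``, which selects
``patched__broadcast_in_dim_meta_level_2`` (defined further down in this diff) instead of the
default patch. A hedged usage sketch, assuming the usual import path of the context manager
(the model is illustrative):

    import torch
    from onnx_diagnostic.torch_export_patches import torch_export_patches

    class Tiny(torch.nn.Module):
        def forward(self, x):
            return x * 2

    with torch_export_patches(patch_torch=2):
        # inside the context, torch._prims._broadcast_in_dim_meta points to the
        # level-2 patch shown below
        ep = torch.export.export(Tiny(), (torch.rand(2, 3),))
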
@@ -453,6 +459,16 @@ def torch_export_patches(
      except ImportError:
          masking_utils = None

+     try:
+         import transformers.integrations.sdpa_attention as sdpa_attention
+     except ImportError:
+         sdpa_attention = None
+
+     try:
+         import transformers.modeling_utils as modeling_utils
+     except ImportError:
+         modeling_utils = None
+
      if verbose:
          import transformers

@@ -464,7 +480,7 @@ def torch_export_patches(
          patch_transformers_list, verbose=verbose
      )

-     if (
+     if ( # vmap
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "_vmap_for_bhqkv")
@@ -499,7 +515,7 @@ def torch_export_patches(
      else:
          f_transformers_sdpa_mask = None

-     if (
+     if ( # eager_mask
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "eager_mask")
@@ -526,7 +542,7 @@ def torch_export_patches(
              patch_transformers_list.patched_eager_mask
          )

-     if (
+     if ( # sdpa_mask
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "sdpa_mask")
@@ -547,6 +563,29 @@ def torch_export_patches(
              patch_transformers_list.patched_sdpa_mask_recent_torch
          )

+     if ( # sdpa_attention_forward
+         sdpa_attention is not None
+         and modeling_utils is not None
+         and hasattr(sdpa_attention, "sdpa_attention_forward")
+         and hasattr(sdpa_attention, "use_gqa_in_sdpa")
+         and hasattr(modeling_utils, "AttentionInterface")
+     ):
+         if verbose:
+             print(
+                 "[torch_export_patches] patches "
+                 "transformers.integrations.sdpa_attention.sdpa_attention_forward"
+             )
+         f_sdpa_attention_forward = sdpa_attention.sdpa_attention_forward
+         sdpa_attention.sdpa_attention_forward = (
+             patch_transformers_list.patched_sdpa_attention_forward
+         )
+         modeling_utils.sdpa_attention_forward = (
+             patch_transformers_list.patched_sdpa_attention_forward
+         )
+         modeling_utils.AttentionInterface._global_mapping["sdpa"] = (
+             patch_transformers_list.patched_sdpa_attention_forward
+         )
+
      if custom_patches:
          if verbose:
              print("[torch_export_patches] applies custom patches")
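
Note: the new block also re-registers the patched forward in
``AttentionInterface._global_mapping["sdpa"]``, so models that resolve their attention
implementation by name pick it up as well. A hedged sketch of how one might verify that
inside the context, assuming the patch is applied when ``patch_transformers=True``:

    from transformers.modeling_utils import AttentionInterface
    from onnx_diagnostic.torch_export_patches import torch_export_patches

    with torch_export_patches(patch_transformers=True):
        fn = AttentionInterface._global_mapping["sdpa"]
        print(fn.__name__)  # expected: patched_sdpa_attention_forward
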
@@ -656,7 +695,7 @@ def torch_export_patches(
          patch_transformers_list, revert_patches_info, verbose=verbose
      )

-     if (
+     if ( # vmap
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "_vmap_for_bhqkv")
@@ -687,7 +726,7 @@ def torch_export_patches(
                  "transformers.masking_utils.sdpa_mask"
              )

-     if (
+     if ( # eager_mask
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "eager_mask")
@@ -714,7 +753,7 @@ def torch_export_patches(
                  "in ALL_MASK_ATTENTION_FUNCTIONS"
              )

-     if (
+     if ( # sdpa_mask
          masking_utils
          and patch_transformers_list.patch_masking_utils
          and hasattr(masking_utils, "sdpa_mask")
@@ -734,6 +773,25 @@ def torch_export_patches(
                  "in ALL_MASK_ATTENTION_FUNCTIONS"
              )

+     if ( # sdpa_attention_forward
+         sdpa_attention is not None
+         and modeling_utils is not None
+         and hasattr(sdpa_attention, "sdpa_attention_forward")
+         and hasattr(sdpa_attention, "use_gqa_in_sdpa")
+         and hasattr(modeling_utils, "AttentionInterface")
+     ):
+         sdpa_attention.sdpa_attention_forward = f_sdpa_attention_forward
+         modeling_utils.sdpa_attention_forward = f_sdpa_attention_forward
+         modeling_utils.AttentionInterface._global_mapping["sdpa"] = (
+             f_sdpa_attention_forward
+         )
+         if verbose:
+             print(
+                 "[torch_export_patches] restored "
+                 "transformers.integrations.sdpa_attention."
+                 "sdpa_attention_forward"
+             )
+
      ########
      # caches
      ########
@@ -25,8 +25,8 @@ def retrieve_stacktrace():

  def _catch_produce_guards_and_solve_constraints(
      previous_function: Callable,
-     fake_mode: "FakeTensorMode", # noqa: F821
-     gm: "torch.fx.GraphModule", # noqa: F821
+     fake_mode: FakeTensorMode,
+     gm: torch.fx.GraphModule,
      dynamic_shapes: Union[Dict[str, Any], Tuple[Any], List[Any], None],
      equalities_inputs: "EqualityConstraint", # noqa: F821
      original_signature: inspect.Signature,
@@ -982,16 +982,21 @@ def patched__broadcast_in_dim_meta(
              elif guard_or_false(a.shape[original_idx] != 1):
                  new_strides.append(a.stride()[original_idx])
              else:
+                 # This check generates the following issue:
+                 # non-broadcasting semantics require s3 == Max(s10, s3), False,
+                 # guard_or_false(a.shape[idx]==1)=False, a.stride()=(1, 2),
+                 # idx=1, a.shape=torch.Size([2, s3]), shape=[2, Max(s10, s3)],
+                 # original_idx=1
                  torch._check(
                      a.shape[original_idx] == shape[idx],
                      lambda idx=idx, original_idx=original_idx: (
                          f"non-broadcasting semantics require "
                          f"{a.shape[original_idx]} == {shape[idx]}, "
                          f"{guard_or_false(a.shape[idx] != 1)}, "
-                         f"guard_or_false(a.shape[idx] == 1)="
+                         f"guard_or_false(a.shape[idx]==1)="
                          f"{guard_or_false(a.shape[idx] == 1)}, "
-                         f"a.stride()={a.stride()}, idx={idx}, "
-                         f"original_idx={original_idx}"
+                         f"a.stride()={a.stride()}, idx={idx}, a.shape={a.shape}, "
+                         f"shape={shape}, original_idx={original_idx}"
                      ),
                  )
                  new_strides.append(a.stride()[original_idx])
@@ -1006,3 +1011,77 @@ def patched__broadcast_in_dim_meta(
                  new_strides.append(a.stride()[original_idx] * a.size()[original_idx])

      return a.as_strided(shape, new_strides, a.storage_offset())
+
+
+ def patched__broadcast_in_dim_meta_level_2(
+     a: torch._prims_common.TensorLikeType,
+     shape: torch._prims_common.ShapeType,
+     broadcast_dimensions: Sequence[int],
+ ):
+     """Patches ``torch._prims._broadcast_in_dim_meta``."""
+     from torch.fx.experimental.symbolic_shapes import (
+         guard_or_false,
+         guard_or_true,
+         sym_or,
+     )
+
+     # Type checks
+     assert isinstance(a, torch._prims_common.TensorLike)
+     assert isinstance(shape, Sequence)
+     assert isinstance(broadcast_dimensions, Sequence)
+
+     # every dimension must be accounted for
+     assert a.ndim == len(broadcast_dimensions)
+
+     # broadcast shape must have weakly more dimensions
+     assert len(shape) >= a.ndim
+
+     # broadcast_dimensions must be an ascending sequence
+     # (no relative reordering of dims) of integers and
+     # each dimension must be within the new shape
+     def _greater_than_reduce(acc, x):
+         assert isinstance(x, (int, torch.export.Dim)), f"unexpected type {type(x)} for x"
+         assert x > acc
+         assert x < len(shape)
+
+         return x
+
+     reduce(_greater_than_reduce, broadcast_dimensions, -1)
+
+     # shape must be broadcastable to
+     for idx, new_idx in enumerate(broadcast_dimensions):
+         torch._check(
+             sym_or(a.shape[idx] == 1, shape[new_idx] == a.shape[idx]),
+             lambda idx=idx, new_idx=new_idx: (
+                 f"{a.shape[idx]} must be broadcastable to {shape[new_idx]}"
+             ),
+         )
+
+     new_strides = []
+     original_idx = 0
+     for idx in range(len(shape)):
+         if idx in broadcast_dimensions:
+             # Assigns a stride of zero to dimensions
+             # which were actually broadcast
+             if guard_or_false(a.shape[original_idx] == 1):
+                 if guard_or_false(a.shape[original_idx] == shape[idx]):
+                     new_strides.append(a.stride()[original_idx])
+                 else:
+                     new_strides.append(0)
+             # PATCHED: disabled this check
+             elif guard_or_false(a.shape[original_idx] != 1):
+                 new_strides.append(a.stride()[original_idx])
+             else:
+                 # PATCHED: torch._check was removed
+                 new_strides.append(a.stride()[original_idx])
+             original_idx = original_idx + 1
+         else:
+             if guard_or_true(shape[idx] != 1):
+                 # consistent with previous use of guard_size_oblivious
+                 new_strides.append(0)
+             elif original_idx == a.ndim:
+                 new_strides.append(1)
+             else:
+                 new_strides.append(a.stride()[original_idx] * a.size()[original_idx])
+
+     return a.as_strided(shape, new_strides, a.storage_offset())
@@ -1276,6 +1276,60 @@ def common_eager_attention_forward(
      return attn_output, attn_weights


+ def patched_sdpa_attention_forward(
+     module: torch.nn.Module,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attention_mask: Optional[torch.Tensor],
+     dropout: float = 0.0,
+     scaling: Optional[float] = None,
+     is_causal: Optional[bool] = None,
+     **kwargs,
+ ) -> tuple[torch.Tensor, None]:
+     """[patch:transformers.integrations.sdpa_attention.sdpa_attention_forward]"""
+     assert not kwargs.get("output_attentions", False), (
+         "`sdpa` attention does not support `output_attentions=True`."
+         " Please set your attention to `eager` if you want any of these features."
+     )
+     sdpa_kwargs = {}
+     if hasattr(module, "num_key_value_groups"):
+         if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
+             key = transformers.integrations.sdpa_attention.repeat_kv(
+                 key, module.num_key_value_groups
+             )
+             value = transformers.integrations.sdpa_attention.repeat_kv(
+                 value, module.num_key_value_groups
+             )
+         else:
+             sdpa_kwargs = {"enable_gqa": True}
+
+     if attention_mask is not None and attention_mask.ndim == 4:
+         attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
+     is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
+     # PATCHED: remove the test query.shape[2] > 1
+     # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+     is_causal = attention_mask is None and is_causal
+
+     torch._check(
+         attention_mask is None or attention_mask.shape[3] == key.shape[2],
+         lambda: "Attention mask shape incompatible with key shape.",
+     )
+     attn_output = torch.nn.functional.scaled_dot_product_attention(
+         query,
+         key,
+         value,
+         attn_mask=attention_mask,
+         dropout_p=dropout,
+         scale=scaling,
+         is_causal=is_causal,
+         **sdpa_kwargs,
+     )
+     attn_output = attn_output.transpose(1, 2).contiguous()
+     return attn_output, None
+
+
  def patched_model_bart_eager_attention_forward(
      module: torch.nn.Module,
      query: torch.Tensor,