onnx-diagnostic 0.7.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +156 -47
- onnx_diagnostic/export/dynamic_shapes.py +6 -6
- onnx_diagnostic/export/shape_helper.py +124 -6
- onnx_diagnostic/ext_test_case.py +5 -1
- onnx_diagnostic/helpers/cache_helper.py +68 -42
- onnx_diagnostic/helpers/config_helper.py +2 -1
- onnx_diagnostic/helpers/fake_tensor_helper.py +153 -0
- onnx_diagnostic/helpers/helper.py +3 -0
- onnx_diagnostic/helpers/rt_helper.py +3 -3
- onnx_diagnostic/tasks/image_text_to_text.py +7 -6
- onnx_diagnostic/tasks/text_generation.py +7 -4
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +69 -11
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +31 -13
- onnx_diagnostic/torch_export_patches/patches/patch_torch.py +109 -18
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +133 -28
- onnx_diagnostic/torch_models/code_sample.py +343 -0
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +38 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +7 -3
- onnx_diagnostic/torch_models/validate.py +73 -29
- {onnx_diagnostic-0.7.14.dist-info → onnx_diagnostic-0.7.16.dist-info}/METADATA +6 -6
- {onnx_diagnostic-0.7.14.dist-info → onnx_diagnostic-0.7.16.dist-info}/RECORD +25 -23
- {onnx_diagnostic-0.7.14.dist-info → onnx_diagnostic-0.7.16.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.14.dist-info → onnx_diagnostic-0.7.16.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.14.dist-info → onnx_diagnostic-0.7.16.dist-info}/top_level.txt +0 -0
onnx_diagnostic/torch_export_patches/patches/patch_torch.py

@@ -25,8 +25,8 @@ def retrieve_stacktrace():
 
 def _catch_produce_guards_and_solve_constraints(
     previous_function: Callable,
-    fake_mode:
-    gm:
+    fake_mode: FakeTensorMode,
+    gm: torch.fx.GraphModule,
     dynamic_shapes: Union[Dict[str, Any], Tuple[Any], List[Any], None],
     equalities_inputs: "EqualityConstraint",  # noqa: F821
     original_signature: inspect.Signature,
@@ -88,7 +88,7 @@ def patch__check_input_constraints_for_graph(
 
 def patched_infer_size(a, b):
     """Patches ``torch._subclasses.fake_impls.infer_size``."""
-    from torch.fx.experimental.symbolic_shapes import
+    from torch.fx.experimental.symbolic_shapes import guard_or_false
 
     dimsA = len(a)
     dimsB = len(b)
@@ -113,19 +113,19 @@ def patched_infer_size(a, b):
         # were not the case, we'd need to write this using torch.sym_or() or
         # something like that).
         try:
-            b1 =
+            b1 = guard_or_false(sizeA == 1)
         except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
             b1 = False
         try:
-            b2 =
+            b2 = guard_or_false(sizeB == 1)
         except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
             b2 = False
         try:
-            b3 =
+            b3 = guard_or_false(sizeA == sizeB)
         except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
             b3 = False
         if b1 or b2 or b3:
-            expandedSizes[i] = sizeB if
+            expandedSizes[i] = sizeB if guard_or_false(sizeA == 1) else sizeA
         else:
             # PATCHED: generic case, the dimension is known, no need to assert
             expandedSizes[i] = torch.sym_max(sizeA, sizeB)
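The rewritten branch keeps export going when a size is symbolic: each equality is tested with guard_or_false, and when none of them can be decided the output dimension falls back to torch.sym_max instead of asserting. A minimal standalone sketch of that decision rule (the `known` callable stands in for guard_or_false, which returns False whenever the comparison cannot be decided without a guard):

import torch

def broadcast_dim(sizeA, sizeB, known=lambda cond: bool(cond)):
    # 'known' plays the role of guard_or_false: True only when the comparison
    # can be decided; with unbacked symbolic sizes it would return False.
    if known(sizeA == 1) or known(sizeB == 1) or known(sizeA == sizeB):
        return sizeB if known(sizeA == 1) else sizeA
    # undecidable at export time: keep a symbolic max instead of asserting
    return torch.sym_max(sizeA, sizeB)

assert broadcast_dim(1, 7) == 7
assert broadcast_dim(5, 5) == 5
# with two sizes that cannot be compared, the fallback keeps the max
assert broadcast_dim(3, 7, known=lambda cond: False) == 7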
@@ -137,7 +137,6 @@ def patched__broadcast_shapes(*_shapes):
     from functools import reduce
     from torch._prims_common import IntLike
     from torch.fx.experimental.symbolic_shapes import (
-        guard_size_oblivious,
         guard_or_false,
         is_nested_int,
     )
@@ -174,13 +173,15 @@ def patched__broadcast_shapes(*_shapes):
                 continue
             # PATCHED: two cases, if == for sure, no broadcast,
             # otherwise maybe broadcast with max(dimensions)
-            if
+            if guard_or_false(common_shape[idx] != 1):
+                pass
+            elif guard_or_false(common_shape[idx] == 1) or guard_or_false(shape[idx] != 1):
                 if shape[idx] < 0:
                     raise ValueError(
                         "Attempting to broadcast a dimension with negative length!"
                     )
                 common_shape[idx] = shape[idx]
-
+            else:
                 common_shape[idx] = torch.sym_max(common_shape[idx], shape[idx])
 
     return common_shape
@@ -360,6 +361,10 @@ class patched_ShapeEnv:
             },
         )
 
+        for source in self.var_to_sources.get(a, []):
+            if user_tb:
+                self.specialization_stacks[source] = user_tb
+
         # PATCHED: removed lines
         # if config.print_specializations:
         #     self.log.warning(
@@ -973,15 +978,101 @@ def patched__broadcast_in_dim_meta(
                     new_strides.append(a.stride()[original_idx])
                 else:
                     new_strides.append(0)
+            # PATCHED: disabled this check
+            elif guard_or_false(a.shape[original_idx] != 1):
+                new_strides.append(a.stride()[original_idx])
             else:
-                #
-                #
-                #
-                #
-                #
-
-
-
+                # This checks generates the following issue:
+                # non-broadcasting semantics require s3 == Max(s10, s3), False,
+                # guard_or_false(a.shape[idx]==1)=False, a.stride()=(1, 2),
+                # idx=1, a.shape=torch.Size([2, s3]), shape=[2, Max(s10, s3)],
+                # original_idx=1
+                torch._check(
+                    a.shape[original_idx] == shape[idx],
+                    lambda idx=idx, original_idx=original_idx: (
+                        f"non-broadcasting semantics require "
+                        f"{a.shape[original_idx]} == {shape[idx]}, "
+                        f"{guard_or_false(a.shape[idx] != 1)}, "
+                        f"guard_or_false(a.shape[idx]==1)="
+                        f"{guard_or_false(a.shape[idx] == 1)}, "
+                        f"a.stride()={a.stride()}, idx={idx}, a.shape={a.shape}, "
+                        f"shape={shape}, original_idx={original_idx}"
+                    ),
+                )
+                new_strides.append(a.stride()[original_idx])
+            original_idx = original_idx + 1
+        else:
+            if guard_or_true(shape[idx] != 1):
+                # consistent with previous use of guard_size_oblivious
+                new_strides.append(0)
+            elif original_idx == a.ndim:
+                new_strides.append(1)
+            else:
+                new_strides.append(a.stride()[original_idx] * a.size()[original_idx])
+
+    return a.as_strided(shape, new_strides, a.storage_offset())
+
+
+def patched__broadcast_in_dim_meta_level_2(
+    a: torch._prims_common.TensorLikeType,
+    shape: torch._prims_common.ShapeType,
+    broadcast_dimensions: Sequence[int],
+):
+    """Patches ``torch._prims._broadcast_in_dim_meta``."""
+    from torch.fx.experimental.symbolic_shapes import (
+        guard_or_false,
+        guard_or_true,
+        sym_or,
+    )
+
+    # Type checks
+    assert isinstance(a, torch._prims_common.TensorLike)
+    assert isinstance(shape, Sequence)
+    assert isinstance(broadcast_dimensions, Sequence)
+
+    # every dimension must be accounted for
+    assert a.ndim == len(broadcast_dimensions)
+
+    # broadcast shape must have weakly more dimensions
+    assert len(shape) >= a.ndim
+
+    # broadcast_dimensions must be an ascending sequence
+    # (no relative reordering of dims) of integers and
+    # each dimension must be within the new shape
+    def _greater_than_reduce(acc, x):
+        assert isinstance(x, (int, torch.export.Dim)), f"unexpected type {type(x)} for x"
+        assert x > acc
+        assert x < len(shape)
+
+        return x
+
+    reduce(_greater_than_reduce, broadcast_dimensions, -1)
+
+    # shape must be broadcastable to
+    for idx, new_idx in enumerate(broadcast_dimensions):
+        torch._check(
+            sym_or(a.shape[idx] == 1, shape[new_idx] == a.shape[idx]),
+            lambda idx=idx, new_idx=new_idx: (
+                f"{a.shape[idx]} must be broadcastable to {shape[new_idx]}"
+            ),
+        )
+
+    new_strides = []
+    original_idx = 0
+    for idx in range(len(shape)):
+        if idx in broadcast_dimensions:
+            # Assigns a stride of zero to dimensions
+            # which were actually broadcast
+            if guard_or_false(a.shape[original_idx] == 1):
+                if guard_or_false(a.shape[original_idx] == shape[idx]):
+                    new_strides.append(a.stride()[original_idx])
+                else:
+                    new_strides.append(0)
+            # PATCHED: disabled this check
+            elif guard_or_false(a.shape[original_idx] != 1):
+                new_strides.append(a.stride()[original_idx])
+            else:
+                # PATCHED: torch._check was removed
                 new_strides.append(a.stride()[original_idx])
             original_idx = original_idx + 1
         else:
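For reference, what _broadcast_in_dim_meta ultimately produces is a view whose broadcast dimensions get stride 0. A small sketch with concrete shapes, independent of the patch, showing the stride computation the function performs:

import torch

a = torch.arange(6.0).reshape(2, 3, 1)          # shape (2, 3, 1)
target = (2, 3, 4)
# every input dimension is kept here, so broadcast_dimensions = (0, 1, 2)
new_strides = [
    a.stride(d) if a.shape[d] != 1 else 0        # stride 0 where the size was 1
    for d in range(a.ndim)
]
view = a.as_strided(target, new_strides, a.storage_offset())
torch.testing.assert_close(view, a.expand(target))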
onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

@@ -1019,6 +1019,26 @@ def patched__compute_dynamic_ntk_parameters(
     return inv_freq, attention_factor
 
 
+def _get_rope_init_fn(self, layer_type=None) -> Callable:
+    if hasattr(self, "rope_init_fn"):
+        # transformers<=5.0
+        rope_init_fn = (
+            patched__compute_dynamic_ntk_parameters
+            if self.rope_init_fn
+            is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters
+            else self.rope_init_fn
+        )
+        return rope_init_fn
+
+    rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
+    rope_init_fn = self.compute_default_rope_parameters
+    if rope_type != "default":
+        rope_init_fn = transformers.modeling_rope_utils.ROPE_INIT_FUNCTIONS[self.rope_type]
+    if rope_init_fn is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters:
+        return patched__compute_dynamic_ntk_parameters
+    return rope_init_fn
+
+
 def patched_dynamic_rope_update(rope_forward):
     """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``
 
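A rough sketch of the dictionary dispatch `_get_rope_init_fn` performs when `rope_init_fn` is no longer an attribute (newer transformers). It assumes ROPE_INIT_FUNCTIONS and the private _compute_dynamic_ntk_parameters are importable as in current transformers; the patched replacement is passed in explicitly so the snippet stays self-contained:

from transformers.modeling_rope_utils import (
    ROPE_INIT_FUNCTIONS,
    _compute_dynamic_ntk_parameters,
)

def pick_rope_init_fn(rope_type, patched_dynamic_ntk=None):
    # rope_type -> init function lookup, falling back to the default rule
    fn = ROPE_INIT_FUNCTIONS.get(rope_type, ROPE_INIT_FUNCTIONS["default"])
    if patched_dynamic_ntk is not None and fn is _compute_dynamic_ntk_parameters:
        # only the dynamic NTK variant gets swapped for the patched version
        return patched_dynamic_ntk
    return fn

print(pick_rope_init_fn("default").__name__)
print(pick_rope_init_fn("dynamic", patched_dynamic_ntk=lambda *a, **kw: None))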
@@ -1082,22 +1102,27 @@ def patched_dynamic_rope_update(rope_forward):
 
     """
 
-    def longrope_frequency_update(self, position_ids, device):
+    def longrope_frequency_update(self, position_ids, device, layer_type=None):
         # It is no use to patch the function after the model is created
         # as rope_init_fn is an attribute set to one function when the model
         # is created and when no patch is applied yet.
         # So we select the patched version here.
-        rope_init_fn = (
-            patched__compute_dynamic_ntk_parameters
-            if self.rope_init_fn
-            is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters
-            else self.rope_init_fn
-        )
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
         if hasattr(self.config, "original_max_position_embeddings"):
            original_max_position_embeddings = self.config.original_max_position_embeddings
         else:
            original_max_position_embeddings = self.config.max_position_embeddings
+
+        if layer_type is None:
+            # rope_type = self.rope_type
+            original_inv_freq = self.original_inv_freq
+            prefix = ""
+        else:
+            # rope_type = self.rope_type[layer_type]
+            original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
+            prefix = f"{layer_type}_"
+
         # At export time, seq_len is unknown.
         long_inv_freq, _ = rope_init_fn(
             self.config, device, seq_len=original_max_position_embeddings + 1
@@ -1112,13 +1137,13 @@ def patched_dynamic_rope_update(rope_forward):
             (lambda x, y: y.clone()),
             [long_inv_freq, original_inv_freq],
         )
-        self
+        setattr(self, f"{prefix}inv_freq", inv_freq)
         # if seq_len > original_max_position_embeddings:
         #     self.inv_freq = self.long_inv_freq
         # else:
         #     self.inv_freq = self.original_inv_freq
 
-    def dynamic_frequency_update(self, position_ids, device):
+    def dynamic_frequency_update(self, position_ids, device, layer_type=None):
         # constructor:
         # - self.max_seq_len_cached = config.max_position_embeddings
         # - self.original_max_seq_len = config.max_position_embeddings
@@ -1128,12 +1153,7 @@ def patched_dynamic_rope_update(rope_forward):
         # as rope_init_fn is an attribute set to one function when the model
        # is created and when no patch is applied yet.
         # So we select the patched version here.
-        rope_init_fn = (
-            patched__compute_dynamic_ntk_parameters
-            if self.rope_init_fn
-            is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters
-            else self.rope_init_fn
-        )
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
 
         # This behaviour is difficult to translate.
         # The sequence always grows.
@@ -1162,6 +1182,19 @@ def patched_dynamic_rope_update(rope_forward):
             self.config, device, seq_len=seq_len
         )
 
+        if layer_type is None:
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
+            original_inv_freq = self.original_inv_freq
+            prefix = ""
+        else:
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
+            original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
+            prefix = f"{layer_type}_"
+
         # Second test to translate.
         # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
         # But in that case the following condition is a way to restore the original cache.
@@ -1183,15 +1216,26 @@ def patched_dynamic_rope_update(rope_forward):
             (lambda x, y: y.clone()),
             [long_inv_freq, original_inv_freq],
         )
-        self
+        setattr(self, f"{prefix}inv_freq", inv_freq)
 
     @wraps(rope_forward)
-    def wrapper(self, x, position_ids):
+    def wrapper(self, x, position_ids, layer_type=None):
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
         if "dynamic" in self.rope_type:
-            dynamic_frequency_update(
+            dynamic_frequency_update(
+                self, position_ids, device=x.device, layer_type=layer_type
+            )
         elif self.rope_type == "longrope":
-            longrope_frequency_update(
-
+            longrope_frequency_update(
+                self, position_ids, device=x.device, layer_type=layer_type
+            )
+        return rope_forward(self, x, position_ids, layer_type=layer_type)
 
     return wrapper
 
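The wrapper only refreshes the inverse-frequency buffers before delegating, and the new layer_type argument is threaded through so the same decorator covers both the transformers<5.0 and transformers>=5.0 call signatures. A stripped-down sketch of that pattern with illustrative names (ToyRope and refresh are not the actual classes or helpers):

from functools import wraps

def rope_update(forward):
    @wraps(forward)
    def wrapper(self, x, position_ids, layer_type=None):
        # refresh state first, then call the original forward with the
        # same signature it was called with
        self.refresh(position_ids, layer_type=layer_type)
        if layer_type is None:          # legacy (transformers<5.0) signature
            return forward(self, x, position_ids)
        return forward(self, x, position_ids, layer_type=layer_type)
    return wrapper

class ToyRope:
    def refresh(self, position_ids, layer_type=None):
        self.last_refresh = (len(position_ids), layer_type)

    @rope_update
    def forward(self, x, position_ids, layer_type=None):
        return f"forward(layer_type={layer_type})"

r = ToyRope()
print(r.forward(None, [0, 1, 2]))                     # legacy call
print(r.forward(None, [0, 1, 2], layer_type="full"))  # per-layer-type call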
@@ -1232,6 +1276,60 @@ def common_eager_attention_forward(
     return attn_output, attn_weights
 
 
+def patched_sdpa_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    is_causal: Optional[bool] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, None]:
+    """[patch:transformers.integrations.sdpa_attention.sdpa_attention_forward]"""
+    assert not kwargs.get("output_attentions", False), (
+        "`sdpa` attention does not support `output_attentions=True`."
+        " Please set your attention to `eager` if you want any of these features."
+    )
+    sdpa_kwargs = {}
+    if hasattr(module, "num_key_value_groups"):
+        if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
+            key = transformers.integrations.sdpa_attention.repeat_kv(
+                key, module.num_key_value_groups
+            )
+            value = transformers.integrations.sdpa_attention.repeat_kv(
+                value, module.num_key_value_groups
+            )
+        else:
+            sdpa_kwargs = {"enable_gqa": True}
+
+    if attention_mask is not None and attention_mask.ndim == 4:
+        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
+    is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
+    # PATCHED: remove the test query.shape[2] > 1
+    # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+    is_causal = attention_mask is None and is_causal
+
+    torch._check(
+        attention_mask is None or attention_mask.shape[3] == key.shape[2],
+        "Attention mask shape incompatible with key shape.",
+    )
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query,
+        key,
+        value,
+        attn_mask=attention_mask,
+        dropout_p=dropout,
+        scale=scaling,
+        is_causal=is_causal,
+        **sdpa_kwargs,
+    )
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, None
+
+
 def patched_model_bart_eager_attention_forward(
     module: torch.nn.Module,
     query: torch.Tensor,
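The patched forward ends in a plain call to torch.nn.functional.scaled_dot_product_attention. A minimal sketch of the grouped-query case it handles, repeating key/value heads by hand (repeat_interleave plays the role of transformers' repeat_kv here) instead of relying on enable_gqa:

import torch
import torch.nn.functional as F

B, H_q, H_kv, S, D = 1, 8, 2, 5, 16
query = torch.randn(B, H_q, S, D)
key = torch.randn(B, H_kv, S, D)
value = torch.randn(B, H_kv, S, D)

# grouped-query attention: repeat each kv head to match the query heads
groups = H_q // H_kv
key = key.repeat_interleave(groups, dim=1)
value = value.repeat_interleave(groups, dim=1)

out = F.scaled_dot_product_attention(query, key, value, is_causal=True)
out = out.transpose(1, 2).contiguous()   # back to (B, S, H, D) as in the patch
print(out.shape)                          # torch.Size([1, 5, 8, 16])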
@@ -1287,12 +1385,18 @@ class common_RotaryEmbedding(torch.nn.Module):
     # @torch.no_grad()
     # PATCHED: the decorator
     @patched_dynamic_rope_update
-    def forward(self, x, position_ids):
+    def forward(self, x, position_ids, layer_type=None):
+        if layer_type is not None:
+            # transformers>=5.0
+            inv_freq = getattr(self, f"{layer_type}_inv_freq")
+            attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+        else:
+            # transformers<5.0
+            inv_freq = self.inv_freq
+            attention_scaling = self.attention_scaling
+
         inv_freq_expanded = (
-
-            .float()
-            .expand(position_ids.shape[0], -1, 1)
-            .to(x.device)
+            inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
         )
         position_ids_expanded = position_ids[:, None, :].float()
 
@@ -1304,8 +1408,8 @@ class common_RotaryEmbedding(torch.nn.Module):
         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos() *
-            sin = emb.sin() *
+            cos = emb.cos() * attention_scaling
+            sin = emb.sin() * attention_scaling
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
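The forward above is the usual rotary-embedding table computation; a compact standalone sketch of the same math with made-up sizes:

import torch

head_dim, base = 16, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
position_ids = torch.arange(6)[None, :]            # (batch=1, seq=6)
attention_scaling = 1.0

inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)  # (1, 6, head_dim//2)
emb = torch.cat((freqs, freqs), dim=-1)            # (1, 6, head_dim)
cos, sin = emb.cos() * attention_scaling, emb.sin() * attention_scaling
print(cos.shape, sin.shape)                         # torch.Size([1, 6, 16]) twice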
@@ -1380,7 +1484,8 @@ class patched_IdeficsEmbedding(torch.nn.Module):
 
         def _set_cos_sin_cache_then(x, inv_freq, seq_len, _cos_cached, _sin_cached):
             t = torch.arange(seq_len, device=x.device, dtype=torch.int64).type_as(inv_freq)
-            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            # freqs = torch.einsum("i,j->ij", t, inv_freq)
+            freqs = t.reshape((-1, 1)) * inv_freq.reshape((1, -1))
             emb = torch.cat((freqs, freqs), dim=-1)
             return emb.cos().to(x.dtype), emb.sin().to(x.dtype)
 
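The last change swaps torch.einsum for an explicit outer product; both forms compute the same frequency table, as a quick check confirms:

import torch

t = torch.arange(10, dtype=torch.float32)
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 32, 2).float() / 32))

freqs_einsum = torch.einsum("i,j->ij", t, inv_freq)
freqs_outer = t.reshape((-1, 1)) * inv_freq.reshape((1, -1))
torch.testing.assert_close(freqs_einsum, freqs_outer)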