onnx-diagnostic 0.8.2-py3-none-any.whl → 0.8.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +412 -12
- onnx_diagnostic/export/api.py +111 -8
- onnx_diagnostic/export/control_flow.py +48 -345
- onnx_diagnostic/export/control_flow_onnx.py +528 -0
- onnx_diagnostic/export/control_flow_research.py +12 -7
- onnx_diagnostic/export/onnx_plug.py +531 -0
- onnx_diagnostic/ext_test_case.py +163 -48
- onnx_diagnostic/helpers/cache_helper.py +1 -1
- onnx_diagnostic/helpers/dot_helper.py +222 -0
- onnx_diagnostic/helpers/helper.py +108 -37
- onnx_diagnostic/helpers/mini_onnx_builder.py +3 -1
- onnx_diagnostic/helpers/model_builder_helper.py +27 -0
- onnx_diagnostic/helpers/onnx_helper.py +531 -6
- onnx_diagnostic/helpers/ort_session.py +45 -19
- onnx_diagnostic/helpers/torch_fx_graph_helper.py +164 -0
- onnx_diagnostic/helpers/torch_helper.py +131 -8
- onnx_diagnostic/reference/ort_evaluator.py +228 -46
- onnx_diagnostic/tasks/feature_extraction.py +15 -14
- onnx_diagnostic/tasks/summarization.py +72 -137
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py +236 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_cache_utils.py +50 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_causal_mask.py +89 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_dynamic_cache.py +177 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_gemma3.py +54 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_generation_mixin.py +486 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_idefics.py +156 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py +173 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2.py +99 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py +735 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen3.py +106 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_rotary_embedding.py +412 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_sam_mask_decoder.py +132 -0
- onnx_diagnostic/torch_export_patches/patches/patch_helper.py +28 -0
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +64 -2608
- onnx_diagnostic/torch_models/code_sample.py +2 -1
- onnx_diagnostic/torch_models/hghub/model_inputs.py +34 -7
- onnx_diagnostic/torch_models/validate.py +64 -2
- onnx_diagnostic/torch_onnx/runtime_info.py +1 -24
- onnx_diagnostic/torch_onnx/sbs.py +969 -312
- onnx_diagnostic/torch_onnx/sbs_dataclasses.py +535 -0
- {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/RECORD +46 -27
- {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/top_level.txt +0 -0
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py
@@ -0,0 +1,173 @@
+from typing import Callable, List, Optional, Tuple
+import torch
+
+try:
+    import transformers.masking_utils  # noqa: F401
+
+    patch_masking_utils = True
+except ImportError:
+    patch_masking_utils = False
+
+
+if patch_masking_utils:
+    # Introduced in 4.52
+    from transformers.masking_utils import (
+        _ignore_causal_mask_sdpa,
+        and_masks,
+        causal_mask_function,
+        padding_mask_function,
+        prepare_padding_mask,
+    )
+
+    try:
+        # transformers>=5.0
+        from transformers.masking_utils import (
+            _ignore_bidirectional_mask_sdpa,
+            bidirectional_mask_function,
+        )
+    except ImportError:
+        _ignore_bidirectional_mask_sdpa = None
+        bidirectional_mask_function = None
+
+    def patched__vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
+        """manual patch for function ``transformers.masking_utils._vmap_for_bhqkv``."""
+        from ...helpers import string_type
+
+        dimensions: List[Tuple[Optional[int], ...]] = [
+            (None, None, None, 0),
+            (None, None, 0, None),
+        ]
+        if bh_indices:
+            dimensions.extend([(None, 0, None, None), (0, None, None, None)])
+        # reshape
+        dimensions = [tuple(1 if d is None else -1 for d in shape) for shape in dimensions]
+        dimensions = tuple(reversed(dimensions))
+        indices = tuple(shape.index(-1) for shape in dimensions)
+
+        # unsqueeze
+        udimensions = [
+            tuple(di for di, d in enumerate(shape) if d == 1) for shape in dimensions
+        ]
+
+        def vector_mask_function(
+            *args, mask_function=mask_function, dimensions=dimensions, indices=indices
+        ):
+            assert len(args) == len(dimensions) == len(udimensions), (
+                f"Mismatch between args={string_type(args)} and dimensions={dimensions} "
+                f"and udimensions={udimensions}."
+            )
+            assert len(indices) == len(args), (
+                f"Mismatch between args={string_type(args)} and indices={indices}, "
+                f"they should have the same length."
+            )
+            for a in args:
+                assert (
+                    a.ndim == 1
+                ), f"Expected a tensor with 1 dimension not {string_type(a, with_shape=True)}"
+                torch._check(a.shape[0] > 0)
+
+            new_args = [a.reshape(shape) for a, shape in zip(args, dimensions)]
+            # new_args = [
+            #     a.unsqueeze(dims[0]).unsqueeze(dims[1]).unsqueeze(dims[2])
+            #     for a, dims in zip(args, udimensions)
+            # ]
+            max_shape = tuple(args[i].shape[0] for i in indices)
+            # if _is_torchdynamo_exporting():
+            #     for a in args:
+            #         # The exporter should export with a dimension > 1
+            #         # to make sure it is dynamic.
+            #         torch._check(a.shape[0] > 1)
+            expanded_args = [a.expand(max_shape) for a in new_args]
+            return mask_function(*expanded_args)
+
+        return vector_mask_function
+
+    def patched_eager_mask(
+        batch_size: int,
+        cache_position: torch.Tensor,
+        kv_length: int,
+        kv_offset: int = 0,
+        mask_function: Callable = causal_mask_function,
+        attention_mask: Optional[torch.Tensor] = None,
+        dtype: torch.dtype = torch.float32,
+        **kwargs,
+    ) -> torch.Tensor:
+        """manual patch for function ``transformers.masking_utils.eager_mask``."""
+        # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
+        _ = kwargs.pop("allow_is_causal_skip", None)
+        _ = kwargs.pop("allow_is_bidirectional_skip", None)
+        # PATCHED: this line calls the patched version of sdpa_mask
+        mask = patched_sdpa_mask_recent_torch(
+            batch_size=batch_size,
+            cache_position=cache_position,
+            kv_length=kv_length,
+            kv_offset=kv_offset,
+            mask_function=mask_function,
+            attention_mask=attention_mask,
+            allow_is_causal_skip=False,
+            allow_is_bidirectional_skip=False,
+            allow_torch_fix=False,
+            **kwargs,
+        )
+        min_dtype = torch.finfo(dtype).min
+        # PATCHED: the following line
+        # we need 0s where the tokens should be taken into account,
+        # and -inf otherwise (mask is already of boolean type)
+        # mask =
+        #   torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), min_dtype)
+        mask = (~mask).to(dtype) * min_dtype
+        return mask
+
+    def patched_sdpa_mask_recent_torch(
+        batch_size: int,
+        cache_position: torch.Tensor,
+        kv_length: int,
+        kv_offset: int = 0,
+        mask_function: Callable = causal_mask_function,
+        attention_mask: Optional[torch.Tensor] = None,
+        local_size: Optional[int] = None,
+        allow_is_causal_skip: bool = True,
+        allow_is_bidirectional_skip: bool = False,
+        **kwargs,
+    ) -> Optional[torch.Tensor]:
+        """manual patch for function ``transformers.masking_utils.sdpa_mask_recent_torch``."""
+        q_length = cache_position.shape[0]
+        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)
+        if allow_is_causal_skip and _ignore_causal_mask_sdpa(
+            padding_mask, q_length, kv_length, kv_offset, local_size
+        ):
+            return None
+        if (
+            allow_is_bidirectional_skip
+            and _ignore_bidirectional_mask_sdpa
+            and _ignore_bidirectional_mask_sdpa(padding_mask)
+        ):
+            return None
+
+        if mask_function is bidirectional_mask_function:
+            if padding_mask is not None:
+                # used for slicing without data-dependent slicing
+                mask_indices = (
+                    torch.arange(kv_length, device=cache_position.device) + kv_offset
+                )
+                return padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
+            return torch.ones(
+                batch_size,
+                1,
+                q_length,
+                kv_length,
+                dtype=torch.bool,
+                device=cache_position.device,
+            )
+
+        kv_arange = torch.arange(kv_length, device=cache_position.device)
+        kv_arange += kv_offset
+        if padding_mask is not None:
+            mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+        batch_arange = torch.arange(batch_size, device=cache_position.device)
+        head_arange = torch.arange(1, device=cache_position.device)
+        # PATCHED: this line calls the patched version of vmap_for_bhqkv
+        causal_mask = patched__vmap_for_bhqkv(mask_function)(
+            batch_arange, head_arange, cache_position, kv_arange
+        )
+        return causal_mask
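The patched `_vmap_for_bhqkv` above avoids `torch.vmap`: it reshapes each 1D index tensor so it broadcasts along its own axis, expands all of them to a common (batch, heads, q_length, kv_length) shape, and calls the mask predicate once on the expanded tensors, which is presumably easier for torch.export to trace. A minimal standalone sketch of that broadcasting trick, checked against an explicit loop (the `causal` and `broadcast_mask` names below are illustrative, not part of the package):

import torch


def causal(batch_idx, head_idx, q_idx, kv_idx):
    # same predicate as transformers' causal_mask_function
    return kv_idx <= q_idx


def broadcast_mask(mask_function, batch, heads, cache_position, kv_arange):
    # reshape each 1D index tensor so it broadcasts along its own axis,
    # then expand everything to (batch, heads, q_length, kv_length)
    args = (batch, heads, cache_position, kv_arange)
    shapes = [(-1, 1, 1, 1), (1, -1, 1, 1), (1, 1, -1, 1), (1, 1, 1, -1)]
    new_args = [a.reshape(s) for a, s in zip(args, shapes)]
    max_shape = tuple(a.shape[0] for a in args)
    return mask_function(*[a.expand(max_shape) for a in new_args])


batch = torch.arange(2)
heads = torch.arange(1)
cache_position = torch.arange(4)
kv_arange = torch.arange(6)

got = broadcast_mask(causal, batch, heads, cache_position, kv_arange)

# reference mask computed with explicit loops
ref = torch.zeros(2, 1, 4, 6, dtype=torch.bool)
for b in range(2):
    for h in range(1):
        for q in range(4):
            for kv in range(6):
                ref[b, h, q, kv] = causal(batch[b], heads[h], cache_position[q], kv_arange[kv])

assert torch.equal(got, ref)

The version in the package keeps the generic `dimensions`/`indices` bookkeeping so it also covers the `bh_indices=False` case, but the underlying idea is the same.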
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2.py
@@ -0,0 +1,99 @@
+from typing import Optional, Tuple
+import torch
+
+try:
+    import transformers.models.qwen2_vl
+
+    patch_qwen2 = True
+except ImportError:
+    patch_qwen2 = False
+
+
+def rewrite_loop_for_square_mask(mask: torch.Tensor, seq: torch.Tensor):
+    """
+    Rewrites the loop in:
+
+    .. code-block:: python
+
+        attention_mask = torch.full(
+            [1, seq_length, seq_length], torch.finfo(q.dtype).min, dtype=q.dtype
+        )
+        for i in range(1, len(seq)):
+            attention_mask[..., seq[i - 1] : seq[i], seq[i - 1] : seq[i]] = 0
+    """
+    r = torch.arange(0, mask.shape[-1], dtype=torch.int64)
+    less0 = (r.reshape((-1, 1)) < seq.reshape((1, -1))).to(torch.int64)
+    less = less0.sum(axis=-1, keepdim=True) + 1
+    sq = less * less.T
+    look = (
+        torch.max(seq.min() == 0, less != less.max())
+        * torch.max(seq.max() == mask.shape[-1], less != less.min())
+        * less
+    )
+    filt = (sq != look**2).to(mask.dtype)
+    return mask * filt
+
+
+if patch_qwen2:
+
+    class patched_VisionAttention(torch.nn.Module):
+        _PATCHES_ = ["forward"]
+        _PATCHED_CLASS_ = transformers.models.qwen2_vl.modeling_qwen2_vl.VisionAttention
+
+        def forward(
+            self,
+            hidden_states: torch.Tensor,
+            cu_seqlens: torch.Tensor,
+            rotary_pos_emb: Optional[torch.Tensor] = None,
+            position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        ) -> torch.Tensor:
+            seq_length = hidden_states.shape[0]
+            q, k, v = (
+                self.qkv(hidden_states)
+                .reshape(seq_length, 3, self.num_heads, -1)
+                .permute(1, 0, 2, 3)
+                .unbind(0)
+            )
+            if position_embeddings is None:
+                transformers.models.qwen2_vl.modeling_qwen2_vl.logger.warning_once(
+                    "The attention layers in this model are transitioning from "
+                    " computing the RoPE embeddings internally "
+                    "through `rotary_pos_emb` (2D tensor of RoPE theta values), "
+                    "to using externally computed "
+                    "`position_embeddings` (Tuple of tensors, containing cos and sin)."
+                    " In v4.54 `rotary_pos_emb` will be "
+                    "removed and `position_embeddings` will be mandatory."
+                )
+                emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+                cos = emb.cos()
+                sin = emb.sin()
+            else:
+                cos, sin = position_embeddings
+            q, k = transformers.models.qwen2_vl.modeling_qwen2_vl.apply_rotary_pos_emb_vision(
+                q, k, cos, sin
+            )
+
+            attention_mask = torch.full(
+                [1, seq_length, seq_length],
+                torch.finfo(q.dtype).min,
+                device=q.device,
+                dtype=q.dtype,
+            )
+            # for i in range(1, len(cu_seqlens)):
+            #     attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i],
+            #     cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+            attention_mask = rewrite_loop_for_square_mask(attention_mask, cu_seqlens)
+
+            q = q.transpose(0, 1)
+            k = k.transpose(0, 1)
+            v = v.transpose(0, 1)
+            attn_weights = torch.matmul(q, k.transpose(1, 2)) / (self.head_dim**0.5)
+            attn_weights = attn_weights + attention_mask
+            attn_weights = torch.nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_output = torch.matmul(attn_weights, v)
+            attn_output = attn_output.transpose(0, 1)
+            attn_output = attn_output.reshape(seq_length, -1)
+            attn_output = self.proj(attn_output)
+            return attn_output
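The `rewrite_loop_for_square_mask` helper above replaces the data-dependent Python loop over `cu_seqlens` with plain tensor arithmetic, so the block-diagonal attention mask in `VisionAttention.forward` stays traceable. A quick sanity check against the original loop, assuming onnx-diagnostic 0.8.4 is installed; the `cu_seqlens` values are made up for the example:

import torch

from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2 import (
    rewrite_loop_for_square_mask,
)

seq_length = 8
cu_seqlens = torch.tensor([0, 3, 5, 8])  # hypothetical cumulative sequence boundaries
min_value = torch.finfo(torch.float32).min

# reference: the original loop from Qwen2-VL's VisionAttention
expected = torch.full([1, seq_length, seq_length], min_value)
for i in range(1, len(cu_seqlens)):
    expected[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0

# vectorized rewrite used by the patch
mask = torch.full([1, seq_length, seq_length], min_value)
assert torch.equal(rewrite_loop_for_square_mask(mask, cu_seqlens), expected)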