onnx-diagnostic 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (46)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +412 -12
  3. onnx_diagnostic/export/api.py +111 -8
  4. onnx_diagnostic/export/control_flow.py +48 -345
  5. onnx_diagnostic/export/control_flow_onnx.py +528 -0
  6. onnx_diagnostic/export/control_flow_research.py +12 -7
  7. onnx_diagnostic/export/onnx_plug.py +531 -0
  8. onnx_diagnostic/ext_test_case.py +163 -48
  9. onnx_diagnostic/helpers/cache_helper.py +1 -1
  10. onnx_diagnostic/helpers/dot_helper.py +222 -0
  11. onnx_diagnostic/helpers/helper.py +108 -37
  12. onnx_diagnostic/helpers/mini_onnx_builder.py +3 -1
  13. onnx_diagnostic/helpers/model_builder_helper.py +27 -0
  14. onnx_diagnostic/helpers/onnx_helper.py +531 -6
  15. onnx_diagnostic/helpers/ort_session.py +45 -19
  16. onnx_diagnostic/helpers/torch_fx_graph_helper.py +164 -0
  17. onnx_diagnostic/helpers/torch_helper.py +131 -8
  18. onnx_diagnostic/reference/ort_evaluator.py +228 -46
  19. onnx_diagnostic/tasks/feature_extraction.py +15 -14
  20. onnx_diagnostic/tasks/summarization.py +72 -137
  21. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py +236 -0
  22. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_cache_utils.py +50 -0
  23. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_causal_mask.py +89 -0
  24. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_dynamic_cache.py +177 -0
  25. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_gemma3.py +54 -0
  26. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_generation_mixin.py +486 -0
  27. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_idefics.py +156 -0
  28. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py +173 -0
  29. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2.py +99 -0
  30. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py +735 -0
  31. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen3.py +106 -0
  32. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_rotary_embedding.py +412 -0
  33. onnx_diagnostic/torch_export_patches/patches/_patch_transformers_sam_mask_decoder.py +132 -0
  34. onnx_diagnostic/torch_export_patches/patches/patch_helper.py +28 -0
  35. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +64 -2608
  36. onnx_diagnostic/torch_models/code_sample.py +2 -1
  37. onnx_diagnostic/torch_models/hghub/model_inputs.py +34 -7
  38. onnx_diagnostic/torch_models/validate.py +64 -2
  39. onnx_diagnostic/torch_onnx/runtime_info.py +1 -24
  40. onnx_diagnostic/torch_onnx/sbs.py +969 -312
  41. onnx_diagnostic/torch_onnx/sbs_dataclasses.py +535 -0
  42. {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/METADATA +1 -1
  43. {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/RECORD +46 -27
  44. {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/WHEEL +0 -0
  45. {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/licenses/LICENSE.txt +0 -0
  46. {onnx_diagnostic-0.8.2.dist-info → onnx_diagnostic-0.8.4.dist-info}/top_level.txt +0 -0
onnx_diagnostic/tasks/summarization.py
@@ -1,23 +1,16 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
 from ..helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
-from ..helpers.config_helper import (
-    update_config,
-    check_hasattr,
-    _pick,
-    default_num_hidden_layers as nhl,
-)
+from ..helpers.config_helper import update_config, check_hasattr

 __TASK__ = "summarization"


 def reduce_model_config(config: Any) -> Dict[str, Any]:
     """Reduces a model size."""
-    kwargs: Dict[str, Any] = {}
-    if hasattr(config, "num_decoder_layers"):
-        config.num_decoder_layers = min(config.num_decoder_layers, 2)
-    if hasattr(config, "num_hidden_layers"):
-        config.num_hidden_layers = min(config.num_hidden_layers, nhl())
+    check_hasattr(config, "vocab_size")
+    # Bart architecture does not like too much that the number of layers is changed.
+    kwargs = dict(vocab_size=2056)
     update_config(config, kwargs)
     return kwargs

@@ -25,96 +18,66 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
 def get_inputs(
     model: torch.nn.Module,
     config: Optional[Any],
+    batch_size: int,
+    sequence_length: int,
     dummy_max_token_id: int,
-    num_key_value_heads_encoder: int,
-    num_key_value_heads_decoder: int,
-    num_hidden_layers: int,
-    head_dim_encoder: int,
-    head_dim_decoder: int,
-    batch_size: int = 2,
-    sequence_length: int = 30,
-    sequence_length2: int = 3,
+    past_length: int = 30,
+    past_length2: int = 4,
+    decoder_attention_heads: Optional[int] = None,
+    encoder_attention_heads: Optional[int] = None,
+    encoder_ffn_dim: Optional[int] = None,
+    decoder_ffn_dim: Optional[int] = None,
+    num_hidden_layers: Optional[int] = None,
     add_second_input: int = 1,
     **kwargs,  # unused
 ):
     """
-    Generates input for task ``summarization``.
-
-    :param model: model to get the missing information
-    :param config: configuration used to generate the model
-    :param head_dim_encoder: last dimension of the cache for the encoder
-    :param head_dim_decoder: last dimension of the cache for the decoder
-    :param num_key_value_heads_encoder: number of heads for the encoder
-    :param num_key_value_heads_decoder: number of heads for the decoder
-    :param dummy_max_token_id: dummy max token id
-    :param batch_size: batch size
-    :param sequence_length: sequence length
-    :param sequence_length2: new sequence length
-    :return: dictionary
-
-    Stolen inputs for one model.
+    Generates inputs for task ``feature-extraction``.
+    Example:

     ::

-        cache_position:T7s1
-        past_key_values:EncoderDecoderCache(
-            self_attention_cache=DynamicCache(
-                key_cache=#6[T1s1x8x1x64,...],
-                value_cache=#6[T1s1x8x1x64,...]),
-            cross_attention_cache=DynamicCache(
-                key_cache=#6[T1s1x8x16x64,...],
-                value_cache=#6[T1s1x8x16x64,...])),
-        decoder_input_ids:T7s1x1,
-        encoder_outputs:dict(last_hidden_state:T1s1x16x512)
+        input_ids:T7s1x13[101,72654:A16789.23076923077],
+        token_type_ids:T7s1x13[0,0:A0.0],
+        attention_mask:T7s1x13[1,1:A1.0])
     """
     assert (
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
     batch = "batch"
-    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
-    cache_length = "cache_length_key"  # torch.export.Dim("cache_length", min=1, max=4096)
-    cache_length2 = "cache_length_val"  # torch.export.Dim("cache_length2", min=1, max=4096)
-
+    seq_length = "sequence_length"
     shapes = {
         "input_ids": {0: batch, 1: seq_length},
-        "decoder_input_ids": {0: batch, 1: "seq_ids"},
-        "attention_mask": {0: batch, 1: "seq_mask"},
-        # "cache_position": {0: batch, 1: torch.export.Dim.DYNAMIC},
-        "past_key_values": [
-            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)],
-            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers * 2)],
-        ],
-        # one these is selected based on the forward method signature
-        # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC},
-        # "encoder_outputs": {0: batch, 1: torch.export.Dim.DYNAMIC},
+        "attention_mask": {0: batch, 1: seq_length},
     }
-
     inputs = dict(
         input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to(
             torch.int64
         ),
-        decoder_input_ids=torch.randint(
-            0, dummy_max_token_id, (batch_size, sequence_length2)
-        ).to(torch.int64),
         attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64),
-        # cache_position=torch.arange(sequence_length, sequence_length + sequence_length2)
-        # .to(torch.int64)
-        # .expand((batch_size, -1)),
-        past_key_values=make_encoder_decoder_cache(
+    )
+    if (
+        encoder_attention_heads
+        and decoder_attention_heads
+        and encoder_ffn_dim
+        and decoder_ffn_dim
+        and num_hidden_layers
+    ):
+        inputs["past_key_values"] = make_encoder_decoder_cache(
             make_dynamic_cache(
                 [
                     (
                         torch.randn(
                             batch_size,
-                            num_key_value_heads_encoder,
-                            sequence_length,
-                            head_dim_encoder,
+                            encoder_attention_heads,
+                            past_length,
+                            encoder_ffn_dim,
                         ),
                         torch.randn(
                             batch_size,
-                            num_key_value_heads_encoder,
-                            sequence_length,
-                            head_dim_encoder,
+                            encoder_attention_heads,
+                            past_length,
+                            encoder_ffn_dim,
                         ),
                     )
                     for i in range(num_hidden_layers)
@@ -125,22 +88,28 @@ def get_inputs(
                     (
                         torch.randn(
                             batch_size,
-                            num_key_value_heads_decoder,
-                            sequence_length2,
-                            head_dim_decoder,
+                            decoder_attention_heads,
+                            past_length2,
+                            decoder_ffn_dim,
                         ),
                         torch.randn(
                             batch_size,
-                            num_key_value_heads_decoder,
-                            sequence_length2,
-                            head_dim_decoder,
+                            decoder_attention_heads,
+                            past_length2,
+                            decoder_ffn_dim,
                         ),
                     )
                     for i in range(num_hidden_layers)
                 ]
             ),
-        ),
-    )
+        )
+        cache_length = "cache_length_key"
+        cache_length2 = "cache_length_val"
+        shapes["past_key_values"] = [  # type: ignore[assignment]
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)],
+            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers * 2)],
+        ]
+
     res = dict(inputs=inputs, dynamic_shapes=shapes)
     if add_second_input:
         assert (
@@ -149,15 +118,16 @@ def get_inputs(
         res["inputs2"] = get_inputs(
             model=model,
             config=config,
-            dummy_max_token_id=dummy_max_token_id,
-            num_key_value_heads_encoder=num_key_value_heads_encoder,
-            num_key_value_heads_decoder=num_key_value_heads_decoder,
-            num_hidden_layers=num_hidden_layers,
-            head_dim_encoder=head_dim_encoder,
-            head_dim_decoder=head_dim_decoder,
             batch_size=batch_size + 1,
             sequence_length=sequence_length + add_second_input,
-            sequence_length2=sequence_length2 + 1,
+            dummy_max_token_id=dummy_max_token_id,
+            past_length=past_length,
+            past_length2=past_length2,
+            decoder_attention_heads=decoder_attention_heads,
+            encoder_attention_heads=encoder_attention_heads,
+            encoder_ffn_dim=encoder_ffn_dim,
+            decoder_ffn_dim=decoder_ffn_dim,
+            num_hidden_layers=num_hidden_layers,
             add_second_input=0,
             **kwargs,
         )["inputs"]
@@ -171,57 +141,22 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
     If the configuration is None, the function selects typical dimensions.
     """
     if config is not None:
-        check_hasattr(
-            config,
-            "vocab_size",
-            "hidden_size",
-            "num_attention_heads",
-            ("num_hidden_layers", "num_layers"),
-            ("n_positions", "d_model"),
-            (
-                "num_key_value_heads",
-                "num_heads",
-                ("decoder_attention_heads", "encoder_attention_heads"),
-            ),
-        )
-        # exceptions = {
-        #     "PLBartForConditionalGeneration": (
-        #         lambda c: c.encoder_attention_heads + c.decoder_attention_heads
-        #     )
-        # }
+        check_hasattr(config, "vocab_size")
     kwargs = dict(
         batch_size=2,
-        sequence_length=30,
-        sequence_length2=3,
-        head_dim_encoder=(
-            16 if config is None else int(_pick(config, "encoder_ffn_dim") ** 0.5)
-        ),
-        head_dim_decoder=(
-            16 if config is None else int(_pick(config, "decoder_ffn_dim") ** 0.5)
-        ),
-        dummy_max_token_id=31999 if config is None else config.vocab_size - 1,
-        num_hidden_layers=(
-            8 if config is None else _pick(config, "num_hidden_layers", "num_layers")
-        ),
-        num_key_value_heads_encoder=(
-            16
-            if config is None
-            else _pick(
-                config,
-                "encoder_attention_heads",
-                "num_key_value_heads",
-                "num_heads",
-            )
-        ),
-        num_key_value_heads_decoder=(
-            16
-            if config is None
-            else _pick(
-                config,
-                "decoder_attention_heads",
-                "num_key_value_heads",
-                "num_heads",
-            )
-        ),
+        sequence_length=12,
+        past_length=30,
+        past_length2=4,
+        dummy_max_token_id=31999 if config is None else (config.vocab_size - 1),
     )
+    for att in [
+        "decoder_attention_heads",
+        "encoder_attention_heads",
+        "encoder_ffn_dim",
+        "decoder_ffn_dim",
+        "num_hidden_layers",
+    ]:
+        if hasattr(config, att):
+            kwargs[att] = getattr(config, att)
+    kwargs["decoder_ffn_dim"] = kwargs["encoder_ffn_dim"] = 64
     return kwargs, get_inputs
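
For orientation, here is a minimal sketch of how the reworked helper could be invoked directly. Only the function name and parameter names come from the diff above; the call pattern and the concrete values are assumptions for illustration (in the package these helpers are normally driven by the task and validation machinery):

    from onnx_diagnostic.tasks.summarization import get_inputs

    # Build dummy encoder/decoder inputs, including a past_key_values cache,
    # because all cache dimensions are provided explicitly.
    data = get_inputs(
        model=None,            # stand-in: the model is not inspected in this code path
        config=None,
        batch_size=2,
        sequence_length=12,
        dummy_max_token_id=31999,
        past_length=30,
        past_length2=4,
        encoder_attention_heads=8,
        decoder_attention_heads=8,
        encoder_ffn_dim=64,
        decoder_ffn_dim=64,
        num_hidden_layers=2,
        add_second_input=0,    # skip the second input set
    )
    print(data["inputs"]["input_ids"].shape)  # torch.Size([2, 12])
    print(list(data["dynamic_shapes"]))       # ['input_ids', 'attention_mask', 'past_key_values']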
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py
@@ -0,0 +1,236 @@
+from typing import Optional
+import torch
+import transformers
+from .patch_helper import _has_transformers
+
+patch_sdpa_is_causal = _has_transformers("4.99")
+
+
+def common_eager_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5
+
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # PATCHED
+        # The two following lines were added.
+        if attention_mask is not None and attention_mask.ndim == 4:
+            attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_weights = torch.nn.functional.dropout(
+        attn_weights, p=dropout, training=module.training
+    )
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def patched_sdpa_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    is_causal: Optional[bool] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, None]:
+    """
+    manual patch for function
+    ``transformers.integrations.sdpa_attention.sdpa_attention_forward``
+    """
+    assert not kwargs.get("output_attentions", False), (
+        "`sdpa` attention does not support `output_attentions=True`."
+        " Please set your attention to `eager` if you want any of these features."
+    )
+    torch._check(
+        query.shape[0] == key.shape[0] or query.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (1): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+    torch._check(
+        key.shape[0] == value.shape[0] or key.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (2): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+
+    sdpa_kwargs = {}
+    if hasattr(module, "num_key_value_groups"):
+        if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
+            key = transformers.integrations.sdpa_attention.repeat_kv(
+                key, module.num_key_value_groups
+            )
+            value = transformers.integrations.sdpa_attention.repeat_kv(
+                value, module.num_key_value_groups
+            )
+        else:
+            sdpa_kwargs = {"enable_gqa": True}
+
+    if attention_mask is not None and attention_mask.ndim == 4:
+        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
+    torch._check(
+        attention_mask is None or attention_mask.shape[3] == key.shape[2],
+        lambda: "Attention mask shape incompatible with key shape.",
+    )
+
+    if patch_sdpa_is_causal:
+        # transformers>=4.55
+        is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
+
+        # PATCHED: remove the test query.shape[2] > 1
+        # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+        # and we split the test to keep the minimum in torch.cond
+        is_causal = attention_mask is None and is_causal
+
+        if not is_causal:
+            torch._check(query.shape[0] > 0)
+            torch._check(query.shape[1] > 0)
+            torch._check(query.shape[2] > 0)
+            torch._check(query.shape[3] > 0)
+            torch._check(key.shape[0] > 0)
+            torch._check(key.shape[1] > 0)
+            torch._check(key.shape[2] > 0)
+            torch._check(key.shape[3] > 0)
+            torch._check(value.shape[0] > 0)
+            torch._check(value.shape[1] > 0)
+            torch._check(value.shape[2] > 0)
+            torch._check(value.shape[3] > 0)
+
+        return (
+            torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask=attention_mask,
+                dropout_p=dropout,
+                scale=scaling,
+                is_causal=is_causal,
+                **sdpa_kwargs,
+            )
+            .transpose(1, 2)
+            .contiguous(),
+            None,
+        )
+    else:
+        # transformers<4.55
+        if is_causal is None and attention_mask is not None:
+            is_causal = False
+        if is_causal is not None:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+
+        # To avoid the following errors:
+        # is_causal=query.shape[2] > 1
+        # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not SymBool
+        # is_causal=torch.tensor(query.shape[2] > 1)
+        # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor
+        attn_output = torch.cond(
+            query.shape[2] > 1,  # distinction between prefill and decoding steps
+            lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                dropout_p=dropout,
+                scale=scaling,
+                is_causal=True,
+                **sdpa_kwargs,
+            ).contiguous(),
+            lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                dropout_p=dropout,
+                scale=scaling,
+                is_causal=False,
+                **sdpa_kwargs,
+            ).contiguous(),
+            [query, key, value],
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        return attn_output, None
+
+
+def patched_model_bart_eager_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    """[patch:transformers.models.bart.modeling_bart.eager_attention_forward]"""
+    return common_eager_attention_forward(
+        module,
+        query,
+        key,
+        value,
+        attention_mask=attention_mask,
+        scaling=scaling,
+        dropout=dropout,
+        head_mask=head_mask,
+        **kwargs,
+    )
+
+
+def patched_modeling_marian_eager_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    """[patch:transformers.models.marian.modeling_marian.eager_attention_forward]"""
+    return common_eager_attention_forward(
+        module,
+        query,
+        key,
+        value,
+        attention_mask=attention_mask,
+        scaling=scaling,
+        dropout=dropout,
+        head_mask=head_mask,
+        **kwargs,
+    )
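
A rough, self-contained illustration of the patched SDPA entry point above (a sketch, not part of the package; the bare ``torch.nn.Module()`` stand-in and the tensor sizes are made up, and the import path is inferred from the file list):

    import torch
    from onnx_diagnostic.torch_export_patches.patches._patch_transformers_attention import (
        patched_sdpa_attention_forward,
    )

    module = torch.nn.Module()    # stand-in: no num_key_value_groups, so no GQA handling
    q = torch.randn(2, 8, 4, 64)  # (batch, num_heads, seq_len, head_dim)
    k = torch.randn(2, 8, 4, 64)
    v = torch.randn(2, 8, 4, 64)
    out, weights = patched_sdpa_attention_forward(
        module, q, k, v, attention_mask=None, dropout=0.0, scaling=None, is_causal=False
    )
    print(out.shape)  # torch.Size([2, 4, 8, 64]), transposed back to (batch, seq_len, num_heads, head_dim)
    print(weights)    # None: the SDPA path never returns attention weights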
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_cache_utils.py
@@ -0,0 +1,50 @@
+from typing import Optional
+import inspect
+import transformers
+
+try:
+    from transformers.cache_utils import parse_processor_args  # noqa: F401
+
+    patch_parse_processor_args = True
+except ImportError:
+    patch_parse_processor_args = False
+
+
+if patch_parse_processor_args:
+
+    def _init_cache_inspect():
+        res = {}
+        for processor_class in transformers.cache_utils.PROCESSOR_CLASS_MAP.values():
+            try:
+                params = list(inspect.signature(processor_class.__init__).parameters)[2:]
+                res[processor_class.__init__] = params
+            except Exception:
+                res[processor_class.__init__] = None
+        return res
+
+    _cache_inspect = _init_cache_inspect()
+
+    def patched_parse_processor_args(
+        processor_class: Optional[type["CacheProcessor"]], kwargs: dict  # noqa: F821
+    ) -> tuple[dict, dict]:
+        """[patch:transformers.cache_utils.parse_processor_args]"""
+        # If not patched...
+        # Fails with transformers>=4.54 because function ``parse_processor_args``
+        # relies in inspect and the exporter is not very fond of that.
+        # torch._dynamo.exc.Unsupported: id() with unsupported args
+        # Explanation: Dynamo doesn't know how to trace id()
+        # call with args
+        # (GetAttrVariable(ConstantVariable(NoneType: None), __init__),)
+        # Hint: Supported args are Tensors, and functions/nn.Modules/user-defined
+        # objects from outside the compiled region.
+        # Hint: It may be possible to write Dynamo tracing rules for this code.
+        #
+        # The patch is caching the signature to avoid any call to inspect.
+        if processor_class is None:
+            return {}, kwargs
+        params = _cache_inspect[processor_class.__init__]
+        if params is None:
+            return {}, kwargs
+        processor_kwargs = {k: kwargs[k] for k in params if k in kwargs}
+        remaining_kwargs = {k: v for k, v in kwargs.items() if k not in processor_kwargs}
+        return processor_kwargs, remaining_kwargs
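
The idea of the patch is to run ``inspect`` once at import time and reuse the cached parameter list inside the traced code. A generic sketch of the same idea with a toy class (every name below is invented for illustration; it is not the transformers API):

    import inspect

    class ToyProcessor:
        def __init__(self, cache, window_size=4, dtype="float16"):
            self.window_size = window_size
            self.dtype = dtype

    # Computed once at import time, so torch.export/dynamo never traces inspect().
    _TOY_PARAMS = list(inspect.signature(ToyProcessor.__init__).parameters)[2:]  # skip self, cache

    def split_kwargs(kwargs: dict) -> tuple[dict, dict]:
        taken = {k: kwargs[k] for k in _TOY_PARAMS if k in kwargs}
        rest = {k: v for k, v in kwargs.items() if k not in taken}
        return taken, rest

    print(split_kwargs({"window_size": 8, "max_cache_len": 128}))
    # ({'window_size': 8}, {'max_cache_len': 128})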
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_causal_mask.py
@@ -0,0 +1,89 @@
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from .patch_helper import _has_transformers
+
+
+def _patch_make_causal_mask(
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+    sliding_window: Optional[int] = None,
+):
+    """Patched method."""
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat(
+            [
+                torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device),
+                mask,
+            ],
+            dim=-1,
+        )
+
+    if sliding_window is not None:
+        diagonal = past_key_values_length - sliding_window - 1
+
+        context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
+        # PATCHED: removed if is_torchdynamo_compiling(): mask = mask.clone()
+        # and used masked_fill instead of masked_fill_
+        # In this case, the current implementation of torch fails (17/12/2024).
+        # Try model Phi-3.5-Mini-Instruct.
+        mask = mask.masked_fill(context_mask, torch.finfo(dtype).min)
+
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+@dataclass
+class patched_AttentionMaskConverter:
+    """
+    Patches
+    ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
+    """
+
+    # This method was fixed in 4.51 at least.
+    _PATCHES_ = ["_make_causal_mask"] if not _has_transformers("4.48.3") else []
+    _PATCHED_CLASS_ = AttentionMaskConverter
+
+    @staticmethod
+    def _make_causal_mask(
+        *args,
+        **kwargs,
+        # input_ids_shape: torch.Size,
+        # dtype: torch.dtype,
+        # device: torch.device,
+        # past_key_values_length: int = 0,
+        # sliding_window: Optional[int] = None,
+    ):
+        """
+        Patched method.
+
+        This static method may be called with ``AttentionMaskConverter._make_causal_mask``
+        or ``self._make_causal_mask``. That changes this argument is receives.
+        That should not matter but...
+        The patch should be implemented in another way. static methods do not play well
+        with a simple replacement.
+        Fortunately, this patch does not seem to be needed anymore with transformers>=4.48.3.
+        """
+        if args:
+            index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
+            names = [
+                "input_ids_shape",
+                "dtype",
+                "device",
+                "past_key_values_length",
+                "sliding_window",
+            ]
+            for i, a in enumerate(args):
+                if i < index:
+                    continue
+                kwargs[names[i - index]] = a
+        return _patch_make_causal_mask(**kwargs)
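
A small sanity check of the patched mask builder (a sketch under assumptions, not taken from the package tests; the import path is inferred from the file list above). For a query of length 3 with 2 cached tokens, the returned mask has shape (batch, 1, tgt_len, tgt_len + past_key_values_length) and is 0 where attention is allowed:

    import torch
    from onnx_diagnostic.torch_export_patches.patches._patch_transformers_causal_mask import (
        _patch_make_causal_mask,
    )

    mask = _patch_make_causal_mask(
        torch.Size([1, 3]),
        dtype=torch.float32,
        device=torch.device("cpu"),
        past_key_values_length=2,
    )
    print(mask.shape)               # torch.Size([1, 1, 3, 5])
    print((mask == 0).int()[0, 0])
    # tensor([[1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 0],
    #         [1, 1, 1, 1, 1]], dtype=torch.int32)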