onnx-diagnostic 0.7.16__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (38)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +78 -22
  3. onnx_diagnostic/export/api.py +124 -0
  4. onnx_diagnostic/export/dynamic_shapes.py +2 -1
  5. onnx_diagnostic/export/shape_helper.py +47 -70
  6. onnx_diagnostic/ext_test_case.py +11 -0
  7. onnx_diagnostic/helpers/cache_helper.py +38 -7
  8. onnx_diagnostic/helpers/fake_tensor_helper.py +224 -104
  9. onnx_diagnostic/helpers/helper.py +27 -33
  10. onnx_diagnostic/helpers/log_helper.py +109 -5
  11. onnx_diagnostic/helpers/memory_peak.py +2 -0
  12. onnx_diagnostic/helpers/mini_onnx_builder.py +1 -1
  13. onnx_diagnostic/helpers/model_builder_helper.py +132 -2
  14. onnx_diagnostic/helpers/onnx_helper.py +1 -1
  15. onnx_diagnostic/helpers/ort_session.py +4 -0
  16. onnx_diagnostic/helpers/rt_helper.py +393 -43
  17. onnx_diagnostic/helpers/torch_helper.py +20 -1
  18. onnx_diagnostic/tasks/__init__.py +7 -0
  19. onnx_diagnostic/tasks/automatic_speech_recognition.py +2 -8
  20. onnx_diagnostic/tasks/feature_extraction.py +2 -8
  21. onnx_diagnostic/tasks/image_text_to_text.py +10 -8
  22. onnx_diagnostic/tasks/summarization.py +2 -8
  23. onnx_diagnostic/tasks/text2text_generation.py +3 -8
  24. onnx_diagnostic/tasks/text_generation.py +86 -65
  25. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +718 -438
  26. onnx_diagnostic/torch_export_patches/patch_details.py +340 -0
  27. onnx_diagnostic/torch_export_patches/patch_inputs.py +1 -1
  28. onnx_diagnostic/torch_export_patches/patch_module.py +9 -36
  29. onnx_diagnostic/torch_export_patches/patches/patch_torch.py +12 -6
  30. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +162 -24
  31. onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +140 -104
  32. onnx_diagnostic/torch_models/untrained/llm_phi2.py +1 -4
  33. onnx_diagnostic/torch_models/validate.py +626 -228
  34. {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/METADATA +1 -1
  35. {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/RECORD +38 -36
  36. {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/WHEEL +0 -0
  37. {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/licenses/LICENSE.txt +0 -0
  38. {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/top_level.txt +0 -0
onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

@@ -39,22 +39,57 @@ try:
 except ImportError:
     patch_DynamicLayer = False

-from ...ext_test_case import has_transformers
-from ...helpers.torch_helper import is_torchdynamo_exporting

-patch_is_initialized = pv.Version(transformers.__version__) > pv.Version("4.56.99")
+def _has_transformers(version: str) -> bool:
+    return pv.Version(transformers.__version__) >= pv.Version(version)
+
+
+def _is_torchdynamo_exporting() -> bool:
+    """
+    Tells if :epkg:`torch` is exporting a model.
+    Relies on ``torch.compiler.is_exporting()``.
+    """
+    import torch
+
+    if not hasattr(torch.compiler, "is_exporting"):
+        # torch.compiler.is_exporting requires torch>=2.7
+        return False
+
+    try:
+        return torch.compiler.is_exporting()
+    except Exception:
+        try:
+            import torch._dynamo as dynamo
+
+            return dynamo.is_exporting()  # type: ignore
+        except Exception:
+            return False
+
+
+patch_sdpa_is_causal = _has_transformers("4.99")
+patch_is_initialized = _has_transformers("4.56.99")


 if patch_masking_utils:
     # Introduced in 4.52
     from transformers.masking_utils import (
+        _ignore_causal_mask_sdpa,
+        and_masks,
         causal_mask_function,
         padding_mask_function,
-        and_masks,
-        _ignore_causal_mask_sdpa,
         prepare_padding_mask,
     )

+    try:
+        # transformers>=5.0
+        from transformers.masking_utils import (
+            _ignore_bidirectional_mask_sdpa,
+            bidirectional_mask_function,
+        )
+    except ImportError:
+        _ignore_bidirectional_mask_sdpa = None
+        bidirectional_mask_function = None
+
     def patched__vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
         """manual patch for function ``transformers.masking_utils._vmap_for_bhqkv``."""
         from ...helpers import string_type
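The hunk above replaces the imports of has_transformers and is_torchdynamo_exporting with local helpers and introduces a patch_sdpa_is_causal flag gated on the installed transformers version. A minimal sketch of what _is_torchdynamo_exporting detects, assuming torch>=2.7 so that torch.compiler.is_exporting() is available (the Branchy module below is illustrative only):

    import torch

    class Branchy(torch.nn.Module):
        def forward(self, x):
            # torch.compiler.is_exporting() is expected to be False in eager mode
            # and True while torch.export traces the module, so the branch taken
            # during tracing is the one recorded in the exported graph.
            if torch.compiler.is_exporting():
                return x + 1
            return x - 1

    print(Branchy()(torch.ones(2)))                       # eager path: x - 1
    ep = torch.export.export(Branchy(), (torch.ones(2),))
    print(ep.module()(torch.ones(2)))                     # traced path: x + 1
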
@@ -98,7 +133,7 @@ if patch_masking_utils:
             #     for a, dims in zip(args, udimensions)
             # ]
             max_shape = tuple(args[i].shape[0] for i in indices)
-            # if is_torchdynamo_exporting():
+            # if _is_torchdynamo_exporting():
             #     for a in args:
             #         # The exporter should export with a dimension > 1
             #         # to make sure it is dynamic.
@@ -121,6 +156,7 @@ if patch_masking_utils:
         """manual patch for function ``transformers.masking_utils.eager_mask``."""
         # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
         _ = kwargs.pop("allow_is_causal_skip", None)
+        _ = kwargs.pop("allow_is_bidirectional_skip", None)
         # PATCHED: this line called the patched version of sdpa_mask
         mask = patched_sdpa_mask_recent_torch(
             batch_size=batch_size,
@@ -130,6 +166,7 @@ if patch_masking_utils:
             mask_function=mask_function,
             attention_mask=attention_mask,
             allow_is_causal_skip=False,
+            allow_is_bidirectional_skip=False,
             allow_torch_fix=False,
             **kwargs,
         )
@@ -151,6 +188,7 @@ if patch_masking_utils:
         attention_mask: Optional[torch.Tensor] = None,
         local_size: Optional[int] = None,
         allow_is_causal_skip: bool = True,
+        allow_is_bidirectional_skip: bool = False,
         **kwargs,
     ) -> Optional[torch.Tensor]:
         """manual patch for function ``transformers.masking_utils.sdpa_mask_recent_torch``."""
@@ -160,6 +198,29 @@ if patch_masking_utils:
             padding_mask, q_length, kv_length, kv_offset, local_size
         ):
             return None
+        if (
+            allow_is_bidirectional_skip
+            and _ignore_bidirectional_mask_sdpa
+            and _ignore_bidirectional_mask_sdpa(padding_mask)
+        ):
+            return None
+
+        if mask_function is bidirectional_mask_function:
+            if padding_mask is not None:
+                # used for slicing without data-dependent slicing
+                mask_indices = (
+                    torch.arange(kv_length, device=cache_position.device) + kv_offset
+                )
+                return padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
+            return torch.ones(
+                batch_size,
+                1,
+                q_length,
+                kv_length,
+                dtype=torch.bool,
+                device=cache_position.device,
+            )
+
         kv_arange = torch.arange(kv_length, device=cache_position.device)
         kv_arange += kv_offset
         if padding_mask is not None:
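The new bidirectional-mask branch above avoids a data-dependent slice of the padding mask: it indexes the key/value axis with a precomputed arange and broadcasts the result over the query axis. A small self-contained illustration of that indexing pattern (the tensor sizes are arbitrary):

    import torch

    batch_size, q_length, kv_length, kv_offset = 2, 3, 5, 0
    padding_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=torch.bool)

    # index the kv axis with precomputed positions instead of slicing on data,
    # then broadcast over the query axis: (batch, 1, q_length, kv_length)
    mask_indices = torch.arange(kv_length) + kv_offset
    mask = padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
    assert mask.shape == (batch_size, 1, q_length, kv_length)
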
@@ -275,7 +336,7 @@ class patched_AttentionMaskConverter:
     """

     # This method was fixed in 4.51 at least.
-    _PATCHES_ = ["_make_causal_mask"] if not has_transformers("4.48.3") else []
+    _PATCHES_ = ["_make_causal_mask"] if not _has_transformers("4.48.3") else []
     _PATCHED_CLASS_ = AttentionMaskConverter

     @staticmethod
@@ -507,7 +568,7 @@ class patched_GenerationMixin:
         The current implementation does not rely on ``self`` and could be
         a class method. It is left as a standard method to be easily rewritten.
         """
-        if is_torchdynamo_exporting():
+        if _is_torchdynamo_exporting():
             return self._cache_dependant_input_preparation_exporting(
                 input_ids, inputs_embeds, cache_position
             )
@@ -1287,11 +1348,29 @@ def patched_sdpa_attention_forward(
     is_causal: Optional[bool] = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
-    """[patch:transformers.integrations.sdpa_attention.sdpa_attention_forward]"""
+    """
+    manual patch for function
+    ``transformers.integrations.sdpa_attention.sdpa_attention_forward``
+    """
     assert not kwargs.get("output_attentions", False), (
         "`sdpa` attention does not support `output_attentions=True`."
         " Please set your attention to `eager` if you want any of these features."
     )
+    torch._check(
+        query.shape[0] == key.shape[0] or query.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (1): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+    torch._check(
+        key.shape[0] == value.shape[0] or key.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (2): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+
     sdpa_kwargs = {}
     if hasattr(module, "num_key_value_groups"):
         if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
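The added torch._check calls turn the assumptions on the leading (batch) dimension into checks the exporter can track as guards on symbolic shapes; passing a lambda defers building the error message until a check actually fails. A minimal sketch of the same pattern outside the patch (the function name is illustrative):

    import torch

    def check_batch_broadcast(query: torch.Tensor, key: torch.Tensor) -> None:
        # torch._check records the condition as an assertion / shape guard;
        # the lambda only runs if the condition turns out to be False.
        torch._check(
            query.shape[0] == key.shape[0] or query.shape[0] == 1,
            lambda: f"broadcast issue: query {tuple(query.shape)} vs key {tuple(key.shape)}",
        )

    check_batch_broadcast(torch.rand(1, 2, 3, 4), torch.rand(5, 2, 3, 4))  # passes, query batch is 1
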
@@ -1307,24 +1386,83 @@ def patched_sdpa_attention_forward(
     if attention_mask is not None and attention_mask.ndim == 4:
         attention_mask = attention_mask[:, :, :, : key.shape[-2]]

-    is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
-    # PATCHED: remove the test query.shape[2] > 1
-    # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
-    is_causal = attention_mask is None and is_causal
-
     torch._check(
         attention_mask is None or attention_mask.shape[3] == key.shape[2],
-        "Attention mask shape incompatible with key shape.",
+        lambda: "Attention mask shape incompatible with key shape.",
     )
-    attn_output = torch.nn.functional.scaled_dot_product_attention(
-        query,
-        key,
-        value,
-        attn_mask=attention_mask,
-        dropout_p=dropout,
-        scale=scaling,
-        is_causal=is_causal,
-        **sdpa_kwargs,
+
+    if patch_sdpa_is_causal:
+        # transformers>=4.55
+        is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
+
+        # PATCHED: remove the test query.shape[2] > 1
+        # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+        # and we split the test to keep the minimum in torch.cond
+        is_causal = attention_mask is None and is_causal
+
+        if not is_causal:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+    else:
+        # transformers<4.55
+        if is_causal is None and attention_mask is not None:
+            is_causal = False
+        if is_causal is not None:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+
+    # To avoid the following errors:
+    # is_causal=query.shape[2] > 1
+    # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not SymBool
+    # is_causal=torch.tensor(query.shape[2] > 1)
+    # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor
+    attn_output = torch.cond(
+        query.shape[2] > 1,  # distinction between prefill and decoding steps
+        lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            dropout_p=dropout,
+            scale=scaling,
+            is_causal=True,
+            **sdpa_kwargs,
+        ).contiguous(),
+        lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            dropout_p=dropout,
+            scale=scaling,
+            is_causal=False,
+            **sdpa_kwargs,
+        ).contiguous(),
+        [query, key, value],
     )
     attn_output = attn_output.transpose(1, 2).contiguous()
     return attn_output, None
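The torch.cond call above replaces a data-dependent Python bool: under export, query.shape[2] > 1 is a SymBool, which scaled_dot_product_attention's is_causal argument (a plain bool) cannot accept, so both the prefill and the single-token decode branches are kept in the graph. A self-contained sketch of that pattern, assuming a torch version with torch.cond and torch.export (module and dimension names are illustrative):

    import torch
    from torch.export import Dim, export

    class PrefillOrDecode(torch.nn.Module):
        def forward(self, q, k, v):
            # A plain `if q.shape[2] > 1:` would specialize on the example input;
            # torch.cond records both branches in the exported graph instead.
            return torch.cond(
                q.shape[2] > 1,  # prefill (several query tokens) vs. decode (one token)
                lambda q, k, v: torch.nn.functional.scaled_dot_product_attention(
                    q, k, v, is_causal=True
                ),
                lambda q, k, v: torch.nn.functional.scaled_dot_product_attention(
                    q, k, v, is_causal=False
                ),
                [q, k, v],
            )

    q, k, v = torch.rand(1, 2, 4, 8), torch.rand(1, 2, 6, 8), torch.rand(1, 2, 6, 8)
    q_len, kv_len = Dim("q_len"), Dim("kv_len")
    ep = export(PrefillOrDecode(), (q, k, v), dynamic_shapes=({2: q_len}, {2: kv_len}, {2: kv_len}))
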
onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py

@@ -1,13 +1,20 @@
-from typing import Any, List, Set, Tuple
+import itertools
+from typing import Any, Callable, List, Set, Tuple
 import torch
 from transformers.cache_utils import (
+    Cache,
     DynamicCache,
     EncoderDecoderCache,
     HybridCache,
-    SlidingWindowCache,
     StaticCache,
 )

+try:
+    from transformers.cache_utils import SlidingWindowCache
+except ImportError:
+    SlidingWindowCache = None
+
+
 try:
     from transformers.models.mamba.modeling_mamba import MambaCache
 except ImportError:
@@ -30,66 +37,36 @@ WRONG_REGISTRATIONS = {
 }


-############
-# MambaCache
-############
-
-
-def flatten_mamba_cache(
-    mamba_cache: MambaCache,
-) -> Tuple[List[Any], torch.utils._pytree.Context]:
-    """Serializes a :class:`transformers.cache_utils.MambaCache` with python objects."""
-    assert isinstance(mamba_cache.conv_states, list) and isinstance(
-        mamba_cache.ssm_states, list
-    ), (
-        f"Unexpected types for conv_states and ssm_states {type(mamba_cache.conv_states)}, "
-        f"{type(mamba_cache.ssm_states)}"
+def _flatten_key_value_cache(cache: Cache) -> Tuple[List[Any], torch.utils._pytree.Context]:
+    ca = CacheKeyValue(cache)
+    flat = list(itertools.chain.from_iterable(zip(ca.key_cache, ca.value_cache)))
+    keys = list(
+        itertools.chain.from_iterable(
+            (f"key_{i}", f"value_{i}") for i in range(len(ca.key_cache))
+        )
     )
-    flat = [
-        ("conv_states", mamba_cache.conv_states),
-        ("ssm_states", mamba_cache.ssm_states),
-    ]
-    return [f[1] for f in flat], [f[0] for f in flat]
+    return flat, keys


-def unflatten_mamba_cache(
-    values: List[Any], context: torch.utils._pytree.Context, output_type=None
-) -> MambaCache:
-    """Restores a :class:`transformers.cache_utils.MambaCache` from python objects."""
-    conv_states, ssm_states = values
-
-    class _config:
-        def __init__(self):
-            if isinstance(conv_states, list):
-                self.intermediate_size = conv_states[0].shape[1]
-                self.state_size = ssm_states[0].shape[2]
-                self.conv_kernel = conv_states[0].shape[2]
-                self.num_hidden_layers = len(conv_states)
-            else:
-                self.intermediate_size = conv_states.shape[2]
-                self.state_size = ssm_states.shape[3]
-                self.conv_kernel = conv_states.shape[3]
-                self.num_hidden_layers = conv_states.shape[0]
-
-    cache = MambaCache(
-        _config(),
-        max_batch_size=1,
-        dtype=values[-1][0].dtype,
-        device="cpu" if values[-1][0].get_device() < 0 else "cuda",
-    )
-    values = dict(zip(context, values))
-    for k, v in values.items():
-        setattr(cache, k, v)
-    return cache
+def _flatten_with_keys_cache(
+    cache: Cache,
+) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
+    values, context = _flatten_key_value_cache(cache)
+    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context


-def flatten_with_keys_mamba_cache(cache: MambaCache) -> Tuple[
-    List[Tuple[torch.utils._pytree.KeyEntry, Any]],
-    torch.utils._pytree.Context,
-]:
-    """Serializes a :class:`transformers.cache_utils.MambaCache` with python objects."""
-    values, context = flatten_mamba_cache(cache)
-    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
+def _unflatten_cache(
+    make_cache: Callable,
+    values: List[Any],
+    context: torch.utils._pytree.Context,
+    output_type=None,
+) -> DynamicCache:
+    """Restores a :class:`transformers.cache_utils.DynamicCache` from python objects."""
+    res = make_cache(list(zip(values[::2], values[1::2])))
+    assert output_type is None or isinstance(
+        res, output_type
+    ), f"Type mismatch between {output_type} (expected) and {type(res)}"
+    return res


 ##############
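The new helpers flatten a cache into one pytree leaf per key/value tensor, interleaved layer by layer, instead of two parallel lists, and _unflatten_cache rebuilds the per-layer (key, value) pairs with zip(values[::2], values[1::2]). A minimal sketch of that interleaving, with plain strings standing in for the tensors:

    import itertools

    # stand-ins for the per-layer key/value tensors of a 2-layer cache
    key_cache = ["k0", "k1"]
    value_cache = ["v0", "v1"]

    # flatten: interleave keys and values layer by layer
    flat = list(itertools.chain.from_iterable(zip(key_cache, value_cache)))
    keys = list(
        itertools.chain.from_iterable((f"key_{i}", f"value_{i}") for i in range(len(key_cache)))
    )
    assert flat == ["k0", "v0", "k1", "v1"]
    assert keys == ["key_0", "value_0", "key_1", "value_1"]

    # unflatten: pair the leaves back per layer, as _unflatten_cache does
    pairs = list(zip(flat[::2], flat[1::2]))
    assert pairs == [("k0", "v0"), ("k1", "v1")]
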
@@ -101,24 +78,21 @@ def flatten_dynamic_cache(
     dynamic_cache: DynamicCache,
 ) -> Tuple[List[Any], torch.utils._pytree.Context]:
     """Serializes a :class:`transformers.cache_utils.DynamicCache` with python objects."""
-    ca = CacheKeyValue(dynamic_cache)
-    flat = [("key_cache", ca.key_cache), ("value_cache", ca.value_cache)]
-    return [f[1] for f in flat], [f[0] for f in flat]
+    return _flatten_key_value_cache(dynamic_cache)


 def flatten_with_keys_dynamic_cache(
     dynamic_cache: DynamicCache,
 ) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
     """Serializes a :class:`transformers.cache_utils.DynamicCache` with python objects."""
-    values, context = flatten_dynamic_cache(dynamic_cache)
-    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
+    return _flatten_with_keys_cache(dynamic_cache)


 def unflatten_dynamic_cache(
     values: List[Any], context: torch.utils._pytree.Context, output_type=None
 ) -> DynamicCache:
     """Restores a :class:`transformers.cache_utils.DynamicCache` from python objects."""
-    return make_dynamic_cache(list(zip(values[0], values[1])))
+    return _unflatten_cache(make_dynamic_cache, values, context, output_type=output_type)


 #############
@@ -130,24 +104,21 @@ def flatten_hybrid_cache(
     cache: HybridCache,
 ) -> Tuple[List[Any], torch.utils._pytree.Context]:
     """Serializes a :class:`transformers.cache_utils.HybridCache` with python objects."""
-    ca = CacheKeyValue(cache)
-    flat = [("key_cache", ca.key_cache), ("value_cache", ca.value_cache)]
-    return [f[1] for f in flat], [f[0] for f in flat]
+    return _flatten_key_value_cache(cache)


 def flatten_with_keys_hybrid_cache(
     cache: HybridCache,
 ) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
     """Serializes a :class:`transformers.cache_utils.HybridCache` with python objects."""
-    values, context = flatten_hybrid_cache(cache)
-    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
+    return _flatten_with_keys_cache(cache)


 def unflatten_hybrid_cache(
     values: List[Any], context: torch.utils._pytree.Context, output_type=None
 ) -> HybridCache:
     """Restores a :class:`transformers.cache_utils.HybridCache` from python objects."""
-    return make_hybrid_cache(list(zip(values[0], values[1])))
+    return _unflatten_cache(make_hybrid_cache, values, context, output_type=output_type)


 #############
@@ -163,26 +134,27 @@ def flatten_static_cache(
     assert not ca.key_cache or cache.max_cache_len == ca.key_cache[0].shape[2], (
         f"Serialization doet not work when "
         f"cache.max_cache_len={cache.max_cache_len} != "
-        f"cache.key_cache[0].shape[2]={ca.keu_cache[0].shape[2]}"
+        f"cache.key_cache[0].shape[2]={ca.key_cache[0].shape[2]}"
     )
-    flat = [("key_cache", ca.key_cache), ("value_cache", ca.value_cache)]
-    return [f[1] for f in flat], [f[0] for f in flat]
+    return _flatten_key_value_cache(cache)


 def flatten_with_keys_static_cache(
     cache: StaticCache,
 ) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
     """Serializes a :class:`transformers.cache_utils.StaticCache` with python objects."""
-    values, context = flatten_static_cache(cache)
-    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
+    return _flatten_with_keys_cache(cache)


 def unflatten_static_cache(
     values: List[Any], context: torch.utils._pytree.Context, output_type=None
 ) -> StaticCache:
     """Restores a :class:`transformers.cache_utils.StaticCache` from python objects."""
-    return make_static_cache(
-        list(zip(values[0], values[1])), max_cache_len=values[0][0].shape[2]
+    return _unflatten_cache(
+        lambda *args: make_static_cache(*args, max_cache_len=values[0].shape[2]),
+        values,
+        context,
+        output_type=output_type,
     )

@@ -191,34 +163,36 @@ def unflatten_static_cache(
 ####################


-def flatten_sliding_window_cache(
-    cache: SlidingWindowCache,
-) -> Tuple[List[Any], torch.utils._pytree.Context]:
-    """
-    Serializes a :class:`transformers.cache_utils.SlidingWindowCache`
-    with python objects.
-    """
-    ca = CacheKeyValue(cache)
-    flat = [("key_cache", ca.key_cache), ("value_cache", ca.value_cache)]
-    return [f[1] for f in flat], [f[0] for f in flat]
-
-
-def flatten_with_keys_sliding_window_cache(
-    cache: SlidingWindowCache,
-) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
-    """
-    Serializes a :class:`transformers.cache_utils.SlidingWindowCache`
-    with python objects.
-    """
-    values, context = flatten_sliding_window_cache(cache)
-    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
-
-
-def unflatten_sliding_window_cache(
-    values: List[Any], context: torch.utils._pytree.Context, output_type=None
-) -> SlidingWindowCache:
-    """Restores a :class:`transformers.cache_utils.SlidingWindowCache` from python objects."""
-    return make_sliding_window_cache(list(zip(values[0], values[1])))
+if SlidingWindowCache:
+
+    def flatten_sliding_window_cache(
+        cache: SlidingWindowCache,
+    ) -> Tuple[List[Any], torch.utils._pytree.Context]:
+        """
+        Serializes a :class:`transformers.cache_utils.SlidingWindowCache`
+        with python objects.
+        """
+        return _flatten_key_value_cache(cache)
+
+    def flatten_with_keys_sliding_window_cache(
+        cache: SlidingWindowCache,
+    ) -> Tuple[List[Tuple[torch.utils._pytree.KeyEntry, Any]], torch.utils._pytree.Context]:
+        """
+        Serializes a :class:`transformers.cache_utils.SlidingWindowCache`
+        with python objects.
+        """
+        return _flatten_with_keys_cache(cache)
+
+    def unflatten_sliding_window_cache(
+        values: List[Any], context: torch.utils._pytree.Context, output_type=None
+    ) -> SlidingWindowCache:
+        """
+        Restores a :class:`transformers.cache_utils.SlidingWindowCache`
+        from python objects.
+        """
+        return _unflatten_cache(
+            make_sliding_window_cache, values, context, output_type=output_type
+        )


 #####################
@@ -265,6 +239,68 @@ def unflatten_encoder_decoder_cache(
     )


+############
+# MambaCache
+############
+
+
+def flatten_mamba_cache(
+    mamba_cache: MambaCache,
+) -> Tuple[List[Any], torch.utils._pytree.Context]:
+    """Serializes a :class:`transformers.cache_utils.MambaCache` with python objects."""
+    assert isinstance(mamba_cache.conv_states, list) and isinstance(
+        mamba_cache.ssm_states, list
+    ), (
+        f"Unexpected types for conv_states and ssm_states {type(mamba_cache.conv_states)}, "
+        f"{type(mamba_cache.ssm_states)}"
+    )
+    flat = [
+        ("conv_states", mamba_cache.conv_states),
+        ("ssm_states", mamba_cache.ssm_states),
+    ]
+    return [f[1] for f in flat], [f[0] for f in flat]
+
+
+def unflatten_mamba_cache(
+    values: List[Any], context: torch.utils._pytree.Context, output_type=None
+) -> MambaCache:
+    """Restores a :class:`transformers.cache_utils.MambaCache` from python objects."""
+    conv_states, ssm_states = values
+
+    class _config:
+        def __init__(self):
+            if isinstance(conv_states, list):
+                self.intermediate_size = conv_states[0].shape[1]
+                self.state_size = ssm_states[0].shape[2]
+                self.conv_kernel = conv_states[0].shape[2]
+                self.num_hidden_layers = len(conv_states)
+            else:
+                self.intermediate_size = conv_states.shape[2]
+                self.state_size = ssm_states.shape[3]
+                self.conv_kernel = conv_states.shape[3]
+                self.num_hidden_layers = conv_states.shape[0]
+
+    cache = MambaCache(
+        _config(),
+        max_batch_size=1,
+        dtype=values[-1][0].dtype,
+        device="cpu" if values[-1][0].get_device() < 0 else "cuda",
+    )
+    values = dict(zip(context, values))
+    for k, v in values.items():
+        setattr(cache, k, v)
+    return cache
+
+
+def flatten_with_keys_mamba_cache(cache: MambaCache) -> Tuple[
+    List[Tuple[torch.utils._pytree.KeyEntry, Any]],
+    torch.utils._pytree.Context,
+]:
+    """Serializes a :class:`transformers.cache_utils.MambaCache` with python objects."""
+    values, context = flatten_mamba_cache(cache)
+    return [(torch.utils._pytree.MappingKey(k), v) for k, v in zip(context, values)], context
+
+
 #############
 # dataclasses
 #############
onnx_diagnostic/torch_models/untrained/llm_phi2.py

@@ -84,10 +84,7 @@ def get_phi2(
             0: batch,
             1: torch.export.Dim.DYNAMIC,  # cache_length + seq_length
         },
-        "past_key_values": [
-            [{0: batch, 2: cache_length} for _ in range(n_layers)],
-            [{0: batch, 2: cache_length} for _ in range(n_layers)],
-        ],
+        "past_key_values": [{0: batch, 2: cache_length} for _ in range(n_layers * 2)],
     }
     inputs = dict(
         input_ids=torch.randint(0, max_token_id, (batch_size, sequence_length2)).to(
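With the cache now flattened to 2 * n_layers leaves (key_0, value_0, key_1, value_1, ...), the dynamic-shapes specification for past_key_values becomes a flat list with one entry per leaf, which is what the simplified expression above produces. A short sketch of what the spec expands to for a hypothetical 2-layer model, reusing the batch and cache_length names from the hunk:

    import torch

    n_layers = 2
    batch = torch.export.Dim("batch")
    cache_length = torch.export.Dim("cache_length")

    # one spec per flattened cache leaf, in order: key_0, value_0, key_1, value_1
    past_key_values_spec = [{0: batch, 2: cache_length} for _ in range(n_layers * 2)]
    assert len(past_key_values_spec) == 4
    # each entry marks dim 0 (batch) and dim 2 (past sequence length) as dynamic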