onnx-diagnostic 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +387 -12
- onnx_diagnostic/export/api.py +118 -5
- onnx_diagnostic/export/control_flow.py +214 -0
- onnx_diagnostic/export/control_flow_onnx.py +528 -0
- onnx_diagnostic/export/control_flow_research.py +135 -0
- onnx_diagnostic/export/onnx_plug.py +396 -0
- onnx_diagnostic/ext_test_case.py +118 -25
- onnx_diagnostic/helpers/cache_helper.py +218 -204
- onnx_diagnostic/helpers/dot_helper.py +210 -0
- onnx_diagnostic/helpers/helper.py +92 -26
- onnx_diagnostic/helpers/log_helper.py +26 -4
- onnx_diagnostic/helpers/mini_onnx_builder.py +57 -3
- onnx_diagnostic/helpers/model_builder_helper.py +27 -0
- onnx_diagnostic/helpers/onnx_helper.py +115 -16
- onnx_diagnostic/helpers/ort_session.py +37 -11
- onnx_diagnostic/helpers/rt_helper.py +547 -0
- onnx_diagnostic/helpers/torch_fx_graph_helper.py +164 -0
- onnx_diagnostic/helpers/torch_helper.py +108 -6
- onnx_diagnostic/reference/ort_evaluator.py +233 -28
- onnx_diagnostic/tasks/feature_extraction.py +15 -14
- onnx_diagnostic/tasks/image_text_to_text.py +5 -1
- onnx_diagnostic/tasks/summarization.py +72 -137
- onnx_diagnostic/torch_export_patches/eval/model_cases.py +28 -0
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +1 -1
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +11 -7
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py +235 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_cache_utils.py +50 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_causal_mask.py +89 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_dynamic_cache.py +177 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_gemma3.py +54 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_generation_mixin.py +486 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_idefics.py +156 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py +173 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2.py +99 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py +680 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen3.py +106 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_rotary_embedding.py +412 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_sam_mask_decoder.py +132 -0
- onnx_diagnostic/torch_export_patches/patches/patch_helper.py +28 -0
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +65 -2107
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +53 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +15 -2
- onnx_diagnostic/torch_models/validate.py +50 -1
- onnx_diagnostic/torch_onnx/sbs.py +963 -312
- onnx_diagnostic/torch_onnx/sbs_dataclasses.py +491 -0
- {onnx_diagnostic-0.8.1.dist-info → onnx_diagnostic-0.8.3.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.8.1.dist-info → onnx_diagnostic-0.8.3.dist-info}/RECORD +51 -30
- {onnx_diagnostic-0.8.1.dist-info → onnx_diagnostic-0.8.3.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.8.1.dist-info → onnx_diagnostic-0.8.3.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.8.1.dist-info → onnx_diagnostic-0.8.3.dist-info}/top_level.txt +0 -0
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_causal_mask.py
@@ -0,0 +1,89 @@
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from .patch_helper import _has_transformers
+
+
+def _patch_make_causal_mask(
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+    sliding_window: Optional[int] = None,
+):
+    """Patched method."""
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat(
+            [
+                torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device),
+                mask,
+            ],
+            dim=-1,
+        )
+
+    if sliding_window is not None:
+        diagonal = past_key_values_length - sliding_window - 1
+
+        context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
+        # PATCHED: removed if is_torchdynamo_compiling(): mask = mask.clone()
+        # and used masked_fill instead of masked_fill_
+        # In this case, the current implementation of torch fails (17/12/2024).
+        # Try model Phi-3.5-Mini-Instruct.
+        mask = mask.masked_fill(context_mask, torch.finfo(dtype).min)
+
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+@dataclass
+class patched_AttentionMaskConverter:
+    """
+    Patches
+    ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
+    """
+
+    # This method was fixed in 4.51 at least.
+    _PATCHES_ = ["_make_causal_mask"] if not _has_transformers("4.48.3") else []
+    _PATCHED_CLASS_ = AttentionMaskConverter
+
+    @staticmethod
+    def _make_causal_mask(
+        *args,
+        **kwargs,
+        # input_ids_shape: torch.Size,
+        # dtype: torch.dtype,
+        # device: torch.device,
+        # past_key_values_length: int = 0,
+        # sliding_window: Optional[int] = None,
+    ):
+        """
+        Patched method.
+
+        This static method may be called with ``AttentionMaskConverter._make_causal_mask``
+        or ``self._make_causal_mask``. That changes the arguments it receives.
+        That should not matter but...
+        The patch should be implemented in another way. Static methods do not play well
+        with a simple replacement.
+        Fortunately, this patch does not seem to be needed anymore with transformers>=4.48.3.
+        """
+        if args:
+            index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
+            names = [
+                "input_ids_shape",
+                "dtype",
+                "device",
+                "past_key_values_length",
+                "sliding_window",
+            ]
+            for i, a in enumerate(args):
+                if i < index:
+                    continue
+                kwargs[names[i - index]] = a
+        return _patch_make_causal_mask(**kwargs)
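To make the patched mask easier to picture, here is a minimal usage sketch (not part of the package). It assumes `transformers` is installed and that the module is importable under the path added by this diff; the shapes and comments only restate what the code above computes.

```python
# Hypothetical usage sketch; the import path mirrors the new file in this diff.
import torch
from onnx_diagnostic.torch_export_patches.patches._patch_transformers_causal_mask import (
    _patch_make_causal_mask,
)

mask = _patch_make_causal_mask(
    torch.Size([1, 4]),          # (batch_size, target_length)
    dtype=torch.float32,
    device=torch.device("cpu"),
    past_key_values_length=2,    # two cached tokens prepended as visible columns
    sliding_window=3,
)
print(mask.shape)  # torch.Size([1, 1, 4, 6]) == (bsz, 1, tgt_len, tgt_len + past)
# Allowed positions hold 0, masked positions hold torch.finfo(torch.float32).min;
# the sliding-window positions are re-masked with the out-of-place masked_fill,
# the change noted in the PATCHED comment above.
```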
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_dynamic_cache.py
@@ -0,0 +1,177 @@
+from typing import List, Optional, Tuple
+import packaging.version as pv
+import torch
+import transformers
+from .patch_helper import _has_transformers
+
+patch_is_initialized = _has_transformers("4.56.99")
+patch_DynamicCache = pv.Version(transformers.__version__) < pv.Version("4.51")
+
+try:
+    # transformers>= 4.55.1
+    from transformers.cache_utils import DynamicLayer
+
+    patch_DynamicLayer = hasattr(DynamicLayer, "lazy_initialization")
+except ImportError:
+    patch_DynamicLayer = False
+
+
+if patch_DynamicLayer:
+
+    class patched_DynamicLayer:
+        _PATCHES_ = ["lazy_initialization"]
+        _PATCHED_CLASS_ = DynamicLayer
+
+        def lazy_initialization(self, key_states: torch.Tensor):
+            self.dtype, self.device = key_states.dtype, key_states.device
+            new_shape = list(key_states.shape)
+            new_shape[-2] = 0
+            # PATCHED: used a tensor with an empty shape and not an empty list to initialize
+            self.keys = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+            self.values = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+            if patch_is_initialized:
+                self.is_initialized = True
+
+
+if patch_DynamicCache:
+    from typing import Any, Dict
+    from transformers.cache_utils import DynamicCache
+
+    class patched_DynamicCache:
+        """
+        Applies modifications implemented in PR
+        `transformers/#36652 <https://github.com/huggingface/transformers/pull/36652>`_.
+        """
+
+        _PATCHES_ = ["reorder_cache", "update", "crop", "from_batch_splits", "get_seq_length"]
+        _PATCHED_CLASS_ = transformers.cache_utils.DynamicCache
+
+        def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+            """Returns the sequence length of the cached states.
+            A layer index can be optionally passed."""
+            # TODO: deprecate this function in favor of `cache_position`
+            is_empty_layer = (
+                len(self.key_cache) == 0  # no cache in any layer
+                or len(self.key_cache)
+                <= layer_idx  # skipped `layer_idx` and hasn't run a layer with cache after it
+                or self.key_cache[layer_idx].numel() == 0  # the layer has no cache
+            )
+            layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
+            return layer_seq_length
+
+        def reorder_cache(self, beam_idx: torch.LongTensor):
+            """Reorders the cache for beam search, given the selected beam indices."""
+            for layer_idx in range(len(self.key_cache)):
+                if self.key_cache[layer_idx].numel():
+                    device = self.key_cache[layer_idx].device
+                    self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(
+                        0, beam_idx.to(device)
+                    )
+                if self.value_cache[layer_idx].numel():
+                    device = self.value_cache[layer_idx].device
+                    self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(
+                        0, beam_idx.to(device)
+                    )
+
+        def update(
+            self,
+            key_states: torch.Tensor,
+            value_states: torch.Tensor,
+            layer_idx: int,
+            cache_kwargs: Optional[Dict[str, Any]] = None,
+        ) -> Tuple[torch.Tensor, torch.Tensor]:
+            """
+            Updates the cache with the new `key_states`
+            and `value_states` for the layer `layer_idx`.
+            Parameters:
+                key_states (`torch.Tensor`):
+                    The new key states to cache.
+                value_states (`torch.Tensor`):
+                    The new value states to cache.
+                layer_idx (`int`):
+                    The index of the layer to cache the states for.
+                cache_kwargs (`Dict[str, Any]`, `optional`):
+                    Additional arguments for the cache subclass.
+                    No additional arguments are used in `DynamicCache`.
+            Return:
+                A tuple containing the updated key and value states.
+            """
+            # Update the number of seen tokens
+            if layer_idx == 0:
+                if hasattr(self, "_seen_tokens"):
+                    self._seen_tokens += key_states.shape[-2]
+
+            # Update the cache
+            if key_states is not None:
+                if len(self.key_cache) <= layer_idx:
+                    # There may be skipped layers, fill them with empty lists
+                    for _ in range(len(self.key_cache), layer_idx):
+                        self.key_cache.append(torch.tensor([], dtype=key_states.dtype))
+                        self.value_cache.append(torch.tensor([], dtype=key_states.dtype))
+                    self.key_cache.append(key_states)
+                    self.value_cache.append(value_states)
+                elif not self.key_cache[
+                    layer_idx
+                ].numel():  # prefers not t.numel() to len(t) == 0 to export the model
+                    # fills previously skipped layers; checking for tensor causes errors
+                    self.key_cache[layer_idx] = key_states
+                    self.value_cache[layer_idx] = value_states
+                else:
+                    torch._check(
+                        len(self.key_cache[layer_idx].shape) == len(key_states.shape),
+                        lambda: (
+                            f"Rank mismatch len(self.key_cache[layer_idx].shape)="
+                            f"{len(self.key_cache[layer_idx].shape)}, "
+                            f"len(key_states.shape)={len(key_states.shape)}"
+                        ),
+                    )
+                    self.key_cache[layer_idx] = torch.cat(
+                        [self.key_cache[layer_idx], key_states], dim=-2
+                    )
+                    self.value_cache[layer_idx] = torch.cat(
+                        [self.value_cache[layer_idx], value_states], dim=-2
+                    )
+            return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+        def crop(self, max_length: int):
+            """Crop the past key values up to a new `max_length`
+            in terms of tokens. `max_length` can also be
+            negative to remove `max_length` tokens.
+            This is used in assisted decoding and contrastive search.
+            """
+            # In case it is negative
+            if max_length < 0:
+                max_length = self.get_seq_length() - abs(max_length)
+
+            if self.get_seq_length() <= max_length:
+                return
+
+            if hasattr(self, "_seen_tokens"):
+                self._seen_tokens = max_length
+            for idx in range(len(self.key_cache)):
+                if self.key_cache[idx].numel():
+                    self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
+                    self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
+
+        @classmethod
+        def from_batch_splits(cls, splits: List[DynamicCache]) -> DynamicCache:
+            """This is the opposite of the above `batch_split()` method.
+            This will be used by `stack_model_outputs` in
+            `generation.utils`"""
+            cache = cls()
+            for idx in range(len(splits[0])):
+                key_cache = [
+                    current.key_cache[idx]
+                    for current in splits
+                    if current.key_cache[idx].numel()
+                ]
+                value_cache = [
+                    current.value_cache[idx]
+                    for current in splits
+                    if current.value_cache[idx].numel()
+                ]
+                if key_cache != []:
+                    layer_keys = torch.cat(key_cache, dim=0)
+                    layer_values = torch.cat(value_cache, dim=0)
+                    cache.update(layer_keys, layer_values, idx)
+            return cache
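The core of the patched `update` is its concatenation contract along the sequence axis and the use of `numel()` to detect empty layers. The sketch below is not part of the diff; it reproduces that contract with plain tensors so it can be read without a specific transformers version installed.

```python
# Self-contained sketch of the update() contract used by the patched DynamicCache:
# key/value states are shaped (batch, num_heads, seq_len, head_dim), an empty layer
# is a tensor with numel() == 0, and new states are concatenated along dim=-2.
import torch

key_cache = torch.empty((1, 2, 0, 4))   # empty layer: numel() == 0, preferred over len(t) == 0
new_keys = torch.randn(1, 2, 3, 4)      # 3 new tokens

if not key_cache.numel():
    key_cache = new_keys                # first update just stores the states
more_keys = torch.randn(1, 2, 2, 4)     # 2 more tokens on the next step
key_cache = torch.cat([key_cache, more_keys], dim=-2)

print(key_cache.shape[-2])              # 5, i.e. what get_seq_length() would report
```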
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_gemma3.py
@@ -0,0 +1,54 @@
+import torch
+import transformers
+
+try:
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3Model  # noqa: F401
+
+    patch_gemma3 = True
+except ImportError:
+    patch_gemma3 = False
+
+
+if patch_gemma3:
+
+    class patched_Gemma3Model(torch.nn.Module):
+        _PATCHES_ = ["get_placeholder_mask"]
+        _PATCHED_CLASS_ = transformers.models.gemma3.modeling_gemma3.Gemma3Model
+        _PATCHED_PR_ = "https://github.com/huggingface/transformers/pull/41319"
+
+        def get_placeholder_mask(
+            self,
+            input_ids: torch.LongTensor,
+            inputs_embeds: torch.FloatTensor,
+            image_features: torch.FloatTensor,
+        ):
+            if input_ids is None:
+                special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                    torch.tensor(
+                        self.config.image_token_id,
+                        dtype=torch.long,
+                        device=inputs_embeds.device,
+                    )
+                )
+                special_image_mask = special_image_mask.all(-1)
+            else:
+                special_image_mask = input_ids == self.config.image_token_id
+
+            n_image_tokens = special_image_mask.sum()
+            special_image_mask = (
+                special_image_mask.unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            n_image_features = image_features.shape[0] * image_features.shape[1]
+            # PATCHED: torch._check
+            # if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            #     raise ValueError( ... )
+            torch._check(
+                inputs_embeds[special_image_mask].numel() == image_features.numel(),
+                lambda: (
+                    f"Image features and image tokens do not match: tokens: "
+                    f"{n_image_tokens}, features {n_image_features}"
+                ),
+            )
+            return special_image_mask
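The gemma3 patch swaps a data-dependent `raise ValueError` for `torch._check`, which records the condition as an assertion instead of a Python branch. A minimal sketch of that pattern follows; the function name and values are illustrative, not taken from the diff.

```python
# Illustrative sketch of the raise-vs-torch._check pattern used in the patch above.
import torch


def check_image_tokens(mask_numel: int, feature_numel: int) -> None:
    # torch._check behaves like an assert in eager mode, and torch.export can record
    # the condition, whereas `if ...: raise ValueError(...)` is a data-dependent branch.
    torch._check(
        mask_numel == feature_numel,
        lambda: f"Image features and image tokens do not match: {mask_numel} != {feature_numel}",
    )


check_image_tokens(12, 12)   # passes silently
# check_image_tokens(12, 8)  # would fail at runtime, like the original ValueError
```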