onnx-diagnostic 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two versions of the package as they were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +87 -77
- onnx_diagnostic/doc.py +22 -0
- onnx_diagnostic/ext_test_case.py +1 -1
- onnx_diagnostic/helpers/cache_helper.py +59 -0
- onnx_diagnostic/helpers/config_helper.py +8 -4
- onnx_diagnostic/helpers/helper.py +30 -3
- onnx_diagnostic/helpers/log_helper.py +585 -0
- onnx_diagnostic/helpers/mini_onnx_builder.py +4 -1
- onnx_diagnostic/helpers/model_builder_helper.py +54 -73
- onnx_diagnostic/helpers/torch_helper.py +18 -2
- onnx_diagnostic/reference/__init__.py +1 -0
- onnx_diagnostic/reference/ort_evaluator.py +29 -4
- onnx_diagnostic/reference/report_results_comparison.py +95 -0
- onnx_diagnostic/reference/torch_evaluator.py +21 -0
- onnx_diagnostic/tasks/automatic_speech_recognition.py +3 -0
- onnx_diagnostic/tasks/feature_extraction.py +3 -0
- onnx_diagnostic/tasks/fill_mask.py +3 -0
- onnx_diagnostic/tasks/image_classification.py +7 -1
- onnx_diagnostic/tasks/image_text_to_text.py +3 -0
- onnx_diagnostic/tasks/mixture_of_expert.py +3 -0
- onnx_diagnostic/tasks/object_detection.py +3 -0
- onnx_diagnostic/tasks/sentence_similarity.py +3 -0
- onnx_diagnostic/tasks/summarization.py +3 -0
- onnx_diagnostic/tasks/text2text_generation.py +3 -0
- onnx_diagnostic/tasks/text_classification.py +3 -0
- onnx_diagnostic/tasks/text_generation.py +90 -43
- onnx_diagnostic/tasks/zero_shot_image_classification.py +3 -0
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +78 -25
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +37 -0
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +365 -17
- onnx_diagnostic/torch_models/hghub/hub_api.py +20 -4
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +209 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +3 -0
- onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +23 -50
- onnx_diagnostic/torch_models/{test_helper.py → validate.py} +158 -103
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/METADATA +2 -2
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/RECORD +41 -39
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/top_level.txt +0 -0
onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

@@ -11,7 +11,7 @@ from ...helpers.torch_helper import is_torchdynamo_exporting
 
 
 def patched__vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
-    """
+    """manual patch for function ``transformers.masking_utils._vmap_for_bhqkv``."""
     from ...helpers import string_type
 
     dimensions: List[Tuple[Optional[int], ...]] = [
@@ -534,19 +534,169 @@ class patched_GenerationMixin:
         return model_inputs
 
 
-def
+def patched__compute_dynamic_ntk_parameters(
+    config: Optional[transformers.PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> Tuple["torch.Tensor", float]:
+    """
+    manual patch:
+    ``[patch:transformers.modeling_rope_utils._compute_dynamic_ntk_parameters]``
+
+    Computes the inverse frequencies with NTK scaling.
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length,
+            used to update the dynamic RoPE at inference time.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous
+            RoPE class instantiation, will be removed in v4.45.
+
+    Returns:
+        Tuple of (`torch.Tensor`, `float`),
+        containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the
+        computed cos/sin (unused in this type of RoPE).
     """
-
+    if config is not None and len(rope_kwargs) > 0:
+        raise ValueError(
+            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+            f"`_compute_dynamic_ntk_parameters`, got "
+            f"`rope_kwargs`={rope_kwargs} and `config`={config}"
+        )
+    if len(rope_kwargs) > 0:
+        base = rope_kwargs["base"]
+        dim = rope_kwargs["dim"]
+        max_position_embeddings = rope_kwargs["max_position_embeddings"]
+        factor = rope_kwargs["factor"]
+    elif config is not None:
+        base = config.rope_theta
+        partial_rotary_factor = (
+            config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+        )
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        dim = int(head_dim * partial_rotary_factor)
+        max_position_embeddings = config.max_position_embeddings
+        factor = config.rope_scaling["factor"]
+
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    # seq_len: default to max_position_embeddings, e.g. at init time
+    # seq_len = seq_len if seq_len is not None and
+    # seq_len > max_position_embeddings else max_position_embeddings
+    if seq_len is None:
+        seq_len = max_position_embeddings
+    else:
+        torch._check(isinstance(seq_len, torch.Tensor))
+        seq_len = torch.maximum(
+            seq_len,
+            torch.tensor(max_position_embeddings, dtype=seq_len.dtype, device=seq_len.device),
+        )
+
+    # Compute the inverse frequencies
+    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (
+        dim / (dim - 2)
+    )
+    inv_freq = 1.0 / (
+        base
+        ** (
+            torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float)
+            / dim
+        )
+    )
+    return inv_freq, attention_factor
+
+
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``
+
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.
+
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """
 
     def longrope_frequency_update(self, position_ids, device):
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = (
+            patched__compute_dynamic_ntk_parameters
+            if self.rope_init_fn
+            is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters
+            else self.rope_init_fn
+        )
        seq_len = torch.max(position_ids) + 1
        if hasattr(self.config, "original_max_position_embeddings"):
            original_max_position_embeddings = self.config.original_max_position_embeddings
        else:
            original_max_position_embeddings = self.config.max_position_embeddings
        # At export time, seq_len is unknown.
-        long_inv_freq, _ =
+        long_inv_freq, _ = rope_init_fn(
            self.config, device, seq_len=original_max_position_embeddings + 1
        )
        original_inv_freq = self.original_inv_freq.to(device)
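
As a side note, the scaling arithmetic performed by the new `patched__compute_dynamic_ntk_parameters` can be restated in a few lines. The sketch below is not part of the package; the configuration values (`base`, `dim`, `factor`, sequence lengths) are illustrative assumptions.

```python
import torch

# Illustrative values only (not taken from any specific model configuration).
base = 10000.0                 # config.rope_theta
dim = 64                       # head_dim * partial_rotary_factor
max_position_embeddings = 2048
factor = 2.0                   # config.rope_scaling["factor"]
seq_len = 4096                 # current length, longer than the training context

# Grow the RoPE base when the sequence exceeds the training context (dynamic NTK).
scaled_base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (
    dim / (dim - 2)
)
inv_freq = 1.0 / (scaled_base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
print(inv_freq.shape)  # torch.Size([32]): one frequency per pair of rotary dimensions
```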
@@ -565,21 +715,70 @@ def patched_dynamic_rope_update(rope_forward):
         # self.inv_freq = self.original_inv_freq
 
     def dynamic_frequency_update(self, position_ids, device):
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = (
+            patched__compute_dynamic_ntk_parameters
+            if self.rope_init_fn
+            is transformers.modeling_rope_utils._compute_dynamic_ntk_parameters
+            else self.rope_init_fn
+        )
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So: self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #     inv_freq, self.attention_scaling = self.rope_init_fn(
+        #         self.config, device, seq_len=seq_len
+        #     )
+        #     self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #     self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #     cond,
+        #     (lambda x, y: x.clone()),
+        #     (lambda x, y: y.clone()),
+        #     [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
-
-
-
-        )
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.max_seq_len_cached = seq_len
+        long_inv_freq, self.attention_scaling = rope_init_fn(
+            self.config, device, seq_len=seq_len
+        )
 
-
-
-
-
-
-
-
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.
+
+        # if (
+        #     seq_len < self.original_max_seq_len
+        #     and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #     self.original_inv_freq = self.original_inv_freq.to(device)
+        #     self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #     self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        self.inv_freq = inv_freq
 
     @wraps(rope_forward)
     def wrapper(self, x, position_ids):
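
The rewritten `dynamic_frequency_update` above replaces a data-dependent Python `if` with `torch.cond`, so that both branches stay in the exported graph. Below is a minimal, self-contained sketch of that pattern with an invented module and threshold; it is not code from the package.

```python
import torch


class PickInvFreq(torch.nn.Module):
    """Selects one of two precomputed frequency tensors with torch.cond."""

    def forward(self, seq_len, long_inv_freq, original_inv_freq):
        # Both branches must take the same operands and return tensors with
        # matching shape/dtype; .clone() avoids returning aliased inputs.
        return torch.cond(
            seq_len >= 16,                    # 0-dim boolean tensor as predicate
            lambda a, b: a.clone(),           # sequence grew past the threshold
            lambda a, b: b.clone(),           # otherwise keep the original cache
            [long_inv_freq, original_inv_freq],
        )


mod = PickInvFreq()
args = (torch.tensor(32), torch.rand(8), torch.rand(8))
print(mod(*args))                             # eager mode runs a single branch
ep = torch.export.export(mod, args)           # the exported graph keeps both branches
print(ep.graph)
```

The patch applies the same idea with `(seq_len >= self.original_max_seq_len).item()` as the predicate.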
@@ -619,3 +818,152 @@ class patched_Phi3RotaryEmbedding(torch.nn.Module):
         sin = emb.sin() * self.attention_scaling
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class patched_IdeficsEmbedding(torch.nn.Module):
+    _PATCHES_ = ["forward"]
+    _PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsEmbedding
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # if seq_len > self.max_seq_len_cached:
+        #     self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        def _set_cos_sin_cache_then(x, inv_freq, seq_len, _cos_cached, _sin_cached):
+            t = torch.arange(seq_len, device=x.device, dtype=torch.int64).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            return emb.cos().to(x.dtype), emb.sin().to(x.dtype)
+
+        def _set_cos_sin_cache_else(_x, _inv_freq, _seq_len, cos_cached, sin_cached):
+            torch._check(seq_len.item() <= cos_cached.shape[0])
+            co = cos_cached[: seq_len.item()].detach().clone()
+            torch._check(seq_len.item() <= sin_cached.shape[0])
+            si = sin_cached[: seq_len.item()].detach().clone()
+            return co.to(dtype=x.dtype), si.to(dtype=x.dtype)
+
+        cos_cached, sin_cached = torch.cond(
+            (seq_len > self.max_seq_len_cached).item(),
+            _set_cos_sin_cache_then,
+            _set_cos_sin_cache_else,
+            [x, self.inv_freq, seq_len, self.cos_cached, self.sin_cached],
+        )
+        return cos_cached, sin_cached
+
+
+class patched_IdeficsAttention(torch.nn.Module):
+    _PATCHES_ = ["forward"]
+    _PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsAttention
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        is_cross_attention = self.is_cross_attention or key_value_states is not None
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = (
+            self.q_proj(hidden_states)
+            .view(bsz, q_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+        if not is_cross_attention:
+            key_states = (
+                self.k_proj(hidden_states)
+                .view(bsz, q_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+            value_states = (
+                self.v_proj(hidden_states)
+                .view(bsz, q_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+        else:
+            _, kv_len, _ = (
+                key_value_states.size()
+            )  # Note that, in this case, `kv_len` == `kv_seq_len`
+            key_states = (
+                self.k_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+            value_states = (
+                self.v_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += cache_position[0]
+
+        if not is_cross_attention:
+            rotary_length = torch.maximum(
+                torch.tensor(kv_seq_len, dtype=torch.int64),
+                torch.tensor(q_len, dtype=torch.int64),
+            )
+            cos, sin = self.rotary_emb(value_states, seq_len=rotary_length)
+            query_states, key_states = (
+                transformers.models.idefics.modeling_idefics.apply_rotary_pos_emb(
+                    query_states, key_states, cos, sin, position_ids
+                )
+            )
+        # [bsz, nh, t, hd]
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models;
+            # cache_position needed for the static cache
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        if self.qk_layer_norms:
+            query_states = self.q_layer_norm(query_states)
+            key_states = self.k_layer_norm(key_states)
+
+        attention_interface: Callable = (
+            transformers.models.idefics.modeling_idefics.eager_attention_forward
+        )
+
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                transformers.models.idefics.modeling_idefics.logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support "
+                    "`output_attentions=True`. Falling back to "
+                    "eager attention. This warning can be removed using the argument "
+                    '`attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS[
+                    self.config._attn_implementation
+                ]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
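
Both Idefics patches also guard data-dependent slicing with `torch._check`, which bounds an otherwise unknown length for the exporter. The small sketch below illustrates that idea with an invented cache size; it runs eagerly, and under `torch.export` the same checks become constraints on the unbacked symbol produced by `.item()`.

```python
import torch


class SlicedCosCache(torch.nn.Module):
    def __init__(self, max_len: int = 128, dim: int = 16):
        super().__init__()
        self.register_buffer("cos_cached", torch.rand(max_len, dim))

    def forward(self, seq_len: torch.Tensor):
        n = seq_len.item()                           # unbacked symbolic int at export time
        torch._check(n >= 0)                         # lower bound for the slice
        torch._check(n <= self.cos_cached.shape[0])  # never read past the cache
        return self.cos_cached[:n].detach().clone()


cache = SlicedCosCache()
print(cache(torch.tensor(32)).shape)  # torch.Size([32, 16])
```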
onnx_diagnostic/torch_models/hghub/hub_api.py

@@ -2,6 +2,7 @@ import copy
 import functools
 import json
 import os
+import pprint
 from typing import Any, Dict, List, Optional, Union
 import transformers
 from huggingface_hub import HfApi, model_info, hf_hub_download

@@ -33,10 +34,14 @@ def _retrieve_cached_configurations() -> Dict[str, transformers.PretrainedConfig
     return res
 
 
-def get_cached_configuration(
+def get_cached_configuration(
+    name: str, exc: bool = False, **kwargs
+) -> Optional[transformers.PretrainedConfig]:
     """
     Returns cached configuration to avoid having to many accesses to internet.
     It returns None if not Cache. The list of cached models follows.
+    If *exc* is True or if environment variable ``NOHTTP`` is defined,
+    the function raises an exception if *name* is not found.
 
     .. runpython::
 

@@ -54,8 +59,11 @@ def get_cached_configuration(name: str, **kwargs) -> Optional[transformers.Pretr
         conf = copy.deepcopy(conf)
         update_config(conf, kwargs)
         return conf
-
-
+    assert not exc and not os.environ.get("NOHTTP", ""), (
+        f"Unable to find {name!r} (exc={exc}, "
+        f"NOHTTP={os.environ.get('NOHTTP', '')!r}) "
+        f"in {pprint.pformat(sorted(cached))}"
+    )
     return None
 
 

@@ -64,6 +72,7 @@ def get_pretrained_config(
     trust_remote_code: bool = True,
     use_preinstalled: bool = True,
     subfolder: Optional[str] = None,
+    use_only_preinstalled: bool = False,
     **kwargs,
 ) -> Any:
     """

@@ -77,13 +86,20 @@ def get_pretrained_config(
         :func:`get_cached_configuration`, the cached list is mostly for
         unit tests
     :param subfolder: subfolder for the given model id
+    :param use_only_preinstalled: if True, raises an exception if not preinstalled
     :param kwargs: additional kwargs
     :return: a configuration
     """
     if use_preinstalled:
-        conf = get_cached_configuration(
+        conf = get_cached_configuration(
+            model_id, exc=use_only_preinstalled, subfolder=subfolder, **kwargs
+        )
         if conf is not None:
             return conf
+        assert not use_only_preinstalled, (
+            f"Inconsistencies: use_only_preinstalled={use_only_preinstalled}, "
+            f"use_preinstalled={use_preinstalled!r}"
+        )
     if subfolder:
         try:
             return transformers.AutoConfig.from_pretrained(