onnx-diagnostic 0.8.3-py3-none-any.whl → 0.8.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +47 -10
- onnx_diagnostic/export/api.py +81 -50
- onnx_diagnostic/export/control_flow_research.py +10 -5
- onnx_diagnostic/export/onnx_plug.py +250 -61
- onnx_diagnostic/ext_test_case.py +99 -53
- onnx_diagnostic/helpers/dot_helper.py +37 -25
- onnx_diagnostic/helpers/helper.py +44 -38
- onnx_diagnostic/helpers/onnx_helper.py +441 -18
- onnx_diagnostic/helpers/ort_session.py +8 -8
- onnx_diagnostic/helpers/torch_helper.py +28 -2
- onnx_diagnostic/reference/ort_evaluator.py +6 -29
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py +1 -0
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py +10 -1
- onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py +168 -113
- onnx_diagnostic/torch_models/code_sample.py +2 -1
- onnx_diagnostic/torch_models/hghub/model_inputs.py +34 -7
- onnx_diagnostic/torch_models/validate.py +14 -1
- onnx_diagnostic/torch_onnx/runtime_info.py +1 -24
- onnx_diagnostic/torch_onnx/sbs.py +11 -5
- onnx_diagnostic/torch_onnx/sbs_dataclasses.py +48 -4
- {onnx_diagnostic-0.8.3.dist-info → onnx_diagnostic-0.8.5.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.8.3.dist-info → onnx_diagnostic-0.8.5.dist-info}/RECORD +26 -26
- {onnx_diagnostic-0.8.3.dist-info → onnx_diagnostic-0.8.5.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.8.3.dist-info → onnx_diagnostic-0.8.5.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.8.3.dist-info → onnx_diagnostic-0.8.5.dist-info}/top_level.txt +0 -0
onnx_diagnostic/reference/ort_evaluator.py

@@ -18,10 +18,11 @@ from onnx.defs import onnx_opset_version
 import onnxruntime
 from ..helpers import string_type
 from ..helpers.onnx_helper import (
-
+    get_hidden_inputs,
     dtype_to_tensor_dtype,
-    to_array_extended,
     np_dtype_to_tensor_dtype,
+    to_array_extended,
+    pretty_onnx,
 )
 from ..helpers.torch_helper import onnx_dtype_to_torch_dtype, torch_dtype_to_onnx_dtype
 from ..helpers.ort_session import (
@@ -472,39 +473,15 @@ class OnnxruntimeEvaluator:
                 yield from self.enumerate_nodes(att.g.node)
         yield node

-    @classmethod
-    def _get_hidden_inputs(cls, graph: GraphProto) -> Set[str]:
-        """
-        Returns the hidden inputs (inputs coming from an upper context)
-        used by a subgraph.
-        """
-        hidden = set()
-        memo = (
-            {i.name for i in graph.initializer}
-            | {i.name for i in graph.sparse_initializer}
-            | {i.name for i in graph.input}
-        )
-        for node in graph.node:
-            for i in node.input:
-                if i not in memo:
-                    hidden.add(i)
-            for att in node.attribute:
-                if att.type == AttributeProto.GRAPH and att.g:
-                    hid = cls._get_hidden_inputs(att.g)
-                    less = set(h for h in hid if h not in memo)
-                    hidden |= less
-            memo |= set(node.output)
-        return hidden
-
     @classmethod
     def _get_hidden_node_inputs(cls, node: NodeProto) -> Set[str]:
-        """Calls multiple
+        """Calls multiple get_hidden_inputs on every attribute."""
         if node.op_type not in {"Loop", "Scan", "If"}:
             return set()
         hidden = set()
         for att in node.attribute:
             if att.type == AttributeProto.GRAPH:
-                hidden |=
+                hidden |= get_hidden_inputs(att.g)
         return hidden - (hidden & set(node.input))

     def _get_sess(
@@ -624,7 +601,7 @@ class OnnxruntimeEvaluator:
             value = oh.make_tensor_value_info(i, dtype_to_tensor_dtype(it.dtype), it.shape)
             vinputs.append(value)

-        reduced_set =
+        reduced_set = get_hidden_inputs(g)
         for i, v in context.items():
             if i in reduced_set and i not in unique_names:
                 unique_names.add(i)
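The classmethod removed above now comes from the shared helper `get_hidden_inputs` in `onnx_diagnostic.helpers.onnx_helper`, imported in the first hunk. A minimal sketch of what such a helper does, reusing the traversal of the removed classmethod; the name `get_hidden_inputs_sketch` is illustrative, not the library's API:

```python
from typing import Set
from onnx import AttributeProto, GraphProto


def get_hidden_inputs_sketch(graph: GraphProto) -> Set[str]:
    """Collect names used by a subgraph that come from an outer scope
    (same traversal as the classmethod removed above)."""
    hidden: Set[str] = set()
    # Names already known inside the subgraph: initializers and explicit inputs.
    memo = (
        {i.name for i in graph.initializer}
        | {i.name for i in graph.sparse_initializer}
        | {i.name for i in graph.input}
    )
    for node in graph.node:
        for name in node.input:
            if name not in memo:
                hidden.add(name)
        for att in node.attribute:
            if att.type == AttributeProto.GRAPH and att.g:
                # Recurse into nested subgraphs (Loop/Scan/If bodies).
                hidden |= {h for h in get_hidden_inputs_sketch(att.g) if h not in memo}
        memo |= set(node.output)
    return hidden
```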
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py

@@ -1,3 +1,4 @@
+import inspect
 from typing import Callable, List, Optional, Tuple
 import torch

@@ -19,6 +20,12 @@ if patch_masking_utils:
         prepare_padding_mask,
     )

+    _prepare_padding_mask_kwargs = (
+        dict(_slice=False)
+        if "_slice" in inspect.signature(prepare_padding_mask).parameters
+        else {}
+    )
+
     try:
         # transformers>=5.0
         from transformers.masking_utils import (
@@ -132,7 +139,9 @@ if patch_masking_utils:
     ) -> Optional[torch.Tensor]:
         """manual patch for function ``transformers.masking_utils.sdpa_mask_recent_torch``."""
         q_length = cache_position.shape[0]
-        padding_mask = prepare_padding_mask(
+        padding_mask = prepare_padding_mask(
+            attention_mask, kv_length, kv_offset, **_prepare_padding_mask_kwargs
+        )
         if allow_is_causal_skip and _ignore_causal_mask_sdpa(
             padding_mask, q_length, kv_length, kv_offset, local_size
         ):
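The `_prepare_padding_mask_kwargs` guard above keeps the patch working across transformers versions: the `_slice` keyword is only forwarded when `prepare_padding_mask` actually declares it. A self-contained sketch of the same feature-detection pattern, with hypothetical old/new signatures standing in for the two transformers variants:

```python
import inspect


def call_with_optional_slice(prepare, attention_mask, kv_length, kv_offset):
    """Forward ``_slice=False`` only when the callee accepts that keyword."""
    extra = (
        dict(_slice=False)
        if "_slice" in inspect.signature(prepare).parameters
        else {}
    )
    return prepare(attention_mask, kv_length, kv_offset, **extra)


# Hypothetical stand-ins for the two signatures.
def prepare_old(attention_mask, kv_length, kv_offset):
    return "old", attention_mask, kv_length, kv_offset


def prepare_new(attention_mask, kv_length, kv_offset, _slice=True):
    return "new", attention_mask, kv_length, kv_offset, _slice


print(call_with_optional_slice(prepare_old, None, 8, 0))  # no extra keyword passed
print(call_with_optional_slice(prepare_new, None, 8, 0))  # _slice=False forwarded
```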
onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py

@@ -1,9 +1,11 @@
 import os
-from typing import Callable, Optional
+from typing import Callable, Optional, Tuple
 import onnx
+import onnx.helper as oh
 import torch
 import torch.nn.functional as F
 from ...export.onnx_plug import EagerDirectReplacementWithOnnx
+from ...helpers.torch_helper import torch_dtype_to_onnx_dtype
 from .patch_helper import _is_torchdynamo_exporting
 from ._patch_transformers_attention import patched_sdpa_attention_forward

@@ -22,7 +24,43 @@ if patch_qwen2_5:

     onnx_plugs_op = onnxscript.values.Opset("onnx_plug", 1)
     op = onnxscript.opset22
+    op23 = onnxscript.onnx_opset.opset23
     msft_op = onnxscript.values.Opset("com.microsoft", 1)
+    STOPAT = (
+        int(os.environ.get("STOPAT", None))
+        if os.environ.get("STOPAT", None) is not None
+        else None
+    )
+
+    def _add_com_microsoft_opset(function_proto: onnx.FunctionProto) -> onnx.FunctionProto:
+        opsets = {d.domain: d.version for d in function_proto.opset_import}
+        if "com.microsoft" not in opsets:
+            d = function_proto.opset_import.add()
+            d.domain = "com.microsoft"
+            d.version = 1
+        return function_proto
+
+    def _update_sequence_type(
+        itype: int, function_proto: onnx.FunctionProto
+    ) -> onnx.FunctionProto:
+        proto = oh.make_function(
+            function_proto.domain,
+            function_proto.name,
+            function_proto.input,
+            function_proto.output,
+            [
+                (
+                    oh.make_node("SequenceEmpty", node.input, node.output, dtype=itype)
+                    if node.op_type == "SequenceEmpty"
+                    else node
+                )
+                for node in function_proto.node
+            ],
+            attributes=function_proto.attribute,
+            attribute_protos=function_proto.attribute_proto,
+            opset_imports=function_proto.opset_import,
+        )
+        return proto

     @onnxscript.script(opset=onnx_plugs_op)
     def LoopMHAAttention(
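Both helpers above feed the registration dictionary further down in this file: `_add_com_microsoft_opset` declares the `com.microsoft` domain for function protos that rely on contrib ops such as `MultiHeadAttention`, and `_update_sequence_type` rewrites the `dtype` attribute of `SequenceEmpty` nodes so a float16 variant can be derived from the float32 onnxscript function. A short sketch of how they combine, mirroring the dictionary entries registered later:

```python
# Sketch mirroring the ("LOOPMHA", FLOAT16) entry registered below.
loop_mha_f32 = _add_com_microsoft_opset(LoopMHAAttention.to_function_proto())
loop_mha_f16 = _update_sequence_type(onnx.TensorProto.FLOAT16, loop_mha_f32)
```

Note that `_update_sequence_type` builds a new `FunctionProto` with `oh.make_function` rather than mutating its argument, so the float32 proto can be reused.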
@@ -32,7 +70,6 @@ if patch_qwen2_5:
         cu_seqlens,
         scaling: float = 0.11180339887498948,
         num_heads: int = 16,
-        itype: int = onnx.TensorProto.FLOAT,
     ):
         to_3d_shape = op.Constant(value_ints=[0, 0, -1])
         query_transposed = op.Transpose(query_states, perm=[0, 2, 1, 3])
@@ -45,7 +82,7 @@ if patch_qwen2_5:
         seq_axis = op.Constant(value_ints=[1])
         seq_axis_int32 = op.Cast(seq_axis, to=onnx.TensorProto.INT32)
         # attn_output = op.Slice(value_3d, [0], [0], seq_axis)
-        seq_attn = op.SequenceEmpty(dtype=
+        seq_attn = op.SequenceEmpty(dtype=onnx.TensorProto.FLOAT)
         for i_patch in range(num_patches):
             i_1d = op.Reshape(i_patch, [1])
             i_plus_1_1d = i_1d + 1
@@ -55,11 +92,7 @@ if patch_qwen2_5:
             key_i = op.Slice(key_3d, start, end, seq_axis_int32)
             value_i = op.Slice(value_3d, start, end, seq_axis_int32)
             mha_output = msft_op.MultiHeadAttention(
-                query_i,
-                key_i,
-                value_i,
-                num_heads=num_heads,
-                scale=scaling,
+                query_i, key_i, value_i, num_heads=num_heads, scale=scaling
             )
             # attn_output = op.Concat(attn_output, mha_output, axis=1)
             seq_attn = op.SequenceInsert(seq_attn, mha_output)
@@ -67,13 +100,47 @@ if patch_qwen2_5:
         attn_output_4d = op.Reshape(attn_output, output_shape)
         return attn_output_4d

-
-
-
-
-
-
-
+    @onnxscript.script(opset=onnx_plugs_op)
+    def LoopAttention23(
+        query_states,
+        key_states,
+        value_states,
+        cu_seqlens,
+        scaling: float = 0.11180339887498948,
+        num_heads: int = 16,
+    ):
+        to_3d_shape = op23.Constant(value_ints=[0, 0, -1])
+        query_transposed = op23.Transpose(query_states, perm=[0, 2, 1, 3])
+        output_shape = op23.Shape(query_transposed)
+        query_3d = op23.Reshape(query_transposed, to_3d_shape)
+        value_3d = op23.Reshape(op23.Transpose(value_states, perm=[0, 2, 1, 3]), to_3d_shape)
+        key_3d = op23.Reshape(op23.Transpose(key_states, perm=[0, 2, 1, 3]), to_3d_shape)
+        cu_seqlens = op23.Cast(cu_seqlens, to=onnx.TensorProto.INT32)
+        num_patches = op23.Size(cu_seqlens) - 1
+        seq_axis = op23.Constant(value_ints=[1])
+        seq_axis_int32 = op23.Cast(seq_axis, to=onnx.TensorProto.INT32)
+        seq_attn = op23.SequenceEmpty(dtype=onnx.TensorProto.FLOAT)
+        for i_patch in range(num_patches):
+            i_1d = op23.Reshape(i_patch, [1])
+            i_plus_1_1d = i_1d + 1
+            start = op23.Gather(cu_seqlens, i_1d, axis=0)
+            end = op23.Gather(cu_seqlens, i_plus_1_1d, axis=0)
+            query_i = op23.Slice(query_3d, start, end, seq_axis_int32)
+            key_i = op23.Slice(key_3d, start, end, seq_axis_int32)
+            value_i = op23.Slice(value_3d, start, end, seq_axis_int32)
+            mha_output = op23.Attention(
+                query_i,
+                key_i,
+                value_i,
+                scale=scaling,
+                q_num_heads=num_heads,
+                kv_num_heads=num_heads,
+                softmax_precision=onnx.TensorProto.FLOAT,
+            )
+            seq_attn = op23.SequenceInsert(seq_attn, mha_output)
+        attn_output = op23.ConcatFromSequence(seq_attn, axis=1)
+        attn_output_4d = op23.Reshape(attn_output, output_shape)
+        return attn_output_4d

     @onnxscript.script(opset=onnx_plugs_op)
     def PackedAttention(
@@ -132,8 +199,40 @@ if patch_qwen2_5:
         cu_seqlens: torch.Tensor,  # F7su19
         scaling: float = 0,
         num_heads: int = 16,
-        itype: int = onnx.TensorProto.FLOAT,
     ) -> torch.Tensor:
+        """
+        The loop can be removed with the following code
+        but it hits memory overflow for big inputs.
+
+        .. code-block:: python
+
+            # make square mask
+            indices = torch.arange(
+                cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
+            )
+            dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(
+                cu_seqlens.dtype
+            )
+            dot = dot.sum(dim=0)
+            mask = dot.unsqueeze(1) - dot.unsqueeze(0)
+            bool_mask = mask == 0
+            bool_mask = bool_mask.unsqueeze(0).unsqueeze(0)
+
+            torch._check(bool_mask.shape[2] == key_states.shape[2])
+            torch._check(bool_mask.shape[3] == key_states.shape[2])
+
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=bool_mask,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                is_causal=False,
+                **kwargs,
+            )
+        """
         lengths = cu_seqlens[1:] - cu_seqlens[:-1]
         splits = [
             torch.split(tensor, lengths.tolist(), dim=2)
@@ -156,36 +255,58 @@ if patch_qwen2_5:
         attn_output = torch.cat(attn_outputs, dim=1)
         return attn_output

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def qwen_version_selector(opset: int, *args: torch.Tensor) -> Tuple[str, torch.dtype]:
+        first_tensor = next(a for a in args if a is not None)
+        dtype = first_tensor.dtype
+        strategy = patched_Qwen2_5_VLVisionAttention.STRATEGY_FOR_ATTENTION()
+        itype = torch_dtype_to_onnx_dtype(dtype)
+        if strategy is not None:
+            return strategy, itype
+        if dtype == torch.float32 or itype == onnx.TensorProto.FLOAT:
+            if opset >= 23:
+                return "LOOPA23", itype
+            return "LOOPMHA", itype
+        if dtype == torch.float16 or itype == onnx.TensorProto.FLOAT16:
+            # first_tensor may be a SymbolicTensor (onnx).
+            # is_cuda is not available.
+            if hasattr(first_tensor, "is_cuda") and first_tensor.is_cuda:
+                return "PACKED", itype
+            return "LOOPMHA", itype
+        raise AssertionError(
+            f"Unable to handle type {torch.dtype} (itype={itype}) "
+            f"on device {torch.device} with opset={opset}"
+        )

-
+    qwen_sdpa_attention_versatile = EagerDirectReplacementWithOnnx(
         qwen_sdpa_attention,
         lambda qs, *args, **kwargs: torch.empty(
             (qs.shape[0], qs.shape[2], qs.shape[1], qs.shape[3]),
             dtype=qs.dtype,
             device=qs.device,
         ),
-
+        {
+            ("PACKED", onnx.TensorProto.FLOAT16): _add_com_microsoft_opset(
+                PackedAttention.to_function_proto()
+            ),
+            ("LOOPA23", onnx.TensorProto.FLOAT): LoopAttention23.to_function_proto(),
+            ("LOOPA23", onnx.TensorProto.FLOAT16): _update_sequence_type(
+                onnx.TensorProto.FLOAT16, LoopAttention23.to_function_proto()
+            ),
+            ("LOOPMHA", onnx.TensorProto.FLOAT): _add_com_microsoft_opset(
+                LoopMHAAttention.to_function_proto()
+            ),
+            ("LOOPMHA", onnx.TensorProto.FLOAT16): _update_sequence_type(
+                onnx.TensorProto.FLOAT16,
+                _add_com_microsoft_opset(LoopMHAAttention.to_function_proto()),
+            ),
+        },
         n_inputs=4,
         n_outputs=1,
-        kwargs=dict(scaling=0.11180339887498948, num_heads=16
-        name="
+        kwargs=dict(scaling=0.11180339887498948, num_heads=16),
+        name="qwen_sdpa_attention_versatile",
+        version_selector=qwen_version_selector,
     )
-    PLUGS.append(
+    PLUGS.append(qwen_sdpa_attention_versatile)

     class patched_Qwen2_5_VLForConditionalGeneration:
         _PATCHES_ = ["prepare_inputs_for_generation"]
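With `STRATEGY_FOR_ATTENTION` now defaulting to `None` (see the patched class below), `qwen_version_selector` picks `LOOPA23`, `LOOPMHA` or `PACKED` automatically from dtype, device and opset, and the `QWEN25ATTENTION` environment variable stays available to force a strategy. A hedged sketch of forcing it for one export session:

```python
import os

# Force the MultiHeadAttention-based loop instead of the automatic selection.
os.environ["QWEN25ATTENTION"] = "LOOPMHA"
# ... run the export with the qwen2.5 patches enabled ...
# Removing the variable restores the automatic choice (the selector sees strategy=None).
os.environ.pop("QWEN25ATTENTION", None)
```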
@@ -434,6 +555,8 @@ if patch_qwen2_5:
                 position_embeddings=position_embeddings,
                 **kwargs,
             )
+            if STOPAT is not None and layer_num > STOPAT:
+                break

         hidden_states = self.merger(hidden_states)
         reverse_indices = torch.argsort(window_index)
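The new `STOPAT` environment variable, read when the patch module is imported (see the hunk near the top of the file), lets the patched vision forward stop after a given block index, which helps when bisecting numerical differences layer by layer. A hedged sketch:

```python
import os

# Hypothetical debugging session: skip vision blocks with layer_num > 3.
# The variable must be set before the patch module is imported,
# since STOPAT is evaluated at import time.
os.environ["STOPAT"] = "3"
# ... import the patches and run the model or the export; the ``break``
# added above leaves the remaining blocks unexecuted ...
```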
@@ -473,9 +596,7 @@ if patch_qwen2_5:
         _PATCHED_CLASS_ = (
             transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLVisionAttention
         )
-        STRATEGY_FOR_ATTENTION = lambda: os.environ.get(  # noqa: E731
-            "QWEN25ATTENTION", "PACKED"
-        )
+        STRATEGY_FOR_ATTENTION = lambda: os.environ.get("QWEN25ATTENTION", None)  # noqa: E731

         def forward(
             self,
@@ -519,14 +640,15 @@ if patch_qwen2_5:
                 self.config._attn_implementation
             ]

-
+            is_sdpa_or_eager = (
                 attention_interface
                 is transformers.integrations.sdpa_attention.sdpa_attention_forward
                 or attention_interface is patched_sdpa_attention_forward
+                or attention_interface
+                is transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.eager_attention_forward
             )
-
-
-            attn_output = qwen_sdpa_attention_packed_versatile(
+            if is_sdpa_or_eager:
+                attn_output = qwen_sdpa_attention_versatile(
                     query_states,
                     key_states,
                     value_states,
@@ -558,78 +680,10 @@ if patch_qwen2_5:
                     ),
                     version=1,
                 )
-            elif is_sdpa and attention_strategy == "LOOPMHA":
-                attn_output = qwen_sdpa_attention_loopmha_versatile(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens,
-                    self.scaling,
-                    self.num_heads,
-                    (
-                        onnx.TensorProto.FLOAT
-                        if query_states.dtype == torch.float32
-                        else (
-                            onnx.TensorProto.FLOAT16
-                            if query_states.dtype == torch.float16
-                            else onnx.TensorProto.BFLOAT16
-                        )
-                    ),
-                )
-
-                # to rewrite later with a for loop
-                # def _iteration(start_end, query_states, key_states, value_states):
-                #     return patched_Qwen2_5_VLVisionAttentionOneIteration.forward(
-                #         self,
-                #         start_end,
-                #         query_states,
-                #         key_states,
-                #         value_states,
-                #         scaling=self.scaling,
-                #         dropout=0.0 if not self.training else self.attention_dropout,
-                #     )
-
-                # starts = cu_seqlens[:-1]
-                # ends = cu_seqlens[1:]
-                # torch._check(starts.shape[0] > 0)
-                # torch._check(ends.shape[0] > 0)
-                # starts_ends = torch.cat([starts.unsqueeze(1), ends.unsqueeze(1)], dim=1)
-                # attn_outputs = [
-                #     _iteration(start_end, query_states, key_states, value_states)
-                #     for start_end in starts_ends
-                # ]
-                # attn_output = torch.cat(attn_outputs, dim=1)
-            elif is_sdpa and attention_strategy == "BIGMASK":
-                # make square mask
-                indices = torch.arange(
-                    cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
-                )
-                dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(
-                    cu_seqlens.dtype
-                )
-                dot = dot.sum(dim=0)
-                mask = dot.unsqueeze(1) - dot.unsqueeze(0)
-                bool_mask = mask == 0
-                bool_mask = bool_mask.unsqueeze(0).unsqueeze(0)
-
-                torch._check(bool_mask.shape[2] == key_states.shape[2])
-                torch._check(bool_mask.shape[3] == key_states.shape[2])
-
-                attn_output, _ = attention_interface(
-                    self,
-                    query_states,
-                    key_states,
-                    value_states,
-                    attention_mask=bool_mask,
-                    scaling=self.scaling,
-                    dropout=0.0 if not self.training else self.attention_dropout,
-                    is_causal=False,
-                    **kwargs,
-                )
             else:
                 raise NotImplementedError(
-                    f"No corresponding export strategy for "
-                    f"{
+                    f"No corresponding export strategy for implementation "
+                    f"{self.config._attn_implementation!r}, "
                     f"(use QWEN25ATTENTION to change it), and attention_interface="
                     f"{attention_interface!r} (use sdpa)"
                 )
@@ -653,6 +707,7 @@ if patch_qwen2_5:
             )
         else:
             # Other implementations: Process each chunk separately
+            # = qwen_sdpa_attention
             lengths = cu_seqlens[1:] - cu_seqlens[:-1]
             splits = [
                 torch.split(tensor, lengths.tolist(), dim=2)
onnx_diagnostic/torch_models/code_sample.py

@@ -236,7 +236,7 @@ def code_sample(
        )
    )
    """
-    model_id, subfolder, same_as_pretrained, use_pretrained = _preprocess_model_id(
+    model_id, subfolder, same_as_pretrained, use_pretrained, submodule = _preprocess_model_id(
        model_id,
        subfolder,
        same_as_pretrained=same_as_pretrained,
@@ -256,6 +256,7 @@ def code_sample(
         model_kwargs=mop,
         subfolder=subfolder,
         add_second_input=False,
+        submodule=submodule,
     )
     if drop_inputs:
         update = {}
onnx_diagnostic/torch_models/hghub/model_inputs.py

@@ -26,17 +26,26 @@ def _code_needing_rewriting(model: Any) -> Any:


 def _preprocess_model_id(
-    model_id: str,
-
+    model_id: str,
+    subfolder: Optional[str],
+    same_as_pretrained: bool,
+    use_pretrained: bool,
+    submodule: Optional[str] = None,
+) -> Tuple[str, Optional[str], bool, bool, Optional[str]]:
+    if "::" in model_id:
+        assert (
+            not submodule
+        ), f"submodule={submodule!r} cannot be defined in model_id={model_id!r} as well"
+        model_id, submodule = model_id.split("::", maxsplit=1)
     if subfolder or "//" not in model_id:
-        return model_id, subfolder, same_as_pretrained, use_pretrained
+        return model_id, subfolder, same_as_pretrained, use_pretrained, submodule
     spl = model_id.split("//")
     if spl[-1] == "pretrained":
-        return _preprocess_model_id("//".join(spl[:-1]), "", True, True)
+        return _preprocess_model_id("//".join(spl[:-1]), "", True, True, submodule)
     if spl[-1] in {"transformer", "vae"}:
         # known subfolder
-        return "//".join(spl[:-1]), spl[-1], same_as_pretrained, use_pretrained
-    return model_id, subfolder, same_as_pretrained, use_pretrained
+        return "//".join(spl[:-1]), spl[-1], same_as_pretrained, use_pretrained, submodule
+    return model_id, subfolder, same_as_pretrained, use_pretrained, submodule


 def get_untrained_model_with_inputs(
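`_preprocess_model_id` can now carry the target submodule inside the `model_id` itself, separated by `::`. A small sketch of the parsing rule with a hypothetical identifier:

```python
# Hypothetical identifier: everything after "::" names a submodule to extract.
model_id = "some-org/some-vlm::visual"
submodule = None
if "::" in model_id:
    model_id, submodule = model_id.split("::", maxsplit=1)
print(model_id)   # some-org/some-vlm
print(submodule)  # visual
```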
@@ -54,6 +63,7 @@ def get_untrained_model_with_inputs(
     subfolder: Optional[str] = None,
     use_only_preinstalled: bool = False,
     config_reduction: Optional[Callable[[Any, str], Dict]] = None,
+    submodule: Optional[str] = None,
 ) -> Dict[str, Any]:
     """
     Gets a non initialized model similar to the original model
@@ -82,6 +92,7 @@ def get_untrained_model_with_inputs(
         <onnx_diagnostic.torch_models.hghub.reduce_model_config>`,
         this function takes a configuration and a task (string)
         as arguments
+    :param submodule: use a submodule instead of the main model
     :return: dictionary with a model, inputs, dynamic shapes, and the configuration,
         some necessary rewriting as well

@@ -108,11 +119,12 @@ def get_untrained_model_with_inputs(
             f"model_id={model_id!r}, preinstalled model is only available "
             f"if use_only_preinstalled is False."
         )
-    model_id, subfolder, same_as_pretrained, use_pretrained = _preprocess_model_id(
+    model_id, subfolder, same_as_pretrained, use_pretrained, submodule = _preprocess_model_id(
         model_id,
         subfolder,
         same_as_pretrained=same_as_pretrained,
         use_pretrained=use_pretrained,
+        submodule=submodule,
     )
     if verbose:
         print(
@@ -147,6 +159,8 @@ def get_untrained_model_with_inputs(
     if verbose:
         print(f"[get_untrained_model_with_inputs] architecture={arch!r}")
         print(f"[get_untrained_model_with_inputs] cls={config.__class__.__name__!r}")
+        if submodule:
+            print(f"[get_untrained_model_with_inputs] submodule={submodule!r}")
     if task is None:
         task = task_from_arch(arch, model_id=model_id, subfolder=subfolder)
     if verbose:
@@ -357,6 +371,19 @@ def get_untrained_model_with_inputs(
     if diff_config is not None:
         res["dump_info"] = dict(config_diff=diff_config)

+    if submodule:
+        path = submodule.split("::") if "::" in submodule else [submodule]
+        for p in path:
+            assert hasattr(model, p), (
+                f"Unable to find submodule {p!r} in in class {type(model)}, "
+                f"submodule={submodule!r}, possible candidates: "
+                f"{[k for k in dir(model) if isinstance(getattr(model, k), torch.nn.Module)]}"
+            )
+            model = getattr(model, p)
+
+        if verbose:
+            print(f"[get_untrained_model_with_inputs] model class={model.__class__.__name__!r}")
+
     sizes = compute_model_size(model)
     res["model"] = model
     res["configuration"] = config
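Once the model is built, the requested submodule is resolved by walking the `::`-separated attribute path with `getattr`, and the resulting module replaces the full model in the returned dictionary. A hedged usage sketch with a hypothetical model id and attribute name:

```python
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs

# Either pass submodule= explicitly or embed it in the model id ("<id>::<attr>").
# The id and the "visual" attribute below are illustrative, not guaranteed names.
data = get_untrained_model_with_inputs(
    "some-org/some-vlm", submodule="visual", verbose=1
)
print(type(data["model"]))  # the resolved submodule, not the full model
```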