PyPI - onnx-diagnostic - Versions diffs - 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl - Mend

onnx-diagnostic 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

onnx_diagnostic/__init__.py +1 -1
onnx_diagnostic/_command_lines_parser.py +56 -3
onnx_diagnostic/export/dynamic_shapes.py +24 -10
onnx_diagnostic/export/shape_helper.py +6 -2
onnx_diagnostic/ext_test_case.py +2 -0
onnx_diagnostic/helpers/_log_helper.py +6 -6
onnx_diagnostic/helpers/cache_helper.py +326 -18
onnx_diagnostic/helpers/config_helper.py +10 -0
onnx_diagnostic/helpers/helper.py +152 -11
onnx_diagnostic/helpers/mini_onnx_builder.py +7 -2
onnx_diagnostic/helpers/onnx_helper.py +13 -7
onnx_diagnostic/helpers/torch_helper.py +33 -11
onnx_diagnostic/reference/ops/op_cast_like.py +15 -11
onnx_diagnostic/reference/torch_ops/__init__.py +1 -0
onnx_diagnostic/reference/torch_ops/unary_ops.py +7 -0
onnx_diagnostic/tasks/__init__.py +2 -0
onnx_diagnostic/tasks/automatic_speech_recognition.py +6 -2
onnx_diagnostic/tasks/feature_extraction.py +7 -3
onnx_diagnostic/tasks/fill_mask.py +6 -2
onnx_diagnostic/tasks/image_classification.py +6 -2
onnx_diagnostic/tasks/image_text_to_text.py +289 -62
onnx_diagnostic/tasks/mask_generation.py +143 -0
onnx_diagnostic/tasks/mixture_of_expert.py +2 -2
onnx_diagnostic/tasks/object_detection.py +6 -2
onnx_diagnostic/tasks/sentence_similarity.py +6 -2
onnx_diagnostic/tasks/summarization.py +7 -2
onnx_diagnostic/tasks/text2text_generation.py +7 -2
onnx_diagnostic/tasks/text_classification.py +6 -2
onnx_diagnostic/tasks/text_generation.py +14 -16
onnx_diagnostic/torch_export_patches/onnx_export_errors.py +3 -3
onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +17 -1
onnx_diagnostic/torch_export_patches/patch_inputs.py +5 -2
onnx_diagnostic/torch_export_patches/patches/patch_torch.py +4 -4
onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +428 -129
onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +60 -41
onnx_diagnostic/torch_models/hghub/hub_data.py +5 -0
onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +288 -0
onnx_diagnostic/torch_models/validate.py +1 -0
{onnx_diagnostic-0.7.5.dist-info → onnx_diagnostic-0.7.7.dist-info}/METADATA +2 -2
{onnx_diagnostic-0.7.5.dist-info → onnx_diagnostic-0.7.7.dist-info}/RECORD +43 -42
{onnx_diagnostic-0.7.5.dist-info → onnx_diagnostic-0.7.7.dist-info}/WHEEL +0 -0
{onnx_diagnostic-0.7.5.dist-info → onnx_diagnostic-0.7.7.dist-info}/licenses/LICENSE.txt +0 -0
{onnx_diagnostic-0.7.5.dist-info → onnx_diagnostic-0.7.7.dist-info}/top_level.txt +0 -0

onnx_diagnostic/tasks/image_text_to_text.py CHANGED Viewed

@@ -1,7 +1,12 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
-from ..helpers.cache_helper import make_dynamic_cache
-from ..helpers.config_helper import update_config, check_hasattr, _pick
+from ..helpers.cache_helper import make_dynamic_cache, make_hybrid_cache
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    _pick,
+    default_num_hidden_layers as nhl,
+)
 __TASK__ = "image-text-to-text"
@@ -10,100 +15,285 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     """Reduces a model size."""
     kwargs: Dict[str, Any] = {}
     if hasattr(config, "num_hidden_layers"):
-        config.num_hidden_layers = min(config.num_hidden_layers, 2)
-    if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"):
-        config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2)
+        config.num_hidden_layers = min(config.num_hidden_layers, nhl())
+    if hasattr(config, "mm_tokens_per_image"):
+        config.mm_tokens_per_image = min(config.mm_tokens_per_image, 2)
+    if hasattr(config, "vision_config"):
+        if hasattr(config.vision_config, "num_hidden_layers"):
+            config.vision_config.num_hidden_layers = min(
+                config.vision_config.num_hidden_layers, 2
+            )
+        if hasattr(config.vision_config, "image_size"):
+            config.vision_config.image_size = min(config.vision_config.image_size, 96)
+        if hasattr(config.vision_config, "intermediate_size"):
+            config.vision_config.intermediate_size = min(
+                config.vision_config.intermediate_size, 1076
+            )
+        if hasattr(config.vision_config, "patch_size"):
+            config.vision_config.patch_size = min(config.vision_config.patch_size, 2)
+        if hasattr(config.vision_config, "hidden_size"):
+            config.vision_config.hidden_size = min(config.vision_config.hidden_size, 16)
+    if hasattr(config, "text_config"):
+        if hasattr(config.text_config, "intermediate_size"):
+            config.text_config.intermediate_size = min(
+                config.text_config.intermediate_size, 320
+            )
+        if hasattr(config.text_config, "hidden_size"):
+            config.text_config.hidden_size = min(config.text_config.hidden_size, 16)
+        if hasattr(config.text_config, "num_hidden_layers"):
+            config.text_config.num_hidden_layers = min(config.text_config.num_hidden_layers, 2)
+        if hasattr(config.text_config, "layer_types"):
+            config.text_config.layer_types = config.text_config.layer_types[
+                : config.text_config.num_hidden_layers
+            ]
+        if hasattr(config.text_config, "num_attention_heads"):
+            config.text_config.num_attention_heads = min(
+                config.text_config.num_attention_heads, 2
+            )
     update_config(config, kwargs)
     return kwargs
-def get_inputs(
+def _get_inputs_gemma3(
     model: torch.nn.Module,
     config: Optional[Any],
     dummy_max_token_id: int,
     num_key_value_heads: int,
     num_hidden_layers: int,
+    pad_token_id: int,
+    image_token_index: int,
     head_dim: int,
     width: int,
     height: int,
     num_channels: int,
     batch_size: int = 2,
-    sequence_length: int = 30,
-    sequence_length2: int = 3,
+    sequence_length: int = 43,
+    sequence_length2: int = 43,
     n_images: int = 2,
     dynamic_rope: bool = False,
-    add_second_input: int = 1,
+    max_sequence_length: int = 380,
     **kwargs,  # unused
 ):
     """
-    Generates input for task ``image-text-to-text``.
+    ::
-    :param model: model to get the missing information
-    :param config: configuration used to generate the model
-    :param head_dim: last dimension of the cache
-    :param dummy_max_token_id: dummy max token id
-    :param batch_size: batch size
-    :param sequence_length: sequence length
-    :param sequence_length2: new sequence length
-    :param n_images: number of images
-    :param width: width of the image
-    :param height: height of the image
-    :param num_channels: number of channels
-    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
-    :return: dictionary
+        dict(input_ids:T7s1x281,
+            pixel_values:T16s1x3x896x896,
+            attention_mask:dict(full_attention:T9s1x1x281x380,sliding_attention:T9s1x1x281x380),
+            position_ids:T7s1x281,
+            past_key_values:HybridCache(
+                key_cache=#34[T1s1x4x380x256,...],
+                value_cache=#34[T1s1x4x380x256,...]),
+            token_type_ids:T7s1x281,
+            cache_position:T7s281,
+            logits_to_keep:1)
+        dict(input_ids:T7s1x1,
+            pixel_values:None,
+            attention_mask:dict(full_attention:T9s1x1x1x380,sliding_attention:T9s1x1x1x380),
+            position_ids:T7s1x1,
+            past_key_values:HybridCache(
+                key_cache=#34[T1s1x4x380x256,...],
+                value_cache=#34[T1s1x4x380x256,...]),
+            token_type_ids:T7s1x1,
+            cache_position:T7s1,
+            logits_to_keep:1)
     """
     assert (
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
     batch = torch.export.Dim("batch", min=1, max=1024)
     seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
-    cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
-    images = "images"  # torch.export.Dim("images", min=1, max=4096)
+    # cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
     shapes = {
         "input_ids": {0: batch, 1: seq_length},
+        "token_type_ids": {0: batch, 1: seq_length},
         "attention_mask": {
-            0: batch,
-            1: "cache+seq",  # cache_length + seq_length
-        },
-        "position_ids": {
-            0: batch,
-            1: "cache+seq",  # cache_length + seq_length
+            "full_attention": {0: batch, 2: seq_length},
+            "sliding_attention": {0: batch, 2: seq_length},
         },
+        "position_ids": {0: batch, 1: seq_length},
+        "cache_position": {1: seq_length},
         "past_key_values": [
-            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
-            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+            [{0: batch} for _ in range(num_hidden_layers)],
+            [{0: batch} for _ in range(num_hidden_layers)],
         ],
-        "pixel_values": {0: batch, 1: images},
-        "image_attention_mask": {0: batch, 1: seq_length, 2: images},
+        "pixel_values": {0: batch},
+        "use_cache": None,
     }
+    input_ids = torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
+        torch.int64
+    )
+    input_ids[:, 1] = image_token_index
+    # input_ids[input_ids == image_token_index] = pad_token_id
+    token_type_ids = torch.zeros_like(input_ids)
+    token_type_ids[input_ids == image_token_index] = 1
     inputs = dict(
-        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
-            torch.int64
+        input_ids=input_ids,
+        token_type_ids=token_type_ids,
+        attention_mask=dict(
+            full_attention=torch.randn(batch_size, 1, sequence_length, max_sequence_length),
+            sliding_attention=torch.randn(batch_size, 1, sequence_length, max_sequence_length),
         ),
-        attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
-            torch.int64
-        ),
-        position_ids=torch.arange(sequence_length, sequence_length + sequence_length2)
-        .to(torch.int64)
-        .expand((batch_size, -1)),
-        past_key_values=make_dynamic_cache(
+        cache_position=torch.arange(0, sequence_length).to(torch.int64),
+        position_ids=torch.arange(0, sequence_length).to(torch.int64).expand((batch_size, -1)),
+        past_key_values=make_hybrid_cache(
             [
                 (
-                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
-                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                    torch.randn(
+                        batch_size, num_key_value_heads, max_sequence_length, head_dim
+                    ),
+                    torch.randn(
+                        batch_size, num_key_value_heads, max_sequence_length, head_dim
+                    ),
                 )
                 for i in range(num_hidden_layers)
             ]
         ),
-        pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to(
-            torch.int64
-        ),
+        pixel_values=torch.randn(n_images, num_channels, width, height).clamp(-1, 1),
         image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
             torch.int64
         ),
+        use_cache=True,  # Gemma3 does not set this value to true when a cache is provided
     )
-    res = dict(inputs=inputs, dynamic_shapes=shapes)
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    dummy_max_token_id: int,
+    num_key_value_heads: int,
+    num_hidden_layers: int,
+    pad_token_id: int,
+    image_token_index: int,
+    head_dim: int,
+    width: int,
+    height: int,
+    num_channels: int,
+    batch_size: int = 2,
+    sequence_length: int = 43,
+    sequence_length2: int = 43,
+    n_images: int = 2,
+    dynamic_rope: bool = False,
+    add_second_input: int = 1,
+    **kwargs,  # unused
+):
+    """
+    Generates input for task ``image-text-to-text``.
+    :param model: model to get the missing information
+    :param config: configuration used to generate the model
+    :param head_dim: last dimension of the cache
+    :param dummy_max_token_id: dummy max token id
+    :param pad_token_id: pad_token_id
+    :param image_token_index: image_token_index
+    :param batch_size: batch size
+    :param sequence_length: sequence length
+    :param sequence_length2: new sequence length
+    :param n_images: number of images
+    :param width: width of the image
+    :param height: height of the image
+    :param num_channels: number of channels
+    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
+    :return: dictionary
+    """
+    if model.__class__.__name__.startswith("Gemma3"):
+        res = _get_inputs_gemma3(
+            model,
+            config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_key_value_heads=num_key_value_heads,
+            num_hidden_layers=num_hidden_layers,
+            pad_token_id=pad_token_id,
+            image_token_index=image_token_index,
+            head_dim=head_dim,
+            width=width,
+            height=height,
+            num_channels=num_channels,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+            sequence_length2=sequence_length2,
+            n_images=n_images,
+            dynamic_rope=dynamic_rope,
+            **kwargs,
+        )
+    else:
+        assert (
+            "cls_cache" not in kwargs
+        ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+        batch = torch.export.Dim("batch", min=1, max=1024)
+        batch_img = torch.export.Dim("batch_img", min=1, max=1024)
+        seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
+        cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
+        images = "images"  # torch.export.Dim("images", min=1, max=4096)
+        shapes = {
+            "input_ids": {0: batch, 1: seq_length},
+            "token_type_ids": {0: batch, 1: seq_length},
+            "attention_mask": {0: batch, 1: "cache+seq"},
+            "position_ids": {0: batch, 1: "cache+seq"},
+            "past_key_values": [
+                [{0: batch} for _ in range(num_hidden_layers)],
+                [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+            ],
+            "pixel_values": (
+                {0: batch, 1: images}
+                if model.__class__.__name__ == "IdeficsForVisionText2Text"
+                else {0: batch_img}
+            ),
+            "image_attention_mask": {0: batch, 1: seq_length, 2: images},
+            "use_cache": None,
+        }
+        input_ids = torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
+            torch.int64
+        )
+        input_ids[0, 0] = image_token_index
+        input_ids[1, 1] = image_token_index
+        # input_ids[input_ids == image_token_index] = pad_token_id
+        token_type_ids = torch.zeros_like(input_ids)
+        token_type_ids[input_ids == image_token_index] = 1
+        inputs = dict(
+            input_ids=input_ids,
+            attention_mask=torch.cat(
+                [
+                    torch.ones((batch_size, sequence_length), dtype=torch.int64),
+                    input_ids.ne(pad_token_id).to(torch.int64),
+                ],
+                axis=-1,
+            ),
+            position_ids=torch.arange(0, sequence_length2)
+            .to(torch.int64)
+            .expand((batch_size, -1)),
+            past_key_values=make_dynamic_cache(
+                [
+                    (
+                        torch.randn(
+                            batch_size, num_key_value_heads, sequence_length, head_dim
+                        ),
+                        torch.randn(
+                            batch_size, num_key_value_heads, sequence_length, head_dim
+                        ),
+                    )
+                    for i in range(num_hidden_layers)
+                ]
+            ),
+            pixel_values=(
+                torch.randn((batch_size, n_images, num_channels, width, height)).clamp(-1, 1)
+                if model.__class__.__name__ == "IdeficsForVisionText2Text"
+                else torch.randn(n_images, num_channels, width, height).clamp(-1, 1)
+            ),
+            image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
+                torch.int64
+            ),
+            token_type_ids=token_type_ids,
+            use_cache=True,  # Gemma3 does not set this value to true when a cache is provided
+        )
+        res = dict(inputs=inputs, dynamic_shapes=shapes)
     if add_second_input:
         assert (
             add_second_input > 0
@@ -123,6 +313,8 @@ def get_inputs(
             sequence_length2=sequence_length2 + 1,
             n_images=n_images + 1,
             dynamic_rope=dynamic_rope,
+            pad_token_id=pad_token_id,
+            image_token_index=image_token_index,
             add_second_input=0,
             **kwargs,
         )["inputs"]
@@ -145,8 +337,9 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
                 ("num_key_value_heads", "num_attention_heads"),
                 "intermediate_size",
                 "hidden_size",
+                "pad_token_id",
             )
-            check_hasattr(config, "vision_config")
+            check_hasattr(config, "vision_config", ("image_token_index", "image_token_id"))
             text_config = True
         else:
             check_hasattr(
@@ -160,22 +353,28 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
                 "vision_config",
             )
             text_config = False
-        check_hasattr(config.vision_config, "image_size", "num_channels")
+        check_hasattr(config.vision_config, ("num_channels", "in_chans", "in_channels"))
     kwargs = dict(
         batch_size=2,
-        sequence_length=30,
-        sequence_length2=3,
+        sequence_length=43,
+        sequence_length2=43,
         head_dim=(
             16
             if config is None
             else getattr(
                 config,
                 "head_dim",
-                (config.text_config.hidden_size if text_config else config.hidden_size)
-                // (
-                    config.text_config.num_attention_heads
-                    if text_config
-                    else config.num_attention_heads
+                (
+                    config.text_config.head_dim
+                    if text_config and hasattr(config.text_config, "head_dim")
+                    else (
+                        (config.text_config.hidden_size if text_config else config.hidden_size)
+                        // (
+                            config.text_config.num_attention_heads
+                            if text_config
+                            else config.num_attention_heads
+                        )
+                    )
                 ),
             )
         ),
@@ -216,8 +415,36 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
             if config is None
             else (config.text_config.hidden_size if text_config else config.hidden_size)
         ),
-        width=224 if config is None else config.vision_config.image_size,
-        height=224 if config is None else config.vision_config.image_size,
-        num_channels=3 if config is None else config.vision_config.num_channels,
+        width=(
+            224
+            if config is None or not hasattr(config.vision_config, "image_size")
+            else config.vision_config.image_size
+        ),
+        height=(
+            224
+            if config is None or not hasattr(config.vision_config, "image_size")
+            else config.vision_config.image_size
+        ),
+        num_channels=(
+            3
+            if config is None
+            else _pick(config.vision_config, "num_channels", "in_chans", "in_channels")
+        ),
+        pad_token_id=(
+            0
+            if config is None
+            or not hasattr(config, "text_config")
+            or not hasattr(config.text_config, "pad_token_id")
+            else config.text_config.pad_token_id
+        ),
+        image_token_index=(
+            4
+            if config is None
+            or (
+                not hasattr(config, "image_token_index")
+                and not hasattr(config, "image_token_id")
+            )
+            else _pick(config, "image_token_index", "image_token_id")
+        ),
     )
     return kwargs, get_inputs

onnx_diagnostic/tasks/mask_generation.py ADDED Viewed

@@ -0,0 +1,143 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    default_num_hidden_layers as nhl,
+)
+__TASK__ = "mask-generation"
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    kwargs: Dict[str, Any] = {}
+    if hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = min(config.num_hidden_layers, nhl())
+    if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"):
+        config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2)
+    update_config(config, kwargs)
+    return kwargs
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    batch_size: int,
+    width: int,
+    height: int,
+    num_channels: int,
+    output_channels: int,
+    window_size: int,
+    add_second_input: bool = True,
+    **kwargs,  # unused
+):
+    """
+    Generates input for task ``mask-generation``.
+    :param model: model to get the missing information
+    :param config: configuration used to generate the model
+    :param batch_size: batch size
+    :param width: width of the image
+    :param height: height of the image
+    :param num_channels: number of channels in the image
+    :param output_channels: number of output channels
+    :param window_size: size of the window for the vision model
+    :return: dictionary with inputs and dynamic shapes
+    """
+    assert (
+        "cls_cache" not in kwargs
+    ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
+    # TODO(anyone): input_masks is weirdly failing all the time with mismatch channels
+    # with Conv or embedding_size. I guess maybe the model is too implicit on the
+    # input_masks shape.
+    # TODO(titaiwang): modeling code specifically requires the height and width of inputs
+    # should be the same as the config.vision_config.image_size. Does that make sense?
+    shapes = {
+        "pixel_values": {0: "batch"},  # 1: num_channels is static
+        "input_points": {0: "batch", 1: "point_batch_size", 2: "nb_points_per_image"},
+        "input_boxes": {0: "batch", 1: "point_batch_size"},
+        # "input_masks": {0: "batch", 2: "height", 3: "width"},
+    }
+    inputs = dict(
+        pixel_values=torch.randn(
+            (batch_size, num_channels, height, width), dtype=torch.float32
+        ).clamp(-1, 1),
+        input_points=torch.randn(
+            (batch_size, 2, 10, 2), dtype=torch.float32
+        ),  # 10 points per image
+        input_boxes=torch.randn((batch_size, 2, 4), dtype=torch.float32),  # 1 box per image
+        # input_masks=torch.randn(
+        #     (batch_size, 1, height, width), dtype=torch.float32
+        # ),  # mask for the image
+    )
+    res = dict(inputs=inputs, dynamic_shapes=shapes)
+    if add_second_input:
+        assert (
+            add_second_input > 0
+        ), f"Not implemented for add_second_input={add_second_input}."
+        res["inputs2"] = get_inputs(
+            model=model,
+            config=config,
+            batch_size=batch_size + 1,
+            width=width,
+            height=height,
+            num_channels=num_channels,
+            output_channels=output_channels,
+            window_size=window_size,
+            add_second_input=False,
+            **kwargs,
+        )["inputs"]
+    return res
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        # generates mask as outputs
+        if hasattr(config, "mask_decoder_config"):
+            check_hasattr(
+                config.mask_decoder_config,
+                "hidden_size",
+                "iou_head_hidden_dim",
+                "iou_head_depth",
+                "num_hidden_layers",
+                "num_multimask_outputs",
+            )
+        if hasattr(config, "prompt_encoder_config"):
+            check_hasattr(
+                config.prompt_encoder_config,
+                "hidden_size",
+                "image_embedding_size",
+                "image_size",
+                "mask_input_channels",
+            )
+        if hasattr(config, "vision_config"):
+            check_hasattr(
+                config.vision_config,
+                "image_size",
+                "hidden_size",
+                "intermediate_size",
+                "num_hidden_layers",
+                "output_channels",
+                "num_channels",
+                "window_size",
+            )
+    kwargs = dict(
+        batch_size=2,
+        width=1024 if config is None else config.vision_config.image_size,
+        height=1024 if config is None else config.vision_config.image_size,
+        num_channels=3 if config is None else config.vision_config.num_channels,
+        output_channels=256 if config is None else config.vision_config.output_channels,
+        window_size=14 if config is None else config.vision_config.window_size,
+    )
+    return kwargs, get_inputs

onnx_diagnostic/tasks/mixture_of_expert.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import torch
 # from ..helpers.cache_helper import make_dynamic_cache
-from ..helpers.config_helper import update_config  # , check_hasattr, _pick
+from ..helpers.config_helper import update_config, default_num_hidden_layers as nhl
 __TASK__ = "MoE"
@@ -11,7 +11,7 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     """Reduces a model size."""
     kwargs: Dict[str, Any] = {}
     if hasattr(config, "num_hidden_layers"):
-        config.num_hidden_layers = min(config.num_hidden_layers, 2)
+        config.num_hidden_layers = min(config.num_hidden_layers, nhl())
     if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"):
         config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2)
     if hasattr(config, "audio_processor") and hasattr(

onnx_diagnostic/tasks/object_detection.py CHANGED Viewed

@@ -1,6 +1,10 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
-from ..helpers.config_helper import update_config, check_hasattr
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    default_num_hidden_layers as nhl,
+)
 __TASK__ = "object-detection"
@@ -10,7 +14,7 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     check_hasattr(config, ("num_hidden_layers", "hidden_sizes"))
     kwargs = dict(
         num_hidden_layers=(
-            min(config.num_hidden_layers, 2)
+            min(config.num_hidden_layers, nhl())
             if hasattr(config, "num_hidden_layers")
             else len(config.hidden_sizes)
         )

onnx_diagnostic/tasks/sentence_similarity.py CHANGED Viewed

@@ -1,6 +1,10 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
-from ..helpers.config_helper import update_config, check_hasattr
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    default_num_hidden_layers as nhl,
+)
 __TASK__ = "sentence-similarity"
@@ -9,7 +13,7 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     """Reduces a model size."""
     check_hasattr(config, "num_attention_heads", "num_hidden_layers")
     kwargs = dict(
-        num_hidden_layers=min(config.num_hidden_layers, 2),
+        num_hidden_layers=min(config.num_hidden_layers, nhl()),
         num_attention_heads=min(config.num_attention_heads, 4),
     )
     update_config(config, kwargs)

onnx_diagnostic/tasks/summarization.py CHANGED Viewed

@@ -1,7 +1,12 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
 from ..helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
-from ..helpers.config_helper import update_config, check_hasattr, _pick
+from ..helpers.config_helper import (
+    update_config,
+    check_hasattr,
+    _pick,
+    default_num_hidden_layers as nhl,
+)
 __TASK__ = "summarization"
@@ -12,7 +17,7 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     if hasattr(config, "num_decoder_layers"):
         config.num_decoder_layers = min(config.num_decoder_layers, 2)
     if hasattr(config, "num_hidden_layers"):
-        config.num_hidden_layers = min(config.num_hidden_layers, 2)
+        config.num_hidden_layers = min(config.num_hidden_layers, nhl())
     update_config(config, kwargs)
     return kwargs

onnx-diagnostic 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl

onnx-diagnostic 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl