onnx-diagnostic 0.7.12__py3-none-any.whl → 0.7.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/export/dynamic_shapes.py +11 -2
- onnx_diagnostic/helpers/helper.py +11 -5
- onnx_diagnostic/helpers/mini_onnx_builder.py +17 -0
- onnx_diagnostic/helpers/model_builder_helper.py +1 -0
- onnx_diagnostic/helpers/rt_helper.py +2 -1
- onnx_diagnostic/helpers/torch_helper.py +31 -7
- onnx_diagnostic/reference/torch_evaluator.py +2 -2
- onnx_diagnostic/tasks/data/__init__.py +13 -0
- onnx_diagnostic/tasks/data/dummies_imagetext2text_generation_gemma3.onnx +0 -0
- onnx_diagnostic/tasks/image_text_to_text.py +256 -141
- onnx_diagnostic/tasks/text_generation.py +15 -0
- onnx_diagnostic/torch_export_patches/eval/__init__.py +177 -150
- onnx_diagnostic/torch_export_patches/eval/model_cases.py +19 -1
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +29 -14
- onnx_diagnostic/torch_export_patches/patch_inputs.py +10 -6
- onnx_diagnostic/torch_export_patches/patches/patch_torch.py +116 -10
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +269 -4
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +36 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +31 -3
- onnx_diagnostic/torch_models/validate.py +114 -36
- onnx_diagnostic/torch_onnx/sbs.py +2 -1
- {onnx_diagnostic-0.7.12.dist-info → onnx_diagnostic-0.7.13.dist-info}/METADATA +11 -31
- {onnx_diagnostic-0.7.12.dist-info → onnx_diagnostic-0.7.13.dist-info}/RECORD +27 -25
- {onnx_diagnostic-0.7.12.dist-info → onnx_diagnostic-0.7.13.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.12.dist-info → onnx_diagnostic-0.7.13.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.12.dist-info → onnx_diagnostic-0.7.13.dist-info}/top_level.txt +0 -0
onnx_diagnostic/tasks/image_text_to_text.py

@@ -7,6 +7,7 @@ from ..helpers.config_helper import (
     _pick,
     default_num_hidden_layers as nhl,
 )
+from .data import get_data
 
 __TASK__ = "image-text-to-text"
 
@@ -14,6 +15,27 @@ __TASK__ = "image-text-to-text"
 def reduce_model_config(config: Any) -> Dict[str, Any]:
     """Reduces a model size."""
     kwargs: Dict[str, Any] = {}
+    if (
+        hasattr(config, "architectures")
+        and config.architectures
+        and config.architectures[0] == "Gemma3ForConditionalGeneration"
+    ):
+        if hasattr(config, "vision_config"):
+            if hasattr(config.vision_config, "num_hidden_layers"):
+                config.vision_config.num_hidden_layers = min(
+                    config.vision_config.num_hidden_layers, nhl()
+                )
+        if hasattr(config, "text_config"):
+            if hasattr(config.text_config, "intermediate_size"):
+                config.text_config.intermediate_size = min(
+                    config.text_config.intermediate_size, 10240 // 10 * 5 // 2
+                )
+                config.text_config.hidden_size = min(
+                    config.text_config.hidden_size, 2560 // 10 * 5 // 2
+                )
+        update_config(config, kwargs)
+        return kwargs
+
     if hasattr(config, "num_hidden_layers"):
         config.num_hidden_layers = min(config.num_hidden_layers, nhl())
     if hasattr(config, "mm_tokens_per_image"):
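
The new branch in `reduce_model_config` shrinks the nested Gemma3 sub-configs instead of only the flat attributes: the vision tower keeps at most `nhl()` layers, and the text tower is capped at `10240 // 10 * 5 // 2 = 2560` for `intermediate_size` and `2560 // 10 * 5 // 2 = 640` for `hidden_size`. A minimal sketch of that arithmetic on a stand-in config (not transformers' real `Gemma3Config`, with an assumed `nhl()` of 2):

    # Stand-in objects only; the real entry point is
    # onnx_diagnostic.tasks.image_text_to_text.reduce_model_config.
    from types import SimpleNamespace

    def nhl() -> int:
        return 2  # assumed default number of hidden layers

    cfg = SimpleNamespace(
        architectures=["Gemma3ForConditionalGeneration"],
        vision_config=SimpleNamespace(num_hidden_layers=27),
        text_config=SimpleNamespace(intermediate_size=10240, hidden_size=2560),
    )

    # same caps as the added branch
    cfg.vision_config.num_hidden_layers = min(cfg.vision_config.num_hidden_layers, nhl())
    cfg.text_config.intermediate_size = min(cfg.text_config.intermediate_size, 10240 // 10 * 5 // 2)
    cfg.text_config.hidden_size = min(cfg.text_config.hidden_size, 2560 // 10 * 5 // 2)

    print(cfg.vision_config.num_hidden_layers)  # 2
    print(cfg.text_config.intermediate_size)    # 2560
    print(cfg.text_config.hidden_size)          # 640
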
@@ -72,54 +94,63 @@ def _get_inputs_gemma3(
     width: int,
     height: int,
     num_channels: int,
-    batch_size: int =
-    sequence_length: int =
-
-
-
-    max_sequence_length: int = 380,
+    batch_size: Optional[int] = 1,
+    sequence_length: Optional[int] = 281,
+    n_images: Optional[int] = 1,
+    max_sequence_length: Optional[int] = 580,
+    total_sequence_length: Optional[int] = 860,
     **kwargs,  # unused
 ):
     """
+    The functions uses predefined values for input_ids and token_type_ids.
+
+    **google/gemma-3-4b-it**
+
+    iteration 1
+
     ::
+
+        cache_position:T7s281,
+        input_ids:T7s1x281,
+        token_type_ids:T7s1x281,
+        attention_mask:dict(sliding_attention:T9s1x1x281x580,
+        full_attention:T9s1x1x281x580),
+        pixel_values:T16s1x3x896x896,
 
-
-
-
-
-
-
-
-
-
-
-
-
-        attention_mask:dict(full_attention:T9s1x1x1x380,sliding_attention:T9s1x1x1x380),
-        position_ids:T7s1x1,
-        past_key_values:HybridCache(
-            key_cache=#34[T1s1x4x380x256,...],
-            value_cache=#34[T1s1x4x380x256,...]),
-        token_type_ids:T7s1x1,
-        cache_position:T7s1,
-        logits_to_keep:1)
+    iteration 2
+
+    ::
+
+        cache_position:T7s1,
+        past_key_values:StaticCache(key_cache=#34[T1s1x4x580x256,...],
+        value_cache=#34[T1s1x4x580x256,...]),
+        input_ids:T7s1x1,
+        inputs_embeds:None,
+        token_type_ids:T7s1x1,
+        attention_mask:dict(sliding_attention:T9s1x1x1x580,full_attention:T9s1x1x1x580),
+        position_ids:None,
     """
+    batch_size = 1 if batch_size is None else batch_size
+    sequence_length = 281 if sequence_length is None else sequence_length
+    n_images = 1 if n_images is None else n_images
+    max_sequence_length = 580 if max_sequence_length is None else max_sequence_length
+    total_sequence_length = 860 if total_sequence_length is None else total_sequence_length
+
     assert (
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
     batch = "batch"
-    seq_length = "seq_length"
-
+    seq_length = "seq_length"
+    tot_length = "total_length"
 
     shapes = {
         "input_ids": {0: batch, 1: seq_length},
         "token_type_ids": {0: batch, 1: seq_length},
         "attention_mask": {
-            "full_attention": {0: batch, 2: seq_length},
-            "sliding_attention": {0: batch, 2: seq_length},
+            "full_attention": {0: batch, 2: seq_length, 3: tot_length},
+            "sliding_attention": {0: batch, 2: seq_length, 3: tot_length},
         },
         "position_ids": {0: batch, 1: seq_length},
-        "cache_position": {
+        "cache_position": {0: seq_length},
         "past_key_values": [
             [{0: batch} for _ in range(num_hidden_layers)],
             [{0: batch} for _ in range(num_hidden_layers)],
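
The dynamic-shapes mapping now distinguishes the query length (`seq_length`) from the mask's last axis (`tot_length`), matching the iteration-1 trace where `input_ids` is `1x281` while both masks are `1x1x281x580`. A small, self-contained check of that layout (illustration only, not package code):

    import torch

    batch, seq_length, tot_length = 1, 281, 580
    attention_mask = {
        "full_attention": torch.randn(batch, 1, seq_length, tot_length),
        "sliding_attention": torch.randn(batch, 1, seq_length, tot_length),
    }
    # mirrors the updated entries of the `shapes` dictionary above
    dynamic = {
        "full_attention": {0: "batch", 2: "seq_length", 3: "total_length"},
        "sliding_attention": {0: "batch", 2: "seq_length", 3: "total_length"},
    }
    for name, spec in dynamic.items():
        assert max(spec) < attention_mask[name].ndim  # every dynamic axis exists
    print({k: tuple(v.shape) for k, v in attention_mask.items()})
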
@@ -128,23 +159,55 @@ def _get_inputs_gemma3(
         "use_cache": None,
     }
 
-
-
-    )
-
-
-
-
+    # retrieve specific inputs to keep the consistency between
+    # ids and images
+    dummies = get_data("dummies_imagetext2text_generation_gemma3.onnx")
+    dummies = dummies[("", 0, "I")][1]
+    dummies = {k: v for k, v in dummies.items() if k in shapes}
+    expected = {"input_ids", "token_type_ids", "position_ids", "cache_position"}
+
+    def _check_():
+        assert expected & set(
+            dummies
+        ), f"Unable to find expected inputs {expected} in loaded inputs {set(dummies)}"
+        assert sequence_length == dummies["input_ids"].shape[-1], (
+            f"sequence_length={sequence_length} != {dummies['input_ids'].shape[-1]} for "
+            f"model class {model.__class__.__name__}"
+        )
+        assert batch_size == dummies["input_ids"].shape[0], (
+            f"batch_size={batch_size} != {dummies['input_ids'].shape[0]} for "
+            f"model class {model.__class__.__name__}"
+        )
+        assert max_sequence_length == 580, (
+            f"max_sequence_length={max_sequence_length} != 580 "
+            f"for model {model.__class__.__name__}"
+        )
+        assert total_sequence_length == 860, (
+            f"total_sequence_length={total_sequence_length} != 860 "
+            f"for model {model.__class__.__name__}"
+        )
+        assert (
+            head_dim == 256
+        ), f"head_dim={head_dim} != 256 for model {model.__class__.__name__}"
+        assert n_images == 1, f"n_images={n_images} != 1 for model {model.__class__.__name__}"
+        assert num_key_value_heads == 4, (
+            f"num_key_value_heads={num_key_value_heads} != 256 "
+            f"for this model {model.__class__.__name__}"
+        )
+
+    _check_()
 
     inputs = dict(
-        input_ids=input_ids,
-        token_type_ids=token_type_ids,
+        input_ids=dummies["input_ids"],
+        token_type_ids=dummies["token_type_ids"],
         attention_mask=dict(
-            full_attention=torch.randn(batch_size, 1, sequence_length,
-            sliding_attention=torch.randn(
+            full_attention=torch.randn(batch_size, 1, sequence_length, total_sequence_length),
+            sliding_attention=torch.randn(
+                batch_size, 1, sequence_length, total_sequence_length
+            ),
         ),
-        cache_position=torch.arange(0, sequence_length).to(torch.int64),
         position_ids=torch.arange(0, sequence_length).to(torch.int64).expand((batch_size, -1)),
+        cache_position=torch.arange(0, sequence_length).to(torch.int64),
        past_key_values=make_hybrid_cache(
             [
                 (
|
|
|
159
222
|
]
|
|
160
223
|
),
|
|
161
224
|
pixel_values=torch.randn(n_images, num_channels, width, height).clamp(-1, 1),
|
|
162
|
-
image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
|
|
225
|
+
# image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
|
|
226
|
+
# torch.int64
|
|
227
|
+
# ),
|
|
228
|
+
use_cache=True, # Gemma3 does not set this value to true when a cache is provided
|
|
229
|
+
)
|
|
230
|
+
return dict(inputs=inputs, dynamic_shapes=shapes)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def get_inputs_default(
|
|
234
|
+
model: torch.nn.Module,
|
|
235
|
+
config: Optional[Any],
|
|
236
|
+
dummy_max_token_id: int,
|
|
237
|
+
num_key_value_heads: int,
|
|
238
|
+
num_hidden_layers: int,
|
|
239
|
+
pad_token_id: int,
|
|
240
|
+
image_token_index: int,
|
|
241
|
+
head_dim: int,
|
|
242
|
+
width: int,
|
|
243
|
+
height: int,
|
|
244
|
+
num_channels: int,
|
|
245
|
+
batch_size: Optional[int] = 2,
|
|
246
|
+
sequence_length: Optional[int] = 43,
|
|
247
|
+
n_images: Optional[int] = 2,
|
|
248
|
+
max_sequence_length: Optional[int] = 43,
|
|
249
|
+
total_sequence_length: Optional[int] = 43,
|
|
250
|
+
add_second_input: int = 0,
|
|
251
|
+
**kwargs, # unused
|
|
252
|
+
):
|
|
253
|
+
batch_size = 2 if batch_size is None else batch_size
|
|
254
|
+
sequence_length = 43 if sequence_length is None else sequence_length
|
|
255
|
+
n_images = 2 if n_images is None else n_images
|
|
256
|
+
max_sequence_length = 43 if max_sequence_length is None else max_sequence_length
|
|
257
|
+
total_sequence_length = 43 if total_sequence_length is None else total_sequence_length
|
|
258
|
+
|
|
259
|
+
assert batch_size > 0, "batch_size cannot be null"
|
|
260
|
+
assert (
|
|
261
|
+
"cls_cache" not in kwargs
|
|
262
|
+
), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
|
|
263
|
+
batch = "batch"
|
|
264
|
+
batch_img = torch.export.Dim("batch_img", min=1, max=1024)
|
|
265
|
+
seq_length = "seq_length" # torch.export.Dim("seq_length", min=1, max=4096)
|
|
266
|
+
cache_length = "cache_length" # torch.export.Dim("cache_length", min=1, max=4096)
|
|
267
|
+
images = "images" # torch.export.Dim("images", min=1, max=4096)
|
|
268
|
+
|
|
269
|
+
shapes = {
|
|
270
|
+
"input_ids": {0: batch, 1: seq_length},
|
|
271
|
+
"token_type_ids": {0: batch, 1: seq_length},
|
|
272
|
+
"attention_mask": {0: batch, 1: "cache+seq"},
|
|
273
|
+
"position_ids": {0: batch, 1: "cache+seq"},
|
|
274
|
+
"past_key_values": [
|
|
275
|
+
[{0: batch} for _ in range(num_hidden_layers)],
|
|
276
|
+
[{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
|
|
277
|
+
],
|
|
278
|
+
"pixel_values": (
|
|
279
|
+
{0: batch, 1: images}
|
|
280
|
+
if model.__class__.__name__ == "IdeficsForVisionText2Text"
|
|
281
|
+
else {0: batch_img}
|
|
282
|
+
),
|
|
283
|
+
"image_attention_mask": {0: batch, 1: seq_length, 2: images},
|
|
284
|
+
"image_grid_thw": {0: batch},
|
|
285
|
+
"use_cache": None,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
input_ids = torch.randint(0, dummy_max_token_id, (batch_size, total_sequence_length)).to(
|
|
289
|
+
torch.int64
|
|
290
|
+
)
|
|
291
|
+
if total_sequence_length > 0:
|
|
292
|
+
input_ids[0, 0] = image_token_index
|
|
293
|
+
if min(input_ids.shape) > 1:
|
|
294
|
+
input_ids[1, 1] = image_token_index
|
|
295
|
+
# input_ids[input_ids == image_token_index] = pad_token_id
|
|
296
|
+
token_type_ids = torch.zeros_like(input_ids)
|
|
297
|
+
token_type_ids[input_ids == image_token_index] = 1
|
|
298
|
+
image_grid_thw = torch.zeros((n_images, 3), dtype=torch.int64)
|
|
299
|
+
if n_images > 0:
|
|
300
|
+
image_grid_thw[:, 1] = height
|
|
301
|
+
image_grid_thw[:, 2] = width
|
|
302
|
+
image_grid_thw[0, :] //= 2
|
|
303
|
+
image_grid_thw[:, 0] = torch.arange(n_images, dtype=image_grid_thw.dtype)
|
|
304
|
+
|
|
305
|
+
inputs = dict(
|
|
306
|
+
input_ids=input_ids,
|
|
307
|
+
token_type_ids=token_type_ids,
|
|
308
|
+
attention_mask=torch.cat(
|
|
309
|
+
[
|
|
310
|
+
torch.ones((batch_size, sequence_length), dtype=torch.int64),
|
|
311
|
+
input_ids.ne(pad_token_id).to(torch.int64),
|
|
312
|
+
],
|
|
313
|
+
axis=-1,
|
|
314
|
+
),
|
|
315
|
+
position_ids=torch.arange(0, total_sequence_length)
|
|
316
|
+
.to(torch.int64)
|
|
317
|
+
.expand((batch_size, -1)),
|
|
318
|
+
past_key_values=make_dynamic_cache(
|
|
319
|
+
[
|
|
320
|
+
(
|
|
321
|
+
torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
|
|
322
|
+
torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
|
|
323
|
+
)
|
|
324
|
+
for i in range(num_hidden_layers)
|
|
325
|
+
]
|
|
326
|
+
),
|
|
327
|
+
pixel_values=(
|
|
328
|
+
torch.randn((batch_size, n_images, num_channels, width, height)).clamp(-1, 1)
|
|
329
|
+
if model.__class__.__name__ == "IdeficsForVisionText2Text"
|
|
330
|
+
else torch.randn(n_images, num_channels, width, height).clamp(-1, 1)
|
|
331
|
+
),
|
|
332
|
+
image_attention_mask=torch.ones((batch_size, total_sequence_length, n_images)).to(
|
|
163
333
|
torch.int64
|
|
164
334
|
),
|
|
335
|
+
image_grid_thw=image_grid_thw,
|
|
165
336
|
use_cache=True, # Gemma3 does not set this value to true when a cache is provided
|
|
166
337
|
)
|
|
167
|
-
|
|
338
|
+
res = dict(inputs=inputs, dynamic_shapes=shapes)
|
|
339
|
+
return res
|
|
168
340
|
|
|
169
341
|
|
|
170
342
|
def get_inputs(
|
|
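
`get_inputs_default` keeps the previous behaviour for non-Gemma3 models: per-layer key/value tensors of shape `(batch_size, num_key_value_heads, sequence_length, head_dim)` wrapped by `make_dynamic_cache`. A sketch of the same layout built directly with transformers' `DynamicCache` (an assumption for illustration; the helper's exact return type is not shown in this diff):

    import torch
    from transformers.cache_utils import DynamicCache

    batch_size, num_key_value_heads, sequence_length, head_dim = 2, 8, 43, 64
    num_hidden_layers = 2

    cache = DynamicCache()
    for layer_idx in range(num_hidden_layers):
        key = torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim)
        value = torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim)
        cache.update(key, value, layer_idx)

    print(cache.get_seq_length())  # 43
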
@@ -179,12 +351,12 @@ def get_inputs(
     width: int,
     height: int,
     num_channels: int,
-    batch_size: int =
-    sequence_length: int =
-
-
-
-    add_second_input: int =
+    batch_size: Optional[int] = None,
+    sequence_length: Optional[int] = None,
+    n_images: Optional[int] = None,
+    max_sequence_length: Optional[int] = None,
+    total_sequence_length: Optional[int] = None,
+    add_second_input: int = 0,
     **kwargs,  # unused
 ):
     """
@@ -198,13 +370,19 @@ def get_inputs(
     :param image_token_index: image_token_index
     :param batch_size: batch size
     :param sequence_length: sequence length
-    :param
+    :param max_sequence_length: for the cache
+    :param total_sequence_length: for the mask
     :param n_images: number of images
     :param width: width of the image
     :param height: height of the image
     :param num_channels: number of channels
-    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
     :return: dictionary
+
+    .. note::
+
+        The content of the input_ids and its shape is correlated to the images.
+        The function uses a predefined values. The function raises an exception
+        if dimension are not the expected ones.
     """
     if model.__class__.__name__.startswith("Gemma3"):
         res = _get_inputs_gemma3(
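
The note above is the contract behind both input builders: image-token positions in `input_ids` drive `token_type_ids`, so the ids cannot be fully random once images are involved. A runnable illustration of that coupling (not package code, with arbitrary example values):

    import torch

    image_token_index, vocab_size = 99, 1000  # arbitrary values for the example
    input_ids = torch.randint(1, vocab_size, (2, 7), dtype=torch.int64)
    input_ids[0, 0] = image_token_index
    input_ids[1, 1] = image_token_index

    # same rule as in get_inputs_default: 1 where an image token sits, 0 elsewhere
    token_type_ids = torch.zeros_like(input_ids)
    token_type_ids[input_ids == image_token_index] = 1
    print(token_type_ids)
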
@@ -221,92 +399,32 @@ def get_inputs(
             num_channels=num_channels,
             batch_size=batch_size,
             sequence_length=sequence_length,
-
+            max_sequence_length=max_sequence_length,
+            total_sequence_length=total_sequence_length,
             n_images=n_images,
-            dynamic_rope=dynamic_rope,
             **kwargs,
         )
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            "pixel_values": (
-                {0: batch, 1: images}
-                if model.__class__.__name__ == "IdeficsForVisionText2Text"
-                else {0: batch_img}
-            ),
-            "image_attention_mask": {0: batch, 1: seq_length, 2: images},
-            "image_grid_thw": {0: batch},
-            "use_cache": None,
-        }
-
-        input_ids = torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
-            torch.int64
+        res = get_inputs_default(
+            model,
+            config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_key_value_heads=num_key_value_heads,
+            num_hidden_layers=num_hidden_layers,
+            pad_token_id=pad_token_id,
+            image_token_index=image_token_index,
+            head_dim=head_dim,
+            width=width,
+            height=height,
+            num_channels=num_channels,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+            max_sequence_length=max_sequence_length,
+            total_sequence_length=total_sequence_length,
+            n_images=n_images,
+            **kwargs,
         )
-        input_ids[0, 0] = image_token_index
-        input_ids[1, 1] = image_token_index
-        # input_ids[input_ids == image_token_index] = pad_token_id
-        token_type_ids = torch.zeros_like(input_ids)
-        token_type_ids[input_ids == image_token_index] = 1
-        image_grid_thw = torch.zeros((n_images, 3), dtype=torch.int64)
-        image_grid_thw[:, 1] = height
-        image_grid_thw[:, 2] = width
-        image_grid_thw[0, :] //= 2
-        image_grid_thw[:, 0] = torch.arange(n_images, dtype=image_grid_thw.dtype)
 
-        inputs = dict(
-            input_ids=input_ids,
-            attention_mask=torch.cat(
-                [
-                    torch.ones((batch_size, sequence_length), dtype=torch.int64),
-                    input_ids.ne(pad_token_id).to(torch.int64),
-                ],
-                axis=-1,
-            ),
-            position_ids=torch.arange(0, sequence_length2)
-            .to(torch.int64)
-            .expand((batch_size, -1)),
-            past_key_values=make_dynamic_cache(
-                [
-                    (
-                        torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
-                        ),
-                        torch.randn(
-                            batch_size, num_key_value_heads, sequence_length, head_dim
-                        ),
-                    )
-                    for i in range(num_hidden_layers)
-                ]
-            ),
-            pixel_values=(
-                torch.randn((batch_size, n_images, num_channels, width, height)).clamp(-1, 1)
-                if model.__class__.__name__ == "IdeficsForVisionText2Text"
-                else torch.randn(n_images, num_channels, width, height).clamp(-1, 1)
-            ),
-            image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
-                torch.int64
-            ),
-            token_type_ids=token_type_ids,
-            image_grid_thw=image_grid_thw,
-            use_cache=True,  # Gemma3 does not set this value to true when a cache is provided
-        )
-        res = dict(inputs=inputs, dynamic_shapes=shapes)
     if add_second_input:
         assert (
             add_second_input > 0
@@ -321,11 +439,11 @@ def get_inputs(
             width=width,
             height=height,
             num_channels=num_channels,
-            batch_size=
-            sequence_length=
-
-
-
+            batch_size=3,
+            sequence_length=1,
+            max_sequence_length=1,
+            total_sequence_length=1,
+            n_images=0,
             pad_token_id=pad_token_id,
             image_token_index=image_token_index,
             add_second_input=0,
@@ -368,9 +486,6 @@ def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
         text_config = False
         check_hasattr(config.vision_config, ("num_channels", "in_chans", "in_channels"))
     kwargs = dict(
-        batch_size=2,
-        sequence_length=43,
-        sequence_length2=43,
         head_dim=(
             16
             if config is None
onnx_diagnostic/tasks/text_generation.py

@@ -269,6 +269,21 @@ def get_inputs(
         add_second_input=0,
         **kwargs,
     )["inputs"]
+    res["inputs_empty_cache"] = get_inputs(
+        model=model,
+        config=config,
+        dummy_max_token_id=dummy_max_token_id,
+        num_hidden_layers=num_hidden_layers,
+        batch_size=batch_size,
+        sequence_length=0,
+        sequence_length2=sequence_length2,
+        dynamic_rope=dynamic_rope,
+        num_key_value_heads=num_key_value_heads,
+        head_dim=head_dim,
+        cls_cache=cls_cache,
+        add_second_input=0,
+        **kwargs,
+    )["inputs"]
     return res
 
 
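
The added `inputs_empty_cache` entry reuses `get_inputs` with `sequence_length=0`, i.e. the same inputs but with a past key/value cache whose sequence axis is empty. A short sketch of what such a cache layout amounts to (illustration only, not package code):

    import torch

    batch_size, num_key_value_heads, head_dim, num_hidden_layers = 2, 8, 64, 2
    empty_cache_layers = [
        (
            torch.zeros(batch_size, num_key_value_heads, 0, head_dim),
            torch.zeros(batch_size, num_key_value_heads, 0, head_dim),
        )
        for _ in range(num_hidden_layers)
    ]
    print(empty_cache_layers[0][0].shape)  # torch.Size([2, 8, 0, 64])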