onnx-diagnostic 0.7.16__py3-none-any.whl → 0.8.1__py3-none-any.whl
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +78 -22
- onnx_diagnostic/export/api.py +124 -0
- onnx_diagnostic/export/dynamic_shapes.py +2 -1
- onnx_diagnostic/export/shape_helper.py +47 -70
- onnx_diagnostic/ext_test_case.py +11 -0
- onnx_diagnostic/helpers/cache_helper.py +38 -7
- onnx_diagnostic/helpers/fake_tensor_helper.py +224 -104
- onnx_diagnostic/helpers/helper.py +27 -33
- onnx_diagnostic/helpers/log_helper.py +109 -5
- onnx_diagnostic/helpers/memory_peak.py +2 -0
- onnx_diagnostic/helpers/mini_onnx_builder.py +1 -1
- onnx_diagnostic/helpers/model_builder_helper.py +132 -2
- onnx_diagnostic/helpers/onnx_helper.py +1 -1
- onnx_diagnostic/helpers/ort_session.py +4 -0
- onnx_diagnostic/helpers/rt_helper.py +393 -43
- onnx_diagnostic/helpers/torch_helper.py +20 -1
- onnx_diagnostic/tasks/__init__.py +7 -0
- onnx_diagnostic/tasks/automatic_speech_recognition.py +2 -8
- onnx_diagnostic/tasks/feature_extraction.py +2 -8
- onnx_diagnostic/tasks/image_text_to_text.py +10 -8
- onnx_diagnostic/tasks/summarization.py +2 -8
- onnx_diagnostic/tasks/text2text_generation.py +3 -8
- onnx_diagnostic/tasks/text_generation.py +86 -65
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +718 -438
- onnx_diagnostic/torch_export_patches/patch_details.py +340 -0
- onnx_diagnostic/torch_export_patches/patch_inputs.py +1 -1
- onnx_diagnostic/torch_export_patches/patch_module.py +9 -36
- onnx_diagnostic/torch_export_patches/patches/patch_torch.py +12 -6
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +162 -24
- onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +140 -104
- onnx_diagnostic/torch_models/untrained/llm_phi2.py +1 -4
- onnx_diagnostic/torch_models/validate.py +626 -228
- {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/RECORD +38 -36
- {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.16.dist-info → onnx_diagnostic-0.8.1.dist-info}/top_level.txt +0 -0

@@ -1,18 +1,18 @@
-
+import json
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import onnx
 import torch
-from .helper import string_type, flatten_object
+from .helper import string_type, flatten_object, max_diff
+from .torch_helper import torch_deepcopy
+from .ort_session import InferenceSessionForTorch
 
 
 def name_type_to_onnx_dtype(name: str) -> int:
-
-
-
-        return onnx.TensorProto.FLOAT
-    if name == "tensor(float16)":
-        return onnx.TensorProto.FLOAT16
-    raise AssertionError(f"Unexpected value {name!r}")
+    assert name.startswith("tensor(") and name.endswith(")"), f"Invalid value name={name!r}"
+    look = name[7:-1]
+    return getattr(onnx.TensorProto, look.upper())
 
 
 def make_feeds(
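
The hunk above replaces the hard-coded float/float16 mapping in name_type_to_onnx_dtype with a generic lookup on onnx.TensorProto (these first hunks appear to come from onnx_diagnostic/helpers/rt_helper.py, the module the docstrings below reference). A minimal standalone sketch of the same lookup, with example values:

    import onnx

    def name_type_to_onnx_dtype(name: str) -> int:
        # "tensor(bfloat16)" -> "BFLOAT16" -> onnx.TensorProto.BFLOAT16
        assert name.startswith("tensor(") and name.endswith(")"), f"Invalid value name={name!r}"
        return getattr(onnx.TensorProto, name[7:-1].upper())

    print(name_type_to_onnx_dtype("tensor(float)"))     # 1, onnx.TensorProto.FLOAT
    print(name_type_to_onnx_dtype("tensor(bfloat16)"))  # 16, onnx.TensorProto.BFLOAT16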

@@ -95,49 +95,399 @@ def make_feeds(
         elif isinstance(i, float):
             i = np.array(i, dtype=np.float32)
         new_flat.append(i)
-
-    # NOTE: model builder has a different order for past_key_values
-    # we need to reorder them to match the expected order
-    if is_modelbuilder:
-        # We assume that if "past_key_values" is in the names when it's
-        # modelbuilder
-        non_past_kv_input_names = [n for n in names if "past_key_values" not in n]
-        past_kv_names = [n for n in names if "past_key_values" in n]
-        reorder_past_kv_names = reorder_modelbuilder_cache_to_torch(past_kv_names)
-        names = non_past_kv_input_names + reorder_past_kv_names
     return dict(zip(names, new_flat))
 
 
-def
+def _get_dim(i: int, s: Union[str, int], batch: int = 1) -> int:
+    if isinstance(s, int):
+        return s
+    if s == "batch":
+        return batch
+    # Everything else is cache length or sequence length.
+    return 0
+
+
+_DTYPES = {
+    "tensor(float)": torch.float32,
+    "tensor(float16)": torch.float16,
+    "tensor(bfloat16)": torch.bfloat16,
+    "tensor(int64)": torch.int64,
+    "tensor(int32)": torch.int32,
+}
+
+
+def rt_type_to_torch_dtype(typename: str) -> torch.dtype:
+    """Converts a string such as ``tensor(float)`` into a dtype (torch.float32)."""
+    return _DTYPES[typename]
+
+
+def make_empty_cache(
+    batch: int,
+    onnx_input_names: List[str],
+    onnx_input_shapes: List[Tuple[Union[int, str], ...]],
+    onnx_input_types: List[str],
+) -> Dict[str, torch.Tensor]:
+    """
+    Creates an empty cache. Example:
+
+    .. code-block:: python
+
+        make_empty_cache(
+            1,
+            sess.input_names[2:],
+            [i.shape for i in sess.get_inputs()[2:]],
+            [i.type for i in sess.get_inputs()[2:]],
+        )
+    """
+    feeds = {}
+    for name, shape, dtype in zip(onnx_input_names, onnx_input_shapes, onnx_input_types):
+        new_shape = tuple(_get_dim(i, s, batch=batch) for i, s in enumerate(shape))
+        feeds[name] = torch.empty(new_shape, dtype=rt_type_to_torch_dtype(dtype))
+    return feeds
+
+
+def generate_and_validate(
+    model,
+    input_ids: torch.Tensor,
+    eos_token_id: int = 2,
+    max_new_tokens: int = 100,
+    session: Optional[Union[InferenceSessionForTorch, onnx.ModelProto, str]] = None,
+    atol: float = 0.1,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, List[Dict]]]:
+    """
+    Implements a simple method ``generate`` for a torch model.
+    The function does not expect any ``position_ids`` as input.
+    The function also checks the outputs coming from an onnx model
+    are close to the output the torch model produces.
+
+    :param model_or_path: model or loaded model
+    :param input_ids: input tokens
+    :param eos_token_ids: token representing the end of an answer
+    :param max_new_tokens: stops after this number of generated tokens
+    :param session: the onnx model
+    :return: input tokens concatenated with new tokens,
+        if session is not null, it also returns the maximum differences
+        at every iterations
+
+    See example given with function :func:`onnx_generate
+    <onnx_diagnostic.helpers.rt_helper.onnx_generate>`.
+    """
+    if session is not None:
+        if not isinstance(session, InferenceSessionForTorch):
+            providers = ["CUDAExecutionProvider"] if input_ids.is_cuda else []
+            providers.append("CPUExecutionProvider")
+            session = InferenceSessionForTorch(session, providers=providers)
+
+    # First call: prefill
+    attention_mask = torch.ones(
+        input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+    )
+    if session:
+        feeds = {
+            **dict(zip(session.input_names[:2], [input_ids, attention_mask])),
+            **make_empty_cache(
+                input_ids.shape[0],
+                session.input_names[2:],
+                session.input_shapes[2:],
+                session.input_types[2:],
+            ),
+        }
+        onnx_results = session.run(None, feeds)
+
+    outputs = model(input_ids, use_cache=True, attention_mask=attention_mask)
+
+    if session:
+        diff = max_diff(outputs, onnx_results)
+        assert isinstance(diff["abs"], float) and diff["abs"] <= atol, (
+            f"Unexpected issue with {type(model)}\ndiff={diff}"
+            f"\ninput_ids.shape={input_ids.shape}"
+            f"\nexpected={string_type(outputs, with_shape=True, with_min_max=True)}"
+            f"\n got=\n"
+            f"{string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+            f"feeds={string_type(feeds, with_shape=True, with_min_max=True)}"
+        )
+        diffs = [diff]
+
+    # Next calls: decode
+    for iteration in range(max_new_tokens):
+        next_token_logits = outputs.logits[:, -1, :]
+        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+        if next_token_id.item() == eos_token_id:
+            break
+        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
+        attention_mask = torch.ones(
+            input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+        )
+        if session:
+            feeds = dict(
+                zip(
+                    session.input_names,
+                    [
+                        t.detach()
+                        for t in torch_deepcopy(
+                            flatten_object(
+                                [next_token_id, attention_mask, outputs.past_key_values]
+                            )
+                        )
+                    ],
+                )
+            )
+            onnx_results = session.run(None, feeds)
+        outputs = model(
+            next_token_id,
+            use_cache=True,
+            past_key_values=outputs.past_key_values,
+            attention_mask=attention_mask,
+        )
+        if session:
+            diff = max_diff(outputs, onnx_results)
+            assert isinstance(diff["abs"], float) and diff["abs"] <= atol, (
+                f"Unexpected issue with {type(model)}, iteration={iteration}"
+                f"\ndiff={diff}\ninput_ids.shape={input_ids.shape}"
+                f"\nexpected={string_type(outputs, with_shape=True, with_min_max=True)}"
+                f"\n got=\n"
+                f"{string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+                f"feeds={string_type(feeds, with_shape=True, with_min_max=True)}"
+            )
+            diffs.append(diff)
+    if session:
+        return input_ids, diffs
+    return input_ids
+
+
+def onnx_generate(
+    model_or_path: Union[onnx.ModelProto, str, InferenceSessionForTorch],
+    input_ids: torch.Tensor,
+    eos_token_id: int = 2,
+    max_new_tokens=100,
+    return_session: bool = False,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, InferenceSessionForTorch, Dict[str, Any]]]:
+    """
+    Implements a simple method ``generate`` for an ONNX model.
+    The function does not expect any ``position_ids`` as input.
+
+    :param model_or_path: model or loaded model
+    :param input_ids: input tokens
+    :param eos_token_ids: token representing the end of an answer
+    :param max_new_tokens: stops after this number of generated tokens
+    :param return_session: returns the instance of class
+        :class:`InferenceSessionForTorch
+        <onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch>`
+        created if necessary, the function returns the feeds for the next iteration
+    :return: input tokens concatenated with new tokens
+
+    .. runpython::
+        :showcode:
+
+        import os
+        from onnx_diagnostic.helpers import string_type, string_diff
+        from onnx_diagnostic.helpers.rt_helper import (
+            onnx_generate,
+            generate_and_validate,
+            onnx_generate_with_genai,
+        )
+        from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
+        from onnx_diagnostic.torch_export_patches import torch_export_patches
+        from onnx_diagnostic.export.api import to_onnx
+
+        mid = "arnir0/Tiny-LLM"
+        print(f"-- get model for {mid!r}")
+        data = get_untrained_model_with_inputs(mid)
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        del inputs["position_ids"]
+        del ds["position_ids"]
+        input_ids = inputs["input_ids"]
+
+        print(f"-- input_ids={input_ids.shape}")
+        print(f"-- inputs: {string_type(inputs, with_shape=True)}")
+        print(f"-- dynamic_shapes: {string_type(ds)}")
+        folder = "dump_test"
+        os.makedirs(folder, exist_ok=True)
+        model_name = os.path.join(folder, "model.onnx")
+        print("-- test_onnx_generate: export model")
+        with torch_export_patches(patch_transformers=True, patch_torch=False):
+            to_onnx(
+                model,
+                (),
+                kwargs=inputs,
+                dynamic_shapes=ds,
+                filename=model_name,
+                exporter="custom", # custom, dynamo or onnx-dynamo, modelbuilder
+            )
+
+        print("-- generate with onnx")
+        onnx_outputs = onnx_generate(model_name, input_ids[:1], 2, max_new_tokens=10)
+        print("-- onnx output", onnx_outputs)
+
+        # The example continues with other functions doing the same.
+        print("-- generate with pytorch")
+        torch_outputs, diffs = generate_and_validate(
+            model, input_ids[:1], 2, max_new_tokens=10, session=model_name
+        )
+        print("-- torch output", torch_outputs)
+        print("-- differences at each step:")
+        for i, d in enumerate(diffs):
+            print(f"iteration {i}: {string_diff(d)}")
+
+        print("-- generate with genai")
+        genai_outputs, session = onnx_generate_with_genai(
+            model_name,
+            input_ids[:1],
+            max_new_tokens=10,
+            return_session=True,
+            transformers_config=data["configuration"],
+        )
+        print("-- genai output", genai_outputs)
     """
-
-
+    if not isinstance(model_or_path, InferenceSessionForTorch):
+        providers = ["CUDAExecutionProvider"] if input_ids.is_cuda else []
+        providers.append("CPUExecutionProvider")
+        session = InferenceSessionForTorch(model_or_path, providers=providers)
+    else:
+        session = model_or_path
+
+    input_shapes = session.input_shapes
+    input_names = session.input_names
+    input_types = session.input_types
+    has_position_ids = "position_ids" in session.input_names
+
+    assert (
+        len(input_names) > 2
+        and input_names[:2] == ["input_ids", "attention_mask"]
+        and input_names[3 if has_position_ids else 2].startswith("past_key_values")
+    ), (
+        f"Only text generation is supported but input_names == {input_names}, "
+        f"has_position_ids={has_position_ids}"
+    )
+    assert (
+        not has_position_ids or input_names[2] == "position_ids"
+    ), f"position_ids must the third input but input_names={input_names}"
+
+    # First call: prefill
+    feeds = dict(
+        input_ids=input_ids,
+        attention_mask=torch.ones(
+            input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+        ),
+        **make_empty_cache(
+            input_ids.shape[0], input_names[2:], input_shapes[2:], input_types[2:]
+        ),
+    )
+    if has_position_ids:
+        feeds["position_ids"] = torch.unsqueeze(
+            torch.arange(input_ids.shape[1], dtype=torch.int64, device=input_ids.device), 0
+        )
 
-
-    This function can take either the names or the actual tensors
-    as long as they are in a list.
+    outputs = session.run(None, feeds)
 
-
+    # Next calls: decode
+    for _ in range(max_new_tokens):
+        next_token_logits = outputs[0][:, -1, :]
 
-
+        # The most probable next token is chosen.
+        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+        # But we could select it using a multinomial law
+        # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
+        # <<< top_probs, top_indices = torch.topk(probs, top_k)
+        # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]
 
-
-
+        if next_token_id.item() == eos_token_id:
+            break
+        input_ids = torch.cat([input_ids, next_token_id.to(input_ids.device)], dim=-1)
+        feeds = dict(
+            input_ids=next_token_id,
+            attention_mask=torch.ones(
+                input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+            ),
+        )
+        if has_position_ids:
+            feeds["position_ids"] = torch.unsqueeze(
+                torch.arange(
+                    input_ids.shape[1],
+                    input_ids.shape[1] + 1,
+                    dtype=torch.int64,
+                    device=input_ids.device,
+                ),
+                0,
+            )
+        feeds.update(dict(zip(input_names[3 if has_position_ids else 2 :], outputs[1:])))
+        outputs = session.run(None, feeds)
+
+    if return_session:
+        return input_ids, session, feeds
+    return input_ids
 
-    To::
 
-
-
+def onnx_generate_with_genai(
+    model_or_path: Union[onnx.ModelProto, str, InferenceSessionForTorch],
+    input_ids: torch.Tensor,
+    max_new_tokens=100,
+    return_session: bool = False,
+    transformers_config: Optional[Any] = None,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, InferenceSessionForTorch]]:
+    """
+    Uses :epkg:`onnxruntime-genai` to implement a simple method ``generate``
+    for an ONNX model. The function does not expect any ``position_ids`` as input.
+
+    :param model_or_path: model or loaded model
+    :param input_ids: input tokens
+    :param eos_token_ids: token representing the end of an answer
+    :param max_new_tokens: stops after this number of generated tokens
+    :param return_session: returns the instance of class
+        :class:`InferenceSessionForTorch
+        <onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch>`
+        created if necessary
+    :param transformers_config: write configuration
+        if missing and if this configuration is provided
+    :return: input tokens concatenated with new tokens
 
-
-
+    See example given with function :func:`onnx_generate
+    <onnx_diagnostic.helpers.rt_helper.onnx_generate>`.
     """
-
-
-
-
-
-
-
-
-
+    import onnxruntime_genai as og
+
+    if not isinstance(model_or_path, og.Model):
+        from .model_builder_helper import make_genai_config
+
+        assert isinstance(
+            model_or_path, str
+        ), f"Only a filename is allowed for model_or_path but type is {type(model_or_path)}"
+        folder = os.path.dirname(model_or_path)
+        assert os.path.exists(folder), f"Folder {folder!r} does not exists."
+        assert os.path.exists(model_or_path), f"Folder {model_or_path!r} does not exists."
+        config_file = os.path.join(folder, "genai_config.json")
+        if not os.path.exists(config_file):
+            if not transformers_config:
+                raise FileNotFoundError(
+                    f"Folder {model_or_path!r} does not contain 'genai_config.json'."
+                )
+            config = make_genai_config(transformers_config, model_or_path)
+            with open(config_file, "w") as f:
+                json.dump(config, f, indent=4)
+
+        config = og.Config(os.path.dirname(config_file))
+        if input_ids.is_cuda:
+            config.clear_providers()
+            config.append_provider("cuda")
+        session = og.Model(config)
+    else:
+        session = model_or_path
+
+    params = og.GeneratorParams(session)
+    params.set_search_options(
+        max_length=max_new_tokens + input_ids.shape[1], batch_size=input_ids.shape[0]
+    )
+    generator = og.Generator(session, params)
+
+    # First call: prefill
+    cats = []
+    generator.append_tokens(input_ids)
+    while not generator.is_done():
+        generator.generate_next_token()
+        new_token = generator.get_next_tokens()[0]
+        cats.append(int(new_token))
+
+    input_ids = torch.cat([input_ids, torch.tensor([cats], dtype=torch.int64)], dim=-1)
+    if return_session:
+        return input_ids, session
+    return input_ids
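
The prefill step of onnx_generate and generate_and_validate feeds an empty key/value cache built by make_empty_cache from the ONNX input metadata: a symbolic dimension named "batch" becomes the batch size, every other symbolic dimension (cache or sequence length) becomes 0, and integer dimensions are kept. A small sketch of that shape computation; the input shape below is a hypothetical example, not taken from a real model:

    import torch

    def _get_dim(i, s, batch=1):
        # mirrors _get_dim in the hunk above
        if isinstance(s, int):
            return s
        if s == "batch":
            return batch
        return 0

    # hypothetical past_key_values input declared as (batch, num_heads, cache_length, head_dim)
    shape = ("batch", 8, "past_sequence_length", 64)
    empty = torch.empty(
        tuple(_get_dim(i, s, batch=1) for i, s in enumerate(shape)), dtype=torch.float32
    )
    print(empty.shape)  # torch.Size([1, 8, 0, 64]): an empty cache for the prefill call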

@@ -856,9 +856,15 @@ def torch_deepcopy(value: Any) -> Any:
         ), f"Unexpected type={type(value)}"
         return copy.deepcopy(value)
 
+    if hasattr(value, "__nocopy__"):
+        return value
+
     # We should have a code using serialization, deserialization assuming a model
     # cannot be exported without them.
-    raise NotImplementedError(
+    raise NotImplementedError(
+        f"torch_deepcopy not implemented for type {type(value)}, "
+        f"add attribute '__nocopy__' to return it as is."
+    )
 
 
 def torch_tensor_size(value: Any) -> Any:
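
torch_deepcopy now returns any object exposing a __nocopy__ attribute unchanged instead of raising NotImplementedError. A minimal sketch of the opt-out pattern; the Handle class and the simplified copy function are made up for illustration:

    class Handle:
        __nocopy__ = True  # marks the object as not deep-copyable

        def __init__(self, resource):
            self.resource = resource

    def sketch_torch_deepcopy(value):
        # only the branch added in the hunk above
        if hasattr(value, "__nocopy__"):
            return value
        raise NotImplementedError(f"not implemented for type {type(value)}")

    h = Handle("opaque runtime object")
    assert sketch_torch_deepcopy(h) is h  # same instance, no copy attempted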

@@ -966,3 +972,16 @@ def to_tensor(tensor: onnx.TensorProto, base_dir: str = "") -> torch.Tensor:
     # Other cases, it should be small tensor. We use numpy.
     np_tensor = to_array_extended(tensor)
     return torch.from_numpy(np_tensor)
+
+
+def get_weight_type(model: torch.nn.Module) -> torch.dtype:
+    """Returns the most probable dtype in a model."""
+    counts = {}
+    for _name, param in model.named_parameters():
+        dt = param.dtype
+        if dt not in counts:
+            counts[dt] = 1
+        else:
+            counts[dt] += 1
+    final = max(list(counts.items()))
+    return final[0]
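
get_weight_type counts parameters per dtype and returns the dominant one. A quick sketch of the same counting on a small module; unlike the helper above, the sketch picks the most frequent dtype directly with a key function, which matches the stated intent ("most probable dtype"):

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2)).to(torch.float16)

    counts = {}
    for _name, param in model.named_parameters():
        counts[param.dtype] = counts.get(param.dtype, 0) + 1

    print(counts)                       # {torch.float16: 4} -> two weights, two biases
    print(max(counts, key=counts.get))  # torch.float16, the most frequent dtype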

@@ -77,6 +77,13 @@ def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callabl
     If the configuration is None, the function selects typical dimensions.
     It returns parameters and a function. The function creates dummy inputs
     if it receives the parameters returned as a first result.
+
+    .. code-block:: python
+
+        config = get_pretrained_config(model_id)
+        task = task = task_from_id(name)
+        kwargs, fct = random_input_kwargs(config, task)
+        res = fct(model, config, add_second_input=False, **kwargs)
     """
     tasks = {mod.__TASK__: mod.random_input_kwargs for mod in __TASKS__}
     assert task in tasks, f"Task {task!r} not found in {sorted(tasks)}"
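
The docstring example added above needs a pretrained configuration and a task name. A slightly fuller sketch is below; random_input_kwargs lives in onnx_diagnostic.tasks (the module patched here), while the import path of get_pretrained_config and task_from_id is an assumption based on the names used in the docstring:

    from onnx_diagnostic.tasks import random_input_kwargs
    # assumed location of these two helpers; adjust to the actual module if it differs
    from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id

    model_id = "arnir0/Tiny-LLM"  # the tiny model also used in the runpython example above
    config = get_pretrained_config(model_id)
    task = task_from_id(model_id)
    kwargs, fct = random_input_kwargs(config, task)
    print(sorted(kwargs))  # the typical dimensions picked for this task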

@@ -84,14 +84,8 @@ def get_inputs(
         "cache_position": {0: seq_length},
         "encoder_outputs": [{0: batch}], # last_hidden_state
         "past_key_values": [
-            [
-
-                [{0: batch} for _ in range(num_hidden_layers)],
-            ],
-            [
-                [{0: batch} for _ in range(num_hidden_layers)],
-                [{0: batch} for _ in range(num_hidden_layers)],
-            ],
+            [{0: batch} for _ in range(num_hidden_layers * 2)],
+            [{0: batch} for _ in range(num_hidden_layers * 2)],
         ],
     }
     inputs = dict(
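
This hunk, and the similar ones below, flatten the dynamic-shapes specification for past_key_values from nested per-component lists into flat lists of num_hidden_layers * 2 entries, presumably to match a cache that is now flattened into one tensor per key and per value. A tiny sketch of the new flat form:

    num_hidden_layers = 2
    batch, cache_length = "batch", "cache_length"

    # one shape spec per key tensor and per value tensor of every layer
    flat = [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)]
    print(len(flat))  # 4 == num_hidden_layers * 2
    print(flat[0])    # {0: 'batch', 2: 'cache_length'}: dynamic batch and cache dimensions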

@@ -109,14 +109,8 @@ def get_inputs(
     cache_length = "cache_length_key"
     cache_length2 = "cache_length_val"
     shapes["past_key_values"] = [ # type: ignore[assignment]
-        [
-
-            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
-        ],
-        [
-            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-        ],
+        [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)],
+        [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers * 2)],
     ]
 
     res = dict(inputs=inputs, dynamic_shapes=shapes)

@@ -1,3 +1,4 @@
+import itertools
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch
 from ..helpers.cache_helper import make_dynamic_cache, make_hybrid_cache

@@ -151,10 +152,7 @@ def _get_inputs_gemma3(
         },
         "position_ids": {0: batch, 1: seq_length},
         "cache_position": {0: seq_length},
-        "past_key_values": [
-            [{0: batch} for _ in range(num_hidden_layers)],
-            [{0: batch} for _ in range(num_hidden_layers)],
-        ],
+        "past_key_values": [{0: batch} for _ in range(num_hidden_layers * 2)],
         "pixel_values": {0: batch},
         "use_cache": None,
     }

@@ -272,10 +270,14 @@ def get_inputs_default(
         "token_type_ids": {0: batch, 1: seq_length},
         "attention_mask": {0: batch, 1: "cache+seq"},
         "position_ids": {0: batch, 1: seq_length},
-        "past_key_values":
-
-
-
+        "past_key_values": list(
+            itertools.chain.from_iterable(
+                zip(
+                    [{0: batch} for _ in range(num_hidden_layers)],
+                    [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+                )
+            )
+        ),
         "pixel_values": (
             {0: batch, 1: images}
             if model.__class__.__name__ == "IdeficsForVisionText2Text"
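
get_inputs_default now builds the flat past_key_values specification by interleaving the key and value shape dictionaries layer by layer. A standalone sketch of what the itertools expression in the hunk produces:

    import itertools

    num_hidden_layers = 2
    keys = [{0: "batch"} for _ in range(num_hidden_layers)]
    values = [{0: "batch", 2: "cache_length"} for _ in range(num_hidden_layers)]

    # zip pairs the key/value spec of each layer; chain.from_iterable flattens the pairs:
    # [key_0, value_0, key_1, value_1]
    interleaved = list(itertools.chain.from_iterable(zip(keys, values)))
    print(interleaved)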

@@ -81,14 +81,8 @@ def get_inputs(
         "attention_mask": {0: batch, 1: "seq_mask"},
         # "cache_position": {0: batch, 1: torch.export.Dim.DYNAMIC},
         "past_key_values": [
-            [
-
-                [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
-            ],
-            [
-                [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-                [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-            ],
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)],
+            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers * 2)],
         ],
         # one these is selected based on the forward method signature
         # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC},

@@ -83,14 +83,8 @@ def get_inputs(
         "attention_mask": {0: batch, 1: "seq_mask"},
         # "cache_position": {0: batch, 1: torch.export.Dim.DYNAMIC},
         "past_key_values": [
-            [
-
-                [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
-            ],
-            [
-                [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-                [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)],
-            ],
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers * 2)],
+            [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers * 2)],
         ],
         # one these is selected based on the forward method signature
         # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC},

@@ -157,6 +151,7 @@ def get_inputs(
         assert (
             add_second_input > 0
         ), f"Not implemented for add_second_input={add_second_input}."
+        res["inputs_prompt"] = dict(input_ids=torch.randint(1000, 30000, (1, 11)))
        res["inputs2"] = get_inputs(
             model=model,
             config=config,