opentau-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- opentau/__init__.py +179 -0
- opentau/__version__.py +24 -0
- opentau/configs/__init__.py +19 -0
- opentau/configs/default.py +297 -0
- opentau/configs/libero.py +113 -0
- opentau/configs/parser.py +393 -0
- opentau/configs/policies.py +297 -0
- opentau/configs/reward.py +42 -0
- opentau/configs/train.py +370 -0
- opentau/configs/types.py +76 -0
- opentau/constants.py +52 -0
- opentau/datasets/__init__.py +84 -0
- opentau/datasets/backward_compatibility.py +78 -0
- opentau/datasets/compute_stats.py +333 -0
- opentau/datasets/dataset_mixture.py +460 -0
- opentau/datasets/factory.py +232 -0
- opentau/datasets/grounding/__init__.py +67 -0
- opentau/datasets/grounding/base.py +154 -0
- opentau/datasets/grounding/clevr.py +110 -0
- opentau/datasets/grounding/cocoqa.py +130 -0
- opentau/datasets/grounding/dummy.py +101 -0
- opentau/datasets/grounding/pixmo.py +177 -0
- opentau/datasets/grounding/vsr.py +141 -0
- opentau/datasets/image_writer.py +304 -0
- opentau/datasets/lerobot_dataset.py +1910 -0
- opentau/datasets/online_buffer.py +442 -0
- opentau/datasets/push_dataset_to_hub/utils.py +132 -0
- opentau/datasets/sampler.py +99 -0
- opentau/datasets/standard_data_format_mapping.py +278 -0
- opentau/datasets/transforms.py +330 -0
- opentau/datasets/utils.py +1243 -0
- opentau/datasets/v2/batch_convert_dataset_v1_to_v2.py +887 -0
- opentau/datasets/v2/convert_dataset_v1_to_v2.py +829 -0
- opentau/datasets/v21/_remove_language_instruction.py +109 -0
- opentau/datasets/v21/batch_convert_dataset_v20_to_v21.py +60 -0
- opentau/datasets/v21/convert_dataset_v20_to_v21.py +183 -0
- opentau/datasets/v21/convert_stats.py +150 -0
- opentau/datasets/video_utils.py +597 -0
- opentau/envs/__init__.py +18 -0
- opentau/envs/configs.py +178 -0
- opentau/envs/factory.py +99 -0
- opentau/envs/libero.py +439 -0
- opentau/envs/utils.py +204 -0
- opentau/optim/__init__.py +16 -0
- opentau/optim/factory.py +43 -0
- opentau/optim/optimizers.py +121 -0
- opentau/optim/schedulers.py +140 -0
- opentau/planner/__init__.py +82 -0
- opentau/planner/high_level_planner.py +366 -0
- opentau/planner/utils/memory.py +64 -0
- opentau/planner/utils/utils.py +65 -0
- opentau/policies/__init__.py +24 -0
- opentau/policies/factory.py +172 -0
- opentau/policies/normalize.py +315 -0
- opentau/policies/pi0/__init__.py +19 -0
- opentau/policies/pi0/configuration_pi0.py +250 -0
- opentau/policies/pi0/modeling_pi0.py +994 -0
- opentau/policies/pi0/paligemma_with_expert.py +516 -0
- opentau/policies/pi05/__init__.py +20 -0
- opentau/policies/pi05/configuration_pi05.py +231 -0
- opentau/policies/pi05/modeling_pi05.py +1257 -0
- opentau/policies/pi05/paligemma_with_expert.py +572 -0
- opentau/policies/pretrained.py +315 -0
- opentau/policies/utils.py +123 -0
- opentau/policies/value/__init__.py +18 -0
- opentau/policies/value/configuration_value.py +170 -0
- opentau/policies/value/modeling_value.py +512 -0
- opentau/policies/value/reward.py +87 -0
- opentau/policies/value/siglip_gemma.py +221 -0
- opentau/scripts/actions_mse_loss.py +89 -0
- opentau/scripts/bin_to_safetensors.py +116 -0
- opentau/scripts/compute_max_token_length.py +111 -0
- opentau/scripts/display_sys_info.py +90 -0
- opentau/scripts/download_libero_benchmarks.py +54 -0
- opentau/scripts/eval.py +877 -0
- opentau/scripts/export_to_onnx.py +180 -0
- opentau/scripts/fake_tensor_training.py +87 -0
- opentau/scripts/get_advantage_and_percentiles.py +220 -0
- opentau/scripts/high_level_planner_inference.py +114 -0
- opentau/scripts/inference.py +70 -0
- opentau/scripts/launch_train.py +63 -0
- opentau/scripts/libero_simulation_parallel.py +356 -0
- opentau/scripts/libero_simulation_sequential.py +122 -0
- opentau/scripts/nav_high_level_planner_inference.py +61 -0
- opentau/scripts/train.py +379 -0
- opentau/scripts/visualize_dataset.py +294 -0
- opentau/scripts/visualize_dataset_html.py +507 -0
- opentau/scripts/zero_to_fp32.py +760 -0
- opentau/utils/__init__.py +20 -0
- opentau/utils/accelerate_utils.py +79 -0
- opentau/utils/benchmark.py +98 -0
- opentau/utils/fake_tensor.py +81 -0
- opentau/utils/hub.py +209 -0
- opentau/utils/import_utils.py +79 -0
- opentau/utils/io_utils.py +137 -0
- opentau/utils/libero.py +214 -0
- opentau/utils/libero_dataset_recorder.py +460 -0
- opentau/utils/logging_utils.py +180 -0
- opentau/utils/monkey_patch.py +278 -0
- opentau/utils/random_utils.py +244 -0
- opentau/utils/train_utils.py +198 -0
- opentau/utils/utils.py +471 -0
- opentau-0.1.0.dist-info/METADATA +161 -0
- opentau-0.1.0.dist-info/RECORD +108 -0
- opentau-0.1.0.dist-info/WHEEL +5 -0
- opentau-0.1.0.dist-info/entry_points.txt +2 -0
- opentau-0.1.0.dist-info/licenses/LICENSE +508 -0
- opentau-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,572 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Copyright 2026 Tensor Auto Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
PaliGemma with Expert Module.

This module implements the PaliGemma model with an additional expert module,
specifically designed for the Pi05 policy. It combines a pre-trained PaliGemma
Vision-Language Model (VLM) with a Gemma-based expert model to handle
action generation and conditioning.
"""

import torch
import torch.version
from torch import nn
from transformers import (
    AutoConfig,
    Cache,
    GemmaForCausalLM,
    PaliGemmaForConditionalGeneration,
    PretrainedConfig,
    PreTrainedModel,
)
from transformers.models.auto import CONFIG_MAPPING
from transformers.models.gemma import modeling_gemma

def apply_rope(x: torch.Tensor, positions: torch.Tensor, max_wavelength: int = 10_000) -> torch.Tensor:
    """Applies RoPE positions to the input tensor.

    Args:
        x: Input tensor of shape [B, L, H, D].
        positions: Position tensor of shape [B, L].
        max_wavelength: Maximum wavelength for RoPE. Defaults to 10_000.

    Returns:
        Tensor: The input tensor with RoPE applied, of shape [B, L, H, D].
    """
    d_half = x.shape[-1] // 2
    device = x.device
    dtype = x.dtype
    x = x.to(torch.float32)

    freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device)
    timescale = max_wavelength**freq_exponents
    radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32)

    radians = radians[..., None, :]

    sin = torch.sin(radians)  # .to(dtype=dtype)
    cos = torch.cos(radians)  # .to(dtype=dtype)

    x1, x2 = x.split(d_half, dim=-1)
    res = torch.empty_like(x)
    res[..., :d_half] = x1 * cos - x2 * sin
    res[..., d_half:] = x2 * cos + x1 * sin

    return res.to(dtype)

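# Note on the math above: for each feature pair (x1[k], x2[k]) the function applies the
# standard RoPE rotation (x1*cos - x2*sin, x2*cos + x1*sin) with angle
# position / max_wavelength ** (2k / D), computed in float32 and cast back to the
# input dtype.
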
class PaliGemmaWithExpertConfig(PretrainedConfig):
    """Configuration class for PaliGemmaWithExpertModel."""

    model_type = "PaliGemmaWithExpertModel"
    sub_configs = {"paligemma_config": AutoConfig, "gemma_expert_config": AutoConfig}

    def __init__(
        self,
        paligemma_config: dict | None = None,
        gemma_expert_config: dict | None = None,
        freeze_vision_encoder: bool = True,
        train_expert_only: bool = True,
        attention_implementation: str = "eager",
        load_pretrained_paligemma: bool = False,
        discrete_action_vocab_size: int | None = None,
        dropout: float = 0.1,
        **kwargs,
    ):
        """Initializes the configuration.

        Args:
            paligemma_config: Configuration dictionary for the PaliGemma model.
            gemma_expert_config: Configuration dictionary for the Gemma expert model.
            freeze_vision_encoder: Whether to freeze the vision encoder. Defaults to True.
            train_expert_only: Whether to train only the expert model. Defaults to True.
            attention_implementation: Attention implementation to use ("eager" or "fa2"). Defaults to "eager".
            load_pretrained_paligemma: Whether to load a pretrained PaliGemma model. Defaults to False.
            discrete_action_vocab_size: Vocabulary size for discrete actions.
            dropout: Dropout probability. Defaults to 0.1.
            **kwargs: Additional keyword arguments passed to PretrainedConfig.
        """
        self.freeze_vision_encoder = freeze_vision_encoder
        self.train_expert_only = train_expert_only
        self.attention_implementation = attention_implementation
        self.load_pretrained_paligemma = load_pretrained_paligemma
        self.discrete_action_vocab_size = discrete_action_vocab_size
        self.dropout = dropout

        if paligemma_config is None:
            # Default config from Pi0
            self.paligemma_config = CONFIG_MAPPING["paligemma"](
                transformers_version="4.48.1",
                _vocab_size=257152,
                bos_token_id=2,
                eos_token_id=1,
                hidden_size=2048,
                image_token_index=257152,
                model_type="paligemma",
                pad_token_id=0,
                projection_dim=2048,
                text_config={
                    "hidden_activation": "gelu_pytorch_tanh",
                    "hidden_size": 2048,
                    "intermediate_size": 16384,
                    "model_type": "gemma",
                    "num_attention_heads": 8,
                    "num_hidden_layers": 18,
                    "num_image_tokens": 256,
                    "num_key_value_heads": 1,
                    "torch_dtype": "float32",
                    "vocab_size": 257152,
                    "use_adarms": False,
                    "adarms_cond_dim": None,
                },
                vision_config={
                    "hidden_size": 1152,
                    "intermediate_size": 4304,
                    "model_type": "siglip_vision_model",
                    "num_attention_heads": 16,
                    "num_hidden_layers": 27,
                    "num_image_tokens": 256,
                    "patch_size": 14,
                    "projection_dim": 2048,
                    "projector_hidden_act": "gelu_fast",
                    "torch_dtype": "float32",
                    "vision_use_head": False,
                },
            )
        elif isinstance(paligemma_config, dict):
            # Override Pi0 default config for PaliGemma
            if "model_type" not in paligemma_config:
                paligemma_config["model_type"] = "paligemma"

            cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
            self.paligemma_config = cfg_cls(**paligemma_config)

        if gemma_expert_config is None:
            # Default config from Pi0
            self.gemma_expert_config = CONFIG_MAPPING["gemma"](
                attention_bias=False,
                attention_dropout=0.0,
                bos_token_id=2,
                eos_token_id=1,
                head_dim=256,
                hidden_act="gelu_pytorch_tanh",
                hidden_activation="gelu_pytorch_tanh",
                hidden_size=1024,
                initializer_range=0.02,
                intermediate_size=4096,
                max_position_embeddings=8192,
                model_type="gemma",
                num_attention_heads=8,
                num_hidden_layers=18,
                num_key_value_heads=1,
                pad_token_id=0,
                rms_norm_eps=1e-06,
                rope_theta=10000.0,
                torch_dtype="float32",
                use_adarms=True,
                adarms_cond_dim=1024,
                transformers_version="4.48.1",
                use_cache=True,
                vocab_size=257152,
            )
        elif isinstance(gemma_expert_config, dict):
            # Override Pi0 default config for Gemma Expert
            if "model_type" not in gemma_expert_config:
                gemma_expert_config["model_type"] = "gemma"

            cfg_cls = CONFIG_MAPPING[gemma_expert_config["model_type"]]
            self.gemma_expert_config = cfg_cls(**gemma_expert_config)

        super().__init__(**kwargs)

    def __post_init__(self):
        """Validates configuration parameters."""
        super().__post_init__()
        if self.train_expert_only and not self.freeze_vision_encoder:
            raise ValueError(
                "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible."
            )

        if self.attention_implementation not in ["eager", "fa2"]:
            raise ValueError(
                f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager' or 'fa2'."
            )

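# With the defaults above, the PaliGemma prefix is a 2048-wide, 18-layer Gemma text
# stack plus a SigLIP vision tower, while the expert is a smaller 1024-wide Gemma with
# AdaRMS conditioning enabled (adarms_cond_dim=1024). Both use head_dim 256 and a
# single KV head, which is what lets the joint forward pass concatenate their
# attention streams.
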
class PaliGemmaWithExpertModel(PreTrainedModel):
    """PaliGemma model with an additional expert module for action generation."""

    config_class = PaliGemmaWithExpertConfig

    def __init__(self, config: PaliGemmaWithExpertConfig):
        """Initializes the PaliGemmaWithExpertModel.

        Args:
            config: Configuration object for the model.
        """
        super().__init__(config=config)
        self.config = config

        if config.load_pretrained_paligemma:
            self.paligemma = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-pt-224")
        else:
            self.paligemma = PaliGemmaForConditionalGeneration(config=config.paligemma_config)
        self.gemma_expert = GemmaForCausalLM(config=config.gemma_expert_config)
        # Remove unused embed_tokens
        self.gemma_expert.model.embed_tokens = None

        # Learned embedding layer for discrete actions
        # Embedding dimension matches expert model hidden size
        self.discrete_action_embedding = nn.Embedding(
            num_embeddings=config.discrete_action_vocab_size,
            embedding_dim=config.paligemma_config.text_config.hidden_size,
            padding_idx=0,  # 0 is used for padding in pad_fast_tokens
        )

        # discrete action head that maps to action vocab size and not language vocab size
        self.da_head = nn.Linear(
            in_features=config.paligemma_config.text_config.hidden_size,
            out_features=config.discrete_action_vocab_size,
        )

        self.dropout = nn.Dropout(config.dropout)

        self.to_bfloat16_like_physical_intelligence()
        self.set_requires_grad()

    def set_requires_grad(self) -> None:
        """Sets the requires_grad attribute for model parameters based on configuration."""
        if self.config.freeze_vision_encoder:
            self.paligemma.vision_tower.eval()
            for params in self.paligemma.vision_tower.parameters():
                params.requires_grad = False

        if self.config.train_expert_only:
            self.paligemma.eval()
            for params in self.paligemma.parameters():
                params.requires_grad = False

    def train(self, mode: bool = True) -> None:
        """Sets the module in training mode.

        Args:
            mode: whether to set training mode (True) or evaluation mode (False). Defaults to True.
        """
        super().train(mode)

        if self.config.freeze_vision_encoder:
            self.paligemma.vision_tower.eval()

        if self.config.train_expert_only:
            self.paligemma.eval()

    def to_bfloat16_like_physical_intelligence(self) -> None:
        """Casts specific model components to bfloat16 dtype."""
        self.paligemma = self.paligemma.to(dtype=torch.bfloat16)

        params_to_change_dtype = [
            "language_model.model.layers",
            "gemma_expert.model.layers",
            "vision_tower",
            "multi_modal",
        ]
        for name, param in self.named_parameters():
            if any(selector in name for selector in params_to_change_dtype):
                param.data = param.data.to(dtype=torch.bfloat16)

    def embed_image(self, image: torch.Tensor) -> torch.Tensor:
        """Computes image embeddings.

        Args:
            image: Input image tensor.

        Returns:
            torch.Tensor: Image embeddings.
        """
        # Handle different transformers versions
        if hasattr(self.paligemma, "get_image_features"):
            return self.paligemma.get_image_features(image)
        else:
            return self.paligemma.model.get_image_features(image)

    def embed_language_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
        """Embeds language tokens.

        Args:
            tokens: Input token indices.

        Returns:
            torch.Tensor: Token embeddings.
        """
        return self.paligemma.language_model.embed_tokens(tokens)

    def embed_discrete_actions(self, actions: torch.Tensor) -> torch.Tensor:
        """Embeds discrete action tokens.

        Args:
            actions: Input discrete action indices.

        Returns:
            torch.Tensor: Action embeddings.
        """
        # Ensure actions are long integers for embedding lookup
        if actions.dtype != torch.long:
            actions = actions.long()

        # Apply embedding layer
        embedded = self.discrete_action_embedding(actions)

        return embedded

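    # Note on forward() below: `inputs_embeds` is a list with one entry per sub-model,
    # ordered as [PaliGemma language-model (prefix) embeddings, Gemma expert embeddings];
    # an entry may be None to skip that stream for a given call.
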
    # TODO: break down this huge forward into modules or functions
    def forward(
        self,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: list[torch.FloatTensor] | Cache | None = None,
        inputs_embeds: list[torch.FloatTensor] | None = None,
        n_cross_att_tokens: int | None = None,
        use_cache: bool | None = None,
        fill_kv_cache: bool | None = None,
        adarms_cond: list[torch.Tensor] | None = None,
    ) -> tuple[list[torch.FloatTensor | None], list[torch.FloatTensor] | Cache | None]:
        """Forward pass of the model.

        Args:
            attention_mask: Attention mask tensor.
            position_ids: Position IDs tensor.
            past_key_values: Past key values for caching.
            inputs_embeds: List of input embeddings for the different model parts.
            n_cross_att_tokens: Number of cross-attention tokens.
            use_cache: Whether to use KV cache.
            fill_kv_cache: Whether to fill the KV cache.
            adarms_cond: List of AdaRMS conditioning tensors.

        Returns:
            tuple: A tuple containing:
                - outputs_embeds: List of output embeddings.
                - past_key_values: Updated past key values.

        Raises:
            ValueError: If `n_cross_att_tokens` is not provided when `fill_kv_cache` is True.
        """
        if adarms_cond is None:
            adarms_cond = [None, None]

        models = [self.paligemma.language_model, self.gemma_expert.model]

        for hidden_states in inputs_embeds:
            # TODO this is very inefficient
            # dtype is always the same, batch size too (if > 1 len)
            # device could be trickier in multi gpu edge cases but that's it
            if hidden_states is None:
                continue
            batch_size = hidden_states.shape[0]

        # RMSNorm
        num_layers = self.paligemma.config.text_config.num_hidden_layers
        head_dim = self.paligemma.config.text_config.head_dim
        for layer_idx in range(num_layers):
            query_states = []
            key_states = []
            value_states = []
            gates = []
            for i, hidden_states in enumerate(inputs_embeds):
                if hidden_states is None:
                    gates.append(None)
                    continue
                layer = models[i].layers[layer_idx]
                # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
                # hidden_states = hidden_states * normalizer
                hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i])
                gates.append(gate)
                input_shape = hidden_states.shape[:-1]
                hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)

                hidden_states = hidden_states.to(dtype=torch.bfloat16)
                query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
                key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
                value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)

                query_states.append(query_state)
                key_states.append(key_state)
                value_states.append(value_state)

            # B,L,H,D with L sequence length, H number of heads, D head dim
            # concatenate on the number of embeddings/tokens
            query_states = torch.cat(query_states, dim=1)
            key_states = torch.cat(key_states, dim=1)
            value_states = torch.cat(value_states, dim=1)

            query_states = apply_rope(query_states, position_ids)
            key_states = apply_rope(key_states, position_ids)

            if use_cache and past_key_values is None:
                past_key_values = {}

            if use_cache:
                if fill_kv_cache:
                    if n_cross_att_tokens is None:
                        raise ValueError("n_cross_att_tokens must be provided when fill_kv_cache is True")
                    past_key_values[layer_idx] = {
                        # save the first n_cross_att_tokens for action expert cross attention
                        "key_states": key_states[:, :n_cross_att_tokens, :, :],
                        "value_states": value_states[:, :n_cross_att_tokens, :, :],
                    }
                else:
                    # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
                    # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
                    # the max len, then we (for instance) double the cache size. This implementation already exists
                    # in `transformers`. (molbap)
                    key_states = torch.cat([key_states, past_key_values[layer_idx]["key_states"]], dim=1)
                    value_states = torch.cat(
                        [value_states, past_key_values[layer_idx]["value_states"]], dim=1
                    )

            attention_interface = self.get_attention_interface()
            att_output = attention_interface(
                attention_mask, batch_size, head_dim, query_states, key_states, value_states
            )
            att_output = att_output.to(dtype=torch.bfloat16)

            # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len])
            outputs_embeds = []
            start = 0
            for i, hidden_states in enumerate(inputs_embeds):
                layer = models[i].layers[layer_idx]

                if hidden_states is not None:
                    end = start + hidden_states.shape[1]

                    if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
                        att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
                    out_emb = layer.self_attn.o_proj(att_output[:, start:end])

                    out_emb = self.dropout(out_emb)

                    # first residual
                    out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i])  # noqa: SLF001
                    after_first_residual = out_emb.clone()

                    out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
                    out_emb = layer.mlp(out_emb)

                    out_emb = self.dropout(out_emb)

                    # second residual
                    out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate)  # noqa: SLF001

                    outputs_embeds.append(out_emb)

                    start = end
                else:
                    outputs_embeds.append(None)

            inputs_embeds = outputs_embeds

        # final norm
        outputs_embeds = []
        for i, hidden_states in enumerate(inputs_embeds):
            if hidden_states is not None:
                out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
                outputs_embeds.append(out_emb)
            else:
                outputs_embeds.append(None)

        return outputs_embeds, past_key_values

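    # Note on eager_attention_forward below: with num_key_value_heads=1 (multi-query
    # attention) the single KV head is expanded to all query heads, Q/K are upcast to
    # float32 for the score computation, and masked positions receive a large negative
    # constant before the softmax, following the referenced gemma/modules.py.
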
    def get_attention_interface(self):
        """Returns the attention implementation function based on config.

        Returns:
            callable: The attention function to use.
        """
        return self.eager_attention_forward

    def eager_attention_forward(
        self,
        attention_mask: torch.Tensor,
        batch_size: int,
        head_dim: int,
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
    ) -> torch.Tensor:
        """Eager attention forward pass using standard matrix multiplications.

        Args:
            attention_mask: Attention mask tensor.
            batch_size: Batch size.
            head_dim: Head dimension.
            query_states: Query states tensor.
            key_states: Key states tensor.
            value_states: Value states tensor.

        Returns:
            torch.Tensor: Attention output.
        """
        num_att_heads = self.config.paligemma_config.text_config.num_attention_heads
        num_key_value_heads = self.config.paligemma_config.text_config.num_key_value_heads
        num_key_value_groups = num_att_heads // num_key_value_heads

        # query_states: batch_size, sequence_length, num_att_head, head_dim
        # key_states: batch_size, sequence_length, num_key_value_head, head_dim
        # value_states: batch_size, sequence_length, num_key_value_head, head_dim
        sequence_length = key_states.shape[1]

        key_states = key_states[:, :, :, None, :].expand(
            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
        )
        key_states = key_states.reshape(
            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
        )

        value_states = value_states[:, :, :, None, :].expand(
            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
        )
        value_states = value_states.reshape(
            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
        )

        # Attention here is upcasted to float32 to match the original eager implementation.

        query_states = query_states.to(dtype=torch.float32)
        key_states = key_states.to(dtype=torch.float32)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
        att_weights *= head_dim**-0.5
        big_neg = -2.3819763e38  # See gemma/modules.py

        masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg)

        probs = nn.functional.softmax(masked_att_weights, dim=-1)
        probs = probs.to(dtype=value_states.dtype)

        # probs: batch_size, num_att_heads, sequence_length, sequence_length
        # value_states: batch_size, sequence_length, num_att_heads, head_dim

        att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))

        att_output = att_output.permute(0, 2, 1, 3)
        # we use -1 because sequence length can change
        att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)

        return att_output
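For orientation, a minimal usage sketch follows. It only exercises the two pieces of this file that run without model weights (the standalone apply_rope helper and the default configuration), and it assumes the wheel is installed so that opentau.policies.pi05.paligemma_with_expert is importable; the vocabulary size and tensor shapes below are arbitrary illustrative choices, not values mandated by the package.

import torch

from opentau.policies.pi05.paligemma_with_expert import (
    PaliGemmaWithExpertConfig,
    apply_rope,
)

# Rotate a dummy [B, L, H, D] tensor with sequential positions.
x = torch.randn(2, 6, 8, 256)
positions = torch.arange(6).unsqueeze(0).expand(2, -1)
rotated = apply_rope(x, positions)
assert rotated.shape == x.shape

# Build the default Pi0-style config; discrete_action_vocab_size is the only field the
# discrete-action embedding and head require, and 1024 is just an example value here.
config = PaliGemmaWithExpertConfig(discrete_action_vocab_size=1024)
print(config.paligemma_config.text_config.hidden_size)  # 2048
print(config.gemma_expert_config.hidden_size)  # 1024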