optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +48 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +50 -21
- optimum/rbln/diffusers/__init__.py +12 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +17 -3
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/controlnet.py +17 -2
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +4 -0
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +20 -45
- optimum/rbln/modeling_base.py +18 -14
- optimum/rbln/ops/__init__.py +1 -0
- optimum/rbln/ops/attn.py +10 -0
- optimum/rbln/ops/flash_attn.py +8 -0
- optimum/rbln/ops/moe.py +180 -0
- optimum/rbln/ops/sliding_window_attn.py +9 -0
- optimum/rbln/transformers/__init__.py +36 -0
- optimum/rbln/transformers/configuration_generic.py +0 -27
- optimum/rbln/transformers/modeling_attention_utils.py +156 -127
- optimum/rbln/transformers/modeling_generic.py +2 -61
- optimum/rbln/transformers/modeling_outputs.py +26 -0
- optimum/rbln/transformers/modeling_rope_utils.py +78 -42
- optimum/rbln/transformers/models/__init__.py +28 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
- optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
- optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
- optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
- optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
- optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
- optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
- optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
- optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
- optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
- optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
- optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
- optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
- optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
- optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
- optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
- optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
- optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
- optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
- optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
- optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
- optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
- optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +14 -3
- optimum/rbln/utils/import_utils.py +23 -2
- optimum/rbln/utils/runtime_utils.py +42 -6
- optimum/rbln/utils/submodule.py +27 -1
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
- optimum/rbln/utils/depreacate_utils.py +0 -16
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/exaone/exaone_architecture.py (+0 -36)

@@ -18,9 +18,6 @@ import torch.nn as nn
 
 from ....utils import logging
 from ...models.decoderonly.decoderonly_architecture import (
-    DecoderOnlyAttention,
-    DecoderOnlyLayer,
-    DecoderOnlyModel,
     DecoderOnlyWrapper,
 )
 
@@ -42,36 +39,3 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
 
     def get_model_layer(self, causal_lm: "ExaoneForCausalLM"):
         return causal_lm.transformer
-
-    def get_rbln_attn_class(self):
-        return ExaoneAttention
-
-    def get_rbln_layer_class(self):
-        return ExaoneLayer
-
-    def get_rbln_model_class(self):
-        return ExaoneModel
-
-
-class ExaoneModel(DecoderOnlyModel):
-    def get_embedding(self) -> nn.Embedding:
-        return self._original_mod.wte
-
-    def get_last_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_f
-
-
-class ExaoneLayer(DecoderOnlyLayer):
-    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_1
-
-    def get_post_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_2
-
-
-class ExaoneAttention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.o_proj = self._original_mod.out_proj
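The Exaone-specific shims (`ExaoneModel`, `ExaoneLayer`, `ExaoneAttention`) are dropped and only `get_model_layer` remains on the wrapper; the shared decoder-only base classes now appear to locate the model-specific submodules themselves (the Gemma2/Gemma3 hunks below switch to `_PRE_FF_LAYERNORM_ATTRS`-style attribute lists and a `__post_init__(self, self_attn)` signature). A minimal, hypothetical sketch of that attribute-lookup pattern; the class and attribute names here are illustrative, not the library's actual base-class API:

```python
# Hypothetical sketch, not optimum-rbln's real base class: shows how an
# attribute-name list lets one generic wrapper serve models whose layernorms
# are named differently (e.g. Exaone's ln_1/ln_2 vs. Llama-style names),
# which is what makes the removed per-model subclasses unnecessary.
import torch.nn as nn


class GenericLayerWrapper(nn.Module):
    # Candidate attribute names probed on the wrapped Hugging Face layer.
    _PRE_ATTN_LAYERNORM_ATTRS = ["input_layernorm", "ln_1"]
    _POST_ATTN_LAYERNORM_ATTRS = ["post_attention_layernorm", "ln_2"]

    def __init__(self, original_layer: nn.Module):
        super().__init__()
        self._original_mod = original_layer

    def _resolve(self, candidates: list) -> nn.Module:
        for name in candidates:
            if hasattr(self._original_mod, name):
                return getattr(self._original_mod, name)
        raise AttributeError(f"None of {candidates} found on {type(self._original_mod).__name__}")

    def get_pre_attention_layernorm(self) -> nn.Module:
        return self._resolve(self._PRE_ATTN_LAYERNORM_ATTRS)

    def get_post_attention_layernorm(self) -> nn.Module:
        return self._resolve(self._POST_ATTN_LAYERNORM_ATTRS)
```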
optimum/rbln/transformers/models/gemma2/__init__.py (new file, +16)

@@ -0,0 +1,16 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_gemma2 import RBLNGemma2ForCausalLMConfig, RBLNGemma2ModelConfig
+from .modeling_gemma2 import RBLNGemma2ForCausalLM, RBLNGemma2Model
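With this `__init__.py` (and the matching re-exports added in `optimum/rbln/__init__.py`, +48 above), the new classes should be importable from either level, which is also what the docstrings below assume:

```python
# Either import path should resolve once the wheel is installed; the top-level
# re-export is the one used in the docstrings that follow.
from optimum.rbln import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig
from optimum.rbln.transformers.models.gemma2 import RBLNGemma2Model, RBLNGemma2ModelConfig
```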
optimum/rbln/transformers/models/gemma2/configuration_gemma2.py (new file, +45)

@@ -0,0 +1,45 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+
+
+class RBLNGemma2ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelForCausalLMConfig.
+    Example usage:
+    ```python
+    from optimum.rbln import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig
+    # Create a configuration object
+    config = RBLNGemma2ForCausalLMConfig(
+        batch_size=1,
+        max_seq_len=8192,
+        tensor_parallel_size=4
+    )
+    # Use the configuration with from_pretrained
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """
+
+
+class RBLNGemma2ModelConfig(RBLNDecoderOnlyModelConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelConfig.
+    """
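`RBLNGemma2ModelConfig` ships without a usage example of its own. A parallel sketch for the headless model, under the assumption that `RBLNGemma2Model.from_pretrained` accepts `rbln_config` the same way its causal-LM counterpart does (as the docstrings in `modeling_gemma2.py` below state):

```python
# Hedged sketch mirroring the causal-LM example above for the headless model.
from optimum.rbln import RBLNGemma2Model, RBLNGemma2ModelConfig

config = RBLNGemma2ModelConfig(
    batch_size=1,
    max_seq_len=8192,
    tensor_parallel_size=4,
)
model = RBLNGemma2Model.from_pretrained(
    "google/gemma-2-9b",
    export=True,
    rbln_config=config,
)
```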
optimum/rbln/transformers/models/gemma2/gemma2_architecture.py (new file, +83)

@@ -0,0 +1,83 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models.decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyLayer, DecoderOnlyModel
+from ..decoderonly.decoderonly_architecture import DecoderOnlyWrapper
+
+
+class Gemma2Wrapper(DecoderOnlyWrapper):
+    def get_rbln_layer_class(self):
+        return Gemma2DecoderLayer
+
+    def get_rbln_attn_class(self):
+        return Gemma2Attention
+
+    def get_rbln_model_class(self):
+        return Gemma2Model
+
+
+class Gemma2DecoderLayer(DecoderOnlyLayer):
+    _PRE_FF_LAYERNORM_ATTRS = ["pre_feedforward_layernorm"]
+    _POST_FF_LAYERNORM_ATTRS = ["post_feedforward_layernorm"]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        seq_positions: Union[torch.LongTensor, Tuple[torch.LongTensor]],
+        past_key_values: Tuple[Tuple[torch.Tensor]],
+        cos: Optional[torch.Tensor] = None,
+        sin: Optional[torch.Tensor] = None,
+        block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
+    ):
+        residual = hidden_states
+        hidden_states = self.get_pre_attention_layernorm()(hidden_states)
+
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            seq_positions=seq_positions,
+            past_key_values=past_key_values,
+            cos=cos,
+            sin=sin,
+            block_tables=block_tables,
+            lora_int_id=lora_int_id,
+        )
+        hidden_states = self.get_post_attention_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.get_pre_feedforward_layernorm()(hidden_states)
+        hidden_states = self.forward_mlp(hidden_states, lora_int_id)
+        hidden_states = self.get_post_feedforward_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Gemma2Attention(DecoderOnlyAttention):
+    def get_attn_scale(self, self_attn):
+        return self_attn.config.query_pre_attn_scalar**-0.5
+
+
+class Gemma2Model(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self.config.hidden_size**0.5
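The wrapper encodes Gemma2's "sandwich" normalization (a post-norm after both the attention and the MLP blocks, on top of the usual pre-norms), scales attention by `query_pre_attn_scalar**-0.5`, and scales embeddings by `hidden_size**0.5` via `hidden_multiplier`. A simplified restatement of the residual/normalization ordering, with plain placeholder modules standing in for the RBLN attention and MLP ops:

```python
# Simplified illustration only: attn_fn, mlp_fn and the LayerNorms are
# placeholders (Gemma2 itself uses RMSNorm and the compiled RBLN ops).
import torch
import torch.nn as nn


class SandwichNormBlock(nn.Module):
    def __init__(self, hidden_size: int, attn_fn: nn.Module, mlp_fn: nn.Module):
        super().__init__()
        self.pre_attn_norm = nn.LayerNorm(hidden_size)
        self.post_attn_norm = nn.LayerNorm(hidden_size)
        self.pre_ff_norm = nn.LayerNorm(hidden_size)
        self.post_ff_norm = nn.LayerNorm(hidden_size)
        self.attn_fn = attn_fn
        self.mlp_fn = mlp_fn

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Attention sub-block: pre-norm -> attention -> post-norm -> residual add
        residual = hidden_states
        hidden_states = self.post_attn_norm(self.attn_fn(self.pre_attn_norm(hidden_states)))
        hidden_states = residual + hidden_states
        # Feed-forward sub-block: pre-norm -> MLP -> post-norm -> residual add
        residual = hidden_states
        hidden_states = self.post_ff_norm(self.mlp_fn(self.pre_ff_norm(hidden_states)))
        return residual + hidden_states
```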
optimum/rbln/transformers/models/gemma2/modeling_gemma2.py (new file, +101)

@@ -0,0 +1,101 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ....utils import logging
+from ...models.decoderonly import (
+    RBLNDecoderOnlyModel,
+    RBLNDecoderOnlyModelForCausalLM,
+)
+from .gemma2_architecture import Gemma2Wrapper
+
+
+logger = logging.get_logger(__name__)
+
+
+class RBLNGemma2ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Gemma2 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    A class to convert and run pre-trained transformers based Gemma2ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2ForCausalLM model into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNGemma2ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ForCausalLMConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNGemma2ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+    ```python
+    from optimum.rbln import RBLNGemma2ForCausalLM
+    # Simple usage using rbln_* arguments
+    # `max_seq_len` is automatically inferred from the model config
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_batch_size=1,
+        rbln_tensor_parallel_size=4,
+    )
+    # Using a config dictionary
+    rbln_config = {
+        "batch_size": 1,
+        "max_seq_len": 8192,
+        "tensor_parallel_size": 4,
+    }
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=rbln_config
+    )
+    # Using a RBLNMistralForCausalLMConfig instance (recommended for type checking)
+    from optimum.rbln import RBLNGemma2ForCausalLMConfig
+    config = RBLNGemma2ForCausalLMConfig(
+        batch_size=1,
+        max_seq_len=8192,
+        tensor_parallel_size=4
+    )
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """
+
+    _decoder_wrapper_cls = Gemma2Wrapper
+
+
+class RBLNGemma2Model(RBLNDecoderOnlyModel):
+    """
+    The Gemma2 Model transformer without a language modeling head.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    A class to convert and run pre-trained transformers based Gemma2Model model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2Model model into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNGemma2ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ModelConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNGemma2ModelConfig`] class for all available configuration options.
+    """
+
+    _decoder_wrapper_cls = Gemma2Wrapper
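The docstring covers compilation only. An end-to-end sketch, assuming the usual `transformers` tokenizer API and that the compiled model exposes `generate()` like the other RBLN decoder-only causal LMs (cf. `generation_decoderonly.py` in the file list above):

```python
from transformers import AutoTokenizer

from optimum.rbln import RBLNGemma2ForCausalLM

# Compile the checkpoint for RBLN devices, then run standard greedy generation.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = RBLNGemma2ForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    export=True,
    rbln_batch_size=1,
    rbln_tensor_parallel_size=4,
)

inputs = tokenizer("The RBLN compiler turns this checkpoint into", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```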
optimum/rbln/transformers/models/gemma3/gemma3_architecture.py (+23 -19)

@@ -16,7 +16,6 @@ import copy
 from typing import Optional, Tuple, Union
 
 import torch
-from transformers.models.gemma3.modeling_gemma3 import Gemma3RMSNorm
 
 from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
@@ -64,6 +63,7 @@ class Gemma3TextModel(DecoderOnlyModel):
         global_block_tables: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
         lora_int_id: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
     ):
         # retrieve input_ids and inputs_embeds
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -94,13 +94,18 @@ class Gemma3TextModel(DecoderOnlyModel):
         else:
             seq_positions = cache_position[:, :1]
 
-
+        cache_seq_len, cache_offset, swa_attn_mask = self.get_swa_custom_op_args(position_ids, query_position)
+        sliding_cache_pos = (cache_seq_len, cache_offset)
 
+        all_hidden_states = () if output_hidden_states else None
         for layer_idx, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
             is_sliding = True if layer_idx in self.sliding_window_layers else False
+            is_sliding_decode = is_sliding and self.phase == "decode"
             hidden_states = layer(
                 hidden_states=hidden_states,
-                attention_mask=attention_mask,
+                attention_mask=swa_attn_mask if is_sliding_decode else attention_mask,
                 seq_positions=sliding_cache_pos if is_sliding else seq_positions,
                 past_key_values=past_key_values,
                 cos=cos_local if is_sliding else cos_global,
@@ -110,15 +115,14 @@ class Gemma3TextModel(DecoderOnlyModel):
             )
 
         hidden_states = self.get_last_layernorm()(hidden_states)
-
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        return hidden_states, all_hidden_states
 
 
 class Gemma3DecoderLayer(DecoderOnlyLayer):
-
-
-
-    def get_post_feedforward_layernorm(self) -> Gemma3RMSNorm:
-        return self._original_mod.post_feedforward_layernorm
+    _PRE_FF_LAYERNORM_ATTRS = ["pre_feedforward_layernorm"]
+    _POST_FF_LAYERNORM_ATTRS = ["post_feedforward_layernorm"]
 
     def forward(
         self,
@@ -158,13 +162,13 @@ class Gemma3DecoderLayer(DecoderOnlyLayer):
 
 
 class Gemma3Attention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.q_proj =
-        self.k_proj =
-        self.v_proj =
-        self.o_proj =
-        self.q_norm =
-        self.k_norm =
-
-    def get_attn_scale(self):
-        return
+    def __post_init__(self, self_attn):
+        self.q_proj = self_attn.q_proj
+        self.k_proj = self_attn.k_proj
+        self.v_proj = self_attn.v_proj
+        self.o_proj = self_attn.o_proj
+        self.q_norm = self_attn.q_norm
+        self.k_norm = self_attn.k_norm
+
+    def get_attn_scale(self, self_attn):
+        return self_attn.config.query_pre_attn_scalar**-0.5
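`Gemma3TextModel.forward` now threads `output_hidden_states` through the layer loop, recording the running hidden state before each layer and once more after the final norm, and selects the sliding-window mask and rotary values for sliding layers during decode. A minimal illustration of the collection pattern (HF-style tuple accumulation; `layers` and `final_norm` are generic placeholders, not the compiled submodules):

```python
import torch
import torch.nn as nn


def run_layers(
    hidden_states: torch.Tensor,
    layers: nn.ModuleList,
    final_norm: nn.Module,
    output_hidden_states: bool = False,
):
    # Record the input to every layer, then the post-norm output, mirroring
    # the Hugging Face output_hidden_states convention.
    all_hidden_states = () if output_hidden_states else None
    for layer in layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        hidden_states = layer(hidden_states)
    hidden_states = final_norm(hidden_states)
    if output_hidden_states:
        all_hidden_states += (hidden_states,)
    return hidden_states, all_hidden_states
```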
optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py (+42 -70)

@@ -16,7 +16,7 @@ from typing import Optional
 import rebel
 import torch
 
-from ...modeling_outputs import
+from ...modeling_outputs import RBLNGemma3ForCausalLMOutput
 from ..decoderonly.decoderonly_runtime_utils import RBLNPytorchRuntime
 from ..decoderonly.modeling_decoderonly import RBLNRuntimeModel
 
@@ -26,7 +26,6 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
         super().__init__(*args, **kwargs)
         self.image_prefill = RBLNPytorchRuntime(image_prefill)  # FIXME(taehoon)
         self.prefill = RBLNPytorchRuntime(self.runtime) if self.phase == "prefill" else None  # FIXME
-        self.decode = RBLNPytorchRuntime(self.runtime) if self.phase == "decode" else None
 
     def _prepare_prefill_inputs(self, *args, **kwargs):
         (
@@ -106,6 +105,8 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
         )
 
         step = 0
+        output_logits = []
+        all_hidden_states = [] if self.rbln_config.output_hidden_states else None
         while step < query_length:
             if self.rbln_config.use_image_prefill:
                 # Check if the prefill chunk is an image prefill
@@ -146,7 +147,7 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
             query_position = torch.tensor(num_processed_tokens - 1, dtype=torch.int16)
 
             if is_image_prefill:
-
+                outputs = self.image_prefill(
                     input_chunk,
                     cache_pos_chunk,
                     block_tables,
@@ -157,7 +158,7 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
                     lora_int_ids if self.rbln_config.use_lora else None,
                 )
             else:
-
+                outputs = self.prefill(
                     input_chunk,
                     cache_pos_chunk,
                     block_tables,
@@ -168,78 +169,49 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
                     lora_int_ids if self.rbln_config.use_lora else None,
                 )
 
+            if self.rbln_config.output_hidden_states:
+                output_logits.append(outputs[0])
+                all_hidden_states.append(tuple(outputs[1:]))
+            else:
+                output_logits.append(outputs)
+
             padded_cache_lengths += current_padded_cache_lengths
             step += num_processed_tokens
 
-        if
-
-
-
-
-
-
-    def decode_forward(
-        self,
-        inputs: torch.Tensor,
-        cache_position: torch.Tensor = None,
-        block_tables: torch.Tensor = None,
-        is_external_block_tables: bool = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_embed: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        local_block_tables: Optional[torch.Tensor] = None,
-        lora_int_ids: Optional[torch.Tensor] = None,
-    ) -> torch.FloatTensor:
-        if self.rbln_config.use_lora and lora_int_ids is None:
-            if self.lora_int_ids is None:
-                raise ValueError(
-                    "lora_int_id is required when using LoRA. "
-                    "You should call set_lora_int_ids() before forward() or pass lora_int_id to forward()."
-                )
-
-            lora_int_ids = self.lora_int_ids
-
-        if lora_int_ids is not None and lora_int_ids.shape[0] != self.batch_size:
-            raise ValueError(f"lora_int_ids size mismatch: got {lora_int_ids.shape[0]}, expected {self.batch_size}.")
-
-        batch_size = inputs.shape[0]
-        if batch_size != self.batch_size:
-            raise RuntimeError(
-                f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
-            )
+        if self.rbln_config.output_hidden_states:
+            num_hidden_layers = len(all_hidden_states[0]) - 1
+            concatenated_hidden_states = ()
+            for l_idx in range(num_hidden_layers + 1):
+                l_hidden_states = torch.cat([hidden_states[l_idx] for hidden_states in all_hidden_states], dim=1)
+                l_hidden_states = l_hidden_states[:, :query_length, :]
+                concatenated_hidden_states += (l_hidden_states,)
 
-
-            raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
+            all_hidden_states = concatenated_hidden_states
 
-        #
-
-
-
-        if local_block_tables is None:
-            raise ValueError("local_block_tables should be provided with external block tables.")
+        # Aggregate output_logits
+        output_logits = torch.concat(output_logits, dim=-2)
+        if self.rbln_config.logits_to_keep > 0:
+            output_logits = output_logits[:, -self.rbln_config.logits_to_keep :, :]
         else:
-
-
-
-
-
-
-
-
-
-
-                f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
-            )
-            self.dec_attn_mask[b_idx, decoding_step] = 1
-
-        attention_mask = self.dec_attn_mask
-
-        if self.batch_size < block_tables.shape[0]:
-            block_tables = block_tables[: self.batch_size]
+            output_logits = output_logits[:, :query_length, :]
+            # index copy for masked output_logits
+            if attention_mask is not None:
+                new_output_logits = torch.full(
+                    (1, attention_mask.shape[-1], output_logits.shape[-1]),
+                    fill_value=1e-10,
+                    dtype=output_logits.dtype,
+                )
+                mask_indices = torch.nonzero(attention_mask, as_tuple=True)[0]
+                new_output_logits.index_copy_(dim=-2, index=mask_indices, source=output_logits)
 
-
-            attention_mask = attention_mask[: self.batch_size]
+                output_logits = new_output_logits
 
-
+        if not is_external_block_tables:
+            self.dec_attn_mask[batch_idx : batch_idx + 1] = chunked_attention_mask
 
-        return
+        return RBLNGemma3ForCausalLMOutput(
+            logits=output_logits,
+            padded_cache_lengths=padded_cache_lengths,
+            attention_mask=chunked_attention_mask,
+            hidden_states=all_hidden_states,
+        )
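The Gemma3-specific `decode_forward` override is removed (presumably deferring to the shared decoder-only runtime, cf. `decoderonly_runtime_utils.py` +205 above), while the prefill path gains the aggregation shown here: per-chunk logits (and, optionally, hidden states) are collected, concatenated along the sequence axis, trimmed to the real query length or `logits_to_keep`, and scattered back into the positions flagged by the attention mask. A standalone sketch of that concat-and-scatter step, with illustrative shapes:

```python
# Standalone sketch of the chunk aggregation in the new prefill path; tensor
# shapes are illustrative, not the compiled model's real dimensions.
import torch

chunk_logits = [torch.randn(1, 128, 256) for _ in range(3)]  # three 128-token prefill chunks
attention_mask = torch.zeros(300, dtype=torch.long)
attention_mask[10:] = 1                                      # left-padded: 290 real tokens
query_length = int(attention_mask.sum())

# Concatenate along the sequence axis and drop the padded tail of the last chunk.
logits = torch.concat(chunk_logits, dim=-2)[:, :query_length, :]

# Scatter the valid rows back into a full-length tensor at the masked positions.
full_logits = torch.full((1, attention_mask.shape[-1], logits.shape[-1]), 1e-10, dtype=logits.dtype)
mask_indices = torch.nonzero(attention_mask, as_tuple=True)[0]
full_logits.index_copy_(dim=-2, index=mask_indices, source=logits)
```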