PyPI - optimum-rbln - Versions diffs - 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl - Mend

optimum-rbln 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

optimum/rbln/transformers/models/gemma2/configuration_gemma2.py ADDED Viewed

@@ -0,0 +1,45 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+class RBLNGemma2ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelForCausalLMConfig.
+    Example usage:
+    ```python
+    from optimum.rbln import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig
+    # Create a configuration object
+    config = RBLNGemma2ForCausalLMConfig(
+        batch_size=1,
+        max_seq_len=8192,
+        tensor_parallel_size=4
+    )
+    # Use the configuration with from_pretrained
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """
+class RBLNGemma2ModelConfig(RBLNDecoderOnlyModelConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelConfig.
+    """

optimum/rbln/transformers/models/gemma2/gemma2_architecture.py ADDED Viewed

@@ -0,0 +1,83 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+import torch
+from ...models.decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyLayer, DecoderOnlyModel
+from ..decoderonly.decoderonly_architecture import DecoderOnlyWrapper
+class Gemma2Wrapper(DecoderOnlyWrapper):
+    def get_rbln_layer_class(self):
+        return Gemma2DecoderLayer
+    def get_rbln_attn_class(self):
+        return Gemma2Attention
+    def get_rbln_model_class(self):
+        return Gemma2Model
+class Gemma2DecoderLayer(DecoderOnlyLayer):
+    _PRE_FF_LAYERNORM_ATTRS = ["pre_feedforward_layernorm"]
+    _POST_FF_LAYERNORM_ATTRS = ["post_feedforward_layernorm"]
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        seq_positions: Union[torch.LongTensor, Tuple[torch.LongTensor]],
+        past_key_values: Tuple[Tuple[torch.Tensor]],
+        cos: Optional[torch.Tensor] = None,
+        sin: Optional[torch.Tensor] = None,
+        block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
+    ):
+        residual = hidden_states
+        hidden_states = self.get_pre_attention_layernorm()(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            seq_positions=seq_positions,
+            past_key_values=past_key_values,
+            cos=cos,
+            sin=sin,
+            block_tables=block_tables,
+            lora_int_id=lora_int_id,
+        )
+        hidden_states = self.get_post_attention_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.get_pre_feedforward_layernorm()(hidden_states)
+        hidden_states = self.forward_mlp(hidden_states, lora_int_id)
+        hidden_states = self.get_post_feedforward_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+class Gemma2Attention(DecoderOnlyAttention):
+    def get_attn_scale(self, self_attn):
+        return self_attn.config.query_pre_attn_scalar**-0.5
+class Gemma2Model(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self.config.hidden_size**0.5

optimum/rbln/transformers/models/gemma2/modeling_gemma2.py ADDED Viewed

@@ -0,0 +1,101 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ....utils import logging
+from ...models.decoderonly import (
+    RBLNDecoderOnlyModel,
+    RBLNDecoderOnlyModelForCausalLM,
+)
+from .gemma2_architecture import Gemma2Wrapper
+logger = logging.get_logger(__name__)
+class RBLNGemma2ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Gemma2 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Gemma2ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2ForCausalLM model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNGemma2ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ForCausalLMConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNGemma2ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNGemma2ForCausalLM
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNGemma2ForCausalLM.from_pretrained(
+            "google/gemma-2-9b",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "max_seq_len": 8192,
+            "tensor_parallel_size": 4,
+        }
+        model = RBLNGemma2ForCausalLM.from_pretrained(
+            "google/gemma-2-9b",
+            export=True,
+            rbln_config=rbln_config
+        )
+        # Using a RBLNGemma2ForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNGemma2ForCausalLMConfig
+        config = RBLNGemma2ForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=8192,
+            tensor_parallel_size=4
+        )
+        model = RBLNGemma2ForCausalLM.from_pretrained(
+            "google/gemma-2-9b",
+            export=True,
+            rbln_config=config
+        )
+        ```
+    """
+    _decoder_wrapper_cls = Gemma2Wrapper
+class RBLNGemma2Model(RBLNDecoderOnlyModel):
+    """
+    The Gemma2 Model transformer without a language modeling head.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Gemma2Model model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2Model model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNGemma2ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ModelConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNGemma2ModelConfig`] class for all available configuration options.
+    """
+    _decoder_wrapper_cls = Gemma2Wrapper

optimum/rbln/transformers/models/gemma3/configuration_gemma3.py CHANGED Viewed

@@ -58,13 +58,8 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
         )
         self.image_prefill_chunk_size = image_prefill_chunk_size
-    @property
-    def use_image_prefill(self):
-        return self.image_prefill_chunk_size is not None
-    @property
-    def decoder_runtime_idx(self):
-        return 2 if self.use_image_prefill else 1
+        if not (self.use_attention_mask and self.use_position_ids):
+            raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")
 class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):

optimum/rbln/transformers/models/gemma3/gemma3_architecture.py CHANGED Viewed

@@ -16,7 +16,6 @@ import copy
 from typing import Optional, Tuple, Union
 import torch
-from transformers.models.gemma3.modeling_gemma3 import Gemma3RMSNorm
 from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
@@ -95,16 +94,18 @@ class Gemma3TextModel(DecoderOnlyModel):
         else:
             seq_positions = cache_position[:, :1]
-        sliding_cache_pos = self.get_local_cache_positions(position_ids, query_position)
+        cache_seq_len, cache_offset, swa_attn_mask = self.get_swa_custom_op_args(position_ids, query_position)
+        sliding_cache_pos = (cache_seq_len, cache_offset)
         all_hidden_states = () if output_hidden_states else None
         for layer_idx, layer in enumerate(self.layers):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             is_sliding = True if layer_idx in self.sliding_window_layers else False
+            is_sliding_decode = is_sliding and self.phase == "decode"
             hidden_states = layer(
                 hidden_states=hidden_states,
-                attention_mask=attention_mask,
+                attention_mask=swa_attn_mask if is_sliding_decode else attention_mask,
                 seq_positions=sliding_cache_pos if is_sliding else seq_positions,
                 past_key_values=past_key_values,
                 cos=cos_local if is_sliding else cos_global,
@@ -120,11 +121,8 @@ class Gemma3TextModel(DecoderOnlyModel):
 class Gemma3DecoderLayer(DecoderOnlyLayer):
-    def get_pre_feedforward_layernorm(self) -> Gemma3RMSNorm:
-        return self._original_mod.pre_feedforward_layernorm
-    def get_post_feedforward_layernorm(self) -> Gemma3RMSNorm:
-        return self._original_mod.post_feedforward_layernorm
+    _PRE_FF_LAYERNORM_ATTRS = ["pre_feedforward_layernorm"]
+    _POST_FF_LAYERNORM_ATTRS = ["post_feedforward_layernorm"]
     def forward(
         self,
@@ -164,13 +162,13 @@ class Gemma3DecoderLayer(DecoderOnlyLayer):
 class Gemma3Attention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.o_proj = self._original_mod.o_proj
-        self.q_norm = self._original_mod.q_norm
-        self.k_norm = self._original_mod.k_norm
-    def get_attn_scale(self):
-        return self._original_mod.config.query_pre_attn_scalar**-0.5
+    def __post_init__(self, self_attn):
+        self.q_proj = self_attn.q_proj
+        self.k_proj = self_attn.k_proj
+        self.v_proj = self_attn.v_proj
+        self.o_proj = self_attn.o_proj
+        self.q_norm = self_attn.q_norm
+        self.k_norm = self_attn.k_norm
+    def get_attn_scale(self, self_attn):
+        return self_attn.config.query_pre_attn_scalar**-0.5

optimum/rbln/transformers/models/gemma3/modeling_gemma3.py CHANGED Viewed

@@ -13,11 +13,9 @@
 # limitations under the License.
 import importlib
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
-import rebel
 import torch
-from rebel.compile_context import CompileContext
 from transformers import AutoModelForImageTextToText, Gemma3ForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.modeling_utils import no_init_weights
@@ -29,10 +27,7 @@ from ...modeling_outputs import RBLNDecoderOnlyOutput
 from ...utils.rbln_runtime_wrapper import LoopProcessor
 from ..decoderonly.decoderonly_runtime_utils import RBLNPageTableManager
 from ..decoderonly.generation_decoderonly import RBLNDecoderOnlyGenerationMixin
-from ..decoderonly.modeling_decoderonly import (
-    RBLNDecoderOnlyModelForCausalLM,
-)
-from .configuration_gemma3 import RBLNGemma3ForCausalLMConfig
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .gemma3_architecture import Gemma3ForCausalLMWrapper
 from .gemma3_runtime_utils import RBLNGemma3RuntimeModel
@@ -325,7 +320,7 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
                         batch_size,
                         inputs_embeds.shape[1],
                         self.config.text_config.hidden_size,
-                        dtype=self.rbln_config.torch_dtype,
+                        dtype=self.rbln_config.dtype,
                     )
                     for _ in range(self.config.text_config.num_hidden_layers + 1)
                 )
@@ -455,174 +450,7 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
                 f"Image prefill chunk size is different from mm_tokens_per_image: {rbln_config.image_prefill_chunk_size} != {model.config.mm_tokens_per_image}"
             )
-        return rbln_config
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
-        model: Optional["PreTrainedModel"] = None,
-        model_config: Optional["PretrainedConfig"] = None,
-        rbln_config: Optional[RBLNGemma3ForCausalLMConfig] = None,
-    ) -> RBLNGemma3ForCausalLMConfig:
-        # Update rbln_config with super class
-        rbln_config = super()._update_rbln_config(preprocessors, model, model_config, rbln_config)
-        if not (rbln_config.use_attention_mask and rbln_config.use_position_ids):
-            raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")
-        if rbln_config.use_image_prefill:
-            if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
-                raise NotImplementedError(
-                    "Not implemented for different prefill chunk sizes between text and image prefill."
-                )
-            # Update image prefill compile config
-            img_prefill_input_info = cls.get_input_info(
-                batch_size=1,
-                query_length=rbln_config.image_prefill_chunk_size,
-                rbln_config=rbln_config,
-                model_config=model_config,
-            )
-            image_prefill_compile_config = RBLNCompileConfig(
-                compiled_model_name="image_prefill", input_info=img_prefill_input_info
-            )
-            # Insert image_prefill compile config at index 1
-            compile_cfgs = rbln_config.compile_cfgs
-            compile_cfgs.insert(1, image_prefill_compile_config)
-            rbln_config.set_compile_cfgs(compile_cfgs)
+        if "image_prefill" not in rbln_config.phases:
+            rbln_config.phases = ["prefill", "image_prefill", "decode"]
         return rbln_config
-    @classmethod
-    @torch.inference_mode()
-    def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNGemma3ForCausalLMConfig):
-        wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
-        rbln_compile_configs = rbln_config.compile_cfgs
-        prefill_compile_config = rbln_compile_configs[0]
-        context = CompileContext(use_weight_sharing=True)
-        # Here we use meta tensor, for the memory efficiency.
-        meta_tensor_names = [name for name, _, _ in prefill_compile_config.input_info if "past_key_values" in name]
-        prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
-        # Mark static tensors (self kv states)
-        static_tensors = {}
-        for (name, _, _), tensor in zip(prefill_compile_config.input_info, prefill_example_inputs):
-            if "past_key_values" in name:
-                static_tensors[name] = tensor
-                context.mark_static_address(tensor)
-        def compile_model(wrapped_model, compile_config, example_inputs, compile_context, quantization):
-            try:
-                if quantization:
-                    quantization.maybe_set_quantization_env()
-                original_linear = torch.nn.functional.linear
-                torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
-                compiled_model = cls.compile(
-                    wrapped_model,
-                    compile_config,
-                    create_runtimes=rbln_config.create_runtimes,
-                    device=rbln_config.device,
-                    example_inputs=example_inputs,
-                    compile_context=compile_context,
-                )
-                return compiled_model
-            finally:
-                torch.nn.functional.linear = original_linear
-                if quantization:
-                    quantization.maybe_reset_quantization_env()
-        wrapped_model.phase = "prefill"
-        compiled_prefill = compile_model(
-            wrapped_model,
-            prefill_compile_config,
-            prefill_example_inputs,
-            context,
-            rbln_config.quantization,
-        )
-        compiled_models = {"prefill": compiled_prefill}
-        if rbln_config.use_image_prefill:
-            image_prefill_compile_config = rbln_compile_configs[1]
-            image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
-                fill=0, static_tensors=static_tensors
-            )
-            wrapped_model.phase = "image_prefill"
-            compiled_image_prefill = compile_model(
-                wrapped_model,
-                image_prefill_compile_config,
-                image_prefill_example_inputs,
-                context,
-                rbln_config.quantization,
-            )
-            compiled_models["image_prefill"] = compiled_image_prefill
-        wrapped_model.phase = "decode"
-        for batch_size, dec_compile_config in zip(
-            rbln_config.decoder_batch_sizes, rbln_compile_configs[rbln_config.decoder_runtime_idx :]
-        ):
-            dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
-            compiled_decoder = compile_model(
-                wrapped_model,
-                dec_compile_config,
-                dec_example_inputs,
-                context,
-                rbln_config.quantization,
-            )
-            compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder
-        return compiled_models
-    @classmethod
-    def _create_runtimes(
-        cls,
-        compiled_models: List[rebel.RBLNCompiledModel],
-        rbln_config: RBLNGemma3ForCausalLMConfig,
-    ) -> List[rebel.Runtime]:
-        expected_model_names = [
-            "prefill",
-            *[f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes],
-        ]
-        if rbln_config.use_image_prefill:
-            expected_model_names.insert(1, "image_prefill")
-        if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
-            cls._raise_missing_compiled_file_error(expected_model_names)
-        ret_val = [
-            rebel.Runtime(
-                compiled_models[0],
-                tensor_type="pt",
-                device=rbln_config.device_map["prefill"],
-                activate_profiler=rbln_config.activate_profiler,
-                timeout=rbln_config.timeout,
-            )
-        ]
-        if rbln_config.use_image_prefill:
-            ret_val.append(
-                rebel.Runtime(
-                    compiled_models[1],
-                    tensor_type="pt",
-                    device=rbln_config.device_map["image_prefill"],
-                    activate_profiler=rbln_config.activate_profiler,
-                    timeout=rbln_config.timeout,
-                ),
-            )
-        ret_val.extend(
-            [
-                rebel.Runtime(
-                    compiled_models[i + rbln_config.decoder_runtime_idx],
-                    tensor_type="pt",
-                    device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
-                    activate_profiler=rbln_config.activate_profiler,
-                    timeout=rbln_config.timeout,
-                )
-                for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
-            ]
-        )
-        return ret_val

optimum/rbln/transformers/models/gpt2/gpt2_architecture.py CHANGED Viewed

@@ -20,8 +20,6 @@ import torch.nn as nn
 from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-    DecoderOnlyLayer,
-    DecoderOnlyModel,
     DecoderOnlyWrapper,
 )
@@ -34,12 +32,6 @@ class GPT2Wrapper(DecoderOnlyWrapper):
     def get_rbln_attn_class(self):
         return GPT2Attention
-    def get_rbln_layer_class(self):
-        return GPT2Layer
-    def get_rbln_model_class(self):
-        return GPT2Model
     def get_attn_layer(self, layer: nn.Module):
         return layer.attn
@@ -50,30 +42,12 @@ class GPT2Wrapper(DecoderOnlyWrapper):
         return model.transformer.h if self.is_causal_lm else model.h
-class GPT2Model(DecoderOnlyModel):
-    def get_last_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_f
-    def get_embedding(self) -> nn.Embedding:
-        return self._original_mod.wte
-    def get_pos_embedding(self) -> nn.Embedding:
-        return self._original_mod.wpe
-class GPT2Layer(DecoderOnlyLayer):
-    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_1
-    def get_post_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_2
 class GPT2Attention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.c_attn = self._original_mod.c_attn
-        self.o_proj = self._original_mod.c_proj
-        self.split_size = self._original_mod.split_size
+    def __post_init__(self, self_attn):
+        self.c_attn = self_attn.c_attn
+        self.o_proj = self_attn.c_proj
+        self.split_size = self_attn.split_size
+        self.num_key_value_heads = self_attn.num_heads
     def projection(self, hidden_states, lora_int_id) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         if lora_int_id is not None:
@@ -82,12 +56,12 @@ class GPT2Attention(DecoderOnlyAttention):
         query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
         return query_states, key_states, value_states
-    def get_attn_scale(self):
+    def get_attn_scale(self, self_attn):
         scale = 1.0
-        if self._original_mod.scale_attn_weights:
+        if self_attn.scale_attn_weights:
             scale /= math.sqrt(self.head_dim)
-        if self._original_mod.scale_attn_by_inverse_layer_idx:
+        if self_attn.scale_attn_by_inverse_layer_idx:
             scale /= 1 + self.layer_idx
         return scale

optimum/rbln/transformers/models/gpt_oss/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .configuration_gpt_oss import RBLNGptOssForCausalLMConfig
+from .modeling_gpt_oss import RBLNGptOssForCausalLM

optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py ADDED Viewed

@@ -0,0 +1,42 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
+class RBLNGptOssForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
+    """
+    Configuration class for RBLN GptOss models.
+    This class is an alias of RBLNDecoderOnlyModelForCausalLMConfig.
+    Example usage:
+    ```python
+    from optimum.rbln import RBLNGptOssForCausalLM, RBLNGptOssForCausalLMConfig
+    # Create a configuration object
+    config = RBLNGptOssForCausalLMConfig(
+        batch_size=1,
+        tensor_parallel_size=8,
+        kvcache_partition_len=8192,
+    )
+    # Use the configuration with from_pretrained
+    model = RBLNGptOssForCausalLM.from_pretrained(
+        "openai/gpt-oss-20b",
+        export=True,
+        rbln_config=config,
+    )
+    ```
+    """

optimum-rbln 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl

optimum-rbln 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl