optimum-rbln 0.9.4a2__py3-none-any.whl → 0.9.5a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +36 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +35 -16
- optimum/rbln/modeling_base.py +6 -6
- optimum/rbln/ops/__init__.py +1 -0
- optimum/rbln/ops/attn.py +10 -0
- optimum/rbln/ops/flash_attn.py +8 -0
- optimum/rbln/ops/moe.py +180 -0
- optimum/rbln/ops/sliding_window_attn.py +9 -0
- optimum/rbln/transformers/__init__.py +36 -0
- optimum/rbln/transformers/modeling_attention_utils.py +118 -222
- optimum/rbln/transformers/modeling_outputs.py +25 -0
- optimum/rbln/transformers/modeling_rope_utils.py +78 -42
- optimum/rbln/transformers/models/__init__.py +28 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -21
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +118 -16
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +121 -48
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +75 -107
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
- optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
- optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +1 -1
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
- optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
- optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
- optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
- optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
- optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
- optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
- optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
- optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
- optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
- optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
- optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
- optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
- optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
- optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
- optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
- optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
- optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
- optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
- optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
- optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
- optimum/rbln/utils/import_utils.py +16 -1
- optimum/rbln/utils/runtime_utils.py +10 -6
- optimum/rbln/utils/submodule.py +24 -0
- {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
- {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +81 -62
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
- {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -26,15 +26,16 @@ from transformers.modeling_utils import no_init_weights
 from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
+from ....utils.runtime_utils import is_compiler_supports_buffer_resize
 from ...modeling_attention_utils import (
     RBLNDecoderOnlyFlashAttentionMixin,
     set_default_values,
     validate_attention_method,
     validate_sliding_window,
 )
-from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ...modeling_outputs import RBLNDecoderOnlyOutput, _validate_output_hidden_states
 from ...utils.rbln_quantization import get_quantized_model
-from .configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+from .configuration_decoderonly import KVCacheMeta, RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
 from .decoderonly_architecture import DecoderOnlyWrapper
 from .decoderonly_runtime_utils import RBLNPageTableManager, RBLNRuntimeModel
 from .generation_decoderonly import RBLNDecoderOnlyGenerationMixin
@@ -230,7 +231,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
         quantization=None,
         phase: str = "prefill",
-    ):
+    ) -> rebel.RBLNCompiledModel:
         try:
             wrapped_model.phase = phase
             if quantization:
@@ -252,21 +253,15 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
            quantization.maybe_reset_quantization_env()

    @classmethod
-    def _get_compile_context(
-        cls,
-        compile_config: RBLNCompileConfig,
-        example_inputs: List[torch.Tensor],
-    ):
+    def _get_compile_context(cls, compile_config: RBLNCompileConfig, example_inputs: List[torch.Tensor]):
        context = CompileContext(use_weight_sharing=True)

        # Mark static tensors (self kv states)
        static_tensors = {}
-        idx = 0
        for (name, _, _), tensor in zip(compile_config.input_info, example_inputs):
            if "past_key_values" in name:
                static_tensors[name] = tensor
-                context.mark_static_address(tensor,
-                idx += 1
+                context.mark_static_address(tensor, name)

        return context, static_tensors

@@ -281,7 +276,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
        prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
        context, static_tensors = cls._get_compile_context(prefill_compile_config, prefill_example_inputs)

-        compiled_models = {}
+        compiled_models: dict[str, rebel.RBLNCompiledModel] = {}
        compiled_models["prefill"] = cls._compile_model(
            wrapped_model,
            prefill_compile_config,
@@ -307,14 +302,10 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
            )
            compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder

-
-
-
-
-            compiled_models=compiled_models,
-            model_config=model.config,
-            rbln_config=rbln_config,
-        )
+        if rbln_config.is_auto_num_blocks:
+            if not is_compiler_supports_buffer_resize():
+                raise RuntimeError("`kvcache_num_blocks` must be set.")
+            cls.set_kvcache_num_blocks_after_compilation(compiled_models, rbln_config)

        return compiled_models

@@ -330,8 +321,8 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
        return model

    @classmethod
-    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
-        return use_local_attention
+    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True, logits_to_keep: int = None):
+        return is_prefill and (use_local_attention or logits_to_keep == 1)

    @classmethod
    def get_input_info(
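`use_query_position` now takes `logits_to_keep` and only requests the extra `query_position` input during prefill, when either local attention is active or only the last logit is kept. Restating the predicate as standalone Python (illustration only, not library code):

```python
# Standalone re-statement of the predicate introduced in the hunk above.
def use_query_position(use_local_attention: bool, is_prefill: bool = True, logits_to_keep: int = None) -> bool:
    return is_prefill and (use_local_attention or logits_to_keep == 1)

assert use_query_position(True, is_prefill=True) is True                      # local attention during prefill
assert use_query_position(False, is_prefill=True, logits_to_keep=1) is True   # only the last logit is kept
assert use_query_position(False, is_prefill=True) is False                    # plain prefill, all logits kept
assert use_query_position(True, is_prefill=False) is False                    # decode phase never needs it
```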
@@ -350,7 +341,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

        input_info = []
        if rbln_config.use_inputs_embeds:
-            input_info.append(("inputs_embeds", [batch_size, query_length, hidden_size], rbln_config.
+            input_info.append(("inputs_embeds", [batch_size, query_length, hidden_size], rbln_config.dtype))
        else:
            input_info.append(("input_ids", [batch_size, query_length], "int64"))

@@ -364,15 +355,15 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
        if rbln_config.use_local_attention:
            input_info.append(("local_block_tables", [1] if is_prefill else [batch_size, 1], "int16"))

-        if cls.use_query_position(rbln_config.use_local_attention, is_prefill):
+        if cls.use_query_position(rbln_config.use_local_attention, is_prefill, rbln_config.logits_to_keep):
            input_info.append(("query_position", [], "int16"))

        if rbln_config.use_attention_mask:
            if rbln_config.use_position_ids:
-                input_info.append(("attention_mask", [batch_size, rbln_config.max_seq_len], rbln_config.
+                input_info.append(("attention_mask", [batch_size, rbln_config.max_seq_len], rbln_config.dtype))
            else:
                input_info.append(
-                    ("attention_mask", [batch_size, 1, query_length, rbln_config.max_seq_len], rbln_config.
+                    ("attention_mask", [batch_size, 1, query_length, rbln_config.max_seq_len], rbln_config.dtype)
                )

        if rbln_config.use_position_ids:
@@ -381,29 +372,36 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
        if rbln_config.use_lora:
            input_info.append(("lora_int_ids", [batch_size], "int32"))

-
-
-
+        if len(rbln_config.kvcache_metas) > 0:
+            # Meta is already set, use it
+            input_info.extend(
+                [
+                    (kvcache_meta.name, kvcache_meta.compile_shape, kvcache_meta.dtype)
+                    for kvcache_meta in rbln_config.kvcache_metas
+                ]
+            )

-
-        rbln_config.
-
-
-
-
-
-
-            (
-
-
-
-
-                kvcache_dtype,
+        else:
+            kvcache_dtype = rbln_config.dtype
+            if rbln_config.quantization and rbln_config.quantization.kv_caches == "fp8":
+                kvcache_dtype = "float8_e4m3fn"
+
+            kvcache_metas = []
+            for i in range(num_hidden_layers * 2):
+                layer_idx = i // 2
+                name = f"past_key_values_{i}"
+                kvcache_meta = KVCacheMeta.make(
+                    name,
+                    layer_idx,
+                    num_key_value_heads,
+                    head_dim,
+                    RBLNCompileConfig.normalize_dtype(kvcache_dtype),
+                    rbln_config,
                )
-
-
-
+                kvcache_metas.append(kvcache_meta)
+                input_info.append((name, kvcache_meta.compile_shape, kvcache_meta.dtype))
+
+            rbln_config.kvcache_metas.extend(kvcache_metas)

        return input_info

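The `else` branch above builds one `KVCacheMeta` per key/value cache tensor, two per hidden layer, named `past_key_values_{i}` with `layer_idx = i // 2`. A toy illustration of the naming it produces (plain Python, not library code):

```python
# Toy illustration of the naming loop above; two cache entries share each layer index.
num_hidden_layers = 2
entries = [(f"past_key_values_{i}", i // 2) for i in range(num_hidden_layers * 2)]
print(entries)
# [('past_key_values_0', 0), ('past_key_values_1', 0),
#  ('past_key_values_2', 1), ('past_key_values_3', 1)]
```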
@@ -475,51 +473,39 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
            max_seq_len=rbln_config.max_seq_len,
        )

-
-
-        #
+        # Validate kvcache_num_blocks based on the number of full blocks required.
+        # Eager mode restriction:
+        # - num_blocks must be at least equal to the batch size
+        # Flash attention restriction:
+        # - num_blocks must be at least equal to (max_seq_len // kvcache_block_size) + 1
+        # - num_blocks must be no greater than the number of full blocks.
        if rbln_config.attn_impl == "flash_attn":
-
-
-
+            if rbln_config.is_auto_num_blocks:
+                # Do nothing
+                pass

-
-            if
-
-
-
+            else:
+                if rbln_config.kvcache_num_blocks > rbln_config.num_full_blocks:
+                    logger.warning(
+                        f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
+                        f" than the required number of blocks ({rbln_config.num_full_blocks})."
+                        "This can cause a failure during model compilation."
+                    )
+                elif rbln_config.kvcache_num_blocks < rbln_config.num_min_blocks:
+                    raise ValueError(
+                        f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is less"
+                        f" than the minimum number of blocks ({rbln_config.num_min_blocks})."
                    )
-            if min_blocks_for_flash > estimated_max_num_blocks:
-                # NOTE: Just try to compile with lower bound of blocks for flash attention.
-                # Even if it's larger than the estimated maximum number of blocks.
-                rbln_config.kvcache_num_blocks = min_blocks_for_flash
-            else:
-                logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")
-                rbln_config.kvcache_num_blocks = estimated_max_num_blocks
-
-            if rbln_config.kvcache_num_blocks < rbln_config.batch_size:
-                raise RuntimeError(
-                    f"Batch size ({rbln_config.batch_size}) exceeds num_blocks ({rbln_config.kvcache_num_blocks}). "
-                    "Ensure the number of blocks is at least equal to the batch size."
-                )
-            else:
-                rbln_config.kvcache_num_blocks = num_full_blocks
-        elif rbln_config.kvcache_num_blocks > estimated_max_num_blocks:
-            logger.warning(
-                f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
-                f" than the estimated maximum number of blocks ({estimated_max_num_blocks})."
-                "This can cause a failure during model compilation."
-            )
        else:
-            if rbln_config.
-
-
+            if rbln_config.is_auto_num_blocks:
+                # Eager attention should use fixed number of blocks.
+                rbln_config.kvcache_num_blocks = rbln_config.num_full_blocks
+            elif rbln_config.kvcache_num_blocks > rbln_config.num_full_blocks:
                logger.warning(
                    f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
-                    f" than the required number of blocks ({num_full_blocks})."
+                    f" than the required number of blocks ({rbln_config.num_full_blocks})."
                    "This can cause a failure during model compilation."
                )
-        logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

        return rbln_config

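The hunk above replaces the old estimate-based fallback with two explicit paths: an auto path (`is_auto_num_blocks`, resolved after compilation when the compiler supports buffer resizing, per the earlier hunk) and a user-set path validated against `num_full_blocks` / `num_min_blocks`. A hedged sketch of pinning the value explicitly at export time; the model id and the surrounding argument values are placeholders, only `kvcache_num_blocks`, `attn_impl`, and `max_seq_len` are the config fields exercised here:

```python
# Sketch only: explicitly setting the KV-cache block count at export time.
from optimum.rbln import RBLNLlamaForCausalLM

model = RBLNLlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",      # placeholder checkpoint
    export=True,
    rbln_config={
        "attn_impl": "flash_attn",
        "max_seq_len": 4096,
        "kvcache_num_blocks": 17,    # explicit value; omit it to take the auto path
    },
)
```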
@@ -643,15 +629,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
            raise ValueError(
                f"Batch size ({batch_size}) must be equal to the batch size of the model ({self.rbln_config.batch_size})."
            )
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)

        all_last_hidden_states = []
        all_hidden_states = (
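Both call sites now delegate to `_validate_output_hidden_states`, whose body lives in `modeling_outputs.py` and is not part of this excerpt. Judging from the inline code it replaces, the helper presumably amounts to the following sketch:

```python
# Hypothetical reconstruction of the shared helper; the actual implementation is in
# optimum/rbln/transformers/modeling_outputs.py and is not shown in this diff excerpt.
def _validate_output_hidden_states(output_hidden_states, rbln_config):
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else rbln_config.output_hidden_states
    )
    if output_hidden_states != rbln_config.output_hidden_states:
        raise ValueError(
            f"Variable output_hidden_states {output_hidden_states} is not equal to "
            f"rbln_config.output_hidden_states {rbln_config.output_hidden_states}. "
            "Please compile again with the correct argument."
        )
    return output_hidden_states
```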
@@ -660,7 +638,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
                    self.rbln_config.batch_size,
                    inputs.shape[1],
                    self.config.hidden_size,
-                    dtype=self.rbln_config.
+                    dtype=self.rbln_config.dtype,
                )
                for _ in range(self.config.num_hidden_layers + 1)
            )
@@ -700,6 +678,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):
    1. Converting pre-trained transformer models to RBLN-optimized format
    2. Handling the compilation process for RBLN devices
    3. Managing inference operations for causal language modeling
+
    This class inherits from RBLNModel and implements specific methods required for
    decoder-only architectures and causal language modeling tasks.

@@ -716,10 +695,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):
    def logits_last_dim(self):
        return self.config.vocab_size

-    @classmethod
-    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
-        return is_prefill
-
    def set_lora_int_ids(self, lora_int_ids: Optional[torch.Tensor]):
        if isinstance(lora_int_ids, int):
            lora_int_ids = torch.tensor([lora_int_ids], dtype=torch.int32)
@@ -803,14 +778,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):
        )
        padded_cache_lengths = torch.zeros_like(generate_idx)

-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)

        # Prefill
        if cache_position is None:
@@ -829,7 +797,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):

            all_hidden_states = (
                tuple(
-                    torch.zeros(batch_size, input_len, self.config.hidden_size, dtype=self.rbln_config.
+                    torch.zeros(batch_size, input_len, self.config.hidden_size, dtype=self.rbln_config.dtype)
                    for _ in range(self.config.num_hidden_layers + 1)
                )
                if self.rbln_config.output_hidden_states
optimum/rbln/transformers/models/exaone/exaone_architecture.py

@@ -18,9 +18,6 @@ import torch.nn as nn

 from ....utils import logging
 from ...models.decoderonly.decoderonly_architecture import (
-    DecoderOnlyAttention,
-    DecoderOnlyLayer,
-    DecoderOnlyModel,
     DecoderOnlyWrapper,
 )

@@ -42,36 +39,3 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):

    def get_model_layer(self, causal_lm: "ExaoneForCausalLM"):
        return causal_lm.transformer
-
-    def get_rbln_attn_class(self):
-        return ExaoneAttention
-
-    def get_rbln_layer_class(self):
-        return ExaoneLayer
-
-    def get_rbln_model_class(self):
-        return ExaoneModel
-
-
-class ExaoneModel(DecoderOnlyModel):
-    def get_embedding(self) -> nn.Embedding:
-        return self._original_mod.wte
-
-    def get_last_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_f
-
-
-class ExaoneLayer(DecoderOnlyLayer):
-    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_1
-
-    def get_post_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_2
-
-
-class ExaoneAttention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.o_proj = self._original_mod.out_proj
optimum/rbln/transformers/models/gemma2/__init__.py (new file)

@@ -0,0 +1,16 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_gemma2 import RBLNGemma2ForCausalLMConfig, RBLNGemma2ModelConfig
+from .modeling_gemma2 import RBLNGemma2ForCausalLM, RBLNGemma2Model
optimum/rbln/transformers/models/gemma2/configuration_gemma2.py (new file)

@@ -0,0 +1,45 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+
+
+class RBLNGemma2ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelForCausalLMConfig.
+    Example usage:
+    ```python
+    from optimum.rbln import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig
+    # Create a configuration object
+    config = RBLNGemma2ForCausalLMConfig(
+        batch_size=1,
+        max_seq_len=8192,
+        tensor_parallel_size=4
+    )
+    # Use the configuration with from_pretrained
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """
+
+
+class RBLNGemma2ModelConfig(RBLNDecoderOnlyModelConfig):
+    """
+    Configuration class for RBLN Gemma2 models.
+    This class is an alias of RBLNDecoderOnlyModelConfig.
+    """
optimum/rbln/transformers/models/gemma2/gemma2_architecture.py (new file)

@@ -0,0 +1,83 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models.decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyLayer, DecoderOnlyModel
+from ..decoderonly.decoderonly_architecture import DecoderOnlyWrapper
+
+
+class Gemma2Wrapper(DecoderOnlyWrapper):
+    def get_rbln_layer_class(self):
+        return Gemma2DecoderLayer
+
+    def get_rbln_attn_class(self):
+        return Gemma2Attention
+
+    def get_rbln_model_class(self):
+        return Gemma2Model
+
+
+class Gemma2DecoderLayer(DecoderOnlyLayer):
+    _PRE_FF_LAYERNORM_ATTRS = ["pre_feedforward_layernorm"]
+    _POST_FF_LAYERNORM_ATTRS = ["post_feedforward_layernorm"]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        seq_positions: Union[torch.LongTensor, Tuple[torch.LongTensor]],
+        past_key_values: Tuple[Tuple[torch.Tensor]],
+        cos: Optional[torch.Tensor] = None,
+        sin: Optional[torch.Tensor] = None,
+        block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
+    ):
+        residual = hidden_states
+        hidden_states = self.get_pre_attention_layernorm()(hidden_states)
+
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            seq_positions=seq_positions,
+            past_key_values=past_key_values,
+            cos=cos,
+            sin=sin,
+            block_tables=block_tables,
+            lora_int_id=lora_int_id,
+        )
+        hidden_states = self.get_post_attention_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.get_pre_feedforward_layernorm()(hidden_states)
+        hidden_states = self.forward_mlp(hidden_states, lora_int_id)
+        hidden_states = self.get_post_feedforward_layernorm()(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Gemma2Attention(DecoderOnlyAttention):
+    def get_attn_scale(self, self_attn):
+        return self_attn.config.query_pre_attn_scalar**-0.5
+
+
+class Gemma2Model(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self.config.hidden_size**0.5
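`Gemma2Attention.get_attn_scale` and `Gemma2Model.hidden_multiplier` carry two Gemma2-specific scales into the RBLN graph: attention scores scaled by `query_pre_attn_scalar ** -0.5` and hidden states scaled by `hidden_size ** 0.5`. A quick numeric illustration with placeholder values (not read from any checkpoint):

```python
# Placeholder values, only to make the two scale formulas above concrete.
query_pre_attn_scalar = 256
hidden_size = 3584

attn_scale = query_pre_attn_scalar ** -0.5   # 0.0625
hidden_multiplier = hidden_size ** 0.5       # ~59.87
print(attn_scale, hidden_multiplier)
```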
optimum/rbln/transformers/models/gemma2/modeling_gemma2.py (new file)

@@ -0,0 +1,101 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ....utils import logging
+from ...models.decoderonly import (
+    RBLNDecoderOnlyModel,
+    RBLNDecoderOnlyModelForCausalLM,
+)
+from .gemma2_architecture import Gemma2Wrapper
+
+
+logger = logging.get_logger(__name__)
+
+
+class RBLNGemma2ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Gemma2 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    A class to convert and run pre-trained transformers based Gemma2ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2ForCausalLM model into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNGemma2ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ForCausalLMConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNGemma2ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+    ```python
+    from optimum.rbln import RBLNGemma2ForCausalLM
+    # Simple usage using rbln_* arguments
+    # `max_seq_len` is automatically inferred from the model config
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_batch_size=1,
+        rbln_tensor_parallel_size=4,
+    )
+    # Using a config dictionary
+    rbln_config = {
+        "batch_size": 1,
+        "max_seq_len": 8192,
+        "tensor_parallel_size": 4,
+    }
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=rbln_config
+    )
+    # Using a RBLNMistralForCausalLMConfig instance (recommended for type checking)
+    from optimum.rbln import RBLNGemma2ForCausalLMConfig
+    config = RBLNGemma2ForCausalLMConfig(
+        batch_size=1,
+        max_seq_len=8192,
+        tensor_parallel_size=4
+    )
+    model = RBLNGemma2ForCausalLM.from_pretrained(
+        "google/gemma-2-9b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """
+
+    _decoder_wrapper_cls = Gemma2Wrapper
+
+
+class RBLNGemma2Model(RBLNDecoderOnlyModel):
+    """
+    The Gemma2 Model transformer without a language modeling head.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    A class to convert and run pre-trained transformers based Gemma2Model model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Gemma2Model model into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNGemma2ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGemma2ModelConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNGemma2ModelConfig`] class for all available configuration options.
+    """
+
+    _decoder_wrapper_cls = Gemma2Wrapper