optimum-rbln 0.8.2a4__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. optimum/rbln/__init__.py +96 -9
  2. optimum/rbln/__version__.py +16 -3
  3. optimum/rbln/cli.py +660 -0
  4. optimum/rbln/configuration_utils.py +153 -42
  5. optimum/rbln/diffusers/__init__.py +7 -0
  6. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
  8. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
  9. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
  10. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +9 -4
  11. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
  12. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
  13. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
  17. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
  18. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
  19. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
  20. optimum/rbln/diffusers/modeling_diffusers.py +30 -14
  21. optimum/rbln/diffusers/models/__init__.py +3 -13
  22. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
  23. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
  24. optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
  25. optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
  26. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
  27. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
  28. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
  29. optimum/rbln/diffusers/pipelines/__init__.py +11 -5
  30. optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
  31. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +19 -16
  32. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +14 -18
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
  34. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
  35. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
  36. optimum/rbln/modeling.py +71 -19
  37. optimum/rbln/modeling_base.py +99 -21
  38. optimum/rbln/ops/attn.py +158 -0
  39. optimum/rbln/ops/flash_attn.py +166 -0
  40. optimum/rbln/ops/kv_cache_update.py +5 -0
  41. optimum/rbln/ops/linear.py +7 -0
  42. optimum/rbln/transformers/__init__.py +92 -0
  43. optimum/rbln/transformers/configuration_generic.py +9 -7
  44. optimum/rbln/transformers/modeling_attention_utils.py +252 -0
  45. optimum/rbln/transformers/modeling_generic.py +51 -9
  46. optimum/rbln/transformers/modeling_outputs.py +37 -0
  47. optimum/rbln/transformers/models/__init__.py +91 -30
  48. optimum/rbln/transformers/models/auto/__init__.py +2 -0
  49. optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
  50. optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
  51. optimum/rbln/transformers/models/bart/bart_architecture.py +1 -3
  52. optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
  53. optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
  54. optimum/rbln/transformers/models/bert/modeling_bert.py +8 -4
  55. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +94 -30
  57. optimum/rbln/transformers/models/clip/configuration_clip.py +10 -7
  58. optimum/rbln/transformers/models/clip/modeling_clip.py +27 -4
  59. optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
  60. optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
  61. optimum/rbln/transformers/models/colpali/modeling_colpali.py +113 -96
  62. optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
  63. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
  64. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
  65. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
  66. optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
  67. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +109 -37
  68. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
  69. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +318 -309
  70. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +504 -0
  71. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +111 -0
  72. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
  73. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +453 -897
  74. optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
  75. optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
  76. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +25 -0
  77. optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
  78. optimum/rbln/transformers/models/gemma/__init__.py +2 -2
  79. optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
  80. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
  81. optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
  82. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -13
  83. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
  84. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
  85. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +201 -349
  86. optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
  87. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
  88. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
  89. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
  90. optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
  91. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
  92. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
  93. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1032 -0
  94. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
  95. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +26 -27
  96. optimum/rbln/transformers/models/llama/__init__.py +2 -2
  97. optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
  98. optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
  99. optimum/rbln/transformers/models/llava/__init__.py +16 -0
  100. optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
  101. optimum/rbln/transformers/models/llava/modeling_llava.py +478 -0
  102. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +15 -17
  103. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +235 -375
  104. optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
  105. optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
  106. optimum/rbln/transformers/models/mistral/__init__.py +2 -2
  107. optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
  108. optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
  109. optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
  110. optimum/rbln/transformers/models/opt/__init__.py +2 -2
  111. optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
  112. optimum/rbln/transformers/models/opt/modeling_opt.py +28 -16
  113. optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
  114. optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
  115. optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
  116. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
  117. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
  118. optimum/rbln/transformers/models/phi/__init__.py +2 -2
  119. optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
  120. optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
  121. optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
  122. optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
  123. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
  124. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +310 -0
  125. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
  126. optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
  127. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
  128. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
  129. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
  130. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -21
  131. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
  132. optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
  133. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
  134. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +514 -0
  135. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
  136. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +2 -2
  137. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +86 -330
  138. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +1 -245
  139. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +20 -13
  140. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +24 -3
  141. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
  142. optimum/rbln/transformers/models/siglip/__init__.py +2 -6
  143. optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
  144. optimum/rbln/transformers/models/siglip/modeling_siglip.py +5 -16
  145. optimum/rbln/transformers/models/swin/__init__.py +16 -0
  146. optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
  147. optimum/rbln/transformers/models/swin/modeling_swin.py +341 -0
  148. optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
  149. optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
  150. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
  151. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -14
  152. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
  153. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
  154. optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
  155. optimum/rbln/transformers/models/whisper/generation_whisper.py +28 -6
  156. optimum/rbln/transformers/models/whisper/modeling_whisper.py +28 -3
  157. optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
  158. optimum/rbln/transformers/utils/rbln_quantization.py +391 -75
  159. optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
  160. optimum/rbln/utils/depreacate_utils.py +16 -0
  161. optimum/rbln/utils/runtime_utils.py +28 -18
  162. optimum/rbln/utils/submodule.py +31 -9
  163. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +8 -7
  164. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +167 -125
  165. optimum_rbln-0.9.3rc0.dist-info/entry_points.txt +2 -0
  166. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +0 -0
  167. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/gemma3/modeling_gemma3.py

@@ -11,95 +11,74 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import inspect
-from collections import deque
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

 import rebel
 import torch
 from rebel.compile_context import CompileContext
-from transformers import (
-    AutoModelForImageTextToText,
-    Gemma3ForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
-)
+from transformers import AutoModelForImageTextToText, Gemma3ForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.modeling_utils import no_init_weights
 from transformers.models.gemma3.modeling_gemma3 import Gemma3TextScaledWordEmbedding

 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
-from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM, RBLNDecoderOnlyOutput, RBLNRuntimeModel
+from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ...utils.rbln_runtime_wrapper import LoopProcessor
+from ..decoderonly.decoderonly_runtime_utils import RBLNPageTableManager
+from ..decoderonly.generation_decoderonly import RBLNDecoderOnlyGenerationMixin
+from ..decoderonly.modeling_decoderonly import (
+    RBLNDecoderOnlyModelForCausalLM,
+)
 from .configuration_gemma3 import RBLNGemma3ForCausalLMConfig
 from .gemma3_architecture import Gemma3ForCausalLMWrapper
+from .gemma3_runtime_utils import RBLNGemma3RuntimeModel


 if TYPE_CHECKING:
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, Gemma3ForConditionalGeneration


-@dataclass
-class RBLNGemma3ForCausalLMOutput(RBLNDecoderOnlyOutput):
-    attention_mask: Optional[torch.Tensor] = None
-
-
-class LoopVisionTower:
-    def __init__(self, vision_tower: RBLNModel) -> None:
-        self.vision_tower = vision_tower
+class LoopVisionTower(LoopProcessor):
+    def __init__(self, vision_tower: "RBLNModel"):
+        super().__init__(model=vision_tower)

-    def forward(self, *args, **kwargs):
-        # Loop instead of batch
-        # shape of pixel_values : [batch, num_channel, height, width]
-        pixel_values = args[0]
+    def _get_batch_size(self, pixel_values, **kwargs):
+        return pixel_values.shape[0]

-        batch_size = pixel_values.shape[0]
-        outputs = []
-        for i in range(batch_size):
-            outputs.append(self.vision_tower(pixel_values=pixel_values[i : i + 1], return_dict=True))
+    def _prepare_inputs_for_iteration(self, index, common_inputs, pixel_values, **kwargs):
+        pixel_values_item = pixel_values[index : index + 1]
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
+        return ([pixel_values_item], {"out": out_buffer})

-        last_hidden_states = [output.last_hidden_state for output in outputs]
-
-        # FIXME:: This can be optimized using out= API of rbln runtime.
-        last_hidden_states = torch.cat(last_hidden_states, dim=0)
+    def _process_outputs(self, outputs: list, **kwargs) -> "BaseModelOutputWithPooling":
+        output = kwargs["out"]

         return BaseModelOutputWithPooling(
-            last_hidden_state=last_hidden_states,
+            last_hidden_state=output[0],
         )

-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        return self.forward(*args, **kwds)
-
-    def __repr__(self) -> str:
-        return repr(self.vision_tower)
-

-class LoopProjector:
-    def __init__(self, multi_modal_projector) -> None:
-        self.multi_modal_projector = multi_modal_projector
+class LoopProjector(LoopProcessor):
+    def __init__(self, multi_modal_projector: "RBLNModel"):
+        super().__init__(model=multi_modal_projector)

-    def forward(self, *args, **kwargs):
-        # Loop instead of batch
-        image_feature = args[0]
+    def _get_batch_size(self, image_feature, **kwargs):
+        return image_feature.shape[0]

-        batch_size = image_feature.shape[0]
-        outputs = []
-        for i in range(batch_size):
-            outputs.append(self.multi_modal_projector(image_feature[i : i + 1]))
+    def _prepare_inputs_for_iteration(self, index, common_inputs, image_feature, **kwargs):
+        image_feature_item = image_feature[index : index + 1]
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
+        return ([image_feature_item], {"out": out_buffer})

-        # FIXME:: This can be optimized using out= API of rbln runtime.
-        outputs = torch.cat(outputs, dim=0)
-        return outputs
+    def _process_outputs(self, outputs: list, **kwargs):
+        output = kwargs["out"]
+        return output[0]

-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        return self.forward(*args, **kwds)

-    def __repr__(self) -> str:
-        return repr(self.multi_modal_projector)
-
-
-class RBLNGemma3ForConditionalGeneration(RBLNModel):
+class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixin):
     auto_model_class = AutoModelForImageTextToText
     _rbln_submodules = [
         {"name": "vision_tower"},
@@ -119,6 +98,23 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
     def can_generate(self):
         return True

+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
+        with no_init_weights():
+            model_cls_name = model.model.language_model.__class__.__name__
+            causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
+            causal_model_cls = getattr(importlib.import_module("transformers"), causal_model_cls_name)
+            new_language_model = causal_model_cls(model.model.language_model.config)
+
+        new_language_model.lm_head = model.lm_head
+        new_language_model.model = model.model.language_model
+        model.model.language_model = new_language_model
+        model.lm_head = None
+        del model.lm_head
+        return model
+
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]
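The get_pytorch_model override above appears to exist because recent transformers releases keep lm_head on the outer Gemma3ForConditionalGeneration while language_model is a bare text model; the override rebuilds a standalone *ForCausalLM so the decoder-only export path sees the usual model + lm_head layout. A toy illustration of that rewrapping with plain torch.nn modules (hypothetical classes, showing only the shape of the transformation):

import torch.nn as nn


class ToyTextModel(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.layer = nn.Linear(hidden, hidden)


class ToyTextForCausalLM(nn.Module):
    def __init__(self, hidden=8, vocab=16):
        super().__init__()
        self.model = ToyTextModel(hidden)
        self.lm_head = nn.Linear(hidden, vocab, bias=False)


class ToyConditionalGeneration(nn.Module):
    def __init__(self, hidden=8, vocab=16):
        super().__init__()
        self.model = nn.Module()
        self.model.language_model = ToyTextModel(hidden)      # text backbone only
        self.lm_head = nn.Linear(hidden, vocab, bias=False)   # head lives at the top level


outer = ToyConditionalGeneration()
causal = ToyTextForCausalLM()
causal.lm_head = outer.lm_head             # reuse the already-loaded head
causal.model = outer.model.language_model  # reuse the already-loaded backbone
outer.model.language_model = causal        # language_model is now a causal LM
del outer.lm_head                          # the outer model no longer owns the head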
@@ -208,18 +204,30 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
         return model_kwargs

     def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        """
-        Projects the last hidden state from the vision model into language model space.
-
-        Args:
-            pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
-                The tensors corresponding to the input images.
-
-        Returns:
-            Image feature tensor of shape `(num_images, image_length, embed_dim)`.
-        """
-        vision_outputs = self.vision_tower(pixel_values).last_hidden_state
-        image_features = self.multi_modal_projector(vision_outputs)
+        # Projects the last hidden state from the vision model into language model space.
+
+        # Args:
+        #     pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        #         The tensors corresponding to the input images.
+
+        # Returns:
+        #     Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+
+        vision_out_buffer = []
+        vision_out_size = [
+            pixel_values.shape[0],
+            (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2,
+            self.config.vision_config.hidden_size,
+        ]
+        projector_out_size = [
+            pixel_values.shape[0],
+            self.config.mm_tokens_per_image,
+            self.config.text_config.hidden_size,
+        ]
+        vision_out_buffer.append(torch.empty(size=vision_out_size, dtype=torch.float32, device="cpu"))
+        projector_out_buffer = [torch.empty(size=projector_out_size, dtype=torch.float32, device="cpu")]
+        vision_outputs = self.vision_tower(pixel_values, out=vision_out_buffer).last_hidden_state
+        image_features = self.multi_modal_projector(vision_outputs, out=projector_out_buffer)
         return image_features

     def _preprocess_prefill(
@@ -254,17 +262,45 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):

         return inputs_embeds

+    def get_padded_cache_position(
+        self,
+        cache_position: torch.Tensor,  # shape: [1, seq_len]
+        token_type_ids: torch.Tensor,  # shape: [1, seq_len]
+    ) -> torch.Tensor:
+        seq_len = cache_position[0][-1].item() + 1
+
+        # Find image start positions
+        image_starts = [
+            s
+            for s in torch.where(token_type_ids == 1)[1]
+            if torch.all(token_type_ids[:, s : s + self.rbln_config.image_prefill_chunk_size] == 1)
+        ]
+
+        # Initialize padded tensors
+        padded_input_len = seq_len
+        for image_start in image_starts:
+            pad_needed = (
+                self.rbln_config.image_prefill_chunk_size
+                - (image_start + padded_input_len - seq_len) % self.rbln_config.image_prefill_chunk_size
+            ) % self.rbln_config.image_prefill_chunk_size
+            padded_input_len += pad_needed
+
+        return torch.cat(
+            [cache_position, torch.arange(seq_len, padded_input_len, dtype=torch.int32).unsqueeze(0)],
+            dim=1,
+        )
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
+        attention_mask: torch.Tensor = None,
+        token_type_ids: torch.Tensor = None,
         pixel_values: torch.FloatTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         generate_idx: Optional[torch.Tensor] = None,
         padded_cache_lengths: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
         **lm_kwargs: Dict[str, Any],
     ) -> Union[Tuple, RBLNDecoderOnlyOutput]:
         # prefill
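get_padded_cache_position reserves extra cache positions so that every run of image tokens can be aligned to an image_prefill_chunk_size boundary during chunked prefill. A small standalone replay of the arithmetic, using a hypothetical chunk size of 4 (the real value defaults to mm_tokens_per_image):

import torch

image_prefill_chunk_size = 4  # assumed tiny value for illustration only
token_type_ids = torch.tensor([[0, 0, 1, 1, 1, 1, 0, 0]])  # one 4-token image starting at index 2
cache_position = torch.arange(8, dtype=torch.int32).unsqueeze(0)

seq_len = cache_position[0][-1].item() + 1  # 8
image_starts = [
    int(s)
    for s in torch.where(token_type_ids == 1)[1]
    if torch.all(token_type_ids[:, s : s + image_prefill_chunk_size] == 1)
]  # [2]

padded_input_len = seq_len
for image_start in image_starts:
    pad_needed = (
        image_prefill_chunk_size - (image_start + padded_input_len - seq_len) % image_prefill_chunk_size
    ) % image_prefill_chunk_size  # (4 - 2 % 4) % 4 = 2
    padded_input_len += pad_needed  # 8 -> 10

padded = torch.cat(
    [cache_position, torch.arange(seq_len, padded_input_len, dtype=torch.int32).unsqueeze(0)], dim=1
)
print(padded.shape)  # torch.Size([1, 10]): two positions reserved so the image run can be chunk-aligned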
@@ -275,12 +311,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):

             for b_idx in range(batch_size):
                 cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
+                token_type_id = token_type_ids[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
+                cache_position = self.get_padded_cache_position(cache_position, token_type_id)
+
                 output = self.language_model.prefill_decoder(
                     inputs_embeds=inputs_embeds[b_idx : b_idx + 1],
                     attention_mask=attention_mask[b_idx],
                     cache_position=cache_position,
                     batch_idx=b_idx,
-                    token_type_ids=token_type_ids[b_idx : b_idx + 1] if token_type_ids is not None else None,
+                    token_type_ids=token_type_ids[b_idx : b_idx + 1],  # do not pass token_type_id
                 )
                 padded_cache_lengths[b_idx] += output.padded_cache_lengths
                 logits.append(output.logits)
@@ -309,209 +348,6 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
         )


-class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
-    def __init__(self, *args, image_prefill: Optional[rebel.Runtime] = None, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.image_prefill = image_prefill  # FIXME(taehoon)
-        self.prefill = self.runtime if self.phase == "prefill" else None  # FIXME
-        self.decode = self.runtime if self.phase == "decode" else None
-
-    def _prepare_prefill_inputs(self, *args, **kwargs):
-        (
-            inputs,
-            cache_position,
-            chunked_attention_mask,
-            out_buffers,
-            position_ids,
-            position_embed,
-            padded_cache_lengths,
-            query_length,
-            token_type_ids,
-        ) = super()._prepare_prefill_inputs(*args, **kwargs)
-
-        # chunked_attention_mask shape
-        chunked_attention_mask = torch.zeros(1, chunked_attention_mask.shape[-1], dtype=torch.float32)
-
-        # as gemma3 has different prefill chunk size for image and text, we need to pad the inputs to the max of the two.
-        padding_size = max(self.rbln_config.prefill_chunk_size, self.rbln_config.image_prefill_chunk_size)
-        inputs = torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
-        cache_position = torch.nn.functional.pad(cache_position, (0, padding_size))
-        position_ids = torch.nn.functional.pad(position_ids, (0, padding_size))
-        token_type_ids = torch.nn.functional.pad(token_type_ids, (0, padding_size), value=-1)
-
-        return (
-            inputs,
-            cache_position,
-            chunked_attention_mask,
-            out_buffers,
-            position_ids,
-            position_embed,
-            padded_cache_lengths,
-            query_length,
-            token_type_ids,
-        )
-
-    def prefill_forward(
-        self,
-        inputs: torch.Tensor,
-        cache_position: torch.Tensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        batch_idx: int = None,
-        block_tables: torch.Tensor = None,
-        is_external_block_tables: bool = None,
-        position_embed: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        local_block_tables: Optional[torch.Tensor] = None,
-    ) -> torch.FloatTensor:
-        """
-        Performs chunked prefill for efficient KV-cache updates and memory optimization.
-        Instead of processing the entire sequence at once, the input is divided into chunks of size `prefill_chunk_size`,
-        and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
-        """
-        (
-            inputs,
-            cache_position,
-            chunked_attention_mask,
-            out_buffers,
-            position_ids,
-            position_embed,
-            padded_cache_lengths,
-            query_length,
-            token_type_ids,
-        ) = self._prepare_prefill_inputs(
-            inputs, cache_position, attention_mask, position_embed, token_type_ids=token_type_ids
-        )
-
-        step = 0
-        while step < query_length:
-            # Check if the prefill chunk is an image prefill
-            is_image_prefill = torch.all(
-                token_type_ids[:, step : step + self.rbln_config.image_prefill_chunk_size] == 1
-            )
-            prefill_chunk_size = (
-                self.rbln_config.image_prefill_chunk_size if is_image_prefill else self.rbln_config.prefill_chunk_size
-            )
-
-            # Check if the prefill chunk is a text prefill which have image_tokens in it.
-            is_text_prefill_with_image_tokens = not is_image_prefill and torch.any(
-                token_type_ids[:, step : step + prefill_chunk_size] == 1
-            )
-
-            # Check if the prefill chunk crosses a block boundary, requiring padding to align with block boundaries
-            is_cross_block_boundary = (
-                step // self.rbln_config.kvcache_block_size
-                != (step + prefill_chunk_size) // self.rbln_config.kvcache_block_size
-            )
-
-            # Check if the prefill chunk is the last chunk
-            is_last_chunk = step + prefill_chunk_size >= query_length
-
-            if is_cross_block_boundary:
-                padding_size = prefill_chunk_size - (step + prefill_chunk_size) % self.rbln_config.kvcache_block_size
-                padded_cache_lengths += padding_size
-
-            # if text_prefill end with image_tokens, we only treat the text part.
-            num_processed_tokens = prefill_chunk_size
-            if is_text_prefill_with_image_tokens:
-                first_image_token_idx = torch.where(token_type_ids[:, step : step + prefill_chunk_size] == 1)[1][0]
-                num_processed_tokens = first_image_token_idx
-            if is_last_chunk:
-                num_processed_tokens = query_length - step
-
-            input_chunk = inputs[:, step : step + prefill_chunk_size]
-            cache_pos_chunk = cache_position[:, step : step + prefill_chunk_size].clone() + padded_cache_lengths
-            position_ids_chunk = position_ids[:, step : step + prefill_chunk_size].clone()
-            chunked_attention_mask[
-                :, step + padded_cache_lengths : step + num_processed_tokens + padded_cache_lengths
-            ] = 1
-            query_position = torch.tensor(num_processed_tokens - 1, dtype=torch.int16)
-
-            if is_image_prefill:
-                logits = self.image_prefill(
-                    input_chunk,
-                    cache_pos_chunk,
-                    block_tables,
-                    local_block_tables,
-                    query_position,
-                    chunked_attention_mask,
-                    position_ids_chunk,
-                    out=out_buffers,
-                )
-            else:
-                logits = self.prefill(
-                    input_chunk,
-                    cache_pos_chunk,
-                    block_tables,
-                    local_block_tables,
-                    query_position,
-                    chunked_attention_mask,
-                    position_ids_chunk,
-                    out=out_buffers,
-                )
-
-            step += num_processed_tokens
-
-        if not is_external_block_tables:
-            self.dec_attn_mask[batch_idx : batch_idx + 1] = chunked_attention_mask
-
-        return RBLNGemma3ForCausalLMOutput(
-            logits=logits, padded_cache_lengths=padded_cache_lengths, attention_mask=chunked_attention_mask
-        )
-
-    def decode_forward(
-        self,
-        inputs: torch.Tensor,
-        cache_position: torch.Tensor = None,
-        block_tables: torch.Tensor = None,
-        is_external_block_tables: bool = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_embed: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        local_block_tables: Optional[torch.Tensor] = None,
-    ) -> torch.FloatTensor:
-        batch_size = inputs.shape[0]
-        if batch_size != self.batch_size:
-            raise RuntimeError(
-                f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
-            )
-
-        if batch_size != cache_position.shape[0]:
-            raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
-
-        # FIXME(taehoon): how to handle pos_attn_mask with external block tables
-        if is_external_block_tables:
-            if attention_mask is None:
-                raise ValueError("attention_mask should be provided with external block tables.")
-            if local_block_tables is None:
-                raise ValueError("local_block_tables should be provided with external block tables.")
-        else:
-            local_block_tables = (
-                local_block_tables
-                if local_block_tables is not None
-                else torch.arange(0, self.batch_size, dtype=torch.int16).view(self.batch_size, -1)
-            )
-            if self.rbln_config.use_attention_mask and attention_mask is None:
-                for b_idx in range(batch_size):
-                    decoding_step = cache_position[b_idx].item()
-                    if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
-                        raise ValueError(
-                            f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
-                        )
-                    self.dec_attn_mask[b_idx, decoding_step] = 1
-
-                attention_mask = self.dec_attn_mask
-
-        if self.batch_size < block_tables.shape[0]:
-            block_tables = block_tables[: self.batch_size]
-
-        if attention_mask is not None and self.batch_size < attention_mask.shape[0]:
-            attention_mask = attention_mask[: self.batch_size]
-
-        logits = self.decode(inputs, cache_position, block_tables, local_block_tables, attention_mask, position_ids)
-
-        return RBLNDecoderOnlyOutput(logits=logits)
-
-
 class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
     The Gemma3 Model transformer with a language modeling head (linear layer) on top.
@@ -524,52 +360,36 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """

     _decoder_wrapper_cls = Gemma3ForCausalLMWrapper
+    _supports_non_fp32 = False

-    def __post_init__(self, **kwargs):
-        main_input_name = self.main_input_name
-
-        if self.rbln_config.use_inputs_embeds:
-            main_input_name = "inputs_embeds"
-            artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
-            self.embed_tokens = self._create_embedding_layer()
-            self.embed_tokens.load_state_dict(artifacts["embed_tokens"])
-        else:
-            self.embed_tokens = None
-
+    def setup_runtime(self):
         # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
         dec_attn_mask = torch.zeros(self.rbln_config.batch_size, self.rbln_config.max_seq_len, dtype=torch.float32)
-        block_tables = torch.zeros(
-            self.rbln_config.batch_size,
-            self.rbln_config.max_seq_len // self.rbln_config.kvcache_block_size,
-            dtype=torch.int16,
-        ).fill_(-1)
-        free_block_pool = deque(x for x in range(self.rbln_config.kvcache_num_blocks))
+        page_table_manager = RBLNPageTableManager(self.rbln_config)
+
+        common_kwargs = {
+            "main_input_name": "inputs_embeds" if self.rbln_config.use_inputs_embeds else "input_ids",
+            "embed_tokens": self.embed_tokens,
+            "dec_attn_mask": dec_attn_mask,
+            "page_table_manager": page_table_manager,
+            "rbln_config": self.rbln_config,
+        }
+
         self.prefill_decoder = RBLNGemma3RuntimeModel(
             runtime=self.model[0],
-            image_prefill=self.model[1],
-            main_input_name=main_input_name,
-            embed_tokens=self.embed_tokens,
+            image_prefill=self.model[1] if self.rbln_config.use_image_prefill else None,
             phase="prefill",
             batch_size=self.rbln_config.batch_size,
-            dec_attn_mask=dec_attn_mask,
-            block_tables=block_tables,
-            vocab_size=self.config.vocab_size,
-            free_block_pool=free_block_pool,
-            rbln_config=self.rbln_config,
+            **common_kwargs,
         )

         self.decoders = {}
         for i, batch_size in enumerate(self.rbln_config.decoder_batch_sizes):
             self.decoders[batch_size] = RBLNGemma3RuntimeModel(
-                runtime=self.model[i + 2],
-                main_input_name=main_input_name,
-                embed_tokens=self.embed_tokens,
+                runtime=self.model[i + self.rbln_config.decoder_runtime_idx],
                 phase="decode",
                 batch_size=batch_size,
-                dec_attn_mask=dec_attn_mask,
-                block_tables=block_tables,
-                free_block_pool=free_block_pool,
-                rbln_config=self.rbln_config,
+                **common_kwargs,
             )

         # NOTE(eunji): Use a decoder whose batch size matches the model's main batch size for compatibility.
@@ -589,6 +409,13 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     def _update_sliding_window_config(cls, model_config: PretrainedConfig, rbln_config: RBLNGemma3ForCausalLMConfig):
         sliding_window = getattr(model_config, "sliding_window", None)
         sliding_window_pattern = getattr(model_config, "sliding_window_pattern", None)
+        if sliding_window_pattern is None:
+            if hasattr(model_config, "layer_types"):
+                first_full_attention_index = model_config.layer_types.index("full_attention")
+                sliding_window_pattern = first_full_attention_index + 1
+            else:
+                raise ValueError("Cannot determine sliding_window_pattern from model_config")
+
         if sliding_window_pattern <= model_config.num_hidden_layers:
             rbln_config.cache_impl = "hybrid"
             rbln_config.sliding_window = sliding_window
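The new fallback covers transformers configs that expose layer_types instead of sliding_window_pattern. For the usual Gemma3 schedule of five sliding-attention layers followed by one full-attention layer, the derived pattern works out as follows (toy stand-in for the config attribute):

# Hypothetical layer_types list mirroring Gemma3's 5-local / 1-global attention schedule.
layer_types = (["sliding_attention"] * 5 + ["full_attention"]) * 4

first_full_attention_index = layer_types.index("full_attention")  # 5
sliding_window_pattern = first_full_attention_index + 1           # 6
print(sliding_window_pattern)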
@@ -599,7 +426,12 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
         return rbln_config

     @classmethod
-    def _update_submodule_config(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def _update_submodule_config(
+        cls,
+        model: "PreTrainedModel",
+        rbln_config: RBLNModelConfig,
+        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+    ):
         if rbln_config.image_prefill_chunk_size is None:
             rbln_config.image_prefill_chunk_size = model.config.mm_tokens_per_image

@@ -624,20 +456,26 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
         if not (rbln_config.use_attention_mask and rbln_config.use_position_ids):
             raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")

-        # Update image prefill compile config
-        img_prefill_input_info = cls.get_input_info(
-            batch_size=1,
-            query_length=rbln_config.image_prefill_chunk_size,
-            rbln_config=rbln_config,
-            model_config=model_config,
-        )
-        image_prefill_compile_config = RBLNCompileConfig(
-            compiled_model_name="image_prefill", input_info=img_prefill_input_info
-        )
-        # Insert image_prefill compile config at index 1
-        compile_cfgs = rbln_config.compile_cfgs
-        compile_cfgs.insert(1, image_prefill_compile_config)
-        rbln_config.set_compile_cfgs(compile_cfgs)
+        if rbln_config.use_image_prefill:
+            if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
+                raise NotImplementedError(
+                    "Not implemented for different prefill chunk sizes between text and image prefill."
+                )
+
+            # Update image prefill compile config
+            img_prefill_input_info = cls.get_input_info(
+                batch_size=1,
+                query_length=rbln_config.image_prefill_chunk_size,
+                rbln_config=rbln_config,
+                model_config=model_config,
+            )
+            image_prefill_compile_config = RBLNCompileConfig(
+                compiled_model_name="image_prefill", input_info=img_prefill_input_info
+            )
+            # Insert image_prefill compile config at index 1
+            compile_cfgs = rbln_config.compile_cfgs
+            compile_cfgs.insert(1, image_prefill_compile_config)
+            rbln_config.set_compile_cfgs(compile_cfgs)

         return rbln_config

@@ -690,23 +528,27 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
             context,
             rbln_config.quantization,
         )
+        compiled_models = {"prefill": compiled_prefill}

-        image_prefill_compile_config = rbln_compile_configs[1]
-        image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
-            fill=0, static_tensors=static_tensors
-        )
-        wrapped_model.phase = "image_prefill"
-        compiled_image_prefill = compile_model(
-            wrapped_model,
-            image_prefill_compile_config,
-            image_prefill_example_inputs,
-            context,
-            rbln_config.quantization,
-        )
+        if rbln_config.use_image_prefill:
+            image_prefill_compile_config = rbln_compile_configs[1]
+            image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
+                fill=0, static_tensors=static_tensors
+            )
+            wrapped_model.phase = "image_prefill"
+            compiled_image_prefill = compile_model(
+                wrapped_model,
+                image_prefill_compile_config,
+                image_prefill_example_inputs,
+                context,
+                rbln_config.quantization,
+            )
+            compiled_models["image_prefill"] = compiled_image_prefill

-        compiled_models = {"prefill": compiled_prefill, "image_prefill": compiled_image_prefill}
         wrapped_model.phase = "decode"
-        for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_compile_configs[2:]):
+        for batch_size, dec_compile_config in zip(
+            rbln_config.decoder_batch_sizes, rbln_compile_configs[rbln_config.decoder_runtime_idx :]
+        ):
             dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
             compiled_decoder = compile_model(
                 wrapped_model,
@@ -727,35 +569,45 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     ) -> List[rebel.Runtime]:
         expected_model_names = [
             "prefill",
-            "image_prefill",
             *[f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes],
         ]
+        if rbln_config.use_image_prefill:
+            expected_model_names.insert(1, "image_prefill")
+
         if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
             cls._raise_missing_compiled_file_error(expected_model_names)

-        return [
+        ret_val = [
             rebel.Runtime(
                 compiled_models[0],
                 tensor_type="pt",
                 device=rbln_config.device_map["prefill"],
                 activate_profiler=rbln_config.activate_profiler,
                 timeout=rbln_config.timeout,
-            ),
-            rebel.Runtime(
-                compiled_models[1],
-                tensor_type="pt",
-                device=rbln_config.device_map["image_prefill"],
-                activate_profiler=rbln_config.activate_profiler,
-                timeout=rbln_config.timeout,
-            ),
-            *[
+            )
+        ]
+        if rbln_config.use_image_prefill:
+            ret_val.append(
+                rebel.Runtime(
+                    compiled_models[1],
+                    tensor_type="pt",
+                    device=rbln_config.device_map["image_prefill"],
+                    activate_profiler=rbln_config.activate_profiler,
+                    timeout=rbln_config.timeout,
+                ),
+            )
+
+        ret_val.extend(
+            [
                 rebel.Runtime(
-                    compiled_models[i + 2],
+                    compiled_models[i + rbln_config.decoder_runtime_idx],
                     tensor_type="pt",
                     device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
                     activate_profiler=rbln_config.activate_profiler,
                     timeout=rbln_config.timeout,
                 )
                 for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
-            ],
-        ]
+            ]
+        )
+
+        return ret_val
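With image prefill now optional, the compiled-model list is laid out as prefill, then (optionally) image_prefill, then one decoder per compiled batch size, and decoder_runtime_idx appears to point at the first decoder slot. A minimal sketch of that indexing, with the config attribute's exact semantics assumed from this diff:

from typing import List


def decoder_runtime_idx(use_image_prefill: bool) -> int:
    # Decoders start after prefill (index 0) and, when enabled, image_prefill (index 1).
    return 2 if use_image_prefill else 1


def expected_model_names(use_image_prefill: bool, decoder_batch_sizes: List[int]) -> List[str]:
    names = ["prefill"]
    if use_image_prefill:
        names.insert(1, "image_prefill")
    names.extend(f"decoder_batch_{b}" for b in decoder_batch_sizes)
    return names


print(expected_model_names(True, [1, 4]))   # ['prefill', 'image_prefill', 'decoder_batch_1', 'decoder_batch_4']
print(expected_model_names(False, [1, 4]))  # ['prefill', 'decoder_batch_1', 'decoder_batch_4']
print(decoder_runtime_idx(True), decoder_runtime_idx(False))  # 2 1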