optimum-rbln 0.9.5a4__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +8 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +196 -52
- optimum/rbln/diffusers/models/controlnet.py +2 -2
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -2
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +2 -2
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +2 -2
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -3
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +3 -12
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +2 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -3
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +2 -2
- optimum/rbln/modeling_base.py +5 -4
- optimum/rbln/transformers/__init__.py +8 -0
- optimum/rbln/transformers/modeling_attention_utils.py +15 -9
- optimum/rbln/transformers/models/__init__.py +10 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +3 -3
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +7 -2
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +2 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +26 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +2 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +45 -21
- optimum/rbln/transformers/models/detr/__init__.py +23 -0
- optimum/rbln/transformers/models/detr/configuration_detr.py +38 -0
- optimum/rbln/transformers/models/detr/modeling_detr.py +53 -0
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +2 -7
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +4 -176
- optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +4 -3
- optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +10 -7
- optimum/rbln/transformers/models/mixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/mixtral/configuration_mixtral.py +38 -0
- optimum/rbln/transformers/models/mixtral/mixtral_architecture.py +76 -0
- optimum/rbln/transformers/models/mixtral/modeling_mixtral.py +68 -0
- optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +7 -7
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +9 -5
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +2 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +2 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +10 -4
- optimum/rbln/transformers/models/whisper/generation_whisper.py +8 -8
- optimum/rbln/utils/deprecation.py +78 -1
- optimum/rbln/utils/hub.py +93 -2
- optimum/rbln/utils/runtime_utils.py +2 -2
- {optimum_rbln-0.9.5a4.dist-info → optimum_rbln-0.10.0.post1.dist-info}/METADATA +1 -1
- {optimum_rbln-0.9.5a4.dist-info → optimum_rbln-0.10.0.post1.dist-info}/RECORD +49 -42
- {optimum_rbln-0.9.5a4.dist-info → optimum_rbln-0.10.0.post1.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.9.5a4.dist-info → optimum_rbln-0.10.0.post1.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.5a4.dist-info → optimum_rbln-0.10.0.post1.dist-info}/licenses/LICENSE +0 -0
@@ -192,20 +192,24 @@ class RBLNDecoderOnlyFlashAttentionMixin:
             available_dram - without_dramtensor for without_dramtensor in alloc_per_node_without_dram
         ]
 
-        kvcache_tensor_sizes: dict[str, list[int]] = compiled_models["prefill"].exp_get_dram_tensor_sizes()
+        # kvcache_tensor_sizes[key][node_id][chiplet_id] = alloc_size
+        kvcache_tensor_sizes: dict[str, list[list[int]]] = compiled_models["prefill"].exp_get_dram_tensor_sizes()
         kvcache_meta_can_resize: dict[str, bool] = {
             kvcache_meta.name: kvcache_meta.can_resize for kvcache_meta in rbln_config.kvcache_metas
         }
 
         def get_updated_kvcache_tensor_sizes(
-            kvcache_tensor_sizes: dict[str, list[int]], multiplier: int
-        ) -> dict[str, list[int]]:
+            kvcache_tensor_sizes: dict[str, list[list[int]]], multiplier: int
+        ) -> dict[str, list[list[int]]]:
             # Get the updated KV cache tensor sizes by multiplying the multiplier
             # with considering attention type (full or sliding), and memory alignment.
-            ret = {}
-            for key, sizes in kvcache_tensor_sizes.items():
+            ret: dict[str, list[list[int]]] = {}
+            for key, sizes_at_node in kvcache_tensor_sizes.items():
                 m = multiplier if kvcache_meta_can_resize[key] else 1
-                ret[key] = [align_2MB(size * m) for size in sizes]
+                ret[key] = [
+                    [align_2MB(size_at_chiplet * m) for size_at_chiplet in sizes_at_node_at_chiplet]
+                    for sizes_at_node_at_chiplet in sizes_at_node
+                ]
             return ret
 
         def check_memory_fits(multiplier: int) -> tuple[bool, list[int]]:
@@ -214,9 +218,11 @@ class RBLNDecoderOnlyFlashAttentionMixin:
             updated_kvcache_tensor_sizes = get_updated_kvcache_tensor_sizes(kvcache_tensor_sizes, multiplier)
 
             kvcache_tensor_sizes_at_node: list[int] = [0] * num_node
-            for tensor_sizes in updated_kvcache_tensor_sizes.values():
-                for node_id, size in enumerate(tensor_sizes):
-                    kvcache_tensor_sizes_at_node[node_id] += size
+            for tensor_sizes_at_node in updated_kvcache_tensor_sizes.values():
+                tensor_sizes_at_node: list[list[int]]
+                for node_id, sizes_at_chiplet in enumerate(tensor_sizes_at_node):
+                    sizes_at_chiplet: list[int]
+                    kvcache_tensor_sizes_at_node[node_id] += sum(sizes_at_chiplet)
 
             fits = all(
                 remaining_dram_at_node[node_id] >= kvcache_tensor_sizes_at_node[node_id] for node_id in range(num_node)
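The two hunks above change the KV-cache size bookkeeping from one flat list per tensor (one entry per node) to a nested list (per node, then per chiplet). A minimal standalone sketch of the new accounting; `align_2mb` here is an illustrative stand-in for the library's `align_2MB` helper, and all values are made up:

```python
def align_2mb(size: int) -> int:
    # Round up to a 2 MiB boundary (assumed alignment rule).
    boundary = 2 * 1024 * 1024
    return (size + boundary - 1) // boundary * boundary

# kvcache_tensor_sizes[key][node_id][chiplet_id] = alloc_size
kvcache_tensor_sizes = {
    "layer_0": [[3_000_000, 3_000_000], [3_000_000, 3_000_000]],  # 2 nodes x 2 chiplets
}
multiplier = 2

# Mirrors get_updated_kvcache_tensor_sizes: scale, then align per chiplet.
updated = {
    key: [[align_2mb(size * multiplier) for size in per_chiplet] for per_chiplet in per_node]
    for key, per_node in kvcache_tensor_sizes.items()
}

# Mirrors check_memory_fits: collapse chiplet sizes back into a per-node total.
num_node = 2
totals = [0] * num_node
for per_node in updated.values():
    for node_id, per_chiplet in enumerate(per_node):
        totals[node_id] += sum(per_chiplet)

print(totals)  # [12582912, 12582912] -- each 6 MB tensor aligned up, two chiplets per node
```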
@@ -79,6 +79,10 @@ _import_structure = {
         "RBLNColQwen2ForRetrieval",
         "RBLNColQwen2ForRetrievalConfig",
     ],
+    "detr": [
+        "RBLNDetrForObjectDetection",
+        "RBLNDetrForObjectDetectionConfig",
+    ],
     "distilbert": [
         "RBLNDistilBertForQuestionAnswering",
         "RBLNDistilBertForQuestionAnsweringConfig",
@@ -169,6 +173,10 @@ _import_structure = {
         "RBLNSiglipVisionModel",
         "RBLNSiglipVisionModelConfig",
     ],
+    "mixtral": [
+        "RBLNMixtralForCausalLM",
+        "RBLNMixtralForCausalLMConfig",
+    ],
     "swin": [
         "RBLNSwinBackbone",
         "RBLNSwinBackboneConfig",
@@ -264,6 +272,7 @@ if TYPE_CHECKING:
         RBLNLoRAConfig,
     )
     from .depth_anything import RBLNDepthAnythingForDepthEstimation, RBLNDepthAnythingForDepthEstimationConfig
+    from .detr import RBLNDetrForObjectDetection, RBLNDetrForObjectDetectionConfig
     from .distilbert import RBLNDistilBertForQuestionAnswering, RBLNDistilBertForQuestionAnsweringConfig
     from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
@@ -296,6 +305,7 @@ if TYPE_CHECKING:
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
     from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
+    from .mixtral import RBLNMixtralForCausalLM, RBLNMixtralForCausalLMConfig
     from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
     from .paligemma import (
         RBLNPaliGemmaForConditionalGeneration,
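These import hunks register the new Mixtral support (see the new `optimum/rbln/transformers/models/mixtral/*` files in the list above). A hedged usage sketch, assuming the class is re-exported at the package top level like the other causal-LM classes; the checkpoint id is a placeholder:

```python
from optimum.rbln import RBLNMixtralForCausalLM  # assumes top-level re-export

model = RBLNMixtralForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",  # placeholder checkpoint id
    export=True,  # compile the PyTorch checkpoint to an RBLN artifact
)
```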
@@ -184,8 +184,8 @@ class _BaseAutoModelClass:
         model_id: Union[str, Path],
         export: bool = None,
         rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
-        **kwargs,
-    ):
+        **kwargs: Optional[Dict[str, Any]],
+    ) -> RBLNBaseModel:
         """
         Load an RBLN-accelerated model from a pretrained checkpoint or a compiled RBLN artifact.
 
@@ -213,7 +213,7 @@ class _BaseAutoModelClass:
             `token`, `trust_remote_code`, `cache_dir`, `subfolder`, `local_files_only`).
 
         Returns:
-            An instantiated RBLN model ready for inference on RBLN NPUs.
+            RBLNBaseModel: An instantiated RBLN model ready for inference on RBLN NPUs.
         """
         rbln_cls = cls.get_rbln_cls(model_id, export=export, **kwargs)
         return rbln_cls.from_pretrained(model_id, export=export, rbln_config=rbln_config, **kwargs)
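The annotation change makes the auto-factory contract explicit: whatever concrete class `get_rbln_cls` resolves to, the call returns an `RBLNBaseModel`. An illustrative call; the auto-class name and model id are assumptions, not taken from this diff:

```python
from optimum.rbln import RBLNAutoModelForCausalLM  # assumed auto-class name

# export=True compiles a Hugging Face checkpoint for RBLN NPUs;
# export=False loads an already-compiled RBLN artifact.
model = RBLNAutoModelForCausalLM.from_pretrained("org/model-id", export=True)  # placeholder id
```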
@@ -32,8 +32,13 @@ class RBLNBlip2VisionModelConfig(RBLNModelConfig):
     def __init__(
         self,
         batch_size: Optional[int] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
+        """
+        Args:
+            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+        """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
         if not isinstance(self.batch_size, int) or self.batch_size < 0:
@@ -53,7 +58,7 @@ class RBLNBlip2QFormerModelConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         num_query_tokens: Optional[int] = None,
         image_text_hidden_size: Optional[int] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Args:
@@ -468,7 +468,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixin):
             input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
             attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
             inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
-            interpolate_pos_encoding (bool, optional, defaults to False)
+            interpolate_pos_encoding (bool, optional, defaults to False): Whether to interpolate the positional encoding of the image embeddings.
         Returns:
             A list of strings of length batch_size * num_captions.
         """
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Optional
 
 from optimum.rbln.configuration_utils import RBLNModelConfig
 
@@ -61,7 +61,7 @@ class RBLNColQwen2ForRetrievalConfig(RBLNDecoderOnlyModelConfig):
         batch_size: Optional[int] = None,
         output_hidden_states: Optional[bool] = None,
         vlm: Optional[RBLNModelConfig] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Args:
@@ -61,7 +61,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
         logits_to_keep: Optional[int] = None,
         output_hidden_states: Optional[bool] = None,
         kvcache_metas: Optional[List["KVCacheMeta"]] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Args:
@@ -288,6 +288,31 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
     def can_generate(self) -> bool:
         return "decode" in self.phases
 
+    @property
+    def use_image_prefill(self):
+        return "image_prefill" in self.phases
+
+    @property
+    def image_prefill_runtime_idx(self):
+        return self.phases.index("image_prefill")
+
+    @property
+    def expected_compiled_model_names(self):
+        # ["prefill", "image_prefill", "decoder_batch_1", "decoder_batch_2", ...]
+        if self.can_generate:
+            return self.phases[: self.decoder_runtime_idx] + [
+                f"decoder_batch_{batch_size}" for batch_size in self.decoder_batch_sizes
+            ]
+        else:
+            return self.phases
+
+    @property
+    def decoder_runtime_idx(self):
+        if self.can_generate:
+            return self.phases.index("decode")
+        else:
+            raise ValueError("`decode` phase is not in the phases.")
+
     @property
     def nbits_per_param(self) -> int:
         if self.quantization:
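The new properties derive the runtime layout entirely from `phases`, which is what lets the Gemma3-specific overrides later in this diff be deleted. A standalone mock (not the real config class) showing the derived ordering:

```python
# Standalone mock of the phase-derived properties added above.
from dataclasses import dataclass, field

@dataclass
class PhasesMock:
    phases: list = field(default_factory=lambda: ["prefill", "image_prefill", "decode"])
    decoder_batch_sizes: list = field(default_factory=lambda: [1, 4])

    @property
    def can_generate(self):
        return "decode" in self.phases

    @property
    def decoder_runtime_idx(self):
        return self.phases.index("decode")

    @property
    def expected_compiled_model_names(self):
        # Non-decode phases keep their position; "decode" expands per batch size.
        if self.can_generate:
            return self.phases[: self.decoder_runtime_idx] + [
                f"decoder_batch_{b}" for b in self.decoder_batch_sizes
            ]
        return self.phases

print(PhasesMock().expected_compiled_model_names)
# ['prefill', 'image_prefill', 'decoder_batch_1', 'decoder_batch_4']
```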
@@ -539,6 +539,7 @@ class DecoderOnlyLayer(nn.Module):
     _POST_ATTN_LAYERNORM = ["post_attention_layernorm", "ln_2", "final_layer_norm", "post_feedforward_layernorm"]
     _PRE_FF_LAYERNORM_ATTRS = None
     _POST_FF_LAYERNORM_ATTRS = None
+    _MLP_ATTR = ("mlp",)
 
     def __init__(self, layer, self_attn: "DecoderOnlyAttention", lora_config: Optional[RBLNLoRAConfig] = None):
         super().__init__()
@@ -547,7 +548,7 @@ class DecoderOnlyLayer(nn.Module):
         self.post_attention_layernorm = _get_attr_from_candidates(layer, self._POST_ATTN_LAYERNORM)
         self.pre_feedforward_layernorm = _get_attr_from_candidates(layer, self._PRE_FF_LAYERNORM_ATTRS)
         self.post_feedforward_layernorm = _get_attr_from_candidates(layer, self._POST_FF_LAYERNORM_ATTRS)
-        self.mlp = layer.mlp
+        self.mlp = _get_attr_from_candidates(layer, self._MLP_ATTR)
         self.self_attn = self_attn
         self._phase = "prefill"
         self.lora_config = lora_config
@@ -104,6 +104,11 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             "rbln_config": self.rbln_config,
             "config": self.config,
         }
+
+        if self.rbln_config.use_image_prefill:
+            # TODO(sdk-gen): Implement and combine prefill and image prefill into a single runtime.
+            raise NotImplementedError(f"Image prefill at {self.__class__.__name__} is not supported yet.")
+
         self.prefill_decoder = RBLNRuntimeModel(
             runtime=self.model[0],
             phase="prefill",
@@ -287,9 +292,27 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             phase="prefill",
         )
 
+        if rbln_config.use_image_prefill:
+            image_prefill_compile_config = rbln_config.compile_cfgs[rbln_config.image_prefill_runtime_idx]
+            image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
+                fill=0, static_tensors=static_tensors
+            )
+            compiled_image_prefill = cls._compile_model(
+                wrapped_model,
+                image_prefill_compile_config,
+                image_prefill_example_inputs,
+                context,
+                rbln_config,
+                rbln_config.quantization,
+                phase="image_prefill",
+            )
+            compiled_models["image_prefill"] = compiled_image_prefill
+
         if rbln_config.can_generate:
             wrapped_model.phase = "decode"
-            for batch_size, dec_compile_config in zip(
+            for batch_size, dec_compile_config in zip(
+                rbln_config.decoder_batch_sizes, rbln_config.compile_cfgs[rbln_config.decoder_runtime_idx :]
+            ):
                 dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
                 compiled_decoder = cls._compile_model(
                     wrapped_model,
@@ -548,6 +571,22 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
         compile_cfgs = [prefill_compile_config]
 
+        if rbln_config.use_image_prefill:
+            if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
+                raise NotImplementedError(
+                    "Not implemented for different prefill chunk sizes between text and image prefill."
+                )
+            image_prefill_input_info = cls.get_input_info(
+                batch_size=1,
+                query_length=rbln_config.image_prefill_chunk_size,
+                rbln_config=rbln_config,
+                model_config=model_config,
+            )
+            image_prefill_compile_config = RBLNCompileConfig(
+                compiled_model_name="image_prefill", input_info=image_prefill_input_info
+            )
+            compile_cfgs.append(image_prefill_compile_config)
+
         if rbln_config.can_generate:
             for batch_size in rbln_config.decoder_batch_sizes:
                 dec_input_info = cls.get_input_info(
@@ -569,36 +608,21 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         compiled_models: List[rebel.RBLNCompiledModel],
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
     ) -> List[rebel.Runtime]:
-        expected_model_names = ["prefill"]
-
-        expected_model_names.extend(
-            [f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes]
-        )
+        expected_model_names = rbln_config.expected_compiled_model_names
+
         if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
             cls._raise_missing_compiled_file_error(expected_model_names)
 
         ret_val = [
             rebel.Runtime(
-                compiled_models[0],
+                compiled_models[i],
                 tensor_type="pt",
-                device=rbln_config.device_map["prefill"],
+                device=rbln_config.device_map[model_name],
                 activate_profiler=rbln_config.activate_profiler,
                 timeout=rbln_config.timeout,
             )
+            for i, model_name in enumerate(expected_model_names)
         ]
-        if rbln_config.can_generate:
-            ret_val.extend(
-                [
-                    rebel.Runtime(
-                        compiled_models[i + 1],
-                        tensor_type="pt",
-                        device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
-                        activate_profiler=rbln_config.activate_profiler,
-                        timeout=rbln_config.timeout,
-                    )
-                    for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
-                ]
-            )
         return ret_val
 
     def forward(
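With `expected_compiled_model_names` on the config, `_create_runtimes` collapses to a single comprehension: compiled models are paired positionally with their expected names, and devices are looked up by name. A toy illustration of that pairing (names and device ids are made up):

```python
# Toy illustration of the 1:1 pairing used by the new _create_runtimes.
expected_model_names = ["prefill", "image_prefill", "decoder_batch_1", "decoder_batch_4"]
device_map = {name: 0 for name in expected_model_names}  # made-up device ids

# Mirrors the missing-file check in the hunk above.
assert all(name in device_map for name in expected_model_names)

# One runtime per compiled model, indexed in the same order as the names.
runtimes = [(i, name, device_map[name]) for i, name in enumerate(expected_model_names)]
print(runtimes)  # [(0, 'prefill', 0), (1, 'image_prefill', 0), (2, 'decoder_batch_1', 0), (3, 'decoder_batch_4', 0)]
```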
@@ -0,0 +1,23 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .configuration_detr import RBLNDetrForObjectDetectionConfig
+from .modeling_detr import RBLNDetrForObjectDetection
+
+
+__all__ = [
+    "RBLNDetrForObjectDetectionConfig",
+    "RBLNDetrForObjectDetection",
+]
@@ -0,0 +1,38 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_generic import RBLNModelForImageClassificationConfig
+
+
+class RBLNDetrForObjectDetectionConfig(RBLNModelForImageClassificationConfig):
+    """
+    Configuration class for RBLNDetrForObjectDetection.
+
+    This configuration class stores the configuration parameters specific to
+    RBLN-optimized DETR models for object detection tasks.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Args:
+            image_size (Optional[Union[int, Tuple[int, int]]]): The size of input images.
+                Can be an integer for square images or a tuple (height, width).
+            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If batch_size is not a positive integer.
+        """
+        super().__init__(**kwargs)
@@ -0,0 +1,53 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING, Tuple, Union
+
+import torch
+from transformers.models.detr.modeling_detr import DetrObjectDetectionOutput
+
+from ...modeling_generic import RBLNModelForImageClassification
+
+
+if TYPE_CHECKING:
+    pass
+
+
+class RBLNDetrForObjectDetection(RBLNModelForImageClassification):
+    """
+    RBLN optimized DETR model for object detection tasks.
+
+    This class provides hardware-accelerated inference for DETR models
+    on RBLN devices, supporting object detection with detection heads
+    designed for object detection tasks.
+    """
+
+    def forward(
+        self, pixel_values: torch.Tensor, return_dict: bool = None, **kwargs
+    ) -> Union[Tuple, DetrObjectDetectionOutput]:
+        """
+        Foward pass for the RBLN-optimized DETR model for object detection.
+
+        Args:
+            pixel_values (torch.FloatTensor of shape (batch_size, channels, height, width)): The tensors corresponding to the input images.
+            return_dict (bool, *optional*, defaults to True): Whether to return a dictionary of outputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a ImageClassifierOutputWithNoAttention object.
+        """
+        output = self.model[0](pixel_values=pixel_values, **kwargs)
+        return DetrObjectDetectionOutput(
+            logits=output[0], pred_boxes=output[1], last_hidden_state=output[2], encoder_last_hidden_state=output[3]
+        )
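The three new files above add DETR object detection. A hedged end-to-end sketch; the checkpoint id, the `rbln_image_size` kwarg, and the dummy input are assumptions following the usual optimum-rbln conventions, not taken from this diff:

```python
import torch
from optimum.rbln import RBLNDetrForObjectDetection  # assumes top-level re-export

model = RBLNDetrForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",  # placeholder checkpoint id
    export=True,  # compile the PyTorch checkpoint for RBLN NPUs
    rbln_image_size=800,  # assumption: config kwargs use the rbln_ prefix convention
)

pixel_values = torch.zeros(1, 3, 800, 800)  # stand-in for a preprocessed image batch
outputs = model(pixel_values=pixel_values)
print(outputs.logits.shape, outputs.pred_boxes.shape)
```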
@@ -58,13 +58,8 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
         )
         self.image_prefill_chunk_size = image_prefill_chunk_size
 
-    @property
-    def use_image_prefill(self):
-        return self.image_prefill_chunk_size is not None
-
-    @property
-    def decoder_runtime_idx(self):
-        return 2 if self.use_image_prefill else 1
+        if not (self.use_attention_mask and self.use_position_ids):
+            raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")
 
 
 class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):
@@ -13,11 +13,9 @@
 # limitations under the License.
 import importlib
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Dict,
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
-import rebel
 import torch
-from rebel.compile_context import CompileContext
 from transformers import AutoModelForImageTextToText, Gemma3ForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.modeling_utils import no_init_weights
@@ -29,10 +27,7 @@ from ...modeling_outputs import RBLNDecoderOnlyOutput
 from ...utils.rbln_runtime_wrapper import LoopProcessor
 from ..decoderonly.decoderonly_runtime_utils import RBLNPageTableManager
 from ..decoderonly.generation_decoderonly import RBLNDecoderOnlyGenerationMixin
-from ..decoderonly.modeling_decoderonly import (
-    RBLNDecoderOnlyModelForCausalLM,
-)
-from .configuration_gemma3 import RBLNGemma3ForCausalLMConfig
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .gemma3_architecture import Gemma3ForCausalLMWrapper
 from .gemma3_runtime_utils import RBLNGemma3RuntimeModel
 
@@ -455,174 +450,7 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
                 f"Image prefill chunk size is different from mm_tokens_per_image: {rbln_config.image_prefill_chunk_size} != {model.config.mm_tokens_per_image}"
             )
 
-
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
-        model: Optional["PreTrainedModel"] = None,
-        model_config: Optional["PretrainedConfig"] = None,
-        rbln_config: Optional[RBLNGemma3ForCausalLMConfig] = None,
-    ) -> RBLNGemma3ForCausalLMConfig:
-        # Update rbln_config with super class
-        rbln_config = super()._update_rbln_config(preprocessors, model, model_config, rbln_config)
-
-        if not (rbln_config.use_attention_mask and rbln_config.use_position_ids):
-            raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")
-
-        if rbln_config.use_image_prefill:
-            if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
-                raise NotImplementedError(
-                    "Not implemented for different prefill chunk sizes between text and image prefill."
-                )
-
-            # Update image prefill compile config
-            img_prefill_input_info = cls.get_input_info(
-                batch_size=1,
-                query_length=rbln_config.image_prefill_chunk_size,
-                rbln_config=rbln_config,
-                model_config=model_config,
-            )
-            image_prefill_compile_config = RBLNCompileConfig(
-                compiled_model_name="image_prefill", input_info=img_prefill_input_info
-            )
-            # Insert image_prefill compile config at index 1
-            compile_cfgs = rbln_config.compile_cfgs
-            compile_cfgs.insert(1, image_prefill_compile_config)
-            rbln_config.set_compile_cfgs(compile_cfgs)
+        if "image_prefill" not in rbln_config.phases:
+            rbln_config.phases = ["prefill", "image_prefill", "decode"]
 
         return rbln_config
-
-    @classmethod
-    @torch.inference_mode()
-    def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNGemma3ForCausalLMConfig):
-        wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
-
-        rbln_compile_configs = rbln_config.compile_cfgs
-        prefill_compile_config = rbln_compile_configs[0]
-
-        context = CompileContext(use_weight_sharing=True)
-
-        # Here we use meta tensor, for the memory efficiency.
-        meta_tensor_names = [name for name, _, _ in prefill_compile_config.input_info if "past_key_values" in name]
-        prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
-
-        # Mark static tensors (self kv states)
-        static_tensors = {}
-        for (name, _, _), tensor in zip(prefill_compile_config.input_info, prefill_example_inputs):
-            if "past_key_values" in name:
-                static_tensors[name] = tensor
-                context.mark_static_address(tensor)
-
-        def compile_model(wrapped_model, compile_config, example_inputs, compile_context, quantization):
-            try:
-                if quantization:
-                    quantization.maybe_set_quantization_env()
-                original_linear = torch.nn.functional.linear
-                torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
-                compiled_model = cls.compile(
-                    wrapped_model,
-                    compile_config,
-                    create_runtimes=rbln_config.create_runtimes,
-                    device=rbln_config.device,
-                    example_inputs=example_inputs,
-                    compile_context=compile_context,
-                )
-                return compiled_model
-            finally:
-                torch.nn.functional.linear = original_linear
-                if quantization:
-                    quantization.maybe_reset_quantization_env()
-
-        wrapped_model.phase = "prefill"
-        compiled_prefill = compile_model(
-            wrapped_model,
-            prefill_compile_config,
-            prefill_example_inputs,
-            context,
-            rbln_config.quantization,
-        )
-        compiled_models = {"prefill": compiled_prefill}
-
-        if rbln_config.use_image_prefill:
-            image_prefill_compile_config = rbln_compile_configs[1]
-            image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
-                fill=0, static_tensors=static_tensors
-            )
-            wrapped_model.phase = "image_prefill"
-            compiled_image_prefill = compile_model(
-                wrapped_model,
-                image_prefill_compile_config,
-                image_prefill_example_inputs,
-                context,
-                rbln_config.quantization,
-            )
-            compiled_models["image_prefill"] = compiled_image_prefill
-
-        wrapped_model.phase = "decode"
-        for batch_size, dec_compile_config in zip(
-            rbln_config.decoder_batch_sizes, rbln_compile_configs[rbln_config.decoder_runtime_idx :]
-        ):
-            dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
-            compiled_decoder = compile_model(
-                wrapped_model,
-                dec_compile_config,
-                dec_example_inputs,
-                context,
-                rbln_config.quantization,
-            )
-            compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder
-
-        return compiled_models
-
-    @classmethod
-    def _create_runtimes(
-        cls,
-        compiled_models: List[rebel.RBLNCompiledModel],
-        rbln_config: RBLNGemma3ForCausalLMConfig,
-    ) -> List[rebel.Runtime]:
-        expected_model_names = [
-            "prefill",
-            *[f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes],
-        ]
-        if rbln_config.use_image_prefill:
-            expected_model_names.insert(1, "image_prefill")
-
-        if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
-            cls._raise_missing_compiled_file_error(expected_model_names)
-
-        ret_val = [
-            rebel.Runtime(
-                compiled_models[0],
-                tensor_type="pt",
-                device=rbln_config.device_map["prefill"],
-                activate_profiler=rbln_config.activate_profiler,
-                timeout=rbln_config.timeout,
-            )
-        ]
-        if rbln_config.use_image_prefill:
-            ret_val.append(
-                rebel.Runtime(
-                    compiled_models[1],
-                    tensor_type="pt",
-                    device=rbln_config.device_map["image_prefill"],
-                    activate_profiler=rbln_config.activate_profiler,
-                    timeout=rbln_config.timeout,
-                ),
-            )
-
-        ret_val.extend(
-            [
-                rebel.Runtime(
-                    compiled_models[i + rbln_config.decoder_runtime_idx],
-                    tensor_type="pt",
-                    device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
-                    activate_profiler=rbln_config.activate_profiler,
-                    timeout=rbln_config.timeout,
-                )
-                for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
-            ]
-        )
-
-        return ret_val