optimum-rbln 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (108)
  1. optimum/rbln/__init__.py +44 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +230 -67
  4. optimum/rbln/diffusers/models/controlnet.py +2 -2
  5. optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -2
  6. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +2 -2
  7. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +2 -2
  8. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -3
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +3 -12
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +2 -4
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -3
  12. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
  13. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +2 -2
  14. optimum/rbln/modeling_base.py +11 -10
  15. optimum/rbln/ops/__init__.py +1 -0
  16. optimum/rbln/ops/attn.py +10 -0
  17. optimum/rbln/ops/flash_attn.py +8 -0
  18. optimum/rbln/ops/moe.py +180 -0
  19. optimum/rbln/ops/sliding_window_attn.py +9 -0
  20. optimum/rbln/transformers/__init__.py +44 -0
  21. optimum/rbln/transformers/modeling_attention_utils.py +124 -222
  22. optimum/rbln/transformers/modeling_outputs.py +25 -0
  23. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  24. optimum/rbln/transformers/models/__init__.py +38 -0
  25. optimum/rbln/transformers/models/auto/auto_factory.py +3 -3
  26. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  27. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +7 -2
  28. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
  29. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  30. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  31. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
  32. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +40 -23
  33. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  34. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  35. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +144 -17
  36. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  37. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +122 -48
  38. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
  39. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +120 -128
  40. optimum/rbln/transformers/models/detr/__init__.py +23 -0
  41. optimum/rbln/transformers/models/detr/configuration_detr.py +38 -0
  42. optimum/rbln/transformers/models/detr/modeling_detr.py +53 -0
  43. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  44. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  45. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  46. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  47. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  48. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  49. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +2 -7
  50. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
  51. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -177
  52. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  53. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  54. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +42 -0
  55. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  56. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +168 -0
  57. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  58. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
  59. optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
  60. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  61. optimum/rbln/transformers/models/mixtral/__init__.py +16 -0
  62. optimum/rbln/transformers/models/mixtral/configuration_mixtral.py +38 -0
  63. optimum/rbln/transformers/models/mixtral/mixtral_architecture.py +76 -0
  64. optimum/rbln/transformers/models/mixtral/modeling_mixtral.py +68 -0
  65. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  66. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  67. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  68. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  69. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  70. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  71. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +9 -5
  72. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  73. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +13 -1
  74. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
  75. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  76. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  77. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  78. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  79. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  80. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  81. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +13 -1
  82. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
  83. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
  84. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  85. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  86. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  87. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  88. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  89. optimum/rbln/transformers/models/resnet/configuration_resnet.py +10 -4
  90. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  91. optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
  92. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  93. optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
  94. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  95. optimum/rbln/transformers/models/whisper/generation_whisper.py +8 -8
  96. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  97. optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
  98. optimum/rbln/utils/deprecation.py +78 -1
  99. optimum/rbln/utils/hub.py +93 -2
  100. optimum/rbln/utils/import_utils.py +16 -1
  101. optimum/rbln/utils/runtime_utils.py +12 -8
  102. optimum/rbln/utils/submodule.py +24 -0
  103. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/METADATA +6 -6
  104. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/RECORD +107 -81
  105. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  106. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/WHEEL +0 -0
  107. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/entry_points.txt +0 -0
  108. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py

@@ -177,7 +177,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         dec_attn_mask: torch.Tensor,
         page_table_manager: RBLNPageTableManager,
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
-        config: "PreTrainedConfig" = None,
+        config: Optional["PreTrainedConfig"] = None,
         logits_last_dim: Optional[int] = None,
         **kwargs: Any,
     ) -> None:
@@ -391,16 +391,14 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         # Initialize attention mask for chunked processing
         if self.rbln_config.use_attention_mask:
             if self.rbln_config.use_position_ids:
-                chunked_attention_mask = torch.zeros(
-                    1, self.rbln_config.max_seq_len, dtype=self.rbln_config.torch_dtype
-                )
+                chunked_attention_mask = torch.zeros(1, self.rbln_config.max_seq_len, dtype=self.rbln_config.dtype)
             else:
                 chunked_attention_mask = torch.zeros(
                     1,
                     1,
                     self.rbln_config.prefill_chunk_size,
                     self.rbln_config.max_seq_len,
-                    dtype=self.rbln_config.torch_dtype,
+                    dtype=self.rbln_config.dtype,
                 )
         else:
             chunked_attention_mask = None
@@ -467,7 +465,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             1 if self.rbln_config.logits_to_keep == 1 else padded_mask_length,
             logits_last_dim,
         )
-        output_logits = torch.full(logits_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+        output_logits = torch.full(logits_size, fill_value=1e-10, dtype=self.rbln_config.dtype)

         if self.rbln_config.logits_to_keep == 1:
             for i in range(padded_input_length // self.rbln_config.prefill_chunk_size):
@@ -486,7 +484,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 self.config.hidden_size,
             )
             output_hidden_states = [
-                torch.full(hidden_states_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+                torch.full(hidden_states_size, fill_value=1e-10, dtype=self.rbln_config.dtype)
                 for _ in range(self.config.num_hidden_layers + 1)
             ]

optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -26,15 +26,16 @@ from transformers.modeling_utils import no_init_weights
 from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
+from ....utils.runtime_utils import is_compiler_supports_buffer_resize
 from ...modeling_attention_utils import (
     RBLNDecoderOnlyFlashAttentionMixin,
     set_default_values,
     validate_attention_method,
     validate_sliding_window,
 )
-from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ...modeling_outputs import RBLNDecoderOnlyOutput, _validate_output_hidden_states
 from ...utils.rbln_quantization import get_quantized_model
-from .configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+from .configuration_decoderonly import KVCacheMeta, RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
 from .decoderonly_architecture import DecoderOnlyWrapper
 from .decoderonly_runtime_utils import RBLNPageTableManager, RBLNRuntimeModel
 from .generation_decoderonly import RBLNDecoderOnlyGenerationMixin
@@ -103,6 +104,11 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             "rbln_config": self.rbln_config,
             "config": self.config,
         }
+
+        if self.rbln_config.use_image_prefill:
+            # TODO(sdk-gen): Implement and combine prefill and image prefill into a single runtime.
+            raise NotImplementedError(f"Image prefill at {self.__class__.__name__} is not supported yet.")
+
         self.prefill_decoder = RBLNRuntimeModel(
             runtime=self.model[0],
             phase="prefill",
@@ -230,7 +236,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
         quantization=None,
         phase: str = "prefill",
-    ):
+    ) -> rebel.RBLNCompiledModel:
         try:
             wrapped_model.phase = phase
             if quantization:
@@ -252,21 +258,15 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             quantization.maybe_reset_quantization_env()

     @classmethod
-    def _get_compile_context(
-        cls,
-        compile_config: RBLNCompileConfig,
-        example_inputs: List[torch.Tensor],
-    ):
+    def _get_compile_context(cls, compile_config: RBLNCompileConfig, example_inputs: List[torch.Tensor]):
         context = CompileContext(use_weight_sharing=True)

         # Mark static tensors (self kv states)
         static_tensors = {}
-        idx = 0
         for (name, _, _), tensor in zip(compile_config.input_info, example_inputs):
             if "past_key_values" in name:
                 static_tensors[name] = tensor
-                context.mark_static_address(tensor, f"kv_cache_{idx}")
-                idx += 1
+                context.mark_static_address(tensor, name)

         return context, static_tensors

@@ -281,7 +281,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
         context, static_tensors = cls._get_compile_context(prefill_compile_config, prefill_example_inputs)

-        compiled_models = {}
+        compiled_models: dict[str, rebel.RBLNCompiledModel] = {}
         compiled_models["prefill"] = cls._compile_model(
             wrapped_model,
             prefill_compile_config,
@@ -292,9 +292,27 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             phase="prefill",
         )

+        if rbln_config.use_image_prefill:
+            image_prefill_compile_config = rbln_config.compile_cfgs[rbln_config.image_prefill_runtime_idx]
+            image_prefill_example_inputs = image_prefill_compile_config.get_dummy_inputs(
+                fill=0, static_tensors=static_tensors
+            )
+            compiled_image_prefill = cls._compile_model(
+                wrapped_model,
+                image_prefill_compile_config,
+                image_prefill_example_inputs,
+                context,
+                rbln_config,
+                rbln_config.quantization,
+                phase="image_prefill",
+            )
+            compiled_models["image_prefill"] = compiled_image_prefill
+
         if rbln_config.can_generate:
             wrapped_model.phase = "decode"
-            for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_config.compile_cfgs[1:]):
+            for batch_size, dec_compile_config in zip(
+                rbln_config.decoder_batch_sizes, rbln_config.compile_cfgs[rbln_config.decoder_runtime_idx :]
+            ):
                 dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
                 compiled_decoder = cls._compile_model(
                     wrapped_model,
@@ -307,14 +325,10 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
                 )
                 compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder

-        # check if the memory is enough to have additional blocks
-        required_num_blocks = (rbln_config.max_seq_len // rbln_config.kvcache_block_size) * rbln_config.batch_size
-        if rbln_config.kvcache_num_blocks < required_num_blocks:
-            cls.maybe_suggest_kvcache_num_blocks(
-                compiled_models=compiled_models,
-                model_config=model.config,
-                rbln_config=rbln_config,
-            )
+        if rbln_config.is_auto_num_blocks:
+            if not is_compiler_supports_buffer_resize():
+                raise RuntimeError("`kvcache_num_blocks` must be set.")
+            cls.set_kvcache_num_blocks_after_compilation(compiled_models, rbln_config)

         return compiled_models

@@ -330,8 +344,8 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         return model

     @classmethod
-    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
-        return use_local_attention
+    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True, logits_to_keep: int = None):
+        return is_prefill and (use_local_attention or logits_to_keep == 1)

     @classmethod
     def get_input_info(
@@ -350,7 +364,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

         input_info = []
         if rbln_config.use_inputs_embeds:
-            input_info.append(("inputs_embeds", [batch_size, query_length, hidden_size], rbln_config.torch_dtype))
+            input_info.append(("inputs_embeds", [batch_size, query_length, hidden_size], rbln_config.dtype))
         else:
             input_info.append(("input_ids", [batch_size, query_length], "int64"))

@@ -364,15 +378,15 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         if rbln_config.use_local_attention:
             input_info.append(("local_block_tables", [1] if is_prefill else [batch_size, 1], "int16"))

-        if cls.use_query_position(rbln_config.use_local_attention, is_prefill):
+        if cls.use_query_position(rbln_config.use_local_attention, is_prefill, rbln_config.logits_to_keep):
             input_info.append(("query_position", [], "int16"))

         if rbln_config.use_attention_mask:
             if rbln_config.use_position_ids:
-                input_info.append(("attention_mask", [batch_size, rbln_config.max_seq_len], rbln_config.torch_dtype))
+                input_info.append(("attention_mask", [batch_size, rbln_config.max_seq_len], rbln_config.dtype))
             else:
                 input_info.append(
-                    ("attention_mask", [batch_size, 1, query_length, rbln_config.max_seq_len], rbln_config.torch_dtype)
+                    ("attention_mask", [batch_size, 1, query_length, rbln_config.max_seq_len], rbln_config.dtype)
                 )

         if rbln_config.use_position_ids:
@@ -381,29 +395,36 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

         if rbln_config.use_lora:
             input_info.append(("lora_int_ids", [batch_size], "int32"))

-        kvcache_dtype = rbln_config.torch_dtype
-        if rbln_config.quantization and rbln_config.quantization.kv_caches == "fp8":
-            kvcache_dtype = "float8_e4m3fn"
+        if len(rbln_config.kvcache_metas) > 0:
+            # Meta is already set, use it
+            input_info.extend(
+                [
+                    (kvcache_meta.name, kvcache_meta.compile_shape, kvcache_meta.dtype)
+                    for kvcache_meta in rbln_config.kvcache_metas
+                ]
+            )

-        global_kvcache_shape = [
-            rbln_config.kvcache_num_blocks,
-            num_key_value_heads,
-            rbln_config.kvcache_block_size,
-            head_dim,
-        ]
-        local_kvcache_shape = [rbln_config.batch_size, num_key_value_heads, rbln_config.sliding_window, head_dim]
-        input_info.extend(
-            [
-                (
-                    f"past_key_values_{i}",
-                    local_kvcache_shape
-                    if rbln_config.sliding_window is not None and ((i // 2) in rbln_config.sliding_window_layers)
-                    else global_kvcache_shape,
-                    kvcache_dtype,
+        else:
+            kvcache_dtype = rbln_config.dtype
+            if rbln_config.quantization and rbln_config.quantization.kv_caches == "fp8":
+                kvcache_dtype = "float8_e4m3fn"
+
+            kvcache_metas = []
+            for i in range(num_hidden_layers * 2):
+                layer_idx = i // 2
+                name = f"past_key_values_{i}"
+                kvcache_meta = KVCacheMeta.make(
+                    name,
+                    layer_idx,
+                    num_key_value_heads,
+                    head_dim,
+                    RBLNCompileConfig.normalize_dtype(kvcache_dtype),
+                    rbln_config,
                 )
-                for i in range(num_hidden_layers * 2)
-            ]
-        )
+                kvcache_metas.append(kvcache_meta)
+                input_info.append((name, kvcache_meta.compile_shape, kvcache_meta.dtype))
+
+            rbln_config.kvcache_metas.extend(kvcache_metas)

         return input_info

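The removed branch makes the two cache layouts explicit, and KVCacheMeta now carries the same shape and dtype information per tensor: paged global caches shaped [num_blocks, num_kv_heads, block_size, head_dim] and per-batch sliding-window caches shaped [batch_size, num_kv_heads, sliding_window, head_dim], two tensors (key and value) per layer. A standalone sketch of that shape selection, with made-up sizes:

    # Restatement of the removed shape logic; all sizes are illustrative.
    num_key_value_heads, head_dim = 8, 128
    kvcache_num_blocks, kvcache_block_size = 17, 1024  # paged (global) layers
    batch_size, sliding_window = 2, 512                # sliding-window (local) layers
    sliding_window_layers = {1, 3}                     # layer indices using local attention

    def kvcache_shape(i: int) -> list:
        # Two cache tensors per layer (key and value), so the layer index is i // 2.
        if (i // 2) in sliding_window_layers:
            return [batch_size, num_key_value_heads, sliding_window, head_dim]
        return [kvcache_num_blocks, num_key_value_heads, kvcache_block_size, head_dim]

    print(kvcache_shape(0))  # past_key_values_0 -> [17, 8, 1024, 128] (global, layer 0)
    print(kvcache_shape(2))  # past_key_values_2 -> [2, 8, 512, 128] (local, layer 1)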
@@ -475,51 +496,39 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             max_seq_len=rbln_config.max_seq_len,
         )

-        num_full_blocks = (rbln_config.max_seq_len // rbln_config.kvcache_block_size) * rbln_config.batch_size
-
-        # Update kvcache_num_blocks based on the attention implementation.
+        # Validate kvcache_num_blocks based on the number of full blocks required.
+        # Eager mode restriction:
+        # - num_blocks must be at least equal to the batch size
+        # Flash attention restriction:
+        # - num_blocks must be at least equal to (max_seq_len // kvcache_block_size) + 1
+        # - num_blocks must be no greater than the number of full blocks.
         if rbln_config.attn_impl == "flash_attn":
-            estimated_max_num_blocks = cls.get_maximum_num_blocks_by_model(
-                model=model, model_config=model_config, rbln_config=rbln_config
-            )
+            if rbln_config.is_auto_num_blocks:
+                # Do nothing
+                pass

-            if rbln_config.kvcache_num_blocks is None:
-                if estimated_max_num_blocks < num_full_blocks:
-                    # lower bound of the number of blocks for flash attention.
-                    min_blocks_for_flash = min(
-                        rbln_config.max_seq_len // rbln_config.kvcache_block_size + 1, num_full_blocks
+            else:
+                if rbln_config.kvcache_num_blocks > rbln_config.num_full_blocks:
+                    logger.warning(
+                        f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
+                        f" than the required number of blocks ({rbln_config.num_full_blocks})."
+                        "This can cause a failure during model compilation."
+                    )
+                elif rbln_config.kvcache_num_blocks < rbln_config.num_min_blocks:
+                    raise ValueError(
+                        f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is less"
+                        f" than the minimum number of blocks ({rbln_config.num_min_blocks})."
                     )
-                    if min_blocks_for_flash > estimated_max_num_blocks:
-                        # NOTE: Just try to compile with lower bound of blocks for flash attention.
-                        # Even if it's larger than the estimated maximum number of blocks.
-                        rbln_config.kvcache_num_blocks = min_blocks_for_flash
-                    else:
-                        logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")
-                        rbln_config.kvcache_num_blocks = estimated_max_num_blocks
-
-                    if rbln_config.kvcache_num_blocks < rbln_config.batch_size:
-                        raise RuntimeError(
-                            f"Batch size ({rbln_config.batch_size}) exceeds num_blocks ({rbln_config.kvcache_num_blocks}). "
-                            "Ensure the number of blocks is at least equal to the batch size."
-                        )
-                else:
-                    rbln_config.kvcache_num_blocks = num_full_blocks
-            elif rbln_config.kvcache_num_blocks > estimated_max_num_blocks:
-                logger.warning(
-                    f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
-                    f" than the estimated maximum number of blocks ({estimated_max_num_blocks})."
-                    "This can cause a failure during model compilation."
-                )
         else:
-            if rbln_config.kvcache_num_blocks is None:
-                rbln_config.kvcache_num_blocks = num_full_blocks
-            elif rbln_config.kvcache_num_blocks > num_full_blocks:
+            if rbln_config.is_auto_num_blocks:
+                # Eager attention should use fixed number of blocks.
+                rbln_config.kvcache_num_blocks = rbln_config.num_full_blocks
+            elif rbln_config.kvcache_num_blocks > rbln_config.num_full_blocks:
                 logger.warning(
                     f"The set `kvcache_num_blocks` ({rbln_config.kvcache_num_blocks}) is greater"
-                    f" than the required number of blocks ({num_full_blocks})."
+                    f" than the required number of blocks ({rbln_config.num_full_blocks})."
                     "This can cause a failure during model compilation."
                 )
-        logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

         return rbln_config

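The bounds named in the new comments follow from the paging arithmetic visible in this diff: the old code computed num_full_blocks = (max_seq_len // kvcache_block_size) * batch_size, and the flash-attention minimum is one full sequence of blocks plus one spare. A worked example with assumed sizes (num_full_blocks and num_min_blocks are now rbln_config properties; the formulas below restate them for illustration only):

    # Worked example of the kvcache_num_blocks bounds; values are illustrative.
    max_seq_len, kvcache_block_size, batch_size = 8192, 1024, 2

    num_full_blocks = (max_seq_len // kvcache_block_size) * batch_size  # 8 * 2 = 16
    min_blocks_flash = max_seq_len // kvcache_block_size + 1            # 8 + 1 = 9

    # With flash attention, a user-set kvcache_num_blocks below 9 raises a
    # ValueError above, while a value above 16 only triggers a warning.
    assert min_blocks_flash <= 12 <= num_full_blocks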
@@ -562,6 +571,22 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
         compile_cfgs = [prefill_compile_config]

+        if rbln_config.use_image_prefill:
+            if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
+                raise NotImplementedError(
+                    "Not implemented for different prefill chunk sizes between text and image prefill."
+                )
+            image_prefill_input_info = cls.get_input_info(
+                batch_size=1,
+                query_length=rbln_config.image_prefill_chunk_size,
+                rbln_config=rbln_config,
+                model_config=model_config,
+            )
+            image_prefill_compile_config = RBLNCompileConfig(
+                compiled_model_name="image_prefill", input_info=image_prefill_input_info
+            )
+            compile_cfgs.append(image_prefill_compile_config)
+
         if rbln_config.can_generate:
             for batch_size in rbln_config.decoder_batch_sizes:
                 dec_input_info = cls.get_input_info(
@@ -583,36 +608,21 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         compiled_models: List[rebel.RBLNCompiledModel],
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
     ) -> List[rebel.Runtime]:
-        expected_model_names = ["prefill"]
-        if rbln_config.can_generate:
-            expected_model_names.extend(
-                [f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes]
-            )
+        expected_model_names = rbln_config.expected_compiled_model_names
+
         if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
             cls._raise_missing_compiled_file_error(expected_model_names)

         ret_val = [
             rebel.Runtime(
-                compiled_models[0],
+                compiled_models[i],
                 tensor_type="pt",
-                device=rbln_config.device_map["prefill"],
+                device=rbln_config.device_map[model_name],
                 activate_profiler=rbln_config.activate_profiler,
                 timeout=rbln_config.timeout,
             )
+            for i, model_name in enumerate(expected_model_names)
         ]
-        if rbln_config.can_generate:
-            ret_val.extend(
-                [
-                    rebel.Runtime(
-                        compiled_models[i + 1],
-                        tensor_type="pt",
-                        device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
-                        activate_profiler=rbln_config.activate_profiler,
-                        timeout=rbln_config.timeout,
-                    )
-                    for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
-                ]
-            )
         return ret_val

     def forward(
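Runtime construction is now driven entirely by rbln_config.expected_compiled_model_names, so runtimes are created in the order the compile configs were registered. Based on the compile-config construction earlier in this diff, the list plausibly takes this shape (a sketch only; the actual property lives in configuration_decoderonly.py):

    # Hypothetical reconstruction of expected_compiled_model_names, for illustration.
    def expected_compiled_model_names(use_image_prefill, can_generate, decoder_batch_sizes):
        names = ["prefill"]
        if use_image_prefill:
            names.append("image_prefill")
        if can_generate:
            names.extend(f"decoder_batch_{bs}" for bs in decoder_batch_sizes)
        return names

    print(expected_compiled_model_names(True, True, [1, 4]))
    # ['prefill', 'image_prefill', 'decoder_batch_1', 'decoder_batch_4']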
@@ -643,15 +653,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             raise ValueError(
                 f"Batch size ({batch_size}) must be equal to the batch size of the model ({self.rbln_config.batch_size})."
             )
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)

         all_last_hidden_states = []
         all_hidden_states = (
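Both call sites now delegate to a shared helper in modeling_outputs.py. Given the inline logic it replaces, the helper is plausibly factored as follows (a sketch; the actual implementation is the new `_validate_output_hidden_states`):

    # Plausible factoring of the removed inline validation, for reference.
    def _validate_output_hidden_states(output_hidden_states, rbln_config):
        # Fall back to the compile-time setting, then reject any mismatch:
        # a model compiled without hidden-state outputs cannot produce them.
        if output_hidden_states is None:
            output_hidden_states = rbln_config.output_hidden_states
        if output_hidden_states != rbln_config.output_hidden_states:
            raise ValueError(
                f"Variable output_hidden_states {output_hidden_states} is not equal to "
                f"rbln_config.output_hidden_states {rbln_config.output_hidden_states}. "
                "Please compile again with the correct argument."
            )
        return output_hidden_states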
@@ -660,7 +662,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
                 self.rbln_config.batch_size,
                 inputs.shape[1],
                 self.config.hidden_size,
-                dtype=self.rbln_config.torch_dtype,
+                dtype=self.rbln_config.dtype,
             )
             for _ in range(self.config.num_hidden_layers + 1)
         )
@@ -700,6 +702,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
     1. Converting pre-trained transformer models to RBLN-optimized format
     2. Handling the compilation process for RBLN devices
     3. Managing inference operations for causal language modeling
+
     This class inherits from RBLNModel and implements specific methods required for
     decoder-only architectures and causal language modeling tasks.

@@ -716,10 +719,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
     def logits_last_dim(self):
         return self.config.vocab_size

-    @classmethod
-    def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
-        return is_prefill
-
     def set_lora_int_ids(self, lora_int_ids: Optional[torch.Tensor]):
         if isinstance(lora_int_ids, int):
             lora_int_ids = torch.tensor([lora_int_ids], dtype=torch.int32)
@@ -803,14 +802,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
         )
         padded_cache_lengths = torch.zeros_like(generate_idx)

-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)

         # Prefill
         if cache_position is None:
@@ -829,7 +821,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener

         all_hidden_states = (
             tuple(
-                torch.zeros(batch_size, input_len, self.config.hidden_size, dtype=self.rbln_config.torch_dtype)
+                torch.zeros(batch_size, input_len, self.config.hidden_size, dtype=self.rbln_config.dtype)
                 for _ in range(self.config.num_hidden_layers + 1)
             )
             if self.rbln_config.output_hidden_states
optimum/rbln/transformers/models/detr/__init__.py

@@ -0,0 +1,23 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .configuration_detr import RBLNDetrForObjectDetectionConfig
+from .modeling_detr import RBLNDetrForObjectDetection
+
+
+__all__ = [
+    "RBLNDetrForObjectDetectionConfig",
+    "RBLNDetrForObjectDetection",
+]
optimum/rbln/transformers/models/detr/configuration_detr.py

@@ -0,0 +1,38 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_generic import RBLNModelForImageClassificationConfig
+
+
+class RBLNDetrForObjectDetectionConfig(RBLNModelForImageClassificationConfig):
+    """
+    Configuration class for RBLNDetrForObjectDetection.
+
+    This configuration class stores the configuration parameters specific to
+    RBLN-optimized DETR models for object detection tasks.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Args:
+            image_size (Optional[Union[int, Tuple[int, int]]]): The size of input images.
+                Can be an integer for square images or a tuple (height, width).
+            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If batch_size is not a positive integer.
+        """
+        super().__init__(**kwargs)
optimum/rbln/transformers/models/detr/modeling_detr.py

@@ -0,0 +1,53 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING, Tuple, Union
+
+import torch
+from transformers.models.detr.modeling_detr import DetrObjectDetectionOutput
+
+from ...modeling_generic import RBLNModelForImageClassification
+
+
+if TYPE_CHECKING:
+    pass
+
+
+class RBLNDetrForObjectDetection(RBLNModelForImageClassification):
+    """
+    RBLN optimized DETR model for object detection tasks.
+
+    This class provides hardware-accelerated inference for DETR models
+    on RBLN devices, supporting object detection with detection heads
+    designed for object detection tasks.
+    """
+
+    def forward(
+        self, pixel_values: torch.Tensor, return_dict: bool = None, **kwargs
+    ) -> Union[Tuple, DetrObjectDetectionOutput]:
+        """
+        Forward pass for the RBLN-optimized DETR model for object detection.
+
+        Args:
+            pixel_values (torch.FloatTensor of shape (batch_size, channels, height, width)): The tensors corresponding to the input images.
+            return_dict (bool, *optional*, defaults to True): Whether to return a dictionary of outputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DetrObjectDetectionOutput object.
+        """
+        output = self.model[0](pixel_values=pixel_values, **kwargs)
+        return DetrObjectDetectionOutput(
+            logits=output[0], pred_boxes=output[1], last_hidden_state=output[2], encoder_last_hidden_state=output[3]
+        )
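For orientation, the new class follows the same export flow as other optimum-rbln models. A hypothetical usage sketch (not part of this diff; the checkpoint name, the root-level re-export, and the input resolution are assumptions, and the input shape must match the image size the model was compiled with):

    # Hypothetical usage of the new DETR support; names and shapes are illustrative.
    import torch
    from optimum.rbln import RBLNDetrForObjectDetection

    # export=True compiles the Hugging Face checkpoint for RBLN NPUs.
    model = RBLNDetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", export=True)

    pixel_values = torch.randn(1, 3, 800, 800)  # must match the compiled image size
    outputs = model(pixel_values=pixel_values)
    print(outputs.logits.shape, outputs.pred_boxes.shape)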
optimum/rbln/transformers/models/exaone/exaone_architecture.py

@@ -18,9 +18,6 @@ import torch.nn as nn

 from ....utils import logging
 from ...models.decoderonly.decoderonly_architecture import (
-    DecoderOnlyAttention,
-    DecoderOnlyLayer,
-    DecoderOnlyModel,
     DecoderOnlyWrapper,
 )

@@ -42,36 +39,3 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):

     def get_model_layer(self, causal_lm: "ExaoneForCausalLM"):
         return causal_lm.transformer
-
-    def get_rbln_attn_class(self):
-        return ExaoneAttention
-
-    def get_rbln_layer_class(self):
-        return ExaoneLayer
-
-    def get_rbln_model_class(self):
-        return ExaoneModel
-
-
-class ExaoneModel(DecoderOnlyModel):
-    def get_embedding(self) -> nn.Embedding:
-        return self._original_mod.wte
-
-    def get_last_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_f
-
-
-class ExaoneLayer(DecoderOnlyLayer):
-    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_1
-
-    def get_post_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_2
-
-
-class ExaoneAttention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.o_proj = self._original_mod.out_proj
optimum/rbln/transformers/models/gemma/gemma_architecture.py

@@ -24,4 +24,4 @@ class GemmaWrapper(DecoderOnlyWrapper):
 class GemmaModel(DecoderOnlyModel):
     @property
     def hidden_multiplier(self):
-        return self._original_mod.config.hidden_size**0.5
+        return self.config.hidden_size**0.5
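`hidden_multiplier` is the sqrt(hidden_size) factor Gemma applies to its input embeddings; the change only reroutes the config lookup through the wrapper itself. Numerically (an illustrative size, not tied to a specific checkpoint):

    # Gemma scales input embeddings by sqrt(hidden_size).
    hidden_size = 2048
    print(hidden_size**0.5)  # 45.254833995939045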
optimum/rbln/transformers/models/gemma2/__init__.py

@@ -0,0 +1,16 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_gemma2 import RBLNGemma2ForCausalLMConfig, RBLNGemma2ModelConfig
+from .modeling_gemma2 import RBLNGemma2ForCausalLM, RBLNGemma2Model