optimum-rbln 0.8.2a7__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of optimum-rbln might be problematic.
- optimum/rbln/__init__.py +36 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/configuration_utils.py +20 -4
- optimum/rbln/diffusers/__init__.py +7 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/pipelines/__init__.py +11 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +237 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +11 -6
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +14 -18
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +3 -2
- optimum/rbln/modeling_base.py +29 -4
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/transformers/__init__.py +28 -0
- optimum/rbln/transformers/configuration_generic.py +6 -4
- optimum/rbln/transformers/modeling_generic.py +13 -8
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +35 -16
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +14 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +1 -3
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +8 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +7 -6
- optimum/rbln/transformers/models/clip/configuration_clip.py +3 -3
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +43 -174
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +102 -93
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +450 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +88 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +297 -987
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +25 -0
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +9 -0
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +14 -3
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +217 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +64 -258
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +2 -0
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +86 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +507 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1032 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
- optimum/rbln/transformers/models/llama/modeling_llama.py +12 -3
- optimum/rbln/transformers/models/llava/configuration_llava.py +2 -2
- optimum/rbln/transformers/models/llava/modeling_llava.py +53 -14
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -30
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +4 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +2 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +1 -3
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +2 -2
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -4
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +6 -15
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +4 -7
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +77 -3
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +1 -4
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +19 -2
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +20 -1
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +341 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -14
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +10 -2
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +20 -1
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/utils/rbln_quantization.py +365 -65
- optimum/rbln/utils/runtime_utils.py +3 -3
- optimum/rbln/utils/submodule.py +10 -4
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/RECORD +105 -89
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/idefics3/configuration_idefics3.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -39,7 +39,7 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_model: Optional[RBLNModelConfig] = None,
         text_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/idefics3/modeling_idefics3.py

```diff
@@ -34,17 +34,11 @@ from transformers.models.idefics3.modeling_idefics3 import Idefics3CausalLMOutpu
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from
-    RBLNDecoderOnlyForCausalLMOutput,
-)
+from ...modeling_outputs import RBLNDecoderOnlyOutput
 
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
 class RBLNRuntimeVisionModel(RBLNPytorchRuntime):
@@ -494,7 +488,7 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
```
optimum/rbln/transformers/models/llama/modeling_llama.py

```diff
@@ -85,11 +85,20 @@ class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
 class RBLNLlamaModel(RBLNDecoderOnlyModel):
     """
-    The Llama Model transformer
+    The Llama Model transformer outputting raw hidden-states without any specific head on top.
     This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
-    A class to convert and run pre-trained transformers based LlamaModel
-    It implements the methods to convert a pre-trained transformers LlamaModel
+    A class to convert and run pre-trained transformers based LlamaModel on RBLN devices.
+    It implements the methods to convert a pre-trained transformers LlamaModel into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNLlamaModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNLlamaModelConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNLlamaModelConfig`] class for all available configuration options.
     """
 
     _decoder_wrapper_cls = LlamaWrapper
```
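The rewritten `RBLNLlamaModel` docstring points readers at `RBLNLlamaModelConfig` but, unlike the Qwen3 hunks later in this diff, carries no usage example. Below is a minimal sketch of the documented flow, assuming the same `rbln_*` keyword convention the Qwen3 docstrings use; the checkpoint id and parallelism values are illustrative, not taken from this release:

```python
# Hedged sketch only; mirrors the rbln_* argument style shown in the Qwen3
# docstrings added by this release. Checkpoint id and sizes are assumptions.
from optimum.rbln import RBLNLlamaModel

model = RBLNLlamaModel.from_pretrained(
    "meta-llama/Llama-3.1-8B",   # illustrative checkpoint
    export=True,                 # convert and compile to an RBLN graph
    rbln_batch_size=1,
    rbln_tensor_parallel_size=4,
)
model.save_pretrained("compiled-llama")  # persist the compiled artifact
```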
optimum/rbln/transformers/models/llava/configuration_llava.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -33,7 +33,7 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_tower: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/llava/modeling_llava.py

````diff
@@ -16,30 +16,20 @@ import inspect
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 
 import torch
-from transformers import (
-    AutoModelForImageTextToText,
-    LlavaForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
-)
+from transformers import AutoModelForImageTextToText, LlavaForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from
+from ...modeling_outputs import RBLNDecoderOnlyOutput
 
 
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class LoopVisionTower:
@@ -111,6 +101,55 @@ class LoopProjector:
 
 
 class RBLNLlavaForConditionalGeneration(RBLNModel):
+    """
+    RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
+    optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
+    This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    Important Note:
+        This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
+        tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
+        `from_pretrained` method. Refer to the `from_pretrained` documentation and the RBLNLlavaForConditionalGeneration class for details.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNLlavaForConditionalGeneration
+        model = RBLNLlavaForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-1.5-7b-hf",
+            export=True,
+            rbln_config={
+                "vision_tower": {"output_hidden_states": True},
+                "language_model": {
+                    "tensor_parallel_size": 4,
+                    "use_inputs_embeds": True,  # In Llava, language model must use inputs_embeds as input.
+                },
+            },
+        )
+        model.save_pretrained("compiled-llava-1.5-7b-hf")
+
+        # Using a RBLNLlavaForConditionalGenerationConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNLlavaForConditionalGenerationConfig
+        vision_config = RBLNCLIPVisionModelConfig(
+            batch_size=1,
+            output_hidden_states=True
+        )
+        language_model_config = RBLNLlamaForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=4096,
+            use_inputs_embeds=True,
+            tensor_parallel_size=4
+        )
+        llava_config = RBLNLlavaForConditionalGenerationConfig(
+            batch_size=1,
+            vision_tower=vision_config,
+            language_model=language_model_config
+        )
+        model = RBLNLlavaForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-1.5-7b-hf",
+            export=True,
+            rbln_config=llava_config
+        )
+        ```
+    """
+
     auto_model_class = AutoModelForImageTextToText
     _rbln_submodules = [
         {"name": "vision_tower"},
@@ -374,7 +413,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
````
optimum/rbln/transformers/models/llava_next/configuration_llava_next.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -38,7 +38,7 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_tower: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py

```diff
@@ -18,29 +18,19 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import (
-    AutoModelForVision2Seq,
-    LlavaNextForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
-)
+from transformers import AutoModelForVision2Seq, LlavaNextForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from ..decoderonly.modeling_decoderonly import
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyOutput
 
 
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class LoopVisionTower:
@@ -258,7 +248,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
     def _update_model_kwargs_for_generation(
         self,
-        outputs:
+        outputs: RBLNDecoderOnlyOutput,
         model_kwargs: Dict[str, Any],
         **kwargs,
     ) -> Dict[str, Any]:
@@ -359,7 +349,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         generate_idx: Optional[torch.Tensor] = None,
         batch_idx: Optional[int] = None,
         **kwargs,
-    ) -> Union[Tuple,
+    ) -> Union[Tuple, RBLNDecoderOnlyOutput]:
         vision_feature_layer = (
             vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
         )
@@ -418,7 +408,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
             cache_position=cache_position,
         )
         logits = output.logits
-        return
+        return RBLNDecoderOnlyOutput(logits=logits, generate_idx=generate_idx)
 
     # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
```
optimum/rbln/transformers/models/opt/modeling_opt.py

```diff
@@ -70,24 +70,10 @@ class RBLNOPTForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
     @classmethod
     def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
-        wrapper_cfg = {
-            "max_seq_len": rbln_config.max_seq_len,
-            "attn_impl": rbln_config.attn_impl,
-            "kvcache_partition_len": rbln_config.kvcache_partition_len,
-            "kvcache_block_size": rbln_config.kvcache_block_size,
-            "use_rotary_emb": cls._use_rotary_emb,
-            "use_attention_mask": rbln_config.use_attention_mask,
-            "use_position_ids": rbln_config.use_position_ids,
-            "use_inputs_embeds": rbln_config.use_inputs_embeds,
-            "cache_impl": rbln_config.cache_impl,
-            "sliding_window": rbln_config.sliding_window,
-            "sliding_window_layers": rbln_config.sliding_window_layers,
-        }
-
         for i in range(len(model.model.decoder.layers)):
             model.model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.model.decoder.layers[i])
 
-        return cls._decoder_wrapper_cls(model,
+        return cls._decoder_wrapper_cls(model, rbln_config=rbln_config, use_rotary_emb=cls._use_rotary_emb).eval()
 
 
 class RBLNOPTModel(RBLNDecoderOnlyModel):
@@ -110,21 +96,7 @@ class RBLNOPTModel(RBLNDecoderOnlyModel):
 
     @classmethod
     def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
-        wrapper_cfg = {
-            "max_seq_len": rbln_config.max_seq_len,
-            "attn_impl": rbln_config.attn_impl,
-            "kvcache_partition_len": rbln_config.kvcache_partition_len,
-            "kvcache_block_size": rbln_config.kvcache_block_size,
-            "use_rotary_emb": cls._use_rotary_emb,
-            "use_attention_mask": rbln_config.use_attention_mask,
-            "use_position_ids": rbln_config.use_position_ids,
-            "use_inputs_embeds": rbln_config.use_inputs_embeds,
-            "cache_impl": rbln_config.cache_impl,
-            "sliding_window": rbln_config.sliding_window,
-            "sliding_window_layers": rbln_config.sliding_window_layers,
-        }
-
         for i in range(len(model.decoder.layers)):
             model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.decoder.layers[i])
 
-        return cls._decoder_wrapper_cls(model,
+        return cls._decoder_wrapper_cls(model, rbln_config=rbln_config, use_rotary_emb=cls._use_rotary_emb).eval()
```
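Both OPT hunks delete a hand-assembled `wrapper_cfg` dict and hand the whole `rbln_config` to the wrapper instead. Below is a sketch of what that implies on the wrapper side, assuming the wrapper simply stores the config and reads flags from it on demand, as the Qwen2.5-VL architecture hunk further down does with `self.rbln_config.use_attention_mask`; the class and method here are illustrative stand-ins, not the library's actual `DecoderOnlyWrapper`:

```python
import torch

# Illustrative stand-in for the refactored wrapper: one config object
# replaces eleven copied-out fields (max_seq_len, attn_impl, cache_impl, ...).
class WrapperSketch(torch.nn.Module):
    def __init__(self, model, rbln_config, use_rotary_emb):
        super().__init__()
        self.model = model
        self.rbln_config = rbln_config        # single source of truth
        self.use_rotary_emb = use_rotary_emb  # still passed explicitly

    def forward(self, *args):
        args = list(args)
        # Flags are read from the config at call time instead of being
        # snapshotted into a dict when the wrapper is constructed.
        mask = args.pop(0) if self.rbln_config.use_attention_mask else None
        return self.model(*args, attention_mask=mask)
```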
optimum/rbln/transformers/models/pegasus/configuration_pegasus.py

```diff
@@ -24,6 +24,8 @@ class RBLNPegasusModelConfig(RBLNTransformerEncoderForFeatureExtractionConfig):
     RBLN-optimized PEGASUS models for feature extraction tasks.
     """
 
+    rbln_model_input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
+
 
 class RBLNPegasusForConditionalGenerationConfig(RBLNModelForSeq2SeqLMConfig):
     """
@@ -32,3 +34,5 @@ class RBLNPegasusForConditionalGenerationConfig(RBLNModelForSeq2SeqLMConfig):
     This configuration class stores the configuration parameters specific to
     RBLN-optimized PEGASUS models for conditional text generation tasks.
     """
+
+    support_paged_attention = True
```
optimum/rbln/transformers/models/pegasus/modeling_pegasus.py

```diff
@@ -39,6 +39,8 @@ class RBLNPegasusModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """
 
+    rbln_model_input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
+
 
 class RBLNPegasusForConditionalGeneration(RBLNModelForSeq2SeqLM):
     """
```
optimum/rbln/transformers/models/pegasus/pegasus_architecture.py

```diff
@@ -16,9 +16,7 @@ from typing import Tuple
 
 import torch
 from torch import nn
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_attention_mask,
-)
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from transformers.utils import logging
 
 from ..seq2seq.seq2seq_architecture import (
```
optimum/rbln/transformers/models/pixtral/configuration_pixtral.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional, Tuple
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -23,7 +23,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
         max_image_size: Tuple = None,
         batch_size: Optional[int] = None,
         output_hidden_states: Optional[bool] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/pixtral/modeling_pixtral.py

```diff
@@ -21,10 +21,7 @@ import torch.nn as nn
 from transformers import PixtralVisionConfig, PixtralVisionModel
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.modeling_utils import no_init_weights
-from transformers.models.pixtral.modeling_pixtral import (
-    PixtralRMSNorm,
-    PixtralRotaryEmbedding,
-)
+from transformers.models.pixtral.modeling_pixtral import PixtralRMSNorm, PixtralRotaryEmbedding
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
```
optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, List, Optional, Union
 
 from ....configuration_utils import RBLNModelConfig
 from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
@@ -33,7 +33,7 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
         self,
         visual: Optional[RBLNModelConfig] = None,
         use_inputs_embeds: bool = True,
-        **kwargs:
+        **kwargs: Any,
     ):
         super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
         if not self.use_inputs_embeds:
@@ -53,7 +53,7 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
     mechanisms for processing images and videos.
     """
 
-    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs:
+    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs: Any):
        """
        Args:
            max_seq_lens (Optional[Union[int, List[int]]]): Maximum sequence lengths for Vision
```
optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

```diff
@@ -17,12 +17,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 
 import torch
-from transformers import (
-    AutoModelForVision2Seq,
-    PretrainedConfig,
-    PreTrainedModel,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModelForVision2Seq, PretrainedConfig, PreTrainedModel, Qwen2_5_VLForConditionalGeneration
 from transformers.modeling_utils import no_init_weights
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VisionPatchEmbed,
@@ -34,7 +29,8 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
 from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from
+from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .configuration_qwen2_5_vl import (
     RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
     RBLNQwen2_5_VLForConditionalGenerationConfig,
@@ -45,12 +41,7 @@ from .qwen2_5_vl_architecture import Qwen2_5_VisionTransformerWrapper, Qwen2_5_V
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class RBLNQwen2_5_VisionTransformerPretrainedModel(RBLNModel):
@@ -595,7 +586,7 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         generate_idx: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         **kwargs,
-    ) ->
+    ) -> RBLNDecoderOnlyOutput:
         # Prefill
         if cache_position is None:
             inputs_embeds, position_embed, rope_deltas = self._preprocess_prefill(
@@ -637,7 +628,7 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
```
optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py

```diff
@@ -4,10 +4,7 @@ from typing import Tuple
 import torch
 import torch.nn as nn
 
-from ..decoderonly.decoderonly_architecture import (
-    DecoderOnlyWrapper,
-    apply_rotary_pos_emb,
-)
+from ..decoderonly.decoderonly_architecture import DecoderOnlyWrapper, apply_rotary_pos_emb
 
 
 class Qwen2_5_VisionTransformerWrapper(nn.Module):
@@ -159,15 +156,15 @@ class Qwen2_5_VLVisionWindowAttention(nn.Module):
 class Qwen2_5_VL_LanguageModelWrapper(DecoderOnlyWrapper):
     def prepare_forward_args(self, *args):
         args = list(args)
-        input_ids = None if self.use_inputs_embeds else args.pop(0)
-        inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
+        input_ids = None if self.rbln_config.use_inputs_embeds else args.pop(0)
+        inputs_embeds = args.pop(0) if self.rbln_config.use_inputs_embeds else None
         cache_position = args.pop(0)
         global_block_tables = args.pop(0)
         local_block_tables = None
         position_embeds = args.pop(0)
         query_position = args.pop(0) if self.phase == "prefill" else None
         position_ids = None
-        attention_mask = args.pop(0) if self.use_attention_mask else None
+        attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
         past_key_values = args
 
         if len(past_key_values) != 2 * self.num_hidden_layers:
```
optimum/rbln/transformers/models/qwen3/modeling_qwen3.py

````diff
@@ -28,12 +28,60 @@ from .qwen3_architecture import Qwen3Wrapper
 logger = logging.get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        PretrainedConfig,
-    )
+    from transformers import PretrainedConfig
 
 
 class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Qwen3 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3ForCausalLM model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ForCausalLMConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3ForCausalLM
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "max_seq_len": 40_960,
+            "tensor_parallel_size": 4,
+            "kvcache_partition_len": 8192,
+        }
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=rbln_config
+        )
+        # Using a RBLNQwen3ForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNQwen3ForCausalLMConfig
+        config = RBLNQwen3ForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=40_960,
+            tensor_parallel_size=4,
+            kvcache_partition_len=8192,
+        )
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=config
+        )
+        ```
+    """
+
     _decoder_wrapper_cls = Qwen3Wrapper
 
     @classmethod
@@ -55,5 +103,31 @@ class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
 
 class RBLNQwen3Model(RBLNDecoderOnlyModel):
+    """
+    The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3Model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3Model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ModelConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ModelConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3Model
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3Model.from_pretrained(
+            "Qwen/Qwen3-Embedding-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_max_seq_len=40_960,
+            rbln_tensor_parallel_size=4,
+            rbln_kvcache_partition_len=8192,
+        )
+    """
+
     _decoder_wrapper_cls = Qwen3Wrapper
     _use_rotary_emb = True
````
optimum/rbln/transformers/models/qwen3/qwen3_architecture.py

```diff
@@ -13,10 +13,7 @@
 # limitations under the License.
 
 
-from ..decoderonly.decoderonly_architecture import (
-    DecoderOnlyAttention,
-    DecoderOnlyWrapper,
-)
+from ..decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyWrapper
 
 
 class Qwen3Wrapper(DecoderOnlyWrapper):
```
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -22,6 +22,8 @@ logger = get_logger()
 
 
 class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
+    support_paged_attention = None
+
     def __init__(
         self,
         batch_size: Optional[int] = None,
@@ -29,7 +31,9 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
         pad_token_id: Optional[int] = None,
-
+        kvcache_num_blocks: Optional[int] = None,
+        kvcache_block_size: Optional[int] = None,
+        **kwargs: Any,
     ):
         """
         Args:
@@ -38,6 +42,10 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
             dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
             use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
             pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
+            kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
+                PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
+            kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
+                in the PagedAttention KV cache for the SelfAttention. Defaults to dec_max_seq_len.
             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
@@ -54,3 +62,12 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         self.use_attention_mask = use_attention_mask
 
         self.pad_token_id = pad_token_id
+
+        if self.support_paged_attention:
+            self.kvcache_num_blocks = kvcache_num_blocks
+            self.kvcache_block_size = kvcache_block_size
+        else:
+            if kvcache_num_blocks is not None or kvcache_block_size is not None:
+                raise ValueError(
+                    "You cannot set kvcache_num_blocks or kvcache_block_size as paged attention is not supported for the model."
+                )
```
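Read together with the Pegasus hunks above (`support_paged_attention = True`), the new class attribute acts as an opt-in gate: seq2seq configs that support paged attention accept the `kvcache_*` knobs, while all others reject them at construction time. A self-contained sketch of the gate, distilled from these hunks; the trimmed class bodies are illustrative, not the real implementations:

```python
from typing import Any, Optional

# Distilled from the diff above: RBLNModelForSeq2SeqLMConfig gates the new
# kvcache_* arguments on a class-level support_paged_attention flag.
class Seq2SeqConfigSketch:
    support_paged_attention = None  # subclasses opt in explicitly

    def __init__(
        self,
        kvcache_num_blocks: Optional[int] = None,
        kvcache_block_size: Optional[int] = None,
        **kwargs: Any,
    ):
        if self.support_paged_attention:
            self.kvcache_num_blocks = kvcache_num_blocks
            self.kvcache_block_size = kvcache_block_size
        elif kvcache_num_blocks is not None or kvcache_block_size is not None:
            raise ValueError(
                "You cannot set kvcache_num_blocks or kvcache_block_size "
                "as paged attention is not supported for the model."
            )

class PegasusConfigSketch(Seq2SeqConfigSketch):
    support_paged_attention = True  # PEGASUS opts in, as in this release

PegasusConfigSketch(kvcache_block_size=512)      # accepted
try:
    Seq2SeqConfigSketch(kvcache_block_size=512)  # rejected: no opt-in
except ValueError as err:
    print(err)
```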