PyPI - optimum-rbln - Versions diffs - 0.8.1a1__tar.gz → 0.8.1a2__tar.gz - Mend

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a2__tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.

Files changed (267) hide show

optimum_rbln-0.8.1a2/.github/version.yaml ADDED Viewed

	@@ -0,0 +1 @@
1	+ rebel_compiler_version: 0.8.1.dev142+gab6ad3c7

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.8.1a1
+Version: 0.8.1a2
 Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai
@@ -28,7 +28,7 @@ Requires-Dist: packaging>=24.1
 Requires-Dist: torch==2.6.0
 Requires-Dist: torchaudio<=2.6.0
 Requires-Dist: torchvision<=0.21.0
-Requires-Dist: transformers==4.50.3
+Requires-Dist: transformers==4.51.3
 Description-Content-Type: text/markdown

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/pyproject.toml RENAMED Viewed

@@ -32,7 +32,7 @@ dependencies = [
     "torchaudio<=2.6.0",
     "torchvision<=0.21.0",
     "accelerate>=1.0.1",
-    "transformers==4.50.3",
+    "transformers==4.51.3",
     "diffusers<=0.31.0",
     "packaging>=24.1",
 ]

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/__version__.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.8.1a1'
-__version_tuple__ = version_tuple = (0, 8, 1, 'a1')
+__version__ = version = '0.8.1a2'
+__version_tuple__ = version_tuple = (0, 8, 1, 'a2')

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/modeling_rope_utils.py RENAMED Viewed

@@ -48,10 +48,13 @@ def _compute_default_rope_parameters(
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim") and config.head_dim is not None
+        else config.hidden_size // config.num_attention_heads
+    )
     dim = int(head_dim * partial_rotary_factor)
     attention_factor = 1.0  # Unused in this type of RoPE

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/auto/auto_factory.py RENAMED Viewed

@@ -167,6 +167,11 @@ class _BaseAutoModelClass:
         rbln_cls = cls.get_rbln_cls(model_id, *args, **kwargs)
         return rbln_cls.from_pretrained(model_id, *args, **kwargs)
+    @classmethod
+    def from_model(cls, model, *args, **kwargs):
+        rbln_cls = get_rbln_model_cls(f"RBLN{model.__class__.__name__}")
+        return rbln_cls.from_model(model, *args, **kwargs)
     @staticmethod
     def register(rbln_cls: Type[RBLNBaseModel], exist_ok=False):
         """

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/bart/bart_architecture.py RENAMED Viewed

@@ -22,6 +22,7 @@ from transformers.modeling_attn_mask_utils import (
 from transformers.utils import logging
 from ..seq2seq.seq2seq_architecture import (
+    Seq2SeqCrossAttention,
     Seq2SeqDecoder,
     Seq2SeqDecoderLayer,
     Seq2SeqDecoderWrapper,
@@ -45,7 +46,8 @@ class BartDecoderWrapper(Seq2SeqDecoderWrapper):
         new_layers = []
         for layer in model.get_decoder().layers:
             self_attn = BartSelfAttention(layer.self_attn, use_attention_mask=self.use_attention_mask)
-            new_layers.append(BartDecoderLayer(layer, self_attn))
+            cross_attn = BartCrossAttention(layer.encoder_attn)
+            new_layers.append(BartDecoderLayer(layer, self_attn, cross_attn))
         decoder_model = BartDecoder(model.get_decoder(), new_layers)
         new_model = BartForConditionalGeneration(model, decoder_model)
@@ -153,3 +155,14 @@ class BartSelfAttention(Seq2SeqSelfAttention):
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
         return query_states, key_states, value_states
+class BartCrossAttention(Seq2SeqCrossAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.out_proj = self._original_mod.out_proj
+        self.num_heads = self._original_mod.num_heads
+        self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
+        self.embed_dim = self._original_mod.embed_dim

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py RENAMED Viewed

@@ -13,7 +13,11 @@
 # limitations under the License.
+import inspect
+from typing import Any, Callable
 from transformers import AutoModelForCausalLM
+from transformers.generation.utils import GenerationMixin
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
@@ -85,8 +89,19 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
     _hf_class = AutoModelForCausalLM
+    _supports_cache_class = True
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         kwargs.setdefault("trust_remote_code", True)
         return super().from_pretrained(*args, **kwargs)
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+        val = getattr(GenerationMixin, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/gemma3/modeling_gemma3.py RENAMED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import inspect
 from collections import deque
 from dataclasses import dataclass
@@ -123,6 +124,23 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
     def can_generate(self):
         return True
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+        with no_init_weights():
+            model_cls_name = model.model.language_model.__class__.__name__
+            causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
+            causal_model_cls = getattr(importlib.import_module("transformers"), causal_model_cls_name)
+            new_language_model = causal_model_cls(model.model.language_model.config)
+        new_language_model.lm_head = model.lm_head
+        new_language_model.model = model.model.language_model
+        model.model.language_model = new_language_model
+        model.lm_head = None
+        del model.lm_head
+        return model
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py RENAMED Viewed

@@ -168,7 +168,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
     ):
         # If you are unavoidably running on a CPU rather than an RBLN device,
         # store the torch tensor, weight, etc. in this function.
         save_dict = {}
         save_dict["image_newline"] = model.image_newline
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/midm/modeling_midm.py RENAMED Viewed

@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
+from typing import Any, Callable
 from transformers import AutoModelForCausalLM
+from transformers.generation.utils import GenerationMixin
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
@@ -84,8 +88,19 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     _decoder_wrapper_cls = MidmLMHeadModelWrapper
     _hf_class = AutoModelForCausalLM
+    _supports_cache_class = True
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         kwargs.setdefault("trust_remote_code", True)
         return super().from_pretrained(*args, **kwargs)
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+        val = getattr(GenerationMixin, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py RENAMED Viewed

@@ -28,6 +28,7 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VisionPatchEmbed,
     Qwen2_5_VisionRotaryEmbedding,
     Qwen2_5_VisionTransformerPretrainedModel,
+    Qwen2_5_VLModel,
     Qwen2_5_VLRotaryEmbedding,
 )
@@ -390,6 +391,14 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
     def can_generate(self):
         return True
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+        model.model.lm_head = model.lm_head
+        model.lm_head = None
+        del model.lm_head
+        return model
     @classmethod
     def update_kwargs(cls, kwargs):
         kwargs.update(
@@ -531,7 +540,8 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
             vision_tokens = input_id[0][vision_start_indices + 1]
             image_nums = (vision_tokens == image_token_id).sum()
             video_nums = (vision_tokens == video_token_id).sum()
-            position_ids, rope_deltas = self.get_rope_index(
+            position_ids, rope_deltas = Qwen2_5_VLModel.get_rope_index(
+                self,
                 input_id,
                 image_grid_thw[image_idx : image_idx + image_nums] if image_grid_thw is not None else None,
                 video_grid_thw[video_idx : video_idx + video_nums] if video_grid_thw is not None else None,

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py RENAMED Viewed

@@ -3,8 +3,14 @@ from typing import Tuple
 import torch
 import torch.nn as nn
+from transformers import PreTrainedModel
 from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
     DecoderOnlyWrapper,
     apply_rotary_pos_emb,
 )
@@ -197,3 +203,40 @@ class Qwen2_5_VL_LanguageModelWrapper(DecoderOnlyWrapper):
             past_key_values,
             position_embeds,
         )
+    def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
+        new_layers = []
+        for layer in causal_lm.model.language_model.layers:
+            if self.attn_impl == "eager":
+                new_self_attn = DecoderOnlyAttention(
+                    layer.self_attn,
+                    self.use_attention_mask,
+                    self.use_position_ids,
+                    kvcache_block_size=self.kvcache_block_size,
+                )
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = DecoderOnlyFlashAttention(
+                    layer.self_attn,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    kvcache_block_size=self.kvcache_block_size,
+                    use_attention_mask=self.use_attention_mask,
+                    use_position_ids=self.use_position_ids,
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = DecoderOnlyModel(
+            causal_lm.model.language_model,
+            new_layers,
+            partition_len=self.kvcache_partition_len,
+            max_seq_len=max_seq_len,
+            kvcache_block_size=self.kvcache_block_size,
+            use_learned_pos_emb=self.use_learned_pos_emb,
+            sliding_window_layers=self.sliding_window_layers,
+        )
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm.model, new_model)
+        return new_causal_lm

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py RENAMED Viewed

@@ -148,7 +148,8 @@ class Seq2SeqDecoderWrapper(nn.Module):
         new_layers = []
         for layer in model.get_decoder().layers:
             self_attn = Seq2SeqSelfAttention(layer.self_attn)
-            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn))
+            cross_attn = Seq2SeqCrossAttention(layer.encoder_attn)
+            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn, cross_attn))
         decoder_model = Seq2SeqDecoder(model.get_decoder(), new_layers)
         new_model = Seq2SeqForConditionalGeneration(model, decoder_model)
@@ -341,10 +342,11 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         self_attn (Seq2SeqSelfAttention): Modified self-attention layer optimized for RBLN
     """
-    def __init__(self, decoder_layer, self_attn):
+    def __init__(self, decoder_layer, self_attn, cross_attn):
         super().__init__()
         self._original_mod = decoder_layer
         self.self_attn = self_attn
+        self.cross_attn = cross_attn
         self.__post_init__()
     def __post_init__(self, **kwargs):
@@ -402,7 +404,8 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         # Cross-Attention Block
         residual = hidden_states
         hidden_states = self.pre_cross_attn_layer_norm(hidden_states)
-        cross_attn_output = self.encoder_attn(
+        cross_attn_output = self.cross_attn(
             hidden_states=hidden_states,
             past_key_value=cross_past_key_value,
             attention_mask=encoder_attention_mask,
@@ -487,3 +490,38 @@ class Seq2SeqSelfAttention(nn.Module):
         attn_output = self.out_proj(attn_output)
         return attn_output
+class Seq2SeqCrossAttention(nn.Module):
+    def __init__(self, attn, **kwargs):
+        super().__init__()
+        self._original_mod = attn
+        self.__post_init__(**kwargs)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: torch.Tensor = None,
+        past_key_value: Optional[object] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        bsz, tgt_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        is_cross_attention = key_value_states is not None
+        if is_cross_attention:
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, None, past_key_value

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/src/optimum/rbln/transformers/models/t5/t5_architecture.py RENAMED Viewed

@@ -136,10 +136,14 @@ class T5Decoder(Seq2SeqDecoder):
 class T5Block(Seq2SeqDecoderLayer):
+    def __init__(self, decoder_layer, self_attn):
+        super().__init__(decoder_layer, self_attn, cross_attn=None)
+        self.__post_init__()
     def __post_init__(self):
         self.self_attn_layer_norm = self._original_mod.layer[0].layer_norm
         self.encoder_attn_layer_norm = self._original_mod.layer[1].layer_norm
-        self.encoder_attn = T5CrossAttention(self._original_mod.layer[1].EncDecAttention)
+        self.cross_attn = T5CrossAttention(self._original_mod.layer[1].EncDecAttention)
         self.ff_layer = self._original_mod.layer[2]
     def pre_self_attn_layer_norm(self, hidden_states):

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a2}/tests/test_transformers.py RENAMED Viewed

@@ -247,6 +247,7 @@ class TestWhisperModel(BaseTest.TestModel):
                 data,
                 generate_kwargs={
                     "repetition_penalty": 1.3,
+                    "num_beams": 1,
                 },
                 batch_size=2,
             )

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a2__tar.gz

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a2__tar.gz