PyPI - optimum-rbln - Versions diffs - 0.8.1a1__tar.gz → 0.8.1a3__tar.gz - Mend

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (267) hide show

optimum_rbln-0.8.1a3/.github/version.yaml ADDED Viewed

	@@ -0,0 +1 @@
1	+ rebel_compiler_version: 0.8.1.dev142+gab6ad3c7

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.8.1a1
+Version: 0.8.1a3
 Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai
@@ -28,7 +28,7 @@ Requires-Dist: packaging>=24.1
 Requires-Dist: torch==2.6.0
 Requires-Dist: torchaudio<=2.6.0
 Requires-Dist: torchvision<=0.21.0
-Requires-Dist: transformers==4.50.3
+Requires-Dist: transformers==4.51.3
 Description-Content-Type: text/markdown

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/pyproject.toml RENAMED Viewed

@@ -32,7 +32,7 @@ dependencies = [
     "torchaudio<=2.6.0",
     "torchvision<=0.21.0",
     "accelerate>=1.0.1",
-    "transformers==4.50.3",
+    "transformers==4.51.3",
     "diffusers<=0.31.0",
     "packaging>=24.1",
 ]

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/__version__.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.8.1a1'
-__version_tuple__ = version_tuple = (0, 8, 1, 'a1')
+__version__ = version = '0.8.1a3'
+__version_tuple__ = version_tuple = (0, 8, 1, 'a3')

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/modeling_rope_utils.py RENAMED Viewed

@@ -48,10 +48,13 @@ def _compute_default_rope_parameters(
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim") and config.head_dim is not None
+        else config.hidden_size // config.num_attention_heads
+    )
     dim = int(head_dim * partial_rotary_factor)
     attention_factor = 1.0  # Unused in this type of RoPE

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/auto/auto_factory.py RENAMED Viewed

@@ -167,6 +167,11 @@ class _BaseAutoModelClass:
         rbln_cls = cls.get_rbln_cls(model_id, *args, **kwargs)
         return rbln_cls.from_pretrained(model_id, *args, **kwargs)
+    @classmethod
+    def from_model(cls, model, *args, **kwargs):
+        rbln_cls = get_rbln_model_cls(f"RBLN{model.__class__.__name__}")
+        return rbln_cls.from_model(model, *args, **kwargs)
     @staticmethod
     def register(rbln_cls: Type[RBLNBaseModel], exist_ok=False):
         """

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/bart/bart_architecture.py RENAMED Viewed

@@ -22,6 +22,7 @@ from transformers.modeling_attn_mask_utils import (
 from transformers.utils import logging
 from ..seq2seq.seq2seq_architecture import (
+    Seq2SeqCrossAttention,
     Seq2SeqDecoder,
     Seq2SeqDecoderLayer,
     Seq2SeqDecoderWrapper,
@@ -45,7 +46,8 @@ class BartDecoderWrapper(Seq2SeqDecoderWrapper):
         new_layers = []
         for layer in model.get_decoder().layers:
             self_attn = BartSelfAttention(layer.self_attn, use_attention_mask=self.use_attention_mask)
-            new_layers.append(BartDecoderLayer(layer, self_attn))
+            cross_attn = BartCrossAttention(layer.encoder_attn)
+            new_layers.append(BartDecoderLayer(layer, self_attn, cross_attn))
         decoder_model = BartDecoder(model.get_decoder(), new_layers)
         new_model = BartForConditionalGeneration(model, decoder_model)
@@ -153,3 +155,14 @@ class BartSelfAttention(Seq2SeqSelfAttention):
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
         return query_states, key_states, value_states
+class BartCrossAttention(Seq2SeqCrossAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.out_proj = self._original_mod.out_proj
+        self.num_heads = self._original_mod.num_heads
+        self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
+        self.embed_dim = self._original_mod.embed_dim

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py RENAMED Viewed

@@ -177,8 +177,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 )
             elif block_tables is None and local_block_tables is None:
                 return False
-        else:
-            return True
+        return True
     def forward(
         self,

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py RENAMED Viewed

@@ -13,7 +13,11 @@
 # limitations under the License.
+import inspect
+from typing import Any, Callable
 from transformers import AutoModelForCausalLM
+from transformers.generation.utils import GenerationMixin
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
@@ -85,8 +89,19 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
     _hf_class = AutoModelForCausalLM
+    _supports_cache_class = True
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         kwargs.setdefault("trust_remote_code", True)
         return super().from_pretrained(*args, **kwargs)
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+        val = getattr(GenerationMixin, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/gemma3/modeling_gemma3.py RENAMED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import inspect
 from collections import deque
 from dataclasses import dataclass
@@ -123,6 +124,23 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
     def can_generate(self):
         return True
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+        with no_init_weights():
+            model_cls_name = model.model.language_model.__class__.__name__
+            causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
+            causal_model_cls = getattr(importlib.import_module("transformers"), causal_model_cls_name)
+            new_language_model = causal_model_cls(model.model.language_model.config)
+        new_language_model.lm_head = model.lm_head
+        new_language_model.model = model.model.language_model
+        model.model.language_model = new_language_model
+        model.lm_head = None
+        del model.lm_head
+        return model
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]
@@ -541,7 +559,7 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
         (
             inputs,
             cache_position,
-            chunked_attention_mask,
+            padded_attention_mask,
             out_buffers,
             position_ids,
             position_embed,
@@ -553,7 +571,7 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
         )
         if not is_external_block_tables:
             local_block_tables = torch.tensor([batch_idx], dtype=torch.int16)
-            self.dec_attn_mask[batch_idx : batch_idx + 1] = chunked_attention_mask[:1]
+            self.dec_attn_mask[batch_idx : batch_idx + 1] = padded_attention_mask[:1]
         if self.rbln_config.use_attention_mask and self.rbln_config.use_position_ids:
             chunked_attention_mask = torch.zeros(1, self.rbln_config.max_seq_len, dtype=torch.float32)
@@ -569,18 +587,10 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
                 else None
             )
-            # Not used in Gemma3 yet.
             if self.rbln_config.use_attention_mask:
                 if self.rbln_config.use_position_ids:
-                    chunked_attention_mask[0, step : step + self.rbln_config.prefill_chunk_size] = self.dec_attn_mask[
-                        batch_idx, step : step + self.rbln_config.prefill_chunk_size
-                    ]
-                else:
-                    # Update attention mask to ensure proper causal behavior
-                    if step >= self.rbln_config.prefill_chunk_size:
-                        chunked_attention_mask[:, :, :, step - self.rbln_config.prefill_chunk_size : step] = 1
-                    chunked_attention_mask[:, :, :, step : step + self.rbln_config.prefill_chunk_size] = (
-                        self.causal_mask
+                    chunked_attention_mask[0, step : step + self.rbln_config.prefill_chunk_size] = (
+                        padded_attention_mask[0, step : step + self.rbln_config.prefill_chunk_size]
                     )
             # Define query position

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py RENAMED Viewed

@@ -168,7 +168,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
     ):
         # If you are unavoidably running on a CPU rather than an RBLN device,
         # store the torch tensor, weight, etc. in this function.
         save_dict = {}
         save_dict["image_newline"] = model.image_newline
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/midm/modeling_midm.py RENAMED Viewed

@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
+from typing import Any, Callable
 from transformers import AutoModelForCausalLM
+from transformers.generation.utils import GenerationMixin
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
@@ -84,8 +88,19 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     _decoder_wrapper_cls = MidmLMHeadModelWrapper
     _hf_class = AutoModelForCausalLM
+    _supports_cache_class = True
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         kwargs.setdefault("trust_remote_code", True)
         return super().from_pretrained(*args, **kwargs)
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+        val = getattr(GenerationMixin, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py RENAMED Viewed

@@ -148,7 +148,8 @@ class Seq2SeqDecoderWrapper(nn.Module):
         new_layers = []
         for layer in model.get_decoder().layers:
             self_attn = Seq2SeqSelfAttention(layer.self_attn)
-            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn))
+            cross_attn = Seq2SeqCrossAttention(layer.encoder_attn)
+            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn, cross_attn))
         decoder_model = Seq2SeqDecoder(model.get_decoder(), new_layers)
         new_model = Seq2SeqForConditionalGeneration(model, decoder_model)
@@ -341,10 +342,11 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         self_attn (Seq2SeqSelfAttention): Modified self-attention layer optimized for RBLN
     """
-    def __init__(self, decoder_layer, self_attn):
+    def __init__(self, decoder_layer, self_attn, cross_attn):
         super().__init__()
         self._original_mod = decoder_layer
         self.self_attn = self_attn
+        self.cross_attn = cross_attn
         self.__post_init__()
     def __post_init__(self, **kwargs):
@@ -402,7 +404,8 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         # Cross-Attention Block
         residual = hidden_states
         hidden_states = self.pre_cross_attn_layer_norm(hidden_states)
-        cross_attn_output = self.encoder_attn(
+        cross_attn_output = self.cross_attn(
             hidden_states=hidden_states,
             past_key_value=cross_past_key_value,
             attention_mask=encoder_attention_mask,
@@ -487,3 +490,38 @@ class Seq2SeqSelfAttention(nn.Module):
         attn_output = self.out_proj(attn_output)
         return attn_output
+class Seq2SeqCrossAttention(nn.Module):
+    def __init__(self, attn, **kwargs):
+        super().__init__()
+        self._original_mod = attn
+        self.__post_init__(**kwargs)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: torch.Tensor = None,
+        past_key_value: Optional[object] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        bsz, tgt_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        is_cross_attention = key_value_states is not None
+        if is_cross_attention:
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, None, past_key_value

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/src/optimum/rbln/transformers/models/t5/t5_architecture.py RENAMED Viewed

@@ -136,10 +136,14 @@ class T5Decoder(Seq2SeqDecoder):
 class T5Block(Seq2SeqDecoderLayer):
+    def __init__(self, decoder_layer, self_attn):
+        super().__init__(decoder_layer, self_attn, cross_attn=None)
+        self.__post_init__()
     def __post_init__(self):
         self.self_attn_layer_norm = self._original_mod.layer[0].layer_norm
         self.encoder_attn_layer_norm = self._original_mod.layer[1].layer_norm
-        self.encoder_attn = T5CrossAttention(self._original_mod.layer[1].EncDecAttention)
+        self.cross_attn = T5CrossAttention(self._original_mod.layer[1].EncDecAttention)
         self.ff_layer = self._original_mod.layer[2]
     def pre_self_attn_layer_norm(self, hidden_states):

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/tests/test_llm.py RENAMED Viewed

@@ -67,7 +67,7 @@ class LLMTest:
 class TestQwen2Model(LLMTest.TestLLM):
     RBLN_CLASS = RBLNQwen2ForCausalLM
     HF_MODEL_ID = "Qwen/Qwen2-0.5B-Instruct"
-    EXPECTED_OUTPUT = " I am a 30-year-old woman who has been living with lupus for over 1"
+    EXPECTED_OUTPUT = " I am a 20 year old girl from the United States. I have been studying English for"
     HF_CONFIG_KWARGS = {"max_position_embeddings": 1024}
@@ -108,7 +108,7 @@ class TestLlamaForCausalLM_Flash(LLMTest.TestLLM):
 class TestLlamaForCausalLM_Multibatch(TestLlamaForCausalLM):
     PROMPT = ["Who are you?", "What is the capital of France?", "What is the capital of Germany?"]
     EXPECTED_OUTPUT = [
-        "reress makefable R���� noethetsshss rechoolso�",
+        "reress makefable R���� noethetss0oss invetetet",
         "resget makeget makeichget makeichualichual#choolchool accngngngng",
         "resget makeget makeichget makeichualichual#choolchool accngngngng",
     ]

{optimum_rbln-0.8.1a1 → optimum_rbln-0.8.1a3}/tests/test_transformers.py RENAMED Viewed

@@ -247,6 +247,7 @@ class TestWhisperModel(BaseTest.TestModel):
                 data,
                 generate_kwargs={
                     "repetition_penalty": 1.3,
+                    "num_beams": 1,
                 },
                 batch_size=2,
             )

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a3__tar.gz

optimum-rbln 0.8.1a1__tar.gz → 0.8.1a3__tar.gz