PyPI - optimum-rbln - Versions diffs - 0.7.4a5__tar.gz → 0.7.4a7__tar.gz - Mend

optimum-rbln 0.7.4a5__tar.gz → 0.7.4a7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.7.4a5
+Version: 0.7.4a7
 Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai

optimum_rbln-0.7.4a7/examples/image-to-text/run_idefics3.py ADDED Viewed

@@ -0,0 +1,67 @@
+import os
+import typing
+import fire
+from datasets import load_dataset
+from transformers import AutoProcessor
+from optimum.rbln import RBLNIdefics3ForConditionalGeneration
+def main(
+    model_id: str = "HuggingFaceM4/Idefics3-8B-Llama3",
+    batch_size: int = 1,
+    from_transformers: bool = False,
+    prompt: typing.Optional[str] = None,
+    max_seq_len: typing.Optional[int] = None,
+    tensor_parallel_size: typing.Optional[int] = 4,
+):
+    processor = AutoProcessor.from_pretrained(model_id)
+    if from_transformers:
+        model = RBLNIdefics3ForConditionalGeneration.from_pretrained(
+            model_id,
+            export=True,
+            rbln_config={
+                "text_model": {
+                    "attn_impl": "flash_attn",
+                    "max_seq_len": max_seq_len,
+                    "use_inputs_embeds": True,
+                    "tensor_parallel_size": tensor_parallel_size,
+                    "batch_size": batch_size,
+                }
+            },
+        )
+        model.save_pretrained(os.path.basename(model_id))
+    else:
+        model = RBLNIdefics3ForConditionalGeneration.from_pretrained(
+            os.path.basename(model_id),
+            export=False,
+        )
+    ds = load_dataset("HuggingFaceM4/the_cauldron", "ai2d", split="train")
+    samples = ds.select(range(batch_size))
+    images = []
+    prompts = []
+    for sample in samples:
+        img = sample["images"]
+        images.append(img)
+        message = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
+        prompt = processor.apply_chat_template(message, add_generation_prompt=True)
+        prompts.append(prompt)
+    inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True)
+    inputs = dict(inputs)
+    # Generate
+    generated_ids = model.generate(**inputs, max_new_tokens=500)
+    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    for i, text in enumerate(generated_texts):
+        print(f"Sample {i + 1} generate:\n{text}\n")
+if __name__ == "__main__":
+    fire.Fire(main)

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/__init__.py RENAMED Viewed

@@ -74,6 +74,10 @@ _import_structure = {
         "RBLNGemmaForCausalLMConfig",
         "RBLNGPT2LMHeadModel",
         "RBLNGPT2LMHeadModelConfig",
+        "RBLNIdefics3VisionTransformer",
+        "RBLNIdefics3ForConditionalGeneration",
+        "RBLNIdefics3ForConditionalGenerationConfig",
+        "RBLNIdefics3VisionTransformerConfig",
         "RBLNLlamaForCausalLM",
         "RBLNLlamaForCausalLMConfig",
         "RBLNLlavaNextForConditionalGeneration",
@@ -86,6 +90,10 @@ _import_structure = {
         "RBLNPhiForCausalLMConfig",
         "RBLNQwen2ForCausalLM",
         "RBLNQwen2ForCausalLMConfig",
+        "RBLNQwen2_5_VisionTransformerPretrainedModel",
+        "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+        "RBLNQwen2_5_VLForConditionalGeneration",
+        "RBLNQwen2_5_VLForConditionalGenerationConfig",
         "RBLNResNetForImageClassification",
         "RBLNResNetForImageClassificationConfig",
         "RBLNRobertaForMaskedLM",
@@ -277,6 +285,10 @@ if TYPE_CHECKING:
         RBLNGemmaForCausalLMConfig,
         RBLNGPT2LMHeadModel,
         RBLNGPT2LMHeadModelConfig,
+        RBLNIdefics3ForConditionalGeneration,
+        RBLNIdefics3ForConditionalGenerationConfig,
+        RBLNIdefics3VisionTransformer,
+        RBLNIdefics3VisionTransformerConfig,
         RBLNLlamaForCausalLM,
         RBLNLlamaForCausalLMConfig,
         RBLNLlavaNextForConditionalGeneration,
@@ -287,6 +299,10 @@ if TYPE_CHECKING:
         RBLNMistralForCausalLMConfig,
         RBLNPhiForCausalLM,
         RBLNPhiForCausalLMConfig,
+        RBLNQwen2_5_VisionTransformerPretrainedModel,
+        RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+        RBLNQwen2_5_VLForConditionalGeneration,
+        RBLNQwen2_5_VLForConditionalGenerationConfig,
         RBLNQwen2ForCausalLM,
         RBLNQwen2ForCausalLMConfig,
         RBLNResNetForImageClassification,

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/__version__.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.7.4a5'
-__version_tuple__ = version_tuple = (0, 7, 4)
+__version__ = version = '0.7.4a7'
+__version_tuple__ = version_tuple = (0, 7, 4, 'a7')

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/modeling_base.py RENAMED Viewed

@@ -314,10 +314,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
             )
         except rebel.core.exception.RBLNRuntimeError as e:
-            logger.warning(
-                f"Failed to create the runtime for the model due to a runtime error: {e.__class__.__name__} - {e}"
+            error_msg = (
+                f"\nFailed to create RBLN runtime: {str(e)}\n\n"
+                f"If you only need to compile the model without loading it to NPU, you can use:\n"
+                f"  from_pretrained(..., rbln_create_runtimes=False) or\n"
+                f"  from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
+                f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
+                f"Make sure your NPU is properly installed and operational."
             )
-            models = UnavailableRuntime()
+            raise rebel.core.exception.RBLNRuntimeError(error_msg) from e
         return cls(
             models,
@@ -423,6 +428,20 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
     def to(self, *args, **kwargs):
         return self
+    def parameters(self):
+        """
+        Provides a dummy parameter generator for compatibility.
+        This method mimics the interface of torch.nn.Module.parameters()
+        specifically for code that uses `next(model.parameters())` to infer
+        the device or dtype. It yields a single dummy tensor on CPU with float32 dtype.
+        Warning:
+            This does NOT yield the actual model parameters used by the RBLN runtime.
+            Code relying on iterating through all model parameters will not work as expected.
+        """
+        yield torch.tensor([1.0], dtype=torch.float32, device=torch.device("cpu"))
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/transformers/__init__.py RENAMED Viewed

@@ -68,6 +68,10 @@ _import_structure = {
         "RBLNGemmaForCausalLMConfig",
         "RBLNGPT2LMHeadModel",
         "RBLNGPT2LMHeadModelConfig",
+        "RBLNIdefics3VisionTransformer",
+        "RBLNIdefics3ForConditionalGeneration",
+        "RBLNIdefics3ForConditionalGenerationConfig",
+        "RBLNIdefics3VisionTransformerConfig",
         "RBLNLlamaForCausalLM",
         "RBLNLlamaForCausalLMConfig",
         "RBLNLlavaNextForConditionalGeneration",
@@ -80,6 +84,10 @@ _import_structure = {
         "RBLNPhiForCausalLMConfig",
         "RBLNQwen2ForCausalLM",
         "RBLNQwen2ForCausalLMConfig",
+        "RBLNQwen2_5_VisionTransformerPretrainedModel",
+        "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+        "RBLNQwen2_5_VLForConditionalGeneration",
+        "RBLNQwen2_5_VLForConditionalGenerationConfig",
         "RBLNT5EncoderModel",
         "RBLNT5EncoderModelConfig",
         "RBLNT5ForConditionalGeneration",
@@ -165,6 +173,10 @@ if TYPE_CHECKING:
         RBLNGemmaForCausalLMConfig,
         RBLNGPT2LMHeadModel,
         RBLNGPT2LMHeadModelConfig,
+        RBLNIdefics3ForConditionalGeneration,
+        RBLNIdefics3ForConditionalGenerationConfig,
+        RBLNIdefics3VisionTransformer,
+        RBLNIdefics3VisionTransformerConfig,
         RBLNLlamaForCausalLM,
         RBLNLlamaForCausalLMConfig,
         RBLNLlavaNextForConditionalGeneration,
@@ -175,6 +187,10 @@ if TYPE_CHECKING:
         RBLNMistralForCausalLMConfig,
         RBLNPhiForCausalLM,
         RBLNPhiForCausalLMConfig,
+        RBLNQwen2_5_VisionTransformerPretrainedModel,
+        RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+        RBLNQwen2_5_VLForConditionalGeneration,
+        RBLNQwen2_5_VLForConditionalGenerationConfig,
         RBLNQwen2ForCausalLM,
         RBLNQwen2ForCausalLMConfig,
         RBLNT5EncoderModel,

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/transformers/models/__init__.py RENAMED Viewed

@@ -56,6 +56,12 @@ _import_structure = {
         "RBLNCLIPVisionModelWithProjection",
         "RBLNCLIPVisionModelWithProjectionConfig",
     ],
+    "qwen2_5_vl": [
+        "RBLNQwen2_5_VisionTransformerPretrainedModel",
+        "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+        "RBLNQwen2_5_VLForConditionalGeneration",
+        "RBLNQwen2_5_VLForConditionalGenerationConfig",
+    ],
     "decoderonly": [
         "RBLNDecoderOnlyModelForCausalLM",
         "RBLNDecoderOnlyModelForCausalLMConfig",
@@ -67,6 +73,12 @@ _import_structure = {
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
     "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig"],
     "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig"],
+    "idefics3": [
+        "RBLNIdefics3VisionTransformer",
+        "RBLNIdefics3ForConditionalGeneration",
+        "RBLNIdefics3ForConditionalGenerationConfig",
+        "RBLNIdefics3VisionTransformerConfig",
+    ],
     "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig"],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
@@ -138,12 +150,24 @@ if TYPE_CHECKING:
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
     from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig
     from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig
+    from .idefics3 import (
+        RBLNIdefics3ForConditionalGeneration,
+        RBLNIdefics3ForConditionalGenerationConfig,
+        RBLNIdefics3VisionTransformer,
+        RBLNIdefics3VisionTransformerConfig,
+    )
     from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
     from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
     from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
     from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
+    from .qwen2_5_vl import (
+        RBLNQwen2_5_VisionTransformerPretrainedModel,
+        RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+        RBLNQwen2_5_VLForConditionalGeneration,
+        RBLNQwen2_5_VLForConditionalGenerationConfig,
+    )
     from .t5 import (
         RBLNT5EncoderModel,
         RBLNT5EncoderModelConfig,

{optimum_rbln-0.7.4a5 → optimum_rbln-0.7.4a7}/src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py RENAMED Viewed

@@ -13,7 +13,7 @@
 # limitations under the License.
 import math
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 import torch
 from torch import nn
@@ -184,6 +184,7 @@ class DecoderOnlyWrapper(nn.Module):
     def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
         new_layers = []
         for layer in causal_lm.model.layers:
             if self.attn_impl == "eager":
                 new_self_attn = DecoderOnlyAttention(
@@ -201,6 +202,7 @@ class DecoderOnlyWrapper(nn.Module):
             new_layer = DecoderOnlyLayer(layer, new_self_attn)
             new_layers.append(new_layer)
         new_model = DecoderOnlyModel(
             causal_lm.model,
             new_layers,
@@ -220,6 +222,53 @@ class DecoderOnlyWrapper(nn.Module):
         self._phase = phase
         self.causal_lm.phase = phase
+    def forward_common(
+        self,
+        input_ids_or_inputs_embeds: torch.Tensor,
+        cache_position: torch.Tensor,
+        attention_mask: torch.Tensor,
+        query_position: torch.Tensor,
+        block_tables: torch.Tensor,
+        rotary_emb: Union[nn.Module, torch.Tensor],
+        *past_key_values: List[torch.Tensor],
+    ):
+        if input_ids_or_inputs_embeds.ndim == 2:
+            input_ids = input_ids_or_inputs_embeds
+            inputs_embeds = None
+        elif input_ids_or_inputs_embeds.ndim == 3:
+            input_ids = None
+            inputs_embeds = input_ids_or_inputs_embeds
+        else:
+            raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
+        if len(past_key_values) != 2 * self.num_hidden_layers:
+            raise ValueError(
+                f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
+            )
+        # [key, value] * n_layer -> ( (key, value) ) * n_layer
+        # cache shape : batch, n_heads, 1, max_seq_len, head_dim
+        _past_key_values = []
+        for i in range(self.config.num_hidden_layers):
+            key_states = past_key_values[i * 2]
+            value_states = past_key_values[i * 2 + 1]
+            past_key_value = [key_states, value_states]
+            _past_key_values.append(past_key_value)
+        past_key_values = _past_key_values
+        logit = self.causal_lm(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            query_position=query_position,
+            past_key_values=past_key_values,
+            rotary_emb=rotary_emb,
+            block_tables=block_tables,
+        )
+        return logit
     def forward(self, *args):
         if self.phase == "decode":
             if self.use_attention_mask:
@@ -262,43 +311,16 @@ class DecoderOnlyWrapper(nn.Module):
         else:
             raise ValueError(f"Unknown phase: {self.phase}")
-        if input_ids_or_inputs_embeds.ndim == 2:
-            input_ids = input_ids_or_inputs_embeds
-            inputs_embeds = None
-        elif input_ids_or_inputs_embeds.ndim == 3:
-            input_ids = None
-            inputs_embeds = input_ids_or_inputs_embeds
-        else:
-            raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
-        if len(past_key_values) != 2 * self.num_hidden_layers:
-            raise ValueError(
-                f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
-            )
-        # [key, value] * n_layer -> ( (key, value) ) * n_layer
-        # cache shape : batch, n_heads, 1, max_seq_len, head_dim
-        _past_key_values = []
-        for i in range(self.config.num_hidden_layers):
-            key_states = past_key_values[i * 2]
-            value_states = past_key_values[i * 2 + 1]
-            past_key_value = [key_states, value_states]
-            _past_key_values.append(past_key_value)
-        past_key_values = _past_key_values
-        logit = self.causal_lm(
-            input_ids=input_ids,
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            query_position=query_position,
-            past_key_values=past_key_values,
-            rotary_emb=self.rotary_emb,
-            block_tables=block_tables,
+        return self.forward_common(
+            input_ids_or_inputs_embeds,
+            cache_position,
+            attention_mask,
+            query_position,
+            block_tables,
+            self.rotary_emb,
+            *past_key_values,
         )
-        return logit
 class DecoderOnlyForCausalLM(nn.Module):
     """A specialized wrapper for Causal Language Models optimized for RBLN compilation.
@@ -322,12 +344,13 @@ class DecoderOnlyForCausalLM(nn.Module):
         _phase: Current processing phase ("prefill" or "decode")
     """
-    def __init__(self, causal_lm: PreTrainedModel, model):
+    def __init__(self, causal_lm: PreTrainedModel, model: nn.Module):
         super().__init__()
         self.config = causal_lm.config
         self._original_mod = causal_lm
         self.model = model
         self._phase = "prefill"
+        self.lm_head = self._original_mod.lm_head
     @property
     def phase(self):
@@ -363,7 +386,7 @@ class DecoderOnlyForCausalLM(nn.Module):
         if self.phase == "prefill":
             hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]
-        logits = self._original_mod.lm_head(hidden_states)
+        logits = self.lm_head(hidden_states)
         return logits
@@ -455,8 +478,12 @@ class DecoderOnlyModel(nn.Module):
         # get cos,sin vector if needed
         if rotary_emb is not None:
-            cos, sin = rotary_emb(hidden_states, self.max_seq_len)  # dtype carrier, max_seq_len
-            cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
+            if isinstance(rotary_emb, torch.Tensor):
+                cos = rotary_emb[0]
+                sin = rotary_emb[1]
+            else:
+                cos, sin = rotary_emb(hidden_states, self.max_seq_len)  # dtype carrier, max_seq_len
+                cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
         else:
             batch_size = inputs_embeds.shape[0]
             if cache_position.shape[0] > 1:
@@ -833,7 +860,6 @@ def rotate_half(x):
 def apply_rotary_pos_emb(q, k, cos, sin):
     """Applies Rotary Position Embedding to the query and key tensors."""
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed

optimum-rbln 0.7.4a5__tar.gz → 0.7.4a7__tar.gz

optimum-rbln 0.7.4a5__tar.gz → 0.7.4a7__tar.gz