optimum-rbln 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- optimum/rbln/__init__.py +41 -38
- optimum/rbln/__version__.py +16 -1
- optimum/rbln/diffusers/__init__.py +26 -2
- optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +97 -126
- optimum/rbln/diffusers/models/__init__.py +36 -3
- optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
- optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +73 -61
- optimum/rbln/diffusers/models/autoencoders/vae.py +83 -0
- optimum/rbln/diffusers/models/controlnet.py +54 -14
- optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
- optimum/rbln/diffusers/models/unets/__init__.py +24 -0
- optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +82 -22
- optimum/rbln/diffusers/pipelines/__init__.py +23 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +13 -33
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +18 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +18 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +2 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -13
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +24 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +15 -8
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +15 -8
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +31 -0
- optimum/rbln/modeling.py +238 -0
- optimum/rbln/modeling_base.py +186 -760
- optimum/rbln/modeling_config.py +31 -7
- optimum/rbln/ops/__init__.py +26 -0
- optimum/rbln/ops/attn.py +221 -0
- optimum/rbln/ops/flash_attn.py +70 -0
- optimum/rbln/ops/kv_cache_update.py +69 -0
- optimum/rbln/transformers/__init__.py +20 -2
- optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
- optimum/rbln/transformers/modeling_generic.py +385 -0
- optimum/rbln/transformers/models/auto/__init__.py +23 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
- optimum/rbln/transformers/models/auto/modeling_auto.py +36 -12
- optimum/rbln/transformers/models/bart/__init__.py +0 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
- optimum/rbln/transformers/models/bart/modeling_bart.py +10 -9
- optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
- optimum/rbln/transformers/models/clip/modeling_clip.py +8 -25
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -10
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +775 -514
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +128 -260
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +60 -45
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -2
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +33 -104
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +3 -2
- optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -75
- optimum/rbln/transformers/models/midm/midm_architecture.py +84 -238
- optimum/rbln/transformers/models/midm/modeling_midm.py +5 -6
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +60 -261
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +58 -103
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
- optimum/rbln/transformers/models/t5/__init__.py +0 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +106 -5
- optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
- optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +78 -55
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
- optimum/rbln/transformers/utils/rbln_quantization.py +120 -4
- optimum/rbln/utils/decorator_utils.py +51 -11
- optimum/rbln/utils/hub.py +131 -0
- optimum/rbln/utils/import_utils.py +22 -1
- optimum/rbln/utils/logging.py +37 -0
- optimum/rbln/utils/model_utils.py +52 -0
- optimum/rbln/utils/runtime_utils.py +10 -4
- optimum/rbln/utils/save_utils.py +17 -0
- optimum/rbln/utils/submodule.py +137 -0
- optimum_rbln-0.2.0.dist-info/METADATA +117 -0
- optimum_rbln-0.2.0.dist-info/RECORD +114 -0
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.2.0.dist-info}/WHEEL +1 -1
- optimum_rbln-0.2.0.dist-info/licenses/LICENSE +288 -0
- optimum/rbln/transformers/cache_utils.py +0 -107
- optimum/rbln/transformers/generation/streamers.py +0 -139
- optimum/rbln/transformers/generation/utils.py +0 -397
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
- optimum/rbln/utils/context.py +0 -58
- optimum/rbln/utils/timer_utils.py +0 -43
- optimum_rbln-0.1.13.dist-info/METADATA +0 -120
- optimum_rbln-0.1.13.dist-info/RECORD +0 -107
- optimum_rbln-0.1.13.dist-info/entry_points.txt +0 -4
- optimum_rbln-0.1.13.dist-info/licenses/LICENSE +0 -201
@@ -20,10 +20,11 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+
 import inspect
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict,
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -36,7 +37,7 @@ from transformers import (
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.llava_next.modeling_llava_next import LlavaNextCausalLMOutputWithPast
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyOutput
 
@@ -166,19 +167,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         self._padding_side = "left"  # set it to left by default, user can use setter to change padding_sides
         return super().__post_init__(**kwargs)
 
-    @classmethod
-    def get_pytorch_model(
-        cls,
-        model_id: str,
-        *args,
-        rbln_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> "PreTrainedModel":
-        # Optimum's TasksManager does not handle Llava.
-        kwargs = cls.update_kwargs(kwargs)
-        model = LlavaNextForConditionalGeneration.from_pretrained(model_id, *args, **kwargs)
-        return model
-
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
 
@@ -422,66 +410,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
         return outputs
 
-    def vllm_forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        vision_feature_layer: Optional[int] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-        cache_position: Union[List[torch.Tensor], torch.Tensor] = None,  # vllm keyword argument
-        batch_idx: Optional[int] = None,
-        **kwargs,
-    ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
-        is_prefill = cache_position.shape[-1] > 1
-
-        if inputs_embeds is not None:
-            raise NotImplementedError("Specifying inputs_embeds is not supported.")
-
-        if is_prefill:
-            # Get text_embeds
-            inputs_embeds = self.text_embedding(input_ids)
-
-            # If any images in the prompt, get image_embeds and merge with text
-            if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
-                image_features, _ = self.image_embedding(
-                    image_sizes, pixel_values, vision_feature_layer, vision_feature_select_strategy
-                )
-
-                def merge_vllm_multimodal_embeddings(
-                    input_ids: torch.Tensor,
-                    inputs_embeds: torch.Tensor,
-                    multimodal_embeddings: torch.Tensor,
-                    placeholder_token_id: int,
-                ) -> torch.Tensor:
-                    mask = input_ids == placeholder_token_id
-                    num_expected_tokens = mask.sum().item()
-
-                    if multimodal_embeddings.shape[0] != num_expected_tokens:
-                        raise ValueError(
-                            f"Attempted to assign {inputs_embeds[mask].shape} = {multimodal_embeddings.shape} "
-                            f"multimodal tokens to {num_expected_tokens} placeholders"
-                        )
-
-                    inputs_embeds[mask] = multimodal_embeddings
-                    return inputs_embeds
-
-                inputs_embeds = merge_vllm_multimodal_embeddings(
-                    input_ids, inputs_embeds, image_features, self.config.image_token_index
-                )
-
-        else:
-            inputs_embeds = self.text_embedding(input_ids=input_ids)
-
-        outputs: RBLNDecoderOnlyOutput = self.language_model.vllm_forward(
-            inputs_embeds=inputs_embeds,
-            batch_idx=batch_idx,
-            cache_position=cache_position,
-        )
-
-        return outputs
-
     # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         """
@@ -21,18 +21,25 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-
+import math
+from typing import TYPE_CHECKING, Tuple
 
 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast
 
-from
-
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+    apply_rotary_pos_emb_partial,
     rotate_half,
-    slice_and_unsqueeze_cos_sin,
 )
-
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as MidmLMHeadModel
 
 
 def apply_rotary_to_tensor(tensor, cos, sin, rot_dim):
@@ -50,253 +57,92 @@ def apply_rotary_pos_emb(q, k, cos, sin):
     return q_embed, k_embed
 
 
-class MidmLMHeadModelWrapper(
-
-
-    def __init__(self, model, max_seq_len):
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.max_seq_len = max_seq_len
-
-        self.config.partial_rotary_factor = model.config.rotary_percentage
-        self.config.head_dim = self.config.n_embd // self.config.n_head
+class MidmLMHeadModelWrapper(DecoderOnlyWrapper):
+    def get_rotary_emb(self, max_seq_len):
         self.config.rope_theta = 10000
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        outputs = _MidmModel.forward(
-            self.model,
-            input_ids=input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            rotary_pos_emb=self.rotary_emb,
-            batch_ids=rbln_batch_position,
-        )
-
-        hidden_states = outputs[0]
-        if batch_position >= 0:
-            hidden_states = hidden_states[:, query_idx].unsqueeze(1)
-
-        logits = self.lm_head(hidden_states)
-        output = (logits,) + outputs[1:]
-
-        return output, batch_position + query_idx
-
-
-def layernorm1p(module, input):
-    """Applies Layer Normalization with a slight modification on the weights."""
-    return torch.nn.functional.layer_norm(input, module.normalized_shape, module.weight + 1, module.bias, module.eps)
-
-
-class _MidmAttention:
-    """Custom implementation of the MidmAttention class with specific modifications."""
-
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        """Computes the attention weights and output."""
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+        self.config.head_dim = self.config.n_embd // self.config.n_head
+        self.config.partial_rotary_factor = self.config.rotary_percentage
+        return super().get_rotary_emb(max_seq_len=max_seq_len)
+
+    def convert_to_rbln_causal_lm(self, causal_lm: "MidmLMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = MidmAttention(layer.attn)
+            new_layer = MidmLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = MidmModel(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class MidmModel(DecoderOnlyModel):
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
             )
 
-
-        attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        if attention_mask is not None:
-            attn_weights = attn_weights + attention_mask
-
-        if self.scale_qk_by_inverse_layer_idx:
-            attn_weights = attn_weights * float(self.layer_idx + 1)
+        return layernorm1p
 
-
-
-
-
-
-
-        attn_output = torch.matmul(attn_weights, value)
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        """Defines the forward pass for the attention mechanism."""
-        bsz, q_len, _ = hidden_states.size()
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_f)
+        else:
+            return self._original_mod.ln_f
 
-
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
 
-
-
-        values = self._split_heads(values, self.num_heads, self.head_dim).contiguous()
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe
 
-        querys, keys = apply_rotary_pos_emb(querys, keys, cos, sin)
 
-
-
-
-
-
+class MidmLayer(DecoderOnlyLayer):
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
+            )
 
-
-            query = querys[b].unsqueeze(0)
-            attn_mask = attention_mask[b].unsqueeze(0)
-            key = keys[b].unsqueeze(0)
-            value = values[b].unsqueeze(0)
+        return layernorm1p
 
-
-
-
-
-
-            )
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_1)
+        else:
+            return self._original_mod.ln_1
 
-
-
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_2)
+        else:
+            return self._original_mod.ln_2
 
-            all_key_states.append(key)
-            all_value_states.append(value)
-            all_attn_output.append(attn_output)
 
-
-
-
+class MidmAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size
+        self.num_key_value_heads = self._original_mod.num_heads
 
-
-
-
-
-        keys, values = past_key_value.update(
-            keys,
-            values,
-            self.layer_idx,
-            batch_index,
-            read_first_step=True,
-        )
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states
 
-
-
-
-
-        return attn_output, keys, values
-
-
-class _MidmBlock:
-    """Custom implementation of the MidmBlock class with specific modifications."""
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        """Defines the forward pass for the block."""
-        residual = hidden_states
-        if self.use_layernorm1p:
-            hidden_states = layernorm1p(self.ln_1, hidden_states)
-        else:
-            hidden_states = self.ln_1(hidden_states)
-
-        hidden_states, k, v = _MidmAttention.forward(
-            self.attn,
-            hidden_states,
-            attention_mask=attention_mask,
-            past_key_value=past_key_value,
-            cos=cos,
-            sin=sin,
-            batch_index=batch_ids,
-        )
-        past_key_value.assign(k, v, layer_idx)
-
-        hidden_states = hidden_states + residual
-
-        residual = hidden_states
-        if self.use_layernorm1p:
-            hidden_states = layernorm1p(self.ln_2, hidden_states)
-        else:
-            hidden_states = self.ln_2(hidden_states)
-
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + feed_forward_hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _MidmModel:
-    """Custom implementation of the MidmModel class with specific modifications."""
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[RebelDynamicCache_4D] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        rotary_pos_emb=None,
-        batch_ids: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        """Defines the forward pass for the model."""
-        input_shape = input_ids.size()
-
-        attention_mask = (1.0 - attention_mask) * -10000.0
-
-        inputs_embeds = self.wte(input_ids)
-
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
-        hidden_states = inputs_embeds
-
-        for layer_idx, (block, _) in enumerate(zip(self.h, past_key_values)):
-            hidden_states, updated_cache = _MidmBlock.forward(
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_values,
-                batch_ids=batch_ids,
-                cos=cos,
-                sin=sin,
-            )
+    def get_attn_scale(self):
+        scale = 1.0
+        if self._original_mod.scale_attn_weights:
+            scale /= math.sqrt(self.head_dim)
 
-
-
-        hidden_states = hidden_states.view(output_shape)
+        if self._original_mod.scale_attn_by_inverse_layer_idx and not self._original_mod.scale_qk_by_inverse_layer_idx:
+            scale /= 1 + self.layer_idx
 
-
+        return scale
 
-
-
-            past_key_values=next_cache,
-        )
+    def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
+        return apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim=cos.shape[-1])
@@ -21,12 +21,11 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
+from transformers import AutoModelForCausalLM
+
 from ....utils import logging
-from
-from .
-from .midm_architecture import (
-    MidmLMHeadModelWrapper,
-)
+from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
+from .midm_architecture import MidmLMHeadModelWrapper
 
 
 logger = logging.get_logger(__name__)
@@ -47,7 +46,7 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     """
 
     _decoder_wrapper_cls = MidmLMHeadModelWrapper
-
+    _hf_class = AutoModelForCausalLM
 
     @classmethod
    def from_pretrained(cls, *args, **kwargs):