optimum-rbln 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +22 -12
- optimum/rbln/__version__.py +16 -1
- optimum/rbln/diffusers/__init__.py +22 -2
- optimum/rbln/diffusers/models/__init__.py +34 -3
- optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
- optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +44 -58
- optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
- optimum/rbln/diffusers/models/controlnet.py +54 -14
- optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
- optimum/rbln/diffusers/models/unets/__init__.py +24 -0
- optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +78 -16
- optimum/rbln/diffusers/pipelines/__init__.py +22 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +5 -26
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -11
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +14 -6
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +14 -6
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
- optimum/rbln/modeling.py +572 -0
- optimum/rbln/modeling_alias.py +1 -1
- optimum/rbln/modeling_base.py +164 -758
- optimum/rbln/modeling_diffusers.py +51 -122
- optimum/rbln/transformers/__init__.py +0 -2
- optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
- optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
- optimum/rbln/transformers/models/clip/modeling_clip.py +8 -25
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -3
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +672 -412
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +38 -155
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +61 -45
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -2
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +33 -104
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +3 -2
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +2 -75
- optimum/rbln/transformers/models/midm/midm_architecture.py +88 -242
- optimum/rbln/transformers/models/midm/modeling_midm.py +6 -6
- optimum/rbln/transformers/models/phi/phi_architecture.py +61 -261
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
- optimum/rbln/transformers/models/t5/modeling_t5.py +102 -4
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -1
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
- optimum/rbln/transformers/utils/rbln_quantization.py +120 -3
- optimum/rbln/utils/decorator_utils.py +10 -6
- optimum/rbln/utils/hub.py +131 -0
- optimum/rbln/utils/import_utils.py +15 -1
- optimum/rbln/utils/model_utils.py +53 -0
- optimum/rbln/utils/runtime_utils.py +1 -1
- optimum/rbln/utils/submodule.py +114 -0
- optimum_rbln-0.1.15.dist-info/METADATA +106 -0
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/RECORD +69 -66
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/generation/streamers.py +0 -139
- optimum/rbln/transformers/generation/utils.py +0 -397
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
- optimum/rbln/utils/context.py +0 -58
- optimum_rbln-0.1.13.dist-info/METADATA +0 -120
- optimum_rbln-0.1.13.dist-info/entry_points.txt +0 -4
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/gpt2/gpt2_architecture.py
@@ -21,262 +21,74 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from typing import
+from typing import TYPE_CHECKING, Tuple
 
 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast
 
-from
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+)
 
 
-
-
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.max_seq_len = max_seq_len
-        self.forward_dict = self.get_forward_dict()
+if TYPE_CHECKING:
+    from transformers import GPT2LMHeadModel
 
-    def get_forward_dict(self):
-        forward_dict = {
-            "wrapper": _GPT2Model.forward,
-            "model": _GPT2Block.forward,
-            "decoder_layer": _GPT2Attention.forward,
-        }
-        return forward_dict
 
-
-
-
-
-
-
-
-
-
-
-
-
-            rbln_batch_position = batch_position
+class GPT2Wrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GPT2LMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = GPT2Attention(layer.attn)
+            new_layer = GPT2Layer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GPT2Model(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
 
-        # Formatting list of past_kv to DynamicCache class.
-        past_key_value = RebelDynamicCache_4D.from_input_format(
-            cache_position,
-            self.config.n_layer,
-            *past_key_values,
-        )
-
-        outputs = self.forward_dict["wrapper"](
-            self.model,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            past_key_value=past_key_value,
-            batch_ids=rbln_batch_position,
-            forward_dict=self.forward_dict,
-            # rotary_emb differenct from_llama
-        )
-
-        hidden_states = outputs[0]
-        if batch_position >= 0:
-            hidden_states = hidden_states[:, query_idx].unsqueeze(1)
-        logits = self.lm_head(hidden_states)
-
-        output = (logits,) + outputs[1:]
 
-
+class GPT2Model(DecoderOnlyModel):
+    mask_fmin = torch.finfo(torch.float32).min
 
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f
 
-
-
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-    ) -> BaseModelOutputWithPast:
-        b_size, q_len = input_ids.shape
-        inputs_embeds = self.wte(input_ids)
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
 
-
-
-            for b_idx in range(b_size):
-                position_embed = self.wpe(position_ids[b_idx])
-                # position_embed = position_embed.dtype(inputs_embeds.dtype)
-                position_embeds.append(position_embed)
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe
 
-            position_embeds = torch.cat(position_embeds, dim=0).unsqueeze(1)
-        else:
-            position_embeds = self.wpe(position_ids)
 
-
+class GPT2Layer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1
 
-
-
-        attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2
 
-        for layer_idx, block in enumerate(self.h):
-            hidden_states, updated_cache = forward_dict["model"](
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_value,
-                position_ids=position_ids,
-                batch_ids=batch_ids,
-                forward_dict=forward_dict,
-            )
-
-        hidden_states = self.ln_f(hidden_states)
-        output_shape = (-1,) + (q_len,) + (hidden_states.size(-1),)
-        hidden_states = hidden_states.view(output_shape)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-        )
 
+class GPT2Attention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size
+        self.num_key_value_heads = self._original_mod.num_heads
 
-
-
-
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, RebelDynamicCache_4D]:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states
 
-
-
-
-
-
-
-            batch_index=batch_ids,
+    def rbln_attention(self, *args, **kwargs):
+        return super().rbln_attention(
+            *args,
+            **kwargs,
+            layer_idx=self.layer_idx,
+            scale_attn_by_inverse_layer_idx=self._original_mod.scale_attn_by_inverse_layer_idx,
         )
-        past_key_value.assign(k, v, layer_idx)
-
-        # residual connection
-        hidden_states = residual + hidden_states
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _GPT2Attention:
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
-            )
-
-        # Layer-wise attention scaling
-        if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        # -------------------
-        # Below are deleted since "where" op does not supported on RBLN graph.
-        # -------------------
-        # if not self.is_cross_attention:
-        # # if only "normal" attention layer implements causal mask
-        # query_length, key_length = query.size(-2), key.size(-2)
-        # causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
-        # mask_value = torch.finfo(attn_weights.dtype).min
-        # # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        # # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-        # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
-        # attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
-
-        # Apply the attention mask
-        attn_weights.view(
-            -1,
-        )
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        **kwargs,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        bsz, q_len, _ = hidden_states.size()
-        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        querys = self._split_heads(query, self.num_heads, self.head_dim)  # (batch, head, seq_length, head_features)
-        keys = self._split_heads(key, self.num_heads, self.head_dim)
-        values = self._split_heads(value, self.num_heads, self.head_dim)
-
-        # Decoder
-        if (batch_index is None or batch_index == -1) and bsz > 1:
-            all_keys = []
-            all_values = []
-            all_attn_output = []
-
-            for b in range(bsz):
-                query = querys[b].unsqueeze(0)
-                attn_mask = attention_mask[b].unsqueeze(0)
-                key = keys[b].unsqueeze(0)
-                value = values[b].unsqueeze(0)
-
-                key, value = past_key_value.update(
-                    key,
-                    value,
-                    self.layer_idx,
-                    b,
-                )
-
-                attn_output, _ = _GPT2Attention._attn(self, query, key, value, attn_mask)
-                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-                all_keys.append(key)
-                all_values.append(value)
-                all_attn_output.append(attn_output)
-
-            keys = torch.cat(all_keys, dim=0)
-            values = torch.cat(all_values, dim=0)
-            attn_output = torch.cat(all_attn_output, dim=0)
-
-        # Prefill
-        else:
-            if batch_index is None or batch_index == -1:
-                batch_index = 0
-
-            keys, values = past_key_value.update(
-                keys,
-                values,
-                self.layer_idx,
-                batch_index,
-                read_first_step=True,
-            )
-
-            attn_output, _ = _GPT2Attention._attn(self, querys, keys, values, attention_mask)
-            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-        attn_output = self.c_proj(attn_output)
-
-        return attn_output, keys, values
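The gpt2_architecture.py hunk above replaces the hand-written `_GPT2Model` / `_GPT2Block` / `_GPT2Attention` forwards with thin subclasses of the shared decoder-only wrappers from `decoderonly_architecture.py`. As a rough sketch of that extension pattern (not code from the package; the `qkv_proj`, `out_proj`, `final_norm`, etc. attribute names are hypothetical), a new decoder-only backend only maps its Hugging Face module layout onto the base-class accessors:

import torch.nn as nn

from optimum.rbln.transformers.models.decoderonly.decoderonly_architecture import (
    DecoderOnlyAttention,
    DecoderOnlyLayer,
    DecoderOnlyModel,
)


class MyModel(DecoderOnlyModel):
    # Point the shared wrapper at this architecture's modules.
    def get_last_layernorm(self) -> nn.LayerNorm:
        return self._original_mod.final_norm  # hypothetical attribute name

    def get_embedding(self) -> nn.Embedding:
        return self._original_mod.tok_embeddings  # hypothetical attribute name


class MyLayer(DecoderOnlyLayer):
    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
        return self._original_mod.norm_1  # hypothetical attribute name

    def get_post_attention_layernorm(self) -> nn.LayerNorm:
        return self._original_mod.norm_2  # hypothetical attribute name


class MyAttention(DecoderOnlyAttention):
    def __post_init__(self):
        # Expose this architecture's projections under the names the base class
        # reads, as GPT2Attention above does with c_attn / c_proj.
        self.fused_qkv = self._original_mod.qkv_proj  # hypothetical fused QKV module
        self.o_proj = self._original_mod.out_proj  # hypothetical output projection
        self.num_key_value_heads = self._original_mod.num_heads

    def projection(self, hidden_states):
        # Split the fused QKV output into query/key/value, mirroring
        # GPT2Attention.projection in the hunk above.
        return self.fused_qkv(hidden_states).split(self._original_mod.hidden_size, dim=2)

The attention, KV-cache, and prefill/decode handling that each model previously reimplemented now lives once in the `DecoderOnly*` base classes; the per-model files shrink to these accessor overrides.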
optimum/rbln/transformers/models/gpt2/modeling_gpt2.py
@@ -23,7 +23,7 @@
 
 from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .gpt2_architecture import GPT2LMHeadModelWrapper
+from .gpt2_architecture import GPT2Wrapper  # GPT2LMHeadModelWrapper
 
 
 logger = logging.get_logger(__name__)
@@ -43,4 +43,5 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
 
    """
 
-    _decoder_wrapper_cls =
+    _decoder_wrapper_cls = GPT2Wrapper
+    _use_rotary_emb = False
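On the modeling side, `RBLNGPT2LMHeadModel` now only declares `_decoder_wrapper_cls = GPT2Wrapper` and `_use_rotary_emb = False`; compilation and generation go through the common `RBLNDecoderOnlyModelForCausalLM` path. A minimal usage sketch, assuming the usual optimum-rbln export flow (`export=True` plus `rbln_*` keywords; treat the exact keyword names and values as indicative, not verified against 0.1.15):

from transformers import AutoTokenizer

from optimum.rbln import RBLNGPT2LMHeadModel

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Compile from the Hugging Face checkpoint, then save the compiled artifacts
# so later runs can load them without re-exporting.
model = RBLNGPT2LMHeadModel.from_pretrained(
    model_id,
    export=True,
    rbln_max_seq_len=1024,
    rbln_batch_size=1,
)
model.save_pretrained("gpt2-rbln")

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))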
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py
@@ -23,7 +23,7 @@
 import inspect
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict,
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -36,7 +36,7 @@ from transformers import (
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.llava_next.modeling_llava_next import LlavaNextCausalLMOutputWithPast
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyOutput
 
@@ -166,19 +166,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         self._padding_side = "left"  # set it to left by default, user can use setter to change padding_sides
         return super().__post_init__(**kwargs)
 
-    @classmethod
-    def get_pytorch_model(
-        cls,
-        model_id: str,
-        *args,
-        rbln_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> "PreTrainedModel":
-        # Optimum's TasksManager does not handle Llava.
-        kwargs = cls.update_kwargs(kwargs)
-        model = LlavaNextForConditionalGeneration.from_pretrained(model_id, *args, **kwargs)
-        return model
-
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
 
@@ -422,66 +409,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
         return outputs
 
-    def vllm_forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        vision_feature_layer: Optional[int] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-        cache_position: Union[List[torch.Tensor], torch.Tensor] = None,  # vllm keyword argument
-        batch_idx: Optional[int] = None,
-        **kwargs,
-    ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
-        is_prefill = cache_position.shape[-1] > 1
-
-        if inputs_embeds is not None:
-            raise NotImplementedError("Specifying inputs_embeds is not supported.")
-
-        if is_prefill:
-            # Get text_embeds
-            inputs_embeds = self.text_embedding(input_ids)
-
-            # If any images in the prompt, get image_embeds and merge with text
-            if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
-                image_features, _ = self.image_embedding(
-                    image_sizes, pixel_values, vision_feature_layer, vision_feature_select_strategy
-                )
-
-                def merge_vllm_multimodal_embeddings(
-                    input_ids: torch.Tensor,
-                    inputs_embeds: torch.Tensor,
-                    multimodal_embeddings: torch.Tensor,
-                    placeholder_token_id: int,
-                ) -> torch.Tensor:
-                    mask = input_ids == placeholder_token_id
-                    num_expected_tokens = mask.sum().item()
-
-                    if multimodal_embeddings.shape[0] != num_expected_tokens:
-                        raise ValueError(
-                            f"Attempted to assign {inputs_embeds[mask].shape} = {multimodal_embeddings.shape} "
-                            f"multimodal tokens to {num_expected_tokens} placeholders"
-                        )
-
-                    inputs_embeds[mask] = multimodal_embeddings
-                    return inputs_embeds
-
-                inputs_embeds = merge_vllm_multimodal_embeddings(
-                    input_ids, inputs_embeds, image_features, self.config.image_token_index
-                )
-
-        else:
-            inputs_embeds = self.text_embedding(input_ids=input_ids)
-
-        outputs: RBLNDecoderOnlyOutput = self.language_model.vllm_forward(
-            inputs_embeds=inputs_embeds,
-            batch_idx=batch_idx,
-            cache_position=cache_position,
-        )
-
-        return outputs
-
     # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         """