optimum_rbln-0.1.13-py3-none-any.whl → optimum_rbln-0.1.15-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (79)
  1. optimum/rbln/__init__.py +22 -12
  2. optimum/rbln/__version__.py +16 -1
  3. optimum/rbln/diffusers/__init__.py +22 -2
  4. optimum/rbln/diffusers/models/__init__.py +34 -3
  5. optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
  6. optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +44 -58
  7. optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
  8. optimum/rbln/diffusers/models/controlnet.py +54 -14
  9. optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
  10. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
  11. optimum/rbln/diffusers/models/unets/__init__.py +24 -0
  12. optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +78 -16
  13. optimum/rbln/diffusers/pipelines/__init__.py +22 -2
  14. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +5 -26
  15. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -0
  16. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -0
  17. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -0
  18. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  19. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -0
  20. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -11
  21. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
  22. optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
  23. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
  24. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
  25. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
  26. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
  27. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +14 -6
  28. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +14 -6
  29. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
  30. optimum/rbln/modeling.py +572 -0
  31. optimum/rbln/modeling_alias.py +1 -1
  32. optimum/rbln/modeling_base.py +164 -758
  33. optimum/rbln/modeling_diffusers.py +51 -122
  34. optimum/rbln/transformers/__init__.py +0 -2
  35. optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
  36. optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
  37. optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
  38. optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
  39. optimum/rbln/transformers/models/clip/modeling_clip.py +8 -25
  40. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -3
  41. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +672 -412
  42. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +38 -155
  43. optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
  44. optimum/rbln/transformers/models/exaone/exaone_architecture.py +61 -45
  45. optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -2
  46. optimum/rbln/transformers/models/gemma/gemma_architecture.py +33 -104
  47. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
  48. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +3 -2
  49. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +2 -75
  50. optimum/rbln/transformers/models/midm/midm_architecture.py +88 -242
  51. optimum/rbln/transformers/models/midm/modeling_midm.py +6 -6
  52. optimum/rbln/transformers/models/phi/phi_architecture.py +61 -261
  53. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
  54. optimum/rbln/transformers/models/t5/modeling_t5.py +102 -4
  55. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  56. optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -1
  57. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
  58. optimum/rbln/transformers/utils/rbln_quantization.py +120 -3
  59. optimum/rbln/utils/decorator_utils.py +10 -6
  60. optimum/rbln/utils/hub.py +131 -0
  61. optimum/rbln/utils/import_utils.py +15 -1
  62. optimum/rbln/utils/model_utils.py +53 -0
  63. optimum/rbln/utils/runtime_utils.py +1 -1
  64. optimum/rbln/utils/submodule.py +114 -0
  65. optimum_rbln-0.1.15.dist-info/METADATA +106 -0
  66. {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/RECORD +69 -66
  67. {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
  68. optimum/rbln/transformers/generation/streamers.py +0 -139
  69. optimum/rbln/transformers/generation/utils.py +0 -397
  70. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
  71. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
  72. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
  73. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
  74. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
  75. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
  76. optimum/rbln/utils/context.py +0 -58
  77. optimum_rbln-0.1.13.dist-info/METADATA +0 -120
  78. optimum_rbln-0.1.13.dist-info/entry_points.txt +0 -4
  79. {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
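The headline additions in this release are the Stable Diffusion 3 pipelines (entries 22–25) and the reorganization of the diffusers models into autoencoders/, transformers/, and unets/ subpackages. A hedged usage sketch follows; the class name RBLNStableDiffusion3Pipeline mirrors the naming of the existing RBLNStableDiffusionXLPipeline and, like the checkpoint id and options shown, is an assumption to verify against the 0.1.15 release rather than something stated in this diff. Additional rbln_* compile-time options (image size, batch size) would typically be passed as well.

# Hypothetical usage of the new Stable Diffusion 3 support; the class name,
# checkpoint id, and options are illustrative assumptions, not taken from this diff.
from optimum.rbln import RBLNStableDiffusion3Pipeline

pipe = RBLNStableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative checkpoint id
    export=True,  # compile the diffusers checkpoint for RBLN NPUs at load time
)
pipe.save_pretrained("sd3-rbln")  # reuse the compiled artifacts without re-exporting
image = pipe("A watercolor fox in a forest").images[0]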
optimum/rbln/transformers/models/midm/midm_architecture.py

@@ -21,18 +21,24 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-from typing import Optional, Tuple, Union
+from typing import TYPE_CHECKING, Tuple

 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast

-from ....transformers.models.decoderonly.decoderonly_architecture import (
-    RotaryEmbedding,
-    rotate_half,
-    slice_and_unsqueeze_cos_sin,
+from ....transformers.models.decoderonly.decoderonly_architecture import rotate_half
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+    apply_rotary_pos_emb_partial,
 )
-from ...cache_utils import RebelDynamicCache_4D
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as MidmLMHeadModel


 def apply_rotary_to_tensor(tensor, cos, sin, rot_dim):
@@ -50,253 +56,93 @@ def apply_rotary_pos_emb(q, k, cos, sin):
     return q_embed, k_embed


-class MidmLMHeadModelWrapper(torch.nn.Module):
-    """A wrapper class for the Midm model with a language modeling head."""
-
-    def __init__(self, model, max_seq_len):
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.max_seq_len = max_seq_len
-
-        self.config.partial_rotary_factor = model.config.rotary_percentage
-        self.config.head_dim = self.config.n_embd // self.config.n_head
+class MidmLMHeadModelWrapper(DecoderOnlyWrapper):
+    def get_rotary_emb(self, max_seq_len):
         self.config.rope_theta = 10000
-        self.rotary_emb = RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        attention_mask: torch.Tensor,
-        cache_position: torch.LongTensor,
-        batch_position: int,
-        query_idx: int,
-        *past_key_values,
-    ):
-        """Defines the forward pass for the wrapper model."""
-        if input_ids.shape[1] == 1:
-            rbln_batch_position = None
-        else:
-            rbln_batch_position = batch_position
-
-        past_key_values = RebelDynamicCache_4D.from_input_format(
-            cache_position,
-            self.config.num_hidden_layers,
-            *past_key_values,
-        )
-
-        outputs = _MidmModel.forward(
-            self.model,
-            input_ids=input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            rotary_pos_emb=self.rotary_emb,
-            batch_ids=rbln_batch_position,
-        )
-
-        hidden_states = outputs[0]
-        if batch_position >= 0:
-            hidden_states = hidden_states[:, query_idx].unsqueeze(1)
-
-        logits = self.lm_head(hidden_states)
-        output = (logits,) + outputs[1:]
-
-        return output, batch_position + query_idx
-
-
-def layernorm1p(module, input):
-    """Applies Layer Normalization with a slight modification on the weights."""
-    return torch.nn.functional.layer_norm(input, module.normalized_shape, module.weight + 1, module.bias, module.eps)
-
-
-class _MidmAttention:
-    """Custom implementation of the MidmAttention class with specific modifications."""
-
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        """Computes the attention weights and output."""
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+        self.config.head_dim = self.config.n_embd // self.config.n_head
+        self.config.partial_rotary_factor = self.config.rotary_percentage
+        return super().get_rotary_emb(max_seq_len=max_seq_len)
+
+    def convert_to_rbln_causal_lm(self, causal_lm: "MidmLMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = MidmAttention(layer.attn)
+            new_layer = MidmLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = MidmModel(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class MidmModel(DecoderOnlyModel):
+    mask_fmin = -10000.0
+
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
             )

-        if self.scale_attn_by_inverse_layer_idx or self.scale_qk_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
+        return layernorm1p

-        if attention_mask is not None:
-            attn_weights = attn_weights + attention_mask
-
-        if self.scale_qk_by_inverse_layer_idx:
-            attn_weights = attn_weights * float(self.layer_idx + 1)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_weights = attn_weights.type(value.dtype)
-
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        """Defines the forward pass for the attention mechanism."""
-        bsz, q_len, _ = hidden_states.size()
-
-        querys, keys, values = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        querys = self._split_heads(querys, self.num_heads, self.head_dim).contiguous()
-        keys = self._split_heads(keys, self.num_heads, self.head_dim).contiguous()
-        values = self._split_heads(values, self.num_heads, self.head_dim).contiguous()
-
-        querys, keys = apply_rotary_pos_emb(querys, keys, cos, sin)
-
-        # Decoder
-        if (batch_index is None or batch_index == -1) and bsz > 1:
-            all_key_states = []
-            all_value_states = []
-            all_attn_output = []
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_f)
+        else:
+            return self._original_mod.ln_f

-            for b in range(bsz):
-                query = querys[b].unsqueeze(0)
-                attn_mask = attention_mask[b].unsqueeze(0)
-                key = keys[b].unsqueeze(0)
-                value = values[b].unsqueeze(0)
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte

-                key, value = past_key_value.update(
-                    key,
-                    value,
-                    self.layer_idx,
-                    b,
-                )
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe

-                attn_output, _ = _MidmAttention._attn(self, query, key, value, attn_mask)
-                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)

-                all_key_states.append(key)
-                all_value_states.append(value)
-                all_attn_output.append(attn_output)
+class MidmLayer(DecoderOnlyLayer):
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
+            )

-            keys = torch.cat(all_key_states, dim=0)
-            values = torch.cat(all_value_states, dim=0)
-            attn_output = torch.cat(all_attn_output, dim=0)
+        return layernorm1p

+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_1)
         else:
-            if batch_index is None or batch_index == -1:
-                batch_index = 0
-
-            keys, values = past_key_value.update(
-                keys,
-                values,
-                self.layer_idx,
-                batch_index,
-                read_first_step=True,
-            )
+            return self._original_mod.ln_1

-            attn_output, _ = _MidmAttention._attn(self, querys, keys, values, attention_mask)
-            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-        attn_output = self.c_proj(attn_output)
-        return attn_output, keys, values
-
-
-class _MidmBlock:
-    """Custom implementation of the MidmBlock class with specific modifications."""
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        """Defines the forward pass for the block."""
-        residual = hidden_states
-        if self.use_layernorm1p:
-            hidden_states = layernorm1p(self.ln_1, hidden_states)
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_2)
         else:
-            hidden_states = self.ln_1(hidden_states)
-
-        hidden_states, k, v = _MidmAttention.forward(
-            self.attn,
-            hidden_states,
-            attention_mask=attention_mask,
-            past_key_value=past_key_value,
-            cos=cos,
-            sin=sin,
-            batch_index=batch_ids,
+            return self._original_mod.ln_2
+
+
+class MidmAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size
+        self.num_key_value_heads = self._original_mod.num_heads
+
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states
+
+    def rbln_attention(self, *args, **kwargs):
+        return super().rbln_attention(
+            *args,
+            **kwargs,
+            layer_idx=self.layer_idx,
+            scale_attn_weights=self._original_mod.scale_attn_weights,
+            scale_attn_by_inverse_layer_idx=self._original_mod.scale_attn_by_inverse_layer_idx,
         )
-        past_key_value.assign(k, v, layer_idx)
-
-        hidden_states = hidden_states + residual

-        residual = hidden_states
-        if self.use_layernorm1p:
-            hidden_states = layernorm1p(self.ln_2, hidden_states)
-        else:
-            hidden_states = self.ln_2(hidden_states)
-
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + feed_forward_hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _MidmModel:
-    """Custom implementation of the MidmModel class with specific modifications."""
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[RebelDynamicCache_4D] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        rotary_pos_emb=None,
-        batch_ids: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        """Defines the forward pass for the model."""
-        input_shape = input_ids.size()
-
-        attention_mask = (1.0 - attention_mask) * -10000.0
-
-        inputs_embeds = self.wte(input_ids)
-
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
-        hidden_states = inputs_embeds
-
-        for layer_idx, (block, _) in enumerate(zip(self.h, past_key_values)):
-            hidden_states, updated_cache = _MidmBlock.forward(
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_values,
-                batch_ids=batch_ids,
-                cos=cos,
-                sin=sin,
-            )
-
-        hidden_states = layernorm1p(self.ln_f, hidden_states)
-        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
-        hidden_states = hidden_states.view(output_shape)
-
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-        )
+    def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
+        return apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim=cos.shape[-1])
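After this rewrite, the Midm-specific code reduces to getters (get_embedding, get_pre_attention_layernorm, projection, ...) that map GPT-2-style attribute names (wte, wpe, ln_1, ln_2, c_attn, c_proj) onto the shared DecoderOnly* base classes; the only surviving numeric quirk is the layernorm1p closure, where Midm checkpoints store LayerNorm weights as an offset from 1.0 so the effective scale is module.weight + 1. A minimal standalone sketch of that equivalence, independent of optimum.rbln (the helper name make_layernorm1p is illustrative):

import torch
import torch.nn as nn


def make_layernorm1p(module: nn.LayerNorm):
    # Mirrors the closure returned by get_layernorm1p() in the diff above.
    def layernorm1p(input: torch.Tensor):
        return nn.functional.layer_norm(
            input, module.normalized_shape, module.weight + 1, module.bias, module.eps
        )

    return layernorm1p


ln = nn.LayerNorm(8)
nn.init.normal_(ln.weight, std=0.02)  # weights stored near zero, as a layernorm1p-style checkpoint would

# Reference: an ordinary LayerNorm with the +1 offset folded into its weight.
ln_ref = nn.LayerNorm(8)
with torch.no_grad():
    ln_ref.weight.copy_(ln.weight + 1)
    ln_ref.bias.copy_(ln.bias)

x = torch.randn(2, 4, 8)
assert torch.allclose(make_layernorm1p(ln)(x), ln_ref(x), atol=1e-6)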
optimum/rbln/transformers/models/midm/modeling_midm.py

@@ -21,12 +21,12 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+
+from transformers import AutoModelForCausalLM
+
 from ....utils import logging
-from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .hf_hub_cached.modeling_midm import MidmLMHeadModel
-from .midm_architecture import (
-    MidmLMHeadModelWrapper,
-)
+from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
+from .midm_architecture import MidmLMHeadModelWrapper


 logger = logging.get_logger(__name__)
@@ -47,7 +47,7 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     """

     _decoder_wrapper_cls = MidmLMHeadModelWrapper
-    _original_cls = MidmLMHeadModel
+    _hf_class = AutoModelForCausalLM

     @classmethod
     def from_pretrained(cls, *args, **kwargs):
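With the vendored hf_hub_cached modeling code removed (entries 72–75 in the file list), RBLNMidmLMHeadModel now resolves the original checkpoint through transformers' AutoModelForCausalLM instead of a bundled MidmLMHeadModel class. A hedged usage sketch follows; the checkpoint id and rbln_* compile options are illustrative assumptions, and trust_remote_code is shown because Midm publishes its modeling code on the Hub:

from optimum.rbln import RBLNMidmLMHeadModel

# Illustrative export-and-reuse flow; arguments other than export=True are assumptions.
model = RBLNMidmLMHeadModel.from_pretrained(
    "KT-AI/midm-bitext-S-7B-inst-v1",  # illustrative checkpoint id
    export=True,                        # compile the Hugging Face checkpoint for RBLN NPUs
    trust_remote_code=True,             # Midm's modeling code lives on the Hub
    rbln_max_seq_len=8192,              # illustrative compile-time option
    rbln_batch_size=1,                  # illustrative compile-time option
)
model.save_pretrained("midm-rbln")      # later loads reuse the compiled artifacts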