optimum-rbln 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. optimum/rbln/__init__.py +14 -0
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/__init__.py +0 -1
  4. optimum/rbln/diffusers/models/controlnet.py +3 -0
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +2 -2
  6. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +22 -144
  7. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +107 -59
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +106 -54
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +130 -71
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +131 -72
  11. optimum/rbln/modeling_alias.py +14 -0
  12. optimum/rbln/modeling_base.py +110 -0
  13. optimum/rbln/transformers/__init__.py +6 -0
  14. optimum/rbln/transformers/cache_utils.py +111 -0
  15. optimum/rbln/transformers/generation/utils.py +0 -2
  16. optimum/rbln/transformers/models/__init__.py +2 -0
  17. optimum/rbln/transformers/models/bart/bart_architecture.py +0 -5
  18. optimum/rbln/transformers/models/decoderonly/__init__.py +36 -0
  19. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +515 -0
  20. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +349 -0
  21. optimum/rbln/transformers/models/gemma/__init__.py +24 -0
  22. optimum/rbln/transformers/models/gemma/gemma_architecture.py +116 -0
  23. optimum/rbln/transformers/models/gemma/modeling_gemma.py +61 -0
  24. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +201 -166
  25. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +56 -220
  26. optimum/rbln/transformers/models/llama/llama_architecture.py +3 -610
  27. optimum/rbln/transformers/models/llama/modeling_llama.py +8 -442
  28. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +2 -1
  29. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -4
  30. optimum/rbln/transformers/models/midm/midm_architecture.py +160 -357
  31. optimum/rbln/transformers/models/midm/modeling_midm.py +40 -272
  32. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -6
  33. optimum/rbln/transformers/models/xlm_roberta/__init__.py +24 -0
  34. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +125 -0
  35. {optimum_rbln-0.1.7.dist-info → optimum_rbln-0.1.8.dist-info}/METADATA +2 -3
  36. {optimum_rbln-0.1.7.dist-info → optimum_rbln-0.1.8.dist-info}/RECORD +38 -30
  37. optimum/rbln/transformers/models/llama/llama_architecture_cb.py +0 -764
  38. {optimum_rbln-0.1.7.dist-info → optimum_rbln-0.1.8.dist-info}/WHEEL +0 -0
  39. {optimum_rbln-0.1.7.dist-info → optimum_rbln-0.1.8.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/gpt2/gpt2_architecture.py
@@ -21,224 +21,259 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from typing import List, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    BaseModelOutputWithPastAndCrossAttentions,
-)
-from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model
+from transformers.modeling_outputs import BaseModelOutputWithPast
 
+from ...cache_utils import RebelDynamicCache_4D
 
-class _GPT2Attention(GPT2Attention):
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
-            )
 
-        # Layer-wise attention scaling
-        if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        # -------------------
-        # Below are deleted since "where" op does not supported on RBLN graph.
-        # -------------------
-        # if not self.is_cross_attention:
-        #     # if only "normal" attention layer implements causal mask
-        #     query_length, key_length = query.size(-2), key.size(-2)
-        #     causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
-        #     mask_value = torch.finfo(attn_weights.dtype).min
-        #     # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        #     # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-        #     mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
-        #     attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
+class GPT2LMHeadModelWrapper(torch.nn.Module):
+    def __init__(self, model, max_seq_len):
+        super().__init__()
+        self.model = model.transformer
+        self.lm_head = model.lm_head
+        self.config = model.config
+        self.max_seq_len = max_seq_len
+        self.forward_dict = self.get_forward_dict()
+
+    def get_forward_dict(self):
+        forward_dict = {
+            "wrapper": _GPT2Model.forward,
+            "model": _GPT2Block.forward,
+            "decoder_layer": _GPT2Attention.forward,
+        }
+        return forward_dict
 
-        if attention_mask is not None:
-            # Apply the attention mask
-            attn_weights = attn_weights + attention_mask
+    def forward(
+        self,
+        input_ids,
+        attention_mask,
+        cache_position,
+        batch_position,
+        *past_key_values,
+    ):
+        if input_ids.shape[1] == 1:
+            rbln_batch_position = None
+        else:
+            rbln_batch_position = batch_position
+
+        # Formatting list of past_kv to DynamicCache class.
+        past_key_value = RebelDynamicCache_4D.from_input_format(
+            cache_position,
+            self.config.n_layer,
+            *past_key_values,
+        )
 
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        outputs = self.forward_dict["wrapper"](
+            self.model,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=cache_position,
+            past_key_value=past_key_value,
+            batch_ids=rbln_batch_position,
+            forward_dict=self.forward_dict,
+            # rotary_emb differenct from_llama
+        )
 
-        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
-        attn_weights = attn_weights.type(value.dtype)
-        # attn_weights = self.attn_dropout(attn_weights)
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
 
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
+        output = (logits,) + outputs[1:]
 
-        attn_output = torch.matmul(attn_weights, value)
+        return output, batch_position
 
-        return attn_output, attn_weights
 
+class _GPT2Model:
     def forward(
         self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        past_key_values: List[List[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[RebelDynamicCache_4D] = None,
+        batch_ids: Optional[torch.LongTensor] = None,
+        forward_dict: Optional[Dict[str, classmethod]] = None,
+    ) -> BaseModelOutputWithPast:
+        b_size, q_len = input_ids.shape
+        inputs_embeds = self.wte(input_ids)
 
-        query = self._split_heads(query, self.num_heads, self.head_dim)
-        key = self._split_heads(key, self.num_heads, self.head_dim)
-        value = self._split_heads(value, self.num_heads, self.head_dim)
+        if position_ids.shape[0] > 1:
+            position_embeds = []
+            for b_idx in range(b_size):
+                position_embed = self.wpe(position_ids[b_idx])
+                # position_embed = position_embed.dtype(inputs_embeds.dtype)
+                position_embeds.append(position_embed)
 
-        if past_key_values is not None:
-            past_key, past_value = past_key_values[self.layer_idx]
-            query_length = query.shape[-2]
+            position_embeds = torch.cat(position_embeds, dim=0).unsqueeze(1)
+        else:
+            position_embeds = self.wpe(position_ids)
 
-            key = past_key.slice_scatter(key, dim=2, start=cache_position, end=cache_position + query_length)
-            value = past_value.slice_scatter(value, dim=2, start=cache_position, end=cache_position + query_length)
+        hidden_states = inputs_embeds + position_embeds
 
-            past_key_values[self.layer_idx] = [key, value]
+        # GPT2Attention mask.
+        # Here we assume mask is causal mask, (batch, 1, query_length, key_length + query_length)
+        attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
 
-        attn_output, _ = _GPT2Attention._attn(self, query, key, value, attention_mask, head_mask)
+        for layer_idx, block in enumerate(self.h):
+            hidden_states, updated_cache = forward_dict["model"](
+                block,
+                hidden_states,
+                layer_idx,
+                attention_mask=attention_mask,
+                past_key_value=past_key_value,
+                position_ids=position_ids,
+                batch_ids=batch_ids,
+                forward_dict=forward_dict,
+            )
 
-        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        hidden_states = self.ln_f(hidden_states)
+        output_shape = (-1,) + (q_len,) + (hidden_states.size(-1),)
+        hidden_states = hidden_states.view(output_shape)
 
-        attn_output = self.c_proj(attn_output)
-        attn_output = self.resid_dropout(attn_output)
+        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
+        next_cache = updated_cache.to_legacy_cache()
 
-        return attn_output
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+        )
 
 
-class _GPT2Block(GPT2Block):
+class _GPT2Block:
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
-        past_key_values: List[List[torch.Tensor]] = None,
+        layer_idx: int,
         attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[RebelDynamicCache_4D] = None,
+        batch_ids: Optional[torch.LongTensor] = None,
+        forward_dict: Optional[Dict[str, classmethod]] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, RebelDynamicCache_4D]:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
 
-        attn_output = _GPT2Attention.forward(
+        hidden_states, k, v = forward_dict["decoder_layer"](
             self.attn,
-            hidden_states,
-            past_key_values=past_key_values,
+            hidden_states=hidden_states,
             attention_mask=attention_mask,
-            head_mask=head_mask,
-            cache_position=cache_position,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            batch_index=batch_ids,
         )
+        past_key_value.assign(k, v, layer_idx)
 
         # residual connection
-        hidden_states = attn_output + residual
+        hidden_states = residual + hidden_states
 
         residual = hidden_states
         hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + feed_forward_hidden_states
-
-        return hidden_states
-
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
 
-class _GPT2Model(GPT2Model):
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: List[List[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        input_shape = input_ids.size()
+        return hidden_states, past_key_value
 
-        if position_ids is None:
-            # force dtype to torch.long -> torch.int32 (to match cache_position)
-            position_ids = torch.arange(0, input_shape[-1], dtype=torch.int32) + cache_position
-            position_ids = position_ids.unsqueeze(0)
-
-        # GPT2Attention mask.
-        # Here we assume mask is causal mask, (batch, 1, query_length, key_length + query_length)
-        attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
 
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+class _GPT2Attention:
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))
 
-        for i, block in enumerate(self.h):
-            hidden_states = _GPT2Block.forward(
-                block,
-                hidden_states,
-                past_key_values=past_key_values,
-                attention_mask=attention_mask,
-                head_mask=head_mask[i],
-                cache_position=cache_position,
+        if self.scale_attn_weights:
+            attn_weights = attn_weights / torch.full(
+                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
             )
 
-        hidden_states = self.ln_f(hidden_states)
-        hidden_states = hidden_states.view(output_shape)
-        return BaseModelOutputWithPast(last_hidden_state=hidden_states, past_key_values=past_key_values)
+        # Layer-wise attention scaling
+        if self.scale_attn_by_inverse_layer_idx:
+            attn_weights = attn_weights / float(self.layer_idx + 1)
 
+        # -------------------
+        # Below are deleted since "where" op does not supported on RBLN graph.
+        # -------------------
+        # if not self.is_cross_attention:
+        #     # if only "normal" attention layer implements causal mask
+        #     query_length, key_length = query.size(-2), key.size(-2)
+        #     causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+        #     mask_value = torch.finfo(attn_weights.dtype).min
+        #     # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+        #     # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+        #     mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+        #     attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
 
-class GPT2LMHeadModelWrapper(torch.nn.Module):
-    def __init__(self, gpt):
-        super().__init__()
-        self.model = gpt
+        # Apply the attention mask
+        attn_weights.view(
+            -1,
+        )
+        attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
 
     def forward(
         self,
-        input_ids: torch.Tensor,
-        attention_mask: torch.Tensor,
-        cache_position: torch.LongTensor,
-        *past_key_values: torch.Tensor,
-    ):
-        kv_cache = []
-        for i in range(self.model.config.n_layer):
-            kv_cache.append((past_key_values[2 * i], past_key_values[2 * i + 1]))
-
-        transformer_outputs = _GPT2Model.forward(
-            self.model.transformer,
-            input_ids=input_ids,
-            past_key_values=kv_cache,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-        )
-
-        hidden_states = transformer_outputs[0]
+        hidden_states: Optional[Tuple[torch.FloatTensor]],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[RebelDynamicCache_4D] = None,
+        batch_index: Optional[int] = None,
+        **kwargs,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
+        bsz, q_len, _ = hidden_states.size()
+        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
 
-        # TODO : Use query_length here to pick last logit
-        # batch_size, sequence_length = hidden_states.shape[:2]
-        # hidden_states = hidden_states.view(batch_size * sequence_length, -1)
-        # hidden_states = torch.nn.functional.embedding(query_length, hidden_states)
-        # hidden_states = hidden_states.view(batch_size, 1, -1)
+        querys = self._split_heads(query, self.num_heads, self.head_dim)  # (batch, head, seq_length, head_features)
+        keys = self._split_heads(key, self.num_heads, self.head_dim)
+        values = self._split_heads(value, self.num_heads, self.head_dim)
+
+        # Decoder
+        if (batch_index is None or batch_index == -1) and bsz > 1:
+            all_keys = []
+            all_values = []
+            all_attn_output = []
+
+            for b in range(bsz):
+                query = querys[b].unsqueeze(0)
+                attn_mask = attention_mask[b].unsqueeze(0)
+                key = keys[b].unsqueeze(0)
+                value = values[b].unsqueeze(0)
+
+                key, value = past_key_value.update(
+                    key,
+                    value,
+                    self.layer_idx,
+                    b,
+                )
+
+                attn_output, _ = _GPT2Attention._attn(self, query, key, value, attn_mask)
+                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+
+                all_keys.append(key)
+                all_values.append(value)
+                all_attn_output.append(attn_output)
+
+            keys = torch.cat(all_keys, dim=0)
+            values = torch.cat(all_values, dim=0)
+            attn_output = torch.cat(all_attn_output, dim=0)
+
+        # Prefill
+        else:
+            if batch_index is None or batch_index == -1:
+                batch_index = 0
+
+            keys, values = past_key_value.update(
+                keys,
+                values,
+                self.layer_idx,
+                batch_index,
+                read_first_step=True,
+            )
 
-        lm_logits = self.model.lm_head(hidden_states)
-        kv_cache = transformer_outputs[1]
+            attn_output, _ = _GPT2Attention._attn(self, querys, keys, values, attention_mask)
+            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
 
-        past_key_values = []
-        for i in range(self.model.config.n_layer):
-            past_key_values.append(kv_cache[i][0])
-            past_key_values.append(kv_cache[i][1])
+        attn_output = self.c_proj(attn_output)
 
-        output = (lm_logits,) + tuple(past_key_values)
-        return output
+        return attn_output, keys, values
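For orientation, the rewritten GPT2LMHeadModelWrapper shown in the diff is built directly from a Hugging Face GPT2LMHeadModel plus a compile-time maximum sequence length, and it dispatches execution through forward_dict instead of the stock GPT2 forward methods. The following is a minimal sketch of that construction step; the checkpoint name "gpt2" and the max_seq_len value of 128 are illustrative assumptions, not values taken from this diff.

from transformers import GPT2LMHeadModel

from optimum.rbln.transformers.models.gpt2.gpt2_architecture import GPT2LMHeadModelWrapper

# Illustrative only: any GPT-2 checkpoint and sequence length would do here.
model = GPT2LMHeadModel.from_pretrained("gpt2")
wrapper = GPT2LMHeadModelWrapper(model, max_seq_len=128)

# The wrapper keeps the transformer body and lm_head separately and routes the
# layer stack through forward_dict ("wrapper" -> _GPT2Model, "model" -> _GPT2Block,
# "decoder_layer" -> _GPT2Attention) rather than the original GPT2 classes.
print(type(wrapper.model).__name__)  # GPT2Model
print(sorted(wrapper.forward_dict))  # ['decoder_layer', 'model', 'wrapper']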