optimum-rbln 0.1.15__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- optimum/rbln/__init__.py +26 -33
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/diffusers/__init__.py +4 -0
- optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +66 -24
- optimum/rbln/diffusers/models/__init__.py +2 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +38 -12
- optimum/rbln/diffusers/models/autoencoders/vae.py +0 -1
- optimum/rbln/diffusers/models/controlnet.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +5 -7
- optimum/rbln/diffusers/pipelines/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +8 -7
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +23 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +1 -2
- optimum/rbln/modeling.py +13 -347
- optimum/rbln/modeling_base.py +24 -4
- optimum/rbln/modeling_config.py +31 -7
- optimum/rbln/ops/__init__.py +26 -0
- optimum/rbln/ops/attn.py +221 -0
- optimum/rbln/ops/flash_attn.py +70 -0
- optimum/rbln/ops/kv_cache_update.py +69 -0
- optimum/rbln/transformers/__init__.py +20 -0
- optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
- optimum/rbln/transformers/modeling_generic.py +385 -0
- optimum/rbln/transformers/models/auto/__init__.py +23 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +0 -1
- optimum/rbln/transformers/models/bart/__init__.py +0 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
- optimum/rbln/transformers/models/bart/modeling_bart.py +8 -4
- optimum/rbln/transformers/models/clip/modeling_clip.py +1 -1
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -7
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +329 -328
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +92 -107
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +2 -3
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -10
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
- optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +1 -0
- optimum/rbln/transformers/models/midm/midm_architecture.py +11 -11
- optimum/rbln/transformers/models/midm/modeling_midm.py +0 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +2 -3
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +57 -57
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
- optimum/rbln/transformers/models/t5/__init__.py +0 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +5 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
- optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +77 -54
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
- optimum/rbln/transformers/utils/rbln_quantization.py +0 -1
- optimum/rbln/utils/decorator_utils.py +51 -15
- optimum/rbln/utils/import_utils.py +7 -0
- optimum/rbln/utils/logging.py +37 -0
- optimum/rbln/utils/model_utils.py +0 -1
- optimum/rbln/utils/runtime_utils.py +9 -3
- optimum/rbln/utils/save_utils.py +17 -0
- optimum/rbln/utils/submodule.py +23 -0
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/METADATA +37 -26
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/RECORD +76 -72
- optimum_rbln-0.2.0.dist-info/licenses/LICENSE +288 -0
- optimum/rbln/transformers/cache_utils.py +0 -107
- optimum/rbln/utils/timer_utils.py +0 -43
- optimum_rbln-0.1.15.dist-info/licenses/LICENSE +0 -201
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/WHEEL +0 -0
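Several of the changes above route through the new optimum/rbln/ops package (attn.py, flash_attn.py, kv_cache_update.py), which registers custom PyTorch operators used by the compiled graphs; the Whisper diff below, for instance, calls torch.ops.rbln_custom_ops.rbln_cache_update. The sketch that follows is only an illustration of the registration mechanism, not the package's actual code: it defines a comparable cache-update op under a made-up demo_rbln_ops namespace with torch.library and backs it with an eager reference implementation.

```python
import torch
from torch.library import Library

# Hypothetical stand-in for the kind of operator optimum/rbln/ops registers.
# Namespace, schema, and the eager fallback below are illustrative assumptions,
# not the package's actual registration.
lib = Library("demo_rbln_ops", "DEF")
lib.define("cache_update(Tensor cache, Tensor state, Tensor position, Tensor axis) -> Tensor")


def cache_update_eager(cache, state, position, axis):
    # Eager reference: write `state` into `cache` at `position` along dim `axis`.
    dim, start = int(axis), int(position)
    return cache.slice_scatter(state, dim=dim, start=start, end=start + state.size(dim))


lib.impl("cache_update", cache_update_eager, "CompositeExplicitAutograd")

cache = torch.zeros(1, 4, 8, 16)   # e.g. (batch, heads, max_len, head_dim)
state = torch.ones(1, 4, 1, 16)    # one new key/value slice
updated = torch.ops.demo_rbln_ops.cache_update(
    cache, state, torch.tensor(3, dtype=torch.int16), torch.tensor(2, dtype=torch.int16)
)
print(updated[0, 0, 2:5, 0])  # tensor([0., 1., 0.])
```

Registering the update as a single named op keeps it visible as one node in the traced graph, which is what lets a backend compiler replace it with an in-place device write instead of the functional slice_scatter used for eager execution.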
optimum/rbln/transformers/models/whisper/whisper_architecture.py

@@ -27,401 +27,308 @@ import torch
 from torch import nn
 from transformers.modeling_attn_mask_utils import (
     _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import (
     BaseModelOutput,
-    BaseModelOutputWithPastAndCrossAttentions,
     Seq2SeqLMOutput,
 )
-from transformers.models.whisper.modeling_whisper import (
-    WhisperAttention,
-    WhisperDecoder,
-    WhisperDecoderLayer,
-    WhisperPositionalEmbedding,
-    WhisperSdpaAttention,
-)
 from transformers.utils import logging
 
+from ....ops import register_rbln_custom_cache_update
 
-logger = logging.get_logger(__name__)
 
+logger = logging.get_logger(__name__)
 
-class _WhisperAttention(WhisperAttention):
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        cache_position: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        bsz, tgt_len, _ = hidden_states.size()
-        is_cross_attention = key_value_states is not None
-
-        query_states = self.q_proj(hidden_states) * self.scaling
-
-        if is_cross_attention:
-            is_dummy_decoder = len(key_value_states.shape) > 1
-            if is_dummy_decoder:
-                key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-                value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-            else:
-                key_states = past_key_value[0]
-                value_states = past_key_value[1]
-        else:
-            if self.is_decoder:
-                key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-                value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-                key_states = past_key_value[0].slice_scatter(
-                    key_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-                value_states = past_key_value[1].slice_scatter(
-                    value_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-            else:
-                key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-                value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            present_key_value = (key_states, value_states)
-        else:
-            present_key_value = None
-
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.reshape(*proj_shape)
-        value_states = value_states.reshape(*proj_shape)
-
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-        src_len = key_states.size(1)
-        if attention_mask is not None:
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
-
+class WhisperWrapper:
+    def __init__(self, model, rbln_token_timestamps):
+        register_rbln_custom_cache_update()
+        self.encoder = WhisperEncoderWrapper(model)
+        self.decoder = WhisperDecoderWrapper(model, output_attentions=rbln_token_timestamps)
 
-        attn_output = torch.bmm(attn_weights, value_states)
 
-
-
+class WhisperEncoderWrapper(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.config = model.config
+        self.encoder = model.get_encoder()
+        self.num_heads = self.config.decoder_attention_heads
+        self.d_kv = self.config.d_model // self.num_heads
+        self.cross_k_projects, self.cross_v_projects = self._extract_cross_kv_projects(model.get_decoder().layers)
 
-
-
+    def _extract_cross_kv_projects(self, decoder_layers: nn.Module):
+        return (
+            nn.ModuleList(layer.encoder_attn.k_proj for layer in decoder_layers),
+            nn.ModuleList(layer.encoder_attn.v_proj for layer in decoder_layers),
+        )
 
-
+    def forward(
+        self,
+        input_features: Optional[torch.LongTensor],
+        cross_key_values: torch.Tensor,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        # 1. get encoder last_hidden_states
+        encoder_outputs = self.encoder(input_features=input_features)
+        last_hidden_states = encoder_outputs[0]
 
-
+        # 2. pre-compute cross_attention's past_key_value which used in decoder phase.
+        cross_kv = []
+        batch_size = input_features.shape[0]
+        for k_proj, v_proj in zip(self.cross_k_projects, self.cross_v_projects):
+            past_k = k_proj(last_hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)
+            past_v = v_proj(last_hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)
 
+            cross_kv.append(past_k)
+            cross_kv.append(past_v)
 
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        cache_position: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        bsz, tgt_len, _ = hidden_states.size()
+        cross_kv = torch.stack(cross_kv, dim=0)
 
-
-
-
-
-        if is_cross_attention:
-            is_dummy_decoder = len(key_value_states.shape) > 1
-            if is_dummy_decoder:
-                key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-                value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-            else:
-                key_states = past_key_value[0]
-                value_states = past_key_value[1]
-        else:
-            if self.is_decoder:
-                key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-                value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-                key_states = past_key_value[0].slice_scatter(
-                    key_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-                value_states = past_key_value[1].slice_scatter(
-                    value_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-            else:
-                key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-                value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            present_key_value = (key_states, value_states)
-        else:
-            present_key_value = None
-
-        query_states = self._shape(query_states, tgt_len, bsz)
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=0.0,
-            is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
-        )
+        # 3. update cross_attention's past_key_value to the device-dram for optimization.
+        bidx = torch.tensor(0, dtype=torch.int16)
+        axis = torch.tensor(1, dtype=torch.int16)
+        cross_key_values = torch.ops.rbln_custom_ops.rbln_cache_update(cross_key_values, cross_kv, bidx, axis)
 
-
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        return cross_key_values
 
-        attn_output = self.out_proj(attn_output)
 
-
+class WhisperDecoderWrapper(torch.nn.Module):
+    def __init__(self, model, output_attentions: bool = False):
+        super().__init__()
+        self.config = model.config
+        self.num_layers = self.config.decoder_layers
+        self.proj_out = model.proj_out
+        self.decoder = self.convert_to_rbln_conditional_generation(model)
+        self.output_attentions = output_attentions
 
+    def convert_to_rbln_conditional_generation(self, model: nn.Module):
+        new_layers = []
+        for layer in model.get_decoder().layers:
+            self_attn = WhisperSelfAttention(layer.self_attn)
+            cross_attn = WhisperCrossAttention(layer.encoder_attn)
+            new_layers.append(WhisperDecoderLayer(layer, self_attn, cross_attn))
 
-
+        decoder_model = WhisperDecoder(model.get_decoder(), new_layers)
 
+        return decoder_model
 
-class _WhisperDecoderLayer(WhisperDecoderLayer):
     def forward(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
+        decoder_input_ids: torch.Tensor,
+        decoder_attention_mask: torch.Tensor,
+        cache_position: torch.Tensor,
+        cross_kv_cache: torch.Tensor,
+        *self_kv_cache: torch.Tensor,
+    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+        # prepare past_key_values
+        self_past_key_values = ()
+        cross_past_key_values = ()
+        for i in range(0, self.num_layers * 2, 2):
+            self_past_key_values = self_past_key_values + ((self_kv_cache[i], self_kv_cache[i + 1]),)
+            cross_past_key_values = cross_past_key_values + ((cross_kv_cache[i], cross_kv_cache[i + 1]),)
 
-
-
-
-
-            attention_mask=attention_mask,
+        # Decode
+        sequence_output, self_present_key_values, cross_attentions = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
             cache_position=cache_position,
+            self_past_key_values=self_past_key_values,
+            cross_past_key_values=cross_past_key_values,
         )
-        hidden_states = residual + hidden_states
 
-
-        residual = hidden_states
-        hidden_states = self.encoder_attn_layer_norm(hidden_states)
-        cross_attn_past_key_value = past_key_value[2:] if past_key_value is not None else None
-        if output_attentions:
-            hidden_states, cross_attn_weights, cross_attn_present_key_value = _WhisperAttention.forward(
-                self.encoder_attn,
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                past_key_value=cross_attn_past_key_value,
-                cache_position=cache_position,
-            )
-        else:
-            hidden_states, cross_attn_weights, cross_attn_present_key_value = ATTN_FORWARD_MAP[attn_impl](
-                self.encoder_attn,
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                past_key_value=cross_attn_past_key_value,
-                cache_position=cache_position,
-            )
-        hidden_states = residual + hidden_states
-        present_key_value = present_key_value + cross_attn_present_key_value
+        lm_logits = self.proj_out(sequence_output)
 
-
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = residual + hidden_states
+        outputs = (lm_logits,)
+        outputs += self_present_key_values
 
-
+        if self.output_attentions:
+            # deocder's cross attention is used for token_timestamps
+            cross_attention = torch.stack(cross_attentions, dim=0)
+            outputs += (cross_attention,)
 
+        return outputs
 
-class _WhisperPositionalEmbedding(WhisperPositionalEmbedding):
-    def forward(self, input_ids, past_key_values_length=0, position_ids=None):
-        if position_ids is None:
-            return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[1]]
-        else:
-            return self.weight[position_ids]
 
+class WhisperDecoder(nn.Module):
+    def __init__(self, model, layers, **kwargs):
+        super().__init__()
+        self._original_mod = model
+        self.layers = nn.ModuleList(layers)
+        self.embed_tokens = model.embed_tokens
+        self.layer_norm = model.layer_norm
+        self.embed_positions = model.embed_positions
 
-class _WhisperDecoder(WhisperDecoder):
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-
-
+        self_past_key_values: Optional[torch.Tensor] = None,
+        cross_past_key_values: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
-        attn_impl: str = "eager",
-        output_attentions: bool = False,
-        **kwargs,
     ):
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_shape[-1])
 
         # positional embeding
         inputs_embeds = self.embed_tokens(input_ids)
-        positions =
-            self.embed_positions, input_ids, cache_position, cache_position
-        )
+        positions = self.embed_positions(input_ids, position_ids=cache_position)
         hidden_states = inputs_embeds + positions
 
         # prepare casual_attn_mask
-
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask, input_shape, inputs_embeds, cache_position
-            )
-        else:
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask, input_shape, inputs_embeds, cache_position
-            )
+        attention_mask = _prepare_4d_causal_attention_mask(attention_mask, input_shape, inputs_embeds, cache_position)
 
-
-
+        self_present_key_values = ()
+        cross_attentions = ()
         # iterate decoder_layer
-        for
-
-
-
+        for self_past_key_value, cross_past_key_value, decoder_layer in zip(
+            self_past_key_values, cross_past_key_values, self.layers
+        ):
+            layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask=attention_mask,
-
-
+                self_past_key_value=self_past_key_value,
+                cross_past_key_value=cross_past_key_value,
                 cache_position=cache_position,
-                attn_impl=attn_impl,
-                output_attentions=output_attentions,
             )
             hidden_states = layer_outputs[0]
+            self_present_key_values += layer_outputs[1]
+            cross_attentions += (layer_outputs[2],)
 
-            next_decoder_cache += (layer_outputs[1],)
-            if output_attentions:
-                all_cross_attentions += (layer_outputs[2],)
-
-        # layer_norm
         hidden_states = self.layer_norm(hidden_states)
 
-        return
-            last_hidden_state=hidden_states,
-            past_key_values=next_decoder_cache,
-            cross_attentions=all_cross_attentions,
-        )
+        return hidden_states, self_present_key_values, cross_attentions
 
 
-class
-    def __init__(self,
+class WhisperDecoderLayer(nn.Module):
+    def __init__(self, decoder_layer, self_attn, cross_attn):
         super().__init__()
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
+        self._original_mod = decoder_layer
+        self.self_attn = self_attn
+        self.encoder_attn = cross_attn
+        self.self_attn_layer_norm = decoder_layer.self_attn_layer_norm
+        self.encoder_attn_layer_norm = decoder_layer.encoder_attn_layer_norm
+        self.final_layer_norm = decoder_layer.final_layer_norm
+        self.activation_fn = decoder_layer.activation_fn
+        self.fc1 = decoder_layer.fc1
+        self.fc2 = decoder_layer.fc2
 
     def forward(
         self,
-
-
-
-
-
-    ) ->
-        #
-
-
-
-
-
-
-                cross_kv_cache[i],
-                cross_kv_cache[i + 1],
-            ),
-        )
-
-        # Decode
-        decoder_outputs = _WhisperDecoder.forward(
-            self.decoder,
-            input_ids=decoder_input_ids,
-            attention_mask=decoder_attention_mask,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        self_past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        cross_past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # Self Attention Block
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, _, self_present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_past_key_value,
+            attention_mask=attention_mask,
             cache_position=cache_position,
-            past_key_values=kv_cache,
-            encoder_hidden_states=torch.tensor([1]),
-            attn_impl=self.attn_impl,
-            output_attentions=self.output_attentions,
         )
-
-        lm_logits = self.proj_out(sequence_output)
+        hidden_states = residual + hidden_states
 
-        #
-
-
-
-
-
-
+        # Cross-Attention Block
+        residual = hidden_states
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+        hidden_states, cross_attn_weights, cross_present_key_value = self.encoder_attn(
+            hidden_states=hidden_states,
+            past_key_value=cross_past_key_value,
+        )
+        hidden_states = residual + hidden_states
 
-
-
-
-
-
-
+        # Fully Connected Block
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
 
+        return hidden_states, self_present_key_value, cross_attn_weights
 
-
-
+
+class WhisperAttention(nn.Module):
+    def __init__(self, attn):
         super().__init__()
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-
+        self._original_mod = attn
+        self.q_proj = attn.q_proj
+        self.k_proj = attn.k_proj
+        self.v_proj = attn.v_proj
+        self.out_proj = attn.out_proj
+        self.num_heads = attn.num_heads
+        self.embed_dim = attn.embed_dim
+        self.head_dim = attn.embed_dim // attn.num_heads
+        self.scaling = self.head_dim**-0.5
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor:
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+
+class WhisperSelfAttention(WhisperAttention):
+    def rbln_cache_update(
+        self,
+        past_key_value: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        cache_position: torch.Tensor,
+    ):
+        s_idx = torch.tensor(cache_position, dtype=torch.int16)
+        axis = torch.tensor(2, dtype=torch.int16)
+
+        key_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[0], key_states, s_idx, axis)
+        value_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[1], value_states, s_idx, axis)
+        return key_states, value_states
 
     def forward(
         self,
-
-
-
+        hidden_states: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, tgt_len, _ = hidden_states.size()
+        query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
+        query_states = query_states * self.scaling
 
-
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        key_states, value_states = self.rbln_cache_update(past_key_value, key_states, value_states, cache_position)
 
-
-
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+        attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
-
-
-
-
-
-        pkv_cross_attn_value = torch.zeros(encoder_batch_size, self.num_heads, self.encoder_max_length, self.d_kv)
-        layer_pkv = (pkv_self_attn_key, pkv_self_attn_value, pkv_cross_attn_key, pkv_cross_attn_value)
-        dummy_past_key_value.append(layer_pkv)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
 
-
-        decoder_attention_mask[:, :1] = 1
+        return attn_output, attn_weights, (key_states, value_states)
 
-        decoder_outputs = _WhisperDecoder.forward(
-            self.decoder,
-            input_ids=torch.zeros((decoder_batch_size, 1), dtype=torch.int64),
-            attention_mask=decoder_attention_mask,
-            cache_position=torch.tensor(0, dtype=torch.int32),
-            encoder_hidden_states=last_hidden_states,
-            past_key_values=dummy_past_key_value,
-            attn_impl=self.attn_impl,
-            output_attentions=False,
-        )
 
-
+class WhisperCrossAttention(WhisperSelfAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        batch_size, query_len, _ = hidden_states.size()
+        query_states = self._shape(self.q_proj(hidden_states), query_len, batch_size)
+        query_states = query_states * self.scaling
 
-
-
-
-
-
+        key_states = past_key_value[0]
+        value_states = past_key_value[1]
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.view(batch_size, self.num_heads, query_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, query_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
 
-        return
+        return attn_output, attn_weights, (key_states, value_states)

(Several removed lines in this hunk did not survive the original rendering and are shown as bare "-" markers or truncated fragments.)
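The refactor shown above drops the subclass-override helpers (_WhisperAttention, _WhisperDecoderLayer, _WhisperDecoder, _WhisperPositionalEmbedding) in favor of standalone wrapper modules, and WhisperEncoderWrapper now pre-computes every decoder layer's cross-attention key/value projections once per encoded audio segment so that WhisperCrossAttention only reads them back while decoding. The snippet below is a minimal, torch-only sketch of that pre-computation pattern; it mirrors the reshape in WhisperEncoderWrapper.forward but leaves out the RBLN-specific rbln_cache_update device write, and the dimensions and layer stand-ins are illustrative assumptions rather than optimum-rbln API.

```python
import torch
from torch import nn

# Illustrative dimensions only (not taken from optimum-rbln).
batch, src_len, d_model, num_heads = 1, 1500, 384, 6
d_kv = d_model // num_heads

# Stand-ins for one decoder layer's cross-attention projections.
k_proj = nn.Linear(d_model, d_model)
v_proj = nn.Linear(d_model, d_model)

# Pretend this came from the Whisper encoder.
last_hidden_states = torch.randn(batch, src_len, d_model)

# Same reshape as WhisperEncoderWrapper.forward:
# (batch, src_len, d_model) -> (batch, num_heads, src_len, d_kv)
past_k = k_proj(last_hidden_states).view(batch, -1, num_heads, d_kv).transpose(1, 2)
past_v = v_proj(last_hidden_states).view(batch, -1, num_heads, d_kv).transpose(1, 2)

# The real wrapper stacks one (k, v) pair per decoder layer and writes the result
# into a preallocated cross_key_values buffer via rbln_cache_update; here we just
# stack to show the layout the decoder consumes.
cross_kv = torch.stack([past_k, past_v], dim=0)
print(cross_kv.shape)  # torch.Size([2, 1, 6, 1500, 64])
```

Pre-computing the projections once per segment keeps the per-token decoder step down to cache reads plus the query projection, which matches how WhisperDecoderWrapper.forward receives cross_kv_cache as a plain input tensor rather than re-projecting encoder states at every step.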
|