optimum-rbln 0.1.9__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (73)
  1. optimum/rbln/__init__.py +47 -9
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +36 -31
  4. optimum/rbln/diffusers/models/controlnet.py +53 -43
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +40 -31
  6. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +4 -0
  7. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +28 -23
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +28 -23
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +28 -37
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +30 -39
  11. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +24 -14
  12. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +24 -15
  13. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +26 -17
  14. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -17
  15. optimum/rbln/modeling_alias.py +6 -11
  16. optimum/rbln/modeling_base.py +467 -261
  17. optimum/rbln/modeling_config.py +199 -73
  18. optimum/rbln/transformers/__init__.py +43 -1
  19. optimum/rbln/transformers/models/__init__.py +23 -1
  20. optimum/rbln/transformers/models/auto/__init__.py +14 -0
  21. optimum/rbln/transformers/models/auto/auto_factory.py +84 -0
  22. optimum/rbln/transformers/models/auto/modeling_auto.py +95 -0
  23. optimum/rbln/transformers/models/bart/__init__.py +1 -0
  24. optimum/rbln/transformers/models/bart/bart_architecture.py +203 -58
  25. optimum/rbln/transformers/models/bart/modeling_bart.py +125 -0
  26. optimum/rbln/transformers/models/bert/__init__.py +24 -0
  27. optimum/rbln/transformers/models/bert/modeling_bert.py +101 -0
  28. optimum/rbln/transformers/models/clip/__init__.py +1 -1
  29. optimum/rbln/transformers/models/clip/modeling_clip.py +127 -26
  30. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +28 -4
  31. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +409 -150
  32. optimum/rbln/transformers/models/dpt/modeling_dpt.py +21 -8
  33. optimum/rbln/transformers/models/exaone/__init__.py +32 -0
  34. optimum/rbln/transformers/models/exaone/exaone_architecture.py +72 -0
  35. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +181 -0
  36. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +1725 -0
  37. optimum/rbln/transformers/models/exaone/modeling_exaone.py +78 -0
  38. optimum/rbln/transformers/models/gemma/modeling_gemma.py +1 -1
  39. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +4 -1
  40. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
  41. optimum/rbln/transformers/models/llama/modeling_llama.py +1 -1
  42. optimum/rbln/transformers/models/llava_next/__init__.py +24 -0
  43. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +662 -0
  44. optimum/rbln/transformers/models/midm/midm_architecture.py +5 -1
  45. optimum/rbln/transformers/models/midm/modeling_midm.py +6 -1
  46. optimum/rbln/transformers/models/mistral/modeling_mistral.py +1 -1
  47. optimum/rbln/transformers/models/phi/__init__.py +24 -0
  48. optimum/rbln/transformers/models/phi/modeling_phi.py +69 -0
  49. optimum/rbln/transformers/models/phi/phi_architecture.py +406 -0
  50. optimum/rbln/transformers/models/qwen2/__init__.py +24 -0
  51. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +67 -0
  52. optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +29 -0
  53. optimum/rbln/transformers/models/seq2seq/__init__.py +24 -0
  54. optimum/rbln/{modeling_seq2seq.py → transformers/models/seq2seq/modeling_seq2seq.py} +198 -168
  55. optimum/rbln/transformers/models/t5/__init__.py +1 -0
  56. optimum/rbln/transformers/models/t5/modeling_t5.py +55 -0
  57. optimum/rbln/transformers/models/t5/t5_architecture.py +122 -47
  58. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +17 -12
  59. optimum/rbln/transformers/models/whisper/generation_whisper.py +68 -0
  60. optimum/rbln/transformers/models/whisper/modeling_whisper.py +172 -111
  61. optimum/rbln/transformers/models/whisper/whisper_architecture.py +44 -17
  62. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +18 -16
  63. optimum/rbln/transformers/utils/rbln_quantization.py +48 -60
  64. optimum/rbln/utils/import_utils.py +50 -1
  65. optimum/rbln/utils/logging.py +82 -0
  66. optimum/rbln/utils/runtime_utils.py +33 -0
  67. optimum/rbln/utils/timer_utils.py +43 -0
  68. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.12.dist-info}/METADATA +9 -7
  69. optimum_rbln-0.1.12.dist-info/RECORD +103 -0
  70. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.12.dist-info}/WHEEL +1 -1
  71. optimum_rbln-0.1.12.dist-info/entry_points.txt +4 -0
  72. optimum_rbln-0.1.9.dist-info/RECORD +0 -78
  73. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.12.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/t5/t5_architecture.py

@@ -43,12 +43,19 @@ if TYPE_CHECKING:
     from transformers import T5ForConditionalGeneration
 
 
+class T5Wrapper:
+    def __init__(self, model):
+        self.encoder = T5EncoderWrapper(model)
+        self.decoder = T5DecoderWrapper(model)
+
+
 class T5Encoder(T5Stack):
     def forward(
         self,
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor,
         position_bias: torch.Tensor,
+        batch_ids: torch.Tensor = None,
     ) -> BaseModelOutput:
         hidden_states = self.embed_tokens(input_ids)
         extended_attention_mask = self.invert_attention_mask(attention_mask)
@@ -58,6 +65,7 @@ class T5Encoder(T5Stack):
                 layer_module,
                 hidden_states,
                 position_bias=position_bias,
+                batch_ids=batch_ids,
             )
             hidden_states = layer_outputs[0]
         hidden_states = self.final_layer_norm(hidden_states)
@@ -75,6 +83,7 @@ class T5Decoder(T5Stack):
         position_bias: torch.Tensor,
         encoder_decoder_position_bias: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_ids: torch.Tensor,
     ) -> BaseModelOutputWithPastAndCrossAttentions:
         hidden_states = self.embed_tokens(input_ids)
         extended_attention_mask = self.invert_attention_mask(attention_mask)
@@ -84,6 +93,7 @@ class T5Decoder(T5Stack):
         encoder_decoder_position_bias = encoder_decoder_position_bias + encoder_extended_attention_mask
 
         present_key_value_states = ()
+
         for layer_module, past_key_value in zip(self.block, past_key_values):
             layer_outputs = _T5Block.forward(
                 layer_module,
@@ -93,6 +103,7 @@ class T5Decoder(T5Stack):
                 encoder_decoder_position_bias=encoder_decoder_position_bias,
                 past_key_value=past_key_value,
                 cache_position=cache_position,
+                batch_ids=batch_ids,
             )
             hidden_states, present_key_value_state = layer_outputs[:2]
             present_key_value_states = present_key_value_states + (present_key_value_state,)
@@ -117,17 +128,26 @@ class T5EncoderWrapper(torch.nn.Module):
         )
         self.encoder_max_length = None
         self.decoder_max_length = None
-        self.decoder_batch_size = 1
 
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        encoder_batch_size = input_ids.shape[0]
-        decoder_batch_size = self.decoder_batch_size
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        cross_key_value: torch.Tensor = None,
+        batch_idx: torch.Tensor = None,
+    ) -> torch.Tensor:
         decoder_max_length = self.decoder_max_length or self.default_max_length
         encoder_max_length = self.encoder_max_length or self.default_max_length
 
         attn_layer = self.encoder.block[0].layer[0].SelfAttention
         encoder_position_bias = T5Attention.compute_bias(attn_layer, encoder_max_length, encoder_max_length)
-        encoder_outputs = T5Encoder.forward(self.encoder, input_ids, attention_mask, encoder_position_bias)
+        encoder_outputs = T5Encoder.forward(
+            self.encoder,
+            input_ids,
+            attention_mask,
+            encoder_position_bias,
+            batch_ids=torch.tensor(0, dtype=torch.int32),
+        )
 
         attn_layer = self.decoder.block[0].layer[0].SelfAttention
         decoder_position_bias = T5Attention.compute_bias(attn_layer, decoder_max_length, decoder_max_length)
@@ -138,22 +158,14 @@ class T5EncoderWrapper(torch.nn.Module):
 
         dummy_past_key_value = []
         for i in range(self.config.num_layers):
-            pkv_self_attn_key = torch.zeros(
-                decoder_batch_size, self.config.num_heads, decoder_max_length, self.config.d_kv
-            )
-            pkv_self_attn_value = torch.zeros(
-                decoder_batch_size, self.config.num_heads, decoder_max_length, self.config.d_kv
-            )
-            pkv_cross_attn_key = torch.zeros(
-                encoder_batch_size, self.config.num_heads, encoder_max_length, self.config.d_kv
-            )
-            pkv_cross_attn_value = torch.zeros(
-                encoder_batch_size, self.config.num_heads, encoder_max_length, self.config.d_kv
-            )
+            pkv_self_attn_key = torch.zeros(1, self.config.num_heads, decoder_max_length, self.config.d_kv)
+            pkv_self_attn_value = torch.zeros(1, self.config.num_heads, decoder_max_length, self.config.d_kv)
+            pkv_cross_attn_key = torch.zeros(1, self.config.num_heads, encoder_max_length, self.config.d_kv)
+            pkv_cross_attn_value = torch.zeros(1, self.config.num_heads, encoder_max_length, self.config.d_kv)
             layer_pkv = (pkv_self_attn_key, pkv_self_attn_value, pkv_cross_attn_key, pkv_cross_attn_value)
             dummy_past_key_value.append(layer_pkv)
 
-        decoder_attention_mask = torch.zeros(decoder_batch_size, decoder_max_length, dtype=torch.int64)
+        decoder_attention_mask = torch.zeros(1, decoder_max_length, dtype=torch.float32)
         decoder_attention_mask[:, :1] = 1
 
         # Since first step of decoder has different graph to further step of it,
@@ -161,7 +173,7 @@ class T5EncoderWrapper(torch.nn.Module):
         # TODO(jongho): Separate first-step-decoder.
         decoder_outputs = T5Decoder.forward(
             self.decoder,
-            input_ids=torch.zeros(decoder_batch_size, 1, dtype=torch.int64),
+            input_ids=torch.zeros(1, 1, dtype=torch.int64),
             attention_mask=decoder_attention_mask,
             position_bias=decoder_position_bias,
             encoder_decoder_position_bias=encoder_decoder_position_bias,
@@ -169,6 +181,7 @@ class T5EncoderWrapper(torch.nn.Module):
             encoder_attention_mask=attention_mask,
             past_key_values=dummy_past_key_value,
             cache_position=torch.tensor(0, dtype=torch.int32),
+            batch_ids=torch.tensor(0, dtype=torch.int32),
         )
 
         past_key_values = decoder_outputs.past_key_values
@@ -179,7 +192,9 @@ class T5EncoderWrapper(torch.nn.Module):
             cross_kv_cache.append(past_key_values[i][3])
         cross_kv_cache = torch.stack(cross_kv_cache, dim=0)
 
-        return cross_kv_cache
+        cross_key_value = cross_key_value.slice_scatter(cross_kv_cache, dim=1, start=batch_idx, end=batch_idx + 1)
+
+        return cross_key_value
 
 
 class T5DecoderWrapper(torch.nn.Module):
@@ -201,6 +216,7 @@ class T5DecoderWrapper(torch.nn.Module):
         attention_mask: torch.Tensor,
         encoder_attention_mask: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_position: torch.Tensor,
         self_kv_cache: torch.Tensor,
         cross_kv_cache: torch.Tensor,
     ) -> Tuple[torch.Tensor]:
@@ -210,6 +226,11 @@ class T5DecoderWrapper(torch.nn.Module):
         encoder_max_length = self.encoder_max_length or self.default_max_length
         decoder_max_length = self.decoder_max_length or self.default_max_length
 
+        if input_ids.shape[1] == 1:
+            rbln_batch_position = None
+        else:
+            rbln_batch_position = batch_position
+
         kv_cache = ()
         for i in range(0, num_layers * 2, 2):
            kv_cache = kv_cache + (
@@ -223,7 +244,13 @@ class T5DecoderWrapper(torch.nn.Module):
 
         attn_layer = self.model.decoder.block[0].layer[0].SelfAttention
         _decoder_position_bias = T5Attention.compute_bias(attn_layer, decoder_max_length, decoder_max_length)
-        decoder_position_bias = _decoder_position_bias[:, :, cache_position].unsqueeze(2)
+
+        # position_bias needs to be computed per batch (for continuous batching)
+        batch_decoder_position_bias = []
+        for i in range(input_ids.shape[0]):
+            batch_position_bias = _decoder_position_bias[:, :, cache_position[i][0]].unsqueeze(2)
+            batch_decoder_position_bias.append(batch_position_bias)
+        decoder_position_bias = torch.cat(batch_decoder_position_bias, dim=0)
 
         attn_layer = self.model.decoder.block[0].layer[1].EncDecAttention
         encoder_decoder_position_bias = torch.zeros(1, attn_layer.n_heads, 1, encoder_max_length)
@@ -238,6 +265,7 @@ class T5DecoderWrapper(torch.nn.Module):
             encoder_decoder_position_bias=encoder_decoder_position_bias,
             past_key_values=kv_cache,
             cache_position=cache_position,
+            batch_ids=rbln_batch_position,
         )
 
         past_key_values = decoder_outputs.past_key_values
@@ -255,7 +283,7 @@ class T5DecoderWrapper(torch.nn.Module):
 
         self_kv_cache = torch.stack(self_kv_cache, dim=0)
 
-        return lm_logits, self_kv_cache
+        return lm_logits, self_kv_cache, batch_position
 
 
 class _T5Attention(T5Attention):
@@ -269,10 +297,10 @@ class _T5Attention(T5Attention):
         position_bias: torch.Tensor = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         cache_position: Optional[torch.Tensor] = None,  # current cache sequence length
+        batch_index: torch.Tensor = None,
         is_self_attn: Optional[bool] = None,
     ) -> Tuple[torch.Tensor]:
         batch_size = hidden_states.shape[0]
-        cross_batch_size = key_value_states.shape[0] if not is_self_attn and cache_position == 0 else None
 
         def shape(states, batch_size):
             """projection"""
@@ -288,39 +316,80 @@ class _T5Attention(T5Attention):
         if is_self_attn:
             key_states = shape(self.k(hidden_states), batch_size)
             value_states = shape(self.v(hidden_states), batch_size)
-            if past_key_value is not None:
-                # decoder self attn
-                cache_k = past_key_value[0].slice_scatter(
-                    key_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-                cache_v = past_key_value[1].slice_scatter(
-                    value_states, dim=2, start=cache_position, end=cache_position + 1
-                )
-                past_key_value = (cache_k, cache_v)
-                key_states, value_states = past_key_value
-
         else:
             # cross-attn
-            if cache_position == 0:
-                key_states = shape(self.k(key_value_states), cross_batch_size)
-                value_states = shape(self.v(key_value_states), cross_batch_size)
+            if cache_position.dim() == 0:
+                key_states = shape(self.k(key_value_states), key_value_states.shape[0])
+                value_states = shape(self.v(key_value_states), key_value_states.shape[0])
                 past_key_value = key_states, value_states
             else:
                 key_states = past_key_value[0]
                 value_states = past_key_value[1]
 
-        # compute scores
-        scores = torch.matmul(query_states, key_states.transpose(3, 2))
-        scores += position_bias
+        if (batch_index is None or batch_index == -1) and batch_size > 1:
+            all_key_states = []
+            all_value_states = []
+            all_attn_output = []
+
+            for b in range(batch_size):
+                batch_query_states = query_states[b].unsqueeze(0)
+                batch_key_states = key_states[b].unsqueeze(0)
+                batch_value_states = value_states[b].unsqueeze(0)
+
+                if is_self_attn and past_key_value is not None:
+                    batch_key_states = (
+                        past_key_value[0][b]
+                        .unsqueeze(0)
+                        .slice_scatter(
+                            batch_key_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+                    batch_value_states = (
+                        past_key_value[1][b]
+                        .unsqueeze(0)
+                        .slice_scatter(
+                            batch_value_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+
+                scores = torch.matmul(batch_query_states, batch_key_states.transpose(3, 2))
+                scores += position_bias[b]
+                attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
+                attn_output = unshape(torch.matmul(attn_weights, batch_value_states), 1)
+                all_key_states.append(batch_key_states)
+                all_value_states.append(batch_value_states)
+                all_attn_output.append(attn_output)
+
+            key_states = torch.cat(all_key_states, dim=0)
+            value_states = torch.cat(all_value_states, dim=0)
+            attn_output = torch.cat(all_attn_output, dim=0)
 
-        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
-            scores
-        )  # (batch_size, n_heads, seq_length, key_length)
+        else:
+            if batch_index is None or batch_index == -1:
+                batch_index = 0
 
-        attn_output = unshape(torch.matmul(attn_weights, value_states), batch_size)  # (batch_size, seq_length, dim)
-        attn_output = self.o(attn_output)
+            if is_self_attn and past_key_value is not None:
+                key_states = past_key_value[0].slice_scatter(
+                    key_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+                value_states = past_key_value[1].slice_scatter(
+                    value_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+            # compute scores
+            scores = torch.matmul(query_states, key_states.transpose(3, 2))
+            scores += position_bias
+
+            attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
+                scores
+            )  # (batch_size, n_heads, seq_length, key_length)
 
-        outputs = (attn_output,) + (past_key_value,)
+            attn_output = unshape(
+                torch.matmul(attn_weights, value_states), batch_size
+            )  # (batch_size, seq_length, dim)
+
+        attn_output = self.o(attn_output)
+        present_key_value = (key_states, value_states)
+        outputs = (attn_output,) + (present_key_value,)
         return outputs
 
 
@@ -331,6 +400,7 @@ class _T5LayerSelfAttention(T5LayerSelfAttention):
         position_bias: torch.Tensor = None,
         past_key_value: Tuple[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        batch_index: torch.Tensor = None,
     ):
         normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = _T5Attention.forward(
@@ -339,6 +409,7 @@ class _T5LayerSelfAttention(T5LayerSelfAttention):
            position_bias=position_bias,
            past_key_value=past_key_value,
            cache_position=cache_position,
+           batch_index=batch_index,
            is_self_attn=True,
        )
 
@@ -356,6 +427,7 @@ class _T5LayerCrossAttention(T5LayerCrossAttention):
         position_bias: torch.Tensor = None,
         past_key_value: Tuple[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        batch_index: torch.Tensor = None,
     ):
         normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = _T5Attention.forward(
@@ -365,6 +437,7 @@ class _T5LayerCrossAttention(T5LayerCrossAttention):
            position_bias=position_bias,
            past_key_value=past_key_value,
            cache_position=cache_position,
+           batch_index=batch_index,
            is_self_attn=False,
        )
 
@@ -383,6 +456,7 @@ class _T5Block(T5Block):
         encoder_decoder_position_bias=None,
         past_key_value=None,
         cache_position=None,
+        batch_ids=None,
     ):
         if past_key_value is not None:
             if not self.is_decoder:
@@ -403,13 +477,13 @@ class _T5Block(T5Block):
             cross_attn_past_key_value = past_key_value[2:]
         else:
             self_attn_past_key_value, cross_attn_past_key_value = None, None
-
         self_attention_outputs = _T5LayerSelfAttention.forward(
             self.layer[0],
             hidden_states=hidden_states,
             position_bias=position_bias,
             past_key_value=self_attn_past_key_value,
             cache_position=cache_position,
+            batch_index=batch_ids,
         )
 
         hidden_states, present_key_value_state = self_attention_outputs[:2]
@@ -423,6 +497,7 @@ class _T5Block(T5Block):
             position_bias=encoder_decoder_position_bias,
             past_key_value=cross_attn_past_key_value,
             cache_position=cache_position,
+            batch_index=batch_ids,
         )
         hidden_states = cross_attention_outputs[0]
         # Combine self attn and cross attn key value states
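The decoder self-attention above writes each step's key/value into a statically shaped cache with torch.Tensor.slice_scatter, indexed by cache_position (and, in the continuous-batching branch, per batch element). A minimal standalone sketch of that update pattern; the shapes and the cache_position value are illustrative and not taken from the diff:

import torch

# Illustrative dimensions: (batch, heads, max_len, head_dim), matching the dummy caches above.
num_heads, max_len, d_kv = 8, 16, 64
cache = torch.zeros(1, num_heads, max_len, d_kv)   # static self-attention KV cache
new_kv = torch.randn(1, num_heads, 1, d_kv)        # key/value for the current decoding step
cache_position = 5                                 # step index being written

# slice_scatter returns a copy of `cache` with `new_kv` written into the [cache_position] slot,
# keeping the cache shape fixed so the graph can be compiled ahead of time.
cache = cache.slice_scatter(new_kv, dim=2, start=cache_position, end=cache_position + 1)

assert torch.equal(cache[:, :, cache_position], new_kv[:, :, 0])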
optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py

@@ -22,14 +22,14 @@
 # from Rebellions Inc.
 
 import logging
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Union
 
 import torch
 from transformers import AutoModelForMaskedLM, PretrainedConfig, Wav2Vec2ForCTC
 from transformers.modeling_outputs import CausalLMOutput
 
 from ....modeling_base import RBLNModel
-from ....modeling_config import RBLNConfig, RBLNRuntimeConfig
+from ....modeling_config import RBLNCompileConfig, RBLNConfig
 
 
 logger = logging.getLogger(__name__)
@@ -65,7 +65,6 @@ class RBLNWav2Vec2ForCTC(RBLNModel):
     - compiling the resulting graph using the RBLN compiler.
     """
 
-    model_type = "rbln_model"
     main_input_name = "input_values"
     auto_model_class = AutoModelForMaskedLM
 
@@ -78,10 +77,10 @@ class RBLNWav2Vec2ForCTC(RBLNModel):
         cls,
         preprocessors: Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"],
         model_config: "PretrainedConfig",
-        rbln_max_seq_len: Optional[int] = None,
-        rbln_batch_size: Optional[int] = None,
+        rbln_kwargs: Dict[str, Any] = {},
     ) -> RBLNConfig:
-        meta = {}
+        rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
+        rbln_batch_size = rbln_kwargs.get("batch_size", None)
 
         if rbln_max_seq_len is None:
             for tokenizer in preprocessors:
@@ -91,8 +90,6 @@ class RBLNWav2Vec2ForCTC(RBLNModel):
         if rbln_max_seq_len is None:
             raise ValueError("`rbln_max_seq_len` should be specified!")
 
-        meta["rbln_max_seq_len"] = rbln_max_seq_len
-
         if rbln_batch_size is None:
             rbln_batch_size = 1
 
@@ -107,11 +104,19 @@ class RBLNWav2Vec2ForCTC(RBLNModel):
             ),
         ]
 
-        rbln_runtime_config = RBLNRuntimeConfig(input_info=input_info, batch_size=rbln_batch_size)
+        rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+
+        rbln_config = RBLNConfig(
+            rbln_cls=cls.__name__,
+            compile_cfgs=[rbln_compile_config],
+            rbln_kwargs=rbln_kwargs,
+        )
 
-        rbln_config = RBLNConfig.from_rbln_runtime_configs(
-            [rbln_runtime_config],
-            _rbln_meta=meta,
+        rbln_config.model_cfg.update(
+            {
+                "max_seq_len": rbln_max_seq_len,
+                "batch_size": rbln_batch_size,
+            }
         )
 
         return rbln_config
optimum/rbln/transformers/models/whisper/generation_whisper.py (new file)

@@ -0,0 +1,68 @@
+import torch
+from transformers import GenerationMixin
+from transformers.models.whisper.generation_whisper import WhisperGenerationMixin
+
+
+class RBLNWhisperGenerationMixin(WhisperGenerationMixin, GenerationMixin):
+    """
+    This class is based on transformers version 4.44.2.
+    It uses the same generate() method, so it's crucial to maintain the inheritance order.
+    Ensure WhisperGenerationMixin is listed before GenerationMixin.
+    """
+
+    def _postprocess_outputs(
+        self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config, *args, **kwargs
+    ):
+        # remove all previously passed decoder input ids
+
+        ################################## rbln_change for 4.40.2 ###################################
+        # 4.40.2 has no keyword shortform; it has separate code paths from generation_fallback
+        is_shortform = kwargs.get("is_shortform", False)
+        start_idx = decoder_input_ids.shape[-1] if not is_shortform else torch.tensor(0)
+
+        if isinstance(seek_outputs, torch.Tensor):
+            seek_outputs = seek_outputs[:, start_idx:]
+            return seek_outputs, seek_outputs
+
+        ############## rbln validation #############
+        if return_token_timestamps and not self.rbln_token_timestamps:
+            raise RuntimeError(
+                "To use .generate() with return_token_timestamps=True, the model must be compiled with rbln_token_timestamps=True. "
+                "You can compile the model by calling .from_pretrained() with export=True and rbln_token_timestamps=True as keyword arguments, "
+                "or you can generate with return_token_timestamps=False."
+            )
+
+        if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
+            num_frames = getattr(generation_config, "num_frames", None)
+            seek_outputs["token_timestamps"] = self._extract_token_timestamps(
+                seek_outputs, generation_config.alignment_heads, num_frames=num_frames
+            )
+            seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, start_idx:]
+
+        seek_outputs["sequences"] = seek_outputs["sequences"][:, start_idx:]
+
+        def split_by_batch_index(values, key, batch_idx):
+            if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
+                return [v[batch_idx].cpu() for v in values]
+            if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]:
+                return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values)
+            elif key == "past_key_values":
+                # we don't save `past_key_values` in rbln
+                return None
+
+            return values[batch_idx].cpu()
+
+        sequence_tokens = seek_outputs["sequences"]
+
+        ##################################### thkim change #############################################
+        valid_seekoutputs = []
+        for k, v in seek_outputs.items():
+            if v is not None and len(v) > 0 and v[0] is not None:
+                valid_seekoutputs.append((k, v))
+        seek_outputs = [
+            {k: split_by_batch_index(v, k, i) for k, v in valid_seekoutputs}
+            # {k: split_by_batch_index(v, k, i, is_shortform) for k, v in seek_outputs.items()}
+            for i in range(sequence_tokens.shape[0])
+        ]
+
+        return sequence_tokens, seek_outputs
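As the RuntimeError message above spells out, token-level timestamps are only available when the model is compiled with rbln_token_timestamps=True. A hedged usage sketch based on that message; the RBLNWhisperForConditionalGeneration class name and the "openai/whisper-tiny" model id are assumptions for illustration, not taken from this diff:

from transformers import WhisperProcessor

# Assumed import path/class name; the keyword flags follow the error message in the mixin above.
from optimum.rbln import RBLNWhisperForConditionalGeneration

# Compile with timestamp support enabled (export=True triggers compilation for the RBLN NPU).
model = RBLNWhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny",
    export=True,
    rbln_token_timestamps=True,
)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# ... build `input_features` from audio with the processor, then:
# outputs = model.generate(input_features, return_token_timestamps=True)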