optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. optimum/rbln/__init__.py +48 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +50 -21
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +18 -14
  38. optimum/rbln/ops/__init__.py +1 -0
  39. optimum/rbln/ops/attn.py +10 -0
  40. optimum/rbln/ops/flash_attn.py +8 -0
  41. optimum/rbln/ops/moe.py +180 -0
  42. optimum/rbln/ops/sliding_window_attn.py +9 -0
  43. optimum/rbln/transformers/__init__.py +36 -0
  44. optimum/rbln/transformers/configuration_generic.py +0 -27
  45. optimum/rbln/transformers/modeling_attention_utils.py +156 -127
  46. optimum/rbln/transformers/modeling_generic.py +2 -61
  47. optimum/rbln/transformers/modeling_outputs.py +26 -0
  48. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  49. optimum/rbln/transformers/models/__init__.py +28 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  52. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  54. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  55. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  57. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  58. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  59. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  60. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
  61. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
  62. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  63. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  64. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
  65. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
  66. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
  67. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
  68. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  69. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  70. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
  71. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  72. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  73. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  74. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  75. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  76. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  77. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  78. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  79. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  80. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
  81. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  82. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  83. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  84. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  85. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  86. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  87. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  88. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  89. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
  90. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  91. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  92. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
  93. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  94. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  95. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  96. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  97. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  98. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  99. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  100. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  101. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  102. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  103. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  104. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  105. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  106. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  107. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  108. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  109. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
  110. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  111. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  112. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  113. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  114. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  115. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  116. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  117. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
  118. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
  119. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  120. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  121. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  122. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  123. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  124. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  125. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  126. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  127. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  128. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  129. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  130. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  131. optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
  132. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  133. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  134. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  135. optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
  136. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  137. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  138. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  139. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  140. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  141. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  142. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  143. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  144. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  145. optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
  146. optimum/rbln/utils/deprecation.py +213 -0
  147. optimum/rbln/utils/hub.py +14 -3
  148. optimum/rbln/utils/import_utils.py +23 -2
  149. optimum/rbln/utils/runtime_utils.py +42 -6
  150. optimum/rbln/utils/submodule.py +27 -1
  151. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  152. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
  153. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
  154. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  155. optimum/rbln/utils/depreacate_utils.py +0 -16
  156. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  157. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/transformers/models/siglip/modeling_siglip.py
@@ -21,6 +21,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPooling
  from ....configuration_utils import RBLNCompileConfig
  from ....modeling import RBLNModel
  from ....utils.logging import get_logger
+ from ...modeling_outputs import _validate_output_attentions, _validate_output_hidden_states
  from .configuration_siglip import RBLNSiglipVisionModelConfig


@@ -52,7 +53,7 @@ class _SiglipVisionModel(torch.nn.Module):
  interpolate_pos_encoding=self.interpolate_pos_encoding,
  output_attentions=self.output_attentions,
  )
- return tuple(x for x in enc_out if x is not None)
+ return enc_out


  class RBLNSiglipVisionModel(RBLNModel):
@@ -66,7 +67,9 @@ class RBLNSiglipVisionModel(RBLNModel):
  _tp_support = False

  @classmethod
- def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNSiglipVisionModelConfig) -> torch.nn.Module:
+ def _wrap_model_if_needed(
+ cls, model: torch.nn.Module, rbln_config: RBLNSiglipVisionModelConfig
+ ) -> torch.nn.Module:
  wrapper_cfg = {
  "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
  "output_hidden_states": rbln_config.output_hidden_states,
@@ -122,23 +125,22 @@ class RBLNSiglipVisionModel(RBLNModel):
  interpolate_pos_encoding: bool = False,
  **kwargs: Any,
  ) -> Union[Tuple, BaseModelOutputWithPooling]:
- output_attentions = output_attentions if output_attentions is not None else self.rbln_config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
- )
-
- if output_attentions != self.rbln_config.output_attentions:
- raise ValueError(
- f"Variable output_attentions {output_attentions} is not equal to rbln_config.output_attentions {self.rbln_config.output_attentions} "
- f"Please compile again with the correct argument."
- )
-
- if output_hidden_states != self.rbln_config.output_hidden_states:
- raise ValueError(
- f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
- f"Please compile again with the correct argument."
- )
-
+ """
+ Forward pass for the RBLN-optimized SigLIP vision model.
+
+ Args:
+ pixel_values (torch.FloatTensor of shape (batch_size, num_channels, image_size, image_size), optional): The tensors corresponding to the input images. Pixel values can be obtained using ViTImageProcessor. See ViTImageProcessor.__call__() for details (processor_class uses ViTImageProcessor for processing images).
+ return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple.
+ output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.
+ output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.
+ interpolate_pos_encoding (bool, defaults to False): Whether to interpolate the pre-trained position encodings.
+
+ Returns:
+ The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
+ """
+
+ output_attentions = _validate_output_attentions(output_attentions, self.rbln_config)
+ output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)
  if interpolate_pos_encoding != self.rbln_config.interpolate_pos_encoding:
  raise ValueError(
  f"Variable interpolate_pos_encoding {interpolate_pos_encoding} is not equal to rbln_config.interpolate_pos_encoding {self.rbln_config.interpolate_pos_encoding} "
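
Note: the inline output-flag checks in forward are replaced by the shared _validate_output_attentions / _validate_output_hidden_states helpers from modeling_outputs. A minimal usage sketch of the resulting behavior; the checkpoint id and the rbln_* keyword arguments are illustrative assumptions about the usual optimum-rbln export flow, not taken from this diff:

    import torch
    from optimum.rbln import RBLNSiglipVisionModel

    # Output flags are fixed at compile time; call-time values that disagree raise a ValueError
    # instead of being silently overridden.
    model = RBLNSiglipVisionModel.from_pretrained(
        "google/siglip-base-patch16-224",   # illustrative checkpoint
        export=True,
        rbln_output_hidden_states=False,
        rbln_output_attentions=False,
    )
    pixel_values = torch.randn(1, 3, 224, 224)   # stand-in for processor output
    outputs = model(pixel_values)                # output_attentions=True here would raise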

optimum/rbln/transformers/models/swin/configuration_swin.py
@@ -32,11 +32,6 @@ class RBLNSwinBackboneConfig(RBLNModelForImageClassificationConfig):
  Raises:
  ValueError: If batch_size is not a positive integer.
  """
- super().__init__(**kwargs)
- self.batch_size = batch_size or 1
- if not isinstance(self.batch_size, int) or self.batch_size < 0:
- raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
-
- self.image_size = image_size
+ super().__init__(batch_size=batch_size, image_size=image_size, **kwargs)
  self.output_hidden_states = output_hidden_states
  self.output_attentions = output_attentions

optimum/rbln/transformers/models/swin/modeling_swin.py
@@ -203,7 +203,7 @@ class _SwinBackbone(torch.nn.Module):

  class RBLNSwinBackbone(RBLNModel):
  @classmethod
- def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNSwinBackboneConfig) -> torch.nn.Module:
+ def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNSwinBackboneConfig) -> torch.nn.Module:
  for layer in model.encoder.layers:
  for block in layer.blocks:
  block.get_attn_mask = types.MethodType(get_attn_mask, block)
@@ -278,6 +278,19 @@ class RBLNSwinBackbone(RBLNModel):
  output_hidden_states: bool = None,
  **kwargs,
  ) -> Union[Tuple, BackboneOutput]:
+ """
+ Forward pass for the RBLN-optimized Swin backbone model.
+
+ Args:
+ pixel_values (torch.FloatTensor of shape (batch_size, num_channels, image_size, image_size), optional): The tensors corresponding to the input images. Pixel values can be obtained using ViTImageProcessor. See ViTImageProcessor.__call__() for details (processor_class uses ViTImageProcessor for processing images).
+ return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple.
+ output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.
+ output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.
+
+ Returns:
+ The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BackboneOutput object.
+ """
+
  if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
  logger.warning(
  f"Currently, optimum-rbln does not support kwargs {kwargs.keys()} for {self.__class__.__name__}."
@@ -314,19 +327,19 @@ class RBLNSwinBackbone(RBLNModel):
  output = self.model[0](padded_pixel_values)

  feature_maps = ()
- for i in range(len(self.config.out_features)):
+ for _ in range(len(self.config.out_features)):
  feature_maps += (output.pop(0),)

  if self.rbln_config.output_hidden_states:
  hidden_states = ()
- for i in range(len(self.config.stage_names)):
+ for _ in range(len(self.config.stage_names)):
  hidden_states += (output.pop(0),)
  else:
  hidden_states = None

  if self.rbln_config.output_attentions:
  attentions = ()
- for i in range(len(self.config.depths)):
+ for _ in range(len(self.config.depths)):
  attentions += (output.pop(0),)
  else:
  attentions = None

optimum/rbln/transformers/models/t5/modeling_t5.py
@@ -68,7 +68,7 @@ class RBLNT5EncoderModel(RBLNTransformerEncoderForFeatureExtraction):
  output_class = BaseModelOutputWithPastAndCrossAttentions

  @classmethod
- def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNT5EncoderModelConfig):
+ def _wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNT5EncoderModelConfig):
  return T5EncoderWrapper(model)

  @classmethod
@@ -113,7 +113,7 @@ class RBLNT5ForConditionalGeneration(RBLNModelForSeq2SeqLM):
  support_causal_attn = False

  @classmethod
- def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNT5ForConditionalGenerationConfig):
+ def _wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNT5ForConditionalGenerationConfig):
  return T5Wrapper(
  model, enc_max_seq_len=rbln_config.enc_max_seq_len, dec_max_seq_len=rbln_config.dec_max_seq_len
  )

optimum/rbln/transformers/models/t5/t5_architecture.py
@@ -39,7 +39,7 @@ class T5Wrapper:

  class T5EncoderWrapper(Seq2SeqEncoderWrapper):
  def __post_init__(self, model: nn.Module):
- self.n_layer = getattr(self.config, "num_layers")
+ self.n_layer = self.config.num_layers
  self.cross_k_projects, self.cross_v_projects = self._extract_cross_kv_projects(model.get_decoder().block)
  self.num_heads = self.config.num_heads
  self.d_kv = self.config.d_kv
@@ -111,9 +111,9 @@ class T5ForConditionalGeneration(Seq2SeqForConditionalGeneration):
  class T5Decoder(Seq2SeqDecoder):
  has_pos_emb = False

- def __post_init__(self, dec_max_seq_len: int = None):
- self.invert_attention_mask = self._original_mod.invert_attention_mask
- self._dec_position_bias = self.precompute_dec_position_bias(self._original_mod, dec_max_seq_len)
+ def __post_init__(self, model: nn.Module, dec_max_seq_len: int = None):
+ self.invert_attention_mask = model.invert_attention_mask
+ self._dec_position_bias = self.precompute_dec_position_bias(model, dec_max_seq_len)

  def precompute_dec_position_bias(self, model, dec_max_length):
  attn_layer = model.block[0].layer[0].SelfAttention
@@ -145,13 +145,12 @@ class T5Decoder(Seq2SeqDecoder):
  class T5Block(Seq2SeqDecoderLayer):
  def __init__(self, decoder_layer, self_attn):
  super().__init__(decoder_layer, self_attn, cross_attn=None)
- self.__post_init__()

- def __post_init__(self):
- self.self_attn_layer_norm = self._original_mod.layer[0].layer_norm
- self.encoder_attn_layer_norm = self._original_mod.layer[1].layer_norm
- self.cross_attn = T5CrossAttention(self._original_mod.layer[1].EncDecAttention)
- self.ff_layer = self._original_mod.layer[2]
+ def __post_init__(self, decoder_layer: nn.Module):
+ self.self_attn_layer_norm = decoder_layer.layer[0].layer_norm
+ self.encoder_attn_layer_norm = decoder_layer.layer[1].layer_norm
+ self.cross_attn = T5CrossAttention(decoder_layer.layer[1].EncDecAttention)
+ self.ff_layer = decoder_layer.layer[2]

  def pre_self_attn_layer_norm(self, hidden_states):
  return self.self_attn_layer_norm(hidden_states)
@@ -167,13 +166,13 @@ class T5Block(Seq2SeqDecoderLayer):


  class T5LayerSelfAttention(Seq2SeqSelfAttention):
- def __post_init__(self):
- self.q_proj = self._original_mod.q
- self.k_proj = self._original_mod.k
- self.v_proj = self._original_mod.v
- self.out_proj = self._original_mod.o
- self.num_heads = self._original_mod.n_heads
- self.head_dim = self._original_mod.key_value_proj_dim
+ def __post_init__(self, attn: nn.Module):
+ self.q_proj = attn.q
+ self.k_proj = attn.k
+ self.v_proj = attn.v
+ self.out_proj = attn.o
+ self.num_heads = attn.n_heads
+ self.head_dim = attn.key_value_proj_dim
  self.attn_decode = torch.ops.rbln_custom_ops.paged_add_softmax_attn_decode

  def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -153,7 +153,7 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
  return redirect(val)

  @classmethod
- def wrap_model_if_needed(
+ def _wrap_model_if_needed(
  self, model: "PreTrainedModel", rbln_config: RBLNTimeSeriesTransformerForPredictionConfig
  ):
  return TimeSeriesTransformersWrapper(model, rbln_config.num_parallel_samples)
@@ -161,7 +161,7 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
  @classmethod
  @torch.inference_mode()
  def get_compiled_model(cls, model, rbln_config: RBLNTimeSeriesTransformerForPredictionConfig):
- wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
+ wrapped_model = cls._wrap_model_if_needed(model, rbln_config)

  enc_compile_config = rbln_config.compile_cfgs[0]
  dec_compile_config = rbln_config.compile_cfgs[1]
@@ -184,14 +184,6 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
  if "key_value_states" in name:
  context.mark_static_address(tensor)

- compiled_decoder = cls.compile(
- wrapped_model.decoder,
- dec_compile_config,
- create_runtimes=rbln_config.create_runtimes,
- device=rbln_config.device,
- example_inputs=dec_example_inputs,
- compile_context=context,
- )
  compiled_encoder = cls.compile(
  wrapped_model.encoder,
  enc_compile_config,
@@ -201,6 +193,15 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
  compile_context=context,
  )

+ compiled_decoder = cls.compile(
+ wrapped_model.decoder,
+ dec_compile_config,
+ create_runtimes=rbln_config.create_runtimes,
+ device=rbln_config.device,
+ example_inputs=dec_example_inputs,
+ compile_context=context,
+ )
+
  return {"encoder": compiled_encoder, "decoder": compiled_decoder}

  @classmethod
@@ -353,6 +354,20 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
  static_real_features: Optional[torch.Tensor] = None,
  **kwargs,
  ) -> SampleTSPredictionOutput:
+ """
+ Generate pass for the RBLN-optimized Time Series Transformer model for time series forecasting.
+
+ Args:
+ past_values (torch.FloatTensor of shape (batch_size, sequence_length) or (batch_size, sequence_length, input_size)): Past values of the time series, that serve as context in order to predict the future.
+ past_time_features (torch.FloatTensor of shape (batch_size, sequence_length, num_features)): Required time features, which the model internally will add to past_values.
+ future_time_features (torch.FloatTensor of shape (batch_size, prediction_length, num_features)): Required time features for the prediction window, which the model internally will add to future_values.
+ past_observed_mask (torch.BoolTensor of shape (batch_size, sequence_length) or (batch_size, sequence_length, input_size), optional): Boolean mask to indicate which past_values were observed and which were missing.
+ static_categorical_features (torch.LongTensor of shape (batch_size, number of static categorical features), optional): Optional static categorical features for which the model will learn an embedding, which it will add to the values of the time series.
+ static_real_features (torch.FloatTensor of shape (batch_size, number of static real features), optional): Optional static real features which the model will add to the values of the time series.
+
+ Returns:
+ The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a SampleTSPredictionOutput object.
+ """
  self.validate_batch_size(**{k: v for k, v in locals().items() if isinstance(v, torch.Tensor)})

  outputs = self.encoder(

optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py
@@ -140,7 +140,6 @@ class TimeSeriesTransformersDecoderWrapper(torch.nn.Module):
  class TimeSeriesTransformersDecoder(nn.Module):
  def __init__(self, model, layers, **kwargs):
  super().__init__()
- self._original_mod = model
  self.config = model.config
  self.layers = nn.ModuleList(layers)
  self.value_embedding = model.value_embedding
@@ -190,7 +189,6 @@ class TimeSeriesTransformersDecoder(nn.Module):
  class TimeSeriesTransformersDecoderLayer(nn.Module):
  def __init__(self, decoder_layer, self_attn, cross_attn):
  super().__init__()
- self._original_mod = decoder_layer
  self.self_attn = self_attn
  self.encoder_attn = cross_attn
  self.embed_dim = decoder_layer.embed_dim
@@ -245,7 +243,6 @@ class TimeSeriesTransformersDecoderLayer(nn.Module):
  class TimeSeriesTransformersAttention(nn.Module):
  def __init__(self, attn, num_parallel_samples):
  super().__init__()
- self._original_mod = attn
  self.q_proj = attn.q_proj
  self.k_proj = attn.k_proj
  self.v_proj = attn.v_proj

optimum/rbln/transformers/models/vit/modeling_vit.py
@@ -12,6 +12,11 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ from typing import Tuple, Union
+
+ import torch
+ from transformers.modeling_outputs import ImageClassifierOutput
+
  from ...modeling_generic import RBLNModelForImageClassification


@@ -23,3 +28,17 @@ class RBLNViTForImageClassification(RBLNModelForImageClassification):
  on RBLN devices, supporting image classification with transformer-based architectures
  that process images as sequences of patches.
  """
+
+ def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[ImageClassifierOutput, Tuple]:
+ """
+ Forward pass for the RBLN-optimized Vision Transformer model for image classification.
+
+ Args:
+ pixel_values (torch.FloatTensor of shape (batch_size, channels, height, width)):
+ The tensors corresponding to the input images.
+
+ Returns:
+ The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns an ImageClassifierOutput object.
+
+ """
+ return super().forward(pixel_values, **kwargs)

optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py
@@ -12,10 +12,12 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from ...configuration_generic import RBLNModelForMaskedLMConfig
+ from typing import Any, Optional

+ from ....configuration_utils import RBLNModelConfig

- class RBLNWav2Vec2ForCTCConfig(RBLNModelForMaskedLMConfig):
+
+ class RBLNWav2Vec2ForCTCConfig(RBLNModelConfig):
  """
  Configuration class for RBLNWav2Vec2ForCTC.

@@ -23,4 +25,14 @@ class RBLNWav2Vec2ForCTCConfig(RBLNModelForMaskedLMConfig):
  RBLN-optimized Wav2Vec2 models for Connectionist Temporal Classification (CTC) tasks.
  """

- rbln_model_input_names = ["input_values"]
+ def __init__(
+ self,
+ max_seq_len: Optional[int] = None,
+ batch_size: Optional[int] = None,
+ **kwargs: Any,
+ ):
+ super().__init__(**kwargs)
+ self.max_seq_len = max_seq_len
+ self.batch_size = batch_size or 1
+ if not isinstance(self.batch_size, int) or self.batch_size < 0:
+ raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
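
The explicit constructor above replaces the old class-level rbln_model_input_names hook; a short sketch of building the new config directly (values are illustrative, and the root-level re-export is assumed to match the other RBLN config classes):

    from optimum.rbln import RBLNWav2Vec2ForCTCConfig

    cfg = RBLNWav2Vec2ForCTCConfig(max_seq_len=160_000, batch_size=1)
    print(cfg.batch_size, cfg.max_seq_len)   # 1 160000
    # batch_size defaults to 1 when omitted; a negative batch_size raises ValueError.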

optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -13,13 +13,21 @@
  # limitations under the License.


+ from typing import TYPE_CHECKING, Optional, Union
+
  import torch
- from transformers import AutoModelForMaskedLM, Wav2Vec2ForCTC
+ from transformers import AutoModelForCTC, Wav2Vec2Config, Wav2Vec2ForCTC
+ from transformers.modeling_outputs import CausalLMOutput

- from ...modeling_generic import RBLNModelForMaskedLM
+ from ....configuration_utils import RBLNCompileConfig
+ from ....modeling import RBLNModel
  from .configuration_wav2vec2 import RBLNWav2Vec2ForCTCConfig


+ if TYPE_CHECKING:
+ from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
+
+
  class _Wav2Vec2(torch.nn.Module):
  def __init__(self, model: "Wav2Vec2ForCTC"):
  super().__init__()
@@ -30,13 +38,10 @@ class _Wav2Vec2(torch.nn.Module):
  return self.model.lm_head(output[0])


- class RBLNWav2Vec2ForCTC(RBLNModelForMaskedLM):
+ class RBLNWav2Vec2ForCTC(RBLNModel):
  """
  Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).

- This model inherits from [`RBLNModelForMaskedLM`]. Check the superclass documentation for the generic methods the
- library implements for all its model.
-
  It implements the methods to convert a pre-trained Wav2Vec2 model into a RBLN Wav2Vec2 model by:

  - transferring the checkpoint weights of the original into an optimized RBLN graph,
@@ -44,9 +49,56 @@ class RBLNWav2Vec2ForCTC(RBLNModelForMaskedLM):
  """

  main_input_name = "input_values"
- auto_model_class = AutoModelForMaskedLM
+ auto_model_class = AutoModelForCTC
  rbln_dtype = "float32"

  @classmethod
- def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNWav2Vec2ForCTCConfig) -> torch.nn.Module:
+ def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNWav2Vec2ForCTCConfig) -> torch.nn.Module:
  return _Wav2Vec2(model).eval()
+
+ @classmethod
+ def _update_rbln_config(
+ cls,
+ preprocessors: Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"],
+ model: Optional["PreTrainedModel"] = None,
+ model_config: "Wav2Vec2Config" = None,
+ rbln_config: Optional[RBLNWav2Vec2ForCTCConfig] = None,
+ ) -> RBLNWav2Vec2ForCTCConfig:
+ if rbln_config.max_seq_len is None:
+ for tokenizer in preprocessors:
+ if hasattr(tokenizer, "model_max_length"):
+ rbln_config.max_seq_len = tokenizer.model_max_length
+ break
+ if rbln_config.max_seq_len is None:
+ raise ValueError("`rbln_max_seq_len` should be specified!")
+
+ rbln_compile_config = RBLNCompileConfig(
+ input_info=[
+ (
+ "input_values",
+ [
+ rbln_config.batch_size,
+ rbln_config.max_seq_len,
+ ],
+ "float32",
+ )
+ ]
+ )
+
+ rbln_config.set_compile_cfgs([rbln_compile_config])
+ return rbln_config
+
+ def forward(
+ self, input_values: torch.Tensor, return_dict: Optional[bool] = None, **kwargs
+ ) -> Union[CausalLMOutput, tuple]:
+ """
+ Forward pass for the RBLN-optimized Wav2Vec2 model for Connectionist Temporal Classification (CTC).
+
+ Args:
+ input_values (torch.FloatTensor of shape (batch_size, sequence_length)): Float values of input raw speech waveform. Values can be obtained by loading a .flac or .wav audio file into an array of type List[float] or a numpy.ndarray, e.g. via the soundfile library (pip install soundfile). To prepare the array into input_values, the AutoProcessor should be used for padding and conversion into a tensor of type torch.FloatTensor.
+ return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple.
+
+ Returns:
+ The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CausalLMOutput object.
+ """
+ return super().forward(input_values=input_values, return_dict=return_dict, **kwargs)
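
Per _update_rbln_config above, max_seq_len falls back to the preprocessor's model_max_length when it is not set, and compilation fails otherwise. A hedged export sketch; the checkpoint id and rbln_* keyword arguments follow the usual optimum-rbln from_pretrained export flow and are not prescribed by this diff:

    from optimum.rbln import RBLNWav2Vec2ForCTC

    model = RBLNWav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-base-960h",   # illustrative checkpoint
        export=True,
        rbln_max_seq_len=160_000,        # omit to inherit the processor's model_max_length, if available
        rbln_batch_size=1,
    )
    # forward now returns a CausalLMOutput (or a tuple when return_dict=False),
    # since the class is registered under AutoModelForCTC rather than AutoModelForMaskedLM.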

optimum/rbln/transformers/models/whisper/generation_whisper.py
@@ -31,29 +31,63 @@ Generation utilities for Whisper.
  Modified from `transformers.models.whisper.generation_whisper.py`
  """

+ from typing import Any, Dict, Optional, Union
+
  import torch
  import transformers
  from packaging import version
  from transformers import GenerationMixin
+ from transformers.generation.configuration_utils import GenerationConfig
+ from transformers.modeling_outputs import ModelOutput
  from transformers.models.whisper.generation_whisper import WhisperGenerationMixin


  class RBLNWhisperGenerationMixin(WhisperGenerationMixin, GenerationMixin):
- def generate(self, *args, generation_config=None, **kwargs):
- num_beams = kwargs.get(
- "num_beams",
- generation_config.num_beams
- if hasattr(generation_config, "num_beams") and generation_config.num_beams is not None
- else 1,
- )
- if num_beams > 1:
- raise ValueError(
- f"Beam search is not supported in RBLNWhisperGenerationMixin. "
- f"Received num_beams={num_beams}, but only num_beams=1 is allowed. "
- f"Please set num_beams=1 for greedy search or adjust your configuration."
- )
+ def generate(
+ self,
+ input_features: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ generation_config: Optional[GenerationConfig] = None,
+ return_segments: Optional[bool] = None,
+ return_timestamps: Optional[bool] = None,
+ return_token_timestamps: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[ModelOutput, Dict[str, Any], torch.LongTensor]:
+ """
+ The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+ Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/whisper#transformers.WhisperForConditionalGeneration.generate) for more details.
+
+ Args:
+ input_features(torch.Tensor, optional): The input features to the model.
+ attention_mask(torch.Tensor, optional): Attention mask needs to be passed when doing long-form transcription using a batch size > 1.
+ generation_config(GenerationConfig, optional): The generation configuration to be used as base parametrization for the generation call. **kwargs passed to generate matching the attributes of generation_config will override them.
+ If generation_config is not provided, the default will be used, which had the following loading priority: 1) from the generation_config.json model file, if it exists; 2) from the model configuration.
+ Please note that unspecified parameters will inherit [GenerationConfig](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig)’s default values.
+ return_segments(bool, optional): Whether to return segments.
+ return_timestamps(bool, optional): Whether to return the timestamps with the text. For audios longer than 30 seconds, it is necessary to set return_timestamps=True.
+ return_token_timestamps(bool, optional): Whether to return token timestamps.
+ kwargs(dict[str, Any], optional): Additional arguments passed to the generate function.
+
+ Returns:
+ Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids.
+ """
+ if kwargs.get("num_beams", None) is not None:
+ if kwargs.get("num_beams") != 1:
+ raise ValueError(
+ "Beam search is not supported in RBLNWhisperGenerationMixin. "
+ "Received num_beams={num_beams}, but only num_beams=1 is allowed. "
+ "Please set num_beams=1 for greedy search or adjust your configuration."
+ )

- return super().generate(*args, **kwargs)
+ return super().generate(
+ input_features,
+ attention_mask=attention_mask,
+ generation_config=generation_config,
+ return_segments=return_segments,
+ return_timestamps=return_timestamps,
+ return_token_timestamps=return_token_timestamps,
+ **kwargs,
+ )

  def _postprocess_outputs(
  self,
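
With the expanded signature, the Whisper-specific generation arguments are passed explicitly and beam search remains rejected. A minimal call sketch; the checkpoint id, export keyword, and random features are illustrative assumptions, not prescribed by this diff:

    import torch
    from optimum.rbln import RBLNWhisperForConditionalGeneration

    model = RBLNWhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-tiny",   # illustrative checkpoint
        export=True,
    )
    input_features = torch.randn(1, 80, 3000)   # stand-in for log-mel features from the processor
    tokens = model.generate(input_features, return_timestamps=True)
    # Passing num_beams greater than 1 raises ValueError; only greedy search is supported.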

optimum/rbln/transformers/models/whisper/modeling_whisper.py
@@ -203,7 +203,7 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin)
  raise NotImplementedError

  @classmethod
- def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNWhisperForConditionalGenerationConfig):
+ def _wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNWhisperForConditionalGenerationConfig):
  return WhisperWrapper(
  model,
  use_attention_mask=rbln_config.use_attention_mask,
@@ -213,7 +213,7 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin)
  @classmethod
  @torch.inference_mode()
  def get_compiled_model(cls, model, rbln_config: RBLNWhisperForConditionalGenerationConfig):
- wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
+ wrapped_model = cls._wrap_model_if_needed(model, rbln_config)

  enc_compile_config = rbln_config.compile_cfgs[0]
  dec_compile_config = rbln_config.compile_cfgs[1]

optimum/rbln/transformers/models/whisper/whisper_architecture.py
@@ -154,7 +154,6 @@ class WhisperDecoderWrapper(torch.nn.Module):
  class WhisperDecoder(nn.Module):
  def __init__(self, model, layers, **kwargs):
  super().__init__()
- self._original_mod = model
  self.layers = nn.ModuleList(layers)
  self.embed_tokens = model.embed_tokens
  self.layer_norm = model.layer_norm
@@ -210,7 +209,6 @@ class WhisperDecoder(nn.Module):
  class WhisperDecoderLayer(nn.Module):
  def __init__(self, decoder_layer, self_attn, cross_attn):
  super().__init__()
- self._original_mod = decoder_layer
  self.self_attn = self_attn
  self.encoder_attn = cross_attn
  self.self_attn_layer_norm = decoder_layer.self_attn_layer_norm
@@ -263,7 +261,6 @@ class WhisperDecoderLayer(nn.Module):
  class WhisperAttention(nn.Module):
  def __init__(self, attn):
  super().__init__()
- self._original_mod = attn
  self.q_proj = attn.q_proj
  self.k_proj = attn.k_proj
  self.v_proj = attn.v_proj