optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. optimum/rbln/__init__.py +48 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +50 -21
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +18 -14
  38. optimum/rbln/ops/__init__.py +1 -0
  39. optimum/rbln/ops/attn.py +10 -0
  40. optimum/rbln/ops/flash_attn.py +8 -0
  41. optimum/rbln/ops/moe.py +180 -0
  42. optimum/rbln/ops/sliding_window_attn.py +9 -0
  43. optimum/rbln/transformers/__init__.py +36 -0
  44. optimum/rbln/transformers/configuration_generic.py +0 -27
  45. optimum/rbln/transformers/modeling_attention_utils.py +156 -127
  46. optimum/rbln/transformers/modeling_generic.py +2 -61
  47. optimum/rbln/transformers/modeling_outputs.py +26 -0
  48. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  49. optimum/rbln/transformers/models/__init__.py +28 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  52. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  54. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  55. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  57. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  58. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  59. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  60. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
  61. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
  62. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  63. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  64. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
  65. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
  66. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
  67. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
  68. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  69. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  70. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
  71. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  72. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  73. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  74. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  75. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  76. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  77. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  78. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  79. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  80. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
  81. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  82. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  83. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  84. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  85. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  86. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  87. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  88. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  89. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
  90. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  91. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  92. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
  93. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  94. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  95. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  96. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  97. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  98. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  99. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  100. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  101. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  102. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  103. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  104. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  105. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  106. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  107. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  108. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  109. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
  110. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  111. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  112. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  113. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  114. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  115. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  116. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  117. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
  118. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
  119. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  120. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  121. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  122. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  123. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  124. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  125. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  126. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  127. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  128. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  129. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  130. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  131. optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
  132. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  133. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  134. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  135. optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
  136. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  137. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  138. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  139. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  140. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  141. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  142. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  143. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  144. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  145. optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
  146. optimum/rbln/utils/deprecation.py +213 -0
  147. optimum/rbln/utils/hub.py +14 -3
  148. optimum/rbln/utils/import_utils.py +23 -2
  149. optimum/rbln/utils/runtime_utils.py +42 -6
  150. optimum/rbln/utils/submodule.py +27 -1
  151. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  152. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
  153. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
  154. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  155. optimum/rbln/utils/depreacate_utils.py +0 -16
  156. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  157. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/modeling_rope_utils.py

@@ -27,7 +27,7 @@
 # limitations under the License.
 
 import math
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 from transformers import PretrainedConfig
@@ -35,13 +35,16 @@ from transformers import PretrainedConfig
 
 def _compute_default_rope_parameters(
     config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
-) -> Tuple["torch.Tensor", float]:
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies according to the original RoPE implementation
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
             The current sequence length. Unused for this type of RoPE.
     Returns:
@@ -50,40 +53,38 @@ def _compute_default_rope_parameters(
     """
     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    head_dim = (
-        config.head_dim
-        if hasattr(config, "head_dim") and config.head_dim is not None
-        else config.hidden_size // config.num_attention_heads
-    )
+    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
     dim = int(head_dim * partial_rotary_factor)
 
     attention_factor = 1.0  # Unused in this type of RoPE
 
     # Compute the inverse frequencies
-    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
     return inv_freq, attention_factor
 
 
 def _compute_linear_scaling_rope_parameters(
     config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
-) -> Tuple["torch.Tensor", float]:
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
             The current sequence length. Unused for this type of RoPE.
     Returns:
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
-
     factor = config.rope_scaling["factor"]
 
     # Gets the default RoPE parameters
-    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)
 
     # Then applies linear scaling to the frequencies.
     # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
@@ -94,20 +95,23 @@ def _compute_linear_scaling_rope_parameters(
 
 def _compute_dynamic_ntk_parameters(
     config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
-) -> Tuple["torch.Tensor", float]:
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
             The current sequence length, used to update the dynamic RoPE at inference time.
     Returns:
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
-
+    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
     head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -117,6 +121,17 @@ def _compute_dynamic_ntk_parameters(
 
     attention_factor = 1.0  # Unused in this type of RoPE
 
+    # seq_len: default to max_position_embeddings, e.g. at init time
+    if seq_len is None:
+        seq_len = max_position_embeddings
+    elif isinstance(seq_len, torch.Tensor):
+        seq_len = torch.maximum(
+            seq_len,
+            torch.tensor(max_position_embeddings, dtype=seq_len.dtype, device=seq_len.device),
+        )
+    else:
+        seq_len = max(seq_len, max_position_embeddings)
+
     # Process with chunk_size to reduce precesion error
     chunk_size = 4096
     chunks = (seq_len + chunk_size - 1) // chunk_size
@@ -140,13 +155,17 @@
     return final_inv_freq, attention_factor
 
 
-def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] = None) -> Tuple["torch.Tensor", float]:
+def _compute_yarn_parameters(
+    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies with NTK scaling. Please refer to the
-    [original paper](https://arxiv.org/abs/2309.00071)
+    [original paper](https://huggingface.co/papers/2309.00071)
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
             The current sequence length. Unused for this type of RoPE.
     Returns:
@@ -158,13 +177,25 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
     head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
     dim = int(head_dim * partial_rotary_factor)
-    max_position_embeddings = config.max_position_embeddings
     factor = config.rope_scaling["factor"]
+    attention_factor = config.rope_scaling.get("attention_factor")
+    mscale = config.rope_scaling.get("mscale")
+    mscale_all_dim = config.rope_scaling.get("mscale_all_dim")
+    original_max_position_embeddings = (
+        config.rope_scaling.get("original_max_position_embeddings") or config.max_position_embeddings
+    )
+
+    def get_mscale(scale, mscale=1):
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
 
     # Sets the attention factor as suggested in the paper
-    attention_factor = config.rope_scaling.get("attention_factor")
     if attention_factor is None:
-        attention_factor = 0.1 * math.log(factor) + 1.0
+        if mscale and mscale_all_dim:
+            attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
+        else:
+            attention_factor = get_mscale(factor)
 
     # Optional config options
     # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
@@ -176,10 +207,13 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =
         """Inverse dimension formula to find the dimension based on the number of rotations"""
         return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
 
-    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings, truncate):
         """Find dimension range bounds based on rotations"""
-        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
-        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+        low = find_correction_dim(low_rot, dim, base, max_position_embeddings)
+        high = find_correction_dim(high_rot, dim, base, max_position_embeddings)
+        if truncate:
+            low = math.floor(low)
+            high = math.ceil(high)
         return max(low, 0), min(high, dim - 1)
 
     def linear_ramp_factor(min, max, dim):
@@ -192,38 +226,40 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =
 
     # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
-    pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
+    pos_freqs = base ** (torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim)
     inv_freq_extrapolation = 1.0 / pos_freqs
     inv_freq_interpolation = 1.0 / (factor * pos_freqs)
 
-    low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+    truncate = config.rope_scaling.get("truncate", True)
+    low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings, truncate)
 
     # Get n-dimensional rotational scaling corrected for extrapolation
-    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float()
+    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(device=device, dtype=torch.float)
     inv_freq = (
         inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
         + inv_freq_extrapolation * inv_freq_extrapolation_factor
     )
-
     return inv_freq, attention_factor
 
 
 def _compute_longrope_parameters(
-    config: PretrainedConfig, seq_len: Optional[int] = None
-) -> Tuple["torch.Tensor", float]:
+    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies with LongRoPE scaling. Please refer to the
     [original implementation](https://github.com/microsoft/LongRoPE)
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
-            The current sequence length. Unused for this type of RoPE.
+            The current sequence length.
     Returns:
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin.
     """
-
+    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
     head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -237,40 +273,40 @@ def _compute_longrope_parameters(
     # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
     # values to compute the default attention scaling factor, instead of using `factor`.
     if hasattr(config, "original_max_position_embeddings"):
-        max_position_embeddings = config.original_max_position_embeddings
-        expanded_max_position_embeddings = config.max_position_embeddings
-        factor = expanded_max_position_embeddings / max_position_embeddings
+        original_max_position_embeddings = config.original_max_position_embeddings
+        factor = config.max_position_embeddings / config.original_max_position_embeddings
     else:
-        max_position_embeddings = config.max_position_embeddings
-        expanded_max_position_embeddings = max_position_embeddings * factor
+        original_max_position_embeddings = config.max_position_embeddings
 
     # Sets the attention factor as suggested in the paper
     if attention_factor is None:
         if factor <= 1.0:
             attention_factor = 1.0
         else:
-            attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
+            attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))
 
     # Compute the inverse frequencies -- scaled based on the target sequence length
-    if expanded_max_position_embeddings > max_position_embeddings:
-        ext_factors = torch.tensor(long_factor, dtype=torch.float32)
+    if seq_len and seq_len > original_max_position_embeddings:
+        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
     else:
-        ext_factors = torch.tensor(short_factor, dtype=torch.float32)
-    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
+        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
+    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
     inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
 
     return inv_freq, attention_factor
 
 
 def _compute_llama3_parameters(
-    config: PretrainedConfig, seq_len: Optional[int] = None
-) -> Tuple["torch.Tensor", float]:
+    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+) -> tuple["torch.Tensor", float]:
     """
     Computes the inverse frequencies for llama 3.1.
 
     Args:
         config ([`~transformers.PretrainedConfig`]):
             The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
         seq_len (`int`, *optional*):
             The current sequence length. Unused for this type of RoPE.
     Returns:
@@ -278,7 +314,7 @@ def _compute_llama3_parameters(
         post-processing scaling factor applied to the computed cos/sin.
     """
     # Gets the default RoPE parameters
-    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)
 
     factor = config.rope_scaling["factor"]  # `8` in the original implementation
     low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
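Note on the YaRN hunk above: when `rope_scaling` carries DeepSeek-style `mscale` / `mscale_all_dim` entries, the attention factor becomes the ratio of two mscale values; otherwise `get_mscale(factor)` reproduces the previous `0.1 * math.log(factor) + 1.0` behaviour. A minimal standalone sketch of that selection logic (values are illustrative, not taken from the package):

    import math

    def get_mscale(scale: float, mscale: float = 1.0) -> float:
        # Mirrors the helper added in the diff above: no scaling for factors <= 1.
        if scale <= 1:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    factor = 4.0
    # Plain YaRN config (no mscale keys): same value as the old 0.1 * log(factor) + 1.0 formula.
    print(get_mscale(factor))  # ~1.1386
    # DeepSeek-style config: ratio of the two mscales (1.0 here because both are equal).
    mscale, mscale_all_dim = 1.0, 1.0
    print(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))  # 1.0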
optimum/rbln/transformers/models/__init__.py

@@ -88,12 +88,16 @@ _import_structure = {
         "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
         "RBLNQwen2_5_VLForConditionalGeneration",
         "RBLNQwen2_5_VLForConditionalGenerationConfig",
+        "RBLNQwen2_5_VLModel",
+        "RBLNQwen2_5_VLModelConfig",
     ],
     "qwen2_vl": [
         "RBLNQwen2VisionTransformerPretrainedModel",
         "RBLNQwen2VisionTransformerPretrainedModelConfig",
         "RBLNQwen2VLForConditionalGeneration",
         "RBLNQwen2VLForConditionalGenerationConfig",
+        "RBLNQwen2VLModel",
+        "RBLNQwen2VLModelConfig",
     ],
     "decoderonly": [
         "RBLNDecoderOnlyModelConfig",
@@ -110,12 +114,14 @@ _import_structure = {
     ],
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
     "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig", "RBLNGemmaModel", "RBLNGemmaModelConfig"],
+    "gemma2": ["RBLNGemma2ForCausalLM", "RBLNGemma2ForCausalLMConfig", "RBLNGemma2Model", "RBLNGemma2ModelConfig"],
     "gemma3": [
         "RBLNGemma3ForCausalLM",
         "RBLNGemma3ForCausalLMConfig",
         "RBLNGemma3ForConditionalGeneration",
         "RBLNGemma3ForConditionalGenerationConfig",
     ],
+    "gpt_oss": ["RBLNGptOssForCausalLM", "RBLNGptOssForCausalLMConfig"],
     "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig", "RBLNGPT2Model", "RBLNGPT2ModelConfig"],
     "idefics3": [
         "RBLNIdefics3VisionTransformer",
@@ -132,6 +138,12 @@ _import_structure = {
         "RBLNPegasusForConditionalGenerationConfig",
         "RBLNPegasusModelConfig",
     ],
+    "paligemma": [
+        "RBLNPaliGemmaForConditionalGeneration",
+        "RBLNPaliGemmaForConditionalGenerationConfig",
+        "RBLNPaliGemmaModel",
+        "RBLNPaliGemmaModelConfig",
+    ],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
     "pixtral": ["RBLNPixtralVisionModel", "RBLNPixtralVisionModelConfig"],
@@ -143,7 +155,9 @@ _import_structure = {
     ],
     "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig", "RBLNPhiModel", "RBLNPhiModelConfig"],
     "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig", "RBLNQwen2Model", "RBLNQwen2ModelConfig"],
+    "qwen2_moe": ["RBLNQwen2MoeForCausalLM", "RBLNQwen2MoeForCausalLMConfig"],
     "qwen3": ["RBLNQwen3ForCausalLM", "RBLNQwen3ForCausalLMConfig", "RBLNQwen3Model", "RBLNQwen3ModelConfig"],
+    "qwen3_moe": ["RBLNQwen3MoeForCausalLM", "RBLNQwen3MoeForCausalLMConfig"],
     "resnet": ["RBLNResNetForImageClassification", "RBLNResNetForImageClassificationConfig"],
     "roberta": [
         "RBLNRobertaForMaskedLM",
@@ -254,6 +268,7 @@ if TYPE_CHECKING:
     from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
     from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig, RBLNGemmaModel, RBLNGemmaModelConfig
+    from .gemma2 import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig, RBLNGemma2Model, RBLNGemma2ModelConfig
     from .gemma3 import (
         RBLNGemma3ForCausalLM,
         RBLNGemma3ForCausalLMConfig,
@@ -261,6 +276,7 @@ if TYPE_CHECKING:
         RBLNGemma3ForConditionalGenerationConfig,
     )
     from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig, RBLNGPT2Model, RBLNGPT2ModelConfig
+    from .gpt_oss import RBLNGptOssForCausalLM, RBLNGptOssForCausalLMConfig
     from .grounding_dino import (
         RBLNGroundingDinoDecoder,
         RBLNGroundingDinoDecoderConfig,
@@ -281,6 +297,12 @@ if TYPE_CHECKING:
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
     from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
     from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
+    from .paligemma import (
+        RBLNPaliGemmaForConditionalGeneration,
+        RBLNPaliGemmaForConditionalGenerationConfig,
+        RBLNPaliGemmaModel,
+        RBLNPaliGemmaModelConfig,
+    )
     from .pegasus import (
         RBLNPegasusForConditionalGeneration,
         RBLNPegasusForConditionalGenerationConfig,
@@ -295,14 +317,20 @@ if TYPE_CHECKING:
         RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
         RBLNQwen2_5_VLForConditionalGeneration,
         RBLNQwen2_5_VLForConditionalGenerationConfig,
+        RBLNQwen2_5_VLModel,
+        RBLNQwen2_5_VLModelConfig,
     )
+    from .qwen2_moe import RBLNQwen2MoeForCausalLM, RBLNQwen2MoeForCausalLMConfig
     from .qwen2_vl import (
         RBLNQwen2VisionTransformerPretrainedModel,
         RBLNQwen2VisionTransformerPretrainedModelConfig,
         RBLNQwen2VLForConditionalGeneration,
         RBLNQwen2VLForConditionalGenerationConfig,
+        RBLNQwen2VLModel,
+        RBLNQwen2VLModelConfig,
     )
     from .qwen3 import RBLNQwen3ForCausalLM, RBLNQwen3ForCausalLMConfig, RBLNQwen3Model, RBLNQwen3ModelConfig
+    from .qwen3_moe import RBLNQwen3MoeForCausalLM, RBLNQwen3MoeForCausalLMConfig
     from .resnet import RBLNResNetForImageClassification, RBLNResNetForImageClassificationConfig
     from .roberta import (
         RBLNRobertaForMaskedLM,
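The registry additions above (Gemma2, GPT-OSS, PaliGemma, Qwen2-MoE, Qwen3-MoE, and the Qwen2/2.5-VL base models) extend `_import_structure` in optimum/rbln/transformers/models/__init__.py. Assuming the usual top-level re-export (the parallel +48-line change to optimum/rbln/__init__.py in this diff suggests it), a loading sketch; the checkpoint id and `rbln_*` kwargs are placeholders for illustration:

    from optimum.rbln import (
        RBLNGemma2ForCausalLM,
        RBLNGptOssForCausalLM,
        RBLNPaliGemmaForConditionalGeneration,
        RBLNQwen2MoeForCausalLM,
        RBLNQwen3MoeForCausalLM,
    )

    # Compile-on-load via the library's usual from_pretrained pattern.
    model = RBLNGemma2ForCausalLM.from_pretrained(
        "google/gemma-2-2b-it",  # illustrative checkpoint id
        export=True,             # compile the checkpoint for RBLN NPUs on first load
        rbln_batch_size=1,       # illustrative compile-time option
    )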
optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py

@@ -12,10 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...configuration_generic import RBLNModelForAudioClassificationConfig
+from typing import Any, Optional
 
+from ....configuration_utils import RBLNModelConfig
+from ....utils.deprecation import deprecate_kwarg
 
-class RBLNASTForAudioClassificationConfig(RBLNModelForAudioClassificationConfig):
+
+class RBLNASTForAudioClassificationConfig(RBLNModelConfig):
     """
     Configuration class for RBLNASTForAudioClassification.
     """
+
+    @deprecate_kwarg(old_name="num_mel_bins", version="0.10.0")
+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        max_length: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
+            max_length (Optional[int]): Maximum length of the audio input in time dimension.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If batch_size is not a positive integer.
+        """
+        super().__init__(**kwargs)
+        self.batch_size = batch_size or 1
+        if not isinstance(self.batch_size, int) or self.batch_size < 0:
+            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
+
+        self.max_length = max_length
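A hypothetical construction of the reworked config shown above; `num_mel_bins` is no longer stored on the config (it is read from the model config at compile time) and passing it now goes through the `deprecate_kwarg` shim slated for removal in 0.10.0:

    from optimum.rbln import RBLNASTForAudioClassificationConfig  # assuming the usual top-level re-export

    # batch_size defaults to 1; max_length must be resolvable at compile time if neither
    # the model config nor the feature extractor provides one (see _update_rbln_config below).
    cfg = RBLNASTForAudioClassificationConfig(batch_size=1, max_length=1024)
    print(cfg.batch_size, cfg.max_length)  # 1 1024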
optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py

@@ -12,17 +12,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...modeling_generic import RBLNModelForAudioClassification
+from typing import TYPE_CHECKING, Optional
 
+import torch
+from transformers import AutoModelForAudioClassification
+from transformers.modeling_outputs import SequenceClassifierOutput
 
-class RBLNASTForAudioClassification(RBLNModelForAudioClassification):
+from ....configuration_utils import RBLNCompileConfig
+from ....modeling import RBLNModel
+from .configuration_audio_spectrogram_transformer import RBLNASTForAudioClassificationConfig
+
+
+if TYPE_CHECKING:
+    from transformers import AutoFeatureExtractor, PretrainedConfig, PreTrainedModel
+
+
+class RBLNASTForAudioClassification(RBLNModel):
     """
     Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2.
-    This model inherits from [`RBLNModelForAudioClassification`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    This model inherits from [RBLNModelForAudioClassification]. Check the superclass documentation for the generic methods the library implements for all its models.
 
-    A class to convert and run pre-trained transformer-based `ASTForAudioClassification` models on RBLN devices.
-    It implements the methods to convert a pre-trained transformers `ASTForAudioClassification` model into a RBLN transformer model by:
+    A class to convert and run pre-trained transformer-based ASTForAudioClassification models on RBLN devices.
+    It implements the methods to convert a pre-trained transformers ASTForAudioClassification model into a RBLN transformer model by:
 
     - transferring the checkpoint weights of the original into an optimized RBLN graph,
     - compiling the resulting graph using the RBLN Compiler.
     """
+
+    auto_model_class = AutoModelForAudioClassification
+
+    @classmethod
+    def _update_rbln_config(
+        cls,
+        preprocessors: "AutoFeatureExtractor" = None,
+        model: Optional["PreTrainedModel"] = None,
+        model_config: "PretrainedConfig" = None,
+        rbln_config: Optional[RBLNASTForAudioClassificationConfig] = None,
+    ) -> RBLNASTForAudioClassificationConfig:
+        num_mel_bins = getattr(model_config, "num_mel_bins", None)
+
+        if rbln_config.max_length is None:
+            rbln_config.max_length = getattr(model_config, "max_length", None)
+            for feature_extractor in preprocessors:
+                if hasattr(feature_extractor, "max_length"):
+                    rbln_config.max_length = feature_extractor.max_length
+                    break
+
+        if rbln_config.max_length is None:
+            raise ValueError("max_length should be specified!")
+
+        input_info = [
+            (
+                "input_values",
+                [rbln_config.batch_size, rbln_config.max_length, num_mel_bins],
+                "float32",
+            ),
+        ]
+
+        rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])
+        return rbln_config
+
+    def forward(self, input_values: torch.Tensor, **kwargs) -> SequenceClassifierOutput:
+        """
+        Forward pass for the RBLN-optimized Audio Spectrogram Transformer model for audio classification.
+
+        Args:
+            input_values (torch.FloatTensor of shape (batch_size, max_length, num_mel_bins)):
+                Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
+                loading a .flac or .wav audio file into an array of type list[float], a numpy.ndarray or a torch.Tensor, *e.g.* via
+                the torchcodec library (pip install torchcodec) or the soundfile library (pip install soundfile).
+                To prepare the array into input_features, the [AutoFeatureExtractor] should be used for extracting the
+                mel features, padding and conversion into a tensor of type torch.FloatTensor.
+
+        Returns:
+            Returns a SequenceClassifierOutput object.
+        """
+
+        return super().forward(input_values, **kwargs)
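For reference, `_update_rbln_config` above freezes a single static input signature before compilation. Taking the common AudioSet checkpoint defaults as an assumption (`num_mel_bins=128` from the model config, `max_length=1024` from the feature extractor), a batch-size-1 config would compile against:

    batch_size, max_length, num_mel_bins = 1, 1024, 128  # assumed checkpoint defaults
    input_info = [("input_values", [batch_size, max_length, num_mel_bins], "float32")]
    print(input_info)  # [('input_values', [1, 1024, 128], 'float32')]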
optimum/rbln/transformers/models/auto/auto_factory.py

@@ -150,6 +150,7 @@ class _BaseAutoModelClass:
                 f"from the checkpoint, leading to potential unintended behavior. If this is not intentional, consider calling the "
                 f"`from_pretrained()` method directly from the `RBLN{config.architectures[0]}` class instead.",
                 UserWarning,
+                stacklevel=2,
             )
 
         return model_class
optimum/rbln/transformers/models/bart/bart_architecture.py

@@ -60,10 +60,10 @@ class BartForConditionalGeneration(Seq2SeqForConditionalGeneration):
 class BartDecoder(Seq2SeqDecoder):
     has_pos_emb = True
 
-    def __post_init__(self):
-        self.embed_positions = self._original_mod.embed_positions
-        self.layernorm_embedding = self._original_mod.layernorm_embedding
-        self.embed_scale = getattr(self._original_mod, "embed_scale", None)
+    def __post_init__(self, model: nn.Module):
+        self.embed_positions = model.embed_positions
+        self.layernorm_embedding = model.layernorm_embedding
+        self.embed_scale = getattr(model, "embed_scale", None)
 
     def prepare_attn_mask(self, attention_mask, encoder_attention_mask, **kwargs):
         if attention_mask is not None:
@@ -112,11 +112,11 @@ class BartLayerFF(nn.Module):
 
 
 class BartDecoderLayer(Seq2SeqDecoderLayer):
-    def __post_init__(self):
-        self.self_attn_layer_norm = self._original_mod.self_attn_layer_norm
-        self.encoder_attn = self._original_mod.encoder_attn
-        self.encoder_attn_layer_norm = self._original_mod.encoder_attn_layer_norm
-        self.ff_layer = BartLayerFF(self._original_mod)
+    def __post_init__(self, decoder_layer: nn.Module):
+        self.self_attn_layer_norm = decoder_layer.self_attn_layer_norm
+        self.encoder_attn = decoder_layer.encoder_attn
+        self.encoder_attn_layer_norm = decoder_layer.encoder_attn_layer_norm
+        self.ff_layer = BartLayerFF(decoder_layer)
 
     def pre_self_attn_layer_norm(self, hidden_states):
         return hidden_states
@@ -132,13 +132,13 @@ class BartDecoderLayer(Seq2SeqDecoderLayer):
 
 
 class BartSelfAttention(Seq2SeqSelfAttention):
-    def __post_init__(self, use_attention_mask: bool = True):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.out_proj = self._original_mod.out_proj
-        self.num_heads = self._original_mod.num_heads
-        self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
+    def __post_init__(self, attn: nn.Module, use_attention_mask: bool = True):
+        self.q_proj = attn.q_proj
+        self.k_proj = attn.k_proj
+        self.v_proj = attn.v_proj
+        self.out_proj = attn.out_proj
+        self.num_heads = attn.num_heads
+        self.head_dim = attn.embed_dim // attn.num_heads
         self.scaling = self.head_dim**-0.5
         if use_attention_mask:
             self.attn_decode = torch.ops.rbln_custom_ops.paged_attn_decode
@@ -153,11 +153,11 @@ class BartSelfAttention(Seq2SeqSelfAttention):
 
 
 class BartCrossAttention(Seq2SeqCrossAttention):
-    def __post_init__(self):
-        self.q_proj = self._original_mod.q_proj
-        self.k_proj = self._original_mod.k_proj
-        self.v_proj = self._original_mod.v_proj
-        self.out_proj = self._original_mod.out_proj
-        self.num_heads = self._original_mod.num_heads
-        self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
-        self.embed_dim = self._original_mod.embed_dim
+    def __post_init__(self, attn: nn.Module):
+        self.q_proj = attn.q_proj
+        self.k_proj = attn.k_proj
+        self.v_proj = attn.v_proj
+        self.out_proj = attn.out_proj
+        self.num_heads = attn.num_heads
+        self.head_dim = attn.embed_dim // attn.num_heads
+        self.embed_dim = attn.embed_dim
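The BART architecture hunks above all follow the same refactor: `__post_init__` now receives the original torch module as an explicit argument instead of reading `self._original_mod`. An illustrative, self-contained sketch of the pattern (not the package's actual base classes):

    import torch.nn as nn

    class DummyAttention(nn.Module):
        def __init__(self, embed_dim: int = 16):
            super().__init__()
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(embed_dim, embed_dim)
            self.v_proj = nn.Linear(embed_dim, embed_dim)

    class AttentionWrapper(nn.Module):
        def __init__(self, attn: nn.Module):
            super().__init__()
            self.__post_init__(attn)  # the wrapped module is passed in explicitly

        def __post_init__(self, attn: nn.Module):
            # Reuse the original projection layers without keeping a _original_mod handle.
            self.q_proj = attn.q_proj
            self.k_proj = attn.k_proj
            self.v_proj = attn.v_proj

    wrapper = AttentionWrapper(DummyAttention())
    print(type(wrapper.q_proj).__name__)  # Linear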
optimum/rbln/transformers/models/bart/modeling_bart.py

@@ -13,9 +13,11 @@
 # limitations under the License.
 
 import inspect
-from typing import Any, Callable
+from typing import Any, Callable, Optional, Tuple, Union
 
+import torch
 from transformers import BartForConditionalGeneration, PreTrainedModel
+from transformers.modeling_outputs import Seq2SeqModelOutput
 
 from ....utils.logging import get_logger
 from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -35,6 +37,25 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """
 
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, Seq2SeqModelOutput]:
+        """
+        Forward pass for the RBLN-optimized BART model for feature extraction tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, **kwargs)
+
 
 
 class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
@@ -48,7 +69,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True
 
     @classmethod
-    def wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
+    def _wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
         return BartWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
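A hedged usage sketch for the newly documented `RBLNBartModel.forward` above; the checkpoint id, the `export=True` compile-on-load flow, and the padding length are assumptions based on the library's usual `from_pretrained` pattern, not taken from this diff:

    from transformers import AutoTokenizer
    from optimum.rbln import RBLNBartModel

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
    # export=True compiles the checkpoint for RBLN NPUs on first load (library convention).
    model = RBLNBartModel.from_pretrained("facebook/bart-base", export=True)

    # The compiled graph uses static shapes, so pad to the compiled sequence length
    # (512 here is an assumption; match the value used at compile time).
    inputs = tokenizer(
        "RBLN-compiled BART feature extraction",
        return_tensors="pt",
        padding="max_length",
        max_length=512,
    )
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    # Per the docstring above, outputs follow Seq2SeqModelOutput by default.
    print(outputs.last_hidden_state.shape)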