optimum-rbln 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +14 -7
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/diffusers/models/autoencoder_kl.py +30 -63
- optimum/rbln/diffusers/models/controlnet.py +36 -62
- optimum/rbln/diffusers/models/unet_2d_condition.py +57 -156
- optimum/rbln/diffusers/pipelines/__init__.py +40 -12
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -187
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -192
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +8 -206
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -207
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -111
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +12 -117
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +4 -123
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +4 -126
- optimum/rbln/modeling_alias.py +4 -9
- optimum/rbln/modeling_base.py +117 -144
- optimum/rbln/modeling_config.py +51 -0
- optimum/rbln/modeling_diffusers.py +400 -0
- optimum/rbln/transformers/__init__.py +10 -0
- optimum/rbln/transformers/cache_utils.py +5 -9
- optimum/rbln/transformers/modeling_rope_utils.py +283 -0
- optimum/rbln/transformers/models/__init__.py +80 -28
- optimum/rbln/transformers/models/auto/modeling_auto.py +1 -0
- optimum/rbln/transformers/models/bart/__init__.py +1 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +18 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +25 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -2
- optimum/rbln/transformers/models/clip/modeling_clip.py +13 -23
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +376 -218
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +246 -116
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -1
- optimum/rbln/transformers/models/exaone/__init__.py +32 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +81 -0
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +181 -0
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +1725 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +53 -0
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +4 -30
- optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +166 -151
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -15
- optimum/rbln/transformers/models/midm/modeling_midm.py +8 -28
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
- optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
- optimum/rbln/transformers/models/phi/phi_architecture.py +75 -159
- optimum/rbln/transformers/models/qwen2/__init__.py +24 -0
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +43 -0
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +29 -0
- optimum/rbln/transformers/models/seq2seq/__init__.py +24 -0
- optimum/rbln/{modeling_seq2seq.py → transformers/models/seq2seq/modeling_seq2seq.py} +107 -166
- optimum/rbln/transformers/models/t5/__init__.py +1 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +108 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +46 -32
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +38 -13
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -2
- optimum/rbln/transformers/utils/rbln_quantization.py +8 -2
- optimum/rbln/utils/context.py +58 -0
- optimum/rbln/utils/decorator_utils.py +55 -0
- optimum/rbln/utils/import_utils.py +21 -0
- optimum/rbln/utils/logging.py +1 -1
- optimum/rbln/utils/runtime_utils.py +4 -4
- optimum/rbln/utils/timer_utils.py +26 -2
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/METADATA +11 -9
- optimum_rbln-0.1.13.dist-info/RECORD +107 -0
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/WHEEL +1 -1
- optimum_rbln-0.1.11.dist-info/RECORD +0 -93
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/licenses/LICENSE +0 -0
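One change worth calling out from the listing above is the relocation of the Seq2Seq base class (`optimum/rbln/modeling_seq2seq.py` → `optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py`). A minimal sketch of the new import path, inferred from the new `from ...models.seq2seq import RBLNModelForSeq2SeqLM` line visible in the Bart diff further down; whether the old `optimum.rbln.modeling_seq2seq` path is still aliased in 0.1.13 is not shown in this diff:

```python
# New location of the Seq2Seq base class in 0.1.13 (path taken from the file move above
# and the new intra-package import in modeling_bart.py).
from optimum.rbln.transformers.models.seq2seq import RBLNModelForSeq2SeqLM

# 0.1.11-style import affected by the move (shown only for comparison; it presumably
# exported the same class, but that is not visible in this diff):
# from optimum.rbln.modeling_seq2seq import RBLNModelForSeq2SeqLM
```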
optimum/rbln/transformers/modeling_rope_utils.py (new file, +283 lines)
@@ -0,0 +1,283 @@
+import math
+from typing import Optional, Tuple
+
+import torch
+from transformers import PretrainedConfig
+
+
+def _compute_default_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies according to the original RoPE implementation
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    # Compute the inverse frequencies
+    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+    return inv_freq, attention_factor
+
+
+def _compute_linear_scaling_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    factor = config.rope_scaling["factor"]
+
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+
+    # Then applies linear scaling to the frequencies.
+    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
+    # applying scaling to the inverse frequencies is equivalent.
+    inv_freq /= factor
+    return inv_freq, attention_factor
+
+
+def _compute_dynamic_ntk_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length, used to update the dynamic RoPE at inference time.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    max_position_embeddings = config.max_position_embeddings
+    factor = config.rope_scaling["factor"]
+
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    # Process with chunk_size to reduce precesion error
+    chunk_size = 4096
+    chunks = (seq_len + chunk_size - 1) // chunk_size
+
+    inv_freq_list = []
+    for i in range(chunks):
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, seq_len)
+
+        seq_lens = torch.arange(start, end, dtype=torch.float32).view(-1, 1) + 1.0
+        seq_lens = torch.where(seq_lens > max_position_embeddings, seq_lens, max_position_embeddings)
+
+        # Compute the inverse frequencies for each chunk
+        scaled_base = base * ((factor * seq_lens / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
+        inv_freq = 1.0 / (scaled_base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+
+        inv_freq_list.append(inv_freq)
+
+    final_inv_freq = torch.cat(inv_freq_list, dim=0)
+
+    return final_inv_freq, attention_factor
+
+
+def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] = None) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Please refer to the
+    [original paper](https://arxiv.org/abs/2309.00071)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    max_position_embeddings = config.max_position_embeddings
+    factor = config.rope_scaling["factor"]
+
+    # Sets the attention factor as suggested in the paper
+    attention_factor = config.rope_scaling.get("attention_factor")
+    if attention_factor is None:
+        attention_factor = 0.1 * math.log(factor) + 1.0
+
+    # Optional config options
+    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+    beta_fast = config.rope_scaling.get("beta_fast") or 32
+    beta_slow = config.rope_scaling.get("beta_slow") or 1
+
+    # Compute the inverse frequencies
+    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+        """Inverse dimension formula to find the dimension based on the number of rotations"""
+        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+        """Find dimension range bounds based on rotations"""
+        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+        return max(low, 0), min(high, dim - 1)
+
+    def linear_ramp_factor(min, max, dim):
+        if min == max:
+            max += 0.001  # Prevent singularity
+
+        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+        ramp_func = torch.clamp(linear_func, 0, 1)
+        return ramp_func
+
+    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+    # to expand the possible context length. In other words, interpolation = apply scaling factor.
+    pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
+    inv_freq_extrapolation = 1.0 / pos_freqs
+    inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+    low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+
+    # Get n-dimensional rotational scaling corrected for extrapolation
+    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float()
+    inv_freq = (
+        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+        + inv_freq_extrapolation * inv_freq_extrapolation_factor
+    )
+
+    return inv_freq, attention_factor
+
+
+def _compute_longrope_parameters(
+    config: PretrainedConfig, seq_len: Optional[int] = None
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
+    [original implementation](https://github.com/microsoft/LongRoPE)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    long_factor = config.rope_scaling["long_factor"]
+    short_factor = config.rope_scaling["short_factor"]
+    factor = config.rope_scaling.get("factor")
+    attention_factor = config.rope_scaling.get("attention_factor")
+
+    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
+    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+    # values to compute the default attention scaling factor, instead of using `factor`.
+    if hasattr(config, "original_max_position_embeddings"):
+        max_position_embeddings = config.original_max_position_embeddings
+        expanded_max_position_embeddings = config.max_position_embeddings
+        factor = expanded_max_position_embeddings / max_position_embeddings
+    else:
+        max_position_embeddings = config.max_position_embeddings
+        expanded_max_position_embeddings = max_position_embeddings * factor
+
+    # Sets the attention factor as suggested in the paper
+    if attention_factor is None:
+        if factor <= 1.0:
+            attention_factor = 1.0
+        else:
+            attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
+
+    # Compute the inverse frequencies -- scaled based on the target sequence length
+    if expanded_max_position_embeddings > max_position_embeddings:
+        ext_factors = torch.tensor(long_factor, dtype=torch.float32)
+    else:
+        ext_factors = torch.tensor(short_factor, dtype=torch.float32)
+    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
+    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
+
+    return inv_freq, attention_factor
+
+
+def _compute_llama3_parameters(
+    config: PretrainedConfig, seq_len: Optional[int] = None
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies for llama 3.1.
+
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+
+    factor = config.rope_scaling["factor"]  # `8` in the original implementation
+    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
+    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
+    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+
+    wavelen = 2 * math.pi / inv_freq
+    # wavelen < high_freq_wavelen: do nothing
+    # wavelen > low_freq_wavelen: divide by factor
+    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+    # otherwise: interpolate between the two, using a smooth factor
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+
+    return inv_freq_llama, attention_factor
+
+
+# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
+# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
+# parameterizations, as long as the callable has the same signature.
+ROPE_INIT_FUNCTIONS = {
+    "default": _compute_default_rope_parameters,
+    "linear": _compute_linear_scaling_rope_parameters,
+    "dynamic": _compute_dynamic_ntk_parameters,
+    "yarn": _compute_yarn_parameters,
+    "longrope": _compute_longrope_parameters,
+    "llama3": _compute_llama3_parameters,
+}
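The new `modeling_rope_utils.py` above is a trimmed-down port of the RoPE parameter helpers in `transformers`: each `ROPE_INIT_FUNCTIONS` entry maps a `rope_type` string to a callable with the signature `(config, seq_len) -> (inv_freq, attention_factor)`. As a reading aid, here is a minimal sketch of how such a registry is typically consumed; the config values are illustrative assumptions, not values taken from this package, and optimum-rbln wires this up internally rather than expecting users to call it directly.

```python
# Sketch only: build cos/sin tables from the registry above. The config values are
# illustrative (head_dim = 4096 / 32 = 128); optimum-rbln does this internally.
import torch
from transformers import PretrainedConfig

from optimum.rbln.transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

config = PretrainedConfig(
    hidden_size=4096,
    num_attention_heads=32,
    rope_theta=10000.0,
    max_position_embeddings=8192,
    rope_scaling={"rope_type": "linear", "factor": 2.0},
)

rope_type = config.rope_scaling.get("rope_type", "default")
inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, seq_len=8192)

positions = torch.arange(8192, dtype=torch.float32)
freqs = torch.outer(positions, inv_freq)   # [seq_len, head_dim // 2]
emb = torch.cat((freqs, freqs), dim=-1)    # [seq_len, head_dim]
cos, sin = emb.cos() * attention_scaling, emb.sin() * attention_scaling
```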
optimum/rbln/transformers/models/__init__.py (+80 -28)
@@ -21,32 +21,84 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+from typing import TYPE_CHECKING

-from .
- [old lines 26-52 also removed; their content is truncated in this rendering]
+from transformers.utils import _LazyModule
+
+
+_import_structure = {
+    "auto": [
+        "RBLNAutoModel",
+        "RBLNAutoModelForAudioClassification",
+        "RBLNAutoModelForCausalLM",
+        "RBLNAutoModelForCTC",
+        "RBLNAutoModelForDepthEstimation",
+        "RBLNAutoModelForImageClassification",
+        "RBLNAutoModelForMaskedLM",
+        "RBLNAutoModelForQuestionAnswering",
+        "RBLNAutoModelForSeq2SeqLM",
+        "RBLNAutoModelForSequenceClassification",
+        "RBLNAutoModelForSpeechSeq2Seq",
+        "RBLNAutoModelForVision2Seq",
+    ],
+    "bart": ["RBLNBartForConditionalGeneration", "RBLNBartModel"],
+    "bert": ["RBLNBertModel"],
+    "clip": ["RBLNCLIPTextModel", "RBLNCLIPTextModelWithProjection", "RBLNCLIPVisionModel"],
+    "dpt": ["RBLNDPTForDepthEstimation"],
+    "exaone": ["RBLNExaoneForCausalLM"],
+    "gemma": ["RBLNGemmaForCausalLM"],
+    "gpt2": ["RBLNGPT2LMHeadModel"],
+    "llama": ["RBLNLlamaForCausalLM"],
+    "llava_next": ["RBLNLlavaNextForConditionalGeneration"],
+    "midm": ["RBLNMidmLMHeadModel"],
+    "mistral": ["RBLNMistralForCausalLM"],
+    "phi": ["RBLNPhiForCausalLM"],
+    "qwen2": ["RBLNQwen2ForCausalLM"],
+    "t5": ["RBLNT5EncoderModel", "RBLNT5ForConditionalGeneration"],
+    "wav2vec2": ["RBLNWav2Vec2ForCTC"],
+    "whisper": ["RBLNWhisperForConditionalGeneration"],
+    "xlm_roberta": ["RBLNXLMRobertaModel"],
+}
+
+if TYPE_CHECKING:
+    from .auto import (
+        RBLNAutoModel,
+        RBLNAutoModelForAudioClassification,
+        RBLNAutoModelForCausalLM,
+        RBLNAutoModelForCTC,
+        RBLNAutoModelForDepthEstimation,
+        RBLNAutoModelForImageClassification,
+        RBLNAutoModelForMaskedLM,
+        RBLNAutoModelForQuestionAnswering,
+        RBLNAutoModelForSeq2SeqLM,
+        RBLNAutoModelForSequenceClassification,
+        RBLNAutoModelForSpeechSeq2Seq,
+        RBLNAutoModelForVision2Seq,
+    )
+    from .bart import RBLNBartForConditionalGeneration, RBLNBartModel
+    from .bert import RBLNBertModel
+    from .clip import RBLNCLIPTextModel, RBLNCLIPTextModelWithProjection, RBLNCLIPVisionModel
+    from .dpt import RBLNDPTForDepthEstimation
+    from .exaone import RBLNExaoneForCausalLM
+    from .gemma import RBLNGemmaForCausalLM
+    from .gpt2 import RBLNGPT2LMHeadModel
+    from .llama import RBLNLlamaForCausalLM
+    from .llava_next import RBLNLlavaNextForConditionalGeneration
+    from .midm import RBLNMidmLMHeadModel
+    from .mistral import RBLNMistralForCausalLM
+    from .phi import RBLNPhiForCausalLM
+    from .qwen2 import RBLNQwen2ForCausalLM
+    from .t5 import RBLNT5EncoderModel, RBLNT5ForConditionalGeneration
+    from .wav2vec2 import RBLNWav2Vec2ForCTC
+    from .whisper import RBLNWhisperForConditionalGeneration
+    from .xlm_roberta import RBLNXLMRobertaModel
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
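The rewritten `models/__init__.py` swaps eager imports for the `_LazyModule` pattern used throughout `transformers`: `_import_structure` only names the submodules and their public classes, the `TYPE_CHECKING` branch keeps static analyzers happy, and the real import is deferred until an attribute is first accessed. A small illustration of the resulting behaviour, assuming optimum-rbln 0.1.13 and its runtime dependencies are installed:

```python
# Both spellings resolve to the same class object; the lazy variant only triggers
# the heavy model-file import when the attribute is first accessed.
from optimum.rbln.transformers.models import RBLNLlamaForCausalLM                  # lazy
from optimum.rbln.transformers.models.llama import RBLNLlamaForCausalLM as Direct  # eager

assert RBLNLlamaForCausalLM is Direct
```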
optimum/rbln/transformers/models/bart/bart_architecture.py (+18 -12)
@@ -47,6 +47,12 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)


+class BartWrapper:
+    def __init__(self, model):
+        self.encoder = BartEncoderWrapper(model)
+        self.decoder = BartDecoderWrapper(model)
+
+
 class _BartAttention(BartAttention):
     def forward(
         self,
@@ -238,6 +244,7 @@ class _BartSdpaAttention(BartSdpaAttention):
             value_states, dim=2, start=cache_position, end=cache_position + 1
         )

+        # need 4d shape (input tensors) for scaled_dot_product_attention
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
@@ -324,7 +331,6 @@ class _BartDecoder(BartDecoder):
         attn_impl: str = "eager",
     ):
         # embedding
-        # thkim fix : transformers == 4.44.2 compile
         if hasattr(self, "embed_scale"):
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
         else:
@@ -336,13 +342,15 @@ class _BartDecoder(BartDecoder):
             hidden_states = inputs_embeds + positions
         else:
             hidden_all = []
+            # compiler pattern base dependency -> take + add
             for i in range(input_ids.shape[0]):
                 # cache position [N,1]
                 positions_idx = cache_position[i]
+                # offset is set 2 in bart embedding
                 position_weight = self.embed_positions.weight[2:]
                 position = position_weight[positions_idx]
-
-                hidden_all.append(
+                batch_hidden = position + inputs_embeds[i]
+                hidden_all.append(batch_hidden)
             hidden_states = torch.stack(hidden_all, dim=0)

         hidden_states = self.layernorm_embedding(hidden_states)
@@ -444,6 +452,7 @@ class BartDecoderWrapper(torch.nn.Module):
             self_kv_cache.append(past_key_values[i][1])
         self_kv_cache = torch.stack(self_kv_cache, dim=0)

+        # return batch_position to keep it as a variable within the graph
         return lm_logits, self_kv_cache, batch_position


@@ -467,9 +476,6 @@ class BartEncoderWrapper(torch.nn.Module):
         cross_key_value: torch.Tensor = None,
         batch_idx: torch.Tensor = None,
     ) -> Tuple[torch.Tensor]:
-        encoder_batch_size = input_ids.shape[0]
-        decoder_batch_size = encoder_batch_size  # TODO(taehoon) fix to enable beam-search
-
         # 1. run encoder
         encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         last_hidden_states = encoder_outputs[0]
@@ -477,19 +483,19 @@ class BartEncoderWrapper(torch.nn.Module):
         # 2. run dummy decoder to get pre-calculated cross-key_values for generation
         dummy_past_key_value = []
         for _ in range(self.num_layers):
-            pkv_self_attn_key = torch.zeros(
-            pkv_self_attn_value = torch.zeros(
-            pkv_cross_attn_key = torch.zeros(
-            pkv_cross_attn_value = torch.zeros(
+            pkv_self_attn_key = torch.zeros(1, self.num_heads, self.decoder_max_length, self.d_kv)
+            pkv_self_attn_value = torch.zeros(1, self.num_heads, self.decoder_max_length, self.d_kv)
+            pkv_cross_attn_key = torch.zeros(1, self.num_heads, self.encoder_max_length, self.d_kv)
+            pkv_cross_attn_value = torch.zeros(1, self.num_heads, self.encoder_max_length, self.d_kv)
             layer_pkv = (pkv_self_attn_key, pkv_self_attn_value, pkv_cross_attn_key, pkv_cross_attn_value)
             dummy_past_key_value.append(layer_pkv)

-        decoder_attention_mask = torch.zeros(
+        decoder_attention_mask = torch.zeros(1, self.decoder_max_length, dtype=torch.float32)
         decoder_attention_mask[:, :1] = 1

         decoder_outputs = _BartDecoder.forward(
             self.decoder,
-            input_ids=torch.zeros((
+            input_ids=torch.zeros((1, 1), dtype=torch.int64),
             attention_mask=decoder_attention_mask,
             encoder_attention_mask=attention_mask,
             cache_position=torch.tensor(0, dtype=torch.int32),
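The new `BartWrapper` is a thin container that pairs the encoder and decoder export wrappers for a single BART checkpoint; `RBLNBartForConditionalGeneration.wrap_model_if_needed` (see the `modeling_bart.py` hunks below) returns it so that compilation sees one object with an `.encoder` and a `.decoder` graph. A minimal sketch of that wrapping step in isolation; the checkpoint name is only an example, and the actual tracing/compilation is driven by `RBLNModelForSeq2SeqLM` rather than invoked manually like this:

```python
# Illustration only: wrap a BART checkpoint the way wrap_model_if_needed() does.
from transformers import BartForConditionalGeneration

from optimum.rbln.transformers.models.bart.bart_architecture import BartWrapper

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
wrapped = BartWrapper(model)
# wrapped.encoder -> BartEncoderWrapper (runs the encoder plus a dummy decoder pass
#                    to pre-compute cross-attention key/values with static shapes)
# wrapped.decoder -> BartDecoderWrapper (single-step decoding over the static KV cache)
```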
optimum/rbln/transformers/models/bart/modeling_bart.py (+25 -6)
@@ -22,23 +22,25 @@
 # from Rebellions Inc.

 import inspect
-import
-from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

-from transformers import
+from transformers import BartConfig, BartForConditionalGeneration, BartModel, PretrainedConfig

 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....utils.logging import get_logger
+from ...models.seq2seq import RBLNModelForSeq2SeqLM
+from .bart_architecture import BartWrapper


-logger =
+logger = get_logger()
+

 if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel


 class RBLNBartModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
     original_model_class = BartModel
     original_config_class = BartConfig

@@ -104,3 +106,20 @@ class RBLNBartModel(RBLNModel):

         rbln_config.model_cfg.update({"max_seq_len": rbln_max_seq_len})
         return rbln_config
+
+
+class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
+    @classmethod
+    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+        return BartWrapper(model)
+
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+        val = getattr(BartForConditionalGeneration, __name)
+
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+
+        return val
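With `RBLNModelForSeq2SeqLM` as its base, BART joins the architectures that can be exported through the usual optimum-rbln entry point; the `__getattr__` redirect simply lets the RBLN class borrow `BartForConditionalGeneration`'s instance methods (generation helpers, for example) without inheriting from it. A hedged usage sketch, assuming the class is re-exported at the package root like the other RBLN model classes and that the usual `export=True` convention applies in 0.1.13:

```python
# Usage sketch (not taken from the diff): compile a BART checkpoint for RBLN NPUs,
# save the compiled artifacts, and reload them later without re-exporting.
from optimum.rbln import RBLNBartForConditionalGeneration

model = RBLNBartForConditionalGeneration.from_pretrained("facebook/bart-base", export=True)
model.save_pretrained("bart-base-rbln")

reloaded = RBLNBartForConditionalGeneration.from_pretrained("bart-base-rbln", export=False)
```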
optimum/rbln/transformers/models/bert/modeling_bert.py (+1 -2)
@@ -25,7 +25,7 @@ import inspect
 import logging
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union

-from transformers import
+from transformers import BertConfig, BertModel, PretrainedConfig

 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
@@ -38,7 +38,6 @@ if TYPE_CHECKING:


 class RBLNBertModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
     original_model_class = BertModel
     original_config_class = BertConfig

optimum/rbln/transformers/models/clip/modeling_clip.py (+13 -23)
@@ -26,8 +26,6 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

 import torch
 from transformers import (
-    AutoConfig,
-    AutoModel,
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTextModelWithProjection,
@@ -39,6 +37,7 @@ from transformers.models.clip.modeling_clip import CLIPTextModelOutput

 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....utils.context import override_auto_classes


 logger = logging.getLogger(__name__)
@@ -58,19 +57,14 @@ class _TextEncoder(torch.nn.Module):


 class RBLNCLIPTextModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
-    original_model_class = CLIPTextModel
-    original_config_class = CLIPTextConfig
-
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
- [old lines 67-72 also removed; their content is truncated in this rendering]
-        AutoModel.from_pretrained = modeltmp
+        with override_auto_classes(
+            config_func=CLIPTextConfig.from_pretrained,
+            model_func=CLIPTextModel.from_pretrained,
+            skip_taskmanager=False,
+        ):
+            rt = super().from_pretrained(*args, **kwargs)
         return rt

     @classmethod
@@ -134,18 +128,14 @@ class _VisionEncoder(torch.nn.Module):


 class RBLNCLIPVisionModel(RBLNModel):
-    original_model_class = CLIPVisionModel
-    original_config_class = CLIPVisionConfig
-
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
- [old lines 142-147 also removed; their content is truncated in this rendering]
-        AutoModel.from_pretrained = modeltmp
+        with override_auto_classes(
+            config_func=CLIPVisionConfig.from_pretrained,
+            model_func=CLIPVisionModel.from_pretrained,
+            skip_taskmanager=False,
+        ):
+            rt = super().from_pretrained(*args, **kwargs)
         return rt

     @classmethod
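Both CLIP classes now delegate the `AutoModel`/`AutoConfig` monkey-patching that 0.1.11 did inline to a reusable `override_auto_classes` context manager from the new `optimum/rbln/utils/context.py` (+58 lines, not shown in this diff). Its real body is therefore not visible here; the sketch below is only an illustrative guess at what a context manager with this call signature could look like, based on the patching pattern the old inline code used (`AutoModel.from_pretrained = ...`). The `skip_taskmanager` flag is left unused because its behaviour cannot be inferred from the hunks above.

```python
# Illustrative guess, NOT the package's implementation: temporarily route the Auto*
# loaders to model-specific ones, then restore them even if loading raises.
from contextlib import contextmanager

from transformers import AutoConfig, AutoModel


@contextmanager
def override_auto_classes(config_func, model_func, skip_taskmanager=True):
    original_config_fn = AutoConfig.from_pretrained
    original_model_fn = AutoModel.from_pretrained
    try:
        AutoConfig.from_pretrained = config_func
        AutoModel.from_pretrained = model_func
        yield
    finally:
        AutoConfig.from_pretrained = original_config_fn
        AutoModel.from_pretrained = original_model_fn
```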