optimum-rbln 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +5 -1
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/diffusers/models/autoencoder_kl.py +30 -61
- optimum/rbln/diffusers/models/controlnet.py +36 -56
- optimum/rbln/diffusers/models/unet_2d_condition.py +57 -153
- optimum/rbln/diffusers/pipelines/__init__.py +40 -12
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +7 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -190
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +8 -191
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -192
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -110
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +12 -115
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +4 -122
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +4 -125
- optimum/rbln/modeling_base.py +12 -5
- optimum/rbln/modeling_diffusers.py +400 -0
- optimum/rbln/transformers/__init__.py +2 -0
- optimum/rbln/transformers/cache_utils.py +5 -9
- optimum/rbln/transformers/modeling_rope_utils.py +283 -0
- optimum/rbln/transformers/models/__init__.py +80 -31
- optimum/rbln/transformers/models/clip/modeling_clip.py +13 -22
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +376 -218
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +74 -16
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +18 -9
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -29
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +4 -30
- optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +27 -8
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -15
- optimum/rbln/transformers/models/midm/modeling_midm.py +4 -29
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
- optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
- optimum/rbln/transformers/models/phi/phi_architecture.py +75 -159
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
- optimum/rbln/transformers/models/t5/__init__.py +1 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +57 -4
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -1
- optimum/rbln/transformers/utils/rbln_quantization.py +8 -2
- optimum/rbln/utils/context.py +58 -0
- optimum/rbln/utils/decorator_utils.py +55 -0
- optimum/rbln/utils/import_utils.py +7 -0
- optimum/rbln/utils/runtime_utils.py +4 -4
- optimum/rbln/utils/timer_utils.py +2 -2
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/METADATA +8 -7
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/RECORD +52 -48
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/modeling_rope_utils.py (new file)
@@ -0,0 +1,283 @@
+import math
+from typing import Optional, Tuple
+
+import torch
+from transformers import PretrainedConfig
+
+
+def _compute_default_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies according to the original RoPE implementation
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    # Compute the inverse frequencies
+    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+    return inv_freq, attention_factor
+
+
+def _compute_linear_scaling_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    factor = config.rope_scaling["factor"]
+
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+
+    # Then applies linear scaling to the frequencies.
+    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
+    # applying scaling to the inverse frequencies is equivalent.
+    inv_freq /= factor
+    return inv_freq, attention_factor
+
+
+def _compute_dynamic_ntk_parameters(
+    config: Optional[PretrainedConfig] = None,
+    seq_len: Optional[int] = None,
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length, used to update the dynamic RoPE at inference time.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    max_position_embeddings = config.max_position_embeddings
+    factor = config.rope_scaling["factor"]
+
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    # Process with chunk_size to reduce precesion error
+    chunk_size = 4096
+    chunks = (seq_len + chunk_size - 1) // chunk_size
+
+    inv_freq_list = []
+    for i in range(chunks):
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, seq_len)
+
+        seq_lens = torch.arange(start, end, dtype=torch.float32).view(-1, 1) + 1.0
+        seq_lens = torch.where(seq_lens > max_position_embeddings, seq_lens, max_position_embeddings)
+
+        # Compute the inverse frequencies for each chunk
+        scaled_base = base * ((factor * seq_lens / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
+        inv_freq = 1.0 / (scaled_base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+
+        inv_freq_list.append(inv_freq)
+
+    final_inv_freq = torch.cat(inv_freq_list, dim=0)
+
+    return final_inv_freq, attention_factor
+
+
+def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] = None) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Please refer to the
+    [original paper](https://arxiv.org/abs/2309.00071)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    max_position_embeddings = config.max_position_embeddings
+    factor = config.rope_scaling["factor"]
+
+    # Sets the attention factor as suggested in the paper
+    attention_factor = config.rope_scaling.get("attention_factor")
+    if attention_factor is None:
+        attention_factor = 0.1 * math.log(factor) + 1.0
+
+    # Optional config options
+    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+    beta_fast = config.rope_scaling.get("beta_fast") or 32
+    beta_slow = config.rope_scaling.get("beta_slow") or 1
+
+    # Compute the inverse frequencies
+    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+        """Inverse dimension formula to find the dimension based on the number of rotations"""
+        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+        """Find dimension range bounds based on rotations"""
+        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+        return max(low, 0), min(high, dim - 1)
+
+    def linear_ramp_factor(min, max, dim):
+        if min == max:
+            max += 0.001  # Prevent singularity
+
+        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+        ramp_func = torch.clamp(linear_func, 0, 1)
+        return ramp_func
+
+    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+    # to expand the possible context length. In other words, interpolation = apply scaling factor.
+    pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
+    inv_freq_extrapolation = 1.0 / pos_freqs
+    inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+    low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+
+    # Get n-dimensional rotational scaling corrected for extrapolation
+    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float()
+    inv_freq = (
+        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+        + inv_freq_extrapolation * inv_freq_extrapolation_factor
+    )
+
+    return inv_freq, attention_factor
+
+
+def _compute_longrope_parameters(
+    config: PretrainedConfig, seq_len: Optional[int] = None
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
+    [original implementation](https://github.com/microsoft/LongRoPE)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+
+    base = config.rope_theta
+    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
+    long_factor = config.rope_scaling["long_factor"]
+    short_factor = config.rope_scaling["short_factor"]
+    factor = config.rope_scaling.get("factor")
+    attention_factor = config.rope_scaling.get("attention_factor")
+
+    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
+    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+    # values to compute the default attention scaling factor, instead of using `factor`.
+    if hasattr(config, "original_max_position_embeddings"):
+        max_position_embeddings = config.original_max_position_embeddings
+        expanded_max_position_embeddings = config.max_position_embeddings
+        factor = expanded_max_position_embeddings / max_position_embeddings
+    else:
+        max_position_embeddings = config.max_position_embeddings
+        expanded_max_position_embeddings = max_position_embeddings * factor
+
+    # Sets the attention factor as suggested in the paper
+    if attention_factor is None:
+        if factor <= 1.0:
+            attention_factor = 1.0
+        else:
+            attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
+
+    # Compute the inverse frequencies -- scaled based on the target sequence length
+    if expanded_max_position_embeddings > max_position_embeddings:
+        ext_factors = torch.tensor(long_factor, dtype=torch.float32)
+    else:
+        ext_factors = torch.tensor(short_factor, dtype=torch.float32)
+    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
+    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
+
+    return inv_freq, attention_factor
+
+
+def _compute_llama3_parameters(
+    config: PretrainedConfig, seq_len: Optional[int] = None
+) -> Tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies for llama 3.1.
+
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+
+    factor = config.rope_scaling["factor"]  # `8` in the original implementation
+    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
+    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
+    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+
+    wavelen = 2 * math.pi / inv_freq
+    # wavelen < high_freq_wavelen: do nothing
+    # wavelen > low_freq_wavelen: divide by factor
+    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+    # otherwise: interpolate between the two, using a smooth factor
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+
+    return inv_freq_llama, attention_factor
+
+
+# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
+# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
+# parameterizations, as long as the callable has the same signature.
+ROPE_INIT_FUNCTIONS = {
+    "default": _compute_default_rope_parameters,
+    "linear": _compute_linear_scaling_rope_parameters,
+    "dynamic": _compute_dynamic_ntk_parameters,
+    "yarn": _compute_yarn_parameters,
+    "longrope": _compute_longrope_parameters,
+    "llama3": _compute_llama3_parameters,
+}
optimum/rbln/transformers/models/__init__.py
@@ -21,35 +21,84 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+from typing import TYPE_CHECKING

-from .
-... (30 more removed lines, content not captured in this diff view)
+from transformers.utils import _LazyModule
+
+
+_import_structure = {
+    "auto": [
+        "RBLNAutoModel",
+        "RBLNAutoModelForAudioClassification",
+        "RBLNAutoModelForCausalLM",
+        "RBLNAutoModelForCTC",
+        "RBLNAutoModelForDepthEstimation",
+        "RBLNAutoModelForImageClassification",
+        "RBLNAutoModelForMaskedLM",
+        "RBLNAutoModelForQuestionAnswering",
+        "RBLNAutoModelForSeq2SeqLM",
+        "RBLNAutoModelForSequenceClassification",
+        "RBLNAutoModelForSpeechSeq2Seq",
+        "RBLNAutoModelForVision2Seq",
+    ],
+    "bart": ["RBLNBartForConditionalGeneration", "RBLNBartModel"],
+    "bert": ["RBLNBertModel"],
+    "clip": ["RBLNCLIPTextModel", "RBLNCLIPTextModelWithProjection", "RBLNCLIPVisionModel"],
+    "dpt": ["RBLNDPTForDepthEstimation"],
+    "exaone": ["RBLNExaoneForCausalLM"],
+    "gemma": ["RBLNGemmaForCausalLM"],
+    "gpt2": ["RBLNGPT2LMHeadModel"],
+    "llama": ["RBLNLlamaForCausalLM"],
+    "llava_next": ["RBLNLlavaNextForConditionalGeneration"],
+    "midm": ["RBLNMidmLMHeadModel"],
+    "mistral": ["RBLNMistralForCausalLM"],
+    "phi": ["RBLNPhiForCausalLM"],
+    "qwen2": ["RBLNQwen2ForCausalLM"],
+    "t5": ["RBLNT5EncoderModel", "RBLNT5ForConditionalGeneration"],
+    "wav2vec2": ["RBLNWav2Vec2ForCTC"],
+    "whisper": ["RBLNWhisperForConditionalGeneration"],
+    "xlm_roberta": ["RBLNXLMRobertaModel"],
+}
+
+if TYPE_CHECKING:
+    from .auto import (
+        RBLNAutoModel,
+        RBLNAutoModelForAudioClassification,
+        RBLNAutoModelForCausalLM,
+        RBLNAutoModelForCTC,
+        RBLNAutoModelForDepthEstimation,
+        RBLNAutoModelForImageClassification,
+        RBLNAutoModelForMaskedLM,
+        RBLNAutoModelForQuestionAnswering,
+        RBLNAutoModelForSeq2SeqLM,
+        RBLNAutoModelForSequenceClassification,
+        RBLNAutoModelForSpeechSeq2Seq,
+        RBLNAutoModelForVision2Seq,
+    )
+    from .bart import RBLNBartForConditionalGeneration, RBLNBartModel
+    from .bert import RBLNBertModel
+    from .clip import RBLNCLIPTextModel, RBLNCLIPTextModelWithProjection, RBLNCLIPVisionModel
+    from .dpt import RBLNDPTForDepthEstimation
+    from .exaone import RBLNExaoneForCausalLM
+    from .gemma import RBLNGemmaForCausalLM
+    from .gpt2 import RBLNGPT2LMHeadModel
+    from .llama import RBLNLlamaForCausalLM
+    from .llava_next import RBLNLlavaNextForConditionalGeneration
+    from .midm import RBLNMidmLMHeadModel
+    from .mistral import RBLNMistralForCausalLM
+    from .phi import RBLNPhiForCausalLM
+    from .qwen2 import RBLNQwen2ForCausalLM
+    from .t5 import RBLNT5EncoderModel, RBLNT5ForConditionalGeneration
+    from .wav2vec2 import RBLNWav2Vec2ForCTC
+    from .whisper import RBLNWhisperForConditionalGeneration
+    from .xlm_roberta import RBLNXLMRobertaModel
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
optimum/rbln/transformers/models/clip/modeling_clip.py
@@ -26,8 +26,6 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

 import torch
 from transformers import (
-    AutoConfig,
-    AutoModel,
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTextModelWithProjection,
@@ -39,6 +37,7 @@ from transformers.models.clip.modeling_clip import CLIPTextModelOutput

 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....utils.context import override_auto_classes


 logger = logging.getLogger(__name__)
@@ -58,18 +57,14 @@ class _TextEncoder(torch.nn.Module):


 class RBLNCLIPTextModel(RBLNModel):
-    original_model_class = CLIPTextModel
-    original_config_class = CLIPTextConfig
-
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
-        ... (6 removed lines, content not captured in this diff view)
-        AutoModel.from_pretrained = modeltmp
+        with override_auto_classes(
+            config_func=CLIPTextConfig.from_pretrained,
+            model_func=CLIPTextModel.from_pretrained,
+            skip_taskmanager=False,
+        ):
+            rt = super().from_pretrained(*args, **kwargs)
         return rt

     @classmethod
@@ -133,18 +128,14 @@ class _VisionEncoder(torch.nn.Module):


 class RBLNCLIPVisionModel(RBLNModel):
-    original_model_class = CLIPVisionModel
-    original_config_class = CLIPVisionConfig
-
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
-        ... (6 removed lines, content not captured in this diff view)
-        AutoModel.from_pretrained = modeltmp
+        with override_auto_classes(
+            config_func=CLIPVisionConfig.from_pretrained,
+            model_func=CLIPVisionModel.from_pretrained,
+            skip_taskmanager=False,
+        ):
+            rt = super().from_pretrained(*args, **kwargs)
         return rt

     @classmethod