optimum-rbln 0.9.4a2__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. optimum/rbln/__init__.py +36 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +35 -16
  4. optimum/rbln/modeling_base.py +6 -6
  5. optimum/rbln/ops/__init__.py +1 -0
  6. optimum/rbln/ops/attn.py +10 -0
  7. optimum/rbln/ops/flash_attn.py +8 -0
  8. optimum/rbln/ops/moe.py +180 -0
  9. optimum/rbln/ops/sliding_window_attn.py +9 -0
  10. optimum/rbln/transformers/__init__.py +36 -0
  11. optimum/rbln/transformers/modeling_attention_utils.py +118 -222
  12. optimum/rbln/transformers/modeling_outputs.py +25 -0
  13. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  14. optimum/rbln/transformers/models/__init__.py +28 -0
  15. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  16. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  17. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  18. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
  19. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -21
  20. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  21. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  22. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +118 -16
  23. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  24. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +121 -48
  25. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
  26. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +75 -107
  27. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  28. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  29. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  30. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  31. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  32. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  33. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
  34. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +1 -1
  35. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  36. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  37. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  38. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  39. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  40. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  41. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
  43. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  44. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  45. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  46. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  47. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  48. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  49. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  50. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  51. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  52. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
  53. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  54. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  55. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  56. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  57. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  58. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  59. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  60. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
  61. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
  62. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  63. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  64. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  65. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  66. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  67. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  68. optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
  69. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  70. optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
  71. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  72. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  73. optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
  74. optimum/rbln/utils/import_utils.py +16 -1
  75. optimum/rbln/utils/runtime_utils.py +10 -6
  76. optimum/rbln/utils/submodule.py +24 -0
  77. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  78. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +81 -62
  79. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  80. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +0 -0
  81. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  82. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/modeling_rope_utils.py

@@ -27,7 +27,7 @@
  # limitations under the License.

  import math
- from typing import Optional, Tuple
+ from typing import Optional

  import torch
  from transformers import PretrainedConfig
@@ -35,13 +35,16 @@ from transformers import PretrainedConfig

  def _compute_default_rope_parameters(
      config: Optional[PretrainedConfig] = None,
+     device: Optional["torch.device"] = None,
      seq_len: Optional[int] = None,
- ) -> Tuple["torch.Tensor", float]:
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies according to the original RoPE implementation
      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
              The current sequence length. Unused for this type of RoPE.
      Returns:
@@ -50,40 +53,38 @@ def _compute_default_rope_parameters(
      """
      base = config.rope_theta
      partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-     head_dim = (
-         config.head_dim
-         if hasattr(config, "head_dim") and config.head_dim is not None
-         else config.hidden_size // config.num_attention_heads
-     )
+     head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
      dim = int(head_dim * partial_rotary_factor)

      attention_factor = 1.0  # Unused in this type of RoPE

      # Compute the inverse frequencies
-     inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+     inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
      return inv_freq, attention_factor


  def _compute_linear_scaling_rope_parameters(
      config: Optional[PretrainedConfig] = None,
+     device: Optional["torch.device"] = None,
      seq_len: Optional[int] = None,
- ) -> Tuple["torch.Tensor", float]:
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
              The current sequence length. Unused for this type of RoPE.
      Returns:
          Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
          post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
      """
-
      factor = config.rope_scaling["factor"]

      # Gets the default RoPE parameters
-     inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+     inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)

      # Then applies linear scaling to the frequencies.
      # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
@@ -94,20 +95,23 @@ _compute_linear_scaling_rope_parameters

  def _compute_dynamic_ntk_parameters(
      config: Optional[PretrainedConfig] = None,
+     device: Optional["torch.device"] = None,
      seq_len: Optional[int] = None,
- ) -> Tuple["torch.Tensor", float]:
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
              The current sequence length, used to update the dynamic RoPE at inference time.
      Returns:
          Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
          post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
      """
-
+     # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
      base = config.rope_theta
      partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
      head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -117,6 +121,17 @@ def _compute_dynamic_ntk_parameters(

      attention_factor = 1.0  # Unused in this type of RoPE

+     # seq_len: default to max_position_embeddings, e.g. at init time
+     if seq_len is None:
+         seq_len = max_position_embeddings
+     elif isinstance(seq_len, torch.Tensor):
+         seq_len = torch.maximum(
+             seq_len,
+             torch.tensor(max_position_embeddings, dtype=seq_len.dtype, device=seq_len.device),
+         )
+     else:
+         seq_len = max(seq_len, max_position_embeddings)
+
      # Process with chunk_size to reduce precesion error
      chunk_size = 4096
      chunks = (seq_len + chunk_size - 1) // chunk_size
@@ -140,13 +155,17 @@ def _compute_dynamic_ntk_parameters(
      return final_inv_freq, attention_factor


- def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] = None) -> Tuple["torch.Tensor", float]:
+ def _compute_yarn_parameters(
+     config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies with NTK scaling. Please refer to the
-     [original paper](https://arxiv.org/abs/2309.00071)
+     [original paper](https://huggingface.co/papers/2309.00071)
      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
              The current sequence length. Unused for this type of RoPE.
      Returns:
@@ -158,13 +177,25 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =
      partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
      head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
      dim = int(head_dim * partial_rotary_factor)
-     max_position_embeddings = config.max_position_embeddings
      factor = config.rope_scaling["factor"]
+     attention_factor = config.rope_scaling.get("attention_factor")
+     mscale = config.rope_scaling.get("mscale")
+     mscale_all_dim = config.rope_scaling.get("mscale_all_dim")
+     original_max_position_embeddings = (
+         config.rope_scaling.get("original_max_position_embeddings") or config.max_position_embeddings
+     )
+
+     def get_mscale(scale, mscale=1):
+         if scale <= 1:
+             return 1.0
+         return 0.1 * mscale * math.log(scale) + 1.0

      # Sets the attention factor as suggested in the paper
-     attention_factor = config.rope_scaling.get("attention_factor")
      if attention_factor is None:
-         attention_factor = 0.1 * math.log(factor) + 1.0
+         if mscale and mscale_all_dim:
+             attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
+         else:
+             attention_factor = get_mscale(factor)

      # Optional config options
      # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
@@ -176,10 +207,13 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =
          """Inverse dimension formula to find the dimension based on the number of rotations"""
          return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

-     def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+     def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings, truncate):
          """Find dimension range bounds based on rotations"""
-         low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
-         high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+         low = find_correction_dim(low_rot, dim, base, max_position_embeddings)
+         high = find_correction_dim(high_rot, dim, base, max_position_embeddings)
+         if truncate:
+             low = math.floor(low)
+             high = math.ceil(high)
          return max(low, 0), min(high, dim - 1)

      def linear_ramp_factor(min, max, dim):
@@ -192,38 +226,40 @@ def _compute_yarn_parameters(config: PretrainedConfig, seq_len: Optional[int] =

      # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
      # to expand the possible context length. In other words, interpolation = apply scaling factor.
-     pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
+     pos_freqs = base ** (torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim)
      inv_freq_extrapolation = 1.0 / pos_freqs
      inv_freq_interpolation = 1.0 / (factor * pos_freqs)

-     low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+     truncate = config.rope_scaling.get("truncate", True)
+     low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings, truncate)

      # Get n-dimensional rotational scaling corrected for extrapolation
-     inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float()
+     inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(device=device, dtype=torch.float)
      inv_freq = (
          inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
          + inv_freq_extrapolation * inv_freq_extrapolation_factor
      )
-
      return inv_freq, attention_factor


  def _compute_longrope_parameters(
-     config: PretrainedConfig, seq_len: Optional[int] = None
- ) -> Tuple["torch.Tensor", float]:
+     config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies with LongRoPE scaling. Please refer to the
      [original implementation](https://github.com/microsoft/LongRoPE)
      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
-             The current sequence length. Unused for this type of RoPE.
+             The current sequence length.
      Returns:
          Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
          post-processing scaling factor applied to the computed cos/sin.
      """
-
+     # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
      base = config.rope_theta
      partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
      head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -237,40 +273,40 @@ def _compute_longrope_parameters(
      # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
      # values to compute the default attention scaling factor, instead of using `factor`.
      if hasattr(config, "original_max_position_embeddings"):
-         max_position_embeddings = config.original_max_position_embeddings
-         expanded_max_position_embeddings = config.max_position_embeddings
-         factor = expanded_max_position_embeddings / max_position_embeddings
+         original_max_position_embeddings = config.original_max_position_embeddings
+         factor = config.max_position_embeddings / config.original_max_position_embeddings
      else:
-         max_position_embeddings = config.max_position_embeddings
-         expanded_max_position_embeddings = max_position_embeddings * factor
+         original_max_position_embeddings = config.max_position_embeddings

      # Sets the attention factor as suggested in the paper
      if attention_factor is None:
          if factor <= 1.0:
              attention_factor = 1.0
          else:
-             attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
+             attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))

      # Compute the inverse frequencies -- scaled based on the target sequence length
-     if expanded_max_position_embeddings > max_position_embeddings:
-         ext_factors = torch.tensor(long_factor, dtype=torch.float32)
+     if seq_len and seq_len > original_max_position_embeddings:
+         ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
      else:
-         ext_factors = torch.tensor(short_factor, dtype=torch.float32)
-     inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
+         ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
+     inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
      inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

      return inv_freq, attention_factor


  def _compute_llama3_parameters(
-     config: PretrainedConfig, seq_len: Optional[int] = None
- ) -> Tuple["torch.Tensor", float]:
+     config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
+ ) -> tuple["torch.Tensor", float]:
      """
      Computes the inverse frequencies for llama 3.1.

      Args:
          config ([`~transformers.PretrainedConfig`]):
              The model configuration.
+         device (`torch.device`):
+             The device to use for initialization of the inverse frequencies.
          seq_len (`int`, *optional*):
              The current sequence length. Unused for this type of RoPE.
      Returns:
@@ -278,7 +314,7 @@ def _compute_llama3_parameters(
          post-processing scaling factor applied to the computed cos/sin.
      """
      # Gets the default RoPE parameters
-     inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len)
+     inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)

      factor = config.rope_scaling["factor"]  # `8` in the original implementation
      low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
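The YaRN change above is the part most likely to affect numerics: when `rope_scaling` carries `mscale`/`mscale_all_dim` entries (as found in e.g. DeepSeek-style configs), the attention factor becomes a ratio of two mscale corrections instead of the plain `0.1 * ln(factor) + 1.0`; with the default `mscale=1`, `get_mscale(factor)` reduces to the old formula. A standalone sketch of that selection logic, with made-up `rope_scaling` values for illustration:

```python
import math

# Reproduces the attention-factor selection from the hunk above.
# The rope_scaling values are hypothetical; real ones come from the model config.
def get_mscale(scale, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

rope_scaling = {"factor": 4.0, "mscale": 1.0, "mscale_all_dim": 0.8, "attention_factor": None}

factor = rope_scaling["factor"]
attention_factor = rope_scaling.get("attention_factor")
mscale = rope_scaling.get("mscale")
mscale_all_dim = rope_scaling.get("mscale_all_dim")

if attention_factor is None:
    if mscale and mscale_all_dim:
        # DeepSeek-style YaRN: ratio of the two mscale corrections
        attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
    else:
        # Plain YaRN default, equivalent to 0.1 * ln(factor) + 1.0
        attention_factor = get_mscale(factor)

print(attention_factor)
```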
optimum/rbln/transformers/models/__init__.py

@@ -88,12 +88,16 @@ _import_structure = {
          "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
          "RBLNQwen2_5_VLForConditionalGeneration",
          "RBLNQwen2_5_VLForConditionalGenerationConfig",
+         "RBLNQwen2_5_VLModel",
+         "RBLNQwen2_5_VLModelConfig",
      ],
      "qwen2_vl": [
          "RBLNQwen2VisionTransformerPretrainedModel",
          "RBLNQwen2VisionTransformerPretrainedModelConfig",
          "RBLNQwen2VLForConditionalGeneration",
          "RBLNQwen2VLForConditionalGenerationConfig",
+         "RBLNQwen2VLModel",
+         "RBLNQwen2VLModelConfig",
      ],
      "decoderonly": [
          "RBLNDecoderOnlyModelConfig",
@@ -110,12 +114,14 @@ _import_structure = {
      ],
      "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
      "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig", "RBLNGemmaModel", "RBLNGemmaModelConfig"],
+     "gemma2": ["RBLNGemma2ForCausalLM", "RBLNGemma2ForCausalLMConfig", "RBLNGemma2Model", "RBLNGemma2ModelConfig"],
      "gemma3": [
          "RBLNGemma3ForCausalLM",
          "RBLNGemma3ForCausalLMConfig",
          "RBLNGemma3ForConditionalGeneration",
          "RBLNGemma3ForConditionalGenerationConfig",
      ],
+     "gpt_oss": ["RBLNGptOssForCausalLM", "RBLNGptOssForCausalLMConfig"],
      "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig", "RBLNGPT2Model", "RBLNGPT2ModelConfig"],
      "idefics3": [
          "RBLNIdefics3VisionTransformer",
@@ -132,6 +138,12 @@ _import_structure = {
          "RBLNPegasusForConditionalGenerationConfig",
          "RBLNPegasusModelConfig",
      ],
+     "paligemma": [
+         "RBLNPaliGemmaForConditionalGeneration",
+         "RBLNPaliGemmaForConditionalGenerationConfig",
+         "RBLNPaliGemmaModel",
+         "RBLNPaliGemmaModelConfig",
+     ],
      "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
      "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
      "pixtral": ["RBLNPixtralVisionModel", "RBLNPixtralVisionModelConfig"],
@@ -143,7 +155,9 @@ _import_structure = {
      ],
      "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig", "RBLNPhiModel", "RBLNPhiModelConfig"],
      "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig", "RBLNQwen2Model", "RBLNQwen2ModelConfig"],
+     "qwen2_moe": ["RBLNQwen2MoeForCausalLM", "RBLNQwen2MoeForCausalLMConfig"],
      "qwen3": ["RBLNQwen3ForCausalLM", "RBLNQwen3ForCausalLMConfig", "RBLNQwen3Model", "RBLNQwen3ModelConfig"],
+     "qwen3_moe": ["RBLNQwen3MoeForCausalLM", "RBLNQwen3MoeForCausalLMConfig"],
      "resnet": ["RBLNResNetForImageClassification", "RBLNResNetForImageClassificationConfig"],
      "roberta": [
          "RBLNRobertaForMaskedLM",
@@ -254,6 +268,7 @@ if TYPE_CHECKING:
      from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
      from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
      from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig, RBLNGemmaModel, RBLNGemmaModelConfig
+     from .gemma2 import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig, RBLNGemma2Model, RBLNGemma2ModelConfig
      from .gemma3 import (
          RBLNGemma3ForCausalLM,
          RBLNGemma3ForCausalLMConfig,
@@ -261,6 +276,7 @@ if TYPE_CHECKING:
          RBLNGemma3ForConditionalGenerationConfig,
      )
      from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig, RBLNGPT2Model, RBLNGPT2ModelConfig
+     from .gpt_oss import RBLNGptOssForCausalLM, RBLNGptOssForCausalLMConfig
      from .grounding_dino import (
          RBLNGroundingDinoDecoder,
          RBLNGroundingDinoDecoderConfig,
@@ -281,6 +297,12 @@ if TYPE_CHECKING:
      from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
      from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
      from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
+     from .paligemma import (
+         RBLNPaliGemmaForConditionalGeneration,
+         RBLNPaliGemmaForConditionalGenerationConfig,
+         RBLNPaliGemmaModel,
+         RBLNPaliGemmaModelConfig,
+     )
      from .pegasus import (
          RBLNPegasusForConditionalGeneration,
          RBLNPegasusForConditionalGenerationConfig,
@@ -295,14 +317,20 @@ if TYPE_CHECKING:
          RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
          RBLNQwen2_5_VLForConditionalGeneration,
          RBLNQwen2_5_VLForConditionalGenerationConfig,
+         RBLNQwen2_5_VLModel,
+         RBLNQwen2_5_VLModelConfig,
      )
+     from .qwen2_moe import RBLNQwen2MoeForCausalLM, RBLNQwen2MoeForCausalLMConfig
      from .qwen2_vl import (
          RBLNQwen2VisionTransformerPretrainedModel,
          RBLNQwen2VisionTransformerPretrainedModelConfig,
          RBLNQwen2VLForConditionalGeneration,
          RBLNQwen2VLForConditionalGenerationConfig,
+         RBLNQwen2VLModel,
+         RBLNQwen2VLModelConfig,
      )
      from .qwen3 import RBLNQwen3ForCausalLM, RBLNQwen3ForCausalLMConfig, RBLNQwen3Model, RBLNQwen3ModelConfig
+     from .qwen3_moe import RBLNQwen3MoeForCausalLM, RBLNQwen3MoeForCausalLMConfig
      from .resnet import RBLNResNetForImageClassification, RBLNResNetForImageClassificationConfig
      from .roberta import (
          RBLNRobertaForMaskedLM,
optimum/rbln/transformers/models/bart/bart_architecture.py

@@ -60,10 +60,10 @@ class BartForConditionalGeneration(Seq2SeqForConditionalGeneration):
  class BartDecoder(Seq2SeqDecoder):
      has_pos_emb = True

-     def __post_init__(self):
-         self.embed_positions = self._original_mod.embed_positions
-         self.layernorm_embedding = self._original_mod.layernorm_embedding
-         self.embed_scale = getattr(self._original_mod, "embed_scale", None)
+     def __post_init__(self, model: nn.Module):
+         self.embed_positions = model.embed_positions
+         self.layernorm_embedding = model.layernorm_embedding
+         self.embed_scale = getattr(model, "embed_scale", None)

      def prepare_attn_mask(self, attention_mask, encoder_attention_mask, **kwargs):
          if attention_mask is not None:
@@ -112,11 +112,11 @@ class BartLayerFF(nn.Module):


  class BartDecoderLayer(Seq2SeqDecoderLayer):
-     def __post_init__(self):
-         self.self_attn_layer_norm = self._original_mod.self_attn_layer_norm
-         self.encoder_attn = self._original_mod.encoder_attn
-         self.encoder_attn_layer_norm = self._original_mod.encoder_attn_layer_norm
-         self.ff_layer = BartLayerFF(self._original_mod)
+     def __post_init__(self, decoder_layer: nn.Module):
+         self.self_attn_layer_norm = decoder_layer.self_attn_layer_norm
+         self.encoder_attn = decoder_layer.encoder_attn
+         self.encoder_attn_layer_norm = decoder_layer.encoder_attn_layer_norm
+         self.ff_layer = BartLayerFF(decoder_layer)

      def pre_self_attn_layer_norm(self, hidden_states):
          return hidden_states
@@ -132,13 +132,13 @@ class BartDecoderLayer(Seq2SeqDecoderLayer):


  class BartSelfAttention(Seq2SeqSelfAttention):
-     def __post_init__(self, use_attention_mask: bool = True):
-         self.q_proj = self._original_mod.q_proj
-         self.k_proj = self._original_mod.k_proj
-         self.v_proj = self._original_mod.v_proj
-         self.out_proj = self._original_mod.out_proj
-         self.num_heads = self._original_mod.num_heads
-         self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
+     def __post_init__(self, attn: nn.Module, use_attention_mask: bool = True):
+         self.q_proj = attn.q_proj
+         self.k_proj = attn.k_proj
+         self.v_proj = attn.v_proj
+         self.out_proj = attn.out_proj
+         self.num_heads = attn.num_heads
+         self.head_dim = attn.embed_dim // attn.num_heads
          self.scaling = self.head_dim**-0.5
          if use_attention_mask:
              self.attn_decode = torch.ops.rbln_custom_ops.paged_attn_decode
@@ -153,11 +153,11 @@ class BartSelfAttention(Seq2SeqSelfAttention):


  class BartCrossAttention(Seq2SeqCrossAttention):
-     def __post_init__(self):
-         self.q_proj = self._original_mod.q_proj
-         self.k_proj = self._original_mod.k_proj
-         self.v_proj = self._original_mod.v_proj
-         self.out_proj = self._original_mod.out_proj
-         self.num_heads = self._original_mod.num_heads
-         self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
-         self.embed_dim = self._original_mod.embed_dim
+     def __post_init__(self, attn: nn.Module):
+         self.q_proj = attn.q_proj
+         self.k_proj = attn.k_proj
+         self.v_proj = attn.v_proj
+         self.out_proj = attn.out_proj
+         self.num_heads = attn.num_heads
+         self.head_dim = attn.embed_dim // attn.num_heads
+         self.embed_dim = attn.embed_dim
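The common thread in this file is that `__post_init__` now receives the wrapped Hugging Face module as an argument instead of reading a stored `self._original_mod`, so each wrapper copies out only the submodules it needs. An illustrative sketch of the pattern; `WrapperBase` and `DummyAttention` are hypothetical stand-ins, not the `Seq2Seq*` base classes from optimum-rbln:

```python
import torch.nn as nn

# Sketch of the refactor pattern: the base class forwards the original module
# to __post_init__, and the wrapper never stores the module itself.
class WrapperBase(nn.Module):
    def __init__(self, original: nn.Module, *args, **kwargs):
        super().__init__()
        self.__post_init__(original, *args, **kwargs)

    def __post_init__(self, original: nn.Module, *args, **kwargs):
        raise NotImplementedError


class SelfAttentionWrapper(WrapperBase):
    def __post_init__(self, attn: nn.Module, use_attention_mask: bool = True):
        # Keep references to the projections; the original module is not retained.
        self.q_proj = attn.q_proj
        self.k_proj = attn.k_proj
        self.v_proj = attn.v_proj
        self.out_proj = attn.out_proj
        self.num_heads = attn.num_heads
        self.head_dim = attn.embed_dim // attn.num_heads


class DummyAttention(nn.Module):
    def __init__(self, embed_dim: int = 8, num_heads: int = 2):
        super().__init__()
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.num_heads = num_heads
        self.embed_dim = embed_dim


wrapper = SelfAttentionWrapper(DummyAttention())
print(wrapper.head_dim)  # 4
```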
optimum/rbln/transformers/models/colpali/colpali_architecture.py

@@ -77,11 +77,11 @@ class ColPaliModel(nn.Module):
          self, model, layers: List["ColPaliLayer"], output_hidden_states: bool = False, max_seq_len: int = 2048
      ):
          super().__init__()
-         self._original_mod = model
          self.layers = nn.ModuleList(layers)
          self.output_hidden_states = output_hidden_states
-         self.norm = self._original_mod.norm
-         self.hidden_size = self._original_mod.config.hidden_size
+         self.config = model.config
+         self.norm = model.norm
+         self.hidden_size = self.config.hidden_size
          self.max_seq_len = max_seq_len

      def forward(
@@ -118,7 +118,6 @@ class ColPaliModel(nn.Module):
  class ColPaliLayer(nn.Module):
      def __init__(self, layer, self_attn: "ColPaliAttention"):
          super().__init__()
-         self._original_mod = layer
          self.self_attn = self_attn
          self.mlp = layer.mlp
          self.input_layernorm = layer.input_layernorm
@@ -155,27 +154,22 @@ class ColPaliLayer(nn.Module):
  class ColPaliAttention(nn.Module):
      def __init__(self, self_attn):
          super().__init__()
-         self._original_mod = self_attn
-         self.num_heads = (
-             getattr(self._original_mod, "num_heads", None) or self._original_mod.config.num_attention_heads
-         )
-         self.head_dim = self._original_mod.head_dim
+         self.config = self_attn.config
+         self.num_heads = getattr(self_attn, "num_heads", None) or self_attn.config.num_attention_heads
+         self.head_dim = self_attn.head_dim
          self.scaling = self.head_dim**-0.5

-         if hasattr(self._original_mod, "num_key_value_heads"):
-             self.num_key_value_heads = self._original_mod.num_key_value_heads
-         elif hasattr(self._original_mod, "config") and hasattr(self._original_mod.config, "num_key_value_heads"):
-             self.num_key_value_heads = self._original_mod.config.num_key_value_heads
+         if hasattr(self_attn, "num_key_value_heads"):
+             self.num_key_value_heads = self_attn.num_key_value_heads
+         elif hasattr(self_attn, "config") and hasattr(self_attn.config, "num_key_value_heads"):
+             self.num_key_value_heads = self_attn.config.num_key_value_heads
          else:
              self.num_key_value_heads = self.num_heads

-         self.__post_init__()
-
-     def __post_init__(self):
-         self.q_proj = self._original_mod.q_proj
-         self.k_proj = self._original_mod.k_proj
-         self.v_proj = self._original_mod.v_proj
-         self.o_proj = self._original_mod.o_proj
+         self.q_proj = self_attn.q_proj
+         self.k_proj = self_attn.k_proj
+         self.v_proj = self_attn.v_proj
+         self.o_proj = self_attn.o_proj

      def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
          query_states = self.q_proj(hidden_states)
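One plausible motivation for dropping `self._original_mod` here (not stated in the diff itself) is module registration: assigning an `nn.Module` attribute registers it as a submodule, so the wrapper would otherwise carry every parameter of the original model. A small sketch of that PyTorch behavior; the two classes below are illustrative, not from the package:

```python
import torch.nn as nn

class Keeps(nn.Module):
    def __init__(self, original: nn.Module):
        super().__init__()
        self._original_mod = original          # registers the whole original module


class CopiesOnlyWhatItNeeds(nn.Module):
    def __init__(self, original: nn.Module):
        super().__init__()
        self.config = getattr(original, "config", None)  # plain attribute, no parameters
        self.norm = original.norm                        # only the submodule actually used


orig = nn.Sequential(nn.Linear(4, 4))
orig.norm = nn.LayerNorm(4)

print(sum(p.numel() for p in Keeps(orig).parameters()))                 # includes the Linear weights
print(sum(p.numel() for p in CopiesOnlyWhatItNeeds(orig).parameters()))  # only the LayerNorm
```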
optimum/rbln/transformers/models/colpali/configuration_colpali.py

@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from typing import Any, List, Optional, Union
+ from typing import Any, Optional

  from ....configuration_utils import RBLNModelConfig
  from ....utils.logging import get_logger
@@ -33,7 +33,9 @@ class RBLNColPaliForRetrievalConfig(RBLNModelConfig):

      # Create a configuration object
      config = RBLNColPaliForRetrievalConfig(
-         max_seq_lens=1152,
+         vlm={
+             "language_model": {"prefill_chunk_size": 8192},
+         }
          output_hidden_states=False,
          tensor_parallel_size=4
      )
@@ -47,24 +49,21 @@ class RBLNColPaliForRetrievalConfig(RBLNModelConfig):
      ```
      """

-     submodules = ["vision_tower"]
+     _allow_no_compile_cfgs = True
+     submodules = ["vlm"]

      def __init__(
          self,
          batch_size: Optional[int] = None,
-         max_seq_lens: Union[int, List[int]] = None,
+         vlm: Optional[RBLNModelConfig] = None,
          output_hidden_states: Optional[bool] = None,
-         vision_tower: Optional[RBLNModelConfig] = None,
          **kwargs: Any,
      ):
          """
          Args:
              batch_size (Optional[int]): The batch size for the model.
-             vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
-             max_seq_lens (Union[int, List[int]]): The maximum sequence lengths for the language model.
-                 This can be multiple values, and the model will be compiled for each max_seq_len, allowing selection of the most appropriate max_seq_len at inference time.
-             output_hidden_states (Optional[bool]): Whether to output the hidden states of the language model.
-             vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+             vlm (Optional[RBLNModelConfig]): Configuration for the VLM component.
+             output_hidden_states (Optional[bool]): Whether to output the hidden states of the decoder. Defaults to False.
              kwargs: Additional arguments passed to the parent RBLNModelConfig.
          Raises:
              ValueError: If batch_size is not a positive integer.
@@ -74,11 +73,7 @@ class RBLNColPaliForRetrievalConfig(RBLNModelConfig):
          if not isinstance(self.batch_size, int) or self.batch_size < 0:
              raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")

-         if self.batch_size != 1:
-             logger.warning("Ignore batch_size for ColPali vision tower. It will be set to 1.")
-
-         self.vision_tower = self.initialize_submodule_config(
-             submodule_config=vision_tower, batch_size=1, force_kwargs=True
+         self.output_hidden_states = output_hidden_states or False
+         self.vlm = self.initialize_submodule_config(
+             submodule_config=vlm, batch_size=batch_size, output_hidden_states=output_hidden_states
          )
-         self.max_seq_lens = max_seq_lens
-         self.output_hidden_states = output_hidden_states
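For reference, the reworked config replaces `max_seq_lens` and `vision_tower` with a single `vlm` submodule config, mirroring the updated docstring example. A sketch with illustrative values; the import path is the module shown in this diff, and `tensor_parallel_size=4` is just an example number:

```python
# Sketch of the 0.9.5-style ColPali retrieval config; values are illustrative.
from optimum.rbln.transformers.models.colpali.configuration_colpali import (
    RBLNColPaliForRetrievalConfig,
)

config = RBLNColPaliForRetrievalConfig(
    vlm={
        "language_model": {"prefill_chunk_size": 8192},  # per-submodule overrides, as in the docstring example
    },
    output_hidden_states=False,
    tensor_parallel_size=4,
)
```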