optimum-rbln 0.1.12__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. optimum/rbln/__init__.py +27 -13
  2. optimum/rbln/__version__.py +16 -1
  3. optimum/rbln/diffusers/__init__.py +22 -2
  4. optimum/rbln/diffusers/models/__init__.py +34 -3
  5. optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
  6. optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +66 -111
  7. optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
  8. optimum/rbln/diffusers/models/controlnet.py +85 -65
  9. optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
  10. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
  11. optimum/rbln/diffusers/models/unets/__init__.py +24 -0
  12. optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +129 -163
  13. optimum/rbln/diffusers/pipelines/__init__.py +60 -12
  14. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -25
  15. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
  16. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -190
  17. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -191
  18. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -192
  19. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  20. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -110
  21. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -118
  22. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
  23. optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
  24. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
  25. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
  26. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
  27. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
  28. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +18 -128
  29. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -131
  30. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
  31. optimum/rbln/modeling.py +572 -0
  32. optimum/rbln/modeling_alias.py +1 -1
  33. optimum/rbln/modeling_base.py +176 -763
  34. optimum/rbln/modeling_diffusers.py +329 -0
  35. optimum/rbln/transformers/__init__.py +2 -2
  36. optimum/rbln/transformers/cache_utils.py +5 -9
  37. optimum/rbln/transformers/modeling_rope_utils.py +283 -0
  38. optimum/rbln/transformers/models/__init__.py +80 -31
  39. optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
  40. optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
  41. optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
  42. optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
  43. optimum/rbln/transformers/models/clip/modeling_clip.py +8 -34
  44. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -5
  45. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +779 -361
  46. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +83 -142
  47. optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
  48. optimum/rbln/transformers/models/exaone/exaone_architecture.py +64 -39
  49. optimum/rbln/transformers/models/exaone/modeling_exaone.py +6 -29
  50. optimum/rbln/transformers/models/gemma/gemma_architecture.py +31 -92
  51. optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
  52. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
  53. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +6 -31
  54. optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
  55. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +29 -83
  56. optimum/rbln/transformers/models/midm/midm_architecture.py +88 -253
  57. optimum/rbln/transformers/models/midm/modeling_midm.py +8 -33
  58. optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
  59. optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
  60. optimum/rbln/transformers/models/phi/phi_architecture.py +61 -345
  61. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
  62. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
  63. optimum/rbln/transformers/models/t5/__init__.py +1 -1
  64. optimum/rbln/transformers/models/t5/modeling_t5.py +157 -6
  65. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  66. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  67. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
  68. optimum/rbln/transformers/utils/rbln_quantization.py +128 -5
  69. optimum/rbln/utils/decorator_utils.py +59 -0
  70. optimum/rbln/utils/hub.py +131 -0
  71. optimum/rbln/utils/import_utils.py +21 -0
  72. optimum/rbln/utils/model_utils.py +53 -0
  73. optimum/rbln/utils/runtime_utils.py +5 -5
  74. optimum/rbln/utils/submodule.py +114 -0
  75. optimum/rbln/utils/timer_utils.py +2 -2
  76. optimum_rbln-0.1.15.dist-info/METADATA +106 -0
  77. optimum_rbln-0.1.15.dist-info/RECORD +110 -0
  78. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
  79. optimum/rbln/transformers/generation/streamers.py +0 -139
  80. optimum/rbln/transformers/generation/utils.py +0 -397
  81. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
  82. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
  83. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
  84. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
  85. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
  86. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
  87. optimum_rbln-0.1.12.dist-info/METADATA +0 -119
  88. optimum_rbln-0.1.12.dist-info/RECORD +0 -103
  89. optimum_rbln-0.1.12.dist-info/entry_points.txt +0 -4
  90. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
@@ -20,26 +20,24 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
-import functools
-import glob
-import os
-from abc import ABC
+import inspect
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

-import rebel  # noqa: F401
-import torch  # noqa: F401
-from safetensors.torch import load_file
+import rebel
+import torch
 from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import no_init_weights
 from transformers.utils import ModelOutput

-from ....modeling_base import RBLNModel
+from ....modeling import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
 from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from ....utils.timer_utils import rbln_timer
+from ...utils.rbln_quantization import QuantizationManager
+from .decoderonly_architecture import DecoderOnlyWrapper


 logger = get_logger()
@@ -52,12 +50,6 @@ if TYPE_CHECKING:
         PretrainedConfig,
     )

-SUPPORTED_QUANTIZATIONS = {
-    "rbln": [
-        "w4a16",
-    ],
-}
-

 class RBLNRuntimeModel(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name", "embed_tokens"]
@@ -102,19 +94,30 @@ class RBLNDecoderOnlyOutput(ModelOutput):
     generate_idx: torch.Tensor = None


-class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
+class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     """
-    The DecoderOnly Model transformer with a language modeling head (linear layer) on top.
-    This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
-
-    A class to convert and run pre-trained transformers based DecoderOnlyForCausalLM model on RBLN devices.
-    It implements the methods to convert a pre-trained transformers DecoderOnlyForCausalLM model into a RBLN transformer model by:
-    - transferring the checkpoint weights of the original into an optimized RBLN graph,
-    - compiling the resulting graph using the RBLN compiler.
+    A base class for decoder-only transformer models optimized for causal language modeling tasks on RBLN devices.
+    This class serves as the foundation for various decoder-only architectures like GPT, LLaMA, etc.
+
+    The class provides core functionality for:
+    1. Converting pre-trained transformer models to RBLN-optimized format
+    2. Handling the compilation process for RBLN devices
+    3. Managing inference operations for causal language modeling
+
+    This class inherits from RBLNModel and implements specific methods required for
+    decoder-only architectures and causal language modeling tasks.
+
+    Note:
+    - This class is designed to be subclassed by specific model implementations
+      (e.g., RBLNLlamaForCausalLM, RBLNGPT2LMHeadModel)
+    - Subclasses should implement model-specific conversion logic.
+    - The class handles RBLN-specific optimizations automatically during compilation
     """

     main_input_name = "input_ids"
     auto_model_class = AutoModelForCausalLM
+    _decoder_wrapper_cls = DecoderOnlyWrapper
+    _use_rotary_emb = True

     def __post_init__(self, **kwargs):
         self.batch_size = self.rbln_config.model_cfg["batch_size"]
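Note: the new docstring frames this class as the shared base for all decoder-only ports; the user-facing entry point remains a concrete subclass's `from_pretrained`. A typical invocation looks roughly like this (model id and kwarg values are illustrative, not taken from this diff; the `rbln_*` kwargs map onto the `max_seq_len`/`batch_size` entries read by `_get_rbln_config` later in this file):

    from optimum.rbln import RBLNLlamaForCausalLM

    # Convert + compile a HF checkpoint for RBLN devices, then use it like a
    # regular transformers causal LM.
    model = RBLNLlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint
        export=True,                 # trace, compile, and cache the RBLN graph
        rbln_max_seq_len=4096,
        rbln_batch_size=1,
    )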
@@ -173,6 +176,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
     def get_quantized_model(
         cls,
         model_id: str,
+        config: Optional[PretrainedConfig] = None,
         use_auth_token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -182,56 +186,47 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         trust_remote_code: bool = False,
         **kwargs,
     ):
-        from ...utils.rbln_quantization import update_layers_to_quantized
+        from ...utils.rbln_quantization import prepare_model_for_quantization

         kwargs = cls.update_kwargs(kwargs)

-        config = AutoConfig.from_pretrained(
-            model_id,
-            use_auth_token=use_auth_token,
-            revision=revision,
-            force_download=force_download,
-            cache_dir=cache_dir,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
+        if config is None:
+            config = AutoConfig.from_pretrained(
+                model_id,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                force_download=force_download,
+                cache_dir=cache_dir,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )

         with no_init_weights():
             model = AutoModelForCausalLM.from_config(config)

-        update_layers_to_quantized(model)
-
-        n_layer = kwargs.get("num_hidden_layers", None)
-        cls._load_weights_directly_to_model(model, model_id, n_layer)
+        prepare_model_for_quantization(model, model_id, kwargs.get("num_hidden_layers"))

         return model

-    def _load_weights_directly_to_model(model, model_id, n_layer=None):
+    def __getattr__(self, __name: str) -> Any:
         """
-        Load safetensor file data directly into the model, filtering by layer if n_layer is provided.
+        Special method to delegate attribute access to the original Huggingface LM class.
+        This method is called when an attribute is not found in the current instance's dictionary.
+        It enables transparent access to the original model's attributes and methods while maintaining
+        proper method binding.
+
+        The method implements a delegation pattern that:
+        1. For methods: Creates a wrapper that properly binds 'self' to method calls
+        2. For other attributes: Returns them directly from the original class
         """

-        model_params = dict(model.named_parameters(recurse=True))
-        model_buffers = dict(model.named_buffers(recurse=True))
-        safetensor_files = glob.glob(f"{model_id}/*.safetensors")
-
-        target_layers = list(range(n_layer)) if n_layer is not None else None
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)

-        for safetensor_file in safetensor_files:
-            file_data = load_file(safetensor_file)
-            for key, value in file_data.items():
-                if target_layers is not None:
-                    parts = key.split(".")
-
-                    if len(parts) > 2 and parts[2].isdigit() and (int(parts[2]) not in target_layers):
-                        continue
-
-                if key in model_params:
-                    model_params[key].data.copy_(value)
-                elif key in model_buffers:
-                    model_buffers[key].data.copy_(value)
-
-        return 0
+        val = getattr(self.hf_class, __name, None) or getattr(PreTrainedModel, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

     @classmethod
     def get_pytorch_model(cls, *args, **kwargs) -> "PreTrainedModel":
@@ -245,53 +240,17 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):

         return model

-    def validate_quantization_config(quantize_config):
-        if quantize_config is not None:
-            q_format = quantize_config.get("format")
-            q_precision = quantize_config.get("precision")
-
-            if q_format not in SUPPORTED_QUANTIZATIONS:
-                raise ValueError(
-                    f"Invalid quantization format: {q_format}. "
-                    f"Supported formats are: {list(SUPPORTED_QUANTIZATIONS.keys())}"
-                )
-
-            if q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
-                raise ValueError(
-                    f"Invalid precision: {q_precision} for format: {q_format}. "
-                    f"Supported precisions are: {SUPPORTED_QUANTIZATIONS[q_format]}"
-                )
-
-        return quantize_config
-
     @classmethod
-    def set_quantize_env(cls, quantize_config):
-        RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"
-        quantize_config = cls.validate_quantization_config(quantize_config)
-        if quantize_config is not None:
-            q_precision = quantize_config.get("precision")
-            quant_bits = q_precision.split("w")[1].split("a")[0]
-            os.environ[RBLN_QUANT_BITS_ENV] = quant_bits
-            return RBLN_QUANT_BITS_ENV
-        return None
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+        wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}

-    @classmethod
-    def reset_quantize_env(cls, env_var_name):
-        if env_var_name is not None and env_var_name in os.environ:
-            del os.environ[env_var_name]
+        # If the model wrapper supports rbln-custom-flash-attention
+        if "kvcache_partition_len" in inspect.signature(cls._decoder_wrapper_cls.__init__).parameters:
+            wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")

-    @classmethod
-    def manage_quantize_env(cls, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            quantize_config = kwargs.get("quantize_config")
-            quantize_env_var = cls.set_quantize_env(quantize_config)
-            try:
-                return func(*args, **kwargs)
-            finally:
-                cls.reset_quantize_env(quantize_env_var)
-
-        return wrapper
+        wrapper_cfg["use_rotary_emb"] = cls._use_rotary_emb
+
+        return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()

     @classmethod
     @torch.inference_mode()
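Note: `wrap_model_if_needed` now feature-detects the wrapper instead of requiring every subclass to override it: the flash-attention knob is forwarded only when the wrapper's `__init__` declares it. The idiom in isolation (the wrapper classes here are hypothetical):

    import inspect

    class BasicWrapper:
        def __init__(self, model, max_seq_len, use_rotary_emb=True): ...

    class FlashAttnWrapper:
        def __init__(self, model, max_seq_len, use_rotary_emb=True, kvcache_partition_len=None): ...

    def build(wrapper_cls, model, cfg):
        kwargs = {"max_seq_len": cfg["max_seq_len"]}
        # Pass the partition length only to wrappers that accept it.
        if "kvcache_partition_len" in inspect.signature(wrapper_cls.__init__).parameters:
            kwargs["kvcache_partition_len"] = cfg.get("kvcache_partition_len")
        return wrapper_cls(model, **kwargs)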
@@ -305,15 +264,15 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         @rbln_timer("JIT trace")
         def get_scripted_model():
             # This function is nested to dealloc the example inputs before compilation.
+            # FIXME: 3rd dummy_input(batch_idx) should be fill zero to compile flash_attn.
             prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=4)
-
-            batch_index = 3
-            dec_example_inputs[batch_index].fill_(-1)  # fill batch_position -1 to indicate it is decoder.
+            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)

+            wrapped_model.phase = "prefill"
             prefill_scripted_model = torch.jit.trace(
                 wrapped_model, prefill_example_inputs, check_trace=False, _store_inputs=False
             )
+            wrapped_model.phase = "decode"
             dec_scripted_model = torch.jit.trace(
                 wrapped_model, dec_example_inputs, check_trace=False, _store_inputs=False
             )
@@ -336,6 +295,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         prefill_ir, dec_ir = scripted_model_to_ir()
         # Caching prefill_decoder/decoder I/O
         cache_index_offset = 5
+
         connections = [
             (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
             for i in range(model.config.num_hidden_layers * 2)
@@ -344,7 +304,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         # Extract quantize_config from rbln_config
         quantize_config = rbln_config.model_cfg.get("quantization", None)

-        @cls.manage_quantize_env
+        @QuantizationManager.with_quantization_env
         def compile_model(*args, **kwargs):
             # Remove quantize_config from kwargs
             kwargs.pop("quantize_config", None)
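Note: `@QuantizationManager.with_quantization_env` replaces the removed `manage_quantize_env`/`set_quantize_env`/`reset_quantize_env` trio. Judging from the deleted code, the decorator's contract is roughly this set-call-restore wrapper (a sketch of the expected behavior, not the actual `QuantizationManager` implementation):

    import functools
    import os

    RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"

    def with_quantization_env(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            quantize_config = kwargs.get("quantize_config")
            if quantize_config is not None:
                # Precision "w4a16" encodes 4-bit weights; export the weight bits.
                bits = quantize_config["precision"].split("w")[1].split("a")[0]
                os.environ[RBLN_QUANT_BITS_ENV] = bits
            try:
                return func(*args, **kwargs)
            finally:
                # Always restore the environment, even if compilation fails.
                os.environ.pop(RBLN_QUANT_BITS_ENV, None)
        return wrapper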
@@ -374,10 +334,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
     ) -> RBLNConfig:
         rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
         rbln_batch_size = rbln_kwargs.get("batch_size", None)
-        rbln_quantization = rbln_kwargs.get("quantization", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
-
-        rbln_quantization = cls.validate_quantization_config(rbln_quantization)
+        rbln_quantization = QuantizationManager.validate_quantization_config(rbln_kwargs.get("quantization", None))

        prefill_chunk_size = 128
        if rbln_max_seq_len is None:
@@ -552,8 +510,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         cache_position: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         generate_idx: Optional[torch.Tensor] = None,
-        # from llava_next forward args
-        batch_idx: Optional[int] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
         # prefll
@@ -575,7 +531,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
                     input_ids=input_tensor if inputs_embeds is None else None,
                     inputs_embeds=input_tensor if inputs_embeds is not None else None,
                     cache_position=cache_position,
-                    batch_idx=b_idx if batch_idx is None else batch_idx,  # Llava-next prefill
+                    batch_idx=b_idx,
                 )
                 logits.append(logit)
             logits = torch.cat(logits, dim=0)
@@ -676,11 +632,24 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         cache_position: torch.Tensor = None,
     ) -> torch.FloatTensor:
         input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
+        if input_tensors is None:
+            raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")

         batch_size = input_tensors.shape[0]
+        if batch_size != self.batch_size:
+            raise RuntimeError(
+                f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
+            )
+
+        if batch_size != cache_position.shape[0]:
+            raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")

         for b_idx in range(batch_size):
             decoding_step = cache_position[b_idx].item()
+            if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
+                raise ValueError(
+                    f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
+                )
             self.dec_attn_mask[b_idx, :, :, decoding_step] = 1

         logits, _ = self.decoder(
@@ -693,31 +662,3 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         )

         return logits
-
-    def vllm_forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        inputs_embeds: torch.Tensor = None,
-        cache_position: torch.Tensor = None,
-        batch_idx: Optional[int] = None,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor]:
-        # prefll
-        if cache_position.shape[-1] > 1:
-            logits = self._forward_prefill(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-                batch_idx=batch_idx,
-            )
-        # decoder
-        else:
-            logits = self._forward_decoder(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-            )
-
-        return RBLNDecoderOnlyOutput(
-            logits=logits,
-        )
optimum/rbln/transformers/models/dpt/modeling_dpt.py
@@ -27,7 +27,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Union
 from transformers import AutoModelForDepthEstimation
 from transformers.modeling_outputs import DepthEstimatorOutput

-from ....modeling_base import RBLNModel
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig

optimum/rbln/transformers/models/exaone/exaone_architecture.py
@@ -20,53 +20,78 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+from typing import TYPE_CHECKING

+import torch.nn as nn

-from ...models.decoderonly import (
+from ....utils import logging
+from ...models.decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-    DecoderOnlyDecoderLayer,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
     DecoderOnlyModel,
     DecoderOnlyWrapper,
 )

+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as ExaoneForCausalLM
+
+logger = logging.get_logger(__name__)
+
+
 class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
     """A wrapper class for the Exaone model with a language modeling head."""

-    def __init__(self, model, max_seq_len):
-        super(DecoderOnlyWrapper, self).__init__()
-        self.config = model.config
-        self.model = self.convert_attribute_name(model.transformer)
-        self.lm_head = model.lm_head
-        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
-        self.max_position_embeddings = (
-            self.config.max_position_embeddings if max_seq_len > self.config.max_position_embeddings else max_seq_len
-        )
-        self.max_seq_len = max_seq_len
-        self.rope_scaling = getattr(self.config, "rope_scaling", None)
-        self.rotary_emb = self._init_rope()
-
-    @staticmethod
-    def convert_attribute_name(model):
-        model.embed_tokens = model.wte
-        model.norm = model.ln_f
-        model.layers = model.h
-
-        for layer in model.layers:
-            layer.input_layernorm = layer.ln_1
-            layer.self_attn = layer.attn.attention
-            layer.post_attention_layernorm = layer.ln_2
-            layer.self_attn.o_proj = layer.self_attn.out_proj
-
-        return model
-
-    def get_forward_dict(self):
-        forward_dict = {}
-        forward_dict.update(
-            {
-                "wrapper": DecoderOnlyModel.forward,
-                "model": DecoderOnlyDecoderLayer.forward,
-                "decoder_layer": DecoderOnlyAttention.forward,
-            }
-        )
-        return forward_dict
+    def convert_to_rbln_causal_lm(self, causal_lm: "ExaoneForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            if self.attn_impl == "eager":
+                new_self_attn = ExaoneAttention(layer.attn.attention)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = ExaoneFlashAttention(
+                    layer.attn.attention, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+
+            new_layer = ExaoneLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = ExaoneModel(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class ExaoneModel(DecoderOnlyModel):
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
+
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f
+
+
+class ExaoneLayer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1
+
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2
+
+
+class ExaoneAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
+        self.num_key_value_heads = self._original_mod.num_key_value_heads
+
+
+class ExaoneFlashAttention(DecoderOnlyFlashAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
+        self.num_key_value_heads = self._original_mod.num_key_value_heads
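Note: this rewrite replaces in-place attribute aliasing (`model.embed_tokens = model.wte`, etc.) with thin adapter classes: the shared DecoderOnly code calls getters, and each port overrides only the getters whose HF names differ. The pattern in miniature (base/adapter names here are hypothetical):

    import torch.nn as nn

    class ModelAdapter:
        def __init__(self, original_mod: nn.Module):
            self._original_mod = original_mod

        def get_embedding(self) -> nn.Embedding:
            # Default: the standard HF attribute name.
            return self._original_mod.embed_tokens

    class ExaoneStyleAdapter(ModelAdapter):
        def get_embedding(self) -> nn.Embedding:
            # Exaone names its embedding `wte`; the wrapped module is never mutated.
            return self._original_mod.wte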
optimum/rbln/transformers/models/exaone/modeling_exaone.py
@@ -21,21 +21,15 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Callable

-from ....modeling_config import RBLNConfig
+from transformers import AutoModelForCausalLM
+
+from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .exaone_architecture import ExaoneForCausalLMWrapper
-from .hf_hub_cached.modeling_exaone import ExaoneForCausalLM


-logger = logging.getLogger(__name__)
-if TYPE_CHECKING:
-    from transformers import (
-        PreTrainedModel,
-    )
+logger = logging.get_logger(__name__)


 class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
@@ -52,25 +46,8 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):

     """

-    @classmethod
-    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return ExaoneForCausalLMWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        """This is the key method to implement RBLN-Exaone.
-
-        Returns:
-            Any: Exaone's corresponding method
-        """
-
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(ExaoneForCausalLM, __name)
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-        return val
+    _decoder_wrapper_cls = ExaoneForCausalLMWrapper
+    _hf_class = AutoModelForCausalLM

     @classmethod
     def from_pretrained(cls, *args, **kwargs):
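Note: with `_decoder_wrapper_cls` and `_hf_class` acting as declarative hooks, a new decoder-only port shrinks to a wrapper subclass plus two class attributes. A hypothetical port, for illustration only (import paths follow this package's layout, but the model itself is made up):

    from transformers import AutoModelForCausalLM

    from optimum.rbln.transformers.models.decoderonly import RBLNDecoderOnlyModelForCausalLM
    from optimum.rbln.transformers.models.decoderonly.decoderonly_architecture import DecoderOnlyWrapper

    class MyModelWrapper(DecoderOnlyWrapper):
        # Override convert_to_rbln_causal_lm here if the HF module names
        # differ from the defaults, as ExaoneForCausalLMWrapper does above.
        pass

    class RBLNMyModelForCausalLM(RBLNDecoderOnlyModelForCausalLM):
        _decoder_wrapper_cls = MyModelWrapper
        _hf_class = AutoModelForCausalLM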