optimum-rbln 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +14 -7
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/diffusers/models/autoencoder_kl.py +30 -63
- optimum/rbln/diffusers/models/controlnet.py +36 -62
- optimum/rbln/diffusers/models/unet_2d_condition.py +57 -156
- optimum/rbln/diffusers/pipelines/__init__.py +40 -12
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -187
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -192
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +8 -206
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -207
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -111
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +12 -117
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +4 -123
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +4 -126
- optimum/rbln/modeling_alias.py +4 -9
- optimum/rbln/modeling_base.py +117 -144
- optimum/rbln/modeling_config.py +51 -0
- optimum/rbln/modeling_diffusers.py +400 -0
- optimum/rbln/transformers/__init__.py +10 -0
- optimum/rbln/transformers/cache_utils.py +5 -9
- optimum/rbln/transformers/modeling_rope_utils.py +283 -0
- optimum/rbln/transformers/models/__init__.py +80 -28
- optimum/rbln/transformers/models/auto/modeling_auto.py +1 -0
- optimum/rbln/transformers/models/bart/__init__.py +1 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +18 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +25 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -2
- optimum/rbln/transformers/models/clip/modeling_clip.py +13 -23
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +376 -218
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +246 -116
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -1
- optimum/rbln/transformers/models/exaone/__init__.py +32 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +81 -0
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +181 -0
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +1725 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +53 -0
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +4 -30
- optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +166 -151
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -15
- optimum/rbln/transformers/models/midm/modeling_midm.py +8 -28
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
- optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
- optimum/rbln/transformers/models/phi/phi_architecture.py +75 -159
- optimum/rbln/transformers/models/qwen2/__init__.py +24 -0
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +43 -0
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +29 -0
- optimum/rbln/transformers/models/seq2seq/__init__.py +24 -0
- optimum/rbln/{modeling_seq2seq.py → transformers/models/seq2seq/modeling_seq2seq.py} +107 -166
- optimum/rbln/transformers/models/t5/__init__.py +1 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +108 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +46 -32
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +38 -13
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -2
- optimum/rbln/transformers/utils/rbln_quantization.py +8 -2
- optimum/rbln/utils/context.py +58 -0
- optimum/rbln/utils/decorator_utils.py +55 -0
- optimum/rbln/utils/import_utils.py +21 -0
- optimum/rbln/utils/logging.py +1 -1
- optimum/rbln/utils/runtime_utils.py +4 -4
- optimum/rbln/utils/timer_utils.py +26 -2
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/METADATA +11 -9
- optimum_rbln-0.1.13.dist-info/RECORD +107 -0
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/WHEEL +1 -1
- optimum_rbln-0.1.11.dist-info/RECORD +0 -93
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

```diff
@@ -20,15 +20,17 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+import functools
 import glob
-import
-
+import inspect
+import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
-import rebel
-import torch
+import rebel
+import torch
+import transformers
 from safetensors.torch import load_file
 from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import no_init_weights
```
```diff
@@ -36,11 +38,13 @@ from transformers.utils import ModelOutput
 
 from ....modeling_base import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
+from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from ....utils.timer_utils import rbln_timer
+from .decoderonly_architecture import DecoderOnlyWrapper
 
 
-logger =
+logger = get_logger()
 
 if TYPE_CHECKING:
     from transformers import (
```
```diff
@@ -97,22 +101,50 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
 @dataclass
 class RBLNDecoderOnlyOutput(ModelOutput):
     logits: torch.FloatTensor = None
-
+    generate_idx: torch.Tensor = None
 
 
-class RBLNDecoderOnlyModelForCausalLM(RBLNModel
+class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     """
-
-    This
-
-
-
-
-
+    A base class for decoder-only transformer models optimized for causal language modeling tasks on RBLN devices.
+    This class serves as the foundation for various decoder-only architectures like GPT, LLaMA, etc.
+
+    The class provides core functionality for:
+    1. Converting pre-trained transformer models to RBLN-optimized format
+    2. Handling the compilation process for RBLN devices
+    3. Managing inference operations for causal language modeling
+
+    This class inherits from RBLNModel and implements specific methods required for
+    decoder-only architectures and causal language modeling tasks.
+
+    Note:
+    - This class is designed to be subclassed by specific model implementations
+      (e.g., RBLNLlamaForCausalLM, RBLNGPT2LMHeadModel)
+    - Subclasses should implement model-specific conversion logic.
+    - The class handles RBLN-specific optimizations automatically during compilation
     """
 
     main_input_name = "input_ids"
     auto_model_class = AutoModelForCausalLM
+    _decoder_wrapper_cls = DecoderOnlyWrapper
+    _original_cls = None
+
+    @classmethod
+    @property
+    def original_cls(cls):
+        """
+        Lazily loads and caches the corresponding Hugging Face model class.
+        Removes 'RBLN' prefix from the class name to get the original class name
+        (e.g., RBLNLlamaForCausalLM -> LlamaForCausalLM) and imports it from
+        the transformers module.
+
+        Returns:
+            type: The original Hugging Face model class
+        """
+        if cls._original_cls is None:
+            hf_original_cls_name = cls.__name__[4:]
+            cls._original_cls = getattr(transformers, hf_original_cls_name)
+        return cls._original_cls
 
     def __post_init__(self, **kwargs):
         self.batch_size = self.rbln_config.model_cfg["batch_size"]
```
```diff
@@ -231,6 +263,26 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
 
         return 0
 
+    def __getattr__(self, __name: str) -> Any:
+        """
+        Special method to delegate attribute access to the original Huggingface LM class.
+        This method is called when an attribute is not found in the current instance's dictionary.
+        It enables transparent access to the original model's attributes and methods while maintaining
+        proper method binding.
+
+        The method implements a delegation pattern that:
+        1. For methods: Creates a wrapper that properly binds 'self' to method calls
+        2. For other attributes: Returns them directly from the original class
+        """
+
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+        val = getattr(self.original_cls, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val
+
     @classmethod
     def get_pytorch_model(cls, *args, **kwargs) -> "PreTrainedModel":
         rbln_kwargs = kwargs.get("rbln_kwargs", {})
```
```diff
@@ -243,6 +295,64 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
 
         return model
 
+    def validate_quantization_config(quantize_config):
+        if quantize_config is not None:
+            q_format = quantize_config.get("format")
+            q_precision = quantize_config.get("precision")
+
+            if q_format not in SUPPORTED_QUANTIZATIONS:
+                raise ValueError(
+                    f"Invalid quantization format: {q_format}. "
+                    f"Supported formats are: {list(SUPPORTED_QUANTIZATIONS.keys())}"
+                )
+
+            if q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
+                raise ValueError(
+                    f"Invalid precision: {q_precision} for format: {q_format}. "
+                    f"Supported precisions are: {SUPPORTED_QUANTIZATIONS[q_format]}"
+                )
+
+        return quantize_config
+
+    @classmethod
+    def set_quantize_env(cls, quantize_config):
+        RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"
+        quantize_config = cls.validate_quantization_config(quantize_config)
+        if quantize_config is not None:
+            q_precision = quantize_config.get("precision")
+            quant_bits = q_precision.split("w")[1].split("a")[0]
+            os.environ[RBLN_QUANT_BITS_ENV] = quant_bits
+            return RBLN_QUANT_BITS_ENV
+        return None
+
+    @classmethod
+    def reset_quantize_env(cls, env_var_name):
+        if env_var_name is not None and env_var_name in os.environ:
+            del os.environ[env_var_name]
+
+    @classmethod
+    def manage_quantize_env(cls, func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            quantize_config = kwargs.get("quantize_config")
+            quantize_env_var = cls.set_quantize_env(quantize_config)
+            try:
+                return func(*args, **kwargs)
+            finally:
+                cls.reset_quantize_env(quantize_env_var)
+
+        return wrapper
+
+    @classmethod
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+        wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
+
+        # If the model wrapper supports rbln-custom-flash-attention
+        if "kvcache_partition_len" in inspect.signature(cls._decoder_wrapper_cls.__init__).parameters:
+            wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
+
+        return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
+
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
```
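`manage_quantize_env` is an environment-scoping decorator: it derives the weight bit width from a precision string such as `"w4a16"`, exports it for the duration of the wrapped call, and removes it again in a `finally` block. A self-contained sketch of the same mechanics (the stub below is illustrative, not the package's):

```python
import functools
import os

QUANT_BITS_ENV = "RBLN_QUANT_BITS"


def manage_quantize_env(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        cfg = kwargs.get("quantize_config")
        exported = None
        if cfg is not None:
            # "w4a16" -> weight bits "4": the text between 'w' and 'a'
            os.environ[QUANT_BITS_ENV] = cfg["precision"].split("w")[1].split("a")[0]
            exported = QUANT_BITS_ENV
        try:
            return func(*args, **kwargs)
        finally:
            if exported is not None:
                os.environ.pop(exported, None)

    return wrapper


@manage_quantize_env
def compile_stub(**kwargs):
    return os.environ.get(QUANT_BITS_ENV)


print(compile_stub(quantize_config={"format": "rbln", "precision": "w4a16"}))  # 4
print(QUANT_BITS_ENV in os.environ)  # False -- cleaned up after the call
```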
```diff
@@ -252,14 +362,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         prefill_rbln_compile_config = rbln_compile_configs[0]
         dec_rbln_compile_config = rbln_compile_configs[1]
 
-        @rbln_timer("
+        @rbln_timer("JIT trace")
         def get_scripted_model():
             # This function is nested to dealloc the example inputs before compilation.
+            # FIXME: 3rd dummy_input(batch_idx) should be fill zero to compile flash_attn.
             prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=
-
-            batch_index = 3
-            dec_example_inputs[batch_index].fill_(-1) # fill batch_position -1 to indicate it is decoder.
+            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)
 
             prefill_scripted_model = torch.jit.trace(
                 wrapped_model, prefill_example_inputs, check_trace=False, _store_inputs=False
```
```diff
@@ -271,7 +379,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
 
         prefill_scripted_model, dec_scripted_model = get_scripted_model()
 
-        @rbln_timer("
+        @rbln_timer("Model conversion")
         def scripted_model_to_ir():
             prefill_ir = rebel.torchscript_to_ir(
                 prefill_scripted_model,
```
```diff
@@ -291,7 +399,18 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             for i in range(model.config.num_hidden_layers * 2)
         ]
 
-
+        # Extract quantize_config from rbln_config
+        quantize_config = rbln_config.model_cfg.get("quantization", None)
+
+        @cls.manage_quantize_env
+        def compile_model(*args, **kwargs):
+            # Remove quantize_config from kwargs
+            kwargs.pop("quantize_config", None)
+
+            # Call rebel.compile with the updated kwargs
+            return rebel.compile(*args, **kwargs)
+
+        compiled_model = compile_model(
             prefill_ir,
             dec_ir,
             connections=connections,
```
```diff
@@ -299,7 +418,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             npu=prefill_rbln_compile_config.npu,
             tensor_parallel_size=prefill_rbln_compile_config.tensor_parallel_size,
             use_weight_sharing=True,
+            quantize_config=quantize_config,
         )
+
         return compiled_model
 
     @classmethod
```
```diff
@@ -314,6 +435,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         rbln_quantization = rbln_kwargs.get("quantization", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
 
+        rbln_quantization = cls.validate_quantization_config(rbln_quantization)
+
         prefill_chunk_size = 128
         if rbln_max_seq_len is None:
             rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None) or getattr(
```
```diff
@@ -330,16 +453,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         head_dim = getattr(model_config, "head_dim", None) or model_config.hidden_size // num_attention_heads
         hidden_size = getattr(model_config, "n_embd", None) or getattr(model_config, "hidden_size")
 
-        if rbln_quantization is not None:
-            q_format = rbln_quantization.get("format", None)
-            q_precision = rbln_quantization.get("precision", None)
-
-            if q_format not in SUPPORTED_QUANTIZATIONS.keys() or q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
-                raise ValueError(
-                    f'rbln_quantization="{rbln_quantization}" is not a supported quantization format or precesion, '
-                    f"Possible: {SUPPORTED_QUANTIZATIONS}"
-                )
-
         def get_input_info(
             batch_size,
             query_length,
```
```diff
@@ -439,50 +552,41 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
     def prepare_inputs_for_generation(
         self,
         input_ids: torch.LongTensor,
-
+        generate_idx: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ):
         model_inputs = {}
-
-        if past_cached_length is None:
-            # huggingface make dummy_input_ids if model_input_name is "input_embeds"
-            # https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/generation/utils.py#L469
-            if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-                input_tensors = inputs_embeds
-            else:
-                input_tensors = input_ids
+        is_prefill_phase = generate_idx is None
 
-
-
-
-            past_cached_length = torch.zeros((batch_size, 1), dtype=torch.int32)
-            for i in range(batch_size):
-                input_tensor = input_tensors[i]
-                input_tensor = input_tensor[attention_mask[i] == 1]
-                valid_len = input_tensor.shape[0]
-                cache_position = torch.arange(0, valid_len, dtype=torch.int32)
-                past_cached_length[i] = valid_len
-                l_input_tensors.append(input_tensor.unsqueeze(0))
-                cache_positions.append(cache_position.unsqueeze(0))
-
-            input_tensors = l_input_tensors
-            if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-                model_inputs.update({"inputs_embeds": input_tensors, "input_ids": input_ids})
-            else:
-                model_inputs.update({"input_ids": input_tensors, "inputs_embeds": inputs_embeds})
-        # decoder phase
+        if is_prefill_phase:
+            generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+            cache_position = None
         else:
+            if inputs_embeds is not None:
+                raise NotImplementedError("Specifying inputs_embeds in decoder phase is not supported.")
+
             input_ids = input_ids[:, -1:]
-
-
+            cache_position = generate_idx
+            generate_idx = generate_idx + 1
+            model_inputs.update({"input_ids": input_ids})
+
+        if inputs_embeds is not None:
+            if self.rbln_config.model_cfg["use_inputs_embeds"]:
+                model_inputs.update({"inputs_embeds": inputs_embeds})
+            else:
+                raise ValueError(
+                    "The specifying inputs_embedst is only supported when using a compiled RBLN model with 'rbln_use_inputs_embeds' set to True."
+                )
+        else:
             model_inputs.update({"input_ids": input_ids})
 
         model_inputs.update(
             {
-                "
-                "
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "generate_idx": generate_idx,
             }
         )
 
```
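The rewritten `prepare_inputs_for_generation` keys the phase off `generate_idx`: when it is absent the call is a prefill and per-row prompt lengths are read from the attention mask; when present the call is a decode step, only the newest token is fed, and the index advances by one. A toy walk-through of that bookkeeping:

```python
import torch

# Two padded prompts of true length 3 and 4.
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 1, 1]])

# Prefill phase: generate_idx is None, so derive it from the mask.
generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
print(generate_idx.squeeze(-1).tolist())  # [3, 4]

# Decode phase: feed only the last token; cache_position is the write index.
input_ids = torch.tensor([[5, 6, 7, 8], [9, 10, 11, 12]])
input_ids = input_ids[:, -1:]
cache_position = generate_idx
generate_idx = generate_idx + 1
print(input_ids.tolist(), cache_position.squeeze(-1).tolist())  # [[8], [12]] [3, 4]
```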
```diff
@@ -494,42 +598,46 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         model_kwargs: Dict[str, Any],
         **kwargs,
     ) -> Dict[str, Any]:
-        # update
-        model_kwargs["
+        # update generate_idx
+        model_kwargs["generate_idx"] = outputs.generate_idx
 
         return model_kwargs
 
     def forward(
         self,
-        input_ids: Optional[
-        inputs_embeds: Optional[
-        cache_position:
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        generate_idx: Optional[torch.Tensor] = None,
+        # from llava_next forward args
         batch_idx: Optional[int] = None,
-        past_cached_length: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
-        # prefll
-        if
+        # prefll
+        if cache_position is None:
             logits = []
-            input_tensors =
-
+            input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
+            batch_size = input_tensors.shape[0]
+
+            for b_idx in range(batch_size):
+                # Transform inputs as vllm format
+                if attention_mask is not None:
+                    input_tensor = input_tensors[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
+                else:
+                    input_tensor = input_tensors[b_idx : b_idx + 1]
+
+                cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
+
                 logit = self._forward_prefill(
                     input_ids=input_tensor if inputs_embeds is None else None,
                     inputs_embeds=input_tensor if inputs_embeds is not None else None,
-                    cache_position=
-                    batch_idx=batch_idx,
+                    cache_position=cache_position,
+                    batch_idx=b_idx if batch_idx is None else batch_idx,  # Llava-next prefill
                 )
                 logits.append(logit)
             logits = torch.cat(logits, dim=0)
-        #
-        elif cache_position.shape[-1] > 1:
-            logits = self._forward_prefill(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-                batch_idx=batch_idx,
-            )
-        # common decoder
+        # decoder
         else:
             logits = self._forward_decoder(
                 input_ids=input_ids,
```
```diff
@@ -539,7 +647,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
 
         return RBLNDecoderOnlyOutput(
             logits=logits,
-
+            generate_idx=generate_idx,
         )
 
     def _forward_prefill(
```
```diff
@@ -567,23 +675,18 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             torch.empty(size=[], dtype=torch.int16, device="cpu"),
         ]
 
-
-            model_input_name = "inputs_embeds"
-        else:
-            model_input_name = "input_ids"
-
-        input_tensors = input_ids if model_input_name == "input_ids" else inputs_embeds
-
+        input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
         query_length = input_tensors.shape[1]
-
+        _attention_mask = self.prefill_attention_mask.clone()
+
         for step in range(0, query_length, self.prefill_chunk_size):
-
-
-
-            if
-                input_tensors = torch.nn.functional.pad(input_tensors, (0,
+            # pad input_tensors & cache_position for prefill_chunk
+            if (step + self.prefill_chunk_size) > query_length:
+                pad_to_chunk = step + self.prefill_chunk_size - query_length
+                if inputs_embeds is not None:
+                    input_tensors = torch.nn.functional.pad(input_tensors, (0, 0, 0, pad_to_chunk))
                 else:
-                input_tensors = torch.nn.functional.pad(input_tensors, (0,
+                    input_tensors = torch.nn.functional.pad(input_tensors, (0, pad_to_chunk))
 
             cache_position = torch.cat(
                 [
```
```diff
@@ -597,25 +700,28 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
                 dim=-1,
             )
 
-
-
+            # slice input_tensor & cache_position with prefill_chunk_size
+            _input_tensors = input_tensors[:, step : step + self.prefill_chunk_size]
+            _cache_position = cache_position[:, step : step + self.prefill_chunk_size]
 
+            # update attention_mask
             if step >= self.prefill_chunk_size:
-
-
+                _attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
+            _attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
 
-            query_idx = query_length % self.prefill_chunk_size
+            query_idx = (query_length - 1) % self.prefill_chunk_size
 
             logits, _ = self.prefill_decoder(
-                input_ids=
-                inputs_embeds=
-                attention_mask=
-                cache_position=
+                input_ids=_input_tensors.contiguous() if inputs_embeds is None else None,
+                inputs_embeds=_input_tensors.contiguous() if inputs_embeds is not None else None,
+                attention_mask=_attention_mask.contiguous(),
+                cache_position=_cache_position.contiguous(),
                 batch_position=torch.tensor(batch_idx, dtype=torch.int16),
                 query_idx=torch.tensor(query_idx, dtype=torch.int16),
                 out=out_buffers,
             )
 
+        # update decoder_attn_mask with preprocessed kv-cache length in prefill phase
         self.dec_attn_mask[batch_idx] = self.dec_attn_mask_init.clone()
         self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
 
```
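The prefill loop above processes the prompt in fixed `prefill_chunk_size` slices: the tail is padded up to a chunk boundary, each iteration slices one chunk of inputs and cache positions, and `query_idx = (query_length - 1) % prefill_chunk_size` locates the logit of the last real token inside the final chunk (the `- 1` is the off-by-one fix versus the removed line). A shape-only sketch of that arithmetic:

```python
import torch

prefill_chunk_size = 128
query_length = 300  # illustrative prompt length
input_tensors = torch.randint(0, 1000, (1, query_length))

# Pad the tail so the final slice is a full chunk (equivalent to the in-loop padding).
pad_to_chunk = -query_length % prefill_chunk_size  # 84
input_tensors = torch.nn.functional.pad(input_tensors, (0, pad_to_chunk))

chunks = [
    input_tensors[:, step : step + prefill_chunk_size]
    for step in range(0, query_length, prefill_chunk_size)
]
print(len(chunks), chunks[-1].shape)  # 3 torch.Size([1, 128])

# Position of the last real token's logit within the final chunk.
print((query_length - 1) % prefill_chunk_size)  # 43
```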
```diff
@@ -627,11 +733,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         inputs_embeds: torch.Tensor = None,
         cache_position: torch.Tensor = None,
     ) -> torch.FloatTensor:
-
-            model_input_name = "inputs_embeds"
-        else:
-            model_input_name = "input_ids"
-        input_tensors = input_ids if model_input_name == "input_ids" else inputs_embeds
+        input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
 
         batch_size = input_tensors.shape[0]
 
```
```diff
@@ -640,8 +742,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
 
         logits, _ = self.decoder(
-            input_ids=input_tensors.contiguous() if
-            inputs_embeds=input_tensors.contiguous() if
+            input_ids=input_tensors.contiguous() if inputs_embeds is None else None,
+            inputs_embeds=input_tensors.contiguous() if inputs_embeds is not None else None,
             attention_mask=self.dec_attn_mask.contiguous(),
             cache_position=cache_position.contiguous(),
             batch_position=torch.tensor(0, dtype=torch.int16),
```
```diff
@@ -649,3 +751,31 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         )
 
         return logits
+
+    def vllm_forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        inputs_embeds: torch.Tensor = None,
+        cache_position: torch.Tensor = None,
+        batch_idx: Optional[int] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor]:
+        # prefll
+        if cache_position.shape[-1] > 1:
+            logits = self._forward_prefill(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+                batch_idx=batch_idx,
+            )
+        # decoder
+        else:
+            logits = self._forward_decoder(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+            )
+
+        return RBLNDecoderOnlyOutput(
+            logits=logits,
+        )
```
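The new `vllm_forward` keeps the older dispatch rule that the generic `forward` dropped: a multi-position `cache_position` marks prefill, a single position marks decode. In sketch form:

```python
import torch


def phase(cache_position: torch.Tensor) -> str:
    # Prefill writes a span of cache positions; decode advances exactly one.
    return "prefill" if cache_position.shape[-1] > 1 else "decode"


print(phase(torch.arange(7, dtype=torch.int32).unsqueeze(0)))  # prefill
print(phase(torch.tensor([[7]], dtype=torch.int32)))           # decode
```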
optimum/rbln/transformers/models/exaone/__init__.py (new file)

```diff
@@ -0,0 +1,32 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+import os
+from os import environ
+
+
+this_path = os.path.abspath(__file__)
+local_dir = "/" + os.path.join(*this_path.split("/")[:-1]) + "/hf_hub_cached"
+environ["LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM"] = local_dir
+
+from .modeling_exaone import RBLNExaoneForCausalLM
```
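The new package `__init__` pins a module-local `hf_hub_cached` directory through the `LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM` environment variable before importing the model. A quick check of what the path expression evaluates to (POSIX separators assumed, since the code splits on a literal `/`; the path below is illustrative):

```python
import os

this_path = "/opt/site-packages/optimum/rbln/transformers/models/exaone/__init__.py"
local_dir = "/" + os.path.join(*this_path.split("/")[:-1]) + "/hf_hub_cached"
print(local_dir)
# /opt/site-packages/optimum/rbln/transformers/models/exaone/hf_hub_cached
```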
optimum/rbln/transformers/models/exaone/exaone_architecture.py (new file)

```diff
@@ -0,0 +1,81 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+import torch
+
+from ....utils import logging
+from ...models.decoderonly import (
+    DecoderOnlyAttention,
+    DecoderOnlyDecoderLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+    RotaryEmbedding,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
+    """A wrapper class for the Exaone model with a language modeling head."""
+
+    def __init__(self, model, max_seq_len, kvcache_partition_len=None):
+        super(DecoderOnlyWrapper, self).__init__()
+        self.config = model.config
+        self.model = self.convert_attribute_name(model.transformer)
+        self.lm_head = model.lm_head
+        self.rotary_emb = RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)
+
+        if kvcache_partition_len is not None:
+            # WORKAROUND : for passing partition length as a value to the rbln compiler.
+            # What is actually used is the shape of this tensor.
+            self.kvcache_partition_size = torch.zeros(kvcache_partition_len, dtype=torch.int32)
+            self.attn_implementation = "flash_attn_rbln"
+            logger.info(f"Using rbln-flash-attention. (partition length : {kvcache_partition_len})")
+        else:
+            self.kvcache_partition_size = None
+            self.attn_implementation = "eager"
+
+    @staticmethod
+    def convert_attribute_name(model):
+        model.embed_tokens = model.wte
+        model.norm = model.ln_f
+        model.layers = model.h
+
+        for layer in model.layers:
+            layer.input_layernorm = layer.ln_1
+            layer.self_attn = layer.attn.attention
+            layer.post_attention_layernorm = layer.ln_2
+            layer.self_attn.o_proj = layer.self_attn.out_proj
+
+        return model
+
+    def get_forward_dict(self):
+        forward_dict = {}
+        forward_dict.update(
+            {
+                "wrapper": DecoderOnlyModel.forward,
+                "model": DecoderOnlyDecoderLayer.forward,
+                "decoder_layer": DecoderOnlyAttention.forward,
+            }
+        )
+        return forward_dict
```
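`convert_attribute_name` adapts Exaone's GPT-style module names (`wte`, `ln_f`, `h`, `ln_1`, `ln_2`, `out_proj`) to the Llama-style names the shared decoder-only forward passes expect, by aliasing the existing submodules under the new names rather than copying weights. A toy illustration of the aliasing idea:

```python
import torch.nn as nn

# Toy module with GPT/Exaone-style attribute names.
gpt_style = nn.Module()
gpt_style.wte = nn.Embedding(10, 4)
gpt_style.ln_f = nn.LayerNorm(4)

# Alias, don't copy: both names now refer to the same parameter-carrying modules.
gpt_style.embed_tokens = gpt_style.wte
gpt_style.norm = gpt_style.ln_f

print(gpt_style.embed_tokens is gpt_style.wte)  # True -- no duplicated weights
```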