PyPI - optimum-rbln - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl - Mend

optimum-rbln 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

optimum/rbln/transformers/models/bart/bart_architecture.py CHANGED Viewed

@@ -47,6 +47,12 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
+class BartWrapper:
+    def __init__(self, model):
+        self.encoder = BartEncoderWrapper(model)
+        self.decoder = BartDecoderWrapper(model)
 class _BartAttention(BartAttention):
     def forward(
         self,
@@ -238,6 +244,7 @@ class _BartSdpaAttention(BartSdpaAttention):
                     value_states, dim=2, start=cache_position, end=cache_position + 1
                 )
+            # need 4d shape (input tensors) for scaled_dot_product_attention
             attn_output = torch.nn.functional.scaled_dot_product_attention(
                 query_states,
                 key_states,
@@ -324,7 +331,6 @@ class _BartDecoder(BartDecoder):
         attn_impl: str = "eager",
     ):
         # embedding
-        # thkim fix : transformers == 4.44.2 compile
         if hasattr(self, "embed_scale"):
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
         else:
@@ -336,13 +342,15 @@ class _BartDecoder(BartDecoder):
             hidden_states = inputs_embeds + positions
         else:
             hidden_all = []
+            # compiler pattern base dependency -> take + add
             for i in range(input_ids.shape[0]):
                 # cache position [N,1]
                 positions_idx = cache_position[i]
+                # offset is set 2 in bart embedding
                 position_weight = self.embed_positions.weight[2:]
                 position = position_weight[positions_idx]
-                tmp_hidden = position + inputs_embeds[i]
-                hidden_all.append(tmp_hidden)
+                batch_hidden = position + inputs_embeds[i]
+                hidden_all.append(batch_hidden)
             hidden_states = torch.stack(hidden_all, dim=0)
         hidden_states = self.layernorm_embedding(hidden_states)
@@ -444,6 +452,7 @@ class BartDecoderWrapper(torch.nn.Module):
             self_kv_cache.append(past_key_values[i][1])
         self_kv_cache = torch.stack(self_kv_cache, dim=0)
+        # return batch_position to keep it as a variable within the graph
         return lm_logits, self_kv_cache, batch_position
@@ -467,9 +476,6 @@ class BartEncoderWrapper(torch.nn.Module):
         cross_key_value: torch.Tensor = None,
         batch_idx: torch.Tensor = None,
     ) -> Tuple[torch.Tensor]:
-        encoder_batch_size = input_ids.shape[0]
-        decoder_batch_size = encoder_batch_size  # TODO(taehoon) fix to enable beam-search
         # 1. run encoder
         encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         last_hidden_states = encoder_outputs[0]
@@ -477,19 +483,19 @@ class BartEncoderWrapper(torch.nn.Module):
         # 2. run dummy decoder to get pre-calculated cross-key_values for generation
         dummy_past_key_value = []
         for _ in range(self.num_layers):
-            pkv_self_attn_key = torch.zeros(decoder_batch_size, self.num_heads, self.decoder_max_length, self.d_kv)
-            pkv_self_attn_value = torch.zeros(decoder_batch_size, self.num_heads, self.decoder_max_length, self.d_kv)
-            pkv_cross_attn_key = torch.zeros(encoder_batch_size, self.num_heads, self.encoder_max_length, self.d_kv)
-            pkv_cross_attn_value = torch.zeros(encoder_batch_size, self.num_heads, self.encoder_max_length, self.d_kv)
+            pkv_self_attn_key = torch.zeros(1, self.num_heads, self.decoder_max_length, self.d_kv)
+            pkv_self_attn_value = torch.zeros(1, self.num_heads, self.decoder_max_length, self.d_kv)
+            pkv_cross_attn_key = torch.zeros(1, self.num_heads, self.encoder_max_length, self.d_kv)
+            pkv_cross_attn_value = torch.zeros(1, self.num_heads, self.encoder_max_length, self.d_kv)
             layer_pkv = (pkv_self_attn_key, pkv_self_attn_value, pkv_cross_attn_key, pkv_cross_attn_value)
             dummy_past_key_value.append(layer_pkv)
-        decoder_attention_mask = torch.zeros(decoder_batch_size, self.decoder_max_length, dtype=torch.float32)
+        decoder_attention_mask = torch.zeros(1, self.decoder_max_length, dtype=torch.float32)
         decoder_attention_mask[:, :1] = 1
         decoder_outputs = _BartDecoder.forward(
             self.decoder,
-            input_ids=torch.zeros((decoder_batch_size, 1), dtype=torch.int64),
+            input_ids=torch.zeros((1, 1), dtype=torch.int64),
             attention_mask=decoder_attention_mask,
             encoder_attention_mask=attention_mask,
             cache_position=torch.tensor(0, dtype=torch.int32),

optimum/rbln/transformers/models/bart/modeling_bart.py CHANGED Viewed

@@ -22,23 +22,25 @@
 # from Rebellions Inc.
 import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
-from transformers import AutoModel, BartConfig, BartModel, PretrainedConfig
+from transformers import BartConfig, BartForConditionalGeneration, BartModel, PretrainedConfig
 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....utils.logging import get_logger
+from ...models.seq2seq import RBLNModelForSeq2SeqLM
+from .bart_architecture import BartWrapper
-logger = logging.getLogger(__name__)
+logger = get_logger()
 if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
 class RBLNBartModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
     original_model_class = BartModel
     original_config_class = BartConfig
@@ -104,3 +106,20 @@ class RBLNBartModel(RBLNModel):
         rbln_config.model_cfg.update({"max_seq_len": rbln_max_seq_len})
         return rbln_config
+class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
+    @classmethod
+    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+        return BartWrapper(model)
+    def __getattr__(self, __name: str) -> Any:
+        def redirect(func):
+            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+        val = getattr(BartForConditionalGeneration, __name)
+        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+            return redirect(val)
+        return val

optimum/rbln/transformers/models/bert/modeling_bert.py CHANGED Viewed

@@ -25,7 +25,7 @@ import inspect
 import logging
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
-from transformers import AutoModel, BertConfig, BertModel, PretrainedConfig
+from transformers import BertConfig, BertModel, PretrainedConfig
 from ....modeling_base import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
@@ -38,7 +38,6 @@ if TYPE_CHECKING:
 class RBLNBertModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
     original_model_class = BertModel
     original_config_class = BertConfig

optimum/rbln/transformers/models/clip/modeling_clip.py CHANGED Viewed

@@ -58,7 +58,6 @@ class _TextEncoder(torch.nn.Module):
 class RBLNCLIPTextModel(RBLNModel):
-    auto_model_class = AutoModel  # feature extraction
     original_model_class = CLIPTextModel
     original_config_class = CLIPTextConfig

optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py CHANGED Viewed

@@ -20,8 +20,9 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+import functools
 import glob
-import logging
+import os
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -36,11 +37,12 @@ from transformers.utils import ModelOutput
 from ....modeling_base import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
+from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from ....utils.timer_utils import rbln_timer
-logger = logging.getLogger(__name__)
+logger = get_logger()
 if TYPE_CHECKING:
     from transformers import (
@@ -97,7 +99,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
 @dataclass
 class RBLNDecoderOnlyOutput(ModelOutput):
     logits: torch.FloatTensor = None
-    past_cached_length: Union[int, torch.Tensor] = None
+    generate_idx: torch.Tensor = None
 class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
@@ -243,6 +245,54 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         return model
+    def validate_quantization_config(quantize_config):
+        if quantize_config is not None:
+            q_format = quantize_config.get("format")
+            q_precision = quantize_config.get("precision")
+            if q_format not in SUPPORTED_QUANTIZATIONS:
+                raise ValueError(
+                    f"Invalid quantization format: {q_format}. "
+                    f"Supported formats are: {list(SUPPORTED_QUANTIZATIONS.keys())}"
+                )
+            if q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
+                raise ValueError(
+                    f"Invalid precision: {q_precision} for format: {q_format}. "
+                    f"Supported precisions are: {SUPPORTED_QUANTIZATIONS[q_format]}"
+                )
+        return quantize_config
+    @classmethod
+    def set_quantize_env(cls, quantize_config):
+        RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"
+        quantize_config = cls.validate_quantization_config(quantize_config)
+        if quantize_config is not None:
+            q_precision = quantize_config.get("precision")
+            quant_bits = q_precision.split("w")[1].split("a")[0]
+            os.environ[RBLN_QUANT_BITS_ENV] = quant_bits
+            return RBLN_QUANT_BITS_ENV
+        return None
+    @classmethod
+    def reset_quantize_env(cls, env_var_name):
+        if env_var_name is not None and env_var_name in os.environ:
+            del os.environ[env_var_name]
+    @classmethod
+    def manage_quantize_env(cls, func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            quantize_config = kwargs.get("quantize_config")
+            quantize_env_var = cls.set_quantize_env(quantize_config)
+            try:
+                return func(*args, **kwargs)
+            finally:
+                cls.reset_quantize_env(quantize_env_var)
+        return wrapper
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
@@ -252,7 +302,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         prefill_rbln_compile_config = rbln_compile_configs[0]
         dec_rbln_compile_config = rbln_compile_configs[1]
-        @rbln_timer("Jit Trace")
+        @rbln_timer("JIT trace")
         def get_scripted_model():
             # This function is nested to dealloc the example inputs before compilation.
             prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
@@ -271,7 +321,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         prefill_scripted_model, dec_scripted_model = get_scripted_model()
-        @rbln_timer("TorchScript to IR")
+        @rbln_timer("Model conversion")
         def scripted_model_to_ir():
             prefill_ir = rebel.torchscript_to_ir(
                 prefill_scripted_model,
@@ -291,7 +341,18 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             for i in range(model.config.num_hidden_layers * 2)
         ]
-        compiled_model = rebel.compile(
+        # Extract quantize_config from rbln_config
+        quantize_config = rbln_config.model_cfg.get("quantization", None)
+        @cls.manage_quantize_env
+        def compile_model(*args, **kwargs):
+            # Remove quantize_config from kwargs
+            kwargs.pop("quantize_config", None)
+            # Call rebel.compile with the updated kwargs
+            return rebel.compile(*args, **kwargs)
+        compiled_model = compile_model(
             prefill_ir,
             dec_ir,
             connections=connections,
@@ -299,7 +360,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             npu=prefill_rbln_compile_config.npu,
             tensor_parallel_size=prefill_rbln_compile_config.tensor_parallel_size,
             use_weight_sharing=True,
+            quantize_config=quantize_config,
         )
         return compiled_model
     @classmethod
@@ -314,6 +377,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         rbln_quantization = rbln_kwargs.get("quantization", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
+        rbln_quantization = cls.validate_quantization_config(rbln_quantization)
         prefill_chunk_size = 128
         if rbln_max_seq_len is None:
             rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None) or getattr(
@@ -330,16 +395,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         head_dim = getattr(model_config, "head_dim", None) or model_config.hidden_size // num_attention_heads
         hidden_size = getattr(model_config, "n_embd", None) or getattr(model_config, "hidden_size")
-        if rbln_quantization is not None:
-            q_format = rbln_quantization.get("format", None)
-            q_precision = rbln_quantization.get("precision", None)
-            if q_format not in SUPPORTED_QUANTIZATIONS.keys() or q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
-                raise ValueError(
-                    f'rbln_quantization="{rbln_quantization}" is not a supported quantization format or precesion, '
-                    f"Possible: {SUPPORTED_QUANTIZATIONS}"
-                )
         def get_input_info(
             batch_size,
             query_length,
@@ -439,50 +494,41 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
     def prepare_inputs_for_generation(
         self,
         input_ids: torch.LongTensor,
-        past_cached_length: Optional[torch.Tensor] = None,
+        generate_idx: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ):
         model_inputs = {}
-        # prefill phase
-        if past_cached_length is None:
-            # huggingface make dummy_input_ids if model_input_name is "input_embeds"
-            # https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/generation/utils.py#L469
-            if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-                input_tensors = inputs_embeds
-            else:
-                input_tensors = input_ids
+        is_prefill_phase = generate_idx is None
-            batch_size = input_tensors.shape[0]
-            l_input_tensors = []
-            cache_positions = []
-            past_cached_length = torch.zeros((batch_size, 1), dtype=torch.int32)
-            for i in range(batch_size):
-                input_tensor = input_tensors[i]
-                input_tensor = input_tensor[attention_mask[i] == 1]
-                valid_len = input_tensor.shape[0]
-                cache_position = torch.arange(0, valid_len, dtype=torch.int32)
-                past_cached_length[i] = valid_len
-                l_input_tensors.append(input_tensor.unsqueeze(0))
-                cache_positions.append(cache_position.unsqueeze(0))
-            input_tensors = l_input_tensors
-            if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-                model_inputs.update({"inputs_embeds": input_tensors, "input_ids": input_ids})
-            else:
-                model_inputs.update({"input_ids": input_tensors, "inputs_embeds": inputs_embeds})
-        # decoder phase
+        if is_prefill_phase:
+            generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+            cache_position = None
         else:
+            if inputs_embeds is not None:
+                raise NotImplementedError("Specifying inputs_embeds in decoder phase is not supported.")
             input_ids = input_ids[:, -1:]
-            cache_positions = past_cached_length
-            past_cached_length = past_cached_length + 1
+            cache_position = generate_idx
+            generate_idx = generate_idx + 1
+            model_inputs.update({"input_ids": input_ids})
+        if inputs_embeds is not None:
+            if self.rbln_config.model_cfg["use_inputs_embeds"]:
+                model_inputs.update({"inputs_embeds": inputs_embeds})
+            else:
+                raise ValueError(
+                    "The specifying inputs_embedst is only supported when using a compiled RBLN model with 'rbln_use_inputs_embeds' set to True."
+                )
+        else:
             model_inputs.update({"input_ids": input_ids})
         model_inputs.update(
             {
-                "cache_position": cache_positions,
-                "past_cached_length": past_cached_length,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "generate_idx": generate_idx,
             }
         )
@@ -494,42 +540,46 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         model_kwargs: Dict[str, Any],
         **kwargs,
     ) -> Dict[str, Any]:
-        # update past_cached_length
-        model_kwargs["past_cached_length"] = outputs.past_cached_length
+        # update generate_idx
+        model_kwargs["generate_idx"] = outputs.generate_idx
         return model_kwargs
     def forward(
         self,
-        input_ids: Optional[Union[List[torch.LongTensor], torch.LongTensor]] = None,
-        inputs_embeds: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        cache_position: Union[List[torch.Tensor], torch.Tensor] = None,  # vllm keyword argument
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        generate_idx: Optional[torch.Tensor] = None,
+        # from llava_next forward args
         batch_idx: Optional[int] = None,
-        past_cached_length: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
-        # prefll & hf generate
-        if isinstance(cache_position, list):
+        # prefll
+        if cache_position is None:
             logits = []
-            input_tensors = input_ids if inputs_embeds is None else inputs_embeds
-            for batch_idx, (input_tensor, cache_pos) in enumerate(zip(input_tensors, cache_position)):
+            input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
+            batch_size = input_tensors.shape[0]
+            for b_idx in range(batch_size):
+                # Transform inputs as vllm format
+                if attention_mask is not None:
+                    input_tensor = input_tensors[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
+                else:
+                    input_tensor = input_tensors[b_idx : b_idx + 1]
+                cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
                 logit = self._forward_prefill(
                     input_ids=input_tensor if inputs_embeds is None else None,
                     inputs_embeds=input_tensor if inputs_embeds is not None else None,
-                    cache_position=cache_pos,
-                    batch_idx=batch_idx,
+                    cache_position=cache_position,
+                    batch_idx=b_idx if batch_idx is None else batch_idx,  # Llava-next prefill
                 )
                 logits.append(logit)
             logits = torch.cat(logits, dim=0)
-        # prefill & vllm step
-        elif cache_position.shape[-1] > 1:
-            logits = self._forward_prefill(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-                batch_idx=batch_idx,
-            )
-        # common decoder
+        # decoder
         else:
             logits = self._forward_decoder(
                 input_ids=input_ids,
@@ -539,7 +589,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         return RBLNDecoderOnlyOutput(
             logits=logits,
-            past_cached_length=past_cached_length,
+            generate_idx=generate_idx,
         )
     def _forward_prefill(
@@ -567,23 +617,18 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             torch.empty(size=[], dtype=torch.int16, device="cpu"),
         ]
-        if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-            model_input_name = "inputs_embeds"
-        else:
-            model_input_name = "input_ids"
-        input_tensors = input_ids if model_input_name == "input_ids" else inputs_embeds
+        input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
         query_length = input_tensors.shape[1]
-        attention_mask = self.prefill_attention_mask.clone()
+        _attention_mask = self.prefill_attention_mask.clone()
         for step in range(0, query_length, self.prefill_chunk_size):
-            if step + self.prefill_chunk_size > query_length:
-                # input_tensors = torch.nn.functional.pad(input_tensors, (0, step + self.prefill_chunk_size - query_length))
-                padding_needed = step + self.prefill_chunk_size - query_length
-                if model_input_name == "input_ids":
-                    input_tensors = torch.nn.functional.pad(input_tensors, (0, padding_needed))
+            # pad input_tensors & cache_position for prefill_chunk
+            if (step + self.prefill_chunk_size) > query_length:
+                pad_to_chunk = step + self.prefill_chunk_size - query_length
+                if inputs_embeds is not None:
+                    input_tensors = torch.nn.functional.pad(input_tensors, (0, 0, 0, pad_to_chunk))
                 else:
-                    input_tensors = torch.nn.functional.pad(input_tensors, (0, 0, 0, padding_needed))
+                    input_tensors = torch.nn.functional.pad(input_tensors, (0, pad_to_chunk))
                 cache_position = torch.cat(
                     [
@@ -597,25 +642,28 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
                     dim=-1,
                 )
-            sliced_input_tensors = input_tensors[:, step : step + self.prefill_chunk_size]
-            sliced_cache_positions = cache_position[:, step : step + self.prefill_chunk_size]
+            # slice input_tensor & cache_position with prefill_chunk_size
+            _input_tensors = input_tensors[:, step : step + self.prefill_chunk_size]
+            _cache_position = cache_position[:, step : step + self.prefill_chunk_size]
+            # update attention_mask
             if step >= self.prefill_chunk_size:
-                attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
-            attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+                _attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
+            _attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
-            query_idx = query_length % self.prefill_chunk_size - 1
+            query_idx = (query_length - 1) % self.prefill_chunk_size
             logits, _ = self.prefill_decoder(
-                input_ids=sliced_input_tensors.contiguous() if model_input_name == "input_ids" else None,
-                inputs_embeds=sliced_input_tensors.contiguous() if model_input_name == "inputs_embeds" else None,
-                attention_mask=attention_mask.contiguous(),
-                cache_position=sliced_cache_positions.contiguous(),
+                input_ids=_input_tensors.contiguous() if inputs_embeds is None else None,
+                inputs_embeds=_input_tensors.contiguous() if inputs_embeds is not None else None,
+                attention_mask=_attention_mask.contiguous(),
+                cache_position=_cache_position.contiguous(),
                 batch_position=torch.tensor(batch_idx, dtype=torch.int16),
                 query_idx=torch.tensor(query_idx, dtype=torch.int16),
                 out=out_buffers,
             )
+        # update decoder_attn_mask with preprocessed kv-cache length in prefill phase
         self.dec_attn_mask[batch_idx] = self.dec_attn_mask_init.clone()
         self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
@@ -627,11 +675,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         inputs_embeds: torch.Tensor = None,
         cache_position: torch.Tensor = None,
     ) -> torch.FloatTensor:
-        if self.rbln_config.model_cfg["use_inputs_embeds"] and inputs_embeds is not None:
-            model_input_name = "inputs_embeds"
-        else:
-            model_input_name = "input_ids"
-        input_tensors = input_ids if model_input_name == "input_ids" else inputs_embeds
+        input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
         batch_size = input_tensors.shape[0]
@@ -640,8 +684,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
         logits, _ = self.decoder(
-            input_ids=input_tensors.contiguous() if model_input_name == "input_ids" else None,
-            inputs_embeds=input_tensors.contiguous() if model_input_name == "inputs_embeds" else None,
+            input_ids=input_tensors.contiguous() if inputs_embeds is None else None,
+            inputs_embeds=input_tensors.contiguous() if inputs_embeds is not None else None,
             attention_mask=self.dec_attn_mask.contiguous(),
             cache_position=cache_position.contiguous(),
             batch_position=torch.tensor(0, dtype=torch.int16),
@@ -649,3 +693,31 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         )
         return logits
+    def vllm_forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        inputs_embeds: torch.Tensor = None,
+        cache_position: torch.Tensor = None,
+        batch_idx: Optional[int] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor]:
+        # prefll
+        if cache_position.shape[-1] > 1:
+            logits = self._forward_prefill(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+                batch_idx=batch_idx,
+            )
+        # decoder
+        else:
+            logits = self._forward_decoder(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+            )
+        return RBLNDecoderOnlyOutput(
+            logits=logits,
+        )

optimum/rbln/transformers/models/dpt/modeling_dpt.py CHANGED Viewed

@@ -38,7 +38,6 @@ if TYPE_CHECKING:
 class RBLNDPTForDepthEstimation(RBLNModel):
-    model_type = "rbln_model"
     auto_model_class = AutoModelForDepthEstimation
     main_input_name = "pixel_values"

optimum/rbln/transformers/models/exaone/__init__.py ADDED Viewed

@@ -0,0 +1,32 @@
+# Copyright 2024 Rebellions Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+import os
+from os import environ
+this_path = os.path.abspath(__file__)
+local_dir = "/" + os.path.join(*this_path.split("/")[:-1]) + "/hf_hub_cached"
+environ["LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM"] = local_dir
+from .modeling_exaone import RBLNExaoneForCausalLM

optimum-rbln 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

optimum-rbln 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl