optimum-rbln 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. optimum/rbln/__init__.py +3 -0
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +3 -3
  4. optimum/rbln/diffusers/models/controlnet.py +4 -3
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +3 -3
  6. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +2 -3
  7. optimum/rbln/modeling_alias.py +5 -1
  8. optimum/rbln/modeling_base.py +53 -19
  9. optimum/rbln/transformers/__init__.py +3 -1
  10. optimum/rbln/transformers/models/__init__.py +1 -0
  11. optimum/rbln/transformers/models/clip/modeling_clip.py +1 -1
  12. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +4 -3
  13. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +137 -22
  14. optimum/rbln/transformers/models/gemma/gemma_architecture.py +10 -3
  15. optimum/rbln/transformers/models/gemma/modeling_gemma.py +9 -3
  16. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +6 -89
  17. optimum/rbln/transformers/models/llama/modeling_llama.py +9 -3
  18. optimum/rbln/transformers/models/midm/modeling_midm.py +5 -88
  19. optimum/rbln/transformers/models/mistral/__init__.py +24 -0
  20. optimum/rbln/transformers/models/mistral/mistral_architecture.py +29 -0
  21. optimum/rbln/transformers/models/mistral/modeling_mistral.py +68 -0
  22. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  23. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +8 -2
  24. optimum/rbln/transformers/utils/__init__.py +0 -0
  25. optimum/rbln/transformers/utils/rbln_quantization.py +109 -0
  26. optimum/rbln/utils/import_utils.py +1 -4
  27. optimum/rbln/utils/runtime_utils.py +2 -1
  28. {optimum_rbln-0.1.8.dist-info → optimum_rbln-0.1.9.dist-info}/METADATA +10 -3
  29. {optimum_rbln-0.1.8.dist-info → optimum_rbln-0.1.9.dist-info}/RECORD +31 -26
  30. {optimum_rbln-0.1.8.dist-info → optimum_rbln-0.1.9.dist-info}/WHEEL +0 -0
  31. {optimum_rbln-0.1.8.dist-info → optimum_rbln-0.1.9.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__init__.py CHANGED
@@ -32,6 +32,7 @@ _import_structure = {
     "modeling_alias": [
         "RBLNASTForAudioClassification",
         "RBLNBertForQuestionAnswering",
+        "RBLNDistilBertForQuestionAnswering",
         "RBLNResNetForImageClassification",
         "RBLNT5ForConditionalGeneration",
         "RBLNBartForConditionalGeneration",
@@ -61,6 +62,7 @@ _import_structure = {
         "RBLNWav2Vec2ForCTC",
         "RBLNLlamaForCausalLM",
         "RBLNMidmLMHeadModel",
+        "RBLNMistralForCausalLM",
         "RBLNWhisperForConditionalGeneration",
         "RBLNXLMRobertaModel",
     ],
@@ -126,6 +128,7 @@ if TYPE_CHECKING:
         RBLNGPT2LMHeadModel,
         RBLNLlamaForCausalLM,
         RBLNMidmLMHeadModel,
+        RBLNMistralForCausalLM,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
         RBLNXLMRobertaModel,
optimum/rbln/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = '0.1.8'
+__version__ = '0.1.9'
optimum/rbln/diffusers/models/autoencoder_kl.py CHANGED
@@ -26,7 +26,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 import rebel
-import torch
+import torch  # noqa: I001
 from diffusers import AutoencoderKL
 from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
@@ -38,12 +38,12 @@ from ...modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRunt
 from ...utils.runtime_utils import RBLNPytorchRuntime
 
 
-logger = logging.getLogger(__name__)
-
 if TYPE_CHECKING:
     import torch
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
+logger = logging.getLogger(__name__)
+
 
 class RBLNRuntimeVAEEncoder(RBLNPytorchRuntime):
     def encode(self, x: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
optimum/rbln/diffusers/models/controlnet.py CHANGED
@@ -34,12 +34,13 @@ from ...modeling_base import RBLNModel
 from ...modeling_config import RBLNConfig, RBLNRuntimeConfig
 
 
-logger = logging.getLogger(__name__)
-
 if TYPE_CHECKING:
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
+logger = logging.getLogger(__name__)
+
+
 class _ControlNetModel(torch.nn.Module):
     def __init__(self, controlnet: "ControlNetModel"):
         super().__init__()
@@ -138,7 +139,7 @@ class RBLNControlNetModel(RBLNModel):
         return rt
 
     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module) -> torch.nn.Module:
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         use_encoder_hidden_states = False
         for down_block in model.down_blocks:
             if use_encoder_hidden_states := getattr(down_block, "has_cross_attention", False):
optimum/rbln/diffusers/models/unet_2d_condition.py CHANGED
@@ -35,11 +35,11 @@ from ...modeling_base import RBLNModel
 from ...modeling_config import RBLNConfig, RBLNRuntimeConfig
 
 
-logger = logging.getLogger(__name__)
-
 if TYPE_CHECKING:
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
+logger = logging.getLogger(__name__)
+
 
 class _UNet_SD(torch.nn.Module):
     def __init__(self, unet: "UNet2DConditionModel"):
@@ -172,7 +172,7 @@ class RBLNUNet2DConditionModel(RBLNModel):
         return rt
 
     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module) -> torch.nn.Module:
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         if model.config.addition_embed_type == "text_time":
             return _UNet_SDXL(model).eval()
         else:
optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py CHANGED
@@ -37,11 +37,11 @@ from ....modeling_config import RBLNConfig
 from ...models.controlnet import RBLNControlNetModel
 
 
-logger = logging.getLogger(__name__)
-
 if TYPE_CHECKING:
     pass
 
+logger = logging.getLogger(__name__)
+
 
 class RBLNMultiControlNetModel(RBLNModel):
     def __init__(
@@ -79,7 +79,6 @@ class RBLNMultiControlNetModel(RBLNModel):
         model_id: Union[str, Path],
         **kwargs,
     ) -> RBLNModel:
-
         idx = 0
         controlnets = []
         model_path_to_load = model_id
optimum/rbln/modeling_alias.py CHANGED
@@ -36,7 +36,11 @@ class RBLNASTForAudioClassification(RBLNModelForAudioClassification):
 
 
 class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
-    pass
+    rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+
+class RBLNDistilBertForQuestionAnswering(RBLNModelForQuestionAnswering):
+    rbln_model_input_names = ["input_ids", "attention_mask"]
 
 
 class RBLNResNetForImageClassification(RBLNModelForImageClassification):
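
The question-answering aliases now pin their model-specific default inputs as a class attribute, so DistilBERT (which has no token_type_ids) compiles without a manual override. A minimal usage sketch; the checkpoint id, the export=True flag, and the sequence length are illustrative assumptions following the usual optimum convention, not part of this diff:

    from optimum.rbln import RBLNDistilBertForQuestionAnswering

    # Only input_ids and attention_mask are traced for DistilBERT.
    model = RBLNDistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-cased-distilled-squad",  # illustrative checkpoint
        export=True,
        rbln_max_seq_len=384,
    )

Passing rbln_model_input_names explicitly at export time still overrides the class default (see the RBLNModelForQuestionAnswering change in modeling_base.py below).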
optimum/rbln/modeling_base.py CHANGED
@@ -51,10 +51,15 @@ from .utils.runtime_utils import UnavailableRuntime
 from .utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
 
 
-logger = logging.getLogger(__name__)
-
 if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
+    from transformers import (
+        AutoFeatureExtractor,
+        AutoProcessor,
+        AutoTokenizer,
+        PreTrainedModel,
+    )
+
+logger = logging.getLogger(__name__)
 
 
 class RBLNBaseModel(OptimizedModel, ABC):
@@ -156,13 +161,23 @@ class RBLNBaseModel(OptimizedModel, ABC):
             Directory where to save the model file.
         """
         real_save_dir = self.model_save_dir / self.subfolder
+        save_directory_path = Path(save_directory)
         if os.path.exists(real_save_dir) and os.path.isdir(real_save_dir):
+            if save_directory_path.absolute() == real_save_dir.absolute():
+                raise FileExistsError(
+                    f"Cannot save model to '{save_directory}'. "
+                    f"This directory already exists and contains the model files."
+                )
             shutil.copytree(real_save_dir, save_directory, dirs_exist_ok=True)
             self.config.save_pretrained(save_directory)
             if self.generation_config is not None:
                 self.generation_config.save_pretrained(save_directory)
         else:
-            raise FileNotFoundError(f"Saving compiled model failed.({real_save_dir}).")
+            raise FileNotFoundError(
+                f"Unable to save the model. The model directory '{real_save_dir}' does not exist or is not accessible. "
+                f"Cannot save to the specified destination '{save_directory}'. "
+                f"Please ensure the model directory exists and you have the necessary permissions to access it."
+            )
 
     @classmethod
     def _from_pretrained(
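
save_pretrained now fails fast in both directions: saving onto the model's own directory raises FileExistsError instead of copying a tree onto itself, and a missing source directory produces an actionable FileNotFoundError. A hedged sketch of the resulting behavior (paths are illustrative, assuming the model was loaded from a local compiled directory):

    model = RBLNResNetForImageClassification.from_pretrained("resnet50-compiled")

    model.save_pretrained("resnet50-copy")      # OK: compiled artifacts and config are copied
    model.save_pretrained("resnet50-compiled")  # FileExistsError: destination is the model directory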
@@ -196,7 +211,12 @@ class RBLNBaseModel(OptimizedModel, ABC):
                 token = HfFolder().get_token()
             else:
                 token = use_auth_token
-            repo_files = list(map(Path, HfApi().list_repo_files(model_id, revision=revision, token=token)))
+            repo_files = list(
+                map(
+                    Path,
+                    HfApi().list_repo_files(model_id, revision=revision, token=token),
+                )
+            )
 
             pattern = "*.rbln" if subfolder == "" else f"{subfolder}/*.rbln"
             rbln_files = [p for p in repo_files if p.match(pattern)]
@@ -287,7 +307,7 @@ class RBLNBaseModel(OptimizedModel, ABC):
             preprocessors,
             model_save_dir=model_save_dir,
             subfolder=subfolder,
-            rbln_compiled_models=None if rbln_optimize_host_memory else rbln_compiled_models,
+            rbln_compiled_models=(None if rbln_optimize_host_memory else rbln_compiled_models),
             **kwargs,
         )
 
@@ -377,7 +397,7 @@ class RBLNBaseModel(OptimizedModel, ABC):
         return self.forward(*args, **kwargs)
 
     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module) -> torch.nn.Module:
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         # Wrap the model if needed.
         return model
 
@@ -400,7 +420,9 @@ class RBLNBaseModel(OptimizedModel, ABC):
     @classmethod
     @abstractmethod
     def _create_runtimes(
-        cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+        cls,
+        compiled_models: List[rebel.RBLNCompiledModel],
+        rbln_device_map: Dict[str, int],
     ) -> List[rebel.Runtime]:
         # compiled_models -> runtimes
         pass
@@ -497,7 +519,7 @@ class RBLNModel(RBLNBaseModel):
 
     @classmethod
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
-        model = cls.wrap_model_if_needed(model)
+        model = cls.wrap_model_if_needed(model, rbln_config)
         rbln_runtime_configs = list(rbln_config.values())
         if len(rbln_runtime_configs) != 1:
             raise ValueError
@@ -598,7 +620,9 @@ class RBLNModel(RBLNBaseModel):
 
     @classmethod
     def _create_runtimes(
-        cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+        cls,
+        compiled_models: List[rebel.RBLNCompiledModel],
+        rbln_device_map: Dict[str, int],
     ) -> List[rebel.Runtime]:
         device = rbln_device_map[DEFAULT_COMPILED_MODEL_NAME]
         return [compiled_model.create_runtime(tensor_type="pt", device=device) for compiled_model in compiled_models]
@@ -618,8 +642,8 @@ class RBLNModelForQuestionAnswering(RBLNModel):
         preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
         model_config: Optional["PretrainedConfig"] = None,
         rbln_max_seq_len: Optional[int] = None,
-        rbln_model_input_names: Optional[List[str]] = None,
         rbln_batch_size: Optional[int] = None,
+        rbln_model_input_names: Optional[List[str]] = None,
     ) -> RBLNConfig:
         if rbln_max_seq_len is None:
             for tokenizer in preprocessors:
@@ -629,15 +653,15 @@ class RBLNModelForQuestionAnswering(RBLNModel):
         if rbln_max_seq_len is None:
             raise ValueError("`rbln_max_seq_len` should be specified!")
 
-        if rbln_model_input_names is None:
-            # These are BERT's inputs
-            rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-
         if rbln_batch_size is None:
             rbln_batch_size = 1
+
+        if rbln_model_input_names is not None:
+            cls.rbln_model_input_names = rbln_model_input_names
+
         input_info = [
             (model_input_name, [rbln_batch_size, rbln_max_seq_len], "int64")
-            for model_input_name in rbln_model_input_names
+            for model_input_name in cls.rbln_model_input_names
         ]
 
         rbln_runtime_config = RBLNRuntimeConfig(input_info=input_info)
@@ -674,7 +698,13 @@ class RBLNModelForImageClassification(RBLNModel):
         if rbln_batch_size is None:
             rbln_batch_size = 1
 
-        input_info = [("pixel_values", [rbln_batch_size, 3, rbln_image_size, rbln_image_size], "float32")]
+        input_info = [
+            (
+                "pixel_values",
+                [rbln_batch_size, 3, rbln_image_size, rbln_image_size],
+                "float32",
+            )
+        ]
 
         rbln_runtime_config = RBLNRuntimeConfig(input_info=input_info)
         rbln_runtime_config.batch_size = rbln_batch_size
@@ -739,7 +769,11 @@ class RBLNModelForAudioClassification(RBLNModel):
         meta["rbln_num_mel_bins"] = rbln_num_mel_bins
 
         model_input_info = [
-            ("input_values", [rbln_batch_size, rbln_max_length, rbln_num_mel_bins], "float32"),
+            (
+                "input_values",
+                [rbln_batch_size, rbln_max_length, rbln_num_mel_bins],
+                "float32",
+            ),
         ]
 
         rbln_runtime_config = RBLNRuntimeConfig(input_info=model_input_info, batch_size=rbln_batch_size)
@@ -777,7 +811,6 @@ class RBLNModelForSequenceClassification(RBLNModel):
         rbln_model_input_names: Optional[List[str]] = None,
         rbln_batch_size: Optional[int] = None,
     ) -> RBLNConfig:
-
         max_position_embeddings = getattr(model_config, "n_positions", None) or getattr(
             model_config, "max_position_embeddings", None
         )
@@ -812,6 +845,7 @@ class RBLNModelForSequenceClassification(RBLNModel):
 
         return RBLNConfig.from_rbln_runtime_configs([rbln_runtime_config], _rbln_meta=meta)
 
+
 class RBLNModelForMaskedLM(RBLNModel):
     model_type = "rbln_model"
     auto_model_class = AutoModelForMaskedLM
optimum/rbln/transformers/__init__.py CHANGED
@@ -39,7 +39,8 @@ _import_structure = {
         "RBLNWhisperForConditionalGeneration",
         "RBLNLlamaForCausalLM",
         "RBLNMidmLMHeadModel",
-        "RBLNXLMRobertaModel"
+        "RBLNMistralForCausalLM",
+        "RBLNXLMRobertaModel",
     ],
 }
 
@@ -54,6 +55,7 @@ if TYPE_CHECKING:
         RBLNGPT2LMHeadModel,
         RBLNLlamaForCausalLM,
         RBLNMidmLMHeadModel,
+        RBLNMistralForCausalLM,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
         RBLNXLMRobertaModel,
optimum/rbln/transformers/models/__init__.py CHANGED
@@ -27,6 +27,7 @@ from .gemma import RBLNGemmaForCausalLM
 from .gpt2 import RBLNGPT2LMHeadModel
 from .llama import RBLNLlamaForCausalLM
 from .midm import RBLNMidmLMHeadModel
+from .mistral import RBLNMistralForCausalLM
 from .wav2vec2 import RBLNWav2Vec2ForCTC
 from .whisper import RBLNWhisperForConditionalGeneration
 from .xlm_roberta import RBLNXLMRobertaModel
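
Mistral support is the headline addition of 0.1.9 (the new mistral/ package, files 19-21 in the list above). Usage mirrors the other decoder-only models; a sketch in which the checkpoint id and sizes are illustrative assumptions:

    from optimum.rbln import RBLNMistralForCausalLM

    model = RBLNMistralForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2",  # illustrative checkpoint
        export=True,            # compile from the PyTorch weights
        rbln_max_seq_len=4096,
        rbln_batch_size=1,
    )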
optimum/rbln/transformers/models/clip/modeling_clip.py CHANGED
@@ -70,7 +70,7 @@ class RBLNCLIPTextModel(RBLNModel):
         return rt
 
     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module) -> torch.nn.Module:
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         return _TextEncoder(model).eval()
 
     @classmethod
optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py CHANGED
@@ -49,18 +49,19 @@ class DecoderOnlyWrapper(torch.nn.Module):
             self.config.max_position_embeddings if max_seq_len > self.config.max_position_embeddings else max_seq_len
         )
         self.max_seq_len = max_seq_len
+        self.rope_scaling = getattr(self.config, "rope_scaling", None)
         self.rotary_emb = self._init_rope()
 
     def _init_rope(self):
-        if self.config.rope_scaling is None:
+        if self.rope_scaling is None:
             rotary_emb = RotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.config.rope_theta,
             )
         else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling["factor"]
+            scaling_type = self.rope_scaling["type"]
+            scaling_factor = self.rope_scaling["factor"]
             if scaling_type == "linear":
                 rotary_emb = LinearScalingRotaryEmbedding(
                     self.head_dim,
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py CHANGED
@@ -20,18 +20,22 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+import glob
 import logging
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from abc import ABC
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import rebel  # noqa: F401
 import torch  # noqa: F401
-from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
+from safetensors.torch import load_file
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import no_init_weights
 
 from ....modeling_base import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
+from ...utils.rbln_quantization import replace_quantized_linear_layers
 
 
 logger = logging.getLogger(__name__)
@@ -44,6 +48,12 @@ if TYPE_CHECKING:
         PretrainedConfig,
     )
 
+SUPPORTED_QUANTIZATIONS = {
+    "rbln": [
+        "w4a16",
+    ],
+}
+
 
 class RBLNRuntimeModel(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
@@ -78,26 +88,98 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         self.decoder = RBLNRuntimeModel(runtime=self.model[1], main_input_name="input_ids")
 
     @classmethod
-    @abstractmethod
-    def wrapping_torch_model(self, model: "PreTrainedModel", rbln_max_seq_len: int):
-        pass
+    def get_quantized_model(
+        cls,
+        model_id: str,
+        use_auth_token: Optional[Union[bool, str]] = None,
+        revision: Optional[str] = None,
+        force_download: bool = False,
+        cache_dir: Optional[str] = None,
+        subfolder: str = "",
+        local_files_only: bool = False,
+        trust_remote_code: bool = False,
+        rbln_config_kwargs: Optional[Dict[str, Any]] = None,
+        rbln_constructor_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        kwargs = cls.update_kwargs(kwargs)
+
+        config = AutoConfig.from_pretrained(
+            model_id,
+            use_auth_token=use_auth_token,
+            revision=revision,
+            force_download=force_download,
+            cache_dir=cache_dir,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+
+        with no_init_weights():
+            model = AutoModelForCausalLM.from_config(config)
+        replace_quantized_linear_layers(model)
+
+        state_dict = {}
+        for safetensor_file in glob.glob(f"{model_id}/*.safetensors"):
+            partial_state_dict = load_file(safetensor_file)
+            state_dict.update(partial_state_dict)
+
+        n_layer = kwargs.get("num_hidden_layers", None)
+        if n_layer is not None:
+            keys_to_delete = []
+            for key in state_dict.keys():
+                parts = key.split(".")
+                if len(parts) > 2 and parts[2].isdigit():
+                    layer_num = int(parts[2])
+                    if layer_num >= n_layer:
+                        keys_to_delete.append(key)
+
+            for key in keys_to_delete:
+                del state_dict[key]
+
+        model.load_state_dict(state_dict)
+        return model
+
+    @classmethod
+    def get_pytorch_model(
+        cls,
+        *args,
+        **kwargs,
+    ) -> "PreTrainedModel":
+        rbln_config_kwargs = kwargs.get("rbln_config_kwargs", {})
+        rbln_quantization = rbln_config_kwargs.get("rbln_quantization", None)
+
+        if rbln_quantization is not None and rbln_quantization["format"] == "rbln":
+            model = cls.get_quantized_model(*args, **kwargs)
+        else:
+            model = super().get_pytorch_model(*args, **kwargs)
+
+        return model
 
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
-        wrapped_model = cls.wrapping_torch_model(model, rbln_config.meta["rbln_max_seq_len"])
+        wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
 
         prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
         dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
 
-        prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
-        dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=4)
-
-        batch_index = 3
-        dec_example_inputs[batch_index].fill_(-1)  # fill batch_position -1 to indicate it is decoder.
-
-        prefill_scripted_model = torch.jit.trace(wrapped_model, prefill_example_inputs, check_trace=False)
-        dec_scripted_model = torch.jit.trace(wrapped_model, dec_example_inputs, check_trace=False)
+        def get_scripted_model():
+            # This function is nested to dealloc the example inputs before compilation.
+            prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
+            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=4)
+
+            batch_index = 3
+            dec_example_inputs[batch_index].fill_(-1)  # fill batch_position -1 to indicate it is decoder.
+
+            prefill_scripted_model = torch.jit.trace(
+                wrapped_model, prefill_example_inputs, check_trace=False, _store_inputs=False
+            )
+            dec_scripted_model = torch.jit.trace(
+                wrapped_model, dec_example_inputs, check_trace=False, _store_inputs=False
            )
+            return prefill_scripted_model, dec_scripted_model
+
+        prefill_scripted_model, dec_scripted_model = get_scripted_model()
 
         prefill_ir = rebel.torchscript_to_ir(
             prefill_scripted_model,
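
Note the nested get_scripted_model helper: as its own comment says, it exists so the dummy example tensors are deallocated when the helper returns, before the memory-hungry compilation of the traced graphs begins. The general pattern as a standalone sketch (names are illustrative):

    import torch

    def build_traced(model, make_example_inputs):
        def _trace():
            example_inputs = make_example_inputs()  # large dummy tensors live only in this scope
            return torch.jit.trace(model, example_inputs, check_trace=False)

        traced = _trace()  # example inputs are released once _trace returns
        # heavy compilation of `traced` can now run without the dummy tensors resident
        return traced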
@@ -133,28 +215,44 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
         model_config: "PretrainedConfig",
         rbln_max_seq_len: Optional[int] = None,
         rbln_batch_size: Optional[int] = None,
+        rbln_quantization: Optional[Dict[str, str]] = None,
         **kwargs,
     ) -> RBLNConfig:
         meta = {}
 
         prefill_chunk_size = 128
         if rbln_max_seq_len is None:
-            rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None)
+            rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None) or getattr(
+                model_config, "n_positions", None
+            )
+        if rbln_max_seq_len is None:
+            raise ValueError("`rbln_max_seq_len` should be specified.")
         rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
 
         meta["rbln_max_seq_len"] = rbln_max_seq_len
         meta["rbln_batch_size"] = rbln_batch_size
         meta["rbln_prefill_chunk_size"] = prefill_chunk_size
 
+        num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
+        num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
+        num_hidden_layers = getattr(model_config, "n_layer", None) or getattr(model_config, "num_hidden_layers")
+        head_dim = getattr(model_config, "head_dim", None) or model_config.hidden_size // num_attention_heads
+
+        if rbln_quantization is not None:
+            q_format = rbln_quantization.get("format", None)
+            q_precision = rbln_quantization.get("precision", None)
+
+            if q_format not in SUPPORTED_QUANTIZATIONS.keys() or q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
+                raise ValueError(
+                    f'rbln_quantization="{rbln_quantization}" is not a supported quantization format or precesion, '
+                    f"Possible: {SUPPORTED_QUANTIZATIONS}"
+                )
+            meta["rbln_quantization"] = rbln_quantization
+
         def get_input_info(
             batch_size,
             query_length,
         ):
-            head_dim = (
-                model_config.head_dim
-                if hasattr(model_config, "head_dim")
-                else model_config.hidden_size // model_config.num_attention_heads
-            )
             input_info = [
                 ("input_ids", [batch_size, query_length], "int64"),
                 ("attention_mask", [batch_size, 1, query_length, rbln_max_seq_len], "int64"),
@@ -172,13 +270,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
                         f"past_key_values_{i}",
                         [
                             rbln_batch_size,
-                            model_config.num_key_value_heads,
+                            num_key_value_heads,
                             rbln_max_seq_len,
                             head_dim,
                         ],
                         "float32",
                     )
-                    for i in range(model_config.num_hidden_layers * 2)
+                    for i in range(num_hidden_layers * 2)
                 ]
             )

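Together with the new utils/rbln_quantization.py (file 25 above), this lets decoder-only models load pre-quantized checkpoints; per SUPPORTED_QUANTIZATIONS, only format "rbln" with precision "w4a16" passes validation, and anything else raises the ValueError above. A hedged sketch (the local path is illustrative; get_quantized_model globs <model_id>/*.safetensors, so a local directory of quantized weights is assumed):

    from optimum.rbln import RBLNLlamaForCausalLM

    model = RBLNLlamaForCausalLM.from_pretrained(
        "./llama2-7b-w4a16",  # local dir containing pre-quantized *.safetensors
        export=True,
        rbln_max_seq_len=4096,
        rbln_quantization={"format": "rbln", "precision": "w4a16"},
    )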
@@ -295,6 +393,20 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
             raise RuntimeError(
                 f"Invalid batch_idx ({batch_idx}). It must be a non-null value less than the batch size ({self.batch_size})."
             )
+
+        out_buffers = [
+            torch.empty(
+                size=[
+                    1,
+                    self.prefill_chunk_size,
+                    self.config.vocab_size,
+                ],
+                dtype=torch.float32,
+                device="cpu",
+            ),
+            torch.empty(size=[], dtype=torch.int16, device="cpu"),
+        ]
+
         query_length = input_ids.shape[1]
         attention_mask = self.prefill_attention_mask.clone()
         for step in range(0, query_length, self.prefill_chunk_size):
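
The prefill loop now hands the runtime a pair of preallocated CPU output buffers (chunk logits plus a scalar int16 output) instead of allocating fresh tensors on every chunk. The buffer-reuse idea in plain PyTorch, as an illustrative analogy only (torch's out= convention standing in for the runtime call):

    import torch

    hidden = torch.randn(1, 128, 512)
    lm_head = torch.randn(512, 32000)
    logits_buf = torch.empty(1, 128, 32000)            # allocated once, outside the loop

    for _ in range(4):                                 # e.g. one iteration per prefill chunk
        torch.matmul(hidden, lm_head, out=logits_buf)  # writes into the same buffer each step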
@@ -314,7 +426,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
 
             sliced_input_ids = input_ids[:, step : step + self.prefill_chunk_size]
             sliced_cache_positions = cache_position[:, step : step + self.prefill_chunk_size]
-            attention_mask[:, :, :, :step] = 1
+
+            if step >= self.prefill_chunk_size:
+                attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
             attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
 
             logits, _ = self.prefill_decoder(
@@ -322,6 +436,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
                 attention_mask.contiguous(),
                 sliced_cache_positions.contiguous(),
                 torch.tensor(batch_idx, dtype=torch.int16),
+                out=out_buffers,
             )
             logits = logits[:, query_length % self.prefill_chunk_size - 1].unsqueeze(1)

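The mask update is also incremental now: because attention_mask persists across iterations, each step only needs to fully unmask the chunk completed in the previous iteration instead of rewriting every column below step; the cumulative result is unchanged. A toy trace of the pattern (chunk size 2, sequence length 6; purely illustrative, not the library's API):

    import torch

    chunk = 2
    causal = torch.tril(torch.ones(chunk, chunk))
    mask = torch.zeros(1, 1, chunk, 6)
    for step in range(0, 6, chunk):
        if step >= chunk:
            mask[:, :, :, step - chunk : step] = 1   # finalize the previous chunk
        mask[:, :, :, step : step + chunk] = causal  # causal pattern within the current chunk
        print(f"step {step}:\n{mask[0, 0]}")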
optimum/rbln/transformers/models/gemma/gemma_architecture.py CHANGED
@@ -39,9 +39,16 @@ from ...models.decoderonly import (
 class GemmaWrapper(DecoderOnlyWrapper):
     def get_forward_dict(self):
         forward_dict = {}
-        forward_dict.update({"wrapper": GemmaModel.forward, "model": DecoderOnlyDecoderLayer.forward, "decoder_layer": DecoderOnlyAttention.forward,})
+        forward_dict.update(
+            {
+                "wrapper": GemmaModel.forward,
+                "model": DecoderOnlyDecoderLayer.forward,
+                "decoder_layer": DecoderOnlyAttention.forward,
+            }
+        )
         return forward_dict
 
+
 class GemmaModel:
     def forward(
         self,
@@ -54,7 +61,7 @@ class GemmaModel:
         use_cache: Optional[bool] = True,
         output_attentions: Optional[bool] = False,
         output_hidden_states: Optional[bool] = False,
-        forward_dict : Optional[Dict[str, classmethod]] = None,
+        forward_dict: Optional[Dict[str, classmethod]] = None,
         rotary_pos_emb=None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         # embed positions
@@ -89,7 +96,7 @@ class GemmaModel:
             batch_ids=batch_ids,
             cos=cos,
             sin=sin,
-            forward_dict=forward_dict
+            forward_dict=forward_dict,
         )
 
         hidden_states = layer_outputs[0]