optimum-rbln 0.1.12__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +27 -13
- optimum/rbln/__version__.py +16 -1
- optimum/rbln/diffusers/__init__.py +22 -2
- optimum/rbln/diffusers/models/__init__.py +34 -3
- optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
- optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +66 -111
- optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
- optimum/rbln/diffusers/models/controlnet.py +85 -65
- optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
- optimum/rbln/diffusers/models/unets/__init__.py +24 -0
- optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +129 -163
- optimum/rbln/diffusers/pipelines/__init__.py +60 -12
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -25
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -190
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -191
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -192
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -110
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -118
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +18 -128
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -131
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
- optimum/rbln/modeling.py +572 -0
- optimum/rbln/modeling_alias.py +1 -1
- optimum/rbln/modeling_base.py +176 -763
- optimum/rbln/modeling_diffusers.py +329 -0
- optimum/rbln/transformers/__init__.py +2 -2
- optimum/rbln/transformers/cache_utils.py +5 -9
- optimum/rbln/transformers/modeling_rope_utils.py +283 -0
- optimum/rbln/transformers/models/__init__.py +80 -31
- optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
- optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
- optimum/rbln/transformers/models/clip/modeling_clip.py +8 -34
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -5
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +779 -361
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +83 -142
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +64 -39
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +6 -29
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +31 -92
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +6 -31
- optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +29 -83
- optimum/rbln/transformers/models/midm/midm_architecture.py +88 -253
- optimum/rbln/transformers/models/midm/modeling_midm.py +8 -33
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
- optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
- optimum/rbln/transformers/models/phi/phi_architecture.py +61 -345
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
- optimum/rbln/transformers/models/t5/__init__.py +1 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +157 -6
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
- optimum/rbln/transformers/utils/rbln_quantization.py +128 -5
- optimum/rbln/utils/decorator_utils.py +59 -0
- optimum/rbln/utils/hub.py +131 -0
- optimum/rbln/utils/import_utils.py +21 -0
- optimum/rbln/utils/model_utils.py +53 -0
- optimum/rbln/utils/runtime_utils.py +5 -5
- optimum/rbln/utils/submodule.py +114 -0
- optimum/rbln/utils/timer_utils.py +2 -2
- optimum_rbln-0.1.15.dist-info/METADATA +106 -0
- optimum_rbln-0.1.15.dist-info/RECORD +110 -0
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/generation/streamers.py +0 -139
- optimum/rbln/transformers/generation/utils.py +0 -397
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
- optimum_rbln-0.1.12.dist-info/METADATA +0 -119
- optimum_rbln-0.1.12.dist-info/RECORD +0 -103
- optimum_rbln-0.1.12.dist-info/entry_points.txt +0 -4
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
`optimum/rbln/transformers/models/t5/modeling_t5.py`:

```diff
@@ -22,12 +22,23 @@
 # from Rebellions Inc.
 
 import inspect
-from typing import TYPE_CHECKING, Any, Callable
-
-
-
-from
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+import transformers
+from transformers import (
+    AutoModelForTextEncoding,
+    PretrainedConfig,
+    T5EncoderModel,
+    T5ForConditionalGeneration,
+)
+from transformers.modeling_outputs import BaseModelOutput
+
+from ....modeling import RBLNModel
+from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....modeling_diffusers import RBLNDiffusionMixin
 from ....utils.logging import get_logger
+from ....utils.runtime_utils import RBLNPytorchRuntime
 from ...models.seq2seq import RBLNModelForSeq2SeqLM
 from .t5_architecture import T5Wrapper
 
@@ -35,7 +46,147 @@ from .t5_architecture import T5Wrapper
 logger = get_logger()
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
+
+
+class RBLNRuntimeModel(RBLNPytorchRuntime):
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.FloatTensor,
+        head_mask: torch.FloatTensor,
+        inputs_embeds: torch.FloatTensor,
+        **kwargs,
+    ):
+        return super().forward(
+            input_ids,
+            attention_mask,
+            head_mask,
+            inputs_embeds,
+            **kwargs,
+        )
+
+
+class T5EncoderWrapper(torch.nn.Module):
+    def __init__(self, model: "T5EncoderModel") -> None:
+        super().__init__()
+        self.model = model
+
+    def forward(self, *args, **kwargs):
+        kwargs.pop("return_dict", None)
+        return self.model(*args, **kwargs, return_dict=False)
+
+
+class RBLNT5EncoderModel(RBLNModel):
+    auto_model_class = AutoModelForTextEncoding
+    rbln_model_input_names = ["input_ids", "attention_mask"]
+
+    def __post_init__(self, **kwargs):
+        self.model = RBLNRuntimeModel(runtime=self.model[0])
+
+    @classmethod
+    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+        return T5EncoderWrapper(model)
+
+    @classmethod
+    def update_rbln_config_using_pipe(cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]) -> Dict[str, Any]:
+        batch_size = rbln_config.get("batch_size", 1)
+        max_sequence_length = rbln_config.get("max_sequence_length", 256)
+        model_input_names = ["input_ids"]
+
+        rbln_config.update(
+            {
+                "batch_size": batch_size,
+                "max_seq_len": max_sequence_length,
+                "model_input_names": model_input_names,
+            }
+        )
+
+        return rbln_config
+
+    @classmethod
+    def _get_rbln_config(
+        cls,
+        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+        model_config: Optional["PretrainedConfig"] = None,
+        rbln_kwargs: Dict[str, Any] = {},
+    ) -> RBLNConfig:
+        rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
+        rbln_model_input_names = rbln_kwargs.get("model_input_names", None)
+        rbln_batch_size = rbln_kwargs.get("batch_size", None)
+
+        max_position_embeddings = getattr(model_config, "n_positions", None)
+
+        if rbln_max_seq_len is None:
+            rbln_max_seq_len = max_position_embeddings
+            if rbln_max_seq_len is None:
+                for tokenizer in preprocessors:
+                    if hasattr(tokenizer, "model_max_length"):
+                        rbln_max_seq_len = tokenizer.model_max_length
+                        break
+                if rbln_max_seq_len is None:
+                    raise ValueError("`rbln_max_seq_len` should be specified!")
+
+        if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
+            raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")
+
+        if rbln_model_input_names is None:
+            for tokenizer in preprocessors:
+                if hasattr(tokenizer, "model_input_names"):
+                    rbln_model_input_names = tokenizer.model_input_names
+                    break
+            if rbln_model_input_names is None and hasattr(cls, "rbln_model_input_names"):
+                rbln_model_input_names = cls.rbln_model_input_names
+            elif rbln_model_input_names is None and hasattr(cls, "rbln_model_input_names") is False:
+                original_model_class = getattr(transformers, model_config.architectures[0])
+                input_names_order = inspect.signature(original_model_class.forward).parameters.keys()
+                raise ValueError(
+                    "Specify the model input names obtained by the tokenizer via `rbln_model_input_names`, "
+                    f"and be sure to make the order of the inputs same as T5EncoderModel forward() arguments like ({list(input_names_order)})"
+                )
+
+        if rbln_batch_size is None:
+            rbln_batch_size = 1
+
+        input_info = [
+            (model_input_name, [rbln_batch_size, rbln_max_seq_len], "int64")
+            for model_input_name in rbln_model_input_names
+        ]
+
+        rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+
+        rbln_config = RBLNConfig(
+            rbln_cls=cls.__name__,
+            compile_cfgs=[rbln_compile_config],
+            rbln_kwargs=rbln_kwargs,
+        )
+
+        rbln_config.model_cfg.update({"max_seq_len": rbln_max_seq_len})
+        return rbln_config
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        encoder_outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if not return_dict:
+            return (encoder_outputs,)
+        else:
+            return BaseModelOutput(last_hidden_state=encoder_outputs)
 
 
 class RBLNT5ForConditionalGeneration(RBLNModelForSeq2SeqLM):
```
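Usage sketch (not part of the diff): the new `RBLNT5EncoderModel` compiles the encoder to a static shape, with `_get_rbln_config` resolving `max_seq_len` from the `rbln_max_seq_len` kwarg, the config's `n_positions`, or the tokenizer's `model_max_length`. Assuming the `export=True` / `rbln_*` keyword convention used elsewhere in optimum-rbln; the checkpoint and sizes below are illustrative:

```python
from transformers import AutoTokenizer

from optimum.rbln import RBLNT5EncoderModel

# Compile the T5 encoder; batch size and sequence length become static
# dimensions of the compiled RBLN graph.
model = RBLNT5EncoderModel.from_pretrained(
    "t5-small",            # illustrative checkpoint
    export=True,           # compile from the PyTorch weights
    rbln_max_seq_len=512,  # must not exceed the config's n_positions
    rbln_batch_size=1,
)

tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Inputs must be padded to the compiled length, since the graph shape is fixed.
inputs = tokenizer("Hello world", padding="max_length", max_length=512, return_tensors="pt")
output = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, return_dict=True)
print(output.last_hidden_state.shape)
```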
`optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py`:

```diff
@@ -28,7 +28,7 @@ import torch
 from transformers import AutoModelForMaskedLM, PretrainedConfig, Wav2Vec2ForCTC
 from transformers.modeling_outputs import CausalLMOutput
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 
 
```
`optimum/rbln/transformers/models/whisper/modeling_whisper.py`:

```diff
@@ -36,7 +36,7 @@ from transformers import (
 )
 from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from .generation_whisper import RBLNWhisperGenerationMixin
@@ -102,7 +102,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
 class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin):
     """
     The Whisper Model with a language modeling head. Can be used for automatic speech recognition.
-    This model inherits from [`
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
     A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
     It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
```
`optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py`:

```diff
@@ -22,12 +22,12 @@
 # from Rebellions Inc.
 
 import logging
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
-from transformers import PretrainedConfig
+from transformers import PretrainedConfig
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 
 
@@ -38,38 +38,6 @@ if TYPE_CHECKING:
 
 
 class RBLNXLMRobertaModel(RBLNModel):
-    original_model_class = XLMRobertaModel
-    original_config_class = XLMRobertaConfig
-
-    @classmethod
-    def get_pytorch_model(
-        cls,
-        model_id: str,
-        use_auth_token: Optional[Union[bool, str]] = None,
-        revision: Optional[str] = None,
-        force_download: bool = False,
-        cache_dir: Optional[str] = None,
-        subfolder: str = "",
-        local_files_only: bool = False,
-        trust_remote_code: bool = False,
-        rbln_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> "PreTrainedModel":
-        model: "PreTrainedModel" = super().get_pytorch_model(
-            model_id=model_id,
-            use_auth_token=use_auth_token,
-            revision=revision,
-            force_download=force_download,
-            cache_dir=cache_dir,
-            subfolder=subfolder,
-            local_files_only=local_files_only,
-            trust_remote_code=trust_remote_code,
-            rbln_kwargs=rbln_kwargs,
-            library_name="transformers",
-        )
-
-        return model
-
     @classmethod
     def _get_rbln_config(
         cls,
```
`optimum/rbln/transformers/utils/rbln_quantization.py`:

```diff
@@ -22,21 +22,117 @@
 # from Rebellions Inc.
 
 
-
+import functools
+import glob
+import os
+from typing import Any, Callable, Dict, Optional
 
 import torch
+from safetensors.torch import load_file
 from torch.nn import Linear, Parameter
 from torch.nn import functional as F
 
+from ...utils.logging import get_logger
+
+
+logger = get_logger()
+
+SUPPORTED_QUANTIZATIONS: Dict[str, list[str]] = {
+    "rbln": ["w4a16"],
+}
+
+
+class QuantizationManager:
+    # The RBLN_QUANT_BITS environment variable defines the precision of each layer during the graph compilation process.
+    # It specifies the quantization bit depth. For instance, setting RBLN_QUANT_BITS=4 will apply 4-bit precision for quantization.
+    RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"
+
+    @staticmethod
+    def _raise_invalid_config_error(
+        key: str, value: str, valid_values: list[str], context: Optional[str] = None
+    ) -> None:
+        context_info = f" for {context}" if context else ""
+        valid_values_str = ", ".join(valid_values)
+        raise ValueError(f"Invalid {key}: {value}{context_info}. " f"Supported values are: {valid_values_str}")
+
+    @staticmethod
+    def validate_quantization_config(quantize_config: Optional[dict]) -> Optional[dict]:
+        if not quantize_config:
+            return None
+
+        q_format = quantize_config.get("format")
+        q_precision = quantize_config.get("precision")
+
+        if q_format not in SUPPORTED_QUANTIZATIONS:
+            QuantizationManager._raise_invalid_config_error(
+                "quantization format", q_format, list(SUPPORTED_QUANTIZATIONS.keys())
+            )
+
+        if q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
+            QuantizationManager._raise_invalid_config_error(
+                "precision", q_precision, SUPPORTED_QUANTIZATIONS[q_format], q_format
+            )
+
+        return quantize_config
+
+    @classmethod
+    def _set_env_var(cls, name: str, value: str) -> None:
+        os.environ[name] = value
+
+    @classmethod
+    def _unset_env_var(cls, name: str) -> None:
+        os.environ.pop(name, None)
+
+    @classmethod
+    def set_quantization_env(cls, quantize_config: Optional[dict]) -> Optional[str]:
+        quantize_config = cls.validate_quantization_config(quantize_config)
+        if quantize_config:
+            q_precision: str = quantize_config["precision"]
+            quant_bits = q_precision.split("w")[1].split("a")[0]
+            cls._set_env_var(cls.RBLN_QUANT_BITS_ENV, quant_bits)
+            return cls.RBLN_QUANT_BITS_ENV
+        return None
+
+    @classmethod
+    def reset_quantization_env(cls, env_var_name: Optional[str]) -> None:
+        if env_var_name:
+            cls._unset_env_var(env_var_name)
+
+    @classmethod
+    def with_quantization_env(cls, func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            quantize_config = kwargs.get("quantize_config")
+            quantize_env_var = cls.set_quantization_env(quantize_config)
+            try:
+                return func(*args, **kwargs)
+            finally:
+                cls.reset_quantization_env(quantize_env_var)
+
+        return wrapper
+
 
 # Constants
 QUANTIZED_WEIGHTS = {
-    "q_proj",
-    "
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
 }
 
 
-def
+def prepare_model_for_quantization(model: torch.nn.Module, model_id: str, n_layer: Optional[int] = None) -> None:
+    """
+    Prepare the model for quantization by updating specified linear layers to quantized (qlinear) layers.
+    """
+    update_layers_to_quantize(model)
+    load_weights(model, model_id, n_layer)
+
+
+def update_layers_to_quantize(module: torch.nn.Module) -> None:
     """
     Updates specified linear layers to quantized (qlinear) layers in the given module.
     """
@@ -49,7 +145,33 @@ def update_layers_to_quantized(module: torch.nn.Module) -> None:
             processed_layers.append(name)
 
     if processed_layers:
-
+        logger.debug(f"Updated the following linear layers to quantized layers:\n {{{', '.join(processed_layers)}}}")
+
+
+def load_weights(model, model_id, n_layer=None):
+    """
+    Load safetensor file data directly into the model, filtering by layer if n_layer is provided.
+    """
+
+    model_params = dict(model.named_parameters(recurse=True))
+    model_buffers = dict(model.named_buffers(recurse=True))
+    safetensor_files = glob.glob(f"{model_id}/*.safetensors")
+
+    target_layers = list(range(n_layer)) if n_layer is not None else None
+
+    for safetensor_file in safetensor_files:
+        file_data = load_file(safetensor_file)
+        for key, value in file_data.items():
+            if target_layers is not None:
+                parts = key.split(".")
+
+                if len(parts) > 2 and parts[2].isdigit() and (int(parts[2]) not in target_layers):
+                    continue
+
+            if key in model_params:
+                model_params[key].data.copy_(value)
+            elif key in model_buffers:
+                model_buffers[key].data.copy_(value)
 
 
 def is_target_for_qlinear_replacement(layer_name: str, layer: torch.nn.Module) -> bool:
@@ -81,6 +203,7 @@ def create_qlinear(layer: Linear) -> Linear:
     """
     Converts a standard linear layer to a quantized linear (qlinear) layer with a custom forward pass.
     """
+
     def qlinear_forward(self, inputs: torch.Tensor) -> torch.Tensor:
         if inputs.dtype != self.scales.dtype:
             raise TypeError(f"Expected input dtype {self.scales.dtype}, but got {inputs.dtype}")
```
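A small sketch of the env-var lifecycle this adds (the `compile_fn` call site is hypothetical): `with_quantization_env` validates the `quantize_config` keyword, exports `RBLN_QUANT_BITS` with the digits between `w` and `a` in the precision string (`w4a16` → `"4"`) for the duration of the wrapped call, and unsets it afterwards even on error:

```python
import os

from optimum.rbln.transformers.utils.rbln_quantization import QuantizationManager

# Only {"format": "rbln", "precision": "w4a16"} passes validation in this
# version; anything else raises ValueError via _raise_invalid_config_error.
config = QuantizationManager.validate_quantization_config({"format": "rbln", "precision": "w4a16"})


@QuantizationManager.with_quantization_env
def compile_fn(*args, quantize_config=None, **kwargs):
    # While the wrapped call runs, the compiler can read the bit depth here.
    return os.environ[QuantizationManager.RBLN_QUANT_BITS_ENV]


assert compile_fn(quantize_config=config) == "4"
assert QuantizationManager.RBLN_QUANT_BITS_ENV not in os.environ  # reset afterwards
```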
`optimum/rbln/utils/decorator_utils.py` (new file):

```diff
@@ -0,0 +1,59 @@
+from functools import wraps
+
+from .logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def remove_compile_time_kwargs(func):
+    """
+    Decorator to handle compile-time parameters during inference.
+
+    For RBLN-optimized pipelines, several parameters must be determined during compilation
+    and cannot be modified during inference. This decorator:
+    1. Removes and warns about LoRA scale in cross_attention_kwargs
+    2. Removes and warns about image dimension parameters (height, width)
+
+    Args:
+        func: The pipeline's __call__ method to be wrapped
+    """
+
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        height_exists = "height" in kwargs and kwargs["height"] is not None
+        width_exists = "width" in kwargs and kwargs["width"] is not None
+        compiled_image_size = self.vae.image_size
+        if height_exists or width_exists:
+            if kwargs["height"] == compiled_image_size[0] and kwargs["width"] == compiled_image_size[1]:
+                pass
+            else:
+                logger.warning(
+                    "Image dimension parameters (`height`, `width`) will be ignored during inference. "
+                    "Image dimensions must be specified during model compilation using from_pretrained()."
+                )
+            kwargs.pop("width", None)
+            kwargs.pop("height", None)
+
+        if "cross_attention_kwargs" in kwargs:
+            cross_attention_kwargs = kwargs.get("cross_attention_kwargs")
+            if not cross_attention_kwargs:
+                return func(self, *args, **kwargs)
+
+            has_scale = "scale" in cross_attention_kwargs
+            if has_scale:
+                logger.warning(
+                    "LoRA scale in cross_attention_kwargs will be ignored during inference. "
+                    "To adjust LoRA scale, specify it during model compilation using from_pretrained()."
+                )
+
+                # If scale is the only key, set to None
+                # Otherwise, remove scale and preserve other settings
+                if len(cross_attention_kwargs) == 1:
+                    kwargs["cross_attention_kwargs"] = None
+                else:
+                    kwargs["cross_attention_kwargs"].pop("scale")
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
```
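A self-contained sketch of the decorator's effect on a call (the pipeline and VAE classes below are stand-ins invented for illustration; the wrapper only reads the `vae.image_size` attribute):

```python
from optimum.rbln.utils.decorator_utils import remove_compile_time_kwargs


class FakeVAE:
    image_size = (512, 512)  # stand-in for the compiled height/width


class FakePipeline:
    vae = FakeVAE()

    @remove_compile_time_kwargs
    def __call__(self, **kwargs):
        return kwargs


out = FakePipeline()(height=768, width=768, cross_attention_kwargs={"scale": 0.5})
# height/width were popped after a warning (768x768 differs from the compiled
# 512x512), and cross_attention_kwargs collapsed to None because "scale" was
# its only key.
assert "height" not in out and "width" not in out
assert out["cross_attention_kwargs"] is None
```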
`optimum/rbln/utils/hub.py` (new file):

```diff
@@ -0,0 +1,131 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+import os
+from pathlib import Path
+from typing import List, Optional, Union
+
+from huggingface_hub import HfApi, HfFolder, hf_hub_download
+
+
+class PushToHubMixin:
+    def push_to_hub(
+        self,
+        save_directory: str,
+        repository_id: str,
+        private: Optional[bool] = None,
+        use_auth_token: Union[bool, str] = True,
+    ) -> str:
+        huggingface_token = _get_huggingface_token(use_auth_token)
+        api = HfApi()
+
+        api.create_repo(
+            token=huggingface_token,
+            repo_id=repository_id,
+            exist_ok=True,
+            private=private,
+        )
+        for path, subdirs, files in os.walk(save_directory):
+            for name in files:
+                local_file_path = os.path.join(path, name)
+                _, hub_file_path = os.path.split(local_file_path)
+                # FIXME: when huggingface_hub fixes the return of upload_file
+                try:
+                    api.upload_file(
+                        token=huggingface_token,
+                        repo_id=f"{repository_id}",
+                        path_or_fileobj=os.path.join(os.getcwd(), local_file_path),
+                        path_in_repo=hub_file_path,
+                    )
+                except KeyError:
+                    pass
+                except NameError:
+                    pass
+
+
+def pull_compiled_model_from_hub(
+    model_id: Union[str, Path],
+    subfolder: str,
+    use_auth_token: Optional[Union[bool, str]],
+    revision: Optional[str],
+    cache_dir: Optional[str],
+    force_download: bool,
+    local_files_only: bool,
+) -> Path:
+    """Pull model files from the Hugging Face Hub."""
+    huggingface_token = _get_huggingface_token(use_auth_token)
+    repo_files = list(
+        map(
+            Path,
+            HfApi().list_repo_files(model_id, revision=revision, token=huggingface_token),
+        )
+    )
+
+    pattern_rbln = "*.rbln" if subfolder == "" else f"{subfolder}/*.rbln"
+    rbln_files = [p for p in repo_files if p.match(pattern_rbln)]
+
+    pattern_config = "rbln_config.json" if subfolder == "" else f"{subfolder}/rbln_config.json"
+    rbln_config_filenames = [p for p in repo_files if p.match(pattern_config)]
+
+    validate_files(rbln_files, rbln_config_filenames, f"repository {model_id}")
+
+    filenames = [str(path) for path in repo_files]
+
+    for filename in filenames:
+        rbln_config_cache_path = hf_hub_download(
+            repo_id=model_id,
+            filename=filename,
+            subfolder=subfolder,
+            use_auth_token=use_auth_token,
+            revision=revision,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            local_files_only=local_files_only,
+        )
+
+    return Path(rbln_config_cache_path).parent
+
+
+def validate_files(
+    files: List[Path],
+    config_files: List[Path],
+    location: str,
+):
+    """Validate the presence and count of required files."""
+    if len(files) == 0:
+        raise FileNotFoundError(f"Could not find any rbln model file in {location}")
+
+    if len(config_files) == 0:
+        raise FileNotFoundError(f"Could not find `rbln_config.json` file in {location}")
+
+    if len(config_files) > 1:
+        raise FileExistsError(f"Multiple rbln_config.json files found in {location}. This is not expected.")
+
+
+def _get_huggingface_token(use_auth_token: Union[bool, str]) -> str:
+    if isinstance(use_auth_token, str):
+        return use_auth_token
+    elif use_auth_token:
+        return HfFolder.get_token()
+    else:
+        raise ValueError("`use_auth_token` must be provided to interact with the Hugging Face Hub.")
```
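A brief usage sketch of the new helpers (repository id and directory are placeholders): `push_to_hub` walks `save_directory` and uploads each file, while `pull_compiled_model_from_hub` downloads a repo after checking that it contains at least one `*.rbln` artifact and exactly one `rbln_config.json`:

```python
from optimum.rbln.utils.hub import PushToHubMixin, pull_compiled_model_from_hub


class CompiledModel(PushToHubMixin):  # any class can mix this in
    pass


# Upload a locally compiled model directory. use_auth_token=True resolves the
# token via HfFolder.get_token(); a string is used as the token itself.
CompiledModel().push_to_hub(
    save_directory="./compiled_model",
    repository_id="my-org/my-rbln-model",  # placeholder repo id
    private=True,
    use_auth_token=True,
)

# Download it back; returns the local directory containing rbln_config.json.
local_dir = pull_compiled_model_from_hub(
    model_id="my-org/my-rbln-model",
    subfolder="",
    use_auth_token=True,
    revision=None,
    cache_dir=None,
    force_download=False,
    local_files_only=False,
)
```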
`optimum/rbln/utils/import_utils.py`:

```diff
@@ -37,6 +37,27 @@ class VersionCompat:
 
 
 RBLN_VERSION_COMPATS = {
+    "0.1.15": [
+        VersionCompat(
+            package_name="rebel-compiler",
+            min_version="0.6.2",
+            max_version="0.6.3",
+        ),
+    ],
+    "0.1.14": [
+        VersionCompat(
+            package_name="rebel-compiler",
+            min_version="0.6.2",
+            max_version="0.6.3",
+        ),
+    ],
+    "0.1.13": [
+        VersionCompat(
+            package_name="rebel-compiler",
+            min_version="0.6.0",
+            max_version="0.6.2",
+        ),
+    ],
     "0.1.12": [
         VersionCompat(
             package_name="rebel-compiler",
```
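Each entry reads as the rebel-compiler range a given optimum-rbln release was validated against. A sketch of the assumed check, with `min_version` inclusive and `max_version` exclusive (the helper below is illustrative, not the library's own):

```python
from packaging.version import Version


def is_compatible(installed: str, min_version: str, max_version: str) -> bool:
    # Assumed semantics: min_version <= installed < max_version.
    return Version(min_version) <= Version(installed) < Version(max_version)


# optimum-rbln 0.1.15 pairs with rebel-compiler >= 0.6.2, < 0.6.3:
assert is_compatible("0.6.2", "0.6.2", "0.6.3")
assert not is_compatible("0.6.3", "0.6.2", "0.6.3")
# 0.1.13 accepted 0.6.0 and 0.6.1 but not 0.6.2:
assert is_compatible("0.6.1", "0.6.0", "0.6.2")
```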
|