optimum-rbln 0.2.1a4__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +14 -2
- optimum/rbln/__version__.py +9 -4
- optimum/rbln/diffusers/__init__.py +10 -0
- optimum/rbln/diffusers/modeling_diffusers.py +132 -25
- optimum/rbln/diffusers/models/__init__.py +7 -1
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +52 -2
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +159 -0
- optimum/rbln/diffusers/models/transformers/__init__.py +1 -0
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +174 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +57 -14
- optimum/rbln/diffusers/pipelines/__init__.py +10 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +17 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +83 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +22 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +22 -0
- optimum/rbln/modeling_base.py +10 -9
- optimum/rbln/transformers/__init__.py +2 -0
- optimum/rbln/transformers/models/__init__.py +12 -2
- optimum/rbln/transformers/models/clip/__init__.py +6 -1
- optimum/rbln/transformers/models/clip/modeling_clip.py +26 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +3 -1
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +1 -1
- optimum/rbln/utils/import_utils.py +7 -0
- {optimum_rbln-0.2.1a4.dist-info → optimum_rbln-0.7.2.dist-info}/METADATA +1 -1
- {optimum_rbln-0.2.1a4.dist-info → optimum_rbln-0.7.2.dist-info}/RECORD +28 -22
- {optimum_rbln-0.2.1a4.dist-info → optimum_rbln-0.7.2.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.2.1a4.dist-info → optimum_rbln-0.7.2.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/diffusers/models/transformers/prior_transformer.py
ADDED
@@ -0,0 +1,174 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import torch
+from diffusers.models.transformers.prior_transformer import PriorTransformer, PriorTransformerOutput
+from transformers import PretrainedConfig, PreTrainedModel
+
+from ....modeling import RBLNModel
+from ....modeling_config import RBLNCompileConfig, RBLNConfig
+from ....utils.logging import get_logger
+from ....utils.runtime_utils import RBLNPytorchRuntime
+from ...modeling_diffusers import RBLNDiffusionMixin
+
+
+logger = get_logger(__name__)
+
+
+class RBLNRuntimePriorTransformer(RBLNPytorchRuntime):
+    def forward(
+        self, hidden_states, timestep, proj_embedding, encoder_hidden_states, attention_mask, return_dict: bool = True
+    ):
+        predicted_image_embedding = super().forward(
+            hidden_states,
+            timestep,
+            proj_embedding,
+            encoder_hidden_states,
+            attention_mask,
+        )
+        if return_dict:
+            return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
+        else:
+            return (predicted_image_embedding,)
+
+
+class _PriorTransformer(torch.nn.Module):
+    def __init__(self, prior: PriorTransformer):
+        super().__init__()
+        self._prior = prior
+
+    def forward(
+        self,
+        hidden_states,
+        timestep,
+        proj_embedding,
+        encoder_hidden_states,
+        attention_mask,
+        return_dict=True,
+    ):
+        return self._prior.forward(
+            hidden_states,
+            timestep,
+            proj_embedding,
+            encoder_hidden_states,
+            attention_mask,
+            return_dict=False,
+        )
+
+
+class RBLNPriorTransformer(RBLNModel):
+    hf_library_name = "diffusers"
+    auto_model_class = PriorTransformer
+
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
+        self.runtime = RBLNRuntimePriorTransformer(runtime=self.model[0])
+        artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
+        self.clip_mean = artifacts["clip_mean"]
+        self.clip_std = artifacts["clip_std"]
+
+    @classmethod
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
+        return _PriorTransformer(model).eval()
+
+    @classmethod
+    def update_rbln_config_using_pipe(cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]) -> Dict[str, Any]:
+        batch_size = rbln_config.get("batch_size")
+        if not batch_size:
+            do_classifier_free_guidance = rbln_config.get("guidance_scale", 5.0) > 1.0
+            batch_size = 2 if do_classifier_free_guidance else 1
+        else:
+            if rbln_config.get("guidance_scale"):
+                logger.warning(
+                    "guidance_scale is ignored because batch size is explicitly specified. "
+                    "To ensure consistent behavior, consider removing the guidance scale or "
+                    "adjusting the batch size configuration as needed."
+                )
+        embedding_dim = rbln_config.get("embedding_dim", pipe.prior.config.embedding_dim)
+        num_embeddings = rbln_config.get("num_embeddings", pipe.prior.config.num_embeddings)
+
+        rbln_config.update(
+            {
+                "batch_size": batch_size,
+                "embedding_dim": embedding_dim,
+                "num_embeddings": num_embeddings,
+            }
+        )
+
+        return rbln_config
+
+    @classmethod
+    def save_torch_artifacts(
+        cls,
+        model: "PreTrainedModel",
+        save_dir_path: Path,
+        subfolder: str,
+        rbln_config: RBLNConfig,
+    ):
+        save_dict = {}
+        save_dict["clip_mean"] = model.clip_mean
+        save_dict["clip_std"] = model.clip_std
+        torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")
+
+    @classmethod
+    def _get_rbln_config(
+        cls,
+        preprocessors,
+        model_config: PretrainedConfig,
+        rbln_kwargs,
+    ) -> RBLNConfig:
+        batch_size = rbln_kwargs.get("batch_size") or 1
+        embedding_dim = rbln_kwargs.get("embedding_dim") or model_config.embedding_dim
+        num_embeddings = rbln_kwargs.get("num_embeddings") or model_config.num_embeddings
+
+        input_info = [
+            ("hidden_states", [batch_size, embedding_dim], "float32"),
+            ("timestep", [], "float32"),
+            ("proj_embedding", [batch_size, embedding_dim], "float32"),
+            ("encoder_hidden_states", [batch_size, num_embeddings, embedding_dim], "float32"),
+            ("attention_mask", [batch_size, num_embeddings], "float32"),
+        ]
+
+        rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+        rbln_config = RBLNConfig(
+            rbln_cls=cls.__name__,
+            compile_cfgs=[rbln_compile_config],
+            rbln_kwargs=rbln_kwargs,
+        )
+        return rbln_config
+
+    def forward(
+        self,
+        hidden_states,
+        timestep: Union[torch.Tensor, float, int],
+        proj_embedding: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        return_dict: bool = True,
+    ):
+        return self.runtime.forward(
+            hidden_states.contiguous(),
+            timestep.float(),
+            proj_embedding,
+            encoder_hidden_states,
+            attention_mask.float(),
+            return_dict,
+        )
+
+    def post_process_latents(self, prior_latents):
+        prior_latents = (prior_latents * self.clip_std) + self.clip_mean
+        return prior_latents
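For context, here is a minimal sketch (not part of the package) of dummy inputs matching the static input_info that `_get_rbln_config` declares above; the dimension values are illustrative stand-ins for what would normally come from the PriorTransformer config and the chosen batch size.

```python
import torch

# Illustrative values: in practice embedding_dim / num_embeddings come from
# model_config, and batch_size is 2 with classifier-free guidance, else 1.
batch_size, embedding_dim, num_embeddings = 2, 768, 77

dummy_inputs = (
    torch.zeros(batch_size, embedding_dim),                  # hidden_states
    torch.zeros([]),                                         # timestep (scalar, float32)
    torch.zeros(batch_size, embedding_dim),                  # proj_embedding
    torch.zeros(batch_size, num_embeddings, embedding_dim),  # encoder_hidden_states
    torch.ones(batch_size, num_embeddings),                  # attention_mask, passed as float32
)
```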

optimum/rbln/diffusers/models/unets/unet_2d_condition.py
CHANGED
@@ -115,6 +115,29 @@ class _UNet_SDXL(torch.nn.Module):
         return unet_out
 
 
+class _UNet_Kandinsky(torch.nn.Module):
+    def __init__(self, unet: "UNet2DConditionModel"):
+        super().__init__()
+        self.unet = unet
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        image_embeds: torch.Tensor,
+    ) -> torch.Tensor:
+        added_cond_kwargs = {"image_embeds": image_embeds}
+
+        unet_out = self.unet(
+            sample=sample,
+            timestep=timestep,
+            encoder_hidden_states=None,
+            added_cond_kwargs=added_cond_kwargs,
+            return_dict=False,
+        )
+        return unet_out
+
+
 class RBLNUNet2DConditionModel(RBLNModel):
     hf_library_name = "diffusers"
     auto_model_class = UNet2DConditionModel
@@ -138,6 +161,8 @@ class RBLNUNet2DConditionModel(RBLNModel):
     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         if model.config.addition_embed_type == "text_time":
             return _UNet_SDXL(model).eval()
+        elif model.config.addition_embed_type == "image":
+            return _UNet_Kandinsky(model).eval()
         else:
             return _UNet_SD(model).eval()
 
@@ -146,6 +171,7 @@ class RBLNUNet2DConditionModel(RBLNModel):
         cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]
     ) -> Union[int, Tuple[int, int]]:
         image_size = (rbln_config.get("img_height"), rbln_config.get("img_width"))
+        scale_factor = pipe.movq_scale_factor if hasattr(pipe, "movq_scale_factor") else pipe.vae_scale_factor
         if (image_size[0] is None) != (image_size[1] is None):
             raise ValueError("Both image height and image width must be given or not given")
         elif image_size[0] is None and image_size[1] is None:
@@ -153,22 +179,23 @@ class RBLNUNet2DConditionModel(RBLNModel):
                 # In case of img2img, sample size of unet is determined by vae encoder.
                 vae_sample_size = pipe.vae.config.sample_size
                 if isinstance(vae_sample_size, int):
-                    sample_size = vae_sample_size //
+                    sample_size = vae_sample_size // scale_factor
                 else:
                     sample_size = (
-                        vae_sample_size[0] //
-                        vae_sample_size[1] //
+                        vae_sample_size[0] // scale_factor,
+                        vae_sample_size[1] // scale_factor,
                     )
             else:
                 sample_size = pipe.unet.config.sample_size
         else:
-            sample_size = (image_size[0] //
+            sample_size = (image_size[0] // scale_factor, image_size[1] // scale_factor)
 
         return sample_size
 
     @classmethod
     def update_rbln_config_using_pipe(cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]) -> Dict[str, Any]:
         text_model_hidden_size = pipe.text_encoder_2.config.hidden_size if hasattr(pipe, "text_encoder_2") else None
+        image_model_hidden_size = pipe.unet.config.encoder_hid_dim if hasattr(pipe, "unet") else None
 
         batch_size = rbln_config.get("batch_size")
         if not batch_size:
@@ -184,10 +211,12 @@ class RBLNUNet2DConditionModel(RBLNModel):
                     "adjusting the batch size configuration as needed."
                 )
 
+        max_seq_len = pipe.text_encoder.config.max_position_embeddings if hasattr(pipe, "text_encoder") else None
         rbln_config.update(
             {
-                "max_seq_len":
+                "max_seq_len": max_seq_len,
                 "text_model_hidden_size": text_model_hidden_size,
+                "image_model_hidden_size": image_model_hidden_size,
                 "sample_size": cls.get_unet_sample_size(pipe, rbln_config),
                 "batch_size": batch_size,
                 "is_controlnet": "controlnet" in pipe.config.keys(),
@@ -218,15 +247,16 @@ class RBLNUNet2DConditionModel(RBLNModel):
         if isinstance(sample_size, int):
             sample_size = (sample_size, sample_size)
 
-        if max_seq_len is None:
-            raise ValueError("`rbln_max_seq_len` (ex. text_encoder's max_position_embeddings) must be specified.")
-
         input_info = [
            ("sample", [batch_size, model_config.in_channels, sample_size[0], sample_size[1]], "float32"),
            ("timestep", [], "float32"),
-            ("encoder_hidden_states", [batch_size, max_seq_len, model_config.cross_attention_dim], "float32"),
         ]
 
+        if max_seq_len is not None:
+            input_info.append(
+                ("encoder_hidden_states", [batch_size, max_seq_len, model_config.cross_attention_dim], "float32"),
+            )
+
         if is_controlnet:
             # down block addtional residuals
             first_shape = [batch_size, model_config.block_out_channels[0], sample_size[0], sample_size[1]]
@@ -256,11 +286,15 @@ class RBLNUNet2DConditionModel(RBLNModel):
             ]
             input_info.append(("mid_block_additional_residual", shape, "float32"))
 
-        if hasattr(model_config, "addition_embed_type")
-
-
-
-
+        if hasattr(model_config, "addition_embed_type"):
+            if model_config.addition_embed_type == "text_time":
+                rbln_text_model_hidden_size = rbln_kwargs["text_model_hidden_size"]
+                rbln_in_features = model_config.projection_class_embeddings_input_dim
+                input_info.append(("text_embeds", [batch_size, rbln_text_model_hidden_size], "float32"))
+                input_info.append(("time_ids", [batch_size, 6], "float32"))
+            elif model_config.addition_embed_type == "image":
+                rbln_image_model_hidden_size = rbln_kwargs["image_model_hidden_size"]
+                input_info.append(("image_embeds", [batch_size, rbln_image_model_hidden_size], "float32"))
 
         rbln_compile_config = RBLNCompileConfig(input_info=input_info)
 
@@ -323,6 +357,15 @@ class RBLNUNet2DConditionModel(RBLNModel):
                 ),
             )
 
+        if "image_embeds" in added_cond_kwargs:
+            return (
+                super().forward(
+                    sample.contiguous(),
+                    timestep.float(),
+                    **added_cond_kwargs,
+                ),
+            )
+
         return (
             super().forward(
                 sample.contiguous(),
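The get_unet_sample_size change above makes the latent-size computation pipeline-aware: the UNet is compiled for the requested pixel size divided by the pipeline's scale factor (movq_scale_factor for Kandinsky, vae_scale_factor otherwise). A quick arithmetic sketch with illustrative numbers; a factor of 8 is typical for both, but the value is always read from the pipeline at compile time.

```python
img_height, img_width = 768, 768   # requested output resolution (illustrative)
scale_factor = 8                   # assumption: pipe.movq_scale_factor or pipe.vae_scale_factor
sample_size = (img_height // scale_factor, img_width // scale_factor)
assert sample_size == (96, 96)     # static latent resolution the UNet gets compiled for
```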

optimum/rbln/diffusers/pipelines/__init__.py
CHANGED
@@ -25,6 +25,11 @@ _import_structure = {
         "RBLNStableDiffusionXLControlNetImg2ImgPipeline",
         "RBLNStableDiffusionXLControlNetPipeline",
     ],
+    "kandinsky2_2": [
+        "RBLNKandinskyV22InpaintCombinedPipeline",
+        "RBLNKandinskyV22InpaintPipeline",
+        "RBLNKandinskyV22PriorPipeline",
+    ],
     "stable_diffusion": [
         "RBLNStableDiffusionImg2ImgPipeline",
         "RBLNStableDiffusionPipeline",
@@ -49,6 +54,11 @@ if TYPE_CHECKING:
         RBLNStableDiffusionXLControlNetImg2ImgPipeline,
         RBLNStableDiffusionXLControlNetPipeline,
     )
+    from .kandinsky2_2 import (
+        RBLNKandinskyV22InpaintCombinedPipeline,
+        RBLNKandinskyV22InpaintPipeline,
+        RBLNKandinskyV22PriorPipeline,
+    )
     from .stable_diffusion import (
         RBLNStableDiffusionImg2ImgPipeline,
         RBLNStableDiffusionInpaintPipeline,

optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py
ADDED
@@ -0,0 +1,17 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pipeline_kandinsky2_2_combined import RBLNKandinskyV22InpaintCombinedPipeline
+from .pipeline_kandinsky2_2_inpaint import RBLNKandinskyV22InpaintPipeline
+from .pipeline_kandinsky2_2_prior import RBLNKandinskyV22PriorPipeline

optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
ADDED
@@ -0,0 +1,83 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from diffusers import (
+    DDPMScheduler,
+    KandinskyV22InpaintCombinedPipeline,
+    PriorTransformer,
+    UnCLIPScheduler,
+    UNet2DConditionModel,
+    VQModel,
+)
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...modeling_diffusers import RBLNDiffusionMixin
+from .pipeline_kandinsky2_2_inpaint import RBLNKandinskyV22InpaintPipeline
+from .pipeline_kandinsky2_2_prior import RBLNKandinskyV22PriorPipeline
+
+
+class RBLNKandinskyV22InpaintCombinedPipeline(RBLNDiffusionMixin, KandinskyV22InpaintCombinedPipeline):
+    original_class = KandinskyV22InpaintCombinedPipeline
+    _connected_classes = {"prior_pipe": RBLNKandinskyV22PriorPipeline, "decoder_pipe": RBLNKandinskyV22InpaintPipeline}
+    _submodules = ["prior_pipe", "decoder_pipe"]
+    _prefix = {"prior_pipe": "prior_"}
+
+    def __init__(
+        self,
+        unet: "UNet2DConditionModel",
+        scheduler: "DDPMScheduler",
+        movq: "VQModel",
+        prior_prior: "PriorTransformer",
+        prior_image_encoder: "CLIPVisionModelWithProjection",
+        prior_text_encoder: "CLIPTextModelWithProjection",
+        prior_tokenizer: "CLIPTokenizer",
+        prior_scheduler: "UnCLIPScheduler",
+        prior_image_processor: "CLIPImageProcessor",
+    ):
+        RBLNDiffusionMixin.__init__(self)
+        super(KandinskyV22InpaintCombinedPipeline, self).__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+            prior_prior=prior_prior,
+            prior_image_encoder=prior_image_encoder,
+            prior_text_encoder=prior_text_encoder,
+            prior_tokenizer=prior_tokenizer,
+            prior_scheduler=prior_scheduler,
+            prior_image_processor=prior_image_processor,
+        )
+
+        self.prior_pipe = RBLNKandinskyV22PriorPipeline(
+            prior=prior_prior,
+            image_encoder=prior_image_encoder,
+            text_encoder=prior_text_encoder,
+            tokenizer=prior_tokenizer,
+            scheduler=prior_scheduler,
+            image_processor=prior_image_processor,
+        )
+        self.decoder_pipe = RBLNKandinskyV22InpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            movq=movq,
+        )
+
+    def get_compiled_image_size(self):
+        return self.movq.image_size

optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py
ADDED
@@ -0,0 +1,22 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from diffusers import KandinskyV22InpaintPipeline
+
+from ...modeling_diffusers import RBLNDiffusionMixin
+
+
+class RBLNKandinskyV22InpaintPipeline(RBLNDiffusionMixin, KandinskyV22InpaintPipeline):
+    original_class = KandinskyV22InpaintPipeline
+    _submodules = ["unet", "movq"]

optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
ADDED
@@ -0,0 +1,22 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from diffusers import KandinskyV22PriorPipeline
+
+from ...modeling_diffusers import RBLNDiffusionMixin
+
+
+class RBLNKandinskyV22PriorPipeline(RBLNDiffusionMixin, KandinskyV22PriorPipeline):
+    original_class = KandinskyV22PriorPipeline
+    _submodules = ["text_encoder", "image_encoder", "prior"]
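The three pipeline classes above only declare their RBLN submodules; compilation and inference go through the shared RBLNDiffusionMixin machinery. Below is a minimal usage sketch, assuming the Kandinsky pipelines follow the same from_pretrained(..., export=True) convention as the existing RBLN Stable Diffusion pipelines and are re-exported from optimum.rbln; the model id, image size, and rbln_* option names are illustrative assumptions, not taken from this diff.

```python
from PIL import Image

from optimum.rbln import RBLNKandinskyV22InpaintCombinedPipeline  # assumes a top-level re-export

# Compile the prior and decoder submodules for RBLN NPUs on first use (export=True),
# then persist the compiled pipeline so later runs can skip compilation.
pipe = RBLNKandinskyV22InpaintCombinedPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder-inpaint",  # illustrative model id
    export=True,
    rbln_img_height=512,  # assumed option names, mirroring the SD pipelines
    rbln_img_width=512,
)
pipe.save_pretrained("kandinsky-2-2-inpaint-rbln")

image = Image.open("photo.png").convert("RGB")
mask = Image.open("mask.png").convert("L")
result = pipe(prompt="a red brick wall", image=image, mask_image=mask).images[0]
result.save("inpainted.png")
```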
optimum/rbln/modeling_base.py
CHANGED
@@ -442,8 +442,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
 
-
-
+        # Normalize paths to handle relative paths and symlinks
+        real_save_dir = Path(self.model_save_dir).resolve() / self.subfolder
+        save_directory_path = Path(save_directory).resolve()
 
         if not os.path.exists(real_save_dir) or not os.path.isdir(real_save_dir):
             raise FileNotFoundError(
@@ -452,13 +453,13 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
                 f"Please ensure the model directory exists and you have the necessary permissions to access it."
             )
 
-        if save_directory_path
+        if save_directory_path == real_save_dir:
             raise FileExistsError(
                 f"Cannot save model to '{save_directory}'. This directory already exists and contains the model files."
             )
 
-        # Create a temporary directory
-        tmp_dir =
+        # Create a temporary directory with normalized path
+        tmp_dir = str(save_directory_path) + ".tmp"
         try:
             # Remove temporary directory if it exists from a previous failed attempt
             if os.path.exists(tmp_dir):
@@ -473,9 +474,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
                 self.generation_config.save_pretrained(tmp_dir)
 
             # If everything succeeded, atomically replace the target directory
-            if os.path.exists(
-                shutil.rmtree(
-            os.rename(tmp_dir,
+            if os.path.exists(save_directory_path):
+                shutil.rmtree(save_directory_path)
+            os.rename(tmp_dir, save_directory_path)
 
         except Exception as e:
             # Clean up the temporary directory if anything fails
@@ -484,7 +485,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
             raise e  # Re-raise the exception after cleanup
 
         if push_to_hub:
-            return super().push_to_hub(
+            return super().push_to_hub(str(save_directory_path), **kwargs)
 
     @staticmethod
     def _raise_missing_compiled_file_error(missing_files: List[str]):
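With the normalization above, save_pretrained resolves both paths before comparing them and stages everything in a sibling ".tmp" directory that is only renamed over the target once every artifact has been written, so a failed save cannot leave a half-written directory behind. A small sketch of the resulting flow; the model class and paths are illustrative.

```python
from optimum.rbln import RBLNBertModel  # any RBLNBaseModel subclass behaves the same way

# Load an already-compiled model from disk, then save a copy elsewhere (paths illustrative).
model = RBLNBertModel.from_pretrained("./bert-base-rbln", export=False)
model.save_pretrained("./bert-base-rbln-copy")
# Internally the copy is first written to "./bert-base-rbln-copy.tmp" and only renamed over
# "./bert-base-rbln-copy" after all files were saved; saving back onto the directory the
# model was loaded from still raises FileExistsError, now also through symlinks or "./" paths.
```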

optimum/rbln/transformers/__init__.py
CHANGED
@@ -40,6 +40,7 @@ _import_structure = {
         "RBLNCLIPTextModel",
         "RBLNCLIPTextModelWithProjection",
         "RBLNCLIPVisionModel",
+        "RBLNCLIPVisionModelWithProjection",
         "RBLNDPTForDepthEstimation",
         "RBLNExaoneForCausalLM",
         "RBLNGemmaForCausalLM",
@@ -99,6 +100,7 @@ if TYPE_CHECKING:
         RBLNCLIPTextModel,
         RBLNCLIPTextModelWithProjection,
         RBLNCLIPVisionModel,
+        RBLNCLIPVisionModelWithProjection,
         RBLNDPTForDepthEstimation,
         RBLNExaoneForCausalLM,
         RBLNGemmaForCausalLM,

optimum/rbln/transformers/models/__init__.py
CHANGED
@@ -34,7 +34,12 @@ _import_structure = {
     ],
     "bart": ["RBLNBartForConditionalGeneration", "RBLNBartModel"],
     "bert": ["RBLNBertModel", "RBLNBertForQuestionAnswering", "RBLNBertForMaskedLM"],
-    "clip": [
+    "clip": [
+        "RBLNCLIPTextModel",
+        "RBLNCLIPTextModelWithProjection",
+        "RBLNCLIPVisionModel",
+        "RBLNCLIPVisionModelWithProjection",
+    ],
     "dpt": ["RBLNDPTForDepthEstimation"],
     "exaone": ["RBLNExaoneForCausalLM"],
     "gemma": ["RBLNGemmaForCausalLM"],
@@ -68,7 +73,12 @@ if TYPE_CHECKING:
     )
     from .bart import RBLNBartForConditionalGeneration, RBLNBartModel
     from .bert import RBLNBertForMaskedLM, RBLNBertForQuestionAnswering, RBLNBertModel
-    from .clip import
+    from .clip import (
+        RBLNCLIPTextModel,
+        RBLNCLIPTextModelWithProjection,
+        RBLNCLIPVisionModel,
+        RBLNCLIPVisionModelWithProjection,
+    )
     from .dpt import RBLNDPTForDepthEstimation
     from .exaone import RBLNExaoneForCausalLM
     from .gemma import RBLNGemmaForCausalLM

optimum/rbln/transformers/models/clip/__init__.py
CHANGED
@@ -12,4 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .modeling_clip import
+from .modeling_clip import (
+    RBLNCLIPTextModel,
+    RBLNCLIPTextModelWithProjection,
+    RBLNCLIPVisionModel,
+    RBLNCLIPVisionModelWithProjection,
+)

optimum/rbln/transformers/models/clip/modeling_clip.py
CHANGED
@@ -22,7 +22,7 @@ from transformers import (
     CLIPVisionModel,
 )
 from transformers.modeling_outputs import BaseModelOutputWithPooling
-from transformers.models.clip.modeling_clip import CLIPTextModelOutput
+from transformers.models.clip.modeling_clip import CLIPTextModelOutput, CLIPVisionModelOutput
 
 from ....diffusers.modeling_diffusers import RBLNDiffusionMixin
 from ....modeling import RBLNModel
@@ -116,6 +116,10 @@ class RBLNCLIPVisionModel(RBLNModel):
     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNConfig) -> torch.nn.Module:
         return _VisionEncoder(model).eval()
 
+    @classmethod
+    def update_rbln_config_using_pipe(cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]) -> Dict[str, Any]:
+        return rbln_config
+
     @classmethod
     def _get_rbln_config(
         cls,
@@ -179,3 +183,24 @@ class RBLNCLIPVisionModel(RBLNModel):
             pooler_output=output[1],
             hidden_states=output[2:],
         )
+
+
+class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, CLIPVisionModelOutput]:
+        if len(kwargs) > 0 and any(kwargs.values()):
+            logger.warning(f"Currently, optimum-rbln does not support kwargs {kwargs.keys()} for {self.__class__}.")
+
+        output = super().forward(pixel_values)
+        image_embeds = output[0]
+        last_hidden_state = output[1]
+        hidden_states = output[2:]
+
+        return CLIPVisionModelOutput(
+            image_embeds=image_embeds,
+            last_hidden_state=last_hidden_state,
+            hidden_states=hidden_states,
+        )

optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py
CHANGED
@@ -427,12 +427,14 @@ class DecoderOnlyModel(nn.Module):
             cos, sin = None, None
 
         # (batch, seq_len) -> (batch,)
-        seq_positions = cache_position[:, 0]
         if self.attn_impl == "flash_attn":
+            seq_positions = cache_position[:, 0]
             max_seq_len = past_key_values[0][0].shape[-2]
             seq_positions = self.convert_sequence_positions_for_flash_attn(
                 seq_positions=seq_positions, max_seq_len=max_seq_len
             )
+        else:
+            seq_positions = cache_position[:, :1]
 
         present_key_values = past_key_values
         for layer in self.layers:
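The seq_positions change above keeps the two attention paths fed with different ranks: the flash-attention branch still gets a 1-D (batch,) tensor, while the default branch now receives a (batch, 1) slice. A small shape sketch with illustrative cache positions:

```python
import torch

cache_position = torch.tensor([[5], [9]])       # (batch=2, 1) during a decode step

flash_attn_positions = cache_position[:, 0]     # shape (2,)   -> flash_attn branch
default_positions = cache_position[:, :1]       # shape (2, 1) -> default branch

assert flash_attn_positions.shape == (2,)
assert default_positions.shape == (2, 1)
```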

optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py
CHANGED
@@ -459,7 +459,7 @@ class Seq2SeqSelfAttention(nn.Module):
             ),  # Unsqueeze group axis since CustomKernel expects it for group query attention
             past_key_value[0].view(bsz, self.num_heads, 1, -1, self.head_dim),
             past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
-            cache_position
+            cache_position,
             torch.tensor(1.0, dtype=torch.float32),  # scale
         )