optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +48 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +50 -21
- optimum/rbln/diffusers/__init__.py +12 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +17 -3
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/controlnet.py +17 -2
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +4 -0
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +20 -45
- optimum/rbln/modeling_base.py +18 -14
- optimum/rbln/ops/__init__.py +1 -0
- optimum/rbln/ops/attn.py +10 -0
- optimum/rbln/ops/flash_attn.py +8 -0
- optimum/rbln/ops/moe.py +180 -0
- optimum/rbln/ops/sliding_window_attn.py +9 -0
- optimum/rbln/transformers/__init__.py +36 -0
- optimum/rbln/transformers/configuration_generic.py +0 -27
- optimum/rbln/transformers/modeling_attention_utils.py +156 -127
- optimum/rbln/transformers/modeling_generic.py +2 -61
- optimum/rbln/transformers/modeling_outputs.py +26 -0
- optimum/rbln/transformers/modeling_rope_utils.py +78 -42
- optimum/rbln/transformers/models/__init__.py +28 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
- optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
- optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
- optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
- optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
- optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
- optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
- optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
- optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
- optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
- optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
- optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
- optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
- optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
- optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
- optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
- optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
- optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
- optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
- optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
- optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
- optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
- optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +14 -3
- optimum/rbln/utils/import_utils.py +23 -2
- optimum/rbln/utils/runtime_utils.py +42 -6
- optimum/rbln/utils/submodule.py +27 -1
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
- optimum/rbln/utils/depreacate_utils.py +0 -16
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py
@@ -0,0 +1,114 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from ....configuration_utils import RBLNModelConfig
+from ....transformers import RBLNCLIPVisionModelWithProjectionConfig
+from ..models import RBLNAutoencoderKLTemporalDecoderConfig, RBLNUNetSpatioTemporalConditionModelConfig
+
+
+class RBLNStableVideoDiffusionPipelineConfig(RBLNModelConfig):
+    submodules = ["image_encoder", "unet", "vae"]
+    _vae_uses_encoder = True
+
+    def __init__(
+        self,
+        image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
+        unet: Optional[RBLNUNetSpatioTemporalConditionModelConfig] = None,
+        vae: Optional[RBLNAutoencoderKLTemporalDecoderConfig] = None,
+        *,
+        batch_size: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: Optional[int] = None,
+        decode_chunk_size: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Configuration for the image encoder component.
+                Initialized as RBLNCLIPVisionModelWithProjectionConfig if not provided.
+            unet (Optional[RBLNUNetSpatioTemporalConditionModelConfig]): Configuration for the UNet model component.
+                Initialized as RBLNUNetSpatioTemporalConditionModelConfig if not provided.
+            vae (Optional[RBLNAutoencoderKLTemporalDecoderConfig]): Configuration for the VAE model component.
+                Initialized as RBLNAutoencoderKLTemporalDecoderConfig if not provided.
+            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+            height (Optional[int]): Height of the generated images.
+            width (Optional[int]): Width of the generated images.
+            num_frames (Optional[int]): The number of frames in the generated video.
+            decode_chunk_size (Optional[int]): The number of frames to decode at once during VAE decoding.
+                Useful for managing memory usage during video generation.
+            guidance_scale (Optional[float]): Scale for classifier-free guidance.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If both image_size and height/width are provided.
+
+        Note:
+            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
+            accommodate classifier-free guidance.
+        """
+        super().__init__(**kwargs)
+        if height is not None and width is not None:
+            image_size = (height, width)
+        else:
+            # Get default image size from original class to set UNet, VAE image size
+            height = self.get_default_values_for_original_cls("__call__", ["height"])["height"]
+            width = self.get_default_values_for_original_cls("__call__", ["width"])["width"]
+            image_size = (height, width)
+
+        self.image_encoder = self.initialize_submodule_config(
+            image_encoder, cls_name="RBLNCLIPVisionModelWithProjectionConfig", batch_size=batch_size
+        )
+        self.unet = self.initialize_submodule_config(
+            unet,
+            cls_name="RBLNUNetSpatioTemporalConditionModelConfig",
+            num_frames=num_frames,
+        )
+        self.vae = self.initialize_submodule_config(
+            vae,
+            cls_name="RBLNAutoencoderKLTemporalDecoderConfig",
+            batch_size=batch_size,
+            num_frames=num_frames,
+            decode_chunk_size=decode_chunk_size,
+            uses_encoder=self.__class__._vae_uses_encoder,
+            sample_size=image_size,  # image size is equal to sample size in vae
+        )
+
+        # Get default guidance scale from original class to set UNet batch size
+        if guidance_scale is None:
+            guidance_scale = self.get_default_values_for_original_cls("__call__", ["max_guidance_scale"])[
+                "max_guidance_scale"
+            ]
+
+        if not self.unet.batch_size_is_specified:
+            do_classifier_free_guidance = guidance_scale > 1.0
+            if do_classifier_free_guidance:
+                self.unet.batch_size = self.image_encoder.batch_size * 2
+            else:
+                self.unet.batch_size = self.image_encoder.batch_size
+
+    @property
+    def batch_size(self):
+        return self.vae.batch_size
+
+    @property
+    def sample_size(self):
+        return self.unet.sample_size
+
+    @property
+    def image_size(self):
+        return self.vae.sample_size
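For orientation, a minimal usage sketch of the new pipeline config follows. The import path assumes the class is re-exported at the package root like the other pipeline configs, and every literal value is an illustrative assumption rather than a default taken from this diff.

```python
# Hypothetical usage sketch; import path and all values are assumptions.
from optimum.rbln import RBLNStableVideoDiffusionPipelineConfig

config = RBLNStableVideoDiffusionPipelineConfig(
    batch_size=1,          # applied to the image encoder and VAE submodule configs
    height=576,            # omit height/width to fall back to the original __call__ defaults
    width=1024,
    num_frames=25,         # forwarded to the UNet and VAE submodule configs
    decode_chunk_size=5,   # frames decoded per VAE call
    guidance_scale=3.0,    # > 1.0: the constructor doubles the UNet batch size for CFG
)
# Per the constructor above, an unspecified UNet batch size becomes
# image_encoder.batch_size * 2 when classifier-free guidance is active.
print(config.unet.batch_size, config.batch_size, config.image_size)
```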
optimum/rbln/diffusers/modeling_diffusers.py
@@ -136,7 +136,7 @@ class RBLNDiffusionMixin:
         *,
         export: bool = None,
         model_save_dir: Optional[PathLike] = None,
-        rbln_config: Dict[str, Any] =
+        rbln_config: Optional[Dict[str, Any]] = None,
         lora_ids: Optional[Union[str, List[str]]] = None,
         lora_weights_names: Optional[Union[str, List[str]]] = None,
         lora_scales: Optional[Union[float, List[float]]] = None,
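The signature above appears to be from `RBLNDiffusionMixin.from_pretrained`, where `rbln_config` is now typed `Optional[Dict[str, Any]]` with a `None` default. A hedged sketch of a call site that exercises it follows; the pipeline class, model ID, and `rbln_config` keys are illustrative assumptions, not taken from this diff.

```python
# Hypothetical call sketch; class name, model ID, and rbln_config keys are assumptions.
from optimum.rbln import RBLNStableVideoDiffusionPipeline

pipe = RBLNStableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid",             # illustrative model ID
    export=True,                                              # compile from the PyTorch checkpoint
    rbln_config={"num_frames": 25, "decode_chunk_size": 5},   # omitting it now yields None
)
```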
optimum/rbln/diffusers/models/__init__.py
@@ -22,9 +22,11 @@ _import_structure = {
         "RBLNAutoencoderKL",
         "RBLNAutoencoderKLCosmos",
         "RBLNVQModel",
+        "RBLNAutoencoderKLTemporalDecoder",
     ],
     "unets": [
         "RBLNUNet2DConditionModel",
+        "RBLNUNetSpatioTemporalConditionModel",
     ],
     "controlnet": ["RBLNControlNetModel"],
     "transformers": [
@@ -35,10 +37,22 @@ _import_structure = {
 }
 
 if TYPE_CHECKING:
-    from .autoencoders import
+    from .autoencoders import (
+        RBLNAutoencoderKL,
+        RBLNAutoencoderKLCosmos,
+        RBLNAutoencoderKLTemporalDecoder,
+        RBLNVQModel,
+    )
     from .controlnet import RBLNControlNetModel
-    from .transformers import
-
+    from .transformers import (
+        RBLNCosmosTransformer3DModel,
+        RBLNPriorTransformer,
+        RBLNSD3Transformer2DModel,
+    )
+    from .unets import (
+        RBLNUNet2DConditionModel,
+        RBLNUNetSpatioTemporalConditionModel,
+    )
 else:
     import sys
 
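Both hunks above register the new classes in the HF-style lazy-import table of `optimum/rbln/diffusers/models/__init__.py`. For readers unfamiliar with the pattern, here is a generic sketch of how such an `__init__.py` resolves names lazily; the `_LazyModule` import path is an assumption based on other optimum packages, not a line from this diff.

```python
# Generic sketch of the lazy-import pattern (not copied from this package).
import sys
from typing import TYPE_CHECKING

from transformers.utils import _LazyModule  # assumed helper, as used across optimum packages

_import_structure = {"autoencoders": ["RBLNAutoencoderKLTemporalDecoder"]}

if TYPE_CHECKING:
    # Type checkers and IDEs resolve the real symbols eagerly...
    from .autoencoders import RBLNAutoencoderKLTemporalDecoder
else:
    # ...while at runtime the module is swapped for a proxy that imports
    # "autoencoders" only when one of its names is first accessed.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
```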
optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py
@@ -68,7 +68,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
         self.image_size = self.rbln_config.image_size
 
     @classmethod
-    def
+    def _wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNAutoencoderKLCosmosConfig
     ) -> torch.nn.Module:
         decoder_model = _VAECosmosDecoder(model)
@@ -98,7 +98,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
 
         compiled_models = {}
         if rbln_config.uses_encoder:
-            encoder_model, decoder_model = cls.
+            encoder_model, decoder_model = cls._wrap_model_if_needed(model, rbln_config)
             enc_compiled_model = cls.compile(
                 encoder_model,
                 rbln_compile_config=rbln_config.compile_cfgs[0],
@@ -107,7 +107,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
             )
             compiled_models["encoder"] = enc_compiled_model
         else:
-            decoder_model = cls.
+            decoder_model = cls._wrap_model_if_needed(model, rbln_config)
         dec_compiled_model = cls.compile(
             decoder_model,
             rbln_compile_config=rbln_config.compile_cfgs[-1],
optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -0,0 +1,275 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+
+import rebel
+import torch  # noqa: I001
+from diffusers import AutoencoderKLTemporalDecoder
+from diffusers.models.autoencoders.vae import DecoderOutput
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from transformers import PretrainedConfig
+
+from ....configuration_utils import RBLNCompileConfig
+from ....modeling import RBLNModel
+from ....utils.logging import get_logger
+from ...configurations import RBLNAutoencoderKLTemporalDecoderConfig
+from ...modeling_diffusers import RBLNDiffusionMixin
+from .vae import (
+    DiagonalGaussianDistribution,
+    RBLNRuntimeVAEDecoder,
+    RBLNRuntimeVAEEncoder,
+    _VAEEncoder,
+    _VAETemporalDecoder,
+)
+
+
+if TYPE_CHECKING:
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
+
+    from ...modeling_diffusers import RBLNDiffusionMixin, RBLNDiffusionMixinConfig
+
+logger = get_logger(__name__)
+
+
+class RBLNAutoencoderKLTemporalDecoder(RBLNModel):
+    auto_model_class = AutoencoderKLTemporalDecoder
+    hf_library_name = "diffusers"
+    _rbln_config_class = RBLNAutoencoderKLTemporalDecoderConfig
+
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
+
+        if self.rbln_config.uses_encoder:
+            self.encoder = RBLNRuntimeVAEEncoder(runtime=self.model[0], main_input_name="x")
+        self.decoder = RBLNRuntimeVAEDecoder(runtime=self.model[-1], main_input_name="z")
+        self.image_size = self.rbln_config.image_size
+
+    @classmethod
+    def _wrap_model_if_needed(
+        cls, model: torch.nn.Module, rbln_config: RBLNAutoencoderKLTemporalDecoderConfig
+    ) -> torch.nn.Module:
+        decoder_model = _VAETemporalDecoder(model)
+        decoder_model.num_frames = rbln_config.decode_chunk_size
+        decoder_model.eval()
+
+        if rbln_config.uses_encoder:
+            encoder_model = _VAEEncoder(model)
+            encoder_model.eval()
+            return encoder_model, decoder_model
+        else:
+            return decoder_model
+
+    @classmethod
+    def get_compiled_model(
+        cls, model, rbln_config: RBLNAutoencoderKLTemporalDecoderConfig
+    ) -> Dict[str, rebel.RBLNCompiledModel]:
+        compiled_models = {}
+        if rbln_config.uses_encoder:
+            encoder_model, decoder_model = cls._wrap_model_if_needed(model, rbln_config)
+            enc_compiled_model = cls.compile(
+                encoder_model,
+                rbln_compile_config=rbln_config.compile_cfgs[0],
+                create_runtimes=rbln_config.create_runtimes,
+                device=rbln_config.device_map["encoder"],
+            )
+            compiled_models["encoder"] = enc_compiled_model
+        else:
+            decoder_model = cls._wrap_model_if_needed(model, rbln_config)
+        dec_compiled_model = cls.compile(
+            decoder_model,
+            rbln_compile_config=rbln_config.compile_cfgs[-1],
+            create_runtimes=rbln_config.create_runtimes,
+            device=rbln_config.device_map["decoder"],
+        )
+        compiled_models["decoder"] = dec_compiled_model
+
+        return compiled_models
+
+    @classmethod
+    def get_vae_sample_size(
+        cls,
+        pipe: "RBLNDiffusionMixin",
+        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
+        return_vae_scale_factor: bool = False,
+    ) -> Tuple[int, int]:
+        sample_size = rbln_config.sample_size
+        if hasattr(pipe, "vae_scale_factor"):
+            vae_scale_factor = pipe.vae_scale_factor
+        else:
+            if hasattr(pipe.vae.config, "block_out_channels"):
+                vae_scale_factor = 2 ** (len(pipe.vae.config.block_out_channels) - 1)
+            else:
+                vae_scale_factor = 8  # vae image processor default value 8 (int)
+
+        if sample_size is None:
+            sample_size = pipe.unet.config.sample_size
+            if isinstance(sample_size, int):
+                sample_size = (sample_size, sample_size)
+            sample_size = (sample_size[0] * vae_scale_factor, sample_size[1] * vae_scale_factor)
+
+        if return_vae_scale_factor:
+            return sample_size, vae_scale_factor
+        else:
+            return sample_size
+
+    @classmethod
+    def update_rbln_config_using_pipe(
+        cls, pipe: "RBLNDiffusionMixin", rbln_config: "RBLNDiffusionMixinConfig", submodule_name: str
+    ) -> "RBLNDiffusionMixinConfig":
+        rbln_config.vae.sample_size, rbln_config.vae.vae_scale_factor = cls.get_vae_sample_size(
+            pipe, rbln_config.vae, return_vae_scale_factor=True
+        )
+
+        if rbln_config.vae.num_frames is None:
+            if hasattr(pipe.unet.config, "num_frames"):
+                rbln_config.vae.num_frames = pipe.unet.config.num_frames
+            else:
+                raise ValueError("num_frames should be specified in unet config.json")
+
+        if rbln_config.vae.decode_chunk_size is None:
+            rbln_config.vae.decode_chunk_size = rbln_config.vae.num_frames
+
+        def chunk_frame(num_frames, decode_chunk_size):
+            # get closest divisor to num_frames
+            divisors = [i for i in range(1, num_frames) if num_frames % i == 0]
+            closest = min(divisors, key=lambda x: abs(x - decode_chunk_size))
+            if decode_chunk_size != closest:
+                logger.warning(
+                    f"To ensure successful model compilation and prevent device OOM, {decode_chunk_size} is set to {closest}."
+                )
+            return closest
+
+        decode_chunk_size = chunk_frame(rbln_config.vae.num_frames, rbln_config.vae.decode_chunk_size)
+        rbln_config.vae.decode_chunk_size = decode_chunk_size
+        return rbln_config
+
+    @classmethod
+    def _update_rbln_config(
+        cls,
+        preprocessors: Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"],
+        model: "PreTrainedModel",
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
+    ) -> RBLNAutoencoderKLTemporalDecoderConfig:
+        if rbln_config.sample_size is None:
+            rbln_config.sample_size = model_config.sample_size
+
+        if rbln_config.vae_scale_factor is None:
+            if hasattr(model_config, "block_out_channels"):
+                rbln_config.vae_scale_factor = 2 ** (len(model_config.block_out_channels) - 1)
+            else:
+                # vae image processor default value 8 (int)
+                rbln_config.vae_scale_factor = 8
+
+        compile_cfgs = []
+        if rbln_config.uses_encoder:
+            vae_enc_input_info = [
+                (
+                    "x",
+                    [
+                        rbln_config.batch_size,
+                        model_config.in_channels,
+                        rbln_config.sample_size[0],
+                        rbln_config.sample_size[1],
+                    ],
+                    "float32",
+                )
+            ]
+            compile_cfgs.append(RBLNCompileConfig(compiled_model_name="encoder", input_info=vae_enc_input_info))
+
+        decode_batch_size = rbln_config.batch_size * rbln_config.decode_chunk_size
+        vae_dec_input_info = [
+            (
+                "z",
+                [
+                    decode_batch_size,
+                    model_config.latent_channels,
+                    rbln_config.latent_sample_size[0],
+                    rbln_config.latent_sample_size[1],
+                ],
+                "float32",
+            )
+        ]
+        compile_cfgs.append(RBLNCompileConfig(compiled_model_name="decoder", input_info=vae_dec_input_info))
+
+        rbln_config.set_compile_cfgs(compile_cfgs)
+        return rbln_config
+
+    @classmethod
+    def _create_runtimes(
+        cls,
+        compiled_models: List[rebel.RBLNCompiledModel],
+        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
+    ) -> List[rebel.Runtime]:
+        if len(compiled_models) == 1:
+            # decoder
+            expected_models = ["decoder"]
+        else:
+            expected_models = ["encoder", "decoder"]
+
+        if any(model_name not in rbln_config.device_map for model_name in expected_models):
+            cls._raise_missing_compiled_file_error(expected_models)
+
+        device_vals = [rbln_config.device_map[model_name] for model_name in expected_models]
+        return [
+            rebel.Runtime(
+                compiled_model,
+                tensor_type="pt",
+                device=device_val,
+                activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
+            )
+            for compiled_model, device_val in zip(compiled_models, device_vals)
+        ]
+
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode an input image into a latent representation.
+
+        Args:
+            x: The input image to encode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+
+        Returns:
+            The latent representation or AutoencoderKLOutput if return_dict=True
+        """
+        posterior = self.encoder.encode(x)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
+        """
+        Decode a latent representation into a video.
+
+        Args:
+            z: The latent representation to decode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+
+        Returns:
+            The decoded video or DecoderOutput if return_dict=True
+        """
+        decoded = self.decoder.decode(z)
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
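The `chunk_frame` helper in `update_rbln_config_using_pipe` above snaps the requested decode chunk to the nearest proper divisor of `num_frames`, so the compiled decoder always receives a fixed batch of latent frames. A standalone sketch of that arithmetic, with illustrative numbers:

```python
# Standalone sketch of the divisor-snapping logic used by chunk_frame above.
def closest_divisor(num_frames: int, decode_chunk_size: int) -> int:
    divisors = [i for i in range(1, num_frames) if num_frames % i == 0]
    return min(divisors, key=lambda d: abs(d - decode_chunk_size))

# With num_frames=25 the proper divisors are 1 and 5, so a requested chunk of 8
# is snapped to 5 and the decoder input batch becomes batch_size * 5.
assert closest_divisor(25, 8) == 5
assert closest_divisor(14, 7) == 7
```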
optimum/rbln/diffusers/models/autoencoders/vae.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Union
 
 import torch
 from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution, IdentityDistribution
@@ -21,7 +21,7 @@ from ....utils.runtime_utils import RBLNPytorchRuntime
 
 
 if TYPE_CHECKING:
-    from diffusers import AutoencoderKL, AutoencoderKLCosmos, VQModel
+    from diffusers import AutoencoderKL, AutoencoderKLCosmos, AutoencoderKLTemporalDecoder, VQModel
 
 
 class RBLNRuntimeVAEEncoder(RBLNPytorchRuntime):
@@ -67,18 +67,37 @@ class _VAEDecoder(torch.nn.Module):
         return vae_out
 
 
+class _VAETemporalDecoder(torch.nn.Module):
+    def __init__(self, vae: "AutoencoderKLTemporalDecoder"):
+        super().__init__()
+        self.vae = vae
+        self.num_frames = None
+
+    def forward(self, z):
+        vae_out = self.vae.decode(z, num_frames=self.num_frames, return_dict=False)
+        return vae_out
+
+
 class _VAEEncoder(torch.nn.Module):
-    def __init__(self, vae: "AutoencoderKL"):
+    def __init__(self, vae: Union["AutoencoderKL", "AutoencoderKLTemporalDecoder"]):
         super().__init__()
         self.vae = vae
 
     def encode(self, x: torch.FloatTensor, return_dict: bool = True):
-        if self
-
+        if hasattr(self, "use_tiling") and hasattr(self, "use_slicing"):
+            if self.use_tiling and (
+                x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size
+            ):
+                return self.tiled_encode(x, return_dict=return_dict)
+
+            if self.use_slicing and x.shape[0] > 1:
+                encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+                h = torch.cat(encoded_slices)
+            else:
+                h = self.encoder(x)
+            if self.quant_conv is not None:
+                h = self.quant_conv(h)
 
-        if self.use_slicing and x.shape[0] > 1:
-            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
-            h = torch.cat(encoded_slices)
         else:
             h = self.encoder(x)
         if self.quant_conv is not None:
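`_VAETemporalDecoder`, added above, pins `num_frames` on the wrapper instead of passing it per call, so the module handed to the compiler is purely tensor-in/tensor-out. A minimal illustration of that pattern with a stand-in module; the dummy VAE below is an assumption for demonstration, not RBLN code.

```python
import torch

# Minimal illustration of the wrap-for-compilation pattern: constant keyword
# arguments are stored on the wrapper so forward() only takes tensors.
class _DummyTemporalVAE(torch.nn.Module):
    def decode(self, z, num_frames=None, return_dict=False):
        batch = z.shape[0] // num_frames
        return (z.reshape(batch, num_frames, *z.shape[1:]),)

class _Wrapper(torch.nn.Module):
    def __init__(self, vae):
        super().__init__()
        self.vae = vae
        self.num_frames = None  # set from rbln_config.decode_chunk_size before tracing

    def forward(self, z):
        return self.vae.decode(z, num_frames=self.num_frames, return_dict=False)

wrapper = _Wrapper(_DummyTemporalVAE()).eval()
wrapper.num_frames = 5
frames = wrapper(torch.randn(5, 4, 72, 128))  # batch_size 1 * decode_chunk_size 5 latents
```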
optimum/rbln/diffusers/models/controlnet.py
@@ -118,7 +118,7 @@ class RBLNControlNetModel(RBLNModel):
         )
 
     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         use_encoder_hidden_states = False
         for down_block in model.down_blocks:
             if use_encoder_hidden_states := getattr(down_block, "has_cross_attention", False):
@@ -215,10 +215,25 @@ class RBLNControlNetModel(RBLNModel):
         encoder_hidden_states: torch.Tensor,
         controlnet_cond: torch.FloatTensor,
         conditioning_scale: torch.Tensor = 1.0,
-        added_cond_kwargs: Dict[str, torch.Tensor] =
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
         return_dict: bool = True,
         **kwargs,
     ):
+        """
+        Forward pass for the RBLN-optimized ControlNetModel.
+
+        Args:
+            sample (torch.FloatTensor): The noisy input tensor.
+            timestep (Union[torch.Tensor, float, int]): The number of timesteps to denoise an input.
+            encoder_hidden_states (torch.Tensor): The encoder hidden states.
+            controlnet_cond (torch.FloatTensor): The conditional input tensor of shape `(batch_size, max_seq_len, hidden_size)`.
+            conditioning_scale (torch.Tensor): The scale factor for ControlNet outputs.
+            added_cond_kwargs (Dict[str, torch.Tensor]): Additional conditions for the Stable Diffusion XL UNet.
+            return_dict (bool): Whether or not to return a [`~diffusers.models.controlnets.controlnet.ControlNetOutput`] instead of a plain tuple
+
+        Returns:
+            (Union[`~diffusers.models.controlnets.controlnet.ControlNetOutput`], Tuple)
+        """
         sample_batch_size = sample.size()[0]
         compiled_batch_size = self.compiled_batch_size
         if sample_batch_size != compiled_batch_size and (
optimum/rbln/diffusers/models/transformers/prior_transformer.py
@@ -77,7 +77,7 @@ class RBLNPriorTransformer(RBLNModel):
         self.clip_std = artifacts["clip_std"]
 
     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         return _PriorTransformer(model).eval()
 
     @classmethod
@@ -128,13 +128,27 @@ class RBLNPriorTransformer(RBLNModel):
 
     def forward(
         self,
-        hidden_states,
+        hidden_states: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         proj_embedding: torch.Tensor,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
+        """
+        Forward pass for the RBLN-optimized PriorTransformer.
+
+        Args:
+            hidden_states (torch.Tensor): The currently predicted image embeddings.
+            timestep (Union[torch.Tensor, float, int]): Current denoising step.
+            proj_embedding (torch.Tensor): Projected embedding vector the denoising process is conditioned on.
+            encoder_hidden_states (Optional[torch.Tensor]): Hidden states of the text embeddings the denoising process is conditioned on.
+            attention_mask (Optional[torch.Tensor]): Text mask for the text embeddings.
+            return_dict (bool): Whether or not to return a [`~diffusers.models.transformers.prior_transformer.PriorTransformerOutput`] instead of a plain tuple.
+
+        Returns:
+            (Union[`~diffusers.models.transformers.prior_transformer.PriorTransformerOutput`, Tuple])
+        """
         # Convert timestep(long) and attention_mask(bool) to float
         return super().forward(
             hidden_states,
optimum/rbln/diffusers/models/transformers/transformer_cosmos.py
@@ -185,7 +185,7 @@ class RBLNCosmosTransformer3DModel(RBLNModel):
         )
 
     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         num_latent_frames = rbln_config.num_latent_frames
         latent_height = rbln_config.latent_height
         latent_width = rbln_config.latent_width
@@ -303,6 +303,21 @@ class RBLNCosmosTransformer3DModel(RBLNModel):
         padding_mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
+        """
+        Forward pass for the RBLN-optimized CosmosTransformer3DModel.
+
+        Args:
+            hidden_states (torch.Tensor): The currently predicted image embeddings.
+            timestep (torch.Tensor): Current denoising step.
+            encoder_hidden_states (torch.Tensor): Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            fps: (Optional[int]): Frames per second for the video being generated.
+            condition_mask (Optional[torch.Tensor]): Tensor of condition mask.
+            padding_mask (Optional[torch.Tensor]): Tensor of padding mask.
+            return_dict (bool): Whether or not to return a [`~diffusers.models.modeling_output.Transformer2DModelOutput`] instead of a plain tuple.
+
+        Returns:
+            (Union[`~diffusers.models.modeling_output.Transformer2DModelOutput`, Tuple])
+        """
         (
             hidden_states,
             temb,
optimum/rbln/diffusers/models/transformers/transformer_sd3.py
@@ -77,7 +77,7 @@ class RBLNSD3Transformer2DModel(RBLNModel):
         super().__post_init__(**kwargs)
 
     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         return SD3Transformer2DModelWrapper(model).eval()
 
     @classmethod
@@ -161,6 +161,19 @@ class RBLNSD3Transformer2DModel(RBLNModel):
         return_dict: bool = True,
         **kwargs,
     ):
+        """
+        Forward pass for the RBLN-optimized SD3Transformer2DModel.
+
+        Args:
+            hidden_states (torch.FloatTensor): The currently predicted image embeddings.
+            encoder_hidden_states (torch.FloatTensor): Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            pooled_projections (torch.FloatTensor): Embeddings projected from the embeddings of input conditions.
+            timestep (torch.LongTensor): Current denoising step.
+            return_dict (bool): Whether or not to return a [`~diffusers.models.modeling_output.Transformer2DModelOutput`] instead of a plain tuple.
+
+        Returns:
+            (Union[`~diffusers.models.modeling_output.Transformer2DModelOutput`, Tuple])
+        """
         sample_batch_size = hidden_states.size()[0]
         compiled_batch_size = self.compiled_batch_size
         if sample_batch_size != compiled_batch_size and (