optimum-rbln 0.8.2a0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +116 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +171 -43
- optimum/rbln/diffusers/__init__.py +19 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +12 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +33 -18
- optimum/rbln/diffusers/models/__init__.py +4 -0
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +32 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +32 -6
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +32 -3
- optimum/rbln/diffusers/models/controlnet.py +16 -1
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +17 -3
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +26 -3
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +23 -2
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +23 -4
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +15 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +23 -12
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +16 -46
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +50 -24
- optimum/rbln/modeling_base.py +116 -35
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +100 -0
- optimum/rbln/transformers/configuration_generic.py +7 -32
- optimum/rbln/transformers/modeling_attention_utils.py +385 -0
- optimum/rbln/transformers/modeling_generic.py +48 -65
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +93 -30
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
- optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +93 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +135 -44
- optimum/rbln/transformers/models/clip/configuration_clip.py +21 -7
- optimum/rbln/transformers/models/clip/modeling_clip.py +183 -27
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +82 -104
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +114 -37
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -316
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +508 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +119 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +486 -892
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +42 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -14
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +212 -504
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1048 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +29 -32
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +490 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +21 -6
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +234 -376
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +29 -17
- optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +322 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -22
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +513 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +21 -16
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +60 -13
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -16
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +354 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +22 -16
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +61 -8
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
- optimum/rbln/transformers/models/whisper/generation_whisper.py +62 -6
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +32 -5
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +43 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +400 -75
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +22 -50
- optimum/rbln/utils/runtime_utils.py +85 -17
- optimum/rbln/utils/submodule.py +31 -9
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/METADATA +8 -7
- optimum_rbln-0.9.3.dist-info/RECORD +264 -0
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/WHEEL +1 -1
- optimum_rbln-0.9.3.dist-info/entry_points.txt +2 -0
- optimum_rbln-0.8.2a0.dist-info/RECORD +0 -211
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/licenses/LICENSE +0 -0
--- a/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional, Tuple

 from ....configuration_utils import RBLNModelConfig
 from ....transformers import RBLNCLIPTextModelWithProjectionConfig, RBLNT5EncoderModelConfig
@@ -40,7 +40,7 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):
         height: Optional[int] = None,
         width: Optional[int] = None,
         guidance_scale: Optional[float] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
@@ -64,7 +64,7 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):
             height (Optional[int]): Height of the generated images.
             width (Optional[int]): Width of the generated images.
             guidance_scale (Optional[float]): Scale for classifier-free guidance.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
             ValueError: If both image_size and img_height/img_width are provided.
@@ -100,27 +100,31 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):

         max_seq_len = max_seq_len or 256

-        self.text_encoder = self.
-
+        self.text_encoder = self.initialize_submodule_config(
+            text_encoder,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-        self.text_encoder_2 = self.
-
+        self.text_encoder_2 = self.initialize_submodule_config(
+            text_encoder_2,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-        self.text_encoder_3 = self.
-            RBLNT5EncoderModelConfig,
+        self.text_encoder_3 = self.initialize_submodule_config(
             text_encoder_3,
+            cls_name="RBLNT5EncoderModelConfig",
             batch_size=batch_size,
             max_seq_len=max_seq_len,
             model_input_names=["input_ids"],
         )
-        self.transformer = self.
-            RBLNSD3Transformer2DModelConfig,
+        self.transformer = self.initialize_submodule_config(
             transformer,
+            cls_name="RBLNSD3Transformer2DModelConfig",
             sample_size=sample_size,
         )
-        self.vae = self.
-            RBLNAutoencoderKLConfig,
+        self.vae = self.initialize_submodule_config(
             vae,
+            cls_name="RBLNAutoencoderKLConfig",
             batch_size=batch_size,
             uses_encoder=self.__class__._vae_uses_encoder,
             sample_size=image_size,
--- a/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional, Tuple

 from ....configuration_utils import RBLNModelConfig
 from ....transformers import RBLNCLIPTextModelConfig, RBLNCLIPTextModelWithProjectionConfig
@@ -38,7 +38,7 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
         sample_size: Optional[Tuple[int, int]] = None,
         image_size: Optional[Tuple[int, int]] = None,
         guidance_scale: Optional[float] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
@@ -59,7 +59,7 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
             image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
                 Cannot be used together with img_height/img_width.
             guidance_scale (Optional[float]): Scale for classifier-free guidance.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
             ValueError: If both image_size and img_height/img_width are provided.
@@ -93,18 +93,25 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
         elif (img_height is not None and img_width is None) or (img_height is None and img_width is not None):
             raise ValueError("Both img_height and img_width must be provided together if used")

-        self.text_encoder = self.
-
-
+        self.text_encoder = self.initialize_submodule_config(
+            text_encoder,
+            cls_name="RBLNCLIPTextModelConfig",
+            batch_size=batch_size,
+        )
+        self.text_encoder_2 = self.initialize_submodule_config(
+            text_encoder_2,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-
-
+
+        self.unet = self.initialize_submodule_config(
             unet,
+            cls_name="RBLNUNet2DConditionModelConfig",
             sample_size=sample_size,
         )
-        self.vae = self.
-            RBLNAutoencoderKLConfig,
+        self.vae = self.initialize_submodule_config(
             vae,
+            cls_name="RBLNAutoencoderKLConfig",
             batch_size=batch_size,
             uses_encoder=self.__class__._vae_uses_encoder,
             sample_size=image_size,  # image size is equal to sample size in vae
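Both pipeline configs above now resolve every submodule through `initialize_submodule_config`, naming the target config class with a `cls_name` string instead of passing the class object. From the caller's side the pipeline config still fans shared arguments out to its submodules. A minimal construction sketch, assuming a concrete subclass `RBLNStableDiffusionXLPipelineConfig` and its import path (neither appears in this diff):

```python
# Hedged sketch: the class name and import path are assumptions; only the
# keyword arguments come from the Base config shown in the diff above.
from optimum.rbln import RBLNStableDiffusionXLPipelineConfig

config = RBLNStableDiffusionXLPipelineConfig(
    batch_size=1,        # fans out to the text encoders, UNet, and VAE
    img_height=1024,     # img_height and img_width must be given together
    img_width=1024,
    guidance_scale=7.5,  # used when sizing for classifier-free guidance
)
```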
--- /dev/null
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py
@@ -0,0 +1,114 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from ....configuration_utils import RBLNModelConfig
+from ....transformers import RBLNCLIPVisionModelWithProjectionConfig
+from ..models import RBLNAutoencoderKLTemporalDecoderConfig, RBLNUNetSpatioTemporalConditionModelConfig
+
+
+class RBLNStableVideoDiffusionPipelineConfig(RBLNModelConfig):
+    submodules = ["image_encoder", "unet", "vae"]
+    _vae_uses_encoder = True
+
+    def __init__(
+        self,
+        image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
+        unet: Optional[RBLNUNetSpatioTemporalConditionModelConfig] = None,
+        vae: Optional[RBLNAutoencoderKLTemporalDecoderConfig] = None,
+        *,
+        batch_size: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: Optional[int] = None,
+        decode_chunk_size: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Configuration for the image encoder component.
+                Initialized as RBLNCLIPVisionModelWithProjectionConfig if not provided.
+            unet (Optional[RBLNUNetSpatioTemporalConditionModelConfig]): Configuration for the UNet model component.
+                Initialized as RBLNUNetSpatioTemporalConditionModelConfig if not provided.
+            vae (Optional[RBLNAutoencoderKLTemporalDecoderConfig]): Configuration for the VAE model component.
+                Initialized as RBLNAutoencoderKLTemporalDecoderConfig if not provided.
+            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+            height (Optional[int]): Height of the generated images.
+            width (Optional[int]): Width of the generated images.
+            num_frames (Optional[int]): The number of frames in the generated video.
+            decode_chunk_size (Optional[int]): The number of frames to decode at once during VAE decoding.
+                Useful for managing memory usage during video generation.
+            guidance_scale (Optional[float]): Scale for classifier-free guidance.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If both image_size and height/width are provided.
+
+        Note:
+            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
+            accommodate classifier-free guidance.
+        """
+        super().__init__(**kwargs)
+        if height is not None and width is not None:
+            image_size = (height, width)
+        else:
+            # Get default image size from original class to set UNet, VAE image size
+            height = self.get_default_values_for_original_cls("__call__", ["height"])["height"]
+            width = self.get_default_values_for_original_cls("__call__", ["width"])["width"]
+            image_size = (height, width)
+
+        self.image_encoder = self.initialize_submodule_config(
+            image_encoder, cls_name="RBLNCLIPVisionModelWithProjectionConfig", batch_size=batch_size
+        )
+        self.unet = self.initialize_submodule_config(
+            unet,
+            cls_name="RBLNUNetSpatioTemporalConditionModelConfig",
+            num_frames=num_frames,
+        )
+        self.vae = self.initialize_submodule_config(
+            vae,
+            cls_name="RBLNAutoencoderKLTemporalDecoderConfig",
+            batch_size=batch_size,
+            num_frames=num_frames,
+            decode_chunk_size=decode_chunk_size,
+            uses_encoder=self.__class__._vae_uses_encoder,
+            sample_size=image_size,  # image size is equal to sample size in vae
+        )
+
+        # Get default guidance scale from original class to set UNet batch size
+        if guidance_scale is None:
+            guidance_scale = self.get_default_values_for_original_cls("__call__", ["max_guidance_scale"])[
+                "max_guidance_scale"
+            ]
+
+        if not self.unet.batch_size_is_specified:
+            do_classifier_free_guidance = guidance_scale > 1.0
+            if do_classifier_free_guidance:
+                self.unet.batch_size = self.image_encoder.batch_size * 2
+            else:
+                self.unet.batch_size = self.image_encoder.batch_size
+
+    @property
+    def batch_size(self):
+        return self.vae.batch_size
+
+    @property
+    def sample_size(self):
+        return self.unet.sample_size
+
+    @property
+    def image_size(self):
+        return self.vae.sample_size
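When `height`/`width` or `max_guidance_scale` are omitted, this new config pulls defaults from the original pipeline's `__call__` signature, and it doubles the UNet batch size whenever classifier-free guidance is active. A minimal construction sketch; the import path is an assumption, while the keyword arguments come from the `__init__` above:

```python
# Hedged sketch: import path assumed; explicit height/width skip the
# defaults lookup on the original diffusers pipeline class.
from optimum.rbln import RBLNStableVideoDiffusionPipelineConfig

config = RBLNStableVideoDiffusionPipelineConfig(
    batch_size=1,
    height=576,
    width=1024,
    num_frames=25,
    decode_chunk_size=8,  # frames decoded per VAE call, bounds peak memory
)
# With the default max_guidance_scale > 1.0, unet.batch_size becomes
# image_encoder.batch_size * 2 unless a UNet batch size was set explicitly.
```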
--- a/optimum/rbln/diffusers/modeling_diffusers.py
+++ b/optimum/rbln/diffusers/modeling_diffusers.py
@@ -33,6 +33,10 @@ if TYPE_CHECKING:


 class RBLNDiffusionMixinConfig(RBLNModelConfig):
+    """
+    Configuration class for RBLN diffusion pipelines.
+    """
+
     pass


@@ -54,8 +58,8 @@ class RBLNDiffusionMixin:
     ```

     Class Variables:
-        _submodules
-        _optional_submodules
+        - `_submodules`: List of submodule names that should be compiled (typically ["text_encoder", "unet", "vae"])
+        - `_optional_submodules`: List of submodule names compiled without inheriting RBLNModel (typically ["safety_checker"])

     Methods:
         from_pretrained: Creates and optionally compiles a model from a pretrained checkpoint
@@ -70,8 +74,6 @@ class RBLNDiffusionMixin:
     _submodules = []
     _optional_submodules = []
     _prefix = {}
-    _rbln_config_class = None
-    _hf_class = None

     @staticmethod
     def _maybe_apply_and_fuse_lora(
@@ -114,14 +116,14 @@ class RBLNDiffusionMixin:
     @classmethod
     def get_rbln_config_class(cls) -> Type[RBLNModelConfig]:
         # Lazily loads and caches the corresponding RBLN model config class.
-        if cls._rbln_config_class is None:
+        if "_rbln_config_class" not in cls.__dict__ or cls._rbln_config_class is None:
             rbln_config_class_name = cls.__name__ + "Config"
             cls._rbln_config_class = get_rbln_config_class(rbln_config_class_name)
         return cls._rbln_config_class

     @classmethod
     def get_hf_class(cls):
-        if cls._hf_class is None:
+        if "_hf_class" not in cls.__dict__ or cls._hf_class is None:
             hf_cls_name = cls.__name__[4:]
             library = importlib.import_module("diffusers")
             cls._hf_class = getattr(library, hf_cls_name, None)
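The switch from `if cls._hf_class is None:` to the `cls.__dict__` membership test makes the lazy cache per-class: plain attribute lookup follows the MRO, so a subclass would otherwise pick up the value cached on its parent instead of resolving its own counterpart. It also explains why the class-level `_rbln_config_class = None` and `_hf_class = None` defaults could be deleted above. A self-contained sketch of the pitfall, with illustrative names rather than the library's:

```python
# Illustrative classes only; the string concatenation stands in for the real
# resolution done by get_rbln_config_class / get_hf_class in the diff above.
class Base:
    _cached = None

    @classmethod
    def get(cls):
        # Checking cls.__dict__ keeps the cache per-class; `cls._cached is None`
        # alone would find Base's cached value through inheritance and skip this.
        if "_cached" not in cls.__dict__ or cls._cached is None:
            cls._cached = cls.__name__ + "Resolved"
        return cls._cached


class Child(Base):
    pass


print(Base.get())   # BaseResolved
print(Child.get())  # ChildResolved (not the inherited BaseResolved)
```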
@@ -132,20 +134,20 @@ class RBLNDiffusionMixin:
         cls,
         model_id: str,
         *,
-        export: bool =
+        export: bool = None,
         model_save_dir: Optional[PathLike] = None,
         rbln_config: Dict[str, Any] = {},
         lora_ids: Optional[Union[str, List[str]]] = None,
         lora_weights_names: Optional[Union[str, List[str]]] = None,
         lora_scales: Optional[Union[float, List[float]]] = None,
-        **kwargs:
+        **kwargs: Any,
     ) -> "RBLNDiffusionMixin":
         """
         Load a pretrained diffusion pipeline from a model checkpoint, with optional compilation for RBLN NPUs.

         This method has two distinct operating modes:
-
-
+        - When `export=True`: Takes a PyTorch-based diffusion model, compiles it for RBLN NPUs, and loads the compiled model
+        - When `export=False`: Loads an already compiled RBLN model from `model_id` without recompilation

         It supports various diffusion pipelines including Stable Diffusion, Kandinsky, ControlNet, and other diffusers-based models.

@@ -172,7 +174,7 @@ class RBLNDiffusionMixin:
                 Names of specific LoRA weight files to load, corresponding to lora_ids. Only used when `export=True`.
             lora_scales:
                 Scaling factor(s) to apply to the LoRA adapter(s). Only used when `export=True`.
-
+            kwargs:
                 Additional arguments to pass to the underlying diffusion pipeline constructor or the
                 RBLN compilation process. These may include parameters specific to individual submodules
                 or the particular diffusion pipeline being used.
@@ -183,6 +185,20 @@ class RBLNDiffusionMixin:
         """
         rbln_config, kwargs = cls.get_rbln_config_class().initialize_from_kwargs(rbln_config, **kwargs)

+        if export is None:
+            export = any(
+                not RBLNModel._is_compiled(
+                    model_id,
+                    token=kwargs.get("token"),
+                    revision=kwargs.get("revision"),
+                    force_download=kwargs.get("force_download", False),
+                    cache_dir=kwargs.get("cache_dir"),
+                    subfolder=submodule_name,
+                    local_files_only=kwargs.get("local_files_only", False),
+                )
+                for submodule_name in cls._submodules
+            )
+
         if export:
             # keep submodules if user passed any of them.
             passed_submodules = {
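With `export` now defaulting to `None`, `from_pretrained` infers the mode itself: if any submodule listed in `cls._submodules` lacks compiled artifacts under `model_id`, it compiles (the `export=True` path); otherwise it loads the existing RBLN binaries. A hedged usage sketch; the concrete pipeline class and checkpoint paths are illustrative:

```python
# Hedged sketch: RBLNStableDiffusionPipeline is assumed to mix in
# RBLNDiffusionMixin; export is omitted, so the mode is auto-detected.
from optimum.rbln import RBLNStableDiffusionPipeline

# Plain diffusers checkpoint: no compiled submodules found -> compiles.
pipe = RBLNStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Directory produced by a previous export: artifacts found -> loads as-is.
pipe = RBLNStableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5-rbln")
```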
@@ -228,8 +244,8 @@
             device=rbln_config.device,
             device_map=rbln_config.device_map,
             create_runtimes=rbln_config.create_runtimes,
-            optimize_host_mem=rbln_config.optimize_host_memory,
             activate_profiler=rbln_config.activate_profiler,
+            timeout=rbln_config.timeout,
         ):
             model = super().from_pretrained(pretrained_model_name_or_path=model_id, **kwargs)

@@ -395,12 +411,11 @@ class RBLNDiffusionMixin:
         # overwrite to replace incorrect config
         model.save_config(model_save_dir)

-
-
-
-
-
-        model.compiled_models.extend(submodule.compiled_models)
+        # Keep compiled_model objs to further analysis. -> TODO: remove soon...
+        model.compiled_models = []
+        for name in cls._submodules:
+            submodule = getattr(model, name)
+            model.compiled_models.extend(submodule.compiled_models)

         return model

--- a/optimum/rbln/diffusers/models/__init__.py
+++ b/optimum/rbln/diffusers/models/__init__.py
@@ -22,9 +22,11 @@ _import_structure = {
         "RBLNAutoencoderKL",
         "RBLNAutoencoderKLCosmos",
         "RBLNVQModel",
+        "RBLNAutoencoderKLTemporalDecoder",
     ],
     "unets": [
         "RBLNUNet2DConditionModel",
+        "RBLNUNetSpatioTemporalConditionModel",
     ],
     "controlnet": ["RBLNControlNetModel"],
     "transformers": [
@@ -38,6 +40,7 @@ if TYPE_CHECKING:
     from .autoencoders import (
         RBLNAutoencoderKL,
         RBLNAutoencoderKLCosmos,
+        RBLNAutoencoderKLTemporalDecoder,
         RBLNVQModel,
     )
     from .controlnet import RBLNControlNetModel
@@ -48,6 +51,7 @@ if TYPE_CHECKING:
     )
     from .unets import (
         RBLNUNet2DConditionModel,
+        RBLNUNetSpatioTemporalConditionModel,
     )
 else:
     import sys
--- a/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

 import rebel
 import torch
@@ -209,17 +209,46 @@ class RBLNAutoencoderKL(RBLNModel):
                 tensor_type="pt",
                 device=device_val,
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             )
             for compiled_model, device_val in zip(compiled_models, device_vals)
         ]

-    def encode(
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, AutoencoderKLOutput]:
+        """
+        Encode an input image into a latent representation.
+
+        Args:
+            x: The input image to encode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the encoder.
+
+        Returns:
+            The latent representation or AutoencoderKLOutput if return_dict=True
+        """
         posterior = self.encoder.encode(x)
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)

-    def decode(
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, DecoderOutput]:
+        """
+        Decode a latent representation into an image.
+
+        Args:
+            z: The latent representation to decode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the decoder.
+
+        Returns:
+            The decoded image or DecoderOutput if return_dict=True
+        """
         dec = self.decoder.decode(z)
         if not return_dict:
             return (dec,)
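The newly documented `encode`/`decode` keep the diffusers `AutoencoderKL` calling convention: with `return_dict=True` they return `AutoencoderKLOutput` and `DecoderOutput` respectively. A small round-trip sketch; `vae` is assumed to be an already loaded `RBLNAutoencoderKL` instance and the tensor shape is illustrative:

```python
import torch

# Hedged sketch: `vae` (an RBLNAutoencoderKL) is assumed to exist already.
x = torch.randn(1, 3, 512, 512)        # illustrative image batch
posterior = vae.encode(x).latent_dist  # AutoencoderKLOutput.latent_dist
z = posterior.sample()                 # sample from the diagonal Gaussian
image = vae.decode(z).sample           # DecoderOutput.sample
```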
--- a/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py
+++ b/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import TYPE_CHECKING, Dict, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union

 import rebel
 import torch
@@ -68,7 +68,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
         self.image_size = self.rbln_config.image_size

     @classmethod
-    def
+    def _wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNAutoencoderKLCosmosConfig
     ) -> torch.nn.Module:
         decoder_model = _VAECosmosDecoder(model)
@@ -98,7 +98,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):

         compiled_models = {}
         if rbln_config.uses_encoder:
-            encoder_model, decoder_model = cls.
+            encoder_model, decoder_model = cls._wrap_model_if_needed(model, rbln_config)
             enc_compiled_model = cls.compile(
                 encoder_model,
                 rbln_compile_config=rbln_config.compile_cfgs[0],
@@ -107,7 +107,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
             )
             compiled_models["encoder"] = enc_compiled_model
         else:
-            decoder_model = cls.
+            decoder_model = cls._wrap_model_if_needed(model, rbln_config)
             dec_compiled_model = cls.compile(
                 decoder_model,
                 rbln_compile_config=rbln_config.compile_cfgs[-1],
@@ -200,17 +200,43 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
                 tensor_type="pt",
                 device=device_val,
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             )
             for compiled_model, device_val in zip(compiled_models, device_vals)
         ]

-    def encode(
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, AutoencoderKLOutput]:
+        """
+        Encode an input video into a latent representation.
+
+        Args:
+            x: The input video to encode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the encoder.
+
+        Returns:
+            The latent representation or AutoencoderKLOutput if return_dict=True
+        """
         posterior = self.encoder.encode(x)
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)

-    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
+    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[torch.FloatTensor, DecoderOutput]:
+        """
+        Decode a latent representation into a video.
+
+        Args:
+            z: The latent representation to decode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+
+        Returns:
+            The decoded video or DecoderOutput if return_dict=True
+        """
         decoded = self.decoder.decode(z)

         if not return_dict: