diffusers 0.29.2__py3-none-any.whl → 0.30.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2252 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +3 -14
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +293 -8
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1937 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +403 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +50 -6
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +37 -15
- diffusers/utils/loading_utils.py +80 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,7 @@ import torch
|
|
22
22
|
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
23
23
|
|
24
24
|
from ...image_processor import PipelineImageInput
|
25
|
-
from ...loaders import FromSingleFileMixin, IPAdapterMixin,
|
25
|
+
from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
|
26
26
|
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
|
27
27
|
from ...models.lora import adjust_lora_scale_text_encoder
|
28
28
|
from ...models.unets.unet_motion_model import MotionAdapter
|
@@ -54,22 +54,21 @@ EXAMPLE_DOC_STRING = """
|
|
54
54
|
Examples:
|
55
55
|
```py
|
56
56
|
>>> import torch
|
57
|
-
>>> from diffusers import
|
58
|
-
... EulerDiscreteScheduler,
|
59
|
-
... MotionAdapter,
|
60
|
-
... PIAPipeline,
|
61
|
-
... )
|
57
|
+
>>> from diffusers import EulerDiscreteScheduler, MotionAdapter, PIAPipeline
|
62
58
|
>>> from diffusers.utils import export_to_gif, load_image
|
63
59
|
|
64
|
-
>>> adapter = MotionAdapter.from_pretrained("
|
65
|
-
>>> pipe = PIAPipeline.from_pretrained(
|
60
|
+
>>> adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
|
61
|
+
>>> pipe = PIAPipeline.from_pretrained(
|
62
|
+
... "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
|
63
|
+
... )
|
64
|
+
|
66
65
|
>>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
|
67
66
|
>>> image = load_image(
|
68
67
|
... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
|
69
68
|
... )
|
70
69
|
>>> image = image.resize((512, 512))
|
71
70
|
>>> prompt = "cat in a hat"
|
72
|
-
>>> negative_prompt = "wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted
|
71
|
+
>>> negative_prompt = "wrong white balance, dark, sketches, worst quality, low quality, deformed, distorted"
|
73
72
|
>>> generator = torch.Generator("cpu").manual_seed(0)
|
74
73
|
>>> output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
|
75
74
|
>>> frames = output.frames[0]
|
@@ -128,7 +127,7 @@ class PIAPipeline(
|
|
128
127
|
StableDiffusionMixin,
|
129
128
|
TextualInversionLoaderMixin,
|
130
129
|
IPAdapterMixin,
|
131
|
-
|
130
|
+
StableDiffusionLoraLoaderMixin,
|
132
131
|
FromSingleFileMixin,
|
133
132
|
FreeInitMixin,
|
134
133
|
):
|
@@ -140,8 +139,8 @@ class PIAPipeline(
|
|
140
139
|
|
141
140
|
The pipeline also inherits the following loading methods:
|
142
141
|
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
|
143
|
-
- [`~loaders.
|
144
|
-
- [`~loaders.
|
142
|
+
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
|
143
|
+
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
|
145
144
|
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
|
146
145
|
|
147
146
|
Args:
|
@@ -243,7 +242,7 @@ class PIAPipeline(
|
|
243
242
|
"""
|
244
243
|
# set lora scale so that monkey patched LoRA
|
245
244
|
# function of text encoder can correctly access it
|
246
|
-
if lora_scale is not None and isinstance(self,
|
245
|
+
if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
|
247
246
|
self._lora_scale = lora_scale
|
248
247
|
|
249
248
|
# dynamically adjust the LoRA scale
|
@@ -376,7 +375,7 @@ class PIAPipeline(
|
|
376
375
|
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
377
376
|
|
378
377
|
if self.text_encoder is not None:
|
379
|
-
if isinstance(self,
|
378
|
+
if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
|
380
379
|
# Retrieve the original scale by scaling back the LoRA layers
|
381
380
|
unscale_lora_layers(self.text_encoder, lora_scale)
|
382
381
|
|
@@ -505,6 +504,9 @@ class PIAPipeline(
|
|
505
504
|
def prepare_ip_adapter_image_embeds(
|
506
505
|
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
|
507
506
|
):
|
507
|
+
image_embeds = []
|
508
|
+
if do_classifier_free_guidance:
|
509
|
+
negative_image_embeds = []
|
508
510
|
if ip_adapter_image_embeds is None:
|
509
511
|
if not isinstance(ip_adapter_image, list):
|
510
512
|
ip_adapter_image = [ip_adapter_image]
|
@@ -514,7 +516,6 @@ class PIAPipeline(
|
|
514
516
|
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
515
517
|
)
|
516
518
|
|
517
|
-
image_embeds = []
|
518
519
|
for single_ip_adapter_image, image_proj_layer in zip(
|
519
520
|
ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
|
520
521
|
):
|
@@ -522,36 +523,28 @@ class PIAPipeline(
|
|
522
523
|
single_image_embeds, single_negative_image_embeds = self.encode_image(
|
523
524
|
single_ip_adapter_image, device, 1, output_hidden_state
|
524
525
|
)
|
525
|
-
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
526
|
-
single_negative_image_embeds = torch.stack(
|
527
|
-
[single_negative_image_embeds] * num_images_per_prompt, dim=0
|
528
|
-
)
|
529
526
|
|
527
|
+
image_embeds.append(single_image_embeds[None, :])
|
530
528
|
if do_classifier_free_guidance:
|
531
|
-
|
532
|
-
single_image_embeds = single_image_embeds.to(device)
|
533
|
-
|
534
|
-
image_embeds.append(single_image_embeds)
|
529
|
+
negative_image_embeds.append(single_negative_image_embeds[None, :])
|
535
530
|
else:
|
536
|
-
repeat_dims = [1]
|
537
|
-
image_embeds = []
|
538
531
|
for single_image_embeds in ip_adapter_image_embeds:
|
539
532
|
if do_classifier_free_guidance:
|
540
533
|
single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
|
541
|
-
|
542
|
-
num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
|
543
|
-
)
|
544
|
-
single_negative_image_embeds = single_negative_image_embeds.repeat(
|
545
|
-
num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
|
546
|
-
)
|
547
|
-
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
548
|
-
else:
|
549
|
-
single_image_embeds = single_image_embeds.repeat(
|
550
|
-
num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
|
551
|
-
)
|
534
|
+
negative_image_embeds.append(single_negative_image_embeds)
|
552
535
|
image_embeds.append(single_image_embeds)
|
553
536
|
|
554
|
-
|
537
|
+
ip_adapter_image_embeds = []
|
538
|
+
for i, single_image_embeds in enumerate(image_embeds):
|
539
|
+
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
|
540
|
+
if do_classifier_free_guidance:
|
541
|
+
single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
|
542
|
+
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
|
543
|
+
|
544
|
+
single_image_embeds = single_image_embeds.to(device=device)
|
545
|
+
ip_adapter_image_embeds.append(single_image_embeds)
|
546
|
+
|
547
|
+
return ip_adapter_image_embeds
|
555
548
|
|
556
549
|
# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
|
557
550
|
def prepare_latents(
|
@@ -254,9 +254,7 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
254
254
|
force_download (`bool`, *optional*, defaults to `False`):
|
255
255
|
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
256
256
|
cached versions if they exist.
|
257
|
-
|
258
|
-
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
|
259
|
-
of Diffusers.
|
257
|
+
|
260
258
|
proxies (`Dict[str, str]`, *optional*):
|
261
259
|
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
262
260
|
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
@@ -296,7 +294,7 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
296
294
|
>>> # see more in [the documentation](https://huggingface.co/docs/hub/security-tokens)
|
297
295
|
>>> pipeline, params = FlaxDiffusionPipeline.from_pretrained(
|
298
296
|
... "runwayml/stable-diffusion-v1-5",
|
299
|
-
...
|
297
|
+
... variant="bf16",
|
300
298
|
... dtype=jnp.bfloat16,
|
301
299
|
... )
|
302
300
|
|
@@ -310,13 +308,12 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
310
308
|
... )
|
311
309
|
|
312
310
|
>>> dpm_pipe, dpm_params = FlaxStableDiffusionPipeline.from_pretrained(
|
313
|
-
... model_id,
|
311
|
+
... model_id, variant="bf16", dtype=jnp.bfloat16, scheduler=dpmpp
|
314
312
|
... )
|
315
313
|
>>> dpm_params["scheduler"] = dpmpp_state
|
316
314
|
```
|
317
315
|
"""
|
318
316
|
cache_dir = kwargs.pop("cache_dir", None)
|
319
|
-
resume_download = kwargs.pop("resume_download", None)
|
320
317
|
proxies = kwargs.pop("proxies", None)
|
321
318
|
local_files_only = kwargs.pop("local_files_only", False)
|
322
319
|
token = kwargs.pop("token", None)
|
@@ -332,7 +329,6 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
332
329
|
config_dict = cls.load_config(
|
333
330
|
pretrained_model_name_or_path,
|
334
331
|
cache_dir=cache_dir,
|
335
|
-
resume_download=resume_download,
|
336
332
|
proxies=proxies,
|
337
333
|
local_files_only=local_files_only,
|
338
334
|
token=token,
|
@@ -363,7 +359,6 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
363
359
|
cached_folder = snapshot_download(
|
364
360
|
pretrained_model_name_or_path,
|
365
361
|
cache_dir=cache_dir,
|
366
|
-
resume_download=resume_download,
|
367
362
|
proxies=proxies,
|
368
363
|
local_files_only=local_files_only,
|
369
364
|
token=token,
|
@@ -564,7 +559,7 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
564
559
|
... )
|
565
560
|
|
566
561
|
>>> text2img = FlaxStableDiffusionPipeline.from_pretrained(
|
567
|
-
... "runwayml/stable-diffusion-v1-5",
|
562
|
+
... "runwayml/stable-diffusion-v1-5", variant="bf16", dtype=jnp.bfloat16
|
568
563
|
... )
|
569
564
|
>>> img2img = FlaxStableDiffusionImg2ImgPipeline(**text2img.components)
|
570
565
|
```
|
@@ -435,7 +435,6 @@ def _load_empty_model(
|
|
435
435
|
return_unused_kwargs=True,
|
436
436
|
return_commit_hash=True,
|
437
437
|
force_download=kwargs.pop("force_download", False),
|
438
|
-
resume_download=kwargs.pop("resume_download", None),
|
439
438
|
proxies=kwargs.pop("proxies", None),
|
440
439
|
local_files_only=kwargs.pop("local_files_only", False),
|
441
440
|
token=kwargs.pop("token", None),
|
@@ -454,7 +453,6 @@ def _load_empty_model(
|
|
454
453
|
cached_folder,
|
455
454
|
subfolder=name,
|
456
455
|
force_download=kwargs.pop("force_download", False),
|
457
|
-
resume_download=kwargs.pop("resume_download", None),
|
458
456
|
proxies=kwargs.pop("proxies", None),
|
459
457
|
local_files_only=kwargs.pop("local_files_only", False),
|
460
458
|
token=kwargs.pop("token", None),
|
@@ -544,7 +542,6 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
|
|
544
542
|
torch_dtype=torch_dtype,
|
545
543
|
cached_folder=kwargs.get("cached_folder", None),
|
546
544
|
force_download=kwargs.get("force_download", None),
|
547
|
-
resume_download=kwargs.get("resume_download", None),
|
548
545
|
proxies=kwargs.get("proxies", None),
|
549
546
|
local_files_only=kwargs.get("local_files_only", None),
|
550
547
|
token=kwargs.get("token", None),
|
@@ -533,9 +533,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
533
533
|
cache_dir (`Union[str, os.PathLike]`, *optional*):
|
534
534
|
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
|
535
535
|
is not used.
|
536
|
-
|
537
|
-
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
|
538
|
-
of Diffusers.
|
536
|
+
|
539
537
|
proxies (`Dict[str, str]`, *optional*):
|
540
538
|
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
541
539
|
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
@@ -625,7 +623,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
625
623
|
```
|
626
624
|
"""
|
627
625
|
cache_dir = kwargs.pop("cache_dir", None)
|
628
|
-
resume_download = kwargs.pop("resume_download", None)
|
629
626
|
force_download = kwargs.pop("force_download", False)
|
630
627
|
proxies = kwargs.pop("proxies", None)
|
631
628
|
local_files_only = kwargs.pop("local_files_only", None)
|
@@ -702,7 +699,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
702
699
|
cached_folder = cls.download(
|
703
700
|
pretrained_model_name_or_path,
|
704
701
|
cache_dir=cache_dir,
|
705
|
-
resume_download=resume_download,
|
706
702
|
force_download=force_download,
|
707
703
|
proxies=proxies,
|
708
704
|
local_files_only=local_files_only,
|
@@ -842,7 +838,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
842
838
|
torch_dtype=torch_dtype,
|
843
839
|
cached_folder=cached_folder,
|
844
840
|
force_download=force_download,
|
845
|
-
resume_download=resume_download,
|
846
841
|
proxies=proxies,
|
847
842
|
local_files_only=local_files_only,
|
848
843
|
token=token,
|
@@ -910,7 +905,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
910
905
|
connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
|
911
906
|
load_kwargs = {
|
912
907
|
"cache_dir": cache_dir,
|
913
|
-
"resume_download": resume_download,
|
914
908
|
"force_download": force_download,
|
915
909
|
"proxies": proxies,
|
916
910
|
"local_files_only": local_files_only,
|
@@ -1216,9 +1210,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
1216
1210
|
force_download (`bool`, *optional*, defaults to `False`):
|
1217
1211
|
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
1218
1212
|
cached versions if they exist.
|
1219
|
-
|
1220
|
-
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
|
1221
|
-
of Diffusers.
|
1213
|
+
|
1222
1214
|
proxies (`Dict[str, str]`, *optional*):
|
1223
1215
|
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
1224
1216
|
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
@@ -1271,7 +1263,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
1271
1263
|
|
1272
1264
|
"""
|
1273
1265
|
cache_dir = kwargs.pop("cache_dir", None)
|
1274
|
-
resume_download = kwargs.pop("resume_download", None)
|
1275
1266
|
force_download = kwargs.pop("force_download", False)
|
1276
1267
|
proxies = kwargs.pop("proxies", None)
|
1277
1268
|
local_files_only = kwargs.pop("local_files_only", None)
|
@@ -1311,7 +1302,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
1311
1302
|
revision=revision,
|
1312
1303
|
proxies=proxies,
|
1313
1304
|
force_download=force_download,
|
1314
|
-
resume_download=resume_download,
|
1315
1305
|
token=token,
|
1316
1306
|
)
|
1317
1307
|
|
@@ -1500,7 +1490,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
1500
1490
|
cached_folder = snapshot_download(
|
1501
1491
|
pretrained_model_name,
|
1502
1492
|
cache_dir=cache_dir,
|
1503
|
-
resume_download=resume_download,
|
1504
1493
|
proxies=proxies,
|
1505
1494
|
local_files_only=local_files_only,
|
1506
1495
|
token=token,
|
@@ -1523,7 +1512,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|
1523
1512
|
for connected_pipe_repo_id in connected_pipes:
|
1524
1513
|
download_kwargs = {
|
1525
1514
|
"cache_dir": cache_dir,
|
1526
|
-
"resume_download": resume_download,
|
1527
1515
|
"force_download": force_download,
|
1528
1516
|
"proxies": proxies,
|
1529
1517
|
"local_files_only": local_files_only,
|
@@ -661,7 +661,6 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
|
661
661
|
noise_guidance_edit_tmp = torch.einsum(
|
662
662
|
"cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
|
663
663
|
)
|
664
|
-
noise_guidance_edit_tmp = noise_guidance_edit_tmp
|
665
664
|
noise_guidance = noise_guidance + noise_guidance_edit_tmp
|
666
665
|
|
667
666
|
self.sem_guidance[i] = noise_guidance_edit_tmp.detach().cpu()
|
@@ -0,0 +1,50 @@
|
|
1
|
+
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
    is_transformers_version,
)


# Placeholder objects installed when optional dependencies are missing.
_dummy_objects = {}
# Submodule -> exported names, consumed by _LazyModule for on-demand imports.
_import_structure = {}

try:
    # Stable Audio needs both torch and transformers >= 4.27.0.
    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects

    # Expose dummies so attribute access raises a helpful dependency error.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["modeling_stable_audio"] = ["StableAudioProjectionModel"]
    _import_structure["pipeline_stable_audio"] = ["StableAudioPipeline"]


if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager path: type checkers and opt-in slow imports see real symbols.
    try:
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *

    else:
        from .modeling_stable_audio import StableAudioProjectionModel
        from .pipeline_stable_audio import StableAudioPipeline

else:
    import sys

    # Replace this module with a lazy proxy that imports submodules on demand.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
|
@@ -0,0 +1,158 @@
|
|
1
|
+
# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from dataclasses import dataclass
|
16
|
+
from math import pi
|
17
|
+
from typing import Optional
|
18
|
+
|
19
|
+
import torch
|
20
|
+
import torch.nn as nn
|
21
|
+
import torch.utils.checkpoint
|
22
|
+
|
23
|
+
from ...configuration_utils import ConfigMixin, register_to_config
|
24
|
+
from ...models.modeling_utils import ModelMixin
|
25
|
+
from ...utils import BaseOutput, logging
|
26
|
+
|
27
|
+
|
28
|
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
29
|
+
|
30
|
+
|
31
|
+
class StableAudioPositionalEmbedding(nn.Module):
    """Random-Fourier-feature embedding used for continuous time values."""

    def __init__(self, dim: int):
        super().__init__()
        assert (dim % 2) == 0
        # Half of the Fourier channels come from sin, half from cos.
        self.weights = nn.Parameter(torch.randn(dim // 2))

    def forward(self, times: torch.Tensor) -> torch.Tensor:
        # Broadcast each time scalar against every learned frequency.
        expanded = times[..., None]
        angles = expanded * self.weights[None] * 2 * pi
        # Output layout along the last dim: [t, sin(2*pi*w*t), cos(2*pi*w*t)],
        # i.e. `dim + 1` channels in total.
        return torch.cat((expanded, angles.sin(), angles.cos()), dim=-1)
46
|
+
|
47
|
+
|
48
|
+
@dataclass
class StableAudioProjectionModelOutput(BaseOutput):
    """
    Class for StableAudio projection layer's outputs.

    Args:
        text_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the hidden-states for the text encoder.
        seconds_start_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the audio start hidden states.
        seconds_end_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the audio end hidden states.
    """

    text_hidden_states: Optional[torch.Tensor] = None
    seconds_start_hidden_states: Optional[torch.Tensor] = None
    seconds_end_hidden_states: Optional[torch.Tensor] = None
64
|
+
|
65
|
+
|
66
|
+
class StableAudioNumberConditioner(nn.Module):
    """
    A simple linear projection model to map numbers to a latent space.

    Args:
        number_embedding_dim (`int`):
            Dimensionality of the number embeddings.
        min_value (`int`):
            The minimum value of the seconds number conditioning modules.
        max_value (`int`):
            The maximum value of the seconds number conditioning modules
        internal_dim (`int`):
            Dimensionality of the intermediate number hidden states.
    """

    def __init__(
        self,
        number_embedding_dim,
        min_value,
        max_value,
        internal_dim: Optional[int] = 256,
    ):
        super().__init__()
        # Fourier-embed the normalized value, then project to the target width.
        # The positional embedding emits `internal_dim + 1` channels.
        self.time_positional_embedding = nn.Sequential(
            StableAudioPositionalEmbedding(internal_dim),
            nn.Linear(in_features=internal_dim + 1, out_features=number_embedding_dim),
        )

        self.number_embedding_dim = number_embedding_dim
        self.min_value = min_value
        self.max_value = max_value

    def forward(
        self,
        floats: torch.Tensor,
    ):
        # Restrict inputs to the supported range, then rescale to [0, 1].
        clipped = floats.clamp(self.min_value, self.max_value)
        normalized = (clipped - self.min_value) / (self.max_value - self.min_value)

        # The embedder's parameters dictate the working dtype (e.g. fp16).
        target_dtype = next(self.time_positional_embedding.parameters()).dtype
        embedding = self.time_positional_embedding(normalized.to(target_dtype))

        # One embedding vector per input number: (batch, 1, number_embedding_dim).
        return embedding.view(-1, 1, self.number_embedding_dim)
|
114
|
+
|
115
|
+
|
116
|
+
class StableAudioProjectionModel(ModelMixin, ConfigMixin):
    """
    A simple linear projection model to map the conditioning values to a shared latent space.

    Args:
        text_encoder_dim (`int`):
            Dimensionality of the text embeddings from the text encoder (T5).
        conditioning_dim (`int`):
            Dimensionality of the output conditioning tensors.
        min_value (`int`):
            The minimum value of the seconds number conditioning modules.
        max_value (`int`):
            The maximum value of the seconds number conditioning modules
    """

    @register_to_config
    def __init__(self, text_encoder_dim, conditioning_dim, min_value, max_value):
        super().__init__()
        # No-op projection when the text encoder width already matches the
        # conditioning width; otherwise learn a linear map between them.
        self.text_projection = (
            nn.Identity() if conditioning_dim == text_encoder_dim else nn.Linear(text_encoder_dim, conditioning_dim)
        )
        # Separate conditioners for the audio start and end seconds values.
        self.start_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)
        self.end_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)

    def forward(
        self,
        text_hidden_states: Optional[torch.Tensor] = None,
        start_seconds: Optional[torch.Tensor] = None,
        end_seconds: Optional[torch.Tensor] = None,
    ):
        # Each conditioning input is optional; `None` passes through unchanged
        # so callers can project any subset of the three inputs.
        text_hidden_states = (
            text_hidden_states if text_hidden_states is None else self.text_projection(text_hidden_states)
        )
        seconds_start_hidden_states = (
            start_seconds if start_seconds is None else self.start_number_conditioner(start_seconds)
        )
        seconds_end_hidden_states = end_seconds if end_seconds is None else self.end_number_conditioner(end_seconds)

        return StableAudioProjectionModelOutput(
            text_hidden_states=text_hidden_states,
            seconds_start_hidden_states=seconds_start_hidden_states,
            seconds_end_hidden_states=seconds_end_hidden_states,
        )
|