diffusers 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +17 -12
- diffusers/pipelines/wan/pipeline_wan_i2v.py +42 -20
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +18 -18
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/METADATA +3 -3
- diffusers-0.34.0.dist-info/RECORD +639 -0
- diffusers-0.33.0.dist-info/RECORD +0 -608
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/WHEEL +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
--- a/diffusers/pipelines/ltx/__init__.py
+++ b/diffusers/pipelines/ltx/__init__.py
@@ -22,9 +22,11 @@ except OptionalDependencyNotAvailable:
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
+    _import_structure["modeling_latent_upsampler"] = ["LTXLatentUpsamplerModel"]
     _import_structure["pipeline_ltx"] = ["LTXPipeline"]
     _import_structure["pipeline_ltx_condition"] = ["LTXConditionPipeline"]
     _import_structure["pipeline_ltx_image2video"] = ["LTXImageToVideoPipeline"]
+    _import_structure["pipeline_ltx_latent_upsample"] = ["LTXLatentUpsamplePipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -34,9 +36,11 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
+        from .modeling_latent_upsampler import LTXLatentUpsamplerModel
         from .pipeline_ltx import LTXPipeline
         from .pipeline_ltx_condition import LTXConditionPipeline
         from .pipeline_ltx_image2video import LTXImageToVideoPipeline
+        from .pipeline_ltx_latent_upsample import LTXLatentUpsamplePipeline
 
 else:
     import sys
--- /dev/null
+++ b/diffusers/pipelines/ltx/modeling_latent_upsampler.py
@@ -0,0 +1,188 @@
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.modeling_utils import ModelMixin
+
+
+class ResBlock(torch.nn.Module):
+    def __init__(self, channels: int, mid_channels: Optional[int] = None, dims: int = 3):
+        super().__init__()
+        if mid_channels is None:
+            mid_channels = channels
+
+        Conv = torch.nn.Conv2d if dims == 2 else torch.nn.Conv3d
+
+        self.conv1 = Conv(channels, mid_channels, kernel_size=3, padding=1)
+        self.norm1 = torch.nn.GroupNorm(32, mid_channels)
+        self.conv2 = Conv(mid_channels, channels, kernel_size=3, padding=1)
+        self.norm2 = torch.nn.GroupNorm(32, channels)
+        self.activation = torch.nn.SiLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.activation(hidden_states + residual)
+        return hidden_states
+
+
+class PixelShuffleND(torch.nn.Module):
+    def __init__(self, dims, upscale_factors=(2, 2, 2)):
+        super().__init__()
+
+        self.dims = dims
+        self.upscale_factors = upscale_factors
+
+        if dims not in [1, 2, 3]:
+            raise ValueError("dims must be 1, 2, or 3")
+
+    def forward(self, x):
+        if self.dims == 3:
+            # spatiotemporal: b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)
+            return (
+                x.unflatten(1, (-1, *self.upscale_factors[:3]))
+                .permute(0, 1, 5, 2, 6, 3, 7, 4)
+                .flatten(6, 7)
+                .flatten(4, 5)
+                .flatten(2, 3)
+            )
+        elif self.dims == 2:
+            # spatial: b (c p1 p2) h w -> b c (h p1) (w p2)
+            return (
+                x.unflatten(1, (-1, *self.upscale_factors[:2])).permute(0, 1, 4, 2, 5, 3).flatten(4, 5).flatten(2, 3)
+            )
+        elif self.dims == 1:
+            # temporal: b (c p1) f h w -> b c (f p1) h w
+            return x.unflatten(1, (-1, *self.upscale_factors[:1])).permute(0, 1, 3, 2, 4, 5).flatten(2, 3)
+
+
+class LTXLatentUpsamplerModel(ModelMixin, ConfigMixin):
+    """
+    Model to spatially upsample VAE latents.
+
+    Args:
+        in_channels (`int`, defaults to `128`):
+            Number of channels in the input latent
+        mid_channels (`int`, defaults to `512`):
+            Number of channels in the middle layers
+        num_blocks_per_stage (`int`, defaults to `4`):
+            Number of ResBlocks to use in each stage (pre/post upsampling)
+        dims (`int`, defaults to `3`):
+            Number of dimensions for convolutions (2 or 3)
+        spatial_upsample (`bool`, defaults to `True`):
+            Whether to spatially upsample the latent
+        temporal_upsample (`bool`, defaults to `False`):
+            Whether to temporally upsample the latent
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 128,
+        mid_channels: int = 512,
+        num_blocks_per_stage: int = 4,
+        dims: int = 3,
+        spatial_upsample: bool = True,
+        temporal_upsample: bool = False,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.mid_channels = mid_channels
+        self.num_blocks_per_stage = num_blocks_per_stage
+        self.dims = dims
+        self.spatial_upsample = spatial_upsample
+        self.temporal_upsample = temporal_upsample
+
+        ConvNd = torch.nn.Conv2d if dims == 2 else torch.nn.Conv3d
+
+        self.initial_conv = ConvNd(in_channels, mid_channels, kernel_size=3, padding=1)
+        self.initial_norm = torch.nn.GroupNorm(32, mid_channels)
+        self.initial_activation = torch.nn.SiLU()
+
+        self.res_blocks = torch.nn.ModuleList([ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)])
+
+        if spatial_upsample and temporal_upsample:
+            self.upsampler = torch.nn.Sequential(
+                torch.nn.Conv3d(mid_channels, 8 * mid_channels, kernel_size=3, padding=1),
+                PixelShuffleND(3),
+            )
+        elif spatial_upsample:
+            self.upsampler = torch.nn.Sequential(
+                torch.nn.Conv2d(mid_channels, 4 * mid_channels, kernel_size=3, padding=1),
+                PixelShuffleND(2),
+            )
+        elif temporal_upsample:
+            self.upsampler = torch.nn.Sequential(
+                torch.nn.Conv3d(mid_channels, 2 * mid_channels, kernel_size=3, padding=1),
+                PixelShuffleND(1),
+            )
+        else:
+            raise ValueError("Either spatial_upsample or temporal_upsample must be True")
+
+        self.post_upsample_res_blocks = torch.nn.ModuleList(
+            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
+        )
+
+        self.final_conv = ConvNd(mid_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+
+        if self.dims == 2:
+            hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
+            hidden_states = self.initial_conv(hidden_states)
+            hidden_states = self.initial_norm(hidden_states)
+            hidden_states = self.initial_activation(hidden_states)
+
+            for block in self.res_blocks:
+                hidden_states = block(hidden_states)
+
+            hidden_states = self.upsampler(hidden_states)
+
+            for block in self.post_upsample_res_blocks:
+                hidden_states = block(hidden_states)
+
+            hidden_states = self.final_conv(hidden_states)
+            hidden_states = hidden_states.unflatten(0, (batch_size, -1)).permute(0, 2, 1, 3, 4)
+        else:
+            hidden_states = self.initial_conv(hidden_states)
+            hidden_states = self.initial_norm(hidden_states)
+            hidden_states = self.initial_activation(hidden_states)
+
+            for block in self.res_blocks:
+                hidden_states = block(hidden_states)
+
+            if self.temporal_upsample:
+                hidden_states = self.upsampler(hidden_states)
+                hidden_states = hidden_states[:, :, 1:, :, :]
+            else:
+                hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
+                hidden_states = self.upsampler(hidden_states)
+                hidden_states = hidden_states.unflatten(0, (batch_size, -1)).permute(0, 2, 1, 3, 4)
+
+            for block in self.post_upsample_res_blocks:
+                hidden_states = block(hidden_states)
+
+            hidden_states = self.final_conv(hidden_states)
+
+        return hidden_states
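For orientation, a minimal shape-check sketch of the new upsampler, assuming only the module path shown in the hunk above. The tiny channel counts are illustrative, not the released configuration (mid_channels must stay divisible by 32 for the GroupNorm layers):

import torch

from diffusers.pipelines.ltx.modeling_latent_upsampler import LTXLatentUpsamplerModel

# Toy config: dims=3 with the default spatial_upsample=True, temporal_upsample=False.
upsampler = LTXLatentUpsamplerModel(in_channels=4, mid_channels=32, num_blocks_per_stage=1)

latents = torch.randn(1, 4, 5, 8, 8)  # (batch, channels, frames, height, width)
with torch.no_grad():
    upsampled = upsampler(latents)

print(upsampled.shape)  # torch.Size([1, 4, 5, 16, 16]): H and W doubled, frame count unchanged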
--- a/diffusers/pipelines/ltx/pipeline_ltx.py
+++ b/diffusers/pipelines/ltx/pipeline_ltx.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Lightricks and The HuggingFace Team. All rights reserved.
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -140,6 +140,33 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    r"""
+    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
+    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://huggingface.co/papers/2305.08891).
+
+    Args:
+        noise_cfg (`torch.Tensor`):
+            The predicted noise tensor for the guided diffusion process.
+        noise_pred_text (`torch.Tensor`):
+            The predicted noise tensor for the text-guided diffusion process.
+        guidance_rescale (`float`, *optional*, defaults to 0.0):
+            A rescale factor applied to the noise predictions.
+
+    Returns:
+        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
 class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
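The helper above matches the per-sample standard deviation of the CFG-combined prediction back to that of the text-conditioned prediction, then linearly mixes the two. A self-contained sketch of the same arithmetic with toy tensors (shapes and the 0.7 factor are illustrative, not from this diff):

import torch

noise_pred_text = torch.randn(2, 128, 8, 8)
noise_cfg = 1.5 * noise_pred_text  # stand-in for an over-amplified CFG combination

std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)

rescaled = noise_cfg * (std_text / std_cfg)  # same direction, original scale
guidance_rescale = 0.7
out = guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg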
@@ -481,6 +508,10 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     def guidance_scale(self):
         return self._guidance_scale
 
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
     @property
     def do_classifier_free_guidance(self):
         return self._guidance_scale > 1.0
@@ -514,6 +545,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         guidance_scale: float = 3,
+        guidance_rescale: float = 0.0,
         num_videos_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
@@ -551,11 +583,16 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `3 `):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -624,6 +661,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         )
 
         self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
         self._attention_kwargs = attention_kwargs
         self._interrupt = False
         self._current_timestep = None
@@ -737,6 +775,12 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
 
+                if self.guidance_rescale > 0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                    )
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
 
@@ -789,6 +833,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             ]
             latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise
 
+        latents = latents.to(self.vae.dtype)
         video = self.vae.decode(latents, timestep, return_dict=False)[0]
         video = self.video_processor.postprocess_video(video, output_type=output_type)
 
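Taken together, the pipeline_ltx.py hunks expose one new user-facing knob. A hedged usage sketch; the checkpoint id and parameter values are illustrative assumptions, not taken from this diff:

import torch

from diffusers import LTXPipeline
from diffusers.utils import export_to_video

pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")
video = pipe(
    prompt="A coral reef at dawn, slow camera pan",
    guidance_scale=3.0,
    guidance_rescale=0.7,  # new in 0.34.0; 0.0 reproduces the 0.33.0 behavior
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)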
--- a/diffusers/pipelines/ltx/pipeline_ltx_condition.py
+++ b/diffusers/pipelines/ltx/pipeline_ltx_condition.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Lightricks and The HuggingFace Team. All rights reserved.
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -222,6 +222,33 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    r"""
+    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
+    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://huggingface.co/papers/2305.08891).
+
+    Args:
+        noise_cfg (`torch.Tensor`):
+            The predicted noise tensor for the guided diffusion process.
+        noise_pred_text (`torch.Tensor`):
+            The predicted noise tensor for the text-guided diffusion process.
+        guidance_rescale (`float`, *optional*, defaults to 0.0):
+            A rescale factor applied to the noise predictions.
+
+    Returns:
+        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
 class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     r"""
     Pipeline for text/image/video-to-video generation.
@@ -430,6 +457,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         video,
         frame_index,
         strength,
+        denoise_strength,
        height,
        width,
        callback_on_step_end_tensor_inputs=None,
@@ -497,6 +525,9 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         elif isinstance(video, list) and isinstance(strength, list) and len(video) != len(strength):
             raise ValueError("If `conditions` is not provided, `video` and `strength` must be of the same length.")
 
+        if denoise_strength < 0 or denoise_strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {denoise_strength}")
+
     @staticmethod
     def _prepare_video_ids(
         batch_size: int,
@@ -649,6 +680,8 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         width: int = 704,
         num_frames: int = 161,
         num_prefix_latent_frames: int = 2,
+        sigma: Optional[torch.Tensor] = None,
+        latents: Optional[torch.Tensor] = None,
         generator: Optional[torch.Generator] = None,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
@@ -658,7 +691,18 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         latent_width = width // self.vae_spatial_compression_ratio
 
         shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        if latents is not None and sigma is not None:
+            if latents.shape != shape:
+                raise ValueError(
+                    f"Latents shape {latents.shape} does not match expected shape {shape}. Please check the input."
+                )
+            latents = latents.to(device=device, dtype=dtype)
+            sigma = sigma.to(device=device, dtype=dtype)
+            latents = sigma * noise + (1 - sigma) * latents
+        else:
+            latents = noise
 
         if len(conditions) > 0:
             condition_latent_frames_mask = torch.zeros(
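Note: the new branch in prepare_latents is an SDEdit-style partial noising step. For a schedule value sigma taken from the trimmed sigma schedule, the user-provided latents become sigma * noise + (1 - sigma) * latents, so sigma = 1.0 reduces to pure noise (full generation) while sigma near 0 keeps the input almost intact.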
@@ -766,10 +810,21 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
 
         return latents, conditioning_mask, video_ids, extra_conditioning_num_latents
 
+    def get_timesteps(self, sigmas, timesteps, num_inference_steps, strength):
+        num_steps = min(int(num_inference_steps * strength), num_inference_steps)
+        start_index = max(num_inference_steps - num_steps, 0)
+        sigmas = sigmas[start_index:]
+        timesteps = timesteps[start_index:]
+        return sigmas, timesteps, num_inference_steps - start_index
+
     @property
     def guidance_scale(self):
         return self._guidance_scale
 
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
     @property
     def do_classifier_free_guidance(self):
         return self._guidance_scale > 1.0
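Note: get_timesteps trims the schedule from the front rather than rescaling it. For example, with num_inference_steps = 50 and strength (the denoise_strength) = 0.4, num_steps = min(int(50 * 0.4), 50) = 20 and start_index = 30, so only the last 20 (lowest-noise) steps run; the first retained sigma then becomes the noising level applied to the input latents.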
@@ -799,6 +854,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         video: List[PipelineImageInput] = None,
         frame_index: Union[int, List[int]] = 0,
         strength: Union[float, List[float]] = 1.0,
+        denoise_strength: float = 1.0,
         prompt: Union[str, List[str]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
@@ -808,6 +864,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         guidance_scale: float = 3,
+        guidance_rescale: float = 0.0,
         image_cond_noise_scale: float = 0.15,
         num_videos_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -842,6 +899,10 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                 generation. If not provided, one has to pass `conditions`.
             strength (`float` or `List[float]`, *optional*):
                 The strength or strengths of the conditioning effect. If not provided, one has to pass `conditions`.
+            denoise_strength (`float`, defaults to `1.0`):
+                The strength of the noise added to the latents for editing. Higher strength leads to more noise added
+                to the latents, therefore leading to more differences between original video and generated video. This
+                is useful for video-to-video editing.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -859,11 +920,16 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `3 `):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -918,8 +984,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
 
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-        if latents is not None:
-            raise ValueError("Passing latents is not yet supported.")
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
|
@@ -929,6 +993,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
|
|
929
993
|
video=video,
|
930
994
|
frame_index=frame_index,
|
931
995
|
strength=strength,
|
996
|
+
denoise_strength=denoise_strength,
|
932
997
|
height=height,
|
933
998
|
width=width,
|
934
999
|
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
@@ -939,6 +1004,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         )
 
         self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
         self._attention_kwargs = attention_kwargs
         self._interrupt = False
         self._current_timestep = None
@@ -977,8 +1043,9 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             strength = [strength] * num_conditions
 
         device = self._execution_device
+        vae_dtype = self.vae.dtype
 
-        # 3. Prepare text embeddings
+        # 3. Prepare text embeddings & conditioning image/video
         (
             prompt_embeds,
             prompt_attention_mask,
@@ -1000,8 +1067,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
 
-        vae_dtype = self.vae.dtype
-
         conditioning_tensors = []
         is_conditioning_image_or_video = image is not None or video is not None
         if is_conditioning_image_or_video:
@@ -1032,7 +1097,27 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             )
             conditioning_tensors.append(condition_tensor)
 
-        # 4. Prepare latent variables
+        # 4. Prepare timesteps
+        latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
+        latent_height = height // self.vae_spatial_compression_ratio
+        latent_width = width // self.vae_spatial_compression_ratio
+        if timesteps is None:
+            sigmas = linear_quadratic_schedule(num_inference_steps)
+            timesteps = sigmas * 1000
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        sigmas = self.scheduler.sigmas
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        latent_sigma = None
+        if denoise_strength < 1:
+            sigmas, timesteps, num_inference_steps = self.get_timesteps(
+                sigmas, timesteps, num_inference_steps, denoise_strength
+            )
+            latent_sigma = sigmas[:1].repeat(batch_size * num_videos_per_prompt)
+
+        self._num_timesteps = len(timesteps)
+
+        # 5. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
         latents, conditioning_mask, video_coords, extra_conditioning_num_latents = self.prepare_latents(
             conditioning_tensors,
@@ -1043,6 +1128,8 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             height=height,
             width=width,
             num_frames=num_frames,
+            sigma=latent_sigma,
+            latents=latents,
             generator=generator,
             device=device,
             dtype=torch.float32,
@@ -1056,21 +1143,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         if self.do_classifier_free_guidance:
             video_coords = torch.cat([video_coords, video_coords], dim=0)
 
-        # 5. Prepare timesteps
-        latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
-        latent_height = height // self.vae_spatial_compression_ratio
-        latent_width = width // self.vae_spatial_compression_ratio
-        sigmas = linear_quadratic_schedule(num_inference_steps)
-        timesteps = sigmas * 1000
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            timesteps=timesteps,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -1120,6 +1192,12 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                     timestep, _ = timestep.chunk(2)
 
+                if self.guidance_rescale > 0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                    )
+
                 denoised_latents = self.scheduler.step(
                     -noise_pred, t, latents, per_token_timesteps=timestep, return_dict=False
                 )[0]
@@ -1168,7 +1246,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             if not self.vae.config.timestep_conditioning:
                 timestep = None
             else:
-                noise = torch.randn(latents.shape, generator=generator, device=device, dtype=latents.dtype)
+                noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
                 if not isinstance(decode_timestep, list):
                     decode_timestep = [decode_timestep] * batch_size
                 if decode_noise_scale is None:
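Together with the new latent upsampler, these pipeline_ltx_condition.py changes enable an upscale-then-refine workflow. A hedged end-to-end sketch; the checkpoint ids, resolutions, and strength values are illustrative assumptions, not taken from this diff:

import torch

from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16).to("cuda")
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
).to("cuda")

prompt = "A coral reef at dawn, slow camera pan"

# 1) Generate at low resolution, keeping latents instead of decoded frames.
latents = pipe(prompt=prompt, width=512, height=320, num_frames=97, output_type="latent").frames

# 2) Spatially upsample the latents 2x with the new latent-upsample pipeline.
upscaled = pipe_upsample(latents=latents, output_type="latent").frames

# 3) Re-noise only the tail of the schedule and denoise again to refine details.
video = pipe(
    prompt=prompt,
    latents=upscaled,      # accepted now that the "not yet supported" guard is gone
    denoise_strength=0.4,  # new in 0.34.0: run only the last ~40% of the steps
    guidance_rescale=0.7,  # new in 0.34.0: optional overexposure fix
    width=1024,
    height=640,
    num_frames=97,
).frames[0]
export_to_video(video, "output.mp4", fps=24)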